def testDistributedProxima(self):
    """Compare index search through a web session with the proxima reference.

    Opens a session against ``http://127.0.0.1:<self.web_port>``, builds an
    index from chunked generated documents, searches it with chunked
    queries, and asserts that both the primary keys and the distances equal
    what ``proxima_build_and_query`` returns for the same raw data.
    """
    # Problem size, number of neighbours, and chunk heights.
    n_docs, n_queries, n_dims = 200, 15, 10
    k = 10
    docs_per_chunk, queries_per_chunk = 50, 5

    endpoint = 'http://127.0.0.1:' + self.web_port
    with new_session(endpoint) as session:
        raw_docs, raw_queries = gen_data(
            doc_count=n_docs, query_count=n_queries, dimension=n_dims)

        # Each chunk spans the full dimension, so data is split by rows only.
        doc_frame = md.DataFrame(
            pd.DataFrame(raw_docs), chunk_size=(docs_per_chunk, n_dims))
        query_tensor = mt.tensor(
            raw_queries, chunk_size=(queries_per_chunk, n_dims))

        built_index = build_index(doc_frame, session=session)

        # Reference result first, then the result under test.
        expected_pk, expected_distance = proxima_build_and_query(
            raw_docs, raw_queries, k)
        actual_pk, actual_distance = search_index(
            query_tensor, k, built_index, session=session)

        np.testing.assert_array_equal(expected_pk, actual_pk)
        np.testing.assert_array_equal(expected_distance, actual_distance)
def test_build_and_search_index_with_filesystem(setup):
    """Build an index into a temporary directory and verify search results.

    The index is written under a ``tempfile.TemporaryDirectory``; the test
    asserts that the directory is non-empty after the build and that
    ``search_index`` returns the same keys and distances as
    ``proxima_build_and_query`` on the same generated data.

    Args:
        setup: test fixture argument; it is not referenced in the body.
    """
    n_docs, n_queries, n_dims = 2000, 50, 10
    k = 10

    with tempfile.TemporaryDirectory() as index_dir:
        raw_docs, raw_queries = gen_data(
            doc_count=n_docs, query_count=n_queries, dimension=n_dims)

        doc_frame = md.DataFrame(pd.DataFrame(raw_docs))
        query_tensor = mt.tensor(raw_queries)

        # column_number is forwarded as-is to build_index.
        built_index = build_index(
            tensor=doc_frame, index_path=index_dir, column_number=2)
        written_entries = os.listdir(index_dir)
        assert len(written_entries) > 0

        # Reference result first, then the result under test.
        expected_pk, expected_distance = proxima_build_and_query(
            raw_docs, raw_queries, k)
        # row_number is forwarded as-is to search_index.
        actual_pk, actual_distance = search_index(
            tensor=query_tensor, topk=k, index=built_index, row_number=5)

        np.testing.assert_array_equal(expected_pk, actual_pk)
        np.testing.assert_array_equal(expected_distance, actual_distance)
def testBuildAndSearchIndexWithFilesystem(self):
    """Build a chunked index into a temporary directory and verify search.

    Asserts that the build leaves at least one entry in the directory and
    that ``search_index`` yields the same keys and distances as
    ``proxima_build_and_query`` on the same generated data.
    """
    n_docs, n_queries, n_dims = 2000, 15, 10
    k = 10
    docs_per_chunk, queries_per_chunk = 1000, 5

    with tempfile.TemporaryDirectory() as index_dir:
        raw_docs, raw_queries = gen_data(
            doc_count=n_docs, query_count=n_queries, dimension=n_dims)

        # Each chunk spans the full dimension, so data is split by rows only.
        doc_frame = md.DataFrame(
            pd.DataFrame(raw_docs), chunk_size=(docs_per_chunk, n_dims))
        query_tensor = mt.tensor(
            raw_queries, chunk_size=(queries_per_chunk, n_dims))

        built_index = build_index(doc_frame, index_path=index_dir)
        written_entries = os.listdir(index_dir)
        self.assertGreater(len(written_entries), 0)

        # Reference result first, then the result under test.
        expected_pk, expected_distance = proxima_build_and_query(
            raw_docs, raw_queries, k)
        actual_pk, actual_distance = search_index(query_tensor, k, built_index)

        np.testing.assert_array_equal(expected_pk, actual_pk)
        np.testing.assert_array_equal(expected_distance, actual_distance)
def build_and_query(self, doc, query, topk, doc_chunk, query_chunk,
                    threads=1, dimension=None, measure_name=None,
                    index_builder=None, builder_params=None,
                    index_converter=None, index_converter_params=None,
                    index_searcher=None, searcher_params=None,
                    index_reformer=None, index_reformer_params=None):
    """Build an index from ``doc``, search it with ``query``, then clean up.

    Both inputs are wrapped into chunked objects, the index is built and
    searched through ``self.session``, and every path returned by
    ``index.fetch()`` is removed afterwards, whether or not the search or
    the assertions succeed.

    Args:
        doc: raw document data; must expose ``shape`` when ``dimension`` is
            omitted, and is wrapped in ``pd.DataFrame``.
        query: raw query data handed to ``mt.tensor``.
        topk: number of results requested per query.
        doc_chunk: row count per document chunk.
        query_chunk: row count per query chunk.
        threads: forwarded to ``search_index``.
        dimension: defaults to ``doc.shape[1]``.
        measure_name: distance metric; defaults to ``"SquaredEuclidean"``.
        index_builder: defaults to ``"SsgBuilder"``.
        builder_params: defaults to an empty dict.
        index_converter: forwarded unchanged (``None`` is passed through).
        index_converter_params: defaults to an empty dict.
        index_searcher: defaults to the empty string.
        searcher_params: defaults to an empty dict.
        index_reformer: defaults to the empty string.
        index_reformer_params: defaults to an empty dict.

    Returns:
        The ``(pk, distance)`` pair produced by ``search_index``; both have
        been asserted to have shape ``(len(query), topk)``.
    """
    # Resolve omitted options. ``is None`` checks keep falsy-but-explicit
    # values (e.g. an empty string or dict supplied by the caller) intact.
    measure_name = "SquaredEuclidean" if measure_name is None else measure_name
    dimension = doc.shape[1] if dimension is None else dimension
    index_builder = "SsgBuilder" if index_builder is None else index_builder
    builder_params = {} if builder_params is None else builder_params
    index_converter_params = (
        {} if index_converter_params is None else index_converter_params)
    index_searcher = "" if index_searcher is None else index_searcher
    searcher_params = {} if searcher_params is None else searcher_params
    index_reformer = "" if index_reformer is None else index_reformer
    index_reformer_params = (
        {} if index_reformer_params is None else index_reformer_params)

    doc_frame = md.DataFrame(
        pd.DataFrame(doc), chunk_size=(doc_chunk, dimension))
    query_tensor = mt.tensor(query, chunk_size=(query_chunk, dimension))

    built_index = build_index(
        tensor=doc_frame,
        need_shuffle=False,
        distance_metric=measure_name,
        dimension=dimension,
        index_builder=index_builder,
        index_builder_params=builder_params,
        index_converter=index_converter,
        index_converter_params=index_converter_params,
        session=self.session,
    )

    # fetch() may hand back one path or a list of them; normalise to a list.
    fetched = built_index.fetch()
    index_paths = fetched if isinstance(fetched, list) else [fetched]

    try:
        # Every index file must exist and be non-empty.
        for index_path in index_paths:
            with open(index_path, 'rb') as index_file:
                content = index_file.read()
                self.assertGreater(len(content), 0)

        pk2, distance = search_index(
            tensor=query_tensor,
            threads=threads,
            distance_metric=measure_name,
            dimension=dimension,
            topk=topk,
            index=built_index,
            index_searcher=index_searcher,
            index_searcher_params=searcher_params,
            index_reformer=index_reformer,
            index_reformer_params=index_reformer_params,
            session=self.session,
        )

        self.assertEqual(pk2.shape, (len(query_tensor), topk))
        self.assertEqual(distance.shape, (len(query_tensor), topk))
        return pk2, distance
    finally:
        # Remove the index files regardless of the outcome above.
        for index_path in index_paths:
            os.remove(index_path)
def test_build_and_search_index_with_filesystem_download(setup):
    """Build an index into a temporary directory and search it repeatedly.

    After asserting that the build wrote something into the directory, the
    same index is searched three times, once per consecutive five-row slice
    of the query tensor. Search results are not inspected; the searches
    only have to complete without raising.

    Args:
        setup: test fixture argument; it is not referenced in the body.
    """
    n_docs, n_queries, n_dims = 2000, 15, 10
    k = 10
    docs_per_chunk, queries_per_chunk = 1000, 5

    with tempfile.TemporaryDirectory() as index_dir:
        raw_docs, raw_queries = gen_data(
            doc_count=n_docs, query_count=n_queries, dimension=n_dims)

        # Each chunk spans the full dimension, so data is split by rows only.
        doc_frame = md.DataFrame(
            pd.DataFrame(raw_docs), chunk_size=(docs_per_chunk, n_dims))
        query_tensor = mt.tensor(
            raw_queries, chunk_size=(queries_per_chunk, n_dims))

        # column_number is forwarded as-is to build_index.
        built_index = build_index(
            tensor=doc_frame, index_path=index_dir, column_number=2)
        written_entries = os.listdir(index_dir)
        assert len(written_entries) > 0

        # Slices 0:5, 5:10 and 10:15, in that order, against the same index.
        for start in range(0, n_queries, queries_per_chunk):
            stop = start + queries_per_chunk
            search_index(query_tensor[start:stop], k, built_index)
def build_and_query(doc, query, topk, column_number, row_number,
                    threads=1, dimension=None, measure_name=None,
                    index_builder=None, builder_params=None,
                    index_converter=None, index_converter_params=None,
                    index_searcher=None, searcher_params=None,
                    index_reformer=None, index_reformer_params=None):
    """Build an index from ``doc``, search it with ``query``, then clean up.

    Every path returned by ``index.fetch()`` is removed afterwards, whether
    or not the search or the assertions succeed.

    Args:
        doc: raw document data; must expose ``shape`` when ``dimension`` is
            omitted, and is wrapped in ``pd.DataFrame``.
        query: raw query data handed to ``mt.tensor``.
        topk: number of results requested per query.
        column_number: forwarded to ``build_index``.
        row_number: forwarded to ``search_index``.
        threads: forwarded to ``search_index``.
        dimension: defaults to ``doc.shape[1]``.
        measure_name: distance metric; defaults to ``"SquaredEuclidean"``.
        index_builder: defaults to ``"SsgBuilder"``.
        builder_params: defaults to an empty dict.
        index_converter: forwarded unchanged (``None`` is passed through).
        index_converter_params: defaults to an empty dict.
        index_searcher: defaults to the empty string.
        searcher_params: defaults to an empty dict.
        index_reformer: defaults to the empty string.
        index_reformer_params: defaults to an empty dict.

    Returns:
        The ``(pk, distance)`` pair produced by ``search_index``; both have
        been asserted to have shape ``(len(query), topk)``.
    """
    # Resolve omitted options. ``is None`` checks keep falsy-but-explicit
    # values (e.g. an empty string or dict supplied by the caller) intact.
    measure_name = "SquaredEuclidean" if measure_name is None else measure_name
    dimension = doc.shape[1] if dimension is None else dimension
    index_builder = "SsgBuilder" if index_builder is None else index_builder
    builder_params = {} if builder_params is None else builder_params
    index_converter_params = (
        {} if index_converter_params is None else index_converter_params)
    index_searcher = "" if index_searcher is None else index_searcher
    searcher_params = {} if searcher_params is None else searcher_params
    index_reformer = "" if index_reformer is None else index_reformer
    index_reformer_params = (
        {} if index_reformer_params is None else index_reformer_params)

    doc_frame = md.DataFrame(pd.DataFrame(doc))
    query_tensor = mt.tensor(query)

    built_index = build_index(
        tensor=doc_frame,
        need_shuffle=False,
        column_number=column_number,
        distance_metric=measure_name,
        dimension=dimension,
        index_builder=index_builder,
        index_builder_params=builder_params,
        index_converter=index_converter,
        index_converter_params=index_converter_params,
    )

    # fetch() may hand back one path or a list of them; normalise to a list.
    fetched = built_index.fetch()
    index_paths = fetched if isinstance(fetched, list) else [fetched]

    try:
        # Every index file must exist and be non-empty.
        for index_path in index_paths:
            with open(index_path, 'rb') as index_file:
                content = index_file.read()
                assert len(content) > 0

        pk2, distance = search_index(
            tensor=query_tensor,
            threads=threads,
            row_number=row_number,
            distance_metric=measure_name,
            dimension=dimension,
            topk=topk,
            index=built_index,
            index_searcher=index_searcher,
            index_searcher_params=searcher_params,
            index_reformer=index_reformer,
            index_reformer_params=index_reformer_params,
        )

        assert pk2.shape == (len(query_tensor), topk)
        assert distance.shape == (len(query_tensor), topk)
        return pk2, distance
    finally:
        # Remove the index files regardless of the outcome above.
        for index_path in index_paths:
            os.remove(index_path)