Exemple #1
0
    def testDistributedProxima(self):
        # Builds an index and searches it through a session opened on the
        # local web endpoint, then requires the returned keys and distances
        # to equal what proxima_build_and_query yields for the same raw data.

        # data set sizes, neighbours per query, and rows per chunk
        n_docs, n_queries, n_dims = 200, 15, 10
        k = 10
        doc_rows, query_rows = 50, 5

        endpoint = 'http://127.0.0.1:' + self.web_port
        with new_session(endpoint) as sess:
            raw_doc, raw_query = gen_data(doc_count=n_docs,
                                          query_count=n_queries,
                                          dimension=n_dims)

            # wrap the raw arrays as chunked objects
            doc_frame = md.DataFrame(pd.DataFrame(raw_doc),
                                     chunk_size=(doc_rows, n_dims))
            query_tensor = mt.tensor(raw_query,
                                     chunk_size=(query_rows, n_dims))

            built = build_index(doc_frame, session=sess)

            # reference result, computed from the raw arrays
            pk_expected, dist_expected = proxima_build_and_query(
                raw_doc, raw_query, k)

            # result under test, computed through the session
            pk_actual, dist_actual = search_index(
                query_tensor, k, built, session=sess)

            np.testing.assert_array_equal(pk_expected, pk_actual)
            np.testing.assert_array_equal(dist_expected, dist_actual)
Exemple #2
0
def test_build_and_search_index_with_filesystem(setup):
    """Build an index into a temporary directory and verify search results.

    The index is built with ``column_number=2`` and searched with
    ``row_number=5``.  The returned keys and distances must equal those
    produced by ``proxima_build_and_query`` on the same raw data.
    """
    with tempfile.TemporaryDirectory() as index_dir:
        # data set sizes and neighbours per query
        n_docs, n_queries, n_dims = 2000, 50, 10
        k = 10

        raw_doc, raw_query = gen_data(doc_count=n_docs,
                                      query_count=n_queries,
                                      dimension=n_dims)

        doc_frame = md.DataFrame(pd.DataFrame(raw_doc))
        query_tensor = mt.tensor(raw_query)

        built = build_index(tensor=doc_frame,
                            index_path=index_dir,
                            column_number=2)

        # building must have left at least one entry in the directory
        written = os.listdir(index_dir)
        assert len(written) > 0

        # reference result, computed from the raw arrays
        pk_expected, dist_expected = proxima_build_and_query(
            raw_doc, raw_query, k)
        # result under test
        pk_actual, dist_actual = search_index(tensor=query_tensor,
                                              topk=k,
                                              index=built,
                                              row_number=5)

        np.testing.assert_array_equal(pk_expected, pk_actual)
        np.testing.assert_array_equal(dist_expected, dist_actual)
Exemple #3
0
    def testBuildAndSearchIndexWithFilesystem(self):
        # Builds an index into a temporary directory from chunked data and
        # requires the search output to equal what proxima_build_and_query
        # yields for the same raw data.
        with tempfile.TemporaryDirectory() as index_dir:
            # data set sizes, neighbours per query, and rows per chunk
            n_docs, n_queries, n_dims = 2000, 15, 10
            k = 10
            doc_rows, query_rows = 1000, 5

            raw_doc, raw_query = gen_data(
                doc_count=n_docs, query_count=n_queries, dimension=n_dims)

            doc_frame = md.DataFrame(
                pd.DataFrame(raw_doc), chunk_size=(doc_rows, n_dims))
            query_tensor = mt.tensor(
                raw_query, chunk_size=(query_rows, n_dims))

            built = build_index(doc_frame, index_path=index_dir)

            # building must have left at least one entry in the directory
            written = os.listdir(index_dir)
            self.assertGreater(len(written), 0)

            # reference result, computed from the raw arrays
            pk_expected, dist_expected = proxima_build_and_query(
                raw_doc, raw_query, k)

            # result under test
            pk_actual, dist_actual = search_index(query_tensor, k, built)

            np.testing.assert_array_equal(pk_expected, pk_actual)
            np.testing.assert_array_equal(dist_expected, dist_actual)
Exemple #4
0
    def build_and_query(self, doc, query, topk, doc_chunk, query_chunk,
                        threads=1, dimension=None, measure_name=None,
                        index_builder=None, builder_params=None,
                        index_converter=None, index_converter_params=None,
                        index_searcher=None, searcher_params=None,
                        index_reformer=None, index_reformer_params=None):
        """Build an index over ``doc``, search it with ``query``, return the hits.

        Options left as ``None`` are resolved as follows: ``measure_name``
        becomes ``"SquaredEuclidean"``, ``dimension`` becomes
        ``doc.shape[1]``, ``index_builder`` becomes ``"SsgBuilder"``,
        ``index_searcher`` and ``index_reformer`` become ``""``, and every
        ``*_params`` argument becomes an empty dict.  ``index_converter`` is
        forwarded unchanged.

        ``doc`` and ``query`` are wrapped with chunk sizes
        ``(doc_chunk, dimension)`` and ``(query_chunk, dimension)``.  Both
        the build and the search run on ``self.session``.

        Every path returned by ``index.fetch()`` is asserted to be a
        non-empty file and is removed before returning, whether or not the
        search succeeds.

        Returns:
            The ``(pk2, distance)`` pair from ``search_index``; each is
            asserted to have shape ``(len(query), topk)``.
        """
        # resolve unset options to their defaults
        measure_name = "SquaredEuclidean" if measure_name is None else measure_name
        dimension = doc.shape[1] if dimension is None else dimension
        index_builder = "SsgBuilder" if index_builder is None else index_builder
        builder_params = {} if builder_params is None else builder_params
        index_converter_params = ({} if index_converter_params is None
                                  else index_converter_params)
        index_searcher = "" if index_searcher is None else index_searcher
        searcher_params = {} if searcher_params is None else searcher_params
        index_reformer = "" if index_reformer is None else index_reformer
        index_reformer_params = ({} if index_reformer_params is None
                                 else index_reformer_params)

        # wrap the raw inputs as chunked objects
        doc = md.DataFrame(pd.DataFrame(doc), chunk_size=(doc_chunk, dimension))
        query = mt.tensor(query, chunk_size=(query_chunk, dimension))

        build_kwargs = dict(
            tensor=doc,
            need_shuffle=False,
            distance_metric=measure_name,
            dimension=dimension,
            index_builder=index_builder,
            index_builder_params=builder_params,
            index_converter=index_converter,
            index_converter_params=index_converter_params,
            session=self.session,
        )
        index = build_index(**build_kwargs)

        # fetch() may yield a single path or a list; normalise to a list
        fetched = index.fetch()
        paths = fetched if isinstance(fetched, list) else [fetched]

        try:
            # every fetched index file must have content
            for index_file in paths:
                with open(index_file, 'rb') as handle:
                    self.assertGreater(len(handle.read()), 0)

            search_kwargs = dict(
                tensor=query,
                threads=threads,
                distance_metric=measure_name,
                dimension=dimension,
                topk=topk,
                index=index,
                index_searcher=index_searcher,
                index_searcher_params=searcher_params,
                index_reformer=index_reformer,
                index_reformer_params=index_reformer_params,
                session=self.session,
            )
            pk2, distance = search_index(**search_kwargs)

            # one row per query, topk columns, for both outputs
            for result in (pk2, distance):
                self.assertEqual(result.shape, (len(query), topk))
            return pk2, distance
        finally:
            # always delete the fetched index files
            for index_file in paths:
                os.remove(index_file)
Exemple #5
0
def test_build_and_search_index_with_filesystem_download(setup):
    """Build an index into a temporary directory and search it in three slices.

    The index is built with ``column_number=2``.  The queries are then
    searched in consecutive five-row slices against that same index.  Only
    the presence of output in the index directory is asserted; the search
    results themselves are not compared to anything.
    """
    with tempfile.TemporaryDirectory() as index_dir:
        # data set sizes, neighbours per query, and rows per chunk
        n_docs, n_queries, n_dims = 2000, 15, 10
        k = 10
        doc_rows, query_rows = 1000, 5

        raw_doc, raw_query = gen_data(doc_count=n_docs,
                                      query_count=n_queries,
                                      dimension=n_dims)

        doc_frame = md.DataFrame(pd.DataFrame(raw_doc),
                                 chunk_size=(doc_rows, n_dims))
        query_tensor = mt.tensor(raw_query,
                                 chunk_size=(query_rows, n_dims))

        built = build_index(tensor=doc_frame,
                            index_path=index_dir,
                            column_number=2)

        # building must have left at least one entry in the directory
        written = os.listdir(index_dir)
        assert len(written) > 0

        # search rows [0:5], [5:10] and [10:15] one slice at a time
        for start in range(0, n_queries, query_rows):
            search_index(query_tensor[start:start + query_rows], k, built)
Exemple #6
0
def build_and_query(doc,
                    query,
                    topk,
                    column_number,
                    row_number,
                    threads=1,
                    dimension=None,
                    measure_name=None,
                    index_builder=None,
                    builder_params=None,
                    index_converter=None,
                    index_converter_params=None,
                    index_searcher=None,
                    searcher_params=None,
                    index_reformer=None,
                    index_reformer_params=None):
    """Build an index over ``doc``, search it with ``query``, return the hits.

    Options left as ``None`` are resolved as follows: ``measure_name``
    becomes ``"SquaredEuclidean"``, ``dimension`` becomes ``doc.shape[1]``,
    ``index_builder`` becomes ``"SsgBuilder"``, ``index_searcher`` and
    ``index_reformer`` become ``""``, and every ``*_params`` argument
    becomes an empty dict.  ``index_converter`` is forwarded unchanged.

    ``column_number`` is passed to ``build_index`` and ``row_number`` to
    ``search_index``.

    Every path returned by ``index.fetch()`` is asserted to be a non-empty
    file and is removed before returning, whether or not the search
    succeeds.

    Returns:
        The ``(pk2, distance)`` pair from ``search_index``; each is asserted
        to have shape ``(len(query), topk)``.
    """
    # resolve unset options to their defaults
    measure_name = "SquaredEuclidean" if measure_name is None else measure_name
    dimension = doc.shape[1] if dimension is None else dimension
    index_builder = "SsgBuilder" if index_builder is None else index_builder
    builder_params = {} if builder_params is None else builder_params
    index_converter_params = ({} if index_converter_params is None
                              else index_converter_params)
    index_searcher = "" if index_searcher is None else index_searcher
    searcher_params = {} if searcher_params is None else searcher_params
    index_reformer = "" if index_reformer is None else index_reformer
    index_reformer_params = ({} if index_reformer_params is None
                             else index_reformer_params)

    # wrap the raw inputs
    doc = md.DataFrame(pd.DataFrame(doc))
    query = mt.tensor(query)

    build_kwargs = dict(
        tensor=doc,
        need_shuffle=False,
        column_number=column_number,
        distance_metric=measure_name,
        dimension=dimension,
        index_builder=index_builder,
        index_builder_params=builder_params,
        index_converter=index_converter,
        index_converter_params=index_converter_params,
    )
    index = build_index(**build_kwargs)

    # fetch() may yield a single path or a list; normalise to a list
    fetched = index.fetch()
    paths = fetched if isinstance(fetched, list) else [fetched]

    try:
        # every fetched index file must have content
        for index_file in paths:
            with open(index_file, 'rb') as handle:
                assert len(handle.read()) > 0

        search_kwargs = dict(
            tensor=query,
            threads=threads,
            row_number=row_number,
            distance_metric=measure_name,
            dimension=dimension,
            topk=topk,
            index=index,
            index_searcher=index_searcher,
            index_searcher_params=searcher_params,
            index_reformer=index_reformer,
            index_reformer_params=index_reformer_params,
        )
        pk2, distance = search_index(**search_kwargs)

        # one row per query, topk columns, for both outputs
        for result in (pk2, distance):
            assert result.shape == (len(query), topk)
        return pk2, distance
    finally:
        # always delete the fetched index files
        for index_file in paths:
            os.remove(index_file)