class N2(BaseANN):
    def __init__(self, m):
        threads = 8
        self.name = 'N2(m={}, threads={})'.format(m, threads)
        self._m = m
        self._threads = threads
        self._index = None
        print("Init done")

    def fit(self, X):
        X = numpy.array(X)
        X = X.astype(numpy.float32)
        self._index = HnswIndex(X.shape[1], "L2")
        print("Shape", X.shape[1])
        for el in X:
            self._index.add_data(el)
        self._index.build(m=self._m, n_threads=self._threads)
        print("Fit done")

    def query(self, v, n):
        v = v.astype(numpy.float32)
        nns = self._index.search_by_vector(v, n)
        return nns

    def use_threads(self):
        return False
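# A minimal, hedged usage sketch for the wrapper above (not part of the
# original harness): fit on a random float32 dataset and run one query.
# numpy and HnswIndex are assumed imported as in the surrounding module.
import numpy

data = numpy.random.rand(1000, 20).astype(numpy.float32)
algo = N2(m=16)
algo.fit(data)
print(algo.query(data[0], 10))  # ids of the 10 nearest neighbors of row 0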
def test01_most_similar(self):
    set_log_level(1)
    model = self.load_text8_model()
    index = HnswIndex(model.L0.shape[1])
    model.normalize('item')
    for f in model.L0:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('n2.bin')
    par = ParW2V(model)

    model.opt.num_workers = 1
    all_keys = model._idmanager.itemids[::][:10000]
    start_t = time.time()
    [model.most_similar(k, topk=10) for k in all_keys]
    naive_elapsed = time.time() - start_t

    par.num_workers = 4
    start_t = time.time()
    par.most_similar(all_keys, topk=10, repr=True)
    par_elapsed = time.time() - start_t

    start_t = time.time()
    par.set_hnsw_index('n2.bin', 'item')
    par.most_similar(all_keys, topk=10, repr=True)
    ann_elapsed = time.time() - start_t

    self.assertTrue(naive_elapsed > par_elapsed * 1.5 > ann_elapsed * 5.0,
                    msg=f'{naive_elapsed} > {par_elapsed} > {ann_elapsed}')
    index.unload()
    os.remove('n2.bin')
def test01_small_invalid_dimension(self):
    index = HnswIndex(30)
    this_is_abnormal = False
    try:
        index.load(self.model_fname)
        this_is_abnormal = True
    except:
        pass
    finally:
        del index
    self.assertFalse(this_is_abnormal)
def test_search_by_vector(self):
    f = 3
    i = HnswIndex(f)
    i.add_data([0, 0, 1])
    i.add_data([0, 1, 0])
    i.add_data([1, 0, 0])
    i.build(max_m0=10, m=5)
    self.assertEqual(i.search_by_vector([3, 2, 1], 3), [2, 1, 0])
    self.assertEqual(i.search_by_vector([1, 2, 3], 3), [0, 1, 2])
    self.assertEqual(i.search_by_vector([2, 0, 1], 3), [2, 0, 1])
def test_search_by_vector(self):
    f = 2
    i = HnswIndex(f, 'L2')
    i.add_data([2, 2])
    i.add_data([3, 2])
    i.add_data([3, 3])
    i.build()
    self.assertEqual(i.search_by_vector([4, 4], 3), [2, 1, 0])
    self.assertEqual(i.search_by_vector([1, 1], 3), [0, 1, 2])
    self.assertEqual(i.search_by_vector([4, 2], 3), [1, 2, 0])
def test02_small_invalid_dimension2(self):
    index = HnswIndex(80)
    this_is_abnormal = False
    try:
        v = [random.gauss(0, 1) for z in xrange(100)]
        index.add_data(v)
        this_is_abnormal = True
    except:
        pass
    finally:
        del index
    self.assertFalse(this_is_abnormal)
def test_search_by_id(self):
    f = 3
    i = HnswIndex(f)
    i.add_data([2, 1, 0])
    i.add_data([1, 2, 0])
    i.add_data([0, 0, 1])
    i.build(max_m0=10)
    self.assertEqual(i.search_by_id(0, 3), [0, 1, 2])
    self.assertEqual(i.search_by_id(1, 3), [1, 0, 2])
    self.assertTrue(i.search_by_id(2, 3) in [[2, 0, 1], [2, 1, 0]])  # could be either
def test_search_by_id(self):
    f = 2
    i = HnswIndex(f, 'L2')
    i.add_data([2, 2])
    i.add_data([3, 2])
    i.add_data([3, 3])
    i.build()
    self.assertEqual(i.search_by_id(0, 3), [0, 1, 2])
    self.assertEqual(i.search_by_id(2, 3), [2, 1, 0])
@classmethod
def setUpClass(self):
    index = HnswIndex(self.dim)
    for i in xrange(self.data_num):
        v = [random.gauss(0, 1) for z in xrange(self.dim)]
        index.add_data(v)
    index.build(n_threads=12)
    index.save(self.model_fname)
def example2():
    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    print('Make item recommendation on als.ml20m.par.top10.tsv with Parallel(Thread=4)')
    par = ParALS(als)
    par.num_workers = 4
    all_items = als._idmanager.itemids
    start_t = time.time()
    with open('als.ml20m.par.top10.tsv', 'w') as fout:
        for idx in range(0, len(all_items), 128):
            topks, _ = par.most_similar(all_items[idx:idx + 128], repr=True)
            for q, p in zip(all_items[idx:idx + 128], topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))
    print('took: %.3f secs' % (time.time() - start_t))

    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for f in als.Q:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('ml20m.n2.index')
    index.unload()

    print('Make item recommendation on als.ml20m.ann.top10.tsv with Ann(Thread=4)')
    par.set_hnsw_index('ml20m.n2.index', 'item')
    par.num_workers = 4
    start_t = time.time()
    with open('als.ml20m.ann.top10.tsv', 'w') as fout:
        for idx in range(0, len(all_items), 128):
            topks, _ = par.most_similar(all_items[idx:idx + 128], repr=True)
            for q, p in zip(all_items[idx:idx + 128], topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))
    print('took: %.3f secs' % (time.time() - start_t))
def fit(self, X):
    if self._metric == 'euclidean':
        self._n2 = HnswIndex(X.shape[1], 'L2')
    else:
        self._n2 = HnswIndex(X.shape[1])

    if os.path.exists(self._index_name):
        n2_logger.info("Loading index from file")
        self._n2.load(self._index_name, use_mmap=False)
        return

    n2_logger.debug("Create Index")
    for i, x in enumerate(X):
        self._n2.add_data(x)
    self._n2.build(m=self._m, max_m0=self._m0,
                   ef_construction=self._ef_construction,
                   n_threads=self._n_threads)
    self._n2.save(self._index_name)
class N2(BaseANN):
    def __init__(self, m, ef_construction, n_threads, ef_search, metric, batch):
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d%s" % (
            m, ef_construction, n_threads, ef_search, '_batch' if batch else '')
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        self._index_name = os.path.join(
            CACHE_DIR, "index_n2_%s_M%d_efCon%d_n_thread%s" % (
                args.dataset, m, ef_construction, n_threads))
        self._metric = metric

    def fit(self, X):
        if self._metric == 'euclidean':
            self._n2 = HnswIndex(X.shape[1], 'L2')
        elif self._metric == 'dot':
            self._n2 = HnswIndex(X.shape[1], 'dot')
        else:
            self._n2 = HnswIndex(X.shape[1])

        if os.path.exists(self._index_name):
            n2_logger.info("Loading index from file")
            self._n2.load(self._index_name, use_mmap=False)
            return

        n2_logger.info("Create Index")
        for i, x in enumerate(X):
            self._n2.add_data(x)
        self._n2.build(m=self._m, max_m0=self._m0,
                       ef_construction=self._ef_construction,
                       n_threads=self._n_threads)
        self._n2.save(self._index_name)

    def query(self, v, n):
        return self._n2.search_by_vector(v, n, self._ef_search)

    def batch_query(self, X, n):
        self.b_res = self._n2.batch_search_by_vectors(X, n, self._ef_search, self._n_threads)

    def get_batch_results(self):
        return self.b_res

    def __str__(self):
        return self.name
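# Hedged sketch (not from the original benchmark): exercising the wrapper
# above end to end, including the threaded batch path. CACHE_DIR and
# args.dataset must exist in the harness for the index-file name to resolve;
# the random dataset is a stand-in.
import numpy as np

X = np.random.rand(5000, 32).astype(np.float32)
algo = N2(m=12, ef_construction=100, n_threads=4, ef_search=50,
          metric='euclidean', batch=True)
algo.fit(X)
print(algo.query(X[0], 10))           # single query: ids of the top 10
algo.batch_query(X[:64], 10)          # batch query across 4 threads
print(len(algo.get_batch_results()))  # 64 result lists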
def fit(self, X):
    if self._metric == 'euclidean':
        self._n2 = HnswIndex(X.shape[1], 'L2')
    else:
        self._n2 = HnswIndex(X.shape[1])

    if os.path.exists(self._index_name):
        n2_logger.info("Loading index from file")
        self._n2.load(self._index_name)
    else:
        n2_logger.info("Index file does not exist: {0}".format(self._index_name))
        n2_logger.info("Start fitting")
        for i, x in enumerate(X):
            self._n2.add_data(x.tolist())
        self._n2.build(m=self._m, max_m0=self._m0,
                       ef_construction=self._ef_construction,
                       n_threads=self._n_threads)
        self._n2.save(self._index_name)
class N2(BaseANN):
    def __init__(self, m):
        threads = 8
        self.name = 'N2(m={}, threads={})'.format(m, threads)
        self._m = m
        self._threads = threads
        self._index = None

    def fit(self, X):
        X = numpy.array(X)
        X = X.astype(numpy.float32)
        self._index = HnswIndex(X.shape[1], "L2")
        for el in X:
            self._index.add_data(el)
        self._index.build(m=self._m, n_threads=self._threads)

    def query(self, v, n):
        v = v.astype(numpy.float32)
        nns = self._index.search_by_vector(v, n)
        return nns
def build_n2(self):
    t = self.tfs_by_doc
    all_words = []
    mapper = {'from_hnsw': {}, 'from_doc_id': {}}

    # build all_words
    for doc_id in t.keys():
        for word in t[doc_id].keys():
            if word not in all_words:
                all_words.append(word)
    col_len = len(all_words)

    hnsw = HnswIndex(dimension=col_len, metric='angular')
    for h_idx, doc_id in enumerate(tqdm(list(t.keys()), desc="Build N2 Search Space")):
        assert h_idx not in mapper['from_hnsw']
        mapper['from_hnsw'][h_idx] = doc_id
        mapper['from_doc_id'][doc_id] = h_idx
        parchment = np.zeros(col_len, dtype=np.uint16)
        for word, count in t[doc_id].items():
            word_idx = all_words.index(word)
            parchment[word_idx] = count
        hnsw.add_data(parchment)
    hnsw.build(n_threads=4)
    self.n2 = {'hnsw': hnsw, 'mapper': mapper, 'all_words': all_words}
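# Hedged companion sketch: a hypothetical similar_docs method (not in the
# original class) that queries the structure built above, translating through
# the two mapper directions used in build_n2.
def similar_docs(self, doc_id, topn=10):
    h_idx = self.n2['mapper']['from_doc_id'][doc_id]
    # ask for one extra neighbor since the query document comes back first
    neighbors = self.n2['hnsw'].search_by_id(h_idx, topn + 1)
    return [self.n2['mapper']['from_hnsw'][h] for h in neighbors if h != h_idx][:topn]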
def gen_item_index(model):
    article_embedding_matrix = model.get_layer('E-Article').get_weights()[0]
    embedding_size = article_embedding_matrix.shape[1]
    index = HnswIndex(embedding_size)
    for embedding in article_embedding_matrix:
        index.add_data(embedding)
    index.build(n_threads=4)

    article_to_id = load_data('article_to_id')
    id_to_article = {v: k for k, v in article_to_id.items()}

    def most_similar(item, topn=100, threshold=0.3):
        # look up the item's index id via article_to_id; the original used
        # id_to_article here, which is keyed by ids and would never match
        if item not in article_to_id:
            return []
        output = []
        iid = article_to_id[item]
        for tiid in [e[0] for e in index.search_by_id(iid, topn * 2, include_distances=True)
                     if e[1] < threshold][1:]:
            target_item = id_to_article[tiid]
            output.append(target_item)
            if len(output) == topn:
                break
        return output

    return most_similar
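# Hedged usage sketch for the factory above: build the closure once, then
# query per article key. `model` is the trained Keras model it expects, and
# the article key shown is hypothetical.
most_similar = gen_item_index(model)
print(most_similar('some_article_key', topn=5))  # hypothetical key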
class N2(BaseANN):
    def __init__(self, m, ef_construction, n_threads, ef_search, metric):
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        self._index_name = os.path.join(
            INDEX_DIR, "youtube_n2_M%d_efCon%d_n_thread%s" % (m, ef_construction, n_threads))
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d" % (
            m, ef_construction, n_threads, ef_search)
        self._metric = metric

        d = os.path.dirname(self._index_name)
        if not os.path.exists(d):
            os.makedirs(d)

    def fit(self, X):
        from n2 import HnswIndex
        if self._metric == 'euclidean':
            self._n2 = HnswIndex(X.shape[1], 'L2')
        else:
            self._n2 = HnswIndex(X.shape[1])

        if os.path.exists(self._index_name):
            logging.debug("Loading index from file")
            self._n2.load(self._index_name)
        else:
            logging.debug("Index file does not exist: {0}".format(self._index_name))
            logging.debug("Start fitting")
            for i, x in enumerate(X):
                self._n2.add_data(x.tolist())
            self._n2.build(m=self._m, max_m0=self._m0,
                           ef_construction=self._ef_construction,
                           n_threads=self._n_threads)
            self._n2.save(self._index_name)

    def query(self, v, n):
        return self._n2.search_by_vector(v.tolist(), n, self._ef_search)

    def __str__(self):
        return self.name
def get_user_embeddings(user_model, seens_total, user_list, batch_size=10000):
    inputs = [[], [], [], [], []]
    includes = []
    user_embeddings = {}
    for user, seens in tqdm(seens_total.items(), desc='user embedding'):
        seens = seens_total[user]
        if seens:
            includes.append(user)
            sequence_info = get_sequential_feature(user, seens['articles'], seens['ages'],
                                                   data_type='test', random_range=False,
                                                   random_sample_length=False, positive=True)
            article_sequence, magazine_sequence, author_sequence, user_feature_sequence, target_age, target = sequence_info
            search_keyword_sequence = get_search_keyword_feature(user)
            inputs[0].append(article_sequence)
            inputs[1].append(magazine_sequence)
            inputs[2].append(author_sequence)
            inputs[3].append(user_feature_sequence)
            inputs[4].append(search_keyword_sequence)
    inputs = [np.asarray(x) for x in inputs]
    predicts = user_model.predict(inputs, batch_size=batch_size)

    user_index = HnswIndex(200)
    for embedding in predicts:
        user_index.add_data(embedding)
    user_index.build(n_threads=multiprocessing.cpu_count())

    user_to_id = {user: i for i, user in enumerate(includes)}
    id_to_user = {v: k for k, v in user_to_id.items()}
    for user in user_list:
        if user in user_to_id:
            user_embeddings[user] = predicts[user_to_id[user]]
        else:
            user_embeddings[user] = None

    def most_similar(user, topn=100, threshold=0.3):
        if user not in user_to_id:
            return []
        output = []
        uid = user_to_id[user]
        for tuid in [e[0] for e in user_index.search_by_id(uid, topn * 2, include_distances=True)
                     if e[1] < threshold][1:]:
            target_user = id_to_user[tuid]
            output.append(target_user)
            if len(output) == topn:
                break
        return output

    return user_embeddings, most_similar
def find_edges(input, test, K):
    print(f"building kNN classifier ... ", end=" ")
    st_time = time.time()
    if kNN_type <= 3:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=10)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("finding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    else:
        indices = tree.search(test, k=K + 1, k_clusters=100, return_distance=False)
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"done! .... time={time.time()-st_time:.3f}s")
    return edge_list
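# Hedged usage sketch for find_edges: the module-level kNN_type and
# distance_type globals select the backend, as in the function above. The
# sparse matrices here are stand-ins for the real feature matrices.
import numpy as np
import scipy.sparse as sp

kNN_type, distance_type = 3, 'L2'  # use the n2 HNSW backend
input = sp.csr_matrix(np.random.rand(200, 16))
test = sp.csr_matrix(np.random.rand(50, 16))
edges = find_edges(input, test, K=5)
print(len(edges))  # up to 50 * (K + 1) directed (test, input) pairs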
def test_large_index(self):
    # Generate pairs of random points where the pair is super close
    f = 10
    # q = [random.gauss(0, 10) for z in xrange(f)]
    i = HnswIndex(f, 'L2')
    for j in xrange(0, 10000, 2):
        p = [random.gauss(0, 1) for z in xrange(f)]
        x = [1 + pi + random.gauss(0, 1e-2) for pi in p]  # todo: should be q[i]
        y = [1 + pi + random.gauss(0, 1e-2) for pi in p]
        i.add_data(x)
        i.add_data(y)
    i.build()
    for j in xrange(0, 10000, 2):
        self.assertEqual(i.search_by_id(j, 2), [j, j + 1])
        self.assertEqual(i.search_by_id(j + 1, 2), [j + 1, j])
def kNN(matrix: np.ndarray, k: int) -> np.ndarray:
    index = HnswIndex(matrix.shape[1], 'L2')
    for sample in matrix:
        index.add_data(sample)
    index.build(m=32, max_m0=48, ef_construction=int(k * 1.1), n_threads=cpu_count())

    result = []
    for i in range(0, matrix.shape[0]):
        results = index.search_by_id(i, k, include_distances=True)
        result.append(np.mean(np.sqrt(np.array([dist for _, dist in results]))))
    return np.sort(result)
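# Hedged usage sketch: kNN above returns the sorted mean distance from each
# point to its k nearest neighbors, which can serve as an isolation score;
# the random dataset is a stand-in.
import numpy as np

points = np.random.rand(1000, 8).astype(np.float32)
scores = kNN(points, k=10)
print(scores[-10:])  # the 10 most isolated points by mean-kNN distance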
def test04_batch_search_by_ids(self):
    index = HnswIndex(self.dim)
    index.load(self.model_fname)
    T = [random.randrange(0, self.data_num) for y in xrange(100)]
    batch_res = index.batch_search_by_ids(T, 10, num_threads=12, include_distances=True)
    normal_res = [index.search_by_id(t, 10, include_distances=True) for t in T]
    self.assertEqual(batch_res, normal_res)
def test04_batch_search_by_vectors(self):
    index = HnswIndex(self.dim)
    index.load(self.model_fname)
    T = [[random.gauss(0, 1) for z in xrange(self.dim)] for y in xrange(100)]
    batch_res = index.batch_search_by_vectors(T, 10, num_threads=12, include_distances=True)
    normal_res = [index.search_by_vector(t, 10, include_distances=True) for t in T]
    self.assertEqual(batch_res, normal_res)
def test03_small_add_data_after_loading(self):
    index = HnswIndex(self.dim)
    index.load(self.model_fname)
    this_is_abnormal = False
    try:
        v = [random.gauss(0, 1) for z in xrange(self.dim)]
        index.add_data(v)
        this_is_abnormal = True
    except:
        pass
    finally:
        del index
    self.assertFalse(this_is_abnormal)
def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
    found = 0
    for r in xrange(n_rounds):
        # create random points at distance x from (1000, 0, 0, ...)
        f = 10
        i = HnswIndex(f, 'L2')
        for j in xrange(n_points):
            p = [random.gauss(0, 1) for z in xrange(f - 1)]
            norm = sum([pi ** 2 for pi in p]) ** 0.5
            x = [1000] + [pi / norm * j for pi in p]
            i.add_data(x)
        i.build()

        nns = i.search_by_vector([1000] + [0] * (f - 1), n)
        self.assertEqual(nns, sorted(nns))  # should be in order
        # The number of gaps should be equal to the last item minus n-1
        found += len([_x for _x in nns if _x < n])
    return 1.0 * found / (n * n_rounds)
from n2 import HnswIndex
import random

f = 3
t = HnswIndex(f)  # HnswIndex(f, "L2 or angular")
for i in xrange(1000):
    v = [random.gauss(0, 1) for z in xrange(f)]
    t.add_data(v)

t.build(m=5, max_m0=10, n_threads=4)
t.save('test.n2')

u = HnswIndex(f, "angular")
u.load('test.n2')

search_id = 1
k = 3
neighbor_ids = u.search_by_id(search_id, k)
print("[search_by_id]: Nearest neighborhoods of id {}: {}".format(search_id, neighbor_ids))

example_vector_query = [random.gauss(0, 1) for z in xrange(f)]
nns = u.search_by_vector(example_vector_query, k, include_distances=True)
print("[search_by_vector]: Nearest neighborhoods of vector {}: {}".format(example_vector_query, nns))
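# Hedged follow-up to the example above: the same saved index also answers
# bulk queries; batch_search_by_ids mirrors search_by_id with worker threads,
# as used elsewhere in this corpus.
batch_ids = [0, 1, 2, 3]
batch_nns = u.batch_search_by_ids(batch_ids, k, num_threads=4)
print("[batch_search_by_ids]: {}".format(batch_nns))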
def _buffalo(algo_name, database):
    repeat = 3
    options = {'als': {'num_workers': 4,
                       'compute_loss_on_training': False,
                       'd': 32,
                       'num_iters': 10},
               'bpr': {'num_workers': 4,
                       'compute_loss_on_training': False,
                       'd': 32,
                       'num_iters': 100},
               }
    opt = options[algo_name]

    # linear
    if algo_name == 'als':
        PAR = ParALS
        model = BuffaloLib().als(database, return_instance_before_train=True, **opt)
    elif algo_name == 'bpr':
        PAR = ParBPRMF
        model = BuffaloLib().bpr(database, return_instance_before_train=True, **opt)
    model.train()
    model.build_itemid_map()
    model.normalize('item')

    # parallel
    par = PAR(model)

    # ann
    index = HnswIndex(model.P.shape[1])
    for f in model.P:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('bm_n2.bin')
    ann = PAR(model)
    ann.set_hnsw_index('bm_n2.bin', 'item')

    total_queries = 10000
    keys = model._idmanager.itemids[::][:total_queries]
    print('Total queries: %s' % len(keys))
    results = {}
    nn_opts = {'topk': 10}
    for p, m in [('S', model), ('P', par), ('A', ann)]:
        results[p] = {}
        opt = nn_opts.copy()
        if not isinstance(m, PAR):
            opt['iterable'] = keys
        for num_workers in [1, 2, 4]:
            if isinstance(m, PAR):
                m.num_workers = num_workers
            else:
                m.opt.num_workers = num_workers
            opt['model'] = m
            elapsed, memory_usage = _get_elapsed_time('most_similar', keys, BuffaloLib(), repeat, **opt)
            s = elapsed / len(keys)
            results[p][f'S={num_workers}'] = s
            results[p][f'E={num_workers}'] = elapsed
            results[p][f'M={num_workers}'] = memory_usage['max']
            results[p][f'A={num_workers}'] = memory_usage['avg']
            results[p][f'B={num_workers}'] = memory_usage['min']
            print(f'{p}M={num_workers} {elapsed} {memory_usage}')
    return results
def find_edges(input, test, K):
    print(f"\tbuilding kNN classifier ... ", end=" ")
    st_time = time.time()
    if kNN_type in [1, 2]:
        input, test = input.todense(), test.todense()

    if kNN_type == 1:
        from sklearn.neighbors import NearestNeighbors
        tree = NearestNeighbors(n_neighbors=K + 1, algorithm='ball_tree').fit(input)
    elif kNN_type == 2:
        from scipy import spatial
        tree = spatial.KDTree(input)
    elif kNN_type == 3:
        from n2 import HnswIndex
        tree = HnswIndex(input.shape[1], distance_type)  # distance_type in ['angular', 'L2']
        for index in tqdm(range(input.shape[0])):
            tree.add_data(input[index, :])
        tree.build(n_threads=20)
    elif kNN_type == 4:
        import pysparnn.cluster_index as ci
        input_num = input.shape[0]
        tree = ci.MultiClusterIndex(input, range(input_num))
    elif kNN_type == 5:
        import nmslib
        M, efC = 30, 100
        index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post': 0}
        # https://github.com/nmslib/nmslib/blob/master/manual/spaces.md
        space_names = ['l2_sparse', 'cosinesimil_sparse']
        space_name = space_names[0]
        data_type = nmslib.DataType.SPARSE_VECTOR
        tree = nmslib.init(method='hnsw', space=space_name, data_type=data_type)
        '''
        def calc_zero_rows(i):
            if input[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows, range(input.shape[0])))
        print(f"# zero rows in input = {zero_row_num}", end=" ")
        '''
        tree.addDataPointBatch(input)
        tree.createIndex(index_time_params, print_progress=True)
        # Setting query-time parameters
        efS = 100
        query_time_params = {'efSearch': efS}
        print('Setting query-time parameters', query_time_params, end=" ")
        tree.setQueryTimeParams(query_time_params)
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    print("\tfinding indices ... ", end=" ")
    if kNN_type == 1:
        _, indices = tree.kneighbors(test)
    elif kNN_type == 2:
        _, indices = tree.query(test, k=K + 1)
    elif kNN_type == 3:
        indices = []
        for i in tqdm(range(test.shape[0])):
            indices.append(tree.search_by_vector(test[i, :], k=K + 1))
    elif kNN_type == 4:
        indices = tree.search(test, k=K + 1, k_clusters=100, return_distance=False)
    elif kNN_type == 5:
        '''
        def calc_zero_rows2(i):
            if test[i, :].getnnz() == 0:
                return 1
            else:
                return 0
        pool = Pool(num_threads)
        zero_row_num = sum(pool.map(calc_zero_rows2, range(test.shape[0])))
        print(f"# zero rows in test = {zero_row_num}")
        '''
        indices_ = tree.knnQueryBatch(test, k=K + 1, num_threads=num_threads)
        indices = [i[0] for i in indices_]
        del indices_
    else:
        raise NotImplementedError
    print(f"time={time.time()-st_time:.3f}s")

    edge_list = []
    for index1, per in enumerate(indices):
        assert len(per) == K + 1, f"index1={index1} len(per)={len(per)} != K={K}"
        for index2 in per:
            index2 = int(index2)
            if index1 != index2:
                edge_list.append((index1, index2))
    print(f"\tget edges done! .... time={time.time()-st_time:.3f}s")
    return edge_list
f.close()
fg.close()
fq.close()
ff.close()

# N2
f = open('n2_results.txt', 'a')
fg = open('n2_fg.txt', 'a')
fq = open('n2_fq.txt', 'a')
ff = open('n2_ff.txt', 'a')
M_vec = [4, 8, 12, 16, 24, 36, 48, 64, 96]
_k = [10, 20, 40, 80, 120, 200, 400, 600, 800]
for M in M_vec:
    break  # the loop body below is disabled in the original
    start_graph = time.time()
    t = HnswIndex(128)
    for i in range(len(xb)):
        t.add_data(xb[i])
    t.build(m=M, ef_construction=500)
    end_graph = time.time()
    for kk in _k:
        print("M:", M, "kk:", kk)
        start_query = time.time()
        accuracy = 0
        for i in range(len(xq)):
            ans = t.search_by_vector(xq[i], k, kk)
            for x in ans:
                if x in gt[i]:
                    accuracy += 1
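# Hedged sketch of how this loop would typically close out (the original
# snippet is truncated here): turn the hit counter into recall against the
# ground truth and record build/query timings in the files opened above.
#
#            recall = accuracy / (len(xq) * k)
#            end_query = time.time()
#            f.write(f"{M}\t{kk}\t{recall}\t{end_query - start_query}\n")
#        fg.write(f"{M}\t{end_graph - start_graph}\n")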