def precision(f=40, n=1000000):
    # Benchmark Annoy recall vs. latency: index n random Gaussian vectors of
    # dimension f, then for a sample of queries measure how often a limited
    # search budget ("limit") recovers the true top-k neighbours, and how
    # long each query takes. (Python 2 code: xrange / print statements.)
    t = AnnoyIndex(f)
    for i in xrange(n):
        v = []
        for z in xrange(f):
            v.append(random.gauss(0, 1))
        t.add_item(i, v)
    t.build(2 * f)
    t.save('test.tree')
    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}
    for i in xrange(prec_n):
        j = random.randrange(0, n)
        print 'finding nbs for', j
        # Ground truth: search with the full candidate budget, keep top k.
        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0
            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T
        # Running averages after each query (divide by queries done so far).
        for limit in limits:
            print 'limit: %-9d precision: %6.2f%% avg time: %.6fs' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1))
def t1est_large_index_batch(self):
    # NOTE(review): the "t1est_" prefix keeps this out of unittest discovery;
    # presumably disabled on purpose -- confirm before renaming.
    print "test_large_index_batch"
    start_time = int(round(time.time() * 1000))
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    # Generate pairs of random points where the pair is super close
    f = 100
    i = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 0)
    i_v = []
    v_v = []
    for j in xrange(0, 100000, 2):
        p = [random.gauss(0, 1) for z in xrange(f)]
        f1 = random.random() + 1
        f2 = random.random() + 1
        # x and y are noisy scalings of the same direction, so each pair
        # (j, j+1) should be mutual nearest neighbours.
        x = [f1 * pi + random.gauss(0, 1e-2) for pi in p]
        y = [f2 * pi + random.gauss(0, 1e-2) for pi in p]
        i_v.append(j)
        i_v.append(j+1)
        v_v.append(x)
        v_v.append(y)
    i.add_item_batch(i_v, v_v)
    # Re-open the index (last ctor flag flips from 0 to 1 -- presumably a
    # read-only/ready flag for this AnnoyIndex variant; confirm).
    i = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 1)
    for j in xrange(0, 100000, 2):
        self.assertEqual(i.get_nns_by_item(j, 2, 50), [j, j+1])
        self.assertEqual(i.get_nns_by_item(j+1, 2, 50), [j+1, j])
    print "Total time = ", (int(round(time.time() * 1000)) - start_time)/1000
def test_get_nns_by_item_batch(self):
    # add_item_batch must produce the same neighbour ordering as the
    # one-at-a-time add_item API does for the same three vectors.
    print "test_get_nns_by_item_batch "
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 3
    i = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
    i.add_item_batch([0,1,2], [[2, 1, 0], [1, 2, 0], [0, 0, 1]])
    self.assertEqual(i.get_nns_by_item(0, 3), [0, 1, 2])
    self.assertEqual(i.get_nns_by_item(1, 3), [1, 0, 2])
    self.assertTrue(i.get_nns_by_item(2, 3) in [[2, 0, 1], [2, 1, 0]]) # could be either
def test_get_nns_by_item(self):
    """Query neighbours by stored item id on a tiny 3-d angular index."""
    dims = 3
    index = AnnoyIndex(dims)
    for item_id, vector in enumerate([[2, 1, 0], [1, 2, 0], [0, 0, 1]]):
        index.add_item(item_id, vector)
    index.build(10)
    self.assertEqual(index.get_nns_by_item(0, 3), [0, 1, 2])
    self.assertEqual(index.get_nns_by_item(1, 3), [1, 0, 2])
    # Item 2 is equally far from the others, so their order is unspecified.
    self.assertTrue(index.get_nns_by_item(2, 3) in [[2, 0, 1], [2, 1, 0]])
def test_basic_nns(self):
    """Hamming metric: two random bitvectors must be each other's nearest
    neighbours, with self-distance 0 and cross-distance equal to the bit
    difference.

    Fix: ``assertEquals`` is a deprecated unittest alias (removed in
    Python 3.12); use ``assertEqual``.
    """
    f = 100
    i = AnnoyIndex(f, 'hamming')
    u = numpy.random.binomial(1, 0.5, f)
    v = numpy.random.binomial(1, 0.5, f)
    i.add_item(0, u)
    i.add_item(1, v)
    i.build(10)
    self.assertEqual(i.get_nns_by_item(0, 99), [0, 1])
    self.assertEqual(i.get_nns_by_item(1, 99), [1, 0])
    rs, ds = i.get_nns_by_item(0, 99, include_distances=True)
    self.assertEqual(rs, [0, 1])
    self.assertAlmostEqual(ds[0], 0)
    # For 0/1 vectors the Hamming distance equals the squared Euclidean one.
    self.assertAlmostEqual(ds[1], numpy.dot(u-v, u-v))
def test_item_vector_after_save(self):
    # Issue #279: item vectors and neighbour queries must survive save().
    index = AnnoyIndex(3)
    index.verbose(True)
    for item_id, vector in ((1, [1, 0, 0]), (2, [0, 1, 0]), (3, [0, 0, 1])):
        index.add_item(item_id, vector)
    index.build(-1)
    self.assertEqual(index.get_n_items(), 4)
    self.assertEqual(index.get_item_vector(3), [0, 0, 1])
    self.assertEqual(set(index.get_nns_by_item(1, 999)), set([1, 2, 3]))
    index.save('something.annoy')
    # Saving must not invalidate the in-memory index.
    self.assertEqual(index.get_n_items(), 4)
    self.assertEqual(index.get_item_vector(3), [0, 0, 1])
    self.assertEqual(set(index.get_nns_by_item(1, 999)), set([1, 2, 3]))
def test_large_index(self):
    """Pairs of near-identical points must be mutual nearest neighbours
    under the manhattan metric."""
    f = 10
    index = AnnoyIndex(f, 'manhattan')
    for j in range(0, 10000, 2):
        base = [random.gauss(0, 1) for _ in range(f)]
        first = [1 + b + random.gauss(0, 1e-2) for b in base]
        second = [1 + b + random.gauss(0, 1e-2) for b in base]
        index.add_item(j, first)
        index.add_item(j + 1, second)
    index.build(10)
    for j in range(0, 10000, 2):
        self.assertEqual(index.get_nns_by_item(j, 2), [j, j + 1])
        self.assertEqual(index.get_nns_by_item(j + 1, 2), [j + 1, j])
def _test_holes_base(self, n, f=100, base_i=100000):
    """Helper: index n random vectors at ids starting from base_i and check
    that a by-item query returns exactly that id range.

    Fix: ``assertEquals`` is a deprecated unittest alias (removed in
    Python 3.12); use ``assertEqual``.
    """
    annoy = AnnoyIndex(f)
    for i in range(n):
        annoy.add_item(base_i + i, numpy.random.normal(size=(f,)))
    annoy.build(100)
    res = annoy.get_nns_by_item(base_i, n)
    self.assertEqual(set(res), set([base_i + i for i in range(n)]))
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    """Build a sparse k-NN graph over tf-idf (optionally SVD-reduced) rows.

    Returns a CSR matrix whose entry (q, j) is the Annoy distance from row q
    to its neighbour j.
    """
    tfidf = augmented_tf_idf(user_lemma_matrix)
    # SVD only makes sense when both dimensions exceed the target rank;
    # otherwise just densify the tf-idf matrix.
    if (tfidf.shape[0] <= dimensionality) or (tfidf.shape[1] <= dimensionality):
        X_svd = tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)
    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])
    annoy_index.build(number_of_estimators)

    row, col, data = [], [], []
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors, include_distances=True)
        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)

    text_graph = spsp.coo_matrix(
        (np.array(data, dtype=np.float64),
         (np.array(row, dtype=np.int64), np.array(col, dtype=np.int64))),
        shape=(X_svd.shape[0], X_svd.shape[0]))
    return spsp.csr_matrix(text_graph)
def test_zero_vectors(self):
    """Hamming index handles bitstring vectors and survives save/load.

    Fix: ``assertEquals`` is a deprecated unittest alias (removed in
    Python 3.12); use ``assertEqual``.
    """
    # Mentioned on the annoy-user list
    bitstrings = [
        '0000000000011000001110000011111000101110111110000100000100000000',
        '0000000000011000001110000011111000101110111110000100000100000001',
        '0000000000011000001110000011111000101110111110000100000100000010',
        '0010010100011001001000010001100101011110000000110000011110001100',
        '1001011010000110100101101001111010001110100001101000111000001110',
        '0111100101111001011110010010001100010111000111100001101100011111',
        '0011000010011101000011010010111000101110100101111000011101001011',
        '0011000010011100000011010010111000101110100101111000011101001011',
        '1001100000111010001010000010110000111100100101001001010000000111',
        '0000000000111101010100010001000101101001000000011000001101000000',
        '1000101001010001011100010111001100110011001100110011001111001100',
        '1110011001001111100110010001100100001011000011010010111100100111',
    ]
    vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]
    f = 64
    idx = AnnoyIndex(f, 'hamming')
    for i, v in enumerate(vectors):
        idx.add_item(i, v)
    idx.build(10)
    idx.save('idx.ann')
    idx = AnnoyIndex(f, 'hamming')
    idx.load('idx.ann')
    js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
    self.assertEqual(js[0], 0)
    self.assertEqual(ds[:4], [0, 1, 1, 22])
def create_walks(df,index_file,patient_dict_file,index_dict_file,n_neighbors = 25,walks_per_patient=10,walk_size=50,out_dir="./"): index = AnnoyIndex(df.shape[1]) index.load(index_file) patient_dict = {} for key, val in csv.reader(open(patient_dict_file)): patient_dict[key] = int(val) index_dict = {} for key, val in csv.reader(open(index_dict_file)): index_dict[int(key)] = val print("Computing nearest-neighbors...") neighbor_dict = {} for i in range(index.get_n_items()): if i % 1000 == 0: print str(i) patient_id = index_dict[i] neighbors = index.get_nns_by_item(i=i, n=n_neighbors, search_k=-1, include_distances=False) neighbor_ids = [index_dict[x] for x in neighbors] neighbor_dict[patient_id] = neighbor_ids f = open(out_dir+"patient_walks.txt", 'wb') for i in range(index.get_n_items()): if i % 1000 == 0: print str(i) patient_id = index_dict[i] patient_sentences = "" for j in range(walks_per_patient): sentence = generate_sentence(start=patient_id,neighbor_dict=neighbor_dict, n_neighbors=n_neighbors,walk_size=walk_size) patient_sentences = sentence + "\n" ## Write it ## f.write(patient_sentences)
def test_get_lots_of_nns(self):
    # Asking for vastly more neighbours than stored items must just return
    # every item (here: the single item 0), consistently across calls.
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
    i.build(10)
    for j in xrange(100):
        self.assertEqual(i.get_nns_by_item(0, 999999999), [0])
def do(indextype):
    # Load the prebuilt 8-d index for the given index type and dump the
    # top-10 neighbours of a fixed set of query items, one tab-separated
    # "query<TAB>n1,n2,..." row each. (Python 2: print >> redirection.)
    a = AnnoyIndex(8, indextype[0])
    a.load('points.%s.annoy' % indextype)
    with open('points.%s.ann.txt' % indextype, 'w') as out:
        for q_index in [1443, 1240, 818, 1725, 1290, 2031, 1117, 1211, 1902, 603]:
            nns = a.get_nns_by_item(q_index, 10)
            print >> out, '%s\t%s' % (q_index, ','.join([str(n) for n in nns]))
def test_large_index(self):
    # Generate pairs of random points where the pair is super close
    f = 10
    i = AnnoyIndex(f)
    for j in xrange(0, 10000, 2):
        p = [random.gauss(0, 1) for z in xrange(f)]
        f1 = random.random() + 1
        f2 = random.random() + 1
        # Each pair is two noisy scalings of the same direction, so under
        # the default angular metric they should be mutual nearest
        # neighbours.
        x = [f1 * pi + random.gauss(0, 1e-2) for pi in p]
        y = [f2 * pi + random.gauss(0, 1e-2) for pi in p]
        i.add_item(j, x)
        i.add_item(j+1, y)
    i.build(10)
    for j in xrange(0, 10000, 2):
        self.assertEquals(i.get_nns_by_item(j, 2), [j, j+1])
        self.assertEquals(i.get_nns_by_item(j+1, 2), [j+1, j])
def t1est_large_index(self):
    # NOTE(review): "t1est_" prefix hides this from unittest discovery;
    # presumably disabled on purpose -- confirm before renaming.
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    # Generate pairs of random points where the pair is super close
    f = 10
    i = AnnoyIndex(f, 10, "test_db", 10, 1000, 3048576000)
    for j in xrange(0, 10000, 2):
        p = [random.gauss(0, 1) for z in xrange(f)]
        f1 = random.random() + 1
        f2 = random.random() + 1
        x = [f1 * pi + random.gauss(0, 1e-2) for pi in p]
        y = [f2 * pi + random.gauss(0, 1e-2) for pi in p]
        i.add_item(j, x)
        i.add_item(j+1, y)
    # No explicit build() call here -- this AnnoyIndex variant appears to
    # index incrementally on add_item (confirm against its API).
    for j in xrange(0, 10000, 2):
        self.assertEqual(i.get_nns_by_item(j, 2), [j, j+1])
        self.assertEqual(i.get_nns_by_item(j+1, 2), [j+1, j])
def test_include_dists_check_ranges(self):
    # Distances returned for the default (angular) metric must stay below
    # 2.0, and the query item's own distance must be ~0.
    f = 3
    i = AnnoyIndex(f)
    for j in xrange(100000):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    indices, dists = i.get_nns_by_item(0, 100000, include_distances=True)
    self.assertTrue(max(dists) < 2.0)
    self.assertAlmostEqual(min(dists), 0.0)
def test_get_nns_by_item(self):
    """Euclidean nearest neighbours on three points of a 2-d grid."""
    dims = 2
    index = AnnoyIndex(dims, 'euclidean')
    points = {0: [2, 2], 1: [3, 2], 2: [3, 3]}
    for item_id, point in points.items():
        index.add_item(item_id, point)
    index.build(10)
    self.assertEqual(index.get_nns_by_item(0, 3), [0, 1, 2])
    self.assertEqual(index.get_nns_by_item(2, 3), [2, 1, 0])
def test_get_nns_by_item(self):
    """Angular by-item queries on a fixed 3-point index.

    Fix: ``assertEquals`` is a deprecated unittest alias (removed in
    Python 3.12); use ``assertEqual``.
    """
    f = 3
    i = AnnoyIndex(f)
    i.add_item(0, [2,1,0])
    i.add_item(1, [1,2,0])
    i.add_item(2, [0,0,1])
    i.build(10)
    self.assertEqual(i.get_nns_by_item(0, 3), [0,1,2])
    self.assertEqual(i.get_nns_by_item(1, 3), [1,0,2])
def test_get_nns_search_k(self):
    """An explicit search_k must still give exact results on a tiny index."""
    index = AnnoyIndex(3)
    basis = ([0, 0, 1], [0, 1, 0], [1, 0, 0])
    for item_id, vector in enumerate(basis):
        index.add_item(item_id, vector)
    index.build(10)
    self.assertEqual(index.get_nns_by_item(0, 3, 10), [0, 1, 2])
    self.assertEqual(index.get_nns_by_vector([3, 2, 1], 3, 10), [2, 1, 0])
def test_include_dists(self):
    """A vector and its negation: euclidean self-distance must be ~0."""
    f = 40
    index = AnnoyIndex(f, 'euclidean')
    v = numpy.random.normal(size=f)
    index.add_item(0, v)
    index.add_item(1, -v)
    index.build(10)
    indices, dists = index.get_nns_by_item(0, 2, 10, True)
    self.assertEqual(indices, [0, 1])
    self.assertAlmostEqual(dists[0], 0.0)
def test_include_dists(self):
    # Double checking issue 112: under the default angular metric a vector
    # and its negation are at the maximum distance, 2.0.
    f = 40
    index = AnnoyIndex(f)
    v = numpy.random.normal(size=f)
    index.add_item(0, v)
    index.add_item(1, -v)
    index.build(10)
    indices, dists = index.get_nns_by_item(0, 2, 10, True)
    self.assertEqual(indices, [0, 1])
    self.assertAlmostEqual(dists[0], 0.0)
    self.assertAlmostEqual(dists[1], 2.0)
def test_distance_consistency(self):
    """Manhattan: reported distances must agree with get_distance and with
    two direct L1 computations."""
    n, f = 1000, 3
    index = AnnoyIndex(f, 'manhattan')
    for item_id in range(n):
        index.add_item(item_id, numpy.random.normal(size=f))
    index.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, index.get_distance(a, b))
            u = numpy.array(index.get_item_vector(a))
            v = numpy.array(index.get_item_vector(b))
            self.assertAlmostEqual(dist, numpy.sum(numpy.fabs(u - v)))
            self.assertAlmostEqual(dist, sum(abs(float(x) - float(y)) for x, y in zip(u, v)))
def test_distance_consistency(self):
    """Dot metric: reported values must equal both the raw dot product and
    get_distance for every neighbour."""
    n, f = 1000, 3
    index = AnnoyIndex(f, 'dot')
    for item_id in range(n):
        index.add_item(item_id, numpy.random.normal(size=f))
    index.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            expected = numpy.dot(index.get_item_vector(a), index.get_item_vector(b))
            self.assertAlmostEqual(dist, expected)
            self.assertEqual(dist, index.get_distance(a, b))
def test_distance_consistency(self):
    # Euclidean: reported distances must match get_distance and two direct
    # computations of the L2 norm. (Python 2: xrange.)
    n, f = 1000, 3
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, i.get_distance(a, b))
            u = numpy.array(i.get_item_vector(a))
            v = numpy.array(i.get_item_vector(b))
            # self.assertAlmostEqual(dist, euclidean(u, v))
            self.assertAlmostEqual(dist, numpy.dot(u - v, u - v) ** 0.5)
            self.assertAlmostEqual(dist, sum([(x-y)**2 for x, y in zip(u, v)])**0.5)
def test_save_load(self):
    """Hamming index results must be identical after save()/load().

    Fix: ``assertEquals`` is a deprecated unittest alias (removed in
    Python 3.12); use ``assertEqual``.
    """
    f = 100
    i = AnnoyIndex(f, 'hamming')
    u = numpy.random.binomial(1, 0.5, f)
    v = numpy.random.binomial(1, 0.5, f)
    i.add_item(0, u)
    i.add_item(1, v)
    i.build(10)
    i.save('blah.ann')
    j = AnnoyIndex(f, 'hamming')
    j.load('blah.ann')
    rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
    self.assertEqual(rs, [0, 1])
    self.assertAlmostEqual(ds[0], 0)
    # For 0/1 vectors the Hamming distance equals the squared Euclidean one.
    self.assertAlmostEqual(ds[1], numpy.dot(u-v, u-v))
def run(self):
    """Worker loop: stream k-NN results for our slice of rows into the
    results queue; record any failure on self.exception."""
    try:
        index = AnnoyIndex(self.n_dims, metric='angular')
        index.load(self.index_filepath)
        first, last = self.data_indices
        for row_index in range(first, last):
            neighbours = index.get_nns_by_item(
                row_index, self.k, search_k=self.search_k, include_distances=False)
            self.results_queue.put(
                IndexNeighbours(row_index=row_index,
                                neighbour_list=np.array(neighbours, dtype=np.uint32)))
    except Exception as e:
        self.exception = e
    finally:
        # Always close the queue so the consumer can finish.
        self.results_queue.close()
def test_distance_consistency(self):
    # Manhattan: reported distances must match get_distance and two direct
    # L1 computations. (Python 2: xrange.)
    n, f = 1000, 3
    i = AnnoyIndex(f, 'manhattan')
    for j in xrange(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, i.get_distance(a, b))
            u = numpy.array(i.get_item_vector(a))
            v = numpy.array(i.get_item_vector(b))
            self.assertAlmostEqual(dist, numpy.sum(numpy.fabs(u - v)))
            self.assertAlmostEqual(
                dist, sum([abs(float(x) - float(y)) for x, y in zip(u, v)]))
def debug():
    # Minimal end-to-end Annoy example: build, save, mmap-load, query.
    f = 40
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    for i in xrange(1000):
        v = [random.gauss(0, 1) for z in xrange(f)]
        t.add_item(i, v)
    t.build(10) # 10 trees
    t.save('test.ann')
    # ...
    u = AnnoyIndex(f)
    u.load('test.ann') # super fast, will just mmap the file
    print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors
def test_build_sparse_annoy_index(annoy_index_file):
    """build_annoy_index must persist an index equivalent to a fresh load."""
    dense = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(csr_matrix(dense), annoy_index_file)
    assert os.path.exists(annoy_index_file)
    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)
    # The in-memory and reloaded indexes must agree on shape and queries.
    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)
    index.unload()
    loaded_index.unload()
def get_nearest_neighbor_and_similarity(predictions, image_count, master_image, save_path, model):
    """Index the prediction vectors with Annoy and return the cosine
    similarities of ``master_image`` to the neighbours of the last query,
    plus those neighbour ids.

    Fix: the inner loop over ``nearest_neighbors`` reused the outer counter
    variable ``j``, clobbering the query index and breaking iteration over
    ``predictions``; the inner loop now has its own variable.
    """
    # Embedding dimensionality depends on the backbone model.
    dims = 1000
    if model == Models.VGG19_MODEL:
        dims = 4096
    elif model == Models.INCEPTION_3:
        dims = 1000
    elif model == Models.INCEPTION_RESNET_V2:
        dims = 25088
    n_nearest_neighbors = image_count + 1
    trees = 10000
    file_index_to_file_vector = {}
    # Build an index (Approximate Nearest Neighbours)
    t = AnnoyIndex(dims)
    i = 0
    for l in predictions:
        file_vector = predictions[i]
        file_index_to_file_vector[i] = file_vector
        t.add_item(i, file_vector)
        i += 1
    t.build(trees)
    # t.save(save_path)
    j = 0
    for o in predictions:
        master_vector = file_index_to_file_vector[j]  # Here we assign master vector, SHOULD be one K
        named_nearest_neighbors = []
        similarities = []
        nearest_neighbors = t.get_nns_by_item(j, n_nearest_neighbors)
        j += 1
        for neighbor in nearest_neighbors:
            neighbor_vector = predictions[neighbor]
            # Similarity = 1 - cosine distance against the provided master image.
            # similarity = 1 - spatial.distance.cosine(master_vector, neighbor_vector)
            similarity = 1 - spatial.distance.cosine(master_image, neighbor_vector)
            rounded_similarity = int((similarity * 10000)) / 10000.0
            similarities.append(rounded_similarity)
    return similarities, nearest_neighbors
def get_similar_items(self, product_id: int, rec_type: int) -> pd.DataFrame:
    '''
    Function that creates recommendation lists.
    The intuition behind using less components is reducing the number of
    latent factors that can be inferred. And, by excluding item features for
    the CAB model, recommendations will be less based off explicit features
    such as `aisle` and `department`.
    -------------------
    type:
    1 - Similar Items [DEFAULT_PARAMS]
    2 - Complement Items [CAB_PARAMS]
    '''
    logging.info(
        f'Logging recommendations for {self.model.config.ANNOY_PARAMS[rec_type]}'
    )
    # Load the Annoy index matching the requested recommendation type.
    if rec_type == 1:
        annoy_model = AnnoyIndex(
            self.model.config.LIGHTFM_PARAMS['no_components'])
        annoy_model.load(self.config.PATHS.models + '/item.ann')
    elif rec_type == 2:
        annoy_model = AnnoyIndex(
            self.model.config.LIGHTFM_CAB_PARAMS['no_components'])
        annoy_model.load(self.config.PATHS.models + '/item_cab.ann')
    similar_variants = annoy_model.get_nns_by_item(
        product_id,
        self.model.config.ANNOY_PARAMS['nn_count'],
        search_k=-1,
        include_distances=False)
    logging.info(type(similar_variants))
    logging.info(similar_variants)
    similar_variants_df = self.item_df.iloc[similar_variants, :]
    similarVariantsTable = PrettyTable(
        ['product_id', 'product_name', 'aisle', 'department', 'num'])
    # NOTE(review): add_row receives whole Series objects (one per column),
    # not per-row scalars -- confirm this renders as intended.
    similarVariantsTable.add_row([
        similar_variants_df['product_id'],
        similar_variants_df['product_name'],
        similar_variants_df['aisle'],
        similar_variants_df['department'],
        similar_variants_df['num']
    ])
    logging.info(
        f'{self.model.config.ANNOY_PARAMS[rec_type]} Data: \n{similarVariantsTable}'
    )
    return similar_variants_df
def test_holes_more(self):
    """Queries must never return ids that were left as holes."""
    f = 10
    index = AnnoyIndex(f)
    valid_indices = random.sample(range(2000), 1000)  # leave holes
    for i in valid_indices:
        index.add_item(i, numpy.random.normal(size=(f,)))
    index.build(10)
    for i in valid_indices:
        for j in index.get_nns_by_item(i, 10000):
            self.assertTrue(j in valid_indices)
    for _ in range(1000):
        probe = numpy.random.normal(size=(f,))
        for j in index.get_nns_by_vector(probe, 10000):
            self.assertTrue(j in valid_indices)
def test_random_holes(self):
    """Random id gaps: neither by-item nor by-vector queries may return an
    id that was never added."""
    f = 10
    index = AnnoyIndex(f)
    valid_indices = random.sample(range(2000), 1000)  # leave holes
    for i in valid_indices:
        index.add_item(i, numpy.random.normal(size=(f,)))
    index.build(10)
    for i in valid_indices:
        for j in index.get_nns_by_item(i, 10000):
            self.assertTrue(j in valid_indices)
    for _ in range(1000):
        probe = numpy.random.normal(size=(f,))
        for j in index.get_nns_by_vector(probe, 10000):
            self.assertTrue(j in valid_indices)
def main(): """ Main function """ # Building a k-nearest neighbor graph using annoy and cosine distance annoy = AnnoyIndex(len(DATA.columns), metric="angular") annoy_graph = [] for i, v in enumerate(DATA.values): annoy.add_item(i, v) annoy.build(10) for i in range(len(DATA)): for j in annoy.get_nns_by_item(i, 10): annoy_graph.append( (i, j, cosine_distance(DATA.values[i], DATA.values[j]))) # Creating the tmap layout x, y, s, t, _ = tm.layout_from_edge_list(len(DATA), annoy_graph) faerun = Faerun(view="front", coords=False) faerun.add_scatter( "MINIBOONE", { "x": x, "y": y, "c": LABELS, "labels": LABELS }, shader="smoothCircle", colormap="Set1", point_scale=2.0, max_point_size=20, has_legend=True, categorical=True, legend_labels={(0, "Noise"), (1, "Signal")}, ) faerun.add_tree( "MINIBOONE_tree", { "from": s, "to": t }, point_helper="MINIBOONE", color="#666666", ) faerun.plot("miniboone", template="default")
def test1(self):
    # Load a prebuilt 768-d (BERT-sized) sentence-embedding index and print
    # the 5 nearest articles to row 10.
    rows = self.query_country_name('%')
    annoyIndex = AnnoyIndex(768)
    # Index-building code kept for reference; the index is loaded from disk.
    # for i,row in enumerate(rows):
    #     encode=self.bc.encode([row[1]])
    #     annoyIndex.add_item(i,encode[0])
    # annoyIndex.build(10)
    # annoyIndex.save('articles')
    annoyIndex.load('articles')
    result, index = annoyIndex.get_nns_by_item(10, 5, include_distances=True)
    print(rows[10])
    # NOTE(review): `index` holds angular distances; np.cos of a distance is
    # not a cosine similarity -- confirm this transformation is intended.
    print(np.cos(index))
    for i in result:
        print(rows[i])
def test_distance_consistency(self):
    # Euclidean: reported distances must match get_distance and two direct
    # L2 computations. (Python 2: xrange.)
    n, f = 1000, 3
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, i.get_distance(a, b))
            u = numpy.array(i.get_item_vector(a))
            v = numpy.array(i.get_item_vector(b))
            # self.assertAlmostEqual(dist, euclidean(u, v))
            self.assertAlmostEqual(dist, numpy.dot(u - v, u - v)**0.5)
            self.assertAlmostEqual(
                dist, sum([(x - y)**2 for x, y in zip(u, v)])**0.5)
class Index:
    """Wraps an Annoy dot-product index over precomputed image vectors,
    together with image file names and optional per-image descriptions."""

    def __init__(self, directory):
        # All data files live side by side inside `directory`.
        print('loading index from "%s"' % directory)
        self.vector_filename = directory + '/vectors.npy'
        self.index_filename = self.vector_filename + '.ann'
        self.images_filename = directory + '/names.txt'
        self.description_filename = directory + '/descriptions.txt'
        self.image_directory = directory + '/images'
        # mmap the vectors so large files need not fit in memory.
        self.vectors = np.load(self.vector_filename, mmap_mode='r')
        dim = self.vectors.shape[1]
        if not os.path.isfile(self.index_filename):
            # Build the index once and cache it next to the vectors.
            print('building index')
            index = AnnoyIndex(dim, metric='dot')
            for i in range(self.vectors.shape[0]):
                index.add_item(i, self.vectors[i])
                if i % 1000 == 0:
                    print(i)
            index.build(20)
            index.save(self.index_filename)
        self.index = AnnoyIndex(dim, metric='dot')
        self.index.load(self.index_filename)
        # names.txt: one image per line, first tab-separated field is the name.
        with open(self.images_filename) as fp:
            self.images = [x.strip().split('\t')[0] for x in fp.readlines()]
        # defaultdict(str): ids without a description yield ''.
        self.descriptions = collections.defaultdict(str)
        if os.path.exists(self.description_filename):
            with open(self.description_filename) as fp:
                for i, line in enumerate(fp.readlines()):
                    self.descriptions[i] = line.strip()

    def closest(self, i, n=10):
        # Returns [(score, index), ...] with the query itself prepended at a
        # fixed score of 1; the remaining pairs are (distance, id) swapped
        # from Annoy's (ids, distances) output.
        return [(1, i)] + list(
            map(lambda x: (x[1], x[0]),
                zip(*self.index.get_nns_by_item(i, n, include_distances=True))))

    def image(self, i):
        # Path of image i, or None when the id is out of range.
        if i >= 0 and i < len(self.images):
            return self.image_directory + '/' + self.images[i]
        return None

    def description(self, i):
        # Unknown ids yield '' via the defaultdict.
        return self.descriptions[i]
def nearest_neighbors(collection, num_neighbors=10, n_trees=100):
    """
    Finds the num_neighbors nearest neighbors to each cell in the sparse matrix

    Return result is a dictionary of lists, where the key is an index into the
    cells, and the value is the neighbors of that cell
    """
    nn_idx = AnnoyIndex(collection.num_genes())
    # Add the elements in reverse order because Annoy allocates the memory
    # based on the value of the element added - so adding in increasing order
    # will trigger lots of allocations
    for cell in range(collection.num_cells() - 1, -1, -1):
        nn_idx.add_item(cell, collection.get_cell_expression_vector(cell))
    nn_idx.build(n_trees)
    neighbours = {}
    for cell in range(collection.num_cells()):
        neighbours[cell] = nn_idx.get_nns_by_item(cell, num_neighbors)
    return neighbours
def test_distance_consistency(self):
    # Angular: the reported distance must match get_distance and the L2
    # distance between the unit-normalised vectors. (Python 2: xrange.)
    n, f = 1000, 3
    i = AnnoyIndex(f)
    for j in xrange(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, i.get_distance(a, b))
            u = i.get_item_vector(a)
            v = i.get_item_vector(b)
            u_norm = numpy.array(u) * numpy.dot(u, u)**-0.5
            v_norm = numpy.array(v) * numpy.dot(v, v)**-0.5
            # cos = numpy.clip(1 - cosine(u, v), -1, 1) # scipy returns 1 - cos
            self.assertAlmostEqual(dist, numpy.dot(u_norm - v_norm, u_norm - v_norm) ** 0.5)
            # self.assertAlmostEqual(dist, (2*(1 - cos))**0.5)
            self.assertAlmostEqual(dist, sum([(x-y)**2 for x, y in zip(u_norm, v_norm)])**0.5)
def nearest_neighbor_search(self, GE_csc):
    """Return (indices, distances) of each row's 2*num_of_neighbor angular
    nearest neighbours."""
    K = self.num_of_neighbor * 2
    n, d = GE_csc.shape
    builder = AnnoyIndex(d, "angular")
    for row in range(n):
        builder.add_item(row, GE_csc[row, :])
    builder.build(100)
    builder.save('test.ann')
    # Reload via mmap, then drop the temporary file.
    u = AnnoyIndex(d, "angular")
    u.load('test.ann')
    os.remove('test.ann')
    val = np.zeros((n, K))
    ind = np.zeros((n, K))
    for row in range(n):
        neighbours, distances = u.get_nns_by_item(row, K, include_distances=True)
        ind[row, :] = neighbours
        val[row, :] = distances
    return ind.astype('int'), val
def cross_community_search(save_name, dimension=8):
    # For each item node (ids 1682..2623), find nearby user nodes in the
    # embedding space, turn shortest-path weights into rating predictions,
    # evaluate, and append the metrics to a CSV.
    predictions = []
    t = AnnoyIndex(dimension, metric='euclidean')
    t.load(saved_objs_path + '/' + save_name)
    user_threshold = 1682  # ids above this are treated as non-user nodes
    close_nb_amount = 30
    actual_nb_amount = 20  # NOTE(review): unused -- confirm intent
    ds = get_dataset()
    g = ds.original_graph
    for src_node in range(1682, 1682 + 942):
        nn_nodes, distances = t.get_nns_by_item(src_node, close_nb_amount, include_distances=True)
        each_user_close_nb = {}  # NOTE(review): never populated
        for idx, dst_node in enumerate(nn_nodes):
            # remove non-user node
            if dst_node - user_threshold > 0:
                continue
            # cal weigh of each path
            paths = nx.shortest_path(g, source=src_node, target=dst_node, weight='score')
            path_length = nx.shortest_path_length(g, source=src_node, target=dst_node, weight='score')
            if path_length == 0:
                continue
            print(idx, paths, path_length)
            # Predicted rating = average edge weight along the path; the
            # real rating is the first hop's score.
            real_rating = g[src_node][paths[1]]['score']
            pred_rating = path_length / len(paths)
            pred = Prediction(src_node, paths[1], real_rating, pred_rating, {})
            predictions.append(pred)
    rmse, prec, rec, ils_sim = evaluate_pred(g, predictions)
    ds_name = 'movielens'
    algo_name = 'GraphRec'
    with open(f'eval_{ds_name}.csv', 'a') as f:
        f.write(
            f'{ds_name}_{algo_name},rmse,{rmse},precision,{prec},recall,{rec},ils,{ils_sim}\n'
        )
def annoy_search(ref_index, features, tree_size, recommended_item_size, _metric):
    """Build an Annoy index over `features` and return the ids and distances
    of the items closest to `ref_index`, plus the elapsed wall-clock time."""
    start = time.time()
    index = df.loc[ref_index]['features']
    f = len(index)
    t = AnnoyIndex(f, metric=_metric)
    print("Metric use: ", _metric)
    for position, feature in enumerate(features):
        t.add_item(position, feature)
    t.build(tree_size, n_jobs=-1)
    similar_img_ids, distances = t.get_nns_by_item(
        ref_index, recommended_item_size, include_distances=True)
    annoy_runtime = time.time() - start
    print("ANNOY Runtime: ", annoy_runtime)
    return similar_img_ids, distances, annoy_runtime
def search(self, seed, k=5):
    """Return the k closest neighbours of `seed` in the word2vec space.

    seed: seed item to find nearest neighbors for
    k: number of closest neighbors
    """
    annoy_index = AnnoyIndex(self.dimension, 'angular')
    vocabulary = self.data['user'].unique().tolist() + self.data['item'].unique().tolist()
    encoder = preprocessing.LabelEncoder()
    encoder.fit(vocabulary)
    # Use the label-encoded id as the Annoy item id so results can be
    # decoded back to the original words.
    for word in vocabulary:
        annoy_index.add_item(encoder.transform([word])[0], self.w2v_model.wv[word])
    annoy_index.build(-1)
    neighbour_codes = annoy_index.get_nns_by_item(encoder.transform([seed])[0], k)
    return encoder.inverse_transform(neighbour_codes)
def generate_neighbors(self, neighbors, **kwargs):
    """Populate self.neighbor_map with the ids of each sample's nearest
    neighbours in the one-dimensional sample space."""
    # Extract parameters if provided in kwargs.
    metric = kwargs.get('metric', 'euclidean')
    num_trees = kwargs.get('num_trees', 10)
    dimension = 1  # Length of item vector that will be indexed
    tree = AnnoyIndex(dimension, metric)
    for sample_id in range(self.num_samples):
        tree.add_item(sample_id, [self.samples[sample_id].item()])
    tree.build(num_trees)
    neighbor_map = np.zeros((self.num_samples, neighbors))
    for sample_id in range(self.num_samples):
        neighbor_map[sample_id, :] = tree.get_nns_by_item(sample_id, neighbors)
    self.neighbor_map = neighbor_map.astype(int)
def representative_sample(X, num_samples, save=False):
    """Sample vectors in X, prefering edge cases and vectors farthest
    from other vectors in sample set

    Fixes:
      * reloading the saved index used ``idxfile.name``, which is undefined
        when ``save`` is a path string; use ``idxfilename`` instead.
      * ``len(num_nns)`` raised TypeError (``num_nns`` is an int); compare
        the int directly and keep it an int when growing it.
      * the ``except`` branch was a bare no-op expression with a bare
        ``except:``; catch IndexError explicitly and leave the -1 sentinel.
    """
    X = X.values if hasattr(X, 'values') else np.array(X)
    N, M = X.shape
    rownums = np.arange(N)
    np.random.shuffle(rownums)
    idx = AnnoyIndex(M)
    for i, row in enumerate(X):
        idx.add_item(i, row)
    idx.build(int(np.log2(N)) + 1)
    if save:
        if isinstance(save, (bytes, str)):
            idxfilename = save
        else:
            idxfile = tempfile.NamedTemporaryFile(delete=False)
            idxfile.close()
            idxfilename = idxfile.name
        idx.save(idxfilename)
        idx = AnnoyIndex(M)
        idx.load(idxfilename)
    samples = -1 * np.ones(shape=(num_samples, ), dtype=int)
    samples[0] = rownums[0]
    # FIXME: some integer determined by N and num_samples and distribution
    j, num_nns = 0, min(1000, int(num_samples / 2. + 1))
    for i in rownums:
        if i in samples:
            continue
        nns = idx.get_nns_by_item(i, num_nns)
        # FIXME: pick vector furthest from past K (K > 1) points or outside of
        # a hypercube (sized to uniformly fill the space) around the last sample
        try:
            samples[j + 1] = np.setdiff1d(nns, samples)[-1]
        except IndexError:
            pass  # no unseen neighbour found; keep the -1 sentinel
        if num_nns < num_samples / 3.:
            num_nns = min(N, int(1.3 * num_nns))
        j += 1
    return samples
def test_holes_more(self):
    """Index ids with gaps (i * 2**-0.5, truncated); queries must only ever
    return ids that were actually added."""
    f = 10
    index = AnnoyIndex(f)
    valid_indices = set()
    for i in range(1000):
        i2 = int(i * 2 ** -0.5)  # leave holes every few items
        valid_indices.add(i2)
        index.add_item(i2, numpy.random.normal(size=(f,)))
    index.build(10)
    for i in valid_indices:
        for j in index.get_nns_by_item(i, 10000):
            self.assertTrue(j in valid_indices)
    for _ in range(1000):
        probe = numpy.random.normal(size=(f,))
        for j in index.get_nns_by_vector(probe, 10000):
            self.assertTrue(j in valid_indices)
def get_hardest_negatives(samples_data, train_index, dim):
    """For each sample, find the nearest question belonging to a *different*
    table and emit it as a label-0 (negative) training example."""
    index = AnnoyIndex(dim, 'angular')
    index.load(train_index)
    hardest_negatives = []
    for position, sample in enumerate(samples_data):
        cur_id = sample['table_id']
        for close_index in index.get_nns_by_item(position, 1000):
            close_sample = samples_data[close_index]
            if cur_id != close_sample['table_id']:
                hardest_negatives.append({
                    'table_id': cur_id,
                    'question_tokens': close_sample['question_tokens'],
                    'label': 0.0
                })
                break  # only the single hardest negative per sample
    return hardest_negatives
def test_get_nns_with_distances(self):
    """Manhattan distances accompany results when requested."""
    index = AnnoyIndex(3, 'manhattan')
    for item_id, vector in enumerate(([0, 0, 2], [0, 1, 1], [1, 0, 0])):
        index.add_item(item_id, vector)
    index.build(10)
    l, d = index.get_nns_by_item(0, 3, -1, True)
    self.assertEqual(l, [0, 1, 2])
    for got, want in zip(d, (0.0, 2.0, 3.0)):
        self.assertAlmostEqual(got, want)
    l, d = index.get_nns_by_vector([2, 2, 1], 3, -1, True)
    self.assertEqual(l, [1, 2, 0])
    for got, want in zip(d, (3.0, 4.0, 5.0)):
        self.assertAlmostEqual(got, want)
def test_get_nns_with_distances(self):
    """Euclidean distances accompany results; check their squares against
    the hand-computed values."""
    index = AnnoyIndex(3, 'euclidean')
    for item_id, vector in enumerate(([0, 0, 2], [0, 1, 1], [1, 0, 0])):
        index.add_item(item_id, vector)
    index.build(10)
    l, d = index.get_nns_by_item(0, 3, -1, True)
    self.assertEqual(l, [0, 1, 2])
    for got, want_sq in zip(d, (0.0, 2.0, 5.0)):
        self.assertAlmostEqual(got**2, want_sq)
    l, d = index.get_nns_by_vector([2, 2, 2], 3, -1, True)
    self.assertEqual(l, [1, 0, 2])
    for got, want_sq in zip(d, (6.0, 8.0, 9.0)):
        self.assertAlmostEqual(got**2, want_sq)
def test_get_nns_with_distances(self):
    """Euclidean distances (checked via their squares) for by-item and
    by-vector queries.

    Fix: ``assertEquals``/``assertAlmostEquals`` are deprecated unittest
    aliases (removed in Python 3.12); use the canonical names.
    """
    f = 3
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [0, 0, 2])
    i.add_item(1, [0, 1, 1])
    i.add_item(2, [1, 0, 0])
    i.build(10)
    l, d = i.get_nns_by_item(0, 3, -1, True)
    self.assertEqual(l, [0, 1, 2])
    self.assertAlmostEqual(d[0]**2, 0.0)
    self.assertAlmostEqual(d[1]**2, 2.0)
    self.assertAlmostEqual(d[2]**2, 5.0)
    l, d = i.get_nns_by_vector([2, 2, 2], 3, -1, True)
    self.assertEqual(l, [1, 0, 2])
    self.assertAlmostEqual(d[0]**2, 6.0)
    self.assertAlmostEqual(d[1]**2, 8.0)
    self.assertAlmostEqual(d[2]**2, 9.0)
def generate_pair(X, n_neighbors, n_MN, n_FP, distance='euclidean', verbose=True):
    '''Generate pairs for the dataset.
    '''
    n, dim = X.shape
    # sample more neighbors than needed
    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    tree = AnnoyIndex(dim, metric=distance)
    if _RANDOM_STATE is not None:
        tree.set_seed(_RANDOM_STATE)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)
    option = distance_to_option(distance=distance)
    nbrs = np.zeros((n, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_neighbors_extra), dtype=np.float32)
    for i in range(n):
        # Ask for one extra neighbour and drop the first entry, which is
        # expected to be the query point itself.
        nbrs_ = tree.get_nns_by_item(i, n_neighbors_extra + 1)
        nbrs[i, :] = nbrs_[1:]
        for j in range(n_neighbors_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    print_verbose("Found nearest neighbor", verbose)
    # Per-point bandwidth from the 4th-6th neighbour distances, floored to
    # avoid division by zero downstream.
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    print_verbose("Calculated sigma", verbose)
    scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)
    pair_neighbors = sample_neighbors_pair(X, scaled_dist, nbrs, n_neighbors)
    # Mid-near (MN) and far-pair (FP) sampling switch to deterministic
    # variants when a global random state is configured.
    if _RANDOM_STATE is None:
        pair_MN = sample_MN_pair(X, n_MN, option)
        pair_FP = sample_FP_pair(X, pair_neighbors, n_neighbors, n_FP)
    else:
        pair_MN = sample_MN_pair_deterministic(X, n_MN, _RANDOM_STATE, option)
        pair_FP = sample_FP_pair_deterministic(X, pair_neighbors, n_neighbors, n_FP, _RANDOM_STATE)
    return pair_neighbors, pair_MN, pair_FP, tree
def KNN_Annoy(X, KK):
    """Approximate KK-nearest-neighbour search over the rows of X via Annoy.

    Parameters: X is an (n_samples, n_features) array; KK is the number of
    neighbours to return per row (must not exceed the feature dimension).
    Returns (indices, distances), each of shape (n_samples, KK); note each
    row's neighbour list includes the row itself as the closest hit.
    Raises ValueError when KK exceeds the second dimension of X.
    """
    n_samples, n_features = X.shape
    if KK > n_features:
        raise ValueError("KK should be less than 2th-dim of X")

    index = AnnoyIndex(n_features, metric='euclidean')
    for row_id, row in enumerate(X):
        index.add_item(row_id, row)
    index.build(100)

    neighbor_ids = [index.get_nns_by_item(i, KK) for i in range(n_samples)]
    neighbor_dists = [
        [index.get_distance(i, j) for j in ids]
        for i, ids in enumerate(neighbor_ids)
    ]
    return np.array(neighbor_ids), np.array(neighbor_dists)
async def query_index(request, index_name):
    """Return the `count` nearest neighbours of item `id` in the named index.

    The index is lazily loaded from DATA_PATH/<index_name>.ann on first use
    and cached in the module-level `loaded` dict.
    Query args: `id` (required, item id) and `count` (optional, default 10).

    FIX: `count` arrives from the query string as text, but Annoy's
    get_nns_by_item requires an int — it is now converted explicitly.
    Also renamed the local `id` so it no longer shadows the builtin.
    """
    index = loaded.get(index_name)
    if index is None:
        dimensions = 10  # all served indexes are built with 10 dimensions
        index = AnnoyIndex(dimensions)
        file = (DATA_PATH / index_name).with_suffix('.ann')
        index.load(str(file.absolute()))
        loaded[index_name] = index
    item_id = int(request.args['id'][0])
    count = request.args.get('count')
    count = 10 if count is None else int(count)
    items = index.get_nns_by_item(item_id, count)
    result = {'items': items}
    return json(result)
def find_closest_songs(self, song_id, id_songname_dict):
    """Print the names of the 10 songs most similar to `song_id`.

    Similarity is angular distance between per-song latent factors read from
    Metadata\\song_id_to_prediction.txt (a JSON mapping of song id ->
    prediction vector). `id_songname_dict` maps song ids to display names.
    """
    # NOTE(review): layer_outputs and get_song_embeddings are built but never
    # used in this method — possibly leftovers from an embedding-extraction
    # path; confirm before removing.
    layer_outputs = []
    get_song_embeddings = Model(inputs=self.model.input, outputs=self.model.get_layer(index=13).output)
    with open('Metadata\\song_id_to_prediction.txt') as f:
        song_id_to_prediction = json.loads(f.read())
    #print(song_id_to_prediction)
    # Parallel tuples: song_ids[i] corresponds to song_predictions[i].
    song_ids, song_predictions = zip(*(song_id_to_prediction.items()))
    # prediction for the given song is the average of all spectrogram latent factors
    #song_predictions = np.array(song_predictions)
    t = AnnoyIndex(self.num_factors, 'angular')
    for i in range(len(song_predictions)):
        # assign each predicted latent factor and index
        t.add_item(i, song_predictions[i])
    t.build(10)
    # Annoy items were keyed by position, so translate song_id -> position.
    closest_songs_indexes = t.get_nns_by_item(song_ids.index(song_id), 10)
    print("The most similar songs to {} are:".format(id_songname_dict[song_id]))
    for index in closest_songs_indexes:
        try:
            print(id_songname_dict[song_ids[index]])
        except KeyError:
            # Skip songs that have predictions but no known name.
            pass
def _random_nn(X):
    # Assign each of self.n_clusters labels to a random neighbourhood of X.
    # NOTE(review): reads `self` and `_get_num_neighbors` from an enclosing
    # scope — this function only makes sense as a closure inside a method.
    idx = AnnoyIndex(X.shape[1], 'euclidean')
    for i in range(X.shape[0]):
        idx.add_item(i, X[i])
    logging.info("building an index with %d items" % X.shape[0])
    idx.build(50)
    logging.info("finding %d neighbor groups" % self.n_clusters)
    seen = {}      # point index -> cluster label
    label = 0
    guess = np.random.randint(X.shape[0])   # first center chosen uniformly
    centers = {guess: 0}
    while label < self.n_clusters:
        # Label the current center and its approximate neighbours.
        neighbors = idx.get_nns_by_item(guess, _get_num_neighbors())
        for point in neighbors:
            seen[point] = label
        seen[guess] = label
        # find a distant point: sample the next center with probability
        # proportional to softmax of its average distance to existing centers.
        dists = np.array([[idx.get_distance(i, j) for i in centers] for j in range(X.shape[0])])
        avg_dists = np.average(dists, axis=1)
        dist_prob = softmax(avg_dists)
        guess = np.random.choice(X.shape[0], p=dist_prob)
        # Re-draw until we hit a point not yet assigned to any cluster.
        while guess in seen:
            guess = np.random.choice(X.shape[0], p=dist_prob)
        centers[guess] = label
        label = label + 1
    # Dense label vector; points never visited keep the default label 0.
    y = np.zeros(X.shape[0])
    for k, v in seen.items():
        y[k] = v
    return y
def nearest_neighbor_search(self, GE_csc):
    """Find the 2*self.num_of_neighbor approximate nearest neighbours per row.

    Builds an Annoy index over the rows of GE_csc, round-trips it through a
    pid-stamped temporary file (then deletes the file), and returns
    (indices, distances) arrays of shape (n_rows, 2*num_of_neighbor);
    indices are returned as ints.
    """
    k = self.num_of_neighbor * 2
    n_rows, n_dims = GE_csc.shape

    builder = AnnoyIndex(n_dims)
    for row in range(n_rows):
        builder.add_item(row, GE_csc[row, :])
    builder.build(100)

    print('#######OS PROCESS ID#####')
    print(str(os.getpid()))

    # Persist and reload via a per-process file name, then clean it up.
    ann_file = str(os.getpid()) + 'test.ann'
    builder.save(ann_file)
    searcher = AnnoyIndex(n_dims)
    searcher.load(ann_file)
    os.remove(ann_file)

    val = np.zeros((n_rows, k))
    ind = np.zeros((n_rows, k))
    for row in range(n_rows):
        ids, dists = searcher.get_nns_by_item(row, k, include_distances=True)
        ind[row, :] = ids
        val[row, :] = dists
    return ind.astype('int'), val
def find_nearest(self):
    # Populate self.nearest with each customer's approximate neighbours and
    # distances. Python 2 code; relies on module-level globals: `matrix`
    # (pandas-style, indexed by customer), `num_merchants`, `max_trees`,
    # `num_neighbors`.
    ann = AnnoyIndex(num_merchants)
    for customer in self.customers:
        # NOTE(review): assumes customer ids are usable directly as Annoy
        # item ids (i.e. small non-negative ints) — confirm against caller.
        customer_vector = list(matrix.loc[[customer]])
        ann.add_item(customer, customer_vector)
        if customer % 200 == 0:
            # Progress logging every 200 customers.
            print 'Adding ' + str(customer)
    print "Building"
    # Cap the number of trees at max_trees.
    if len(self.merchantIDs) > max_trees:
        ann.build(max_trees)
    else:
        ann.build(len(self.merchantIDs))
    print "...done"
    for customer in self.customers:
        neighbors = ann.get_nns_by_item(customer, num_neighbors)
        if customer % 200 == 0:
            print "Found neighbors for " + str(customer)
        self.nearest[customer] = []
        for neighbor in neighbors:
            # Annoy returns the query point itself; exclude it.
            if neighbor != customer:
                self.nearest[customer].append((neighbor, ann.get_distance(neighbor, customer)))
def ann_annoy(data, metric='euclidean', n_neighbors=10, trees=10):
    """My Approximate Nearest Neighbors function (ANN)
    using the annoy package.

    Parameters
    ----------
    data : array, shape (n_points, n_dims)
        Points to index and query.
    metric : str, default 'euclidean'
        NOTE(review): currently unused — the AnnoyIndex is built with its
        default metric; pass it through if other metrics are needed.
    n_neighbors : int, default 10
        Neighbors returned per point (each point's own entry is included).
    trees : int, default 10
        Number of trees used to build the Annoy index.

    Returns
    -------
    distVals : array, shape (n_points, n_neighbors)
        Distances to each neighbor.
    idx : array, shape (n_points, n_neighbors)
        Indices of each neighbor.
    """
    datapoints = data.shape[0]
    dimension = data.shape[1]

    # initialize the annoy database
    ann = AnnoyIndex(dimension)

    # store the datapoints
    for (i, row) in enumerate(data):
        ann.add_item(i, row.tolist())

    # build the index
    ann.build(trees)

    # find the k-nearest neighbors for all points
    idx = np.zeros((datapoints, n_neighbors), dtype='int')
    # FIX: np.float was deprecated and removed in NumPy >= 1.24; allocate the
    # distance buffer directly with the builtin float dtype instead.
    distVals = np.zeros((datapoints, n_neighbors), dtype=float)

    # extract the distance values
    for i in range(datapoints):
        idx[i, :] = ann.get_nns_by_item(i, n_neighbors)
        for j in range(n_neighbors):
            distVals[i, j] = ann.get_distance(i, idx[i, j])

    return distVals, idx