def test_metric_kwarg(self):
    # Issue 211
    i = AnnoyIndex(2, metric='euclidean')
    i.add_item(0, [1, 0])
    i.add_item(1, [9, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 8)
    self.assertEqual(i.f, 2)
def test_dist(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    self.assertAlmostEqual(i.get_distance(0, 1), 1.0)
def ANN(searchSpace):
    dimension = searchSpace[0].shape[0]
    t = AnnoyIndex(dimension, metric='euclidean')
    for i in range(len(searchSpace)):
        t.add_item(i, searchSpace[i])
    t.build(10)
    return t
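# A minimal usage sketch for the ANN() helper above. The random search space
# and the neighbor count are illustrative assumptions, not part of the
# original snippet.
import numpy as np

search_space = [np.random.normal(size=8) for _ in range(100)]
index = ANN(search_space)
# ids of the 10 approximate nearest neighbors of item 0, plus their distances
ids, dists = index.get_nns_by_item(0, 10, include_distances=True)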
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
        # print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1
    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES}
    cache_filename = "annoy_index_" \
        + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() \
        + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) \
        + ".tree"
    if not os.path.exists(cache_filename):
        print "Building Annoy index with %d trees" % ANN_NTREES
        # index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print "\tWrote cache to %s" % cache_filename
    else:
        print "\tReading cache from %s" % cache_filename
        index.load(cache_filename)
    return index, mfcc_list
def test_dist_2(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [1000, 0])
    i.add_item(1, [10, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 0)
def test_dist(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    self.assertAlmostEqual(i.get_distance(0, 1), (2 * (1.0 - 2 ** -0.5)) ** 0.5)
def test_zero_vectors(self):
    # Mentioned on the annoy-user list
    bitstrings = [
        '0000000000011000001110000011111000101110111110000100000100000000',
        '0000000000011000001110000011111000101110111110000100000100000001',
        '0000000000011000001110000011111000101110111110000100000100000010',
        '0010010100011001001000010001100101011110000000110000011110001100',
        '1001011010000110100101101001111010001110100001101000111000001110',
        '0111100101111001011110010010001100010111000111100001101100011111',
        '0011000010011101000011010010111000101110100101111000011101001011',
        '0011000010011100000011010010111000101110100101111000011101001011',
        '1001100000111010001010000010110000111100100101001001010000000111',
        '0000000000111101010100010001000101101001000000011000001101000000',
        '1000101001010001011100010111001100110011001100110011001111001100',
        '1110011001001111100110010001100100001011000011010010111100100111',
    ]
    vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

    f = 64
    idx = AnnoyIndex(f, 'hamming')
    for i, v in enumerate(vectors):
        idx.add_item(i, v)

    idx.build(10)
    idx.save('idx.ann')
    idx = AnnoyIndex(f, 'hamming')
    idx.load('idx.ann')
    js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
    self.assertEquals(js[0], 0)
    self.assertEquals(ds[:4], [0, 1, 1, 22])
def build_tree(df, metric):
    '''
    INPUTS: Pandas DataFrame, Choice of Metric Space String
    OUTPUTS: Returns the built AnnoyIndex tree and a dictionary mapping
             index numbers to the DataFrame's index

    Builds an ANN tree using Spotify's Annoy library. Metric is the
    metric space (either euclidean or angular).
    '''
    tree = AnnoyIndex(len(df.iloc[0, :].values), metric=metric)
    indexes = {}
    for i in xrange(len(df)):
        v = df.iloc[i, :]
        indexes[i] = v.name
        tree.add_item(i, v.values)
    tree.build(50)
    tree.save(DATA_DIR + 'tree_' + metric + '.ann')
    with open(DATA_DIR + 'indexes_' + metric, 'wb') as f:
        pickle.dump(indexes, f)
    return (tree, indexes)
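# A hedged sketch of reading back the artifacts that build_tree() writes,
# assuming the same DATA_DIR and a DataFrame with the same dimensionality.
from annoy import AnnoyIndex
import pickle

def load_tree(dim, metric='euclidean'):
    tree = AnnoyIndex(dim, metric=metric)
    tree.load(DATA_DIR + 'tree_' + metric + '.ann')
    with open(DATA_DIR + 'indexes_' + metric, 'rb') as f:
        indexes = pickle.load(f)
    return tree, indexes

# Neighbor ids come back as Annoy item numbers; map them to DataFrame labels:
# tree, indexes = load_tree(len(df.columns))
# labels = [indexes[i] for i in tree.get_nns_by_item(0, 10)]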
def recall_at(self, n, n_trees=10, n_points=1000, n_rounds=5):
    # the best movie/variable name
    total_recall = 0.

    for r in range(n_rounds):
        # create random points at distance x
        f = 10
        idx = AnnoyIndex(f, 'dot')

        data = numpy.array([
            [random.gauss(0, 1) for z in range(f)]
            for j in range(n_points)
        ])

        expected_results = [
            sorted(
                range(n_points),
                key=lambda j: dot_metric(data[i], data[j])
            )[:n]
            for i in range(n_points)
        ]

        for i, vec in enumerate(data):
            idx.add_item(i, vec)
        idx.build(n_trees)

        for i in range(n_points):
            nns = idx.get_nns_by_vector(data[i], n)
            total_recall += recall(nns, expected_results[i])

    return total_recall / float(n_rounds * n_points)
def test_get_lots_of_nns(self):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
    i.build(10)
    for j in xrange(100):
        self.assertEqual(i.get_nns_by_item(0, 999999999), [0])
def precision(f=40, n=1000000):
    t = AnnoyIndex(f)
    for i in xrange(n):
        v = []
        for z in xrange(f):
            v.append(random.gauss(0, 1))
        t.add_item(i, v)

    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in xrange(prec_n):
        j = random.randrange(0, n)
        print 'finding nbs for', j

        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0

            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T

        for limit in limits:
            print 'limit: %-9d precision: %6.2f%% avg time: %.6fs' \
                % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1))
def test_tuple(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n_points):
        i.add_item(j, (random.gauss(0, 1) for x in xrange(f)))
    i.build(n_trees)
def _test_holes_base(self, n, f=100, base_i=100000):
    annoy = AnnoyIndex(f)
    for i in range(n):
        annoy.add_item(base_i + i, numpy.random.normal(size=(f,)))
    annoy.build(100)
    res = annoy.get_nns_by_item(base_i, n)
    self.assertEquals(set(res), set([base_i + i for i in range(n)]))
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    # print(user_lemma_matrix_tfidf.shape)
    if (user_lemma_matrix_tfidf.shape[0] <= dimensionality) or (user_lemma_matrix_tfidf.shape[1] <= dimensionality):
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)
    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])
    annoy_index.build(number_of_estimators)

    row = list()
    col = list()
    data = list()
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors, include_distances=True)
        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)
    text_graph = spsp.coo_matrix((data, (row, col)), shape=(X_svd.shape[0], X_svd.shape[0]))
    text_graph = spsp.csr_matrix(text_graph)

    return text_graph
def test_dist_degen(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [1, 0])
    i.add_item(1, [0, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 2.0 ** 0.5)
def _get_index(self, dataset):
    url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
    vectors_fn = os.path.join('test', dataset + '.hdf5')
    index_fn = os.path.join('test', dataset + '.annoy')

    if not os.path.exists(vectors_fn):
        print('downloading', url, '->', vectors_fn)
        urlretrieve(url, vectors_fn)

    dataset_f = h5py.File(vectors_fn)
    distance = dataset_f.attrs['distance']
    f = dataset_f['train'].shape[1]
    annoy = AnnoyIndex(f, distance)

    if not os.path.exists(index_fn):
        print('adding items', distance, f)
        for i, v in enumerate(dataset_f['train']):
            annoy.add_item(i, v)
        print('building index')
        annoy.build(10)
        annoy.save(index_fn)
    else:
        annoy.load(index_fn)

    return annoy, dataset_f
def test_overwrite_index(self):
    # Issue #335
    f = 40

    # Build the initial index
    t = AnnoyIndex(f)
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)
    t.build(10)
    t.save('test.ann')

    # Load index file
    t2 = AnnoyIndex(f)
    t2.load('test.ann')

    # Overwrite index file
    t3 = AnnoyIndex(f)
    for i in range(500):
        v = [random.gauss(0, 1) for z in range(f)]
        t3.add_item(i, v)
    t3.build(10)
    if os.name == 'nt':
        # Can't overwrite on Windows
        with self.assertRaises(IOError):
            t3.save('test.ann')
    else:
        t3.save('test.ann')
        # Get nearest neighbors
        v = [random.gauss(0, 1) for z in range(f)]
        nns = t2.get_nns_by_vector(v, 1000)  # Should not crash
def build_index(df, n_trees=50, dist_metric='angular', out_dir="./"):
    n_records = df.shape[0]
    n_col = df.shape[1]
    index = AnnoyIndex(n_col, metric=dist_metric)

    patient_dict = {}
    index_dict = {}
    i = 0
    print "Adding items to the index..."
    for patient_id in df.index.values:
        if i % 10000 == 0:
            print str(i)
        vec = df.loc[patient_id].values
        index.add_item(i, vec)
        patient_dict[patient_id] = i
        index_dict[i] = patient_id
        i += 1

    print "Building the index..."
    index.build(n_trees)
    index.save(out_dir + "annoy_index.ann")

    ## Save the patient_id -> index mapping ##
    w = csv.writer(open(out_dir + "patient_mapping.csv", "w"))
    for key, val in patient_dict.items():
        w.writerow([key, val])

    w = csv.writer(open(out_dir + "index_mapping.csv", "w"))
    for key, val in index_dict.items():
        w.writerow([key, val])
def test_write_failed(self):
    f = 40

    # Build the initial index
    t = AnnoyIndex(f)
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)
    t.build(10)

    if sys.platform == "linux" or sys.platform == "linux2":
        # linux
        try:
            t.save("/dev/full")
            self.fail("didn't get expected exception")
        except Exception as e:
            self.assertTrue(str(e).find("No space left on device") > 0)
    elif sys.platform == "darwin":
        volume = "FULLDISK"
        device = os.popen('hdiutil attach -nomount ram://64').read()
        os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
        os.popen('touch "/Volumes/%s/full"' % volume)
        try:
            t.save('/Volumes/%s/annoy.tree' % volume)
            self.fail("didn't get expected exception")
        except Exception as e:
            self.assertTrue(str(e).find("No space left on device") > 0)
        finally:
            os.popen("hdiutil detach %s" % device)
def test_single_vector(self):
    # https://github.com/spotify/annoy/issues/194
    a = AnnoyIndex(3)
    a.add_item(0, [1, 0, 0])
    a.build(10)
    a.save('1.ann')
    self.assertEquals(a.get_nns_by_vector([1, 0, 0], 3, include_distances=True), ([0], [0.0]))
def test_wrong_length(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
    self.assertRaises(IndexError, i.add_item, 1, [random.gauss(0, 1) for x in xrange(f + 1000)])
    self.assertRaises(IndexError, i.add_item, 2, [])
    i.build(n_trees)
def build_index(counts, label_to_id, dimension):
    index = AnnoyIndex(dimension, metric='angular')
    for label, cnt_list in counts.items():
        id = label_to_id[label]
        index.add_item(id, cnt_list)
    index.build(100)
    return index
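# Hypothetical inputs for the build_index() snippet above: counts maps labels
# to count vectors, label_to_id assigns each label a stable integer id. Both
# are made up for illustration only.
counts = {'cat': [3, 0, 1], 'dog': [2, 1, 0], 'eel': [0, 4, 4]}
label_to_id = {'cat': 0, 'dog': 1, 'eel': 2}
index = build_index(counts, label_to_id, dimension=3)
# ids of the 2 labels nearest to 'cat' by angular distance
nearest_ids = index.get_nns_by_item(label_to_id['cat'], 2)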
def test_get_nns_by_vector(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [2, 2])
    i.add_item(1, [3, 2])
    i.build(10)
    self.assertEquals(i.get_nns_by_vector([3, 3], 2), [1, 0])
def test_get_item_vector(self):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
    for j in xrange(100):
        print(j, '...')
        for k in xrange(1000 * 1000):
            i.get_item_vector(0)
def fit_annoy(data, n_trees=-1):
    logger.info('Fitting Annoy Matcher...')
    from annoy import AnnoyIndex
    matcher = AnnoyIndex(data.shape[1], metric='euclidean')
    for i, d in enumerate(data):
        matcher.add_item(i, d)
    matcher.build(n_trees)
    return matcher
def test_save_without_build(self):
    # Issue #61
    i = AnnoyIndex(10)
    i.add_item(1000, [random.gauss(0, 1) for z in xrange(10)])
    i.save('x.tree')
    j = AnnoyIndex(10)
    j.load('x.tree')
    j.build(10)
def test_dist_3(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [97, 0])
    i.add_item(1, [42, 42])
    dist = ((1 - 2 ** -0.5) ** 2 + (2 ** -0.5) ** 2) ** 0.5
    self.assertAlmostEqual(i.get_distance(0, 1), dist)
def test_include_dists_check_ranges(self):
    f = 3
    i = AnnoyIndex(f)
    for j in xrange(100000):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    indices, dists = i.get_nns_by_item(0, 100000, include_distances=True)
    self.assertTrue(max(dists) < 2.0)
    self.assertAlmostEqual(min(dists), 0.0)
def test_numpy(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n_points):
        a = numpy.random.normal(size=f)
        a = a.astype(random.choice([numpy.float64, numpy.float32, numpy.uint8, numpy.int16]))
        i.add_item(j, a)
    i.build(n_trees)
def _build_from_model(self, vectors, labels, num_features):
    index = AnnoyIndex(num_features)
    for vector_num, vector in enumerate(vectors):
        index.add_item(vector_num, vector)
    index.build(self.num_trees)
    self.index = index
    self.labels = labels
def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
    found = 0
    for r in xrange(n_rounds):
        # create random points at distance x from (1000, 0, 0, ...)
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in xrange(n_points):
            p = [random.gauss(0, 1) for z in xrange(f - 1)]
            norm = sum([pi ** 2 for pi in p]) ** 0.5
            x = [1000] + [pi / norm * j for pi in p]
            i.add_item(j, x)

        i.build(n_trees)

        nns = i.get_nns_by_vector([1000] + [0] * (f - 1), n)
        self.assertEqual(nns, sorted(nns))  # should be in order
        # The number of gaps should be equal to the last item minus n-1
        found += len([x for x in nns if x < n])

    return 1.0 * found / (n * n_rounds)
def nn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    # Build index.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((a, b_i))

    return match
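# The match set returned by nn_approx() is directional (ds1 -> ds2). A common
# follow-up, sketched here as an assumption rather than part of the original
# snippet, is to intersect both directions to keep only mutual nearest
# neighbors.
def mnn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    match1 = nn_approx(ds1, ds2, knn=knn, metric=metric, n_trees=n_trees)
    match2 = nn_approx(ds2, ds1, knn=knn, metric=metric, n_trees=n_trees)
    # a pair survives only if each point picked the other
    return match1 & set((b, a) for a, b in match2)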
def vsm(fp=FILEPATH):
    print("Train vsm from {}".format(fp))
    with open(fp, 'r') as fin:
        index_id_dict = {}
        corpus = []
        for index, line in tqdm(enumerate(fin)):
            event_id, *event_description_list = line.strip().split(',')
            event_description = " ".join(event_description_list)
            sentence = jieba.analyse.extract_tags(event_description)
            corpus.append(" ".join(sentence))
            index_id_dict[index] = event_id
        vectorizer = TfidfVectorizer()
        document_term_matrix = vectorizer.fit_transform(corpus)
        dim = document_term_matrix.shape[1]
        annoy_index = AnnoyIndex(dim)
        for index, vector in enumerate(document_term_matrix):
            annoy_index.add_item(index, vector.toarray()[0])
        annoy_index.build(10)  # 10 trees
        annoy_index.save('vsm_tfidf.ann')
    return index_id_dict, vectorizer, document_term_matrix
def test_distance_consistency(self):
    n, f = 1000, 3
    i = AnnoyIndex(f, 'angular')
    for j in range(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            self.assertAlmostEqual(dist, i.get_distance(a, b))
            u = i.get_item_vector(a)
            v = i.get_item_vector(b)
            u_norm = numpy.array(u) * numpy.dot(u, u) ** -0.5
            v_norm = numpy.array(v) * numpy.dot(v, v) ** -0.5
            # cos = numpy.clip(1 - cosine(u, v), -1, 1)  # scipy returns 1 - cos
            self.assertAlmostEqual(
                dist ** 2, numpy.dot(u_norm - v_norm, u_norm - v_norm))
            # self.assertAlmostEqual(dist, (2 * (1 - cos)) ** 0.5)
            self.assertAlmostEqual(
                dist ** 2, sum([(x - y) ** 2 for x, y in zip(u_norm, v_norm)]))
def KNN_Annoy(X, KK):
    NK = KK
    NN, NF = X.shape
    if KK > NF:
        raise ValueError("KK should not exceed the 2nd dim of X")
    t = AnnoyIndex(NF, metric='euclidean')
    for i, v in enumerate(X):
        t.add_item(i, v)
    t.build(100)
    ind = []
    val = []
    for i in range(NN):
        closest = t.get_nns_by_item(i, NK)
        ind.append(closest)
        val.append([t.get_distance(i, j) for j in closest])
    return np.array(ind), np.array(val)
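# Illustrative call to KNN_Annoy() above on random data; the shapes and
# neighbor count are assumptions for the example only.
import numpy as np

X = np.random.normal(size=(200, 16))
ind, val = KNN_Annoy(X, 5)
# ind[i] holds the ids of the 5 nearest points to point i (point i itself is
# typically returned first, with distance 0); val[i] holds the distances.
assert ind.shape == (200, 5) and val.shape == (200, 5)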
def make_ann(n_dim=N_DIM, n_items=100):
    ids = []
    ann = AnnoyIndex(n_dim, METRIC)
    ann.on_disk_build(PATH_DISK_SAVE)
    for ind in range(n_items):
        v = [random.gauss(0, 1) for _ in range(n_dim)]
        ann.add_item(ind, v)
        ids.append(str(ind))
    ann.build(N_TREES)
    meta_d = {
        'vec_src': Path(__file__).name,
        'metric': METRIC,
        'n_dim': n_dim,
        'timestamp_utc': datetime.utcnow().isoformat(),
    }
    return ids, meta_d
def _random_nn(X):
    idx = AnnoyIndex(X.shape[1], 'euclidean')
    for i in range(X.shape[0]):
        idx.add_item(i, X[i])
    logging.info("building an index with %d items" % X.shape[0])
    idx.build(50)
    logging.info("finding %d neighbor groups" % self.n_clusters)
    seen = {}
    label = 0
    guess = np.random.randint(X.shape[0])
    centers = {guess: 0}
    while label < self.n_clusters:
        neighbors = idx.get_nns_by_item(guess, _get_num_neighbors())
        for point in neighbors:
            seen[point] = label
        seen[guess] = label
        # find a distant point
        dists = np.array([[idx.get_distance(i, j) for i in centers]
                          for j in range(X.shape[0])])
        avg_dists = np.average(dists, axis=1)
        dist_prob = softmax(avg_dists)
        guess = np.random.choice(X.shape[0], p=dist_prob)
        while guess in seen:
            guess = np.random.choice(X.shape[0], p=dist_prob)
        centers[guess] = label
        label = label + 1
    y = np.zeros(X.shape[0])
    for k, v in seen.items():
        y[k] = v
    return y
def main(args):
    index = AnnoyIndex(2048, 'euclidean')
    index_metadata = {}
    model = load_model()
    batch = []
    total_size = 0
    for i, fname in enumerate(os.listdir(args.images_dir)):
        if not (fname.endswith('.jpg') or fname.endswith('.png') or fname.endswith('.jpeg')):
            continue
        path = os.path.join(args.images_dir, fname)
        try:
            img = open_img(path)
            batch.append((i, img, fname))
        except Exception as e:
            print(e)
            continue
        if len(batch) == args.batch_size:
            total_size += len(batch)
            print("Process batch: %d" % total_size)
            ids, imgs, img_fnames = zip(*batch)
            vectors = get_feature_vectors(model, imgs).numpy()
            for j, vector in enumerate(vectors):
                index.add_item(ids[j], vector.tolist())
                index_metadata[ids[j]] = {'filename': img_fnames[j]}
            batch = []
        if total_size >= args.max_items:
            break
    print('Build index')
    index.build(args.n_trees)
    print('Save index')
    index.save(os.path.join(args.dst, 'index.ann'))
    json.dump(index_metadata, open(os.path.join(args.dst, 'index_metadata.json'), 'w'))
def build_db(face_path, save_path):
    """
    Builds FaceEmbedding Database of people

    Args:
        face_path (Path): Face Directory Path
        save_path (Path): Save Path for SkywatchDB
    """
    face_path = parse_path(face_path)
    save_path = parse_path(save_path)
    print("SkywatchDB Build Started...")
    face_tree = AnnoyIndex(embedding_size, 'euclidean')
    image_paths = _get_image_paths(face_path)
    i = 1
    person_id_map = {}
    for person, images in image_paths.items():
        for image in images:
            faces = get_faces(image, enforce=True)
            try:
                aligned_face = align_face(faces[0]['image'])
                embedding = get_face_embedding(aligned_face)
            except IndexError:
                raise AssertionError('Could not detect face in ' + image)
            except TypeError:
                print(f"Cannot detect face for {person} in {image}")
                continue
            face_tree.add_item(i, embedding)
            person_id_map[i] = person
            i += 1
    face_tree.build(5)
    try:
        face_tree.save(save_path.joinpath('faceEmbed.db').as_posix())
        save_file = open(save_path.joinpath('nameMap.db').as_posix(), 'wb')
        pickle.dump(person_id_map, save_file)
        save_file.close()
        print('SkywatchDB successfully saved at ', save_path.as_posix())
    except:
        raise SystemError(
            'Storage Access Error. Cannot save Skywatch Database.')
def rebuild_index(self, items: List[int], texts: List[str], embeddings: List[np.ndarray]):
    try:
        __temp_index = AnnoyIndex(VECTOR_LENGTH, metric='angular')
        __temp_mapping = OrderedDict()
        for _i, _item in enumerate(items):
            __temp_index.add_item(_i, embeddings[_i])
            __temp_mapping[_item] = {
                'index': _i,
                'text': texts[_i],
                'embedding': embeddings[_i]
            }
        logger.info('A total of {} items added to the index'.format(_i))
        logger.info('Building the index with {} trees...'.format(N_TREES))
        __temp_index.build(n_trees=N_TREES)
        logger.info('Index is successfully built.')
        logger.info('Saving index to disk...')
        with tempfile.TemporaryFile() as fp:
            __temp_index.save(str(fp.name))
            self.__index_file = str(fp.name)
        logger.info('Index is saved to disk.')
        logger.info("Index file size: {} GB".format(
            round(os.path.getsize(self.__index_file) / float(1024 ** 3), 2)))
        logger.info('Saving mapping to disk...')
        with open(self.__index_file + '.mapping', 'wb') as handle:
            pickle.dump(__temp_mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
        logger.info('Mapping is saved to disk.')
        logger.info("Mapping file size: {} MB".format(
            round(os.path.getsize(self.__index_file + '.mapping') / float(1024 ** 2), 2)))
    except Exception as e:
        logger.error("Error updating index " + str(e))
        raise e
    else:
        self.__load_index__()
        return str(self.__index_file)
def representative_sample(X, num_samples, save=False):
    """Sample vectors in X, preferring edge cases and vectors farthest
    from other vectors in the sample set
    """
    X = X.values if hasattr(X, 'values') else np.array(X)
    N, M = X.shape
    rownums = np.arange(N)
    np.random.shuffle(rownums)

    idx = AnnoyIndex(M)
    for i, row in enumerate(X):
        idx.add_item(i, row)
    idx.build(int(np.log2(N)) + 1)

    if save:
        if isinstance(save, basestring):
            idxfilename = save
        else:
            idxfile = tempfile.NamedTemporaryFile(delete=False)
            idxfile.close()
            idxfilename = idxfile.name
        idx.save(idxfilename)
        idx = AnnoyIndex(M)
        idx.load(idxfilename)

    samples = -1 * np.ones(shape=(num_samples,), dtype=int)
    samples[0] = rownums[0]
    # FIXME: some integer determined by N and num_samples and distribution
    j, num_nns = 0, min(1000, int(num_samples / 2. + 1))
    for i in rownums:
        if i in samples:
            continue
        nns = idx.get_nns_by_item(i, num_nns)
        # FIXME: pick vector furthest from past K (K > 1) points or outside of a hypercube
        # (sized to uniformly fill the space) around the last sample
        samples[j + 1] = np.setdiff1d(nns, samples)[-1]
        if len(nns) < num_samples / 3.:
            num_nns = min(N, int(1.3 * num_nns))
        j += 1
        if j + 1 >= num_samples:
            # stop once the sample set is full
            break
    return samples
def main():
    embed = Embedding()
    images = [image for image in os.listdir(os.path.join(config.IMAGE_PATH, "face_db"))
              if image.endswith(".png")]
    images.sort(key=human_sort)
    with open(config.FACE_NAMES, 'w') as f:
        [f.write(image + '\n') for image in images]
    imgs = [cv2.imread(os.path.join(config.IMAGE_PATH, "face_db", image)) for image in images]
    t = AnnoyIndex(512, metric="euclidean")
    for i, img in enumerate(imgs):
        t.add_item(i, embed.get_feature(img))
    t.build(10)
    # rewrite face_db.ann
    t.save(config.FACE_FEATURES)
class AnnoyCB:
    def __init__(self, n_sim_movie=10, trees=10, model_name='angular'):
        self.n_sim_movie = n_sim_movie
        self.trees = trees
        self.model_name = model_name

    def fit(self, item_matrix):
        num, vec_dim = item_matrix.shape
        self.model = AnnoyIndex(vec_dim, self.model_name)
        for i, vec in enumerate(item_matrix):
            self.model.add_item(i, vec)
        self.model.build(self.trees)

    def predict(self, item_matrix):
        num, vec_dim = item_matrix.shape
        res_result = []
        for i in range(num):
            items = self.model.get_nns_by_item(i, self.n_sim_movie)
            res_result.append(items)
        return res_result
def update():
    params = json.loads(request.get_data())
    if len(params) == 0:
        return "No parameter"
    try:
        global annoy_index
        new_annoy_index = AnnoyIndex(vector_len, distance_metric)
        for id, url in params["urls"]:
            features = extract_features(url, model, applications.densenet.preprocess_input)
            new_annoy_index.add_item(id, features)
        new_annoy_index.build(10)
        annoy_index.unload()
        new_annoy_index.save("cat-pictures.annoy")
        annoy_index = new_annoy_index
        return "Update complete"
    except:
        return "Annoy error"
def createAnnoyIndex(codebook_path: str, bit_len: int, n_trees: int):
    def extractVectorsFromCodebook(codebook: str, bit_len: int):
        df = pd.read_csv(codebook)
        df['Barcode'] = [
            f"{barcode:0{bit_len}}" for barcode in list(df['Barcode'])
        ]
        df['Vector'] = [
            createBarcodeVector(barcode) for barcode in df['Barcode']
        ]
        list_of_codebook_vectors = np.array(df['Vector'])
        return list_of_codebook_vectors

    # pass the caller's bit length through instead of a hardcoded width
    list_of_codebook_vectors = extractVectorsFromCodebook(codebook_path, bit_len)
    n_vectors = len(list_of_codebook_vectors)
    t = AnnoyIndex(bit_len, 'euclidean')
    for i in range(0, n_vectors):
        v = list_of_codebook_vectors[i]
        t.add_item(i, v)
    t.build(n_trees)
    t.save("codebook_index.ann")
def histogram(data):
    f = 256
    histogram = AnnoyIndex(f, 'angular')
    index = []
    a = 0
    for i in data:
        try:
            req = urlopen(i)
            arr = np.asarray(bytearray(req.read()), dtype=np.uint8)
            img = cv2.imdecode(arr, 1)  # 'Load it as it is'
            gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])
            histogram.add_item(a, hist)
            index.append(a)
        except urllib.error.HTTPError:
            print("cannot find" + str(a))
        except cv2.error:
            print("cannot find" + str(a))
        a += 1
    return histogram, index
def test_very_large_index(self):
    # Issue 388
    f = 3
    dangerous_size = 2 ** 31
    size_per_vector = 4 * (f + 3)
    n_vectors = int(dangerous_size / size_per_vector)
    m = AnnoyIndex(3, 'angular')
    m.verbose(True)
    for i in range(100):
        m.add_item(n_vectors + i, [random.gauss(0, 1) for z in range(f)])
    n_trees = 10
    m.build(n_trees)
    path = 'test_big.annoy'
    m.save(path)  # Raises on Windows

    # Sanity check size of index
    self.assertGreaterEqual(os.path.getsize(path), dangerous_size)
    self.assertLess(os.path.getsize(path), dangerous_size + 100e3)

    # Sanity check number of trees
    self.assertEquals(m.get_n_trees(), n_trees)
def __init__(self, documents, training_videos=None):
    if training_videos is None:
        training_videos = ALL_VIDEOS
    annoy_n_trees = CONFIGURATION.getint('annoy_n_trees')
    num_dense_units = CONFIGURATION.getint('num_dense_units')
    model = _KerasSiameseNeuralNetwork(training_videos)
    LOGGER.debug('Building an ANNOY index with {} trees'.format(annoy_n_trees))
    annoy_index = AnnoyIndex(num_dense_units, metric='euclidean')
    pages = dict()
    for page_index, (page, page_features) in enumerate(
            model.get_page_features(chain(*documents))):
        annoy_index.add_item(page_index, page_features)
        pages[page_index] = page
    annoy_index.build(annoy_n_trees)
    self._annoy_index = annoy_index
    self._model = model
    self._pages = pages
def build_annoy_index(X, path, ntrees=50, verbose=1):
    index = AnnoyIndex(X.shape[1], metric='angular')
    if platform.system() != 'Windows':
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    # Build n trees
    index.build(ntrees)
    if platform.system() == 'Windows':
        index.save(path)
    return index
def nearest_neighbor_search(self, GE_csc):
    K = self.num_of_neighbor * 2
    n, d = GE_csc.shape
    t = AnnoyIndex(d)
    for i in range(n):
        t.add_item(i, GE_csc[i, :])
    t.build(100)

    print('#######OS PROCESS ID#####')
    print(str(os.getpid()))
    ann_file = str(os.getpid()) + 'test.ann'
    t.save(ann_file)
    u = AnnoyIndex(d)
    u.load(ann_file)
    os.remove(ann_file)

    val = np.zeros((n, K))
    ind = np.zeros((n, K))
    for i in range(n):
        tmp, tmp1 = u.get_nns_by_item(i, K, include_distances=True)
        ind[i, :] = tmp
        val[i, :] = tmp1
    return ind.astype('int'), val
def annoy_train(spark, dirname, rank, regParam, n_trees, random_seed):
    # Load model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')
    # get item factors
    item_factors = model.itemFactors
    item_factors, annoy_index_map = convert_annoy_index(item_factors)
    # train annoy model
    tree = AnnoyIndex(rank, 'dot')
    for item in tqdm(item_factors.collect()):
        tree.add_item(item.annoy_id, item.features)
    tree.set_seed(random_seed)
    # build the tree
    # num of trees: higher n_trees gives higher precision
    tree.build(n_trees)
    # save annoy model and index map
    tree.save(f'{dirname}_{rank}_{regParam}_tree.ann')
    annoy_index_map.write.parquet(f'{dirname}_{rank}_{regParam}_annoy_index_map.parquet')
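# A hedged sketch of querying the tree saved by annoy_train() for
# recommendations: load the .ann file and look up items whose factors have the
# largest dot product with a user factor. The path and user vector are
# assumptions, not part of the original snippet.
from annoy import AnnoyIndex

def top_k_items(tree_path, user_factor, rank, k=10):
    tree = AnnoyIndex(rank, 'dot')
    tree.load(tree_path)
    # with the 'dot' metric, "nearest" means highest inner product
    return tree.get_nns_by_vector(user_factor, k)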
def label_approx(X, sites, site_labels):
    from annoy import AnnoyIndex

    assert (X.shape[1] == sites.shape[1])

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='euclidean')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(10)

    labels = []
    for i in range(X.shape[0]):
        # Find nearest site point.
        nearest_site = aindex.get_nns_by_vector(X[i, :], 1)
        if len(nearest_site) < 1:
            labels.append(None)
            continue
        labels.append(site_labels[nearest_site[0]])

    return np.array(labels)
def find_nearest(self):
    ann = AnnoyIndex(num_merchants)
    for customer in self.customers:
        customer_vector = list(matrix.loc[[customer]])
        ann.add_item(customer, customer_vector)
        if customer % 200 == 0:
            print 'Adding ' + str(customer)

    print "Building"
    if len(self.merchantIDs) > max_trees:
        ann.build(max_trees)
    else:
        ann.build(len(self.merchantIDs))
    print "...done"

    for customer in self.customers:
        neighbors = ann.get_nns_by_item(customer, num_neighbors)
        if customer % 200 == 0:
            print "Found neighbors for " + str(customer)
        self.nearest[customer] = []
        for neighbor in neighbors:
            if neighbor != customer:
                self.nearest[customer].append((neighbor, ann.get_distance(neighbor, customer)))
def build_index_annoy(h5fname, dset, out='data.ann', trees=128, lazy=True):
    # establish connection to HDF5 file
    h5f = h5py.File(h5fname, 'r')
    if lazy:
        X = h5f[dset]
    else:
        X = h5f[dset][:]

    # get dimension
    f = X.shape[1]

    # initialize annoy
    t = AnnoyIndex(f, 'angular')

    # iterate over features, add to annoy
    for i, v in enumerate(X):
        t.add_item(i, v)

    # build and save index
    t.build(trees)
    t.save(out)
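# Sketch of reading back the index that build_index_annoy() saves; the
# dimension must match the HDF5 dataset, so it is re-read here. Names follow
# the function above, and the defaults are assumptions for illustration.
import h5py
from annoy import AnnoyIndex

def load_index_annoy(h5fname, dset, ann_file='data.ann'):
    with h5py.File(h5fname, 'r') as h5f:
        f = h5f[dset].shape[1]
    t = AnnoyIndex(f, 'angular')
    t.load(ann_file)  # memory-maps the saved file
    return t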
class spherefaceAnnoyDatabase():
    def __init__(self):
        self.network = caffe.Net("pretrainedModels/sphereface_deploy.prototxt",
                                 "pretrainedModels/sphereface_model.caffemodel", 0)
        self.index = AnnoyIndex(512, metric='angular')  # 512 is the number of neurons in the last layer of the net
        self.indexToName = {}
        self.nameToIndex = {}

    def getEmbedding(self, imgPath):
        img = Image.open(imgPath)
        net = self.network  # the caffe network loaded in __init__
        sampleImage = numpy.array(img.resize((net.blobs['data'].data.shape[3], net.blobs['data'].data.shape[2])))
        sampleImage = numpy.reshape(sampleImage, (1,) + sampleImage.shape).transpose(0, 3, 1, 2).astype(numpy.float32)
        net.blobs['data'].data[...] = sampleImage
        net.forward()
        return net.blobs['fc5'].data[0].copy()

    def addFaceWithName(self, imgPath, name):
        embedding = self.getEmbedding(imgPath)
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = name
        self.nameToIndex[name] = length

    def addEmbeddingWithName(self, embedding, name):
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = name
        self.nameToIndex[name] = length

    def addFaceWithoutName(self, imgPath):
        embedding = self.getEmbedding(imgPath)
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = imgPath
        self.nameToIndex[imgPath] = length

    def freeze(self, nTrees=20):
        self.index.build(nTrees)

    def lookupByFace(self, imgPath, numberOfNeighbours):
        embedding = self.getEmbedding(imgPath)
        results = self.index.get_nns_by_vector(embedding, numberOfNeighbours, search_k=-1, include_distances=True)
        for i in xrange(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results

    def lookupByEmbedding(self, embedding, numberOfNeighbours):
        if numberOfNeighbours == -1:
            numberOfNeighbours = self.index.get_n_items()
        results = self.index.get_nns_by_vector(embedding, numberOfNeighbours, search_k=-1, include_distances=True)
        for i in xrange(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results

    def lookupByName(self, name, numberOfNeighbours):
        if numberOfNeighbours == -1:
            numberOfNeighbours = self.index.get_n_items()
        results = self.index.get_nns_by_item(self.nameToIndex[name], numberOfNeighbours, search_k=-1, include_distances=True)
        for i in xrange(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results
def build_annoy_index(X, path, ntrees=50, build_index_on_disk=True, verbose=1):
    """ Build a standalone annoy index.

    :param array X: numpy array with shape (n_samples, n_features)
    :param str path: The filepath of a trained annoy index file saved on disk.
    :param int ntrees: The number of random projection trees built by Annoy to
        approximate KNN. The more trees, the higher the memory usage, but the
        better the accuracy of results.
    :param bool build_index_on_disk: Whether to build the annoy index directly
        on disk. Building on disk should allow for bigger datasets to be
        indexed, but may cause issues. If None, on-disk building will be
        enabled for Linux, but not Windows, due to issues on Windows.
    :param int verbose: Controls the volume of logging output the model
        produces when training. When set to 0, silences outputs; when above 0,
        will print outputs.
    """
    index = AnnoyIndex(X.shape[1], metric='euclidean')
    if build_index_on_disk:
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    try:
        index.build(ntrees)
    except Exception:
        msg = ("Error building Annoy Index. Passing on_disk_build=False"
               " may solve the issue, especially on Windows.")
        raise IndexBuildingError(msg)
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
def build_type_clusters(model, train_data_loader: DataLoader, valid_data_loader: DataLoader, type_vocab: set):
    computed_embed_labels = []
    annoy_idx = AnnoyIndex(model.output_size, 'euclidean')
    curr_idx = 0

    for _, (a, _, _) in enumerate(
            tqdm(train_data_loader, total=len(train_data_loader),
                 desc="Computing Type Clusters - Train set")):
        model.eval()
        with torch.no_grad():
            output_a = model(*(s.to(DEVICE) for s in a[0]))
        labels = a[1].data.cpu().numpy()
        for i, v in enumerate(output_a.data.cpu().numpy()):
            if labels[i] in type_vocab:
                annoy_idx.add_item(curr_idx, v)
                computed_embed_labels.append(labels[i])
                curr_idx += 1

    for _, (a, _, _) in enumerate(
            tqdm(valid_data_loader, total=len(valid_data_loader),
                 desc="Computing Type Clusters - Valid set")):
        model.eval()
        with torch.no_grad():
            output_a = model(*(s.to(DEVICE) for s in a[0]))
        labels = a[1].data.cpu().numpy()
        for i, v in enumerate(output_a.data.cpu().numpy()):
            if labels[i] in type_vocab:
                annoy_idx.add_item(curr_idx, v)
                computed_embed_labels.append(labels[i])
                curr_idx += 1

    annoy_idx.build(KNN_TREE_SIZE)
    return annoy_idx, np.array(computed_embed_labels)
def ann_annoy(data, metric='euclidean', n_neighbors=10, trees=10):
    """My Approximate Nearest Neighbors function (ANN)
    using the annoy package.

    Parameters
    ----------
    data : array, shape (n_samples, n_features)
        Input data points.
    metric : str, default 'euclidean'
        Distance metric used by the Annoy index.
    n_neighbors : int, default 10
        Number of nearest neighbors to return per point.
    trees : int, default 10
        Number of trees to build in the index.

    Returns
    -------
    distVals : array, shape (n_samples, n_neighbors)
        Distances to the k nearest neighbors.
    idx : array, shape (n_samples, n_neighbors)
        Indices of the k nearest neighbors.
    """
    datapoints = data.shape[0]
    dimension = data.shape[1]

    # initialize the annoy database with the requested metric
    ann = AnnoyIndex(dimension, metric=metric)

    # store the datapoints
    for (i, row) in enumerate(data):
        ann.add_item(i, row.tolist())

    # build the index
    ann.build(trees)

    # find the k-nearest neighbors for all points
    idx = np.zeros((datapoints, n_neighbors), dtype='int')
    distVals = idx.copy().astype(float)

    # extract the distance values
    for i in range(0, datapoints):
        idx[i, :] = ann.get_nns_by_item(i, n_neighbors)
        for j in range(0, n_neighbors):
            distVals[i, j] = ann.get_distance(i, idx[i, j])

    return distVals, idx
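# Example call for ann_annoy() above on random data; the shapes are
# assumptions for illustration only.
import numpy as np

data = np.random.normal(size=(500, 20))
distVals, idx = ann_annoy(data, metric='euclidean', n_neighbors=10, trees=10)
# row i: distances/indices of the 10 approximate nearest neighbors of point i
print(distVals.shape, idx.shape)  # (500, 10) (500, 10)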
def compute_and_store_similarity(self):
    start = time.time()
    sessions_VSM, sessions_id = self._driver.session_vectors()
    print("Time to create the vector:", time.time() - start)

    t = AnnoyIndex(sessions_VSM.shape[1], 'angular')
    t.on_disk_build('/tmp/test.ann')
    start = time.time()
    i = 0
    overall_size = sessions_VSM.shape[0]
    for ix in range(overall_size):
        x = sessions_VSM.getrow(ix)
        t.add_item(ix, x.toarray()[0])
        i += 1
        if i % 1000 == 0:
            print(i, "rows processed over", overall_size)
    print("Time to index:", time.time() - start)

    del sessions_VSM
    gc.collect()

    start = time.time()
    t.build(5)  # 5 trees
    print("Time to build:", time.time() - start)

    knn_start = time.time()
    i = 0
    for ix in range(overall_size):
        knn = self.compute_knn(ix, sessions_id, t, 50)
        start = time.time()
        self.store_knn(sessions_id[ix], knn)
        self.__time_to_store.append(time.time() - start)
        i += 1
        if i % 100 == 0:
            print(i, "rows processed over", overall_size)
            print(mean(self.__time_to_query), mean(self.__time_to_knn),
                  mean(self.__time_to_sort), mean(self.__time_to_store))
            self.__time_to_query = []
            self.__time_to_knn = []
            self.__time_to_sort = []
            self.__time_to_store = []
    print("Time to compute knn:", time.time() - knn_start)
def generate_pair(X, n_neighbors, n_MN, n_FP, distance='euclidean', verbose=True):
    n, dim = X.shape
    n_neighbors_extra = min(n_neighbors + 50, n)
    tree = AnnoyIndex(dim, metric=distance)
    if _RANDOM_STATE is not None:
        tree.set_seed(_RANDOM_STATE)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)

    nbrs = np.zeros((n, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_neighbors_extra), dtype=np.float32)

    for i in range(n):
        nbrs_ = tree.get_nns_by_item(i, n_neighbors_extra + 1)
        nbrs[i, :] = nbrs_[1:]
        for j in range(n_neighbors_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    if verbose:
        print("Found nearest neighbor")
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    if verbose:
        print("Calculated sigma")
    scaled_dist = scale_dist(knn_distances, sig, nbrs)
    if verbose:
        print("Found scaled dist")
    pair_neighbors = sample_neighbors_pair(X, scaled_dist, nbrs, n_neighbors)
    if _RANDOM_STATE is None:
        pair_MN = sample_MN_pair(X, n_MN)
        pair_FP = sample_FP_pair(X, pair_neighbors, n_neighbors, n_FP)
    else:
        pair_MN = sample_MN_pair_deterministic(X, n_MN, _RANDOM_STATE)
        pair_FP = sample_FP_pair_deterministic(X, pair_neighbors, n_neighbors, n_FP, _RANDOM_STATE)

    return pair_neighbors, pair_MN, pair_FP