class ImageSearchAnnoyCombo:
    '''
    Image search backed by an Annoy index plus the raw features in an HDF5 file.

    ``run_query_approx`` answers straight from Annoy. ``run_query_exact`` uses
    Annoy only to shortlist candidates and then re-ranks them by exact cosine
    distance computed from the stored feature rows.

    Per the Annoy documentation, its angular distance is
    dist(u,v) = sqrt(2(1-cos(u,v))).
    '''

    def __init__(self, h5fname='X_ILSVRC2015.hdf5', annf='ILSVRC2015.ann',
                 imageListPath='/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt',
                 dset='fc6fc7'):
        """Open the feature dataset, read the file list and load the Annoy index.

        h5fname: HDF5 file holding the feature matrix.
        annf: Annoy index file.
        imageListPath: text file with one image name per line; the line
            number is used as the item id.
        dset: name of the dataset inside the HDF5 file.
        """
        # Load h5 data (kept as a dataset handle and indexed per query).
        h5f = h5py.File(h5fname, 'r')
        self.X = h5f[dset]
        # Load filenames: line number -> file name.
        with open(imageListPath, 'r') as f:
            self.line_to_file = {i: line.rstrip() for i, line in enumerate(f)}
        self.A = AnnoyIndex(self.X.shape[1], 'angular')
        self.A.load(annf)

    def run_query_approx(self, query, n=100, accuracy_factor=5):
        """Return ``zip(file_names, annoy_distances)`` for the ``n`` nearest items.

        ``accuracy_factor`` scales Annoy's ``search_k``
        (``n * int(accuracy_factor) * 128``).
        """
        nearest, scores = self.A.get_nns_by_vector(
            query, n, search_k=n * int(accuracy_factor) * 128,
            include_distances=True)
        return zip((self.line_to_file[i] for i in nearest), scores)

    def run_query_exact(self, query, n=1000, nsmall=100):
        """Shortlist ``n`` candidates with Annoy, then rank them exactly.

        Returns ``zip(file_names, cosine_distances)`` for at most ``nsmall``
        items, nearest first. ``query`` must be a 1-D numpy array.
        """
        # Never shortlist fewer candidates than we are asked to return.
        if n < nsmall:
            n = nsmall
        indexes = self.A.get_nns_by_vector(query, n, search_k=-1, include_distances=False)
        indexes_sorted = sorted(indexes)
        if not indexes_sorted:
            return zip([], [])
        # Use scipy cdist (or normalize first and do a dot product for faster
        # computation). Getting X by index from disc is very slow.
        distance = cdist(self.X[indexes_sorted],
                         query.reshape((1, query.shape[0])), 'cosine')[:, 0]
        if nsmall < len(distance):
            # Partial sort: positions of the nsmall smallest distances.
            ind = np.argpartition(distance, nsmall)[:nsmall]
        else:
            # argpartition requires kth < len(distance); with this few
            # candidates every one of them is part of the answer.
            ind = np.arange(len(distance))
        s_ind = np.argsort(distance[ind])
        nearest = ind[s_ind]
        scoresorted = distance[ind][s_ind]
        return zip((self.line_to_file[indexes_sorted[i]] for i in nearest), scoresorted)
def do(indextype):
    """Dump the 10 nearest neighbours of a fixed set of query items.

    Loads ``points.<indextype>.annoy`` as an 8-dimensional index (the first
    character of ``indextype`` is passed to ``AnnoyIndex`` as the metric) and
    writes one ``<query>\\t<comma-separated neighbours>`` line per query item
    to ``points.<indextype>.ann.txt``.
    """
    query_items = [1443, 1240, 818, 1725, 1290, 2031, 1117, 1211, 1902, 603]
    a = AnnoyIndex(8, indextype[0])
    a.load('points.%s.annoy' % indextype)
    with open('points.%s.ann.txt' % indextype, 'w') as out:
        for q_index in query_items:
            nns = a.get_nns_by_item(q_index, 10)
            # out.write works on Python 2 and 3; ``print >> out`` does not.
            out.write('%s\t%s\n' % (q_index, ','.join(str(n) for n in nns)))
def test_overwrite_index(self):
    """Issue #335: querying a loaded index must not crash after its file is overwritten."""
    f = 40

    def random_vector():
        return [random.gauss(0, 1) for _ in range(f)]

    # Build the initial index
    t = AnnoyIndex(f, 'angular')
    for i in range(1000):
        t.add_item(i, random_vector())
    t.build(10)
    t.save('test.ann')

    # Load index file
    t2 = AnnoyIndex(f, 'angular')
    t2.load('test.ann')

    # Overwrite index file
    t3 = AnnoyIndex(f, 'angular')
    for i in range(500):
        t3.add_item(i, random_vector())
    t3.build(10)
    if os.name == 'nt':
        # Can't overwrite on Windows
        with self.assertRaises(IOError):
            t3.save('test.ann')
    else:
        t3.save('test.ann')

    # Get nearest neighbors; only the absence of a crash is checked.
    t2.get_nns_by_vector(random_vector(), 1000)
def _get_index(self, dataset):
    """Return ``(annoy_index, hdf5_file)`` for a named benchmark dataset.

    The vectors file is downloaded into ``test/`` when missing. The Annoy
    index is built from the ``train`` dataset with 10 trees and saved next to
    it on first use, and loaded from disk afterwards. The metric comes from
    the file's ``distance`` attribute.
    """
    url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
    vectors_fn = os.path.join('test', dataset + '.hdf5')
    index_fn = os.path.join('test', dataset + '.annoy')

    if not os.path.exists(vectors_fn):
        print('downloading', url, '->', vectors_fn)
        urlretrieve(url, vectors_fn)

    # Open read-only explicitly: older h5py releases default to append mode.
    dataset_f = h5py.File(vectors_fn, 'r')
    distance = dataset_f.attrs['distance']
    f = dataset_f['train'].shape[1]
    annoy = AnnoyIndex(f, distance)

    if os.path.exists(index_fn):
        annoy.load(index_fn)
    else:
        print('adding items', distance, f)
        for i, v in enumerate(dataset_f['train']):
            annoy.add_item(i, v)
        print('building index')
        annoy.build(10)
        annoy.save(index_fn)
    return annoy, dataset_f
def retrieve(self):
    """Run ANN retrieval for every (neighbour count, multiplier) combination.

    For each combination, every question in ``self.test_file`` is embedded
    with ``get_centroid_idf``, its nearest documents are looked up in the
    Annoy index, and the results are written as JSON under
    ``system_results/``.
    """
    print('Loading necessary files..')
    u = AnnoyIndex(self.dim, metric='angular')
    u.load(index_file)

    # The question set does not depend on the search parameters, so it is
    # parsed once instead of once per parameter combination.
    with open(self.test_file, 'r') as data_file:
        questions = json.load(data_file)["questions"]

    directory = "system_results/"

    print('ANN Retrieval..')
    for n_neighbors in knns:
        print('Number of neighbors: ' + str(n_neighbors))
        for mult in self.multipliers:
            print('Multiplier: ' + str(mult))
            search_k = self.n_trees * n_neighbors * mult

            qArray = []
            for question in questions:
                question_body = question["body"]
                question_id = question["id"]
                qcentroid = np.transpose(np.array(get_centroid_idf(
                    question_body, self.emb, self.idf, self.stopwords, self.dim)))
                anns = u.get_nns_by_vector(qcentroid, n_neighbors, search_k)
                doc_anns = [self.idmap[n] for n in anns]
                qArray.append(Question(question_body, question_id, doc_anns))

            if not os.path.exists(directory):
                os.makedirs(directory)
            # NOTE(review): the file name uses the module-level ``n_trees``
            # while search_k uses ``self.n_trees`` - confirm they agree.
            out_path = (str(directory) + "/" + "CentIDF_annoy_" + str(n_trees) + "_"
                        + str(n_neighbors) + "_" + str(mult) + ".json")
            with open(out_path, "w+") as outfile:
                outfile.write(json.dumps(
                    {"questions": [ob.__dict__ for ob in qArray]}, indent=2))
def build_annoy_index(corpus, dimension, winlen, winstep):
    """Return ``(index, mfcc_list)`` for a corpus of MFCC frames.

    corpus: re-iterable sequence of ``(filename, frames)`` pairs (it is
        traversed twice); each frame must provide ``tolist()``.
    dimension: length of each frame vector.
    winlen, winstep: recorded in the cache file name only.

    ``mfcc_list[i]`` is the ``(filename, index_in_file)`` of Annoy item ``i``.
    The index is cached on disk under a name derived from the corpus file
    names and the feature options. When that cache exists it is loaded and no
    vectors are added to the in-memory index.
    """
    print("Adding to Annoy index")
    index = AnnoyIndex(dimension, "euclidean")

    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES
            }
    corpus_key = str([filename for filename, frames in corpus])
    if not isinstance(corpus_key, bytes):
        # hashlib needs bytes on Python 3; on Python 2 str already is bytes.
        corpus_key = corpus_key.encode("utf-8")
    cache_filename = ("annoy_index_" + hashlib.md5(corpus_key).hexdigest() + "."
                      + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items()))
                      + ".tree")
    cached = os.path.exists(cache_filename)

    mfcc_list = []
    i = 0
    for filename, frames in corpus:
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            if not cached:
                # Vectors are only needed when the index has to be built.
                index.add_item(i, mfcc.tolist())
            i += 1

    if cached:
        print("\tReading cache from %s" % cache_filename)
        index.load(cache_filename)
    else:
        print("Building Annoy index with %d trees" % ANN_NTREES)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print("\tWrote cache to %s" % cache_filename)
    return index, mfcc_list
def test_zero_vectors(self):
    """Hamming index over 64-bit strings survives a save/load round trip.

    Mentioned on the annoy-user list.
    """
    bitstrings = [
        '0000000000011000001110000011111000101110111110000100000100000000',
        '0000000000011000001110000011111000101110111110000100000100000001',
        '0000000000011000001110000011111000101110111110000100000100000010',
        '0010010100011001001000010001100101011110000000110000011110001100',
        '1001011010000110100101101001111010001110100001101000111000001110',
        '0111100101111001011110010010001100010111000111100001101100011111',
        '0011000010011101000011010010111000101110100101111000011101001011',
        '0011000010011100000011010010111000101110100101111000011101001011',
        '1001100000111010001010000010110000111100100101001001010000000111',
        '0000000000111101010100010001000101101001000000011000001101000000',
        '1000101001010001011100010111001100110011001100110011001111001100',
        '1110011001001111100110010001100100001011000011010010111100100111',
    ]
    vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

    f = 64
    idx = AnnoyIndex(f, 'hamming')
    for i, v in enumerate(vectors):
        idx.add_item(i, v)

    idx.build(10)
    idx.save('idx.ann')
    idx = AnnoyIndex(f, 'hamming')
    idx.load('idx.ann')
    js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(js[0], 0)
    self.assertEqual(ds[:4], [0, 1, 1, 22])
def create_walks(df,index_file,patient_dict_file,index_dict_file,n_neighbors = 25,walks_per_patient=10,walk_size=50,out_dir="./"):
    """Write random-walk "sentences" over the nearest-neighbour graph.

    df: only ``df.shape[1]`` is used, as the index dimensionality.
    index_file: Annoy index to load.
    patient_dict_file: two-column CSV (key, integer value).
    index_dict_file: two-column CSV mapping integer Annoy ids to patient ids.
    n_neighbors, walk_size: forwarded to ``generate_sentence``.
    walks_per_patient: number of walks written per patient.
    out_dir: prefix for the output file ``patient_walks.txt``.

    Every walk is written on its own line.
    """
    index = AnnoyIndex(df.shape[1], 'angular')
    index.load(index_file)

    # NOTE(review): patient_dict is parsed but never read below.
    with open(patient_dict_file) as fh:
        patient_dict = {key: int(val) for key, val in csv.reader(fh)}
    with open(index_dict_file) as fh:
        index_dict = {int(key): val for key, val in csv.reader(fh)}

    n_items = index.get_n_items()

    print("Computing nearest-neighbors...")
    neighbor_dict = {}
    for i in range(n_items):
        if i % 1000 == 0:
            print(str(i))
        neighbors = index.get_nns_by_item(i=i, n=n_neighbors, search_k=-1,
                                          include_distances=False)
        neighbor_dict[index_dict[i]] = [index_dict[x] for x in neighbors]

    # Text mode so that str can be written on Python 3 as well.
    with open(out_dir + "patient_walks.txt", 'w') as f:
        for i in range(n_items):
            if i % 1000 == 0:
                print(str(i))
            patient_id = index_dict[i]
            walks = [generate_sentence(start=patient_id, neighbor_dict=neighbor_dict,
                                       n_neighbors=n_neighbors, walk_size=walk_size)
                     for _ in range(walks_per_patient)]
            # Keep every walk; previously each one overwrote the one before.
            f.write("".join(walk + "\n" for walk in walks))
def test_save_without_build(self):
    """Issue #61: save an unbuilt index, reload it, then build the reloaded copy."""
    i = AnnoyIndex(10, 'angular')
    # range instead of xrange so the test also runs on Python 3.
    i.add_item(1000, [random.gauss(0, 1) for _ in range(10)])
    i.save('x.tree')
    j = AnnoyIndex(10, 'angular')
    j.load('x.tree')
    j.build(10)
def test_no_items(self):
    """An index built with no items reloads as empty and returns no neighbours."""
    idx = AnnoyIndex(100, 'angular')
    idx.build(n_trees=10)
    idx.save('foo.idx')
    idx = AnnoyIndex(100, 'angular')
    idx.load('foo.idx')
    self.assertEqual(idx.get_n_items(), 0)
    self.assertEqual(
        idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50,
                              include_distances=False),
        [])
class FeatureNN:
    """Single-nearest-neighbour lookup over a saved Euclidean Annoy index."""

    # Class-level placeholder; __init__ replaces it with the loaded index.
    tree = None

    def __init__(self, features, tree_file):
        """Load the index stored in ``tree_file``; ``features`` is the vector length."""
        loaded = AnnoyIndex(features, metric='euclidean')
        loaded.load(str(tree_file))
        self.tree = loaded

    def nn(self, x):
        """Return the id of the item closest to ``x`` (``x`` must provide ``tolist()``)."""
        closest = self.tree.get_nns_by_vector(x.tolist(), 1)
        return closest[0]
def main():
    """Interactive console: load five Annoy model splits and answer similarity queries.

    Prompts for a query and a result count, prints the nearest queries
    returned by ``get_similar_queries``, and repeats until the user types
    ``no``. An empty, zero, negative or non-numeric count falls back to 10.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    # Annoy Vector Dimension
    vec_dimension = 100
    models_dir = "/raid/ankit/ann_models/"
    default_nearest = 10

    start = time.time()
    print("Starting: Loading of memory mapped models ... ")
    # Load all models - memory mapped - quick
    models = []
    for split in range(1, 6):
        model = AnnoyIndex(vec_dimension, 'angular')
        model.load(models_dir + "model10_split%d.ann" % split)
        models.append(model)
    end = time.time()
    print("All annoy-lsh models loaded! Time Taken: " + str((end - start) / 60) + " minutes.")

    print("\nSimilar Queries - LSH Interface [All Top Queries]")
    print("----------------------------------------------------")
    while True:
        testquery = read_line("Enter Query: ").strip()
        raw_count = read_line("Number of similar queries: ")
        try:
            nearest_num = int(raw_count)
        except ValueError:
            nearest_num = default_nearest
        if nearest_num <= 0:
            nearest_num = default_nearest

        if testquery:
            lsh_list_n = get_similar_queries(testquery, nearest_num, *models)
            # Return and Print the Top 10 nearest Queries to the Original Query
            print("\nCandidate Nearest Queries [TOP 10]: ")
            for count, (query, distance) in enumerate(lsh_list_n):
                if count == nearest_num:
                    break
                print(str(query) + "\t" + str(distance))

        user_input = read_line("\nDo you wish to continue again? (Type 'no' to quit): ")
        if user_input == "no":
            print("\nGoodbye!")
            break
        print("\n")
def test_only_one_item(self):
    """A one-item index reloads with that item and returns it for any query.

    Reported to annoy-user by Kireet Reddy.
    """
    idx = AnnoyIndex(100, 'angular')
    idx.add_item(0, numpy.random.randn(100))
    idx.build(n_trees=10)
    idx.save('foo.idx')
    idx = AnnoyIndex(100, 'angular')
    idx.load('foo.idx')
    self.assertEqual(idx.get_n_items(), 1)
    self.assertEqual(
        idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50,
                              include_distances=False),
        [0])
def test_load_save(self):
    """Issue #61: saving a loaded index must not change the stored vectors."""
    i = AnnoyIndex(10, 'angular')
    i.load('test/test.tree')
    u = i.get_item_vector(99)
    i.save('x.tree')
    v = i.get_item_vector(99)
    self.assertEqual(u, v)
    j = AnnoyIndex(10, 'angular')
    j.load('test/test.tree')
    # Read from the freshly loaded index; the old code re-read ``i`` and so
    # never exercised ``j``.
    w = j.get_item_vector(99)
    self.assertEqual(u, w)
def get_tree_index(metric='angular', size=4096):
    '''
    Load the saved Annoy tree for ``metric`` together with its index mapping.

    INPUT: Optional parameters for the metric space and size of AnnoyIndex
    OUTPUT: AnnoyIndex tree, dictionary of node assignment to image names
    '''
    tree_path = DATA_DIR + 'tree_' + metric + '.ann'
    mapping_path = DATA_DIR + 'indexes_' + metric

    tree = AnnoyIndex(size, metric=metric)
    tree.load(tree_path)

    # NOTE: the mapping is unpickled; only use files from a trusted source.
    with open(mapping_path, 'rb') as mapping_file:
        indexes = pickle.load(mapping_file)
    return tree, indexes
def test_on_disk(self):
    """Build an index directly on disk, then check it again after reloading.

    Neighbours are checked on the freshly built index, on the same object
    after unload + load, and on a second object loading the same file.
    """
    dims = 2
    index_path = 'on_disk.ann'

    builder = AnnoyIndex(dims, 'euclidean')
    builder.on_disk_build(index_path)
    self.add_items(builder)
    builder.build(10)
    self.check_nns(builder)

    builder.unload()
    builder.load(index_path)
    self.check_nns(builder)

    reader = AnnoyIndex(dims, 'euclidean')
    reader.load(index_path)
    self.check_nns(reader)
def main(args):
    """ Main entry.

    For every tree count in ``args.ntrees``: build (or load) an Annoy index
    over ``data.base``, search all queries for ``args.topk`` neighbours, and
    append recall and per-query time to ``report.txt`` in ``args.exp_dir``.
    """
    data = Dataset(args.dataset)
    f = data.base.shape[1]

    for ntrees in args.ntrees:
        t = AnnoyIndex(f, 'angular')   # Length of item vector that will be indexed
        idxpath = os.path.join(args.exp_dir, 'sift_annoy_ntrees%d.idx' % ntrees)
        if not os.path.exists(idxpath):
            logging.info("Adding items ...")
            # NOTE(review): ``nbae`` is the attribute name used by Dataset here;
            # confirm it is not a typo for ``nbase``.
            for i in range(data.nbae):
                t.add_item(i, data.base[i])
                if i % 100000 == 0:
                    logging.info("\t%d/%d", i, data.nbae)
            logging.info("\tDone!")
            logging.info("Building indexes ...")
            t.build(ntrees)
            logging.info("\tDone!")
            t.save(idxpath)
        else:
            logging.info("Loading indexes ...")
            t.load(idxpath)
            logging.info("\tDone!")

        # ``np.int`` was an alias of the builtin int and has been removed.
        ids = np.zeros((data.nqry, args.topk), int)
        logging.info("Searching ...")
        tic()
        for i in range(data.nqry):
            ids[i, :] = np.array(t.get_nns_by_vector(data.query[i], args.topk))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-ntrees_%s\n" % ("Annoy", ntrees))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" % (time_costs * 1000 / data.nqry))
def load_index(path_index: PathType, meta_d: Dict) -> AnnoyIndex:
    """
    Load the Annoy index stored at ``path_index``.

    The dimensionality and metric are read from ``meta_d`` (keys ``n_dim``
    and ``metric``). We rely on ANNOY's usage of mmap to be fast loading
    (fast enough that we can load it on every single call).
    """
    index = AnnoyIndex(meta_d['n_dim'], metric=meta_d['metric'])
    index.load(str(path_index))
    index.set_seed(SEED)
    return index
class AnnoyLookup(object):
    """Nearest-neighbour lookup that maps Annoy item ids back to file names.

    ``metadata_path`` is a directory holding ``metadata.json`` (keys
    ``feature_length`` and ``filenames`` are read) and ``index.ann``.
    """

    def __init__(self, metadata_path):
        metadata_file = os.path.join(metadata_path, "metadata.json")
        with open(metadata_file) as f:
            self._data = json.load(f)

        index_file = os.path.join(metadata_path, "index.ann")
        self._index = AnnoyIndex(self._data["feature_length"], metric="angular")
        self._index.load(index_file)

    def get_neighbours(self, embedding, max_neigh=3):
        """Return ``[(filename, distance), ...]`` ordered by increasing distance."""
        items, distances = self._index.get_nns_by_vector(
            embedding, max_neigh, include_distances=True)
        ranked = list(zip(items, distances))
        ranked.sort(key=lambda pair: pair[1])
        return [(self._data["filenames"][item], dist) for item, dist in ranked]
def test_save_load(self):
    """Hamming index with two random bit vectors survives a save/load round trip."""
    f = 100
    i = AnnoyIndex(f, 'hamming')
    u = numpy.random.binomial(1, 0.5, f)
    v = numpy.random.binomial(1, 0.5, f)
    i.add_item(0, u)
    i.add_item(1, v)
    i.build(10)
    i.save('blah.ann')
    j = AnnoyIndex(f, 'hamming')
    j.load('blah.ann')
    rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(rs, [0, 1])
    self.assertAlmostEqual(ds[0], 0)
    # For 0/1 vectors the squared difference counts the differing bits.
    self.assertAlmostEqual(ds[1], numpy.dot(u-v, u-v))
class _Annoy(object):
    """Angular Annoy index over flattened feature maps, cached in ``model.ann``."""

    def __init__(self, feature):
        """Load ``model.ann`` if it exists, otherwise build it from ``feature``.

        feature: array whose 2nd and 3rd axes are flattened into one vector
            per row of the first axis.
        """
        model_path = 'model.ann'
        n_dim = feature.shape[1] * feature.shape[2]
        feature = feature.reshape(feature.shape[0], n_dim)

        self.t = AnnoyIndex(n_dim, 'angular')
        if os.path.exists(model_path):
            self.t.load(model_path)
            return

        for i, f in enumerate(tqdm(feature)):
            # Normalize by the sum; a zero sum would turn the vector into
            # NaN/inf, so such rows are stored unscaled.
            total = np.sum(f)
            v = f / total if total != 0 else f
            self.t.add_item(i, v)
        self.t.build(10)
        self.t.save(model_path)
def createAnnoyIndex(d, targetPoints, n_trees):
    """Build a Euclidean Annoy index in R^(2*d) and return a memory-mapped copy.

    Each row of ``targetPoints`` is mapped through ``projectToTorus`` before
    being added. The forest is built with ``n_trees`` trees, saved to
    ``LSHForest.ann`` and loaded back into a fresh index, which is returned.
    """
    dimension = 2 * d
    forest_path = "LSHForest.ann"

    # Add each of the projected target points, then build the LSH forest.
    builder = AnnoyIndex(dimension, metric='euclidean')
    n_points = targetPoints.shape[0]
    for row in range(n_points):
        projected = projectToTorus(targetPoints[row])
        builder.add_item(row, projected)
    builder.build(n_trees)

    # Save and load with memory map.
    builder.save(forest_path)
    mapped = AnnoyIndex(dimension, metric='euclidean')
    mapped.load(forest_path)
    return mapped
def test_save_load(self):
    """Hamming index with two random bit vectors survives a save/load round trip."""
    f = 100
    i = AnnoyIndex(f, 'hamming')
    u = numpy.random.binomial(1, 0.5, f)
    v = numpy.random.binomial(1, 0.5, f)
    i.add_item(0, u)
    i.add_item(1, v)
    i.build(10)
    i.save('blah.ann')
    j = AnnoyIndex(f, 'hamming')
    j.load('blah.ann')
    rs, ds = j.get_nns_by_item(0, 99, include_distances=True)
    # assertEqual: the assertEquals alias is deprecated and removed in 3.12.
    self.assertEqual(rs, [0, 1])
    self.assertAlmostEqual(ds[0], 0)
    # For 0/1 vectors the squared difference counts the differing bits.
    self.assertAlmostEqual(ds[1], numpy.dot(u - v, u - v))
def get_nn_by_name(name):
    """Return the Annoy ids nearest to the first row whose name contains ``name``.

    The row is looked up with a SQL ``LIKE '%name%'`` filter and its second
    column is used as the Annoy item id. Four neighbours are requested and
    the item itself is dropped from the result when present. Returns an
    empty list when no row matches.
    """
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        name_string = "%{0}%".format(name)
        result = session.query(annoy_table).filter(
            annoy_table.c.name.like(name_string)).first()
        item_id = None if result is None else result[1]
    finally:
        # Always release the session, including when the query fails.
        session.close()

    if item_id is None:
        return []

    u = AnnoyIndex(f, 'angular')
    u.load('../../test.ann')  # super fast, will just mmap the file
    list_of_near = u.get_nns_by_item(item_id, 4)  # the 4 nearest, normally including the item itself
    # Remove the current node; it can be missing from an approximate result.
    if item_id in list_of_near:
        list_of_near.remove(item_id)
    return list_of_near
class ANN:
    """Small wrapper around an angular Annoy index for single-nearest lookups."""

    def __init__(self, dimension):
        # Metric passed explicitly; relying on the default is deprecated.
        self.ann = AnnoyIndex(dimension, 'angular')

    def addVectors(self, vectors):
        """Add ``vectors`` (ids are their positions) and build the index with 10 trees."""
        for idx, v in enumerate(vectors):
            self.ann.add_item(idx, v)
        self.ann.build(10)

    def query(self, vector):
        """Return the id of the item nearest to ``vector``."""
        match = self.ann.get_nns_by_vector(vector, 1)[0]
        return match

    def save(self, filename="analogies.ann"):
        """Save the index to ``filename`` (defaults to ``analogies.ann``)."""
        self.ann.save(filename)

    def load(self, filename):
        """Load a previously saved index from ``filename``."""
        self.ann.load(filename)
def main():
    """Print, as JSON, the five catalogue images most similar to a cropped query image.

    The query image is cropped to ``--bbox`` (comma-separated integers), its
    256-bin histogram of channel 0 is computed, and the nearest items are
    looked up in ``<genre>.ann`` next to this file. Image names come from
    ``img_list.txt`` in the same directory.
    """
    parser = argparse.ArgumentParser(description='recommend system')
    parser.add_argument('--query', '-q', type=str, default="", help='query image path')
    parser.add_argument('--bbox', '-b', type=str, default="", help='bbox image')
    parser.add_argument('--genre', '-g', type=str, default="tops", help='genre')
    args = parser.parse_args()

    # ``raise ("")`` is not a valid raise; report the actual problem instead.
    if args.query == "":
        raise ValueError("a query image path is required (--query/-q)")
    if args.bbox == "":
        raise ValueError("a bounding box of comma-separated integers is required (--bbox/-b)")

    genre = args.genre
    base = os.path.dirname(os.path.abspath(__file__))
    list_path = os.path.normpath(os.path.join(base, './img_list.txt'))
    with open(list_path, "r") as f:
        data_path = [line.rstrip() for line in f]

    annoy_model = AnnoyIndex(256, 'angular')
    annoy_model.load(base + "/{}.ann".format(genre))

    query_path = args.query
    bbox = [int(item) for item in args.bbox.split(",")]
    query_img = utils.read_image(query_path, color=True)
    croped_query_img = crop_img(query_img, bbox)
    comparing_hist = cv2.calcHist([croped_query_img], [0], None, [256], [0, 256])
    # calcHist yields one column per bin count; Annoy wants a flat vector.
    predict_indexes = annoy_model.get_nns_by_vector(comparing_hist.ravel(), 5, search_k=-1)
    # Keep only the file name of each (backslash-separated) stored path.
    predict_indexes = [
        data_path[idx].split("\\")[-1] for idx in predict_indexes
    ]
    json_data = json.dumps(predict_indexes)
    print(json_data)
def closest_topK(unseen_event, concept_embedding, concept_mapping, dim, topK=10, unseen_id=None):
    """
    Embed an unseen event from its keywords and find its closest known items.

    unseen_event: (title: str, description: str)
    concept_embedding: {word_id : [emb]}
    concept_mapping: {word_id : word_string}
    dim: embedding length; also the dimensionality of the Annoy index
    topK: number of neighbours to return
    unseen_id: only referenced by the disabled genre block below

    Returns (unseen_event_vector, [(item_id, distance), ...]).
    """
    unseen_event_title_tags = jieba.analyse.extract_tags(unseen_event[0])
    # Switch textrank or embedrank
    if ARGS.embedrank:
        unseen_event_description_words = embedrank_getkeywords(unseen_event[1])
    elif ARGS.tfidf:
        unseen_event_description_words = tfidf_getkeywords(unseen_event[1])
    else:
        unseen_event_description_words = textrank_getkeywords(unseen_event[1])
    print('title words:', unseen_event_title_tags)
    print('description words:', unseen_event_description_words)
    keywords = [*unseen_event_title_tags, *unseen_event_description_words]
    # INVOLVE GENERE
    # try:
    #     for word in GENERE_TO_KEYWORDS[ID_TO_GENERE[unseen_id]]:
    #         if word not in keywords:
    #             keywords.append(word)
    # except KeyError:
    #     pass
    ### END OF INVOLVING GENERE
    print("keywords", keywords)

    # Generate the label embedding for a new item; unknown keywords are skipped.
    event_concept_embeddings = []
    for word in keywords:
        try:
            event_concept_embeddings.append(concept_embedding[concept_mapping[word]])
        except KeyError:
            continue
    # Component-wise mean of the keyword embeddings.
    unseen_event_vector = [
        sum(value) / len(value) for value in zip(*event_concept_embeddings)]
    if not unseen_event_vector:
        # No keyword had an embedding: fall back to the zero vector.
        unseen_event_vector = [0] * dim

    annoy_index = AnnoyIndex(dim, 'angular')
    annoy_index.load('cc2vec_textrank.ann')
    # Find the topK closest items according to the label embedding
    # (the count used to be hard-coded to 10, ignoring ``topK``).
    ids, scores = annoy_index.get_nns_by_vector(
        unseen_event_vector, topK, search_k=-1, include_distances=True)
    propgation_list = list(zip(ids, scores))
    return unseen_event_vector, propgation_list
def get_top_k_tables(sample_info_dict, id_to_index, index_file, dim, k):
    """Rank each sample's true table among its Annoy neighbours.

    For every entry of ``sample_info_dict`` the position of its own table in
    the neighbour list of its embedding is computed. When that position is
    below ``k``, the entry gains ``top_k`` (first ``k`` neighbour ids) and
    ``labels`` (one-hot list of length ``k``). Returns the list of positions.
    """
    index = AnnoyIndex(dim, 'angular')
    index.load(index_file)

    ranks = []
    for info in sample_info_dict.values():
        table_id, embedding = info['table_id'], info['embedding']
        target = id_to_index[table_id]
        neighbours = index.get_nns_by_vector(embedding, 1000000)
        position = neighbours.index(target)
        if position < k:
            one_hot = [0] * k
            one_hot[position] = 1
            info['top_k'] = neighbours[:k]
            info['labels'] = one_hot
        ranks.append(position)
    return ranks
def debug():
    """Build, save and reload a random 40-dimensional index, then print neighbours of item 0."""
    f = 40
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    # range instead of xrange so this also runs on Python 3.
    for i in range(1000):
        v = [random.gauss(0, 1) for _ in range(f)]
        t.add_item(i, v)

    t.build(10)  # 10 trees
    t.save('test.ann')

    # ...

    u = AnnoyIndex(f, 'angular')
    u.load('test.ann')  # super fast, will just mmap the file
    print(u.get_nns_by_item(0, 1000))  # will find the 1000 nearest neighbors
def load_index(self, index_id):
    """Load the Annoy index ``index_id`` and make it the active one.

    The new index is loaded before the previous one is unloaded, so
    ``self.annoy_index`` is only replaced once the new index is ready.
    """
    if self.annoy_index is None:
        # Report the id being loaded, not the not-yet-updated current id.
        log.info("loading initial index with id {}", index_id)
    else:
        log.info("switching index from {} to {}", self.current_index, index_id)

    newindex = AnnoyIndex(108, metric='euclidean')
    newindex.load(config.index_config['index_path'] + 'index_' + str(index_id) + '.ann')
    if self.annoy_index is not None:
        self.annoy_index.unload()
    self.annoy_index = newindex
    self.current_index = index_id
    log.info("finished switching index. now using index {}", self.current_index)
def test_build_sparse_annoy_index(annoy_index_file):
    """An index built from a sparse 10x5 matrix matches the copy reloaded from disk."""
    dense = np.random.choice([0, 1], size=(10, 5))
    built = build_annoy_index(csr_matrix(dense), annoy_index_file)

    assert os.path.exists(annoy_index_file)

    reloaded = AnnoyIndex(5, metric='angular')
    reloaded.load(annoy_index_file)

    # Same dimensionality, item count and neighbours on both objects.
    assert built.f == reloaded.f
    assert reloaded.f == 5
    assert built.get_n_items() == reloaded.get_n_items()
    assert reloaded.get_n_items() == 10
    assert built.get_nns_by_item(0, 5) == reloaded.get_nns_by_item(0, 5)

    built.unload()
    reloaded.unload()
def _load_index(self, wherefrom, index_key):
    """Load the AnnoyIndex stored as ``index_key`` inside ``wherefrom``.

    The dimensionality depends on which index is requested: the number of
    item-factor columns for ``"similar_items_index"``, otherwise the
    estimator's ``extra_`` attribute.
    """
    estimator = self.estimator_

    if index_key != "similar_items_index":
        # Otherwise it is "recommend_index". This assumes
        # approximate_recommend, since that is the only way this code is
        # ever reached.
        n_dims = estimator.extra_
    else:
        n_dims = estimator.item_factors.shape[1]

    loaded = AnnoyIndex(n_dims, "angular")
    loaded.load(join(wherefrom, index_key))
    return loaded
def run(self):
    """Query neighbours for this worker's row range and push them to the results queue.

    Rows ``data_indices[0]`` up to (not including) ``data_indices[1]`` of
    ``self.X`` are searched. Any exception is stored on ``self.exception``
    instead of propagating, and the queue is always closed.
    """
    try:
        ann = AnnoyIndex(self.n_dims, metric='euclidean')
        ann.load(self.index_filepath)
        for row in range(self.data_indices[0], self.data_indices[1]):
            found = ann.get_nns_by_vector(
                self.X[row, :],
                self.k,
                search_k=self.search_k,
                include_distances=False)
            as_array = np.array(found, dtype=np.uint32)
            self.results_queue.put(
                IndexNeighbours(row_index=row, neighbour_list=as_array))
    except Exception as e:
        self.exception = e
    finally:
        self.results_queue.close()
def test_on_disk(self):
    """An index built directly on disk gives the expected neighbours after a reload."""
    dims = 2
    index_path = 'test.ann'
    points = ([2, 2], [3, 2], [3, 3])
    expectations = (
        ([4, 4], [2, 1, 0]),
        ([1, 1], [0, 1, 2]),
        ([4, 2], [1, 2, 0]),
    )

    i = AnnoyIndex(dims, 'euclidean')
    i.on_disk_build(index_path)
    for item_id, point in enumerate(points):
        i.add_item(item_id, point)
    i.build(10)
    i.unload()
    i.load(index_path)

    for query, expected in expectations:
        self.assertEqual(i.get_nns_by_vector(query, 3), expected)
def test_celeba_embedding(self):
    """Compare Annoy and PyNSW against reference neighbours on CelebA embeddings.

    Input locations default to files in the ``data`` directory one level
    above this file and can be overridden through the environment variables
    ``PATHS_JSON``, ``EMBEDDING_JSON``, ``INDEX_FILENAME`` and
    ``TEST_CASES_FILENAME``. Prints the share of reference neighbours each
    index recovers.
    """
    def data_file(env_var, filename):
        # The environment variable wins over the bundled data file.
        default = os.path.abspath(os.path.join(__file__, '..', '..', 'data', filename))
        return os.getenv(env_var, default)

    PATHS_JSON = data_file('PATHS_JSON', 'paths_celeba.json')
    EMBEDDING_JSON = data_file('EMBEDDING_JSON', 'embeddings_celeba.json')
    INDEX_FILENAME = data_file('INDEX_FILENAME', 'index_celeba.ann')
    TEST_CASES_FILENAME = data_file('TEST_CASES_FILENAME', 'index_celeba_test_cases.json')

    with open(PATHS_JSON, 'r') as fp:
        print('Loading paths')
        paths = np.array(json.load(fp))
    with open(EMBEDDING_JSON, 'r') as fp:
        print('Loading embeddings')
        embeddings = json.load(fp)
    with open(TEST_CASES_FILENAME, 'r') as fp:
        print('Loading test_cases')
        test_cases = json.load(fp)

    annoy = AnnoyIndex(len(embeddings[0]), 'angular')
    annoy.load(INDEX_FILENAME)

    print('building nsw index')
    nsw_index = PyNSW('l2')
    print('Creating nodes')
    nodes = [create_node(path, vector) for path, vector in zip(paths, embeddings)]
    print('Inserting nodes')
    for node in tqdm(nodes):
        nsw_index.nn_insert(node, 5, 1000)

    n, k_annoy, k_nsw = 0, 0, 0
    print('Calculating accuracy on CelebA')
    for tk in test_cases:
        vector = embeddings[int(tk['embedding_index'])]
        closest_paths_real = tk['closest_paths_real']
        closest_paths_annoy = paths[annoy.get_nns_by_vector(vector, 10, 1000)]
        # ``hit`` rather than ``n``: on Python 2 a list comprehension leaks its
        # loop variable and would overwrite the counter ``n``.
        closest_paths_nsw = [hit[1] for hit in
                             nsw_index.nn_search(create_node('kek', vector), 5, 10)]

        assert len(closest_paths_real) == 10
        assert len(closest_paths_annoy) == 10
        assert len(closest_paths_nsw) == 10

        n += 10
        k_annoy += len(set(closest_paths_annoy).intersection(closest_paths_real))
        k_nsw += len(set(closest_paths_nsw).intersection(closest_paths_real))

    print('Annoy accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_annoy / n))
    print('NSW accuracy on CelebA embeddings: {:.3f}%'.format(100.0 * k_nsw / n))
class ImageSearchAnnoy:
    '''
    Approximate nearest-neighbour image search over a saved Annoy index.

    Item ids are line numbers of the image list file. Per the Annoy
    documentation, its angular distance is dist(u,v) = sqrt(2(1-cos(u,v))).
    '''

    def __init__(self,dimensions,annf='ILSVRC2015.ann',imageListPath = '/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt'):
        # Line number -> file name, with trailing whitespace stripped.
        names = {}
        with open(imageListPath, 'r') as listing:
            for line_no, line in enumerate(listing):
                names[line_no] = line.rstrip()
        self.line_to_file = names

        self.A = AnnoyIndex(dimensions, 'angular')
        self.A.load(annf)

    def run_query(self,query,n=100,accuracy_factor = 2):
        """Return ``zip(file_names, distances)`` for the ``n`` nearest items.

        ``accuracy_factor`` scales Annoy's ``search_k``.
        """
        budget = n * int(accuracy_factor) * 128
        nearest, scores = self.A.get_nns_by_vector(
            query, n, search_k=budget, include_distances=True)
        file_names = (self.line_to_file[item] for item in nearest)
        return zip(file_names, scores)
def test(self):
    """Time loading the saved 4096-dimensional index ``a`` and one neighbour query.

    The index file is expected to exist already. The disabled build code
    created it with ``on_disk_build('a')`` from 100000 random vectors and
    ``build(100)``.
    """
    t = time.time()
    annoyIndex = AnnoyIndex(4096, 'angular')
    annoyIndex.load('a')
    print(annoyIndex.get_nns_by_item(0, 5))
    print(time.time() - t)
def init():
    """Populate the module-level ``indices`` table.

    For every (dim, size) combination the table gets ``index`` (the loaded
    20-dimensional angular Annoy index), ``exts`` (the extraction table read
    from CSV) and ``ext2idx`` (``"modifier;aspect"`` -> row position).
    """
    global indices
    indices = defaultdict(lambda: defaultdict(dict))

    for dim in ['matrix', 'tensor']:
        for size in [500, 5000]:#, 1000, 5000, 10000]:
            folder = 'data/' + dim + '/200x' + str(size)
            entry = indices[dim][size]

            # loading the index
            ann = AnnoyIndex(20, 'angular')
            ann.load(folder + '/embd.ann')
            entry['index'] = ann

            # loading the extractions
            exts = pd.read_csv(folder + '/extr_index.csv')
            ext2idx = {
                row['modifier'] + ';' + row['aspect']: position
                for position, (_, row) in enumerate(exts.iterrows())
            }
            entry['exts'] = exts
            entry['ext2idx'] = ext2idx
def load_indexes(ann_filepath=None, celeb_mapping_path=None):
    """Return ``(ann_index, celeb_mapping_dict)`` for celebrity lookup.

    When a path is not given, the default file in the user's home directory
    is used and downloaded from Google Drive if it is missing. The mapping
    file (name -> list of ids) is inverted into id -> name.
    """
    home = expanduser("~")

    if ann_filepath is None:
        ann_filepath = os.path.join(home, 'celeb_index_60.ann')
        celeb_ann_id = '1-3Wb7fiINbrk9FSagTxjLdSjp7KzrMp7'
        if not os.path.exists(ann_filepath):
            download_file_from_google_drive(celeb_ann_id, ann_filepath)

    if celeb_mapping_path is None:
        celeb_mapping_path = os.path.join(home, 'celeb_mapping.json')
        celeb_mapping_file_id = '1wDaaSQ6NjxLkxpzYyTRknefizZUKnKDj'
        if not os.path.exists(celeb_mapping_path):
            download_file_from_google_drive(celeb_mapping_file_id, celeb_mapping_path)

    ann_index = AnnoyIndex(2048, 'angular')
    ann_index.load(ann_filepath)

    with open(celeb_mapping_path) as json_file:
        name_to_ids = json.load(json_file)

    # Invert name -> [ids] into id -> name.
    celeb_mapping_dict = {}
    for name, ids in name_to_ids.items():
        for celeb_id in ids:
            celeb_mapping_dict[celeb_id] = str(name)
    return ann_index, celeb_mapping_dict
def fetch_topK_similar(items_vec_file, ann_model_file, dim, topK, item_idx_map, items_list_batch, ddb_table, company_label):
    """Compute top-K similar items for a batch of items and store them.

    items_vec_file: text file whose first line is ``<num_items> <dim>`` and
        whose other lines are ``<action>:<content_id> <float> <float> ...``.
    ann_model_file: angular Annoy index of dimensionality ``dim``.
    item_idx_map: Annoy item id -> ``<action>:<name>`` string.
    items_list_batch: iterable of lists of item ids to process.
    ddb_table, company_label: forwarded to ``insert_ddb``.

    For every requested item up to ``topK`` distinct neighbours are kept
    with score ``1 - distance`` and stored under the field matching the
    item's action. Results are written with ``insert_ddb`` when non-empty.
    """
    b_time = time.time()
    log.debug("[fetch_topK_similar] Start to get topK items")
    ann_model = AnnoyIndex(dim, 'angular')
    ann_model.load(ann_model_file)
    update_data = {}
    items_set = {item for sublist in items_list_batch for item in sublist}
    print(items_list_batch)
    print(items_set)

    # Record field populated by each action.
    similar_field = {
        Action.View.value: 'view_similar',
        Action.AddToCart.value: 'add_cart_similar',
        Action.Purchase.value: 'purchase_similar',
    }

    with open(items_vec_file, 'r') as in_f:
        # Header line; unpacked into private names so ``dim`` is not shadowed.
        _num_items, _file_dim = in_f.readline().strip().split()
        for line in in_f:
            tmp = line.split()
            if not tmp:
                continue
            item_id = tmp[0]
            if item_id not in items_set:
                continue

            action, content_id = item_id.split(':', 1)
            # NOTE(review): ``item_label`` was never assigned in the previous
            # code; the content id is assumed to be the intended key - confirm.
            item_label = content_id
            item_emb = list(map(float, tmp[1:]))
            if item_label not in update_data:
                update_data[item_label] = {'item_id': item_label, 'label': company_label}

            res_dict = OrderedDict()
            # Over-fetch so that topK distinct names remain after de-duplication.
            topK_item, topK_dist = ann_model.get_nns_by_vector(
                item_emb, topK*3, include_distances=True)
            for item_idx, dist in zip(topK_item, topK_dist):
                raw_name = None
                try:
                    raw_name = item_idx_map[item_idx]
                    item = raw_name.split(':', 1)[1].strip()
                    if item not in res_dict:
                        res_dict[item] = Decimal(f"{1-dist:.4f}")  # Todo: maybe do score normalize here
                except Exception as err:
                    log.error(err)
                    # Must not index item_idx_map again here: that lookup may
                    # be exactly what failed.
                    shown = raw_name if raw_name is not None else item_idx
                    log.warning(f"Couldn't find item name : {shown}")
                if len(res_dict) == topK:
                    break

            field = similar_field.get(action)
            if field is None:
                log.warning(f"{item_id} -> {action} not a valided action...")
                continue
            update_data[item_label][field] = res_dict

    log.debug(f"[Time|fetch_topK_similar] Cost : {time.time() - b_time}")
    if len(update_data) > 0:
        insert_ddb(ddb_table, company_label, update_data)
class ChexSearch(object):
    """
    Searches Chex index for game states and associated games.
    """
    #TODO: Combine results of board transforms with binary search algo.

    def __init__(self, chex_index, results=10, search_k=40):
        """Open the Annoy index and the sqlite store inside directory ``chex_index``.

        results: number of neighbours requested per board variant.
        search_k: stored on the instance.
        """
        self.chex_index = chex_index
        self.results = results
        # NOTE(review): search_k is stored but never passed to Annoy in search().
        self.search_k = search_k
        self.annoy_index = AnnoyIndex(_bitboard_length, metric='angular')
        self.annoy_index.load(os.path.join(self.chex_index, 'annoy.idx'))
        self.chex_sql = SqliteDict(os.path.join(self.chex_index, 'sqlite.idx'))

    @staticmethod
    def _bitboard_to_key(bitboard):
        """Pack a sequence of 0/1 values into the byte string used as store key."""
        hex_digits = '%x' % int(''.join(map(str, map(int, bitboard))), 2)
        # unhexlify needs an even number of digits. Pad up front rather than
        # catching the failure, whose exception type differs between
        # Python 2 and Python 3.
        if len(hex_digits) % 2:
            hex_digits = '0' + hex_digits
        return binascii.unhexlify(hex_digits)

    def search(self, board):
        """
        Searches for board.

        board: game object of type chess.Board

        Return value: [
            (board, similarity score, [(game_id, move number), ...]), ...]
        """
        symmetrical_boards = [
            board_to_bitboard(board),
            invert_board(board),
            flip_board(board),
            reverse_and_flip(board)
        ]
        results = []
        for query_bitboard in symmetrical_boards:
            annoy_ids, similarities = self.annoy_index.get_nns_by_vector(
                query_bitboard, self.results, include_distances=True)
            for annoy_id, similarity in zip(annoy_ids, similarities):
                # Recompute the store key from the vector held by the index.
                stored = self.annoy_index.get_item_vector(annoy_id)
                key = self._bitboard_to_key(stored)
                results.append((bitboard_to_board(stored), similarity,
                                self.chex_sql[key]))
        return results

    def close(self):
        """Drop the reference to the Annoy index."""
        del self.annoy_index
def main():
    """Estimate 1-nearest-neighbour accuracy on MNIST with an Annoy index."""
    # Load the MNIST images.
    train_imgs, train_lbls, test_imgs, test_lbls = load_mnist()
    print(train_imgs.shape, train_lbls.shape, test_imgs.shape, test_lbls.shape)

    db_path = './static/mnist_db.ann'
    if not os.path.isfile(db_path):
        make_annoy_db(train_imgs)  # build the Annoy database

    annoy_db = AnnoyIndex((28 * 28), metric='euclidean')
    annoy_db.load(db_path)  # load the Annoy database

    # Feed in the test data, fetch the nearest neighbour of each image and
    # compare its label with the true one to get a rough accuracy figure.
    y_pred = []
    for test_img in test_imgs:
        nearest_id = annoy_db.get_nns_by_vector(test_img.flatten(), 1)[0]
        y_pred.append(train_lbls[nearest_id])

    score = accuracy_score(test_lbls, y_pred)
    print('acc:', score)
def baseline_train(olddata, f, trees):
    """Return an Annoy index over the training data, building it when no saved model exists.

    olddata: DataFrame to train with; the ``latitude``, ``longitude`` and
        ``time_period`` columns form each item vector and the index labels
        are used as item ids.
    f: number of features (length of each item vector).
    trees: number of trees to build.

    The index is loaded from ``saving_model`` when that file exists;
    otherwise it is built and saved there.
    """
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print("Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data...")
        t.load(saving_model)
    else:
        print("Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data...")
        for i in olddata.index:
            # .loc replaces the removed .ix; ``i`` is always an index label here.
            v = list(olddata.loc[i, ['latitude', 'longitude', 'time_period']])
            t.add_item(i, v)
        print("Building the trees...")
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print("Saving the model...")
        t.save(saving_model)  # Can easily be loaded into memory later.
    return (t)
def test1(self):
    """Print row 10 and the rows of its five nearest neighbours from the saved index.

    The index file ``articles`` is expected to exist. The disabled build
    code created it by encoding each row's second field with ``self.bc``,
    then ``build(10)`` and ``save('articles')``.
    """
    rows = self.query_country_name('%')
    annoyIndex = AnnoyIndex(768, 'angular')
    annoyIndex.load('articles')
    # With include_distances=True the second value holds distances, not ids.
    result, distances = annoyIndex.get_nns_by_item(10, 5, include_distances=True)
    print(rows[10])
    # NOTE(review): this takes the cosine of Annoy distances, which are not
    # angles - confirm the intended quantity.
    print(np.cos(distances))
    for i in result:
        print(rows[i])
def get_similar_items(self, product_id: int, rec_type: int) -> pd.DataFrame:
    '''
    Function that creates recommendation lists.

    The intuition behind using less components is reducing the number of
    latent factors that can be inferred. And, by excluding item features for
    the CAB model, recommendations will be less based off explicit features
    such as `aisle` and `department`.

    -------------------
    product_id: Annoy item id whose neighbours are looked up
    rec_type:
        1 - Similar Items [DEFAULT_PARAMS]
        2 - Complement Items [CAB_PARAMS]

    Returns the rows of `self.item_df` selected positionally by the
    neighbour ids returned by Annoy.

    Raises ValueError when `rec_type` is neither 1 nor 2.
    '''
    logging.info(
        f'Logging recommendations for {self.model.config.ANNOY_PARAMS[rec_type]}'
    )

    if rec_type == 1:
        no_components = self.model.config.LIGHTFM_PARAMS['no_components']
        index_file = '/item.ann'
    elif rec_type == 2:
        no_components = self.model.config.LIGHTFM_CAB_PARAMS['no_components']
        index_file = '/item_cab.ann'
    else:
        # Without this guard no index would be created and the lookup below
        # would fail with an UnboundLocalError.
        raise ValueError(f'Unsupported rec_type: {rec_type!r} (expected 1 or 2)')

    annoy_model = AnnoyIndex(no_components)
    annoy_model.load(self.config.PATHS.models + index_file)

    similar_variants = annoy_model.get_nns_by_item(
        product_id,
        self.model.config.ANNOY_PARAMS['nn_count'],
        search_k=-1,
        include_distances=False)

    logging.info(type(similar_variants))
    logging.info(similar_variants)

    similar_variants_df = self.item_df.iloc[similar_variants, :]

    # The table is built for logging only; each cell holds a whole column.
    similarVariantsTable = PrettyTable(
        ['product_id', 'product_name', 'aisle', 'department', 'num'])
    similarVariantsTable.add_row([
        similar_variants_df['product_id'],
        similar_variants_df['product_name'],
        similar_variants_df['aisle'],
        similar_variants_df['department'],
        similar_variants_df['num']
    ])
    logging.info(
        f'{self.model.config.ANNOY_PARAMS[rec_type]} Data: \n{similarVariantsTable}'
    )

    return similar_variants_df
def baseline_train(olddata, f, trees):
    """
    Load the saved Annoy index, or build and save it from olddata.

    If the file named by saving_model exists the index is loaded from it;
    otherwise every row of olddata is added, the trees are built and the
    index is saved to saving_model.

    olddata: DataFrame with "latitude", "longitude" and "time_period"
        columns; its index labels are used as the Annoy item ids
    f: number of features, i.e. the length of each item vector
    trees: number of trees to build

    Returns the AnnoyIndex.
    """
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print("Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data...")
        t.load(saving_model)
        return t

    print("Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data...")
    feature_columns = ["latitude", "longitude", "time_period"]
    for i in olddata.index:
        # .loc is the label-based replacement for the removed .ix indexer;
        # i always comes from olddata.index, so the lookup is by label.
        v = list(olddata.loc[i, feature_columns])
        t.add_item(i, v)
    print("Building the trees...")
    t.build(trees)
    # Sanity check: the index should report one item per row of olddata.
    assert t.get_n_items() == olddata.shape[0]
    print("Saving the model...")
    t.save(saving_model)  # Can easily be loaded into memory later.
    return t
def read_annoy(embedding, n_neighbors=1,
               index_path='./neural_networks/models/saved_annoy.ann',
               dim=64):
    """
    Look up the nearest neighbours of each embedding in a saved Annoy index.

    embedding: iterable of query vectors, each of length `dim`
    n_neighbors: number of neighbours to return per query (default 1)
    index_path: path of the saved Annoy index file
    dim: embedding size the index was built with (default 64)

    Returns (neighbors, distances): two lists with one entry per query, each
    entry being the list of item ids / distances returned by Annoy.
    """
    index = AnnoyIndex(dim, 'euclidean')
    index.load(index_path)  # super fast, will just mmap the file

    neighbors = []
    distances = []
    for emb in embedding:
        neighbor, dist = index.get_nns_by_vector(emb,
                                                 n_neighbors,
                                                 search_k=-1,
                                                 include_distances=True)
        neighbors.append(neighbor)
        distances.append(dist)
    return neighbors, distances
def similarity_search(image, num_closest_items, hash_table_file_path,
                      image_hash_file_path):
    """
    Find the indexed images most similar to the input.

    Get the feature set associated with the image, then use it to query the
    Annoy index and translate the returned item ids into image paths.

    image: the input image.
        NOTE(review): this argument is not used below; the features are
        computed from `model_path` / `image_path`, which are not defined in
        this function -- confirm where they come from.
    num_closest_items: number of neighbours to fetch per feature vector
    hash_table_file_path: path of the saved Annoy index
    image_hash_file_path: path of the pickle that maps Annoy item ids to
        image paths (presumably a dict or list; verify against the writer)

    Returns a list with one entry per feature vector; each entry is a list
    of (image_path, distance) tuples.
    """
    import numpy as np  # local import keeps the block self-contained

    graph = create_graph(model_path)
    features = get_features_from_graph(graph, image_path)

    # Treat the features as rows of query vectors: a single vector becomes
    # one row. The index dimension is the vector length, not the row count.
    feature_array = np.asarray(features, dtype=float)
    query_vectors = feature_array.reshape(-1, feature_array.shape[-1])

    hash_table = AnnoyIndex(query_vectors.shape[1])
    hash_table.load(hash_table_file_path)

    # pickle.load needs an open binary file, not a path.
    # SECURITY: unpickling executes arbitrary code; only load trusted files.
    with open(image_hash_file_path, 'rb') as image_hash_file:
        image_hash_table = pickle.load(image_hash_file)

    results = []
    for query_vector in query_vectors:
        item_ids, distances = hash_table.get_nns_by_vector(
            query_vector.tolist(), num_closest_items, include_distances=True)
        # Translate each integer id into its image path.
        matches = [(image_hash_table[item_id], distance)
                   for item_id, distance in zip(item_ids, distances)]
        print(matches)
        results.append(matches)
    return results
class Annoy(ANN):
    """
    ANN backend implemented with the Annoy library.
    """

    def load(self, path):
        # Recreate the index from the stored configuration, then read the file
        dimensions, metric = self.config["dimensions"], self.config["metric"]
        self.model = AnnoyIndex(dimensions, metric)
        self.model.load(path)

    def index(self, embeddings):
        # Inner product is equal to cosine similarity on normalized vectors
        self.config["metric"] = "dot"

        # Fresh index using the configured dimensions and the metric set above
        self.model = AnnoyIndex(self.config["dimensions"], self.config["metric"])

        # Each row is stored under its row position
        row_count = embeddings.shape[0]
        for row in range(row_count):
            self.model.add_item(row, embeddings[row])

        # Build with the configured number of trees (10 when not set)
        ntrees = self.setting("ntrees", 10)
        self.model.build(ntrees)

    def search(self, queries, limit):
        # Search k setting, -1 when not configured
        searchk = self.setting("searchk", -1)

        def lookup(query):
            # Single query mapped to [(id, score)]
            ids, scores = self.model.get_nns_by_vector(query, n=limit, search_k=searchk, include_distances=True)
            return list(zip(ids, scores))

        # Annoy has no batch query method, so queries run one at a time
        return [lookup(query) for query in queries]

    def save(self, path):
        # Persist the index to disk
        self.model.save(path)
class FoodGetter:
    """Recipe lookup backed by a Doc2Vec model and an Annoy index."""

    def __init__(self):
        # Filled in by load(); nothing can be queried before that.
        self.data = None
        self.doc2vec_model = None
        self.length = None
        self.annoy_model = None

    def load(self, data_path, doc2vec_path, annoy_path):
        """
        Cannot be trained; only ready-made models can be loaded from disk.
        Loading takes about 5 - 7 seconds, which is normal.
        """
        self.data = pd.read_csv(data_path)
        # SECURITY: eval executes whatever the CSV cells contain; only load
        # trusted data files.
        self.data.ingredients = self.data.ingredients.apply(eval)
        self.data.steps = self.data.steps.apply(eval)
        self.doc2vec_model = gensim.models.doc2vec.Doc2Vec.load(doc2vec_path)
        # The index dimension is the length of an inferred vector.
        self.length = len(self.doc2vec_model.infer_vector([" "]))
        self.annoy_model = AnnoyIndex(self.length, 'angular')
        self.annoy_model.load(annoy_path)

    def find(self, _input, N=5):
        """
        Input: a string of space-separated ingredients.
        Output: a list of at most N tuples (name, ingredients joined by
        spaces, index).
        An empty list is returned when nothing is found.
        """
        _input = _input.split(" ")
        _res = set(_input)
        idx = self.annoy_model.get_nns_by_vector(
            self.doc2vec_model.infer_vector(_input), 1000, search_k=2000)

        ans = []
        for index in idx:
            if len(ans) == N:
                break
            ingredients = set(self.data.ingredients[index])
            # A recipe without ingredients can never match; skipping it also
            # avoids dividing by zero below.
            if not ingredients:
                continue
            # Keep recipes where more than 55% of the ingredients were asked
            # for.
            if len(_res & ingredients) / len(ingredients) > 0.55:
                temp = self.data.loc[index]
                ans.append((temp["name"], " ".join(temp.ingredients), index))
        return ans

    def get_steps(self, idx):
        "For the chosen index, return the cooking steps with their numbers"
        return enumerate(self.data.loc[idx].steps)
def load(self):
    """
    (Re)load every Annoy index file and the saved state from index_dir.

    Files ending in ".ann" are loaded in sorted order and appended to
    self.indexes; a file ending in "saved_state" is read into
    self.mem_store. self.prev_id ends up as the total item count minus one.
    """
    logger.info("Loading index {0}".format(self.actor_urn))
    # Unload previously loaded indexes BEFORE the list is reset; resetting
    # first would leave nothing to iterate over and the old indexes would
    # never be unloaded.
    for index in getattr(self, 'indexes', []):
        index.unload()
    self.prev_id = -1
    self.indexes = []
    # NOTE(review): self.index_files is appended to but not reset here, so
    # calling load() again adds the same paths again -- confirm intended.
    for f in sorted(listdir(self.index_dir)):
        if f.endswith(".ann"):
            self.index_files.append(join(self.index_dir, f))
            index = AnnoyIndex(self.feat_size, metric='euclidean')
            index.load(join(self.index_dir, f))
            self.indexes.append(index)
            self.prev_id += index.get_n_items()
        elif f.endswith('saved_state'):
            self.mem_store = np.load(join(self.index_dir, f)).tolist()
    logger.info("Loaded {0} files with total {1} records for index {2}"
                .format(len(self.indexes), self.prev_id + 1, self.actor_urn))
class SimilarStringStore:
    """Annoy-backed store for looking up strings by feature-vector similarity."""

    def __init__(self, **kwargs):
        # NOTE: keyword arguments are accepted but not used.
        self.transformer = FeatureGenerator(k=1)
        print(self.transformer.n_features)
        self.store = AnnoyIndex(self.transformer.n_features)

    def vectorize(self, s):
        ''' turn a string into its feature vector '''
        return self.transformer.transform(s)

    def add(self, id, s):
        ''' add a string to index and return its vector '''
        vector = self.vectorize(s)
        self.store.add_item(int(id), vector)
        return vector

    def build(self, n_trees=500):
        ''' build the index with n_trees trees (500 by default) '''
        self.store.build(n_trees)

    def save(self, filename='store.knn'):
        ''' write the index to filename '''
        self.store.save(filename)

    def build_and_save(self, filename='store.knn'):
        ''' build the index with the default tree count, then save it '''
        self.build()
        self.save(filename)

    def load(self, filename='store.knn'):
        ''' load a previously saved index from filename '''
        self.store.load(filename)

    def query(self, s, n=40):
        ''' query index for the n (40 by default) nearest ids to s '''
        vector = self.vectorize(s)
        neighbors = self.store.get_nns_by_vector(vector, n)
        return neighbors

    def remove(self, id):
        ''' remove a string from the index (not implemented, does nothing) '''
        pass
def create_collage(input_image, profile_name, version_count):
    """
    given an input image and an existing profile, create a set of new collages

    input_image: path of the template image
    profile_name: name of the profile folder under PROFILES_DIRECTORY; it
        holds the Annoy index "<profile_name>.tree" and the pickled
        sub-image list "<profile_name>.p"
    version_count: number of collages to create; collage i uses the i-th
        nearest neighbour for every tile, so the index must hold at least
        version_count items

    The collages are written to OUTPUT_DIRECTORY as "0.png", "1.png", ...
    """
    profile_folder = PROFILES_DIRECTORY + profile_name + "/"
    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)
    # todo: load feature dimensions from profile
    nns_index = AnnoyIndex(SAMPLE_DIMENSION[0] * SAMPLE_DIMENSION[1],
                           metric="euclidean")
    print("loading trees...")
    nns_index.load(profile_folder + profile_name + ".tree")
    print("done.")
    # The profile pickle is produced by this tool; do not load untrusted
    # files with pickle. The context manager closes the file handle.
    with open(profile_folder + profile_name + ".p", "rb") as profile_file:
        subimage_index = pickle.load(profile_file)
    template_image = Image.open(input_image)
    image_width, image_height = template_image.size[0], template_image.size[1]
    crop_width = subimage_index[-1]["crop_width"]
    crop_height = subimage_index[-1]["crop_height"]
    for i in range(version_count):
        print("Creating collage {}/{}...".format(i + 1, version_count))
        output_image = template_image.copy()
        # "+ 1" keeps the last full column/row when the image size is an
        # exact multiple of the crop size (range stops are exclusive).
        for x in range(0, image_width - crop_width + 1, crop_width):
            for y in range(0, image_height - crop_height + 1, crop_height):
                box = (x, y, x + crop_width, y + crop_height)
                crop_box = output_image.crop(box)
                crop_sample = crop_box.convert("LA").resize(SAMPLE_DIMENSION)
                # Only the first band of each "LA" pixel is used.
                gs_pixeldata = [pixel[0] for pixel in crop_sample.getdata()]
                image_neighbor = nns_index.get_nns_by_vector(
                    gs_pixeldata, version_count)[i]
                substitute_image = Image.open(
                    subimage_index[image_neighbor]["image"])
                substitute_crop = substitute_image.crop(
                    subimage_index[image_neighbor]["box"])
                output_image.paste(substitute_crop, box)
        output_path = OUTPUT_DIRECTORY + str(i) + ".png"
        output_image.save(output_path, "PNG")
        print("done.")
    print("{} image(s) saved in {}".format(version_count, OUTPUT_DIRECTORY))
    return
def merge_indicies(self, index_file_a, index_file_b, sender_urn):
    """
    Merge two saved Annoy indexes into a new one and notify the sender.

    The items of index_file_a are copied first, then those of index_file_b,
    numbered consecutively from 0. The merged index is saved as
    index_file_a + ".merged" and complete_compaction is called on the actor
    identified by sender_urn.
    """
    logger.info("Merging {0} and {1} for {2} index".format(index_file_a, index_file_b, sender_urn))

    index_a = AnnoyIndex(self.feat_size, metric='euclidean')
    index_b = AnnoyIndex(self.feat_size, metric='euclidean')
    new_index = AnnoyIndex(self.feat_size, metric='euclidean')
    index_a.load(index_file_a)
    index_b.load(index_file_b)

    # Copy every vector across, a before b, under a running item id.
    cnt = 0
    for source in (index_a, index_b):
        for item in range(source.get_n_items()):
            new_index.add_item(cnt, source.get_item_vector(item))
            cnt += 1

    new_index_file = index_file_a + ".merged"

    # The sources are no longer needed once their vectors are copied.
    index_a.unload()
    index_b.unload()

    new_index.build(self.n_trees)
    new_index.save(new_index_file)
    logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
        index_file_a,
        index_file_b,
        sender_urn,
        cnt))
    new_index.unload()

    sender = pykka.ActorRegistry.get_by_urn(sender_urn).proxy()
    sender.complete_compaction(
        new_index_file=new_index_file,
        index_file_a=index_file_a,
        index_file_b=index_file_b
    )
def _get_index(self, f, distance):
    """
    Return an Annoy index over the f-dimensional GloVe twitter vectors.

    The index file is built (downloading the vectors first if needed) when
    it does not exist yet, then loaded from disk.

    f: vector dimension, also used to pick the GloVe file
    distance: Annoy metric name
    """
    vectors_path = 'test/glove.twitter.27B.%dd.txt.gz' % f
    index_path = 'test/glove.%d.%s.annoy' % (f, distance)

    if not os.path.exists(index_path):
        if not os.path.exists(vectors_path):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            url = 'http://www-nlp.stanford.edu/data/glove.twitter.27B.%dd.txt.gz' % f
            print('downloading', url, '->', vectors_path)
            urlretrieve(url, vectors_path)

        print('building index', distance, f)
        annoy = AnnoyIndex(f, distance)
        # The context manager closes the gzip handle once all lines are read.
        with gzip.open(vectors_path, 'rb') as vectors:
            for i, line in enumerate(vectors):
                # The first field of each line is skipped; the rest are the
                # vector components.
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v)
        annoy.build(10)
        annoy.save(index_path)

    annoy = AnnoyIndex(f, distance)
    annoy.load(index_path)
    return annoy
def test_load_save_get_item_vector(self):
    """Item vectors stay readable before build, after save and after load."""
    f = 3
    vectors = [[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]]

    i = AnnoyIndex(f)
    for item_id, vector in enumerate(vectors):
        i.add_item(item_id, vector)

    # Readable straight after adding, before the index is built.
    numpy.testing.assert_array_almost_equal(i.get_item_vector(0), vectors[0])
    self.assertTrue(i.build(10))
    self.assertTrue(i.save('blah.ann'))
    # Still readable from the same object once it has been saved.
    numpy.testing.assert_array_almost_equal(i.get_item_vector(1), vectors[1])

    # And readable from a second object that loads the saved file.
    j = AnnoyIndex(f)
    self.assertTrue(j.load('blah.ann'))
    numpy.testing.assert_array_almost_equal(j.get_item_vector(2), vectors[2])
def _get_index(self, f, distance):
    """
    Return an Annoy index over the GloVe twitter vectors plus its answers.

    The index file is built (downloading the vectors first if needed) when
    it does not exist yet, then loaded. A companion file with reference
    neighbours for the first 10000 items is written when missing: one line
    per item holding 10 neighbour ids from a search with search_k=100000.

    f: vector dimension, also used to pick the GloVe file
    distance: Annoy metric name

    Returns (annoy, answers_file); answers_file is an open text file that
    the caller is responsible for closing.
    """
    vectors_path = "test/glove.twitter.27B.%dd.txt.gz" % f
    output = "test/glove.%d.%s.annoy" % (f, distance)
    output_correct = "test/glove.%d.%s.correct" % (f, distance)

    if not os.path.exists(output):
        if not os.path.exists(vectors_path):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            # Hosting them on my own S3 bucket since the original files changed format
            url = "https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz" % f
            print("downloading", url, "->", vectors_path)
            urlretrieve(url, vectors_path)

        print("adding items", distance, f)
        annoy = AnnoyIndex(f, distance)
        # The context manager closes the gzip handle once all lines are read.
        with gzip.open(vectors_path, "rb") as vectors:
            for i, line in enumerate(vectors):
                # The first field of each line is skipped; the rest are the
                # vector components.
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v)

        print("building index")
        annoy.build(10)
        annoy.save(output)

    annoy = AnnoyIndex(f, distance)
    annoy.load(output)

    if not os.path.exists(output_correct):
        print("finding correct answers")
        # "with" closes the file even when the assert below fails.
        with open(output_correct, "w") as f_output:
            for i in range(10000):
                # 11 results are requested and the first one is dropped.
                js_slow = annoy.get_nns_by_item(i, 11, 100000)[1:]
                assert len(js_slow) == 10
                f_output.write(" ".join(map(str, js_slow)) + "\n")

    return annoy, open(output_correct)
def _get_index(self, f, distance):
    """
    Return an Annoy index over the f-dimensional GloVe twitter vectors.

    The index file is built (downloading the vectors first if needed) when
    it does not exist yet, then loaded from disk.

    f: vector dimension, also used to pick the GloVe file
    distance: Annoy metric name
    """
    vectors_path = 'test/glove.twitter.27B.%dd.txt.gz' % f
    index_path = 'test/glove.%d.%s.annoy' % (f, distance)

    if not os.path.exists(index_path):
        if not os.path.exists(vectors_path):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            # Hosting them on my own S3 bucket since the original files changed format
            url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
            print('downloading', url, '->', vectors_path)
            urlretrieve(url, vectors_path)

        print('building index', distance, f)
        annoy = AnnoyIndex(f, distance)
        # The context manager closes the gzip handle once all lines are read.
        with gzip.open(vectors_path, 'rb') as vectors:
            for i, line in enumerate(vectors):
                # The first field of each line is skipped; the rest are the
                # vector components.
                v = [float(x) for x in line.strip().split()[1:]]
                annoy.add_item(i, v)
        annoy.build(10)
        annoy.save(index_path)

    annoy = AnnoyIndex(f, distance)
    annoy.load(index_path)
    return annoy
def test_load_save(self):
    """Item vectors survive save/load round trips, with and without prefault."""
    # Issue #61
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    u = i.get_item_vector(99)
    i.save('i.tree')
    v = i.get_item_vector(99)
    self.assertEqual(u, v)

    j = AnnoyIndex(10)
    j.load('test/test.tree')
    # The first index must still be readable after a second one loads the
    # same file ...
    self.assertEqual(u, i.get_item_vector(99))
    # ... and the second index itself must return the same vector.
    self.assertEqual(u, j.get_item_vector(99))

    # Ensure specifying if prefault is allowed does not impact result
    j.save('j.tree', True)
    k = AnnoyIndex(10)
    k.load('j.tree', True)
    x = k.get_item_vector(99)
    self.assertEqual(u, x)

    k.save('k.tree', False)
    no_prefault = AnnoyIndex(10)
    no_prefault.load('k.tree', False)
    y = no_prefault.get_item_vector(99)
    self.assertEqual(u, y)