def test_write_failed(self):
    """save() must raise (with ENOSPC in the message) when the target device is full."""
    f = 40
    # Build the initial index
    t = AnnoyIndex(f, 'angular')
    t.verbose(True)
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)
    t.build(10)

    if sys.platform == "linux" or sys.platform == "linux2":
        # Linux: writes to /dev/full always fail with "No space left on device".
        try:
            t.save("/dev/full")
            self.fail("didn't get expected exception")
        except Exception as e:
            # assertIn gives a clearer failure message than assertTrue(x in y)
            self.assertIn('No space left on device', str(e))
    elif sys.platform == "darwin":
        # macOS: create a tiny RAM disk, fill it, then try to save onto it.
        volume = "FULLDISK"
        # .strip() removes the trailing newline hdiutil prints; previously the
        # raw string (newline included) was interpolated into the shell commands.
        device = os.popen('hdiutil attach -nomount ram://64').read().strip()
        os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
        os.popen('touch "/Volumes/%s/full"' % volume)
        try:
            t.save('/Volumes/%s/annoy.tree' % volume)
            self.fail("didn't get expected exception")
        except Exception as e:
            self.assertIn('No space left on device', str(e))
        finally:
            # Always detach the RAM disk, even when an assertion above fails.
            os.popen("hdiutil detach %s" % device)
def test_item_vector_after_save(self):
    # Issue #279: index contents must remain readable after save().
    # Metric made explicit — bare AnnoyIndex(f) is the deprecated form
    # (the sibling test in this file already passes 'angular').
    a = AnnoyIndex(3, 'angular')
    a.verbose(True)
    a.add_item(1, [1, 0, 0])
    a.add_item(2, [0, 1, 0])
    a.add_item(3, [0, 0, 1])
    a.build(-1)
    # Ids 0..3 are allocated (0 implicitly), hence 4 items.
    # assertEqual: assertEquals is a deprecated alias.
    self.assertEqual(a.get_n_items(), 4)
    a.get_item_vector(3)
    a.save('something.annoy')
    self.assertEqual(a.get_n_items(), 4)
    a.get_item_vector(3)
def test_item_vector_after_save(self):
    # Regression test for issue #279: items, vectors and neighbour queries
    # must give identical answers before and after save().
    index = AnnoyIndex(3, 'angular')
    index.verbose(True)
    index.add_item(1, [1, 0, 0])
    index.add_item(2, [0, 1, 0])
    index.add_item(3, [0, 0, 1])
    index.build(-1)

    def verify(idx):
        # One implicit item (id 0) plus the three added above.
        self.assertEqual(idx.get_n_items(), 4)
        self.assertEqual(idx.get_item_vector(3), [0, 0, 1])
        self.assertEqual(set(idx.get_nns_by_item(1, 999)), set([1, 2, 3]))

    verify(index)
    index.save('something.annoy')
    verify(index)
def test_item_vector_after_save(self):
    # Issue #279: index contents must remain readable after save().
    # Metric made explicit for consistency with the sibling copy of this
    # test, which passes 'angular'; bare AnnoyIndex(f) is deprecated.
    a = AnnoyIndex(3, 'angular')
    a.verbose(True)
    a.add_item(1, [1, 0, 0])
    a.add_item(2, [0, 1, 0])
    a.add_item(3, [0, 0, 1])
    a.build(-1)
    # Ids 0..3 are allocated (0 implicitly), hence 4 items.
    self.assertEqual(a.get_n_items(), 4)
    self.assertEqual(a.get_item_vector(3), [0, 0, 1])
    self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
    a.save('something.annoy')
    # Same checks must hold after the index has been written to disk.
    self.assertEqual(a.get_n_items(), 4)
    self.assertEqual(a.get_item_vector(3), [0, 0, 1])
    self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
def test_write_failed(self):
    """Saving to a path that cannot exist must raise."""
    dims = 40
    # Build the initial index
    index = AnnoyIndex(dims, 'angular')
    index.verbose(True)
    for item_id in range(1000):
        index.add_item(item_id, [random.gauss(0, 1) for _ in range(dims)])
    index.build(10)
    # Pick a location that is unwritable on the current platform.
    if os.name == 'nt':
        path = 'Z:\\xyz.annoy'
    else:
        path = '/x/y/z.annoy'
    self.assertRaises(Exception, index.save, path)
def test_very_large_index(self):
    # Issue #388: index files crossing the 2**31-byte boundary must save correctly.
    f = 3
    dangerous_size = 2**31
    size_per_vector = 4 * (f + 3)
    n_vectors = int(dangerous_size / size_per_vector)
    m = AnnoyIndex(3, 'angular')
    m.verbose(True)
    # Adding items at very large ids forces the index file past 2 GiB.
    for i in range(100):
        m.add_item(n_vectors + i, [random.gauss(0, 1) for z in range(f)])
    n_trees = 10
    m.build(n_trees)
    path = 'test_big.annoy'
    m.save(path)  # Raises on Windows
    # Sanity check size of index
    self.assertGreaterEqual(os.path.getsize(path), dangerous_size)
    self.assertLess(os.path.getsize(path), dangerous_size + 100e3)
    # Sanity check number of trees.
    # assertEqual: assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(m.get_n_trees(), n_trees)
def tes1t_set_root(self):
    # NOTE(review): the name is misspelled ("tes1t_") — presumably on purpose
    # so the test runner skips it; kept unchanged to preserve that behavior.
    """Exercise node display and nearest-neighbour lookups while the root changes.

    Fix: Python-2 `print` statements converted to print() calls so the file
    parses under Python 3; output is identical for these single-argument calls.
    """
    print("test_set_root")
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 3
    i = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
    # i.verbose(True)
    i.create()
    for k in range(10):
        i.display_node(k)
    i.add_item(0, [0, 0, 1])
    print("after adding 1 data")
    for k in range(10):
        i.display_node(k)
    i.add_item(1, [0, 1, 0])
    print("after adding 2 data")
    for k in range(10):
        i.display_node(k)
    i.add_item(2, [1, 0, 0])
    print("after adding 3 data")
    for k in range(10):
        i.display_node(k)
    print("get nns by vector [3,2,1]")
    print(i.get_nns_by_vector([3, 2, 1], 3))
    self.assertEqual(i.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
    self.assertEqual(i.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
    self.assertEqual(i.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])
    print("create i2")
    # A second index opened over the same on-disk data must agree.
    i2 = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 1)
    i2.verbose(True)
    self.assertEqual(i2.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
    self.assertEqual(i2.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
    self.assertEqual(i2.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])
def create_index(file_list, start_count, model_filename, redis_index_file):
    """Build a 100-dim Annoy index from query-vector files and save it.

    Also writes a tab-separated "query<TAB><TAB>item_id" mapping to
    redis_index_file (presumably loaded into Redis downstream — verify
    against the caller).

    :param file_list: filenames under query_vectors_directory to ingest
    :param start_count: first Annoy item id to assign
    :param model_filename: output name under /raid/ankit/ann_models/
    :param redis_index_file: output name under /raid/ankit/
    :return: the next unused item id (pass as start_count for the next batch)

    Fixes: redis index file opened via `with` so it is closed even if an
    exception escapes; loop variable renamed (it shadowed the dimensionality
    `f`); bare `except:` narrowed to Exception so KeyboardInterrupt/SystemExit
    are no longer swallowed; Python-2 prints converted to print().
    """
    f = 100  # vector dimensionality
    t = AnnoyIndex(f)
    t.verbose(True)
    i = start_count
    with open("/raid/ankit/" + redis_index_file, "w") as redisindex:
        for fname in file_list:
            print("Processing {} ...".format(fname))
            with open(query_vectors_directory + fname) as cur_f:
                for line in cur_f:
                    if not line.strip():
                        continue
                    if i % 1000000 == 0:
                        print("{} lines complete.".format(i))
                    query, vector = line.split('\t')
                    vector = normalize_redis_vector(vector)
                    redisindex.write(str(query) + "\t\t" + str(i) + "\n")
                    try:
                        t.add_item(i, vector)
                    except Exception:
                        # Best-effort: log the offending line and keep going.
                        print("Exception : " + str(line))
                    i += 1
        print("Done adding items, now starting to build 10 trees..")
        t.build(10)
        print("Saving Model on Disk...")
        t.save('/raid/ankit/ann_models/' + model_filename)
        print("Finished Building and Saving Model!")
    return i
def create_index(file_list, start_count,model_filename, redis_index_file):
    # Builds a 100-dim Annoy index from query-vector files and saves it to disk.
    # Also writes a tab-separated "query<TAB><TAB>item_id" mapping to
    # redis_index_file — presumably loaded into Redis downstream; verify
    # against the caller. Returns the next unused item id.
    # NOTE(review): Python 2 code (print statements).
    f = 100  # vector dimensionality
    t = AnnoyIndex(f)
    t.verbose(True)
    redisindex = open("/raid/ankit/"+redis_index_file,"w")
    i = start_count
    # NOTE(review): the loop variable `f` shadows the dimensionality above
    # (harmless here, since AnnoyIndex was already constructed).
    for f in file_list:
        print "Processing {} ...".format(f)
        with open(query_vectors_directory+f) as cur_f:
            for line in cur_f:
                #print line
                if not line.strip():
                    continue
                if i%1000000 == 0:
                    print "{} lines complete.".format(i)
                # Each line is "query<TAB>vector".
                query, vector = line.split('\t')
                vector = normalize_redis_vector(vector)
                redisindex.write(str(query)+"\t\t"+str(i)+"\n")
                try:
                    t.add_item(i,vector)
                except:
                    # Best-effort: a malformed line is logged and skipped;
                    # the id is still consumed (i advances below).
                    print "Exception : "+ str(line)
                    pass
                #print i
                i+=1
    print "Done adding items, now starting to build 10 trees.."
    t.build(10)
    print "Saving Model on Disk..."
    t.save('/raid/ankit/ann_models/'+model_filename)
    print "Finished Building and Saving Model!"
    redisindex.close()
    # Next unused item id; caller passes this as start_count for the next batch.
    return i
import sys
from annoy import AnnoyIndex
from scala_angular_result import result as angular_result
from scala_euclidean_result import result as euclidean_result

# Cross-check indices built by the Scala port against reference results.
f = 10
angular_output = 'src/test/resources/annoy-index-angular-scala'
euclidean_output = 'src/test/resources/annoy-index-euclidean-scala'

angular = AnnoyIndex(f, 'angular')
angular.verbose(True)
angular.load(angular_output)

euclidean = AnnoyIndex(f, 'euclidean')
euclidean.verbose(True)
euclidean.load(euclidean_output)

# Angular: require at least 8 of the 10 neighbours to match the reference.
for j in range(angular.get_n_items()):
    r = angular.get_nns_by_item(j, 10)
    t = angular_result[j]
    if len(set(r).intersection(t)) < 8:
        print(j, r, t)
        sys.exit(1)

# Euclidean: require at least 5 of 10 to match.
for j in range(euclidean.get_n_items()):
    r = euclidean.get_nns_by_item(j, 10)
    t = euclidean_result[j]
    if len(set(r).intersection(t)) < 5:
        print(j, r, t)
        # BUG FIX: this loop previously printed mismatches but fell through,
        # so the script exited 0 even when euclidean verification failed.
        sys.exit(1)
class EntityType(object):
    """Convenience wrapper around Annoy.

    More generally a way to collect vectors within the same entity type
    and quickly find similar vectors.

    * Helps deal with non-contiguous ids through an id map.
    * Checks for 0 vectors before returning matches.
    """

    def __init__(self, nfactor, ntrees, metric='angular', entity_type_id=None, entity_type=None):
        """Initialize EntityType.

        :param nfactor: vector dimensionality passed to AnnoyIndex
        :param ntrees: number of trees used by build()
        :param metric: Annoy distance metric (default 'angular')
        :param entity_type_id: compact id used when loading data
        :param entity_type: human-readable id, e.g. 'user'
        """
        # metadata
        self._nfactor = nfactor
        self._metric = metric
        # object is accessed using this id. e.g. 'user'
        self._entity_type = entity_type
        # data is loaded in using this id. This can be more compact than the
        # entity_type, depending on the data source
        self._entity_type_id = entity_type_id
        self._ntrees = ntrees
        # data
        self._ann_obj = AnnoyIndex(nfactor, metric)
        # maps entity id to internal representation of id
        self._ann_map = {}
        # maps internal representation of id to entity id
        self._ann_map_inv = {}
        # next internal id to assign; also the running item count
        self._nitems = 0

    def add_item(self, entity_id, factors):
        """Add item, populating id map.

        Raises ValueError on a duplicate entity_id; internal ids are
        assigned sequentially so Annoy sees contiguous ids.
        """
        if entity_id in self._ann_map:
            raise ValueError('Duplicate entity: type = {0}, id = {1}'.format(
                self._entity_type, entity_id))
        self._ann_obj.add_item(self._nitems, factors)
        self._ann_map[entity_id] = self._nitems
        self._nitems = self._nitems + 1

    def build(self, verbose=False):
        """Build annoy model, create invert dictionary for future lookups."""
        self._ann_obj.verbose(verbose)
        self._ann_obj.build(self._ntrees)
        # this is only necessary after build, so we'll create it here
        self._ann_map_inv = {v: k for k, v in self._ann_map.items()}

    def get_nns_by_vector(self, vec, n, search_k):
        """Get nearest neighbors from an input vector.

        Returns external entity ids (internal Annoy ids are translated
        back through _ann_map_inv, so build() must have been called).
        """
        nns = self._ann_obj.get_nns_by_vector(vec, n, search_k)
        return [self._ann_map_inv[x] for x in nns]

    def get_item_vector(self, entity_id):
        """Get a vector for an entity.

        Returns [] for an unknown entity_id rather than raising.
        """
        if entity_id in self._ann_map:
            return self._ann_obj.get_item_vector(self._ann_map[entity_id])
        else:
            return []

    def __iter__(self):
        """Iterate over object, return (entity_id, vector) tuples."""
        # EntityVector is defined elsewhere in the project — presumably a
        # (entity_id, vector) named tuple; confirm at the definition site.
        return (EntityVector(
            entity_id=entity_id,
            vector=self.get_item_vector(entity_id)
        ) for entity_id in self._ann_map.keys())

    def get_nfactor(self):
        # Vector dimensionality this instance was constructed with.
        return self._nfactor

    def load(self, pkl, filepath):
        """Load a previously-saved EntityType: metadata from pkl, index from filepath.

        :param pkl: object exposing get_entity_type() / get_nfactor()
        :param filepath: on-disk Annoy index to mmap
        """
        entity_type = pkl.get_entity_type(self._entity_type_id)
        # Adopt the pickled instance's attributes wholesale (id maps, metadata).
        # NOTE: this replaces _ann_obj too, so it must be re-created below.
        self.__dict__ = entity_type.__dict__
        # initialize index
        self._ann_obj = AnnoyIndex(pkl.get_nfactor(), entity_type._metric)
        # mmap the file
        self._ann_obj.load(filepath)
# Smoke-test the pre-built fixture indices for every supported metric.
metrics = ["angular", "euclidean", "manhattan", "dot", "hamming"]
dim = 5
size = 100  # item count used when the fixtures were generated

for metric in metrics:
    fname = f'index.{metric}.{dim}d.ann'
    print(f'Generating index for {metric}')
    # Fixture generation (AnnoyIndex(...).add_item / build(10) / save(fname))
    # is disabled; the pre-built files under ../tests are loaded instead.
    index = AnnoyIndex(dim, metric)
    index.verbose(True)
    index.load('./../tests/' + fname)  # just mmaps the file — effectively instant
    print(index.get_item_vector(3))
    anchor = index.get_item_vector(0)
    print(anchor)
    neighbours = index.get_nns_by_vector(anchor, 5, include_distances=True)
    closest = neighbours[0][1]
    print(index.get_item_vector(closest))
    print(index.get_distance(0, closest))
    print(neighbours[0])  # neighbour ids
    print(neighbours[1])  # corresponding distances