def test_write_failed(self):
        """Saving to a full device must raise 'No space left on device'.

        A 1000-item, 40-dimensional angular index is built and then saved to
        a target that cannot accept data:

        * Linux: ``/dev/full``.
        * macOS: a small RAM disk created with ``hdiutil``/``diskutil`` and
          detached again in ``finally``.

        On any other platform nothing is checked.
        """
        f = 40

        # Build the initial index
        t = AnnoyIndex(f, 'angular')
        t.verbose(True)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)

        if sys.platform == "linux" or sys.platform == "linux2":
            # linux
            try:
                t.save("/dev/full")
            except Exception as e:
                self.assertIn('No space left on device', str(e))
            else:
                # Kept outside the try body: self.fail() raises an
                # AssertionError, which `except Exception` would otherwise
                # swallow and replace with a misleading message.
                self.fail("didn't get expected exception")
        elif sys.platform == "darwin":
            volume = "FULLDISK"
            device = os.popen('hdiutil attach -nomount ram://64').read()
            # .read() drains the pipe until the command exits, so every setup
            # step has finished before the next one starts.
            os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device)).read()
            # NOTE(review): presumably this file uses up the remaining space
            # on the tiny volume - confirm.
            os.popen('touch "/Volumes/%s/full"' % volume).read()
            try:
                t.save('/Volumes/%s/annoy.tree' % volume)
            except Exception as e:
                self.assertIn('No space left on device', str(e))
            else:
                self.fail("didn't get expected exception")
            finally:
                # Always release the RAM disk, even when an assertion fails.
                os.popen("hdiutil detach %s" % device).read()
Example #2
0
 def test_item_vector_after_save(self):
     """Item count and stored vectors must be unchanged by ``save``.

     Items are added under ids 1-3 only; the test expects ``get_n_items()``
     to report 4 both before and after saving, and item 3 to read back as
     the vector it was added with.
     """
     # Issue #279
     a = AnnoyIndex(3)
     a.verbose(True)
     a.add_item(1, [1, 0, 0])
     a.add_item(2, [0, 1, 0])
     a.add_item(3, [0, 0, 1])
     a.build(-1)
     # assertEqual replaces the deprecated assertEquals alias, and the vector
     # is now actually compared instead of merely fetched.
     self.assertEqual(a.get_n_items(), 4)
     self.assertEqual(a.get_item_vector(3), [0, 0, 1])
     a.save('something.annoy')
     self.assertEqual(a.get_n_items(), 4)
     self.assertEqual(a.get_item_vector(3), [0, 0, 1])
Example #3
0
 def test_item_vector_after_save(self):
     """Regression test for issue #279.

     Builds a three-item angular index (ids 1-3) and runs the same set of
     checks before and after ``save``: item count, the stored vector for
     id 3, and the neighbour set of id 1.
     """
     index = AnnoyIndex(3, 'angular')
     index.verbose(True)
     for item_id, vector in ((1, [1, 0, 0]), (2, [0, 1, 0]), (3, [0, 0, 1])):
         index.add_item(item_id, vector)
     index.build(-1)

     def check_contents():
         # The count is expected to be 4 although only ids 1-3 were added.
         self.assertEqual(index.get_n_items(), 4)
         self.assertEqual(index.get_item_vector(3), [0, 0, 1])
         # Asking for far more neighbours than exist returns just the added ids.
         self.assertEqual(set(index.get_nns_by_item(1, 999)), {1, 2, 3})

     check_contents()
     index.save('something.annoy')
     check_contents()
Example #4
0
 def test_item_vector_after_save(self):
     """Check that ``save`` leaves the index contents intact (issue #279).

     The index is created with the default metric, filled with the three
     unit vectors under ids 1-3, built, and verified; the identical
     verification is repeated after saving to 'something.annoy'.
     """
     idx = AnnoyIndex(3)
     idx.verbose(True)
     unit_vectors = {1: [1, 0, 0], 2: [0, 1, 0], 3: [0, 0, 1]}
     for key in (1, 2, 3):
         idx.add_item(key, unit_vectors[key])
     idx.build(-1)

     expected_neighbours = {1, 2, 3}

     # State before saving.
     self.assertEqual(idx.get_n_items(), 4)
     self.assertEqual(idx.get_item_vector(3), [0, 0, 1])
     found = idx.get_nns_by_item(1, 999)
     self.assertEqual(set(found), expected_neighbours)

     idx.save('something.annoy')

     # State after saving must be identical.
     self.assertEqual(idx.get_n_items(), 4)
     self.assertEqual(idx.get_item_vector(3), [0, 0, 1])
     found = idx.get_nns_by_item(1, 999)
     self.assertEqual(set(found), expected_neighbours)
Example #5
0
    def test_write_failed(self):
        """Saving to an unwritable path must raise an exception.

        Builds a 1000-item, 40-dimensional angular index and then tries to
        save it to a location chosen per OS ('Z:\\xyz.annoy' on Windows,
        '/x/y/z.annoy' elsewhere).
        """
        dim = 40

        # Populate and build the index with Gaussian random vectors.
        index = AnnoyIndex(dim, 'angular')
        index.verbose(True)
        for item_id in range(1000):
            index.add_item(item_id, [random.gauss(0, 1) for _ in range(dim)])
        index.build(10)

        path = 'Z:\\xyz.annoy' if os.name == 'nt' else '/x/y/z.annoy'
        with self.assertRaises(Exception):
            index.save(path)
Example #6
0
    def test_very_large_index(self):
        """Save an index whose file size reaches 2**31 bytes and sanity-check it.

        Only 100 items are added, but their ids start at ``n_vectors`` so that
        the saved file is at least ``dangerous_size`` bytes (asserted below).
        The file is removed afterwards so the test does not leave a
        multi-gigabyte artifact in the working directory.
        """
        # 388
        f = 3
        dangerous_size = 2**31
        # NOTE(review): presumably the on-disk size of one node - confirm.
        size_per_vector = 4 * (f + 3)
        n_vectors = dangerous_size // size_per_vector
        m = AnnoyIndex(f, 'angular')
        m.verbose(True)
        for i in range(100):
            m.add_item(n_vectors + i, [random.gauss(0, 1) for z in range(f)])
        n_trees = 10
        m.build(n_trees)
        path = 'test_big.annoy'
        try:
            m.save(path)  # Raises on Windows

            # Sanity check size of index
            size = os.path.getsize(path)
            self.assertGreaterEqual(size, dangerous_size)
            self.assertLess(size, dangerous_size + 100e3)

            # Sanity check number of trees
            self.assertEqual(m.get_n_trees(), n_trees)
        finally:
            # Release the index's hold on the file before deleting it.
            m.unload()
            if os.path.exists(path):
                os.remove(path)
Example #7
0
    def tes1t_set_root(self):
        """Add three items to a fresh on-disk index and check query results.

        The index lives in the ``test_db`` directory, which is wiped and
        recreated first. After each insertion the first ten nodes are dumped
        via ``display_node``. The same three queries are then run against a
        second index object opened on the same directory.

        NOTE(review): the method name does not start with ``test``, so
        unittest's default discovery will not collect it - presumably it was
        disabled on purpose; confirm.
        NOTE(review): the constructor arguments belong to this project's
        AnnoyIndex variant and are not documented here; the two instances
        differ only in the last argument (0 vs 1).
        """
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("test_set_root")
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 3
        i = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
        #i.verbose(True)
        i.create()
        for k in range(10):
            i.display_node(k)

        i.add_item(0, [0, 0, 1])
        print("after adding 1 data")
        for k in range(10):
            i.display_node(k)

        i.add_item(1, [0, 1, 0])
        print("after adding 2 data")
        for k in range(10):
            i.display_node(k)

        i.add_item(2, [1, 0, 0])
        print("after adding 3 data")
        for k in range(10):
            i.display_node(k)

        print("get nns by vector [3,2,1]")
        print(i.get_nns_by_vector([3, 2, 1], 3))
        self.assertEqual(i.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])

        # A second handle on the same directory must return the same answers.
        print("create i2")
        i2 = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 1)
        i2.verbose(True)
        self.assertEqual(i2.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
        self.assertEqual(i2.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
        self.assertEqual(i2.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])
Example #8
0
    def tes1t_set_root(self):
        """Insert three unit vectors into an on-disk index and verify queries.

        ``test_db`` is deleted and recreated, an index is created in it, and
        the first ten nodes are displayed after creation and after every
        insertion. Three nearest-neighbour queries are asserted on the
        writing index and again on a second index opened on ``test_db``.

        NOTE(review): because the name is spelled ``tes1t``, unittest's
        default ``test*`` discovery skips this method - presumably it was
        disabled deliberately; confirm.
        NOTE(review): the meaning of the positional constructor arguments is
        not visible here; only the final one differs between ``i`` (0) and
        ``i2`` (1).
        """
        # print(...) with one argument is valid on both Python 2 and Python 3.
        print("test_set_root")
        os.system("rm -rf test_db")
        os.system("mkdir test_db")
        f = 3
        i = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
        #i.verbose(True)
        i.create()
        for k in range(10):
            i.display_node(k)

        i.add_item(0, [0, 0, 1])
        print("after adding 1 data")
        for k in range(10):
            i.display_node(k)

        i.add_item(1, [0, 1, 0])
        print("after adding 2 data")
        for k in range(10):
            i.display_node(k)

        i.add_item(2, [1, 0, 0])
        print("after adding 3 data")
        for k in range(10):
            i.display_node(k)

        print("get nns by vector [3,2,1]")
        print(i.get_nns_by_vector([3, 2, 1], 3))
        self.assertEqual(i.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])

        # Re-open the same directory and expect identical results.
        print("create i2")
        i2 = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 1)
        i2.verbose(True)
        self.assertEqual(i2.get_nns_by_vector([3, 2, 1], 3), [2, 1, 0])
        self.assertEqual(i2.get_nns_by_vector([1, 2, 3], 3), [0, 1, 2])
        self.assertEqual(i2.get_nns_by_vector([2, 0, 1], 3), [2, 0, 1])
Example #9
0
def create_index(file_list, start_count, model_filename, redis_index_file):
    """Build an Annoy index from tab-separated query/vector files and save it.

    Every non-blank line of every file is split on a tab into a query and a
    vector string. The vector is passed through ``normalize_redis_vector`` and
    added to the index under a running integer id, and a
    ``query<TAB><TAB>id`` line is written to the id-map file.

    Args:
        file_list: file names, resolved relative to ``query_vectors_directory``.
        start_count: id assigned to the first item.
        model_filename: file name for the saved index under
            '/raid/ankit/ann_models/'.
        redis_index_file: file name for the id map under '/raid/ankit/'.

    Returns:
        The next unused id (``start_count`` plus the number of non-blank
        lines processed).

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields.
    """
    dim = 100
    t = AnnoyIndex(dim)
    t.verbose(True)
    i = start_count
    # `with` closes the id-map file even if reading, building or saving fails.
    with open("/raid/ankit/" + redis_index_file, "w") as redisindex:
        # The loop variable no longer reuses the name of the dimension.
        for fname in file_list:
            print("Processing {} ...".format(fname))
            with open(query_vectors_directory + fname) as cur_f:
                for line in cur_f:
                    if not line.strip():
                        continue

                    if i % 1000000 == 0:
                        print("{} lines complete.".format(i))
                    query, vector = line.split('\t')
                    vector = normalize_redis_vector(vector)
                    redisindex.write(str(query) + "\t\t" + str(i) + "\n")
                    try:
                        t.add_item(i, vector)
                    except Exception:
                        # Best effort: report the bad line and keep going.
                        # The id is still consumed, matching the map entry
                        # written just above. Unlike a bare `except:`, this
                        # lets KeyboardInterrupt/SystemExit propagate.
                        print("Exception : " + str(line))
                    i += 1

        print("Done adding items, now starting to build 10 trees..")
        t.build(10)
        print("Saving Model on Disk...")
        t.save('/raid/ankit/ann_models/' + model_filename)

        print("Finished Building and Saving Model!")

    return i
Example #10
0
def create_index(file_list, start_count,model_filename, redis_index_file):
    """Index the vectors found in ``file_list`` with Annoy and save the model.

    Each non-blank input line must be ``query<TAB>vector``. The vector is
    normalised with ``normalize_redis_vector`` and added to the index under
    the current running id; the query-to-id mapping is appended to the id-map
    file as ``query<TAB><TAB>id``. After all files are read, ten trees are
    built and the index is saved.

    Args:
        file_list: input file names, appended to ``query_vectors_directory``.
        start_count: first id to assign.
        model_filename: name of the saved index inside
            '/raid/ankit/ann_models/'.
        redis_index_file: name of the id-map file inside '/raid/ankit/'.

    Returns:
        The id following the last one assigned.

    Raises:
        ValueError: if a non-blank line does not contain exactly one tab.
    """
    dim = 100
    t = AnnoyIndex(dim)
    t.verbose(True)
    i = start_count
    # The context manager ensures the id-map file is closed on any error.
    with open("/raid/ankit/"+redis_index_file,"w") as redisindex:
        # Distinct loop name: the dimension is no longer shadowed.
        for fname in file_list:
            print("Processing {} ...".format(fname))
            with open(query_vectors_directory+fname) as cur_f:
                for line in cur_f:
                    if not line.strip():
                        continue

                    if i%1000000 == 0:
                        print("{} lines complete.".format(i))
                    query, vector = line.split('\t')
                    vector = normalize_redis_vector(vector)
                    redisindex.write(str(query)+"\t\t"+str(i)+"\n")
                    try:
                        t.add_item(i,vector)
                    except Exception:
                        # Deliberately best-effort: log the line and continue.
                        # `except Exception` (not a bare except) keeps Ctrl-C
                        # working during long runs.
                        print("Exception : "+ str(line))
                    i+=1

        print("Done adding items, now starting to build 10 trees..")
        t.build(10)
        print("Saving Model on Disk...")
        t.save('/raid/ankit/ann_models/'+model_filename)

        print("Finished Building and Saving Model!")

    return i
Example #11
0
import sys

from annoy import AnnoyIndex

from scala_angular_result import result as angular_result
from scala_euclidean_result import result as euclidean_result

# Cross-check two pre-built Annoy index files against expected neighbour
# lists imported from scala_angular_result / scala_euclidean_result.
# The script exits with status 1 on the first item whose top-10 neighbours
# overlap too little with the expected list.

f = 10

angular_output = 'src/test/resources/annoy-index-angular-scala'
euclidean_output = 'src/test/resources/annoy-index-euclidean-scala'

angular = AnnoyIndex(f, 'angular')
angular.verbose(True)
angular.load(angular_output)
euclidean = AnnoyIndex(f, 'euclidean')
euclidean.verbose(True)
euclidean.load(euclidean_output)

# Angular: at least 8 of the 10 returned neighbours must be expected ones.
for j in range(angular.get_n_items()):
    r = angular.get_nns_by_item(j, 10)
    t = angular_result[j]
    if len(set(r).intersection(t)) < 8:
        print(j, r, t)
        sys.exit(1)

# Euclidean: at least 5 of the 10 returned neighbours must be expected ones.
for j in range(euclidean.get_n_items()):
    r = euclidean.get_nns_by_item(j, 10)
    t = euclidean_result[j]
    if len(set(r).intersection(t)) < 5:
        print(j, r, t)
        # Fail like the angular check does; previously a mismatch was only
        # printed and the script still exited successfully.
        sys.exit(1)
Example #12
0
class EntityType(object):
    """Thin convenience layer over an Annoy index for one entity type.

    Collects the vectors belonging to a single entity type so that similar
    vectors can be looked up quickly.

    * Arbitrary (non-contiguous) entity ids are supported by mapping each one
      to a dense internal id, assigned in insertion order starting at 0.

    NOTE(review): the previous docstring also mentioned a zero-vector check
    before returning matches; no such check is visible in this class.
    """

    def __init__(self, nfactor, ntrees, metric='angular',
                 entity_type_id=None, entity_type=None):
        """Store the metadata and create an empty index.

        Args:
            nfactor: vector length handed to ``AnnoyIndex``.
            ntrees: number of trees used by ``build``.
            metric: metric name handed to ``AnnoyIndex``.
            entity_type_id: id under which the data is loaded; may be more
                compact than ``entity_type`` depending on the data source.
            entity_type: name used to access the object, e.g. 'user'.
        """
        # --- metadata ---
        self._nfactor = nfactor
        self._ntrees = ntrees
        self._metric = metric
        self._entity_type = entity_type
        self._entity_type_id = entity_type_id

        # --- data ---
        self._ann_obj = AnnoyIndex(nfactor, metric)
        self._ann_map = {}      # entity id -> internal id
        self._ann_map_inv = {}  # internal id -> entity id (filled by build)
        self._nitems = 0

    def add_item(self, entity_id, factors):
        """Insert a vector for ``entity_id`` and record its internal id.

        Raises:
            ValueError: if ``entity_id`` has already been added.
        """
        if entity_id in self._ann_map:
            raise ValueError('Duplicate entity: type = {0}, id = {1}'.format(
                self._entity_type, entity_id))
        internal_id = self._nitems
        self._ann_obj.add_item(internal_id, factors)
        self._ann_map[entity_id] = internal_id
        self._nitems = internal_id + 1

    def build(self, verbose=False):
        """Build the Annoy trees and the internal-id -> entity-id lookup."""
        self._ann_obj.verbose(verbose)
        self._ann_obj.build(self._ntrees)
        # The inverse map is only needed for queries, so it is derived here
        # once rather than maintained on every add_item call.
        self._ann_map_inv = dict(
            (internal_id, entity_id)
            for entity_id, internal_id in self._ann_map.items())

    def get_nns_by_vector(self, vec, n, search_k):
        """Return the entity ids of the neighbours Annoy finds for ``vec``."""
        neighbours = self._ann_obj.get_nns_by_vector(vec, n, search_k)
        to_entity_id = self._ann_map_inv
        return [to_entity_id[internal_id] for internal_id in neighbours]

    def get_item_vector(self, entity_id):
        """Return the stored vector for ``entity_id``, or [] if it is unknown."""
        if entity_id not in self._ann_map:
            return []
        internal_id = self._ann_map[entity_id]
        return self._ann_obj.get_item_vector(internal_id)

    def __iter__(self):
        """Yield an ``EntityVector(entity_id, vector)`` per stored entity."""
        return (
            EntityVector(entity_id=entity_id,
                         vector=self.get_item_vector(entity_id))
            for entity_id in self._ann_map
        )

    def get_nfactor(self):
        """Return the vector length this object was configured with."""
        return self._nfactor

    def load(self, pkl, filepath):
        """Adopt the state of the stored entity type and load its index file.

        Args:
            pkl: object providing ``get_entity_type(id)`` and ``get_nfactor()``.
            filepath: path of the saved Annoy index.
        """
        stored = pkl.get_entity_type(self._entity_type_id)
        # NOTE(review): this shares one attribute dict between `self` and
        # `stored`, so the assignment below also changes `stored._ann_obj`.
        self.__dict__ = stored.__dict__
        # Fresh index object with the stored configuration.
        self._ann_obj = AnnoyIndex(pkl.get_nfactor(), stored._metric)
        # Load the index data from the saved file.
        self._ann_obj.load(filepath)
Example #13
0
# For every supported metric, load a pre-built 5-dimensional index from
# ../tests and print a few lookups (vectors, one distance, neighbours).
metrics = ["angular", "euclidean", "manhattan", "dot", "hamming"]
dim = 5
size = 100

for metric in metrics:
    fname = 'index.{}.{}d.ann'.format(metric, dim)
    print('Generating index for ' + metric)

    # Reference only - how the index files were originally produced:
    # t = AnnoyIndex(dim, metric)  # Length of item vector that will be indexed
    # for i in range(size):
    #     v = [random.gauss(0, 1) for z in range(dim)]
    #     t.add_item(i, v)
    #
    # t.build(10)  # 10 trees
    # t.save(fname)

    u = AnnoyIndex(dim, metric)
    u.verbose(True)
    u.load(f'./../tests/{fname}')  # super fast, will just mmap the file

    print(u.get_item_vector(3))
    v0 = u.get_item_vector(0)
    print(v0)

    # Ask for the 5 nearest neighbours of item 0's vector, with distances.
    nearests = u.get_nns_by_vector(v0, 5, include_distances=True)
    neighbour_ids = nearests[0]
    neighbour_distances = nearests[1]

    # Second entry of the returned id list.
    id_1 = neighbour_ids[1]
    print(u.get_item_vector(id_1))
    print(u.get_distance(0, id_1))
    # print(u.get_distance(0, 16))
    print(neighbour_ids)        # ids of the returned neighbours
    print(neighbour_distances)  # their distances