Beispiel #1
0
    def t1est_large_index_batch(self):
        """Bulk-insert 100000 vectors and verify every near-duplicate pair.

        Items are generated two at a time: both members of a pair are
        positive multiples of the same random base vector plus a small
        amount of Gaussian noise.  After a single ``add_item_batch`` call
        the index is reopened and each item must report itself first and
        its partner second among its two nearest neighbours.

        The method name is spelled ``t1est_...``; it does not start with
        ``test``, so presumably it is kept out of default unittest
        collection on purpose.  The name is left unchanged.

        Written so that it runs the same way on Python 2 and Python 3:
        ``print`` is called with a single argument, ``range`` replaces
        ``xrange`` and the elapsed time uses floor division.
        """
        import shutil  # local import: only the directory reset below needs it

        print("test_large_index_batch")
        start_ms = int(round(time.time() * 1000))

        # Start from an empty database directory without spawning a shell.
        shutil.rmtree("test_db", ignore_errors=True)
        if not os.path.isdir("test_db"):
            os.mkdir("test_db")

        f = 100
        n_items = 100000
        index = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 0)

        # Generate pairs of random points where the pair is super close
        ids = []
        vectors = []
        for j in range(0, n_items, 2):
            p = [random.gauss(0, 1) for _ in range(f)]
            f1 = random.random() + 1
            f2 = random.random() + 1
            x = [f1 * pi + random.gauss(0, 1e-2) for pi in p]
            y = [f2 * pi + random.gauss(0, 1e-2) for pi in p]
            ids.extend((j, j + 1))
            vectors.extend((x, y))

        index.add_item_batch(ids, vectors)

        # Query through a second handle on the same directory.
        # NOTE(review): the last constructor argument changes from 0 to 1
        # here - presumably a read-only flag; confirm against AnnoyIndex.
        index = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 1)
        for j in range(0, n_items, 2):
            self.assertEqual(index.get_nns_by_item(j, 2, 50), [j, j + 1])
            self.assertEqual(index.get_nns_by_item(j + 1, 2, 50), [j + 1, j])

        # Whole seconds; "//" keeps integer division on Python 3 as well.
        elapsed_s = (int(round(time.time() * 1000)) - start_ms) // 1000
        # Same text the old two-operand print statement produced.
        print("%s %d" % ("Total time = ", elapsed_s))
Beispiel #2
0
    def _get_index(self, f, distance):
        """Build and return an index over the ``f``-dimensional GloVe vectors.

        The gzipped vector file is downloaded to ``test/`` when it is not
        already present.  Each line is parsed by dropping the first
        whitespace-separated token and converting the rest to floats; the
        zero-based line number is used as the item id.  Vectors are handed
        to ``add_item_batch`` in chunks of 10000 so the whole file is never
        held in memory at once.

        Args:
            f: Vector dimensionality; selects the GloVe file to use.
            distance: Only used in the progress message and in the name of
                the ``output`` path checked below.

        Returns:
            The populated ``AnnoyIndex`` backed by the ``test_db`` directory.

        Raises:
            RuntimeError: If the ``output`` path already exists.  Nothing in
                this method writes or loads that file, so there is no index
                to return; previously this case ended in an
                ``UnboundLocalError`` at the ``return`` statement.
        """
        input_path = 'test/glove.twitter.27B.%dd.txt.gz' % f
        output = 'test/glove.%d.%s.annoy' % (f, distance)

        if os.path.exists(output):
            # NOTE(review): if this fork gains a way to reopen a finished
            # index, load it here instead of failing.
            raise RuntimeError(
                '%s already exists but this helper cannot load it; '
                'remove the file to rebuild the index' % output)

        if not os.path.exists(input_path):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            # Hosting them on my own S3 bucket since the original files changed format
            url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
            print('downloading', url, '->', input_path)
            urlretrieve(url, input_path)

        print('building index', distance, f)
        annoy = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 0)
        v_v = []
        items = []
        # The context manager closes the gzip handle even if parsing fails.
        with gzip.open(input_path, 'rb') as handle:
            for i, line in enumerate(handle):
                v_v.append([float(x) for x in line.strip().split()[1:]])
                items.append(i)
                if (i + 1) % 10000 == 0:
                    print(i + 1)
                    annoy.add_item_batch(items, v_v)
                    v_v = []
                    items = []
        # Flush the final partial chunk, if any.
        if v_v:
            annoy.add_item_batch(items, v_v)
        return annoy
Beispiel #3
0
    def _get_index(self, f, distance):
        """Build and return an index over the ``f``-dimensional GloVe vectors.

        The gzipped vector file is downloaded to ``test/`` when it is not
        already present.  Each line is parsed by dropping the first
        whitespace-separated token and converting the rest to floats; the
        zero-based line number is used as the item id.  Vectors are handed
        to ``add_item_batch`` in chunks of 10000 so the whole file is never
        held in memory at once.

        Args:
            f: Vector dimensionality; selects the GloVe file to use.
            distance: Only used in the progress message and in the name of
                the ``output`` path checked below.

        Returns:
            The populated ``AnnoyIndex`` backed by the ``test_db`` directory.

        Raises:
            RuntimeError: If the ``output`` path already exists.  Nothing in
                this method writes or loads that file, so there is no index
                to return; previously this case ended in an
                ``UnboundLocalError`` at the ``return`` statement.
        """
        input_path = 'test/glove.twitter.27B.%dd.txt.gz' % f
        output = 'test/glove.%d.%s.annoy' % (f, distance)

        if os.path.exists(output):
            # NOTE(review): if this fork gains a way to reopen a finished
            # index, load it here instead of failing.
            raise RuntimeError(
                '%s already exists but this helper cannot load it; '
                'remove the file to rebuild the index' % output)

        if not os.path.exists(input_path):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            # Hosting them on my own S3 bucket since the original files changed format
            url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
            print('downloading', url, '->', input_path)
            urlretrieve(url, input_path)

        print('building index', distance, f)
        annoy = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 0)
        v_v = []
        items = []
        # The context manager closes the gzip handle even if parsing fails.
        with gzip.open(input_path, 'rb') as handle:
            for i, line in enumerate(handle):
                v_v.append([float(x) for x in line.strip().split()[1:]])
                items.append(i)
                if (i + 1) % 10000 == 0:
                    print(i + 1)
                    annoy.add_item_batch(items, v_v)
                    v_v = []
                    items = []
        # Flush the final partial chunk, if any.
        if v_v:
            annoy.add_item_batch(items, v_v)
        return annoy
Beispiel #4
0
 def test_get_nns_by_item_batch(self):
     """Insert three 3-d vectors in one batch and check neighbour ordering.

     Items 0 and 1 are ``[2, 1, 0]`` and ``[1, 2, 0]``; item 2 is
     ``[0, 0, 1]``.  Each item must come back first in its own result
     list.  For item 2 either order of the remaining two is accepted.

     ``print`` is called with a single argument so the method parses on
     both Python 2 and Python 3.
     """
     import shutil  # local import: only the directory reset below needs it

     print("test_get_nns_by_item_batch ")

     # Start from an empty database directory without spawning a shell.
     shutil.rmtree("test_db", ignore_errors=True)
     if not os.path.isdir("test_db"):
         os.mkdir("test_db")

     f = 3
     index = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
     index.add_item_batch([0, 1, 2], [[2, 1, 0], [1, 2, 0], [0, 0, 1]])

     self.assertEqual(index.get_nns_by_item(0, 3), [0, 1, 2])
     self.assertEqual(index.get_nns_by_item(1, 3), [1, 0, 2])
     # assertIn reports the actual result list when the check fails.
     self.assertIn(index.get_nns_by_item(2, 3), [[2, 0, 1], [2, 1, 0]])  # could be either