Example #1
File: index_test.py Project: spotify/annoy
 def test_metric_kwarg(self):
     # Issue 211
     i = AnnoyIndex(2, metric='euclidean')
     i.add_item(0, [1, 0])
     i.add_item(1, [9, 0])
     self.assertAlmostEqual(i.get_distance(0, 1), 8)
     self.assertEqual(i.f, 2)
Example #2
    def test_dist(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        self.assertAlmostEqual(i.get_distance(0, 1), 1.0)
Example #3
def ANN(searchSpace):
    dimension = searchSpace[0].shape[0]
    t = AnnoyIndex(dimension, metric='euclidean')
    for i in range(len(searchSpace)):
        t.add_item(i, searchSpace[i])
    t.build(10)
    return t
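A minimal usage sketch for the ANN helper above (the data and queries here are made up; get_nns_by_item and get_nns_by_vector are Annoy's standard query calls):

import numpy as np

searchSpace = [np.random.rand(8) for _ in range(100)]  # hypothetical search space
t = ANN(searchSpace)
neighbors = t.get_nns_by_item(0, 5)  # 5 approximate nearest neighbours of item 0
neighbors, distances = t.get_nns_by_vector(searchSpace[0], 5, include_distances=True)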
Example #4
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
#        print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1

    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES
            }
    cache_filename = "annoy_index_" + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) + ".tree"
    
    if not os.path.exists(cache_filename):
        print("Building Annoy index with %d trees" % ANN_NTREES)
    #    index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print("\tWrote cache to %s" % cache_filename)
    else:
        print("\tReading cache from %s" % cache_filename)
        index.load(cache_filename)
    return index, mfcc_list
Example #5
    def test_dist_2(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [1000, 0])
        i.add_item(1, [10, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 0)
Example #6
    def test_dist(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [0, 1])
        i.add_item(1, [1, 1])

        self.assertAlmostEqual(i.get_distance(0, 1), (2 * (1.0 - 2 ** -0.5))**0.5)
Example #7
    def test_zero_vectors(self):
        # Mentioned on the annoy-user list
        bitstrings = [
            '0000000000011000001110000011111000101110111110000100000100000000',
            '0000000000011000001110000011111000101110111110000100000100000001',
            '0000000000011000001110000011111000101110111110000100000100000010',
            '0010010100011001001000010001100101011110000000110000011110001100',
            '1001011010000110100101101001111010001110100001101000111000001110',
            '0111100101111001011110010010001100010111000111100001101100011111',
            '0011000010011101000011010010111000101110100101111000011101001011',
            '0011000010011100000011010010111000101110100101111000011101001011',
            '1001100000111010001010000010110000111100100101001001010000000111',
            '0000000000111101010100010001000101101001000000011000001101000000',
            '1000101001010001011100010111001100110011001100110011001111001100',
            '1110011001001111100110010001100100001011000011010010111100100111',
        ]
        vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

        f = 64
        idx = AnnoyIndex(f, 'hamming')
        for i, v in enumerate(vectors):
            idx.add_item(i, v)

        idx.build(10)
        idx.save('idx.ann')
        idx = AnnoyIndex(f, 'hamming')
        idx.load('idx.ann')
        js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
        self.assertEqual(js[0], 0)
        self.assertEqual(ds[:4], [0, 1, 1, 22])
Example #8
def build_tree(df, metric):
    '''
    INPUTS: Pandas DataFrame, Choice of Metric Space String
    OUTPUTS: Returns the built AnnoyIndex tree, returns a dictionary
             mapping index numbers to the DataFrame's index

    Builds an ANN tree using Spotify's Annoy library. Metric is the
    metric space (either euclidean or angular).
    '''
    tree = AnnoyIndex(len(df.iloc[0, :].values), metric=metric)

    indexes = {}

    for i in range(len(df)):
        v = df.iloc[i, :]
        indexes[i] = v.name
        tree.add_item(i, v.values)

    tree.build(50)

    tree.save(DATA_DIR + 'tree_' + metric + '.ann')
    with open(DATA_DIR + 'indexes_' + metric, 'wb') as f:
        pickle.dump(indexes, f)

    return (tree, indexes)
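A sketch of reading these artifacts back, assuming the same DATA_DIR constant and that f matches the DataFrame's column count used at build time:

import pickle
from annoy import AnnoyIndex

f = 10  # must equal the vector length the tree was built with
tree = AnnoyIndex(f, metric='euclidean')
tree.load(DATA_DIR + 'tree_euclidean.ann')
with open(DATA_DIR + 'indexes_euclidean', 'rb') as fh:
    indexes = pickle.load(fh)

# Map Annoy item ids back to the DataFrame's index labels.
neighbor_labels = [indexes[i] for i in tree.get_nns_by_item(0, 10)]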
Example #9
    def recall_at(self, n, n_trees=10, n_points=1000, n_rounds=5):
        # the best movie/variable name
        total_recall = 0.

        for r in range(n_rounds):
            # create random points at distance x
            f = 10
            idx = AnnoyIndex(f, 'dot')

            data = numpy.array([
                [random.gauss(0, 1) for z in range(f)]
                for j in range(n_points)
            ])

            expected_results = [
                sorted(
                    range(n_points),
                    key=lambda j: dot_metric(data[i], data[j])
                )[:n]
                for i in range(n_points)
            ]

            for i, vec in enumerate(data):
                idx.add_item(i, vec)

            idx.build(n_trees)

            for i in range(n_points):
                nns = idx.get_nns_by_vector(data[i], n)
                total_recall += recall(nns, expected_results[i])

        return total_recall / float(n_rounds * n_points)
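The test above leans on two helpers that are not shown. A plausible reconstruction (hedged; the real definitions live elsewhere in the test module and may differ):

import numpy

def dot_metric(u, v):
    # Negated dot product, so an ascending sort ranks the largest
    # inner products first.
    return -numpy.dot(u, v)

def recall(retrieved, relevant):
    # Fraction of the relevant ids that appear in the retrieved list.
    return len(set(retrieved) & set(relevant)) / float(len(relevant))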
Example #10
 def test_get_lots_of_nns(self):
     f = 10
     i = AnnoyIndex(f, 'euclidean')
     i.add_item(0, [random.gauss(0, 1) for x in range(f)])
     i.build(10)
     for j in range(100):
         self.assertEqual(i.get_nns_by_item(0, 999999999), [0])
Example #11
def precision(f=40, n=1000000):
    t = AnnoyIndex(f)
    for i in range(n):
        v = []
        for z in range(f):
            v.append(random.gauss(0, 1))
        t.add_item(i, v)

    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in range(prec_n):
        j = random.randrange(0, n)
        print('finding nbs for', j)
        
        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0
            
            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T

        for limit in limits:
            print('limit: %-9d precision: %6.2f%% avg time: %.6fs' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1)))
Example #12
    def test_tuple(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in range(n_points):
            i.add_item(j, (random.gauss(0, 1) for x in range(f)))

        i.build(n_trees)
Example #13
File: holes_test.py Project: spotify/annoy
 def _test_holes_base(self, n, f=100, base_i=100000):
     annoy = AnnoyIndex(f)
     for i in range(n):
         annoy.add_item(base_i + i, numpy.random.normal(size=(f,)))
     annoy.build(100)
     res = annoy.get_nns_by_item(base_i, n)
     self.assertEqual(set(res), set([base_i + i for i in range(n)]))
Example #14
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    # print(user_lemma_matrix_tfidf.shape)
    if (user_lemma_matrix_tfidf.shape[0] <= dimensionality) or (user_lemma_matrix_tfidf.shape[1] <= dimensionality):
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)

    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])

    annoy_index.build(number_of_estimators)

    row = list()
    col = list()
    data = list()
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors, include_distances=True)

        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    text_graph = spsp.coo_matrix((data, (row, col)), shape=(X_svd.shape[0], X_svd.shape[0]))
    text_graph = spsp.csr_matrix(text_graph)

    return text_graph
Example #15
    def test_dist_degen(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [1, 0])
        i.add_item(1, [0, 0])

        self.assertAlmostEqual(i.get_distance(0, 1), 2.0**0.5)
Example #16
    def _get_index(self, dataset):
        url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
        vectors_fn = os.path.join('test', dataset + '.hdf5')
        index_fn = os.path.join('test', dataset + '.annoy')

        if not os.path.exists(vectors_fn):
            print('downloading', url, '->', vectors_fn)
            urlretrieve(url, vectors_fn)

        dataset_f = h5py.File(vectors_fn, 'r')
        distance = dataset_f.attrs['distance']
        f = dataset_f['train'].shape[1]
        annoy = AnnoyIndex(f, distance)

        if not os.path.exists(index_fn):
            print('adding items', distance, f)
            for i, v in enumerate(dataset_f['train']):
                annoy.add_item(i, v)

            print('building index')
            annoy.build(10)
            annoy.save(index_fn)
        else:
            annoy.load(index_fn)
        return annoy, dataset_f
Example #17
File: index_test.py Project: spotify/annoy
    def test_overwrite_index(self):
        # Issue #335
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)
        t.save('test.ann')

        # Load index file
        t2 = AnnoyIndex(f)
        t2.load('test.ann')

        # Overwrite index file
        t3 = AnnoyIndex(f)
        for i in range(500):
            v = [random.gauss(0, 1) for z in range(f)]
            t3.add_item(i, v)
        t3.build(10)
        if os.name == 'nt':
            # Can't overwrite on Windows
            with self.assertRaises(IOError):
                t3.save('test.ann')
        else:
            t3.save('test.ann')
            # Get nearest neighbors
            v = [random.gauss(0, 1) for z in range(f)]
            nns = t2.get_nns_by_vector(v, 1000)  # Should not crash
Example #18
def build_index(df, n_trees=50, dist_metric='angular', out_dir="./"):
    n_records = df.shape[0]
    n_col = df.shape[1]
    index = AnnoyIndex(n_col, metric=dist_metric)
    patient_dict = {}
    index_dict = {}
    i = 0
    print "Adding items to the index..."
    for patient_id in df.index.values:
        if i % 10000 == 0:
            print str(i)
        vec = df.loc[patient_id].values
        index.add_item(i,vec)
        patient_dict[patient_id] = i
        index_dict[i] = patient_id
        i += 1
    print "Building the index..."
    index.build(n_trees)
    index.save(out_dir+"annoy_index.ann")
    ## Save the patient_id -> index mapping ##
    w = csv.writer(open(out_dir+"patient_mapping.csv", "w"))
    for key, val in patient_dict.items():
        w.writerow([key, val])
    w = csv.writer(open(out_dir+"index_mapping.csv", "w"))
    for key, val in index_dict.items():
        w.writerow([key, val])
Example #19
File: index_test.py Project: spotify/annoy
    def test_write_failed(self):
        f = 40

        # Build the initial index
        t = AnnoyIndex(f)
        for i in range(1000):
            v = [random.gauss(0, 1) for z in range(f)]
            t.add_item(i, v)
        t.build(10)

        if sys.platform == "linux" or sys.platform == "linux2":
            # linux
            try:
                t.save("/dev/full") 
                self.fail("didn't get expected exception")
            except Exception as e:
                self.assertTrue(str(e).find("No space left on device") > 0)
        elif sys.platform == "darwin":
            volume = "FULLDISK"
            device = os.popen('hdiutil attach -nomount ram://64').read()
            os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
            os.popen('touch "/Volumes/%s/full"' % volume)
            try:
                t.save('/Volumes/%s/annoy.tree' % volume)
                self.fail("didn't get expected exception")
            except Exception as e:
                self.assertTrue(str(e).find("No space left on device") > 0)
            finally:
                os.popen("hdiutil detach %s" % device)
Example #20
 def test_single_vector(self):
     # https://github.com/spotify/annoy/issues/194
     a = AnnoyIndex(3)
     a.add_item(0, [1, 0, 0])
     a.build(10)
     a.save('1.ann')
     self.assertEqual(a.get_nns_by_vector([1, 0, 0], 3, include_distances=True), ([0], [0.0]))
Example #21
    def test_wrong_length(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [random.gauss(0, 1) for x in range(f)])
        self.assertRaises(IndexError, i.add_item, 1, [random.gauss(0, 1) for x in range(f + 1000)])
        self.assertRaises(IndexError, i.add_item, 2, [])

        i.build(n_trees)
Example #22
def build_index(counts, label_to_id, dimension):
    index = AnnoyIndex(dimension, metric='angular')
    for label, cnt_list in counts.items():
        item_id = label_to_id[label]
        index.add_item(item_id, cnt_list)

    index.build(100)
    return index
Example #23
    def test_get_nns_by_vector(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.add_item(0, [2,2])
        i.add_item(1, [3,2])
        i.build(10)

        self.assertEqual(i.get_nns_by_vector([3,3], 2), [1, 0])
Example #24
 def test_get_item_vector(self):
     f = 10
     i = AnnoyIndex(f, 'euclidean')
     i.add_item(0, [random.gauss(0, 1) for x in range(f)])
     for j in range(100):
         print(j, '...')
         for k in range(1000 * 1000):
             i.get_item_vector(0)
Example #25
def fit_annoy(data, n_trees=-1):
    logger.info('Fitting Annoy Matcher...')
    from annoy import AnnoyIndex
    matcher = AnnoyIndex(data.shape[1], metric='euclidean')
    for i, d in enumerate(data):
        matcher.add_item(i, d)
    matcher.build(n_trees)
    return matcher
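A usage sketch (the logger and data below are stand-ins; any 2-D float array works):

import logging
import numpy as np

logger = logging.getLogger(__name__)  # fit_annoy expects a module-level logger

data = np.random.rand(1000, 64).astype(np.float32)
matcher = fit_annoy(data, n_trees=10)
ids, dists = matcher.get_nns_by_vector(data[0], 5, include_distances=True)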
Example #26
 def test_save_without_build(self):
     # Issue #61
     i = AnnoyIndex(10)
     i.add_item(1000, [random.gauss(0, 1) for z in range(10)])
     i.save('x.tree')
     j = AnnoyIndex(10)
     j.load('x.tree')
     j.build(10)
Example #27
    def test_dist_3(self):
        f = 2
        i = AnnoyIndex(f)
        i.add_item(0, [97, 0])
        i.add_item(1, [42, 42])

        dist = ((1 - 2 ** -0.5) ** 2 + (2 ** -0.5) ** 2)**0.5

        self.assertAlmostEqual(i.get_distance(0, 1), dist)
Example #28
 def test_include_dists_check_ranges(self):
     f = 3
     i = AnnoyIndex(f)
     for j in range(100000):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     indices, dists = i.get_nns_by_item(0, 100000, include_distances=True)
     self.assertTrue(max(dists) < 2.0)
     self.assertAlmostEqual(min(dists), 0.0)
Example #29
    def test_numpy(self, n_points=1000, n_trees=10):
        f = 10
        i = AnnoyIndex(f, 'euclidean')
        for j in range(n_points):
            a = numpy.random.normal(size=f)
            a = a.astype(random.choice([numpy.float64, numpy.float32, numpy.uint8, numpy.int16]))
            i.add_item(j, a)

        i.build(n_trees)
Example #30
File: index.py Project: leahic/gensim
    def _build_from_model(self, vectors, labels, num_features):
        index = AnnoyIndex(num_features)

        for vector_num, vector in enumerate(vectors):
            index.add_item(vector_num, vector)

        index.build(self.num_trees)
        self.index = index
        self.labels = labels
Example #31
File: annoy_test.py Project: zyfnhct/annoy
    def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
        found = 0
        for r in range(n_rounds):
            # create random points at distance x from (1000, 0, 0, ...)
            f = 10
            i = AnnoyIndex(f, 'euclidean')
            for j in range(n_points):
                p = [random.gauss(0, 1) for z in range(f - 1)]
                norm = sum([pi**2 for pi in p])**0.5
                x = [1000] + [pi / norm * j for pi in p]
                i.add_item(j, x)

            i.build(n_trees)

            nns = i.get_nns_by_vector([1000] + [0] * (f - 1), n)
            self.assertEqual(nns, sorted(nns))  # should be in order
            # The number of gaps should be equal to the last item minus n-1
            found += len([x for x in nns if x < n])

        return 1.0 * found / (n * n_rounds)
Example #32
def nn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    # Build index.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((a, b_i))

    return match
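A usage sketch. Note that the knn=KNN default is evaluated when the def statement runs, so a module-level KNN constant must already exist; everything below is illustrative:

import numpy as np

KNN = 20  # assumed module constant, defined before nn_approx's def runs

ds1 = np.random.rand(50, 8)
ds2 = np.random.rand(80, 8)
match = nn_approx(ds1, ds2, knn=5)
# `match` is a set of (row in ds1, approximate-neighbour row in ds2) pairs.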
Example #33
File: textrank.py Project: josix/NEPT
def vsm(fp=FILEPATH):
    print("Train vsm from {}".format(fp))
    with open(fp, 'r') as fin:
        index_id_dict = {}
        corpus = []
        for index, line in tqdm(enumerate(fin)):
            event_id, *event_description_list = line.strip().split(',')
            event_description = " ".join(event_description_list)
            sentence = jieba.analyse.extract_tags(event_description)
            corpus.append(" ".join(sentence))
            index_id_dict[index] = event_id
        vectorizer = TfidfVectorizer()
        document_term_matrix = vectorizer.fit_transform(corpus)
        dim = document_term_matrix.shape[1]
        annoy_index = AnnoyIndex(dim)
        for index, vector in enumerate(document_term_matrix):
            annoy_index.add_item(index, vector.toarray()[0])
        annoy_index.build(10) # 10 trees
        annoy_index.save('vsm_tfidf.ann')
        return index_id_dict, vectorizer, document_term_matrix
Example #34
 def test_distance_consistency(self):
     n, f = 1000, 3
     i = AnnoyIndex(f, 'angular')
     for j in range(n):
         i.add_item(j, numpy.random.normal(size=f))
     i.build(10)
     for a in random.sample(range(n), 100):
         indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
         for b, dist in zip(indices, dists):
             self.assertAlmostEqual(dist, i.get_distance(a, b))
             u = i.get_item_vector(a)
             v = i.get_item_vector(b)
             u_norm = numpy.array(u) * numpy.dot(u, u)**-0.5
             v_norm = numpy.array(v) * numpy.dot(v, v)**-0.5
             # cos = numpy.clip(1 - cosine(u, v), -1, 1) # scipy returns 1 - cos
             self.assertAlmostEqual(
                 dist**2, numpy.dot(u_norm - v_norm, u_norm - v_norm))
             # self.assertAlmostEqual(dist, (2*(1 - cos))**0.5)
             self.assertAlmostEqual(
                 dist**2, sum([(x - y)**2 for x, y in zip(u_norm, v_norm)]))
Example #35
def KNN_Annoy(X, KK):
    NK = KK
    NN, NF = X.shape
    if KK > NF:
        raise ValueError("KK should be less than 2th-dim of X")

    t = AnnoyIndex(NF, metric='euclidean')
    for i, v in enumerate(X):
        t.add_item(i, v)

    t.build(100)
    ind = []
    val = []

    for i in range(NN):
        closest = t.get_nns_by_item(i, NK)
        ind.append(closest)
        val.append([t.get_distance(i, j) for j in closest])

    return np.array(ind), np.array(val)
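A usage sketch for KNN_Annoy (random data for illustration):

import numpy as np

X = np.random.rand(200, 16)
ind, val = KNN_Annoy(X, 10)
# ind[i] holds the ids of the 10 approximate nearest neighbours of row i
# (row i itself typically appears first, at distance 0); val holds the
# matching euclidean distances.
assert ind.shape == (200, 10) and val.shape == (200, 10)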
Example #36
def make_ann(n_dim=N_DIM, n_items=100):
    ids = []
    ann = AnnoyIndex(n_dim, METRIC)
    ann.on_disk_build(PATH_DISK_SAVE)

    for ind in range(n_items):
        v = [random.gauss(0, 1) for _ in range(n_dim)]
        ann.add_item(ind, v)
        ids.append(str(ind))

    ann.build(N_TREES)

    meta_d = {
        'vec_src': Path(__file__).name,
        'metric': METRIC,
        'n_dim': n_dim,
        'timestamp_utc': datetime.utcnow().isoformat(),
    }

    return ids, meta_d
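make_ann reads several module-level constants that are not shown; N_DIM in particular must exist before the def statement runs, because it is used as a default argument. A hedged sketch of plausible definitions, plus mmap-loading the index that on_disk_build already wrote:

import random
from datetime import datetime
from pathlib import Path
from annoy import AnnoyIndex

# Assumed constants (names come from the snippet above, values are made up).
N_DIM = 64
METRIC = 'angular'
N_TREES = 10
PATH_DISK_SAVE = 'ann_index.ann'

ids, meta = make_ann()

ann = AnnoyIndex(N_DIM, METRIC)
ann.load(PATH_DISK_SAVE)  # the index is already on disk; no save() needed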
Example #37
        def _random_nn(X):
            idx = AnnoyIndex(X.shape[1], 'euclidean')
            for i in range(X.shape[0]):
                idx.add_item(i, X[i])

            logging.info("building an index with %d items" % X.shape[0])
            idx.build(50)

            logging.info("finding %d neighbor groups" % self.n_clusters)
            seen = {}
            label = 0

            guess = np.random.randint(X.shape[0])
            centers = {guess: 0}

            while label < self.n_clusters:
                neighbors = idx.get_nns_by_item(guess, _get_num_neighbors())
                for point in neighbors:
                    seen[point] = label
                seen[guess] = label

                # find a distant point
                dists = np.array([[idx.get_distance(i, j) for i in centers]
                                  for j in range(X.shape[0])])

                avg_dists = np.average(dists, axis=1)
                dist_prob = softmax(avg_dists)

                guess = np.random.choice(X.shape[0], p=dist_prob)

                while guess in seen:
                    guess = np.random.choice(X.shape[0], p=dist_prob)
                centers[guess] = label

                label = label + 1

            y = np.zeros(X.shape[0])

            for k, v in seen.items():
                y[k] = v
            return y
Example #38
def main(args):
    index = AnnoyIndex(2048, 'euclidean')
    index_metadata = {}

    model = load_model()

    batch = []
    total_size = 0
    for i, fname in enumerate(os.listdir(args.images_dir)):
        if not (fname.endswith('.jpg') or fname.endswith('.png') or fname.endswith('.jpeg')):
            continue

        path = os.path.join(args.images_dir, fname)
        try:
            img = open_img(path)
            batch.append((i, img, fname))
        except Exception as e:
            print(e)
            continue

        if len(batch) == args.batch_size:
            total_size += len(batch)
            print("Process batch: %d" % total_size)
            ids, imgs, img_fnames = zip(*batch)
            vectors = get_feature_vectors(model, imgs).numpy()
            for j, vector in enumerate(vectors):
                index.add_item(ids[j], vector.tolist())
                index_metadata[ids[j]] = {
                    'filename': img_fnames[j]
                }

            batch = []

            if total_size >= args.max_items:
                break

    print('Build index')
    index.build(args.n_trees)
    print('Save index')
    index.save(os.path.join(args.dst, 'index.ann'))
    json.dump(index_metadata, open(os.path.join(args.dst, 'index_metadata.json'), 'w'))
Example #39
def build_db(face_path, save_path):
    """
    Builds FaceEmbedding Database of people

    Args:
        face_path (Path): Face Directory Path
        save_path (Path): Save path for SkywatchDB
    """
    face_path = parse_path(face_path)
    save_path = parse_path(save_path)

    print("SkywatchDB Build Started...")

    face_tree = AnnoyIndex(embedding_size, 'euclidean')
    image_paths = _get_image_paths(face_path)
    i = 1
    person_id_map = {}
    for person, images in image_paths.items():
        for image in images:
            faces = get_faces(image, enforce=True)
            try:
                aligned_face = align_face(faces[0]['image'])
                embedding = get_face_embedding(aligned_face)
            except IndexError:
                raise AssertionError('Could not detect face in ' + image)
            except TypeError:
                print(f"Cannot detect face for {person} in {image}")
                continue
            face_tree.add_item(i, embedding)
            person_id_map[i] = person
            i += 1
    face_tree.build(5)
    try:
        face_tree.save(save_path.joinpath('faceEmbed.db').as_posix())
        save_file = open(save_path.joinpath('nameMap.db').as_posix(), 'wb')
        pickle.dump(person_id_map, save_file)
        save_file.close()
        print('SkywatchDB successfully saved at ', save_path.as_posix())
    except Exception:
        raise SystemError(
            'Storage Access Error. Cannot save Skywatch Database.')
Example #40
    def rebuild_index(self, items: List[int], texts: List[str],
                      embeddings: List[np.ndarray]):
        try:
            __temp_index = AnnoyIndex(VECTOR_LENGTH, metric='angular')
            __temp_mapping = OrderedDict()
            for _i, _item in enumerate(items):
                __temp_index.add_item(_i, embeddings[_i])
                __temp_mapping[_item] = {
                    'index': _i,
                    'text': texts[_i],
                    'embedding': embeddings[_i]
                }

            logger.info('A total of {} items added to the index'.format(len(items)))
            logger.info('Building the index with {} trees...'.format(N_TREES))
            __temp_index.build(n_trees=N_TREES)
            logger.info('Index is successfully built.')

            logger.info('Saving index to disk...')
            with tempfile.TemporaryFile() as fp:
                __temp_index.save(str(fp.name))
                self.__index_file = str(fp.name)
            logger.info('Index is saved to disk.')
            logger.info("Index file size: {} GB".format(
                round(os.path.getsize(self.__index_file) / float(1024**3), 2)))
            logger.info('Saving mapping to disk...')
            with open(self.__index_file + '.mapping', 'wb') as handle:
                pickle.dump(__temp_mapping,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
            logger.info('Mapping is saved to disk.')
            logger.info("Mapping file size: {} MB".format(
                round(
                    os.path.getsize(self.__index_file + '.mapping') /
                    float(1024**2), 2)))
        except Exception as e:
            logger.error("Error updating index " + str(e))
            raise e
        else:
            self.__load_index__()
            return str(self.__index_file)
Example #41
def representative_sample(X, num_samples, save=False):
    """Sample vectors in X, preferring edge cases and vectors farthest
    from other vectors already in the sample set.
    """
    X = X.values if hasattr(X, 'values') else np.array(X)
    N, M = X.shape
    rownums = np.arange(N)
    np.random.shuffle(rownums)

    idx = AnnoyIndex(M)
    for i, row in enumerate(X):
        idx.add_item(i, row)
    idx.build(int(np.log2(N)) + 1)

    if save:
        if isinstance(save, str):
            idxfilename = save
        else:
            idxfile = tempfile.NamedTemporaryFile(delete=False)
            idxfile.close()
            idxfilename = idxfile.name
        idx.save(idxfilename)
        idx = AnnoyIndex(M)
        idx.load(idxfilename)

    samples = -1 * np.ones(shape=(num_samples, ), dtype=int)
    samples[0] = rownums[0]
    # FIXME: some integer determined by N and num_samples and distribution
    j, num_nns = 0, min(1000, int(num_samples / 2. + 1))
    for i in rownums:
        if i in samples:
            continue
        nns = idx.get_nns_by_item(i, num_nns)
        # FIXME: pick vector furthest from past K (K > 1) points or outside of a hypercube
        #        (sized to uniformly fill the space) around the last sample
        samples[j + 1] = np.setdiff1d(nns, samples)[-1]
        if num_nns < num_samples / 3.:
            num_nns = min(N, int(1.3 * num_nns))
        j += 1
    return samples
Example #42
def main():
    embed = Embedding()
    images = [image for image in
              os.listdir(os.path.join(config.IMAGE_PATH, "face_db"))
              if image.endswith(".png")]
    images.sort(key=human_sort)

    with open(config.FACE_NAMES, 'w') as f:
        [f.write(image + '\n') for image in images]

    imgs = [cv2.imread(os.path.join(config.IMAGE_PATH, "face_db", image))
            for image in images]

    t = AnnoyIndex(512, metric="euclidean")

    for i, img in enumerate(imgs):
        t.add_item(i, embed.get_feature(img))
    t.build(10)

    # rewrite face_db.ann
    t.save(config.FACE_FEATURES)
Example #43
class AnnoyCB:
    def __init__(self, n_sim_movie=10, trees=10, model_name='angular'):
        self.n_sim_movie = n_sim_movie
        self.trees = trees
        self.model_name = model_name

    def fit(self, item_matrix):
        num, vec_dim = item_matrix.shape
        self.model = AnnoyIndex(vec_dim, self.model_name)
        for i, vec in enumerate(item_matrix):
            self.model.add_item(i, vec)
        self.model.build(self.trees)

    def predict(self, item_matrix):
        num, vec_dim = item_matrix.shape
        res_result = []
        for i in range(num):
            items = self.model.get_nns_by_item(i, self.n_sim_movie)
            res_result.append(items)

        return res_result
Example #44
def update():
    params = json.loads(request.get_data())
    if len(params) == 0:
        return "No parameter"

    try:
        global annoy_index

        new_annoy_index = AnnoyIndex(vector_len, distance_metric)
        for id, url in params["urls"]:
            features = extract_features(url, model,
                                        applications.densenet.preprocess_input)
            new_annoy_index.add_item(id, features)
        new_annoy_index.build(10)

        annoy_index.unload()
        new_annoy_index.save("cat-pictures.annoy")
        annoy_index = new_annoy_index
        return "Update complete"
    except Exception:
        return "Annoy error"
Example #45
def createAnnoyIndex(codebook_path: str, bit_len: int, n_trees: int):
    def extractVectorsFromCodebook(codebook: str, bit_len: int):
        df = pd.read_csv(codebook)
        df['Barcode'] = [
            f"{barcode:0{bit_len}}" for barcode in list(df['Barcode'])
        ]
        df['Vector'] = [
            createBarcodeVector(barcode) for barcode in df['Barcode']
        ]
        list_of_codebook_vectors = np.array(df['Vector'])
        return list_of_codebook_vectors

    list_of_codebook_vectors = extractVectorsFromCodebook(codebook_path, bit_len)
    n_vectors = len(list_of_codebook_vectors)

    t = AnnoyIndex(bit_len, 'euclidean')
    for i in range(0, n_vectors):
        v = list_of_codebook_vectors[i]
        t.add_item(i, v)
    t.build(n_trees)
    t.save("codebook_index.ann")
Example #46
def histogram(data):
    f = 256
    histogram = AnnoyIndex(f, 'angular')
    index = []
    a = 0

    for i in data:
        try:
            req = urlopen(i)
            arr = np.asarray(bytearray(req.read()), dtype=np.uint8)
            img = cv2.imdecode(arr, 1)  # 'Load it as it is'
            gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            hist = cv2.calcHist([gray_image], [0], None, [256], [0, 256])
            histogram.add_item(a, hist)
            index.append(a)
        except urllib.error.HTTPError:
            print("cannot find " + str(a))
        except cv2.error:
            print("cannot find " + str(a))
        a += 1
    return histogram, index
Example #47
File: index_test.py Project: zzszmyf/annoy
    def test_very_large_index(self):
        # Issue #388
        f = 3
        dangerous_size = 2**31
        size_per_vector = 4 * (f + 3)
        n_vectors = int(dangerous_size / size_per_vector)
        m = AnnoyIndex(3, 'angular')
        m.verbose(True)
        for i in range(100):
            m.add_item(n_vectors + i, [random.gauss(0, 1) for z in range(f)])
        n_trees = 10
        m.build(n_trees)
        path = 'test_big.annoy'
        m.save(path)  # Raises on Windows

        # Sanity check size of index
        self.assertGreaterEqual(os.path.getsize(path), dangerous_size)
        self.assertLess(os.path.getsize(path), dangerous_size + 100e3)

        # Sanity check number of trees
        self.assertEqual(m.get_n_trees(), n_trees)
Example #48
    def __init__(self, documents, training_videos=None):
        if training_videos is None:
            training_videos = ALL_VIDEOS

        annoy_n_trees = CONFIGURATION.getint('annoy_n_trees')
        num_dense_units = CONFIGURATION.getint('num_dense_units')
        model = _KerasSiameseNeuralNetwork(training_videos)

        LOGGER.debug(
            'Building an ANNOY index with {} trees'.format(annoy_n_trees))
        annoy_index = AnnoyIndex(num_dense_units, metric='euclidean')
        pages = dict()
        for page_index, (page, page_features) in enumerate(
                model.get_page_features(chain(*documents))):
            annoy_index.add_item(page_index, page_features)
            pages[page_index] = page
        annoy_index.build(annoy_n_trees)

        self._annoy_index = annoy_index
        self._model = model
        self._pages = pages
Example #49
File: knn.py Project: kevinrue/ivis
def build_annoy_index(X, path, ntrees=50, verbose=1):

    index = AnnoyIndex(X.shape[1], metric='angular')
    if platform.system() != 'Windows':
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    # Build n trees
    index.build(ntrees)
    if platform.system() == 'Windows':
        index.save(path)

    return index
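A usage sketch (the path is arbitrary; tqdm, scipy.sparse.issparse and platform are assumed to be imported at module level):

import numpy as np

X = np.random.rand(1000, 32).astype(np.float32)
index = build_annoy_index(X, 'ivis_annoy.index', ntrees=50, verbose=0)
neighbours = index.get_nns_by_item(0, 11)[1:]  # drop the query point itself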
Example #50
 def nearest_neighbor_search(self, GE_csc):
     K = self.num_of_neighbor * 2
     n, d = GE_csc.shape
     t = AnnoyIndex(d)
     for i in range(n):
         t.add_item(i, GE_csc[i, :])
     t.build(100)
     print('#######OS PROCESS ID#####')
     print(str(os.getpid()))
     ann_file = str(os.getpid()) + 'test.ann'
     t.save(ann_file)
     u = AnnoyIndex(d)
     u.load(ann_file)
     os.remove(ann_file)
     val = np.zeros((n, K))
     ind = np.zeros((n, K))
     for i in range(n):
         tmp, tmp1 = u.get_nns_by_item(i, K, include_distances=True)
         ind[i, :] = tmp
         val[i, :] = tmp1
     return ind.astype('int'), val
Example #51
def annoy_train(spark, dirname, rank, regParam, n_trees, random_seed):
    # Load model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # get item factors
    item_factors = model.itemFactors
    item_factors, annoy_index_map = convert_annoy_index(item_factors)

    # train annoy model
    tree = AnnoyIndex(rank, 'dot')
    for item in tqdm(item_factors.collect()):
        tree.add_item(item.annoy_id, item.features)
    tree.set_seed(random_seed)

    # build the tree
    # num of trees: higher n_trees gives higher precision
    tree.build(n_trees)

    # save annoy model and index map
    tree.save(f'{dirname}_{rank}_{regParam}_tree.ann')
    annoy_index_map.write.parquet(f'{dirname}_{rank}_{regParam}_annoy_index_map.parquet')
Example #52
def label_approx(X, sites, site_labels):
    from annoy import AnnoyIndex

    assert (X.shape[1] == sites.shape[1])

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='euclidean')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(10)

    labels = []
    for i in range(X.shape[0]):
        # Find nearest site point.
        nearest_site = aindex.get_nns_by_vector(X[i, :], 1)
        if len(nearest_site) < 1:
            labels.append(None)
            continue
        labels.append(site_labels[nearest_site[0]])

    return np.array(labels)
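A usage sketch for label_approx (random stand-in data):

import numpy as np

X = np.random.rand(100, 8)        # query points
sites = np.random.rand(10, 8)     # reference points
site_labels = np.arange(10)       # one label per site point
labels = label_approx(X, sites, site_labels)
assert labels.shape == (100,)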
Example #53
	def find_nearest(self):
		ann = AnnoyIndex(num_merchants)
		for customer in self.customers:
			customer_vector = list(matrix.loc[[customer]])
			ann.add_item(customer, customer_vector)
			if customer % 200 == 0:
				print('Adding ' + str(customer))
		print("Building")
		if len(self.merchantIDs) > max_trees:
			ann.build(max_trees)
		else:
			ann.build(len(self.merchantIDs))
		print "...done"
		for customer in self.customers:
			neighbors = ann.get_nns_by_item(customer, num_neighbors)
			if customer % 200 == 0:
				print("Found neighbors for " + str(customer))
			self.nearest[customer] = []
			for neighbor in neighbors:
				if neighbor != customer:
					self.nearest[customer].append((neighbor, ann.get_distance(neighbor, customer)))
Example #54
def build_index_annoy(h5fname, dset, out='data.ann', trees=128, lazy=True):
    # establish connection to HDF5 file
    h5f = h5py.File(h5fname, 'r')
    if lazy:
        X = h5f[dset]
    else:
        X = h5f[dset][:]

    # get dimension
    f = X.shape[1]

    # initialize annoy
    t = AnnoyIndex(f, 'angular')

    # iterate over features, add to annoy
    for i, v in enumerate(X):
        t.add_item(i, v)

    # build and save index
    t.build(trees)
    t.save(out)
Example #55
class spherefaceAnnoyDatabase():
    def __init__(self):
        self.network = caffe.Net("pretrainedModels/sphereface_deploy.prototxt",
                                 "pretrainedModels/sphereface_model.caffemodel", 0)
        # 512 is the number of neurons in the last layer of the net
        self.index = AnnoyIndex(512, metric='angular')
        self.indexToName = {}
        self.nameToIndex = {}

    def getEmbedding(self, imgPath):
        net = self.network
        img = Image.open(imgPath)
        sampleImage = numpy.array(img.resize((net.blobs['data'].data.shape[3],
                                              net.blobs['data'].data.shape[2])))
        sampleImage = numpy.reshape(sampleImage, (1,) + sampleImage.shape).transpose(0, 3, 1, 2).astype(numpy.float32)
        net.blobs['data'].data[...] = sampleImage
        net.forward()
        return net.blobs['fc5'].data[0].copy()

    def addFaceWithName(self, imgPath, name):
        embedding = self.getEmbedding(imgPath)
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = name
        self.nameToIndex[name] = length

    def addEmbeddingWithName(self, embedding, name):
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = name
        self.nameToIndex[name] = length

    def addFaceWithoutName(self, imgPath):
        embedding = self.getEmbedding(imgPath)
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = imgPath
        self.nameToIndex[imgPath] = length

    def freeze(self, nTrees=20):
        self.index.build(nTrees)

    def lookupByFace(self, imgPath, numberOfNeighbours):
        embedding = self.getEmbedding(imgPath)
        results = self.index.get_nns_by_vector(embedding, numberOfNeighbours,
                                               search_k=-1, include_distances=True)
        for i in range(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results

    def lookupByEmbedding(self, embedding, numberOfNeighbours):
        if numberOfNeighbours == -1:
            numberOfNeighbours = self.index.get_n_items()
        results = self.index.get_nns_by_vector(embedding, numberOfNeighbours,
                                               search_k=-1, include_distances=True)
        for i in range(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results

    def lookupByName(self, name, numberOfNeighbours):
        if numberOfNeighbours == -1:
            numberOfNeighbours = self.index.get_n_items()
        results = self.index.get_nns_by_item(self.nameToIndex[name], numberOfNeighbours,
                                             search_k=-1, include_distances=True)
        for i in range(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results
Example #56
File: tnn.py Project: lkmklsmn/bbtnn
def build_annoy_index(X, path, ntrees=50, build_index_on_disk=True, verbose=1):
    """ Build a standalone annoy index.
    :param array X: numpy array with shape (n_samples, n_features)
    :param str path: The filepath of a trained annoy index file
        saved on disk.
    :param int ntrees: The number of random projections trees built by Annoy to
        approximate KNN. The more trees the higher the memory usage, but the
        better the accuracy of results.
    :param bool build_index_on_disk: Whether to build the annoy index directly
        on disk. Building on disk should allow for bigger datasets to be indexed,
        but may cause issues. If None, on-disk building will be enabled for Linux,
        but not Windows due to issues on Windows.
    :param int verbose: Controls the volume of logging output the model
        produces when training. When set to 0, silences outputs, when above 0
        will print outputs.
    """

    index = AnnoyIndex(X.shape[1], metric='euclidean')
    if build_index_on_disk:
        index.on_disk_build(path)

    if issparse(X):
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i].toarray()[0]
            index.add_item(i, v)
    else:
        for i in tqdm(range(X.shape[0]), disable=verbose < 1):
            v = X[i]
            index.add_item(i, v)

    try:
        index.build(ntrees)
    except Exception:
        msg = ("Error building Annoy Index. Passing on_disk_build=False"
                " may solve the issue, especially on Windows.")
        raise IndexBuildingError(msg)
    else:
        if not build_index_on_disk:
            index.save(path)
        return index
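A usage sketch (IndexBuildingError is assumed to be defined elsewhere in the module; the path is arbitrary):

import numpy as np

X = np.random.rand(10000, 50)
index = build_annoy_index(X, 'bbtnn_annoy.index', ntrees=50,
                          build_index_on_disk=False, verbose=0)
neighbours, dists = index.get_nns_by_item(0, 10, include_distances=True)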
Example #57
def build_type_clusters(model, train_data_loader: DataLoader,
                        valid_data_loader: DataLoader, type_vocab: set):

    computed_embed_labels = []
    annoy_idx = AnnoyIndex(model.output_size, 'euclidean')
    curr_idx = 0

    for _, (a, _, _) in enumerate(
            tqdm(train_data_loader,
                 total=len(train_data_loader),
                 desc="Computing Type Clusters - Train set")):
        model.eval()
        with torch.no_grad():
            output_a = model(*(s.to(DEVICE) for s in a[0]))
            labels = a[1].data.cpu().numpy()
            for i, v in enumerate(output_a.data.cpu().numpy()):
                if labels[i] in type_vocab:
                    annoy_idx.add_item(curr_idx, v)
                    computed_embed_labels.append(labels[i])
                    curr_idx += 1

    for _, (a, _, _) in enumerate(
            tqdm(valid_data_loader,
                 total=len(valid_data_loader),
                 desc="Computing Type Clusters - Valid set")):
        model.eval()
        with torch.no_grad():
            output_a = model(*(s.to(DEVICE) for s in a[0]))
            labels = a[1].data.cpu().numpy()
            for i, v in enumerate(output_a.data.cpu().numpy()):
                if labels[i] in type_vocab:
                    annoy_idx.add_item(curr_idx, v)
                    computed_embed_labels.append(labels[i])
                    curr_idx += 1

    annoy_idx.build(KNN_TREE_SIZE)
    return annoy_idx, np.array(computed_embed_labels)
Example #58
def ann_annoy(data, metric='euclidean',
              n_neighbors=10,
              trees=10):
    """My Approximate Nearest Neighbors function (ANN)
    using the annoy package.

    Parameters
    ----------


    Returns
    -------


    """
    datapoints = data.shape[0]
    dimension = data.shape[1]

    # initialize the annoy database
    ann = AnnoyIndex(dimension)

    # store the datapoints
    for (i, row) in enumerate(data):
        ann.add_item(i, row.tolist())

    # build the index
    ann.build(trees)

    # find the k-nearest neighbors for all points
    idx = np.zeros((datapoints, n_neighbors), dtype='int')
    distVals = idx.copy().astype(float)

    # extract the distance values
    for i in range(0, datapoints):
        idx[i,:] = ann.get_nns_by_item(i, n_neighbors)

        for j in range(0, n_neighbors):
            distVals[i,j] = ann.get_distance(i, idx[i,j])

    return distVals, idx
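A usage sketch for ann_annoy (random data for illustration):

import numpy as np

data = np.random.rand(500, 12)
distVals, idx = ann_annoy(data, n_neighbors=5, trees=10)
# idx[i] lists the 5 approximate nearest neighbours of point i; the point
# itself typically appears first, at distance 0.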
Example #59
 def compute_and_store_similarity(self):
     start = time.time()
     sessions_VSM, sessions_id = self._driver.session_vectors()
     print("Time to create the vector:", time.time() - start)
     t = AnnoyIndex(sessions_VSM.shape[1], 'angular')
     t.on_disk_build('/tmp/test.ann')
     start = time.time()
     i = 0
     overall_size = sessions_VSM.shape[0]
     for ix in range(overall_size):
         x = sessions_VSM.getrow(ix)
         t.add_item(ix, x.toarray()[0])
         i += 1
         if i % 1000 == 0:
             print(i, "rows processed over", overall_size)
     print("Time to index:", time.time() - start)
     del sessions_VSM
     gc.collect()
     start = time.time()
     t.build(5)  # 5 trees
     print("Time to build:", time.time() - start)
     knn_start = time.time()
     i = 0
     for ix in range(overall_size):
         knn = self.compute_knn(ix, sessions_id, t, 50)
         start = time.time()
         self.store_knn(sessions_id[ix], knn)
         self.__time_to_store.append(time.time() - start)
         i += 1
         if i % 100 == 0:
             print(i, "rows processed over", overall_size)
             print(mean(self.__time_to_query),
                   mean(self.__time_to_knn),
                   mean(self.__time_to_sort),
                   mean(self.__time_to_store))
             self.__time_to_query = []
             self.__time_to_knn = []
             self.__time_to_sort = []
             self.__time_to_store = []
     print("Time to compute knn:", time.time() - knn_start)
Example #60
def generate_pair(X,
                  n_neighbors,
                  n_MN,
                  n_FP,
                  distance='euclidean',
                  verbose=True):
    n, dim = X.shape
    n_neighbors_extra = min(n_neighbors + 50, n)
    tree = AnnoyIndex(dim, metric=distance)
    if _RANDOM_STATE is not None:
        tree.set_seed(_RANDOM_STATE)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)

    nbrs = np.zeros((n, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_neighbors_extra), dtype=np.float32)

    for i in range(n):
        nbrs_ = tree.get_nns_by_item(i, n_neighbors_extra + 1)
        nbrs[i, :] = nbrs_[1:]
        for j in range(n_neighbors_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    if verbose:
        print("Found nearest neighbor")
    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    if verbose:
        print("Calculated sigma")
    scaled_dist = scale_dist(knn_distances, sig, nbrs)
    if verbose:
        print("Found scaled dist")
    pair_neighbors = sample_neighbors_pair(X, scaled_dist, nbrs, n_neighbors)
    if _RANDOM_STATE is None:
        pair_MN = sample_MN_pair(X, n_MN)
        pair_FP = sample_FP_pair(X, pair_neighbors, n_neighbors, n_FP)
    else:
        pair_MN = sample_MN_pair_deterministic(X, n_MN, _RANDOM_STATE)
        pair_FP = sample_FP_pair_deterministic(X, pair_neighbors, n_neighbors,
                                               n_FP, _RANDOM_STATE)
    return pair_neighbors, pair_MN, pair_FP