Code Example #1
import numpy as np

from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model
from keras.preprocessing import image

from annoy import AnnoyIndex

ANNOY_MODEL_PATH = "./models/ramen.ann"
ANNOY_DIMENSION = 4096
SEARCH_IMAGE_PATH = "ramen_images/ramen1001.jpg"

# Extract the fc2 intermediate layer from VGG16
base_model = VGG16(weights="imagenet")
model = Model(inputs=base_model.input,
              outputs=base_model.get_layer("fc2").output)

# Load the prebuilt Annoy index
loaded_model = AnnoyIndex(ANNOY_DIMENSION)
loaded_model.load(ANNOY_MODEL_PATH)

# Load the query image and convert it to a feature vector
img_path = SEARCH_IMAGE_PATH
img = image.load_img(img_path, target_size=(224, 224))

x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)

fc2_features = model.predict(x)

# Search the Annoy index for the 3 nearest neighbors
items = loaded_model.get_nns_by_vector(fc2_features[0], 3)
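The snippet above assumes that ./models/ramen.ann already exists. For context, a minimal sketch of how such an index could have been built from the same fc2 features; the glob pattern, the explicit "angular" metric, and the tree count are assumptions, not taken from the original:

import glob

import numpy as np
from annoy import AnnoyIndex
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model
from keras.preprocessing import image

base_model = VGG16(weights="imagenet")
model = Model(inputs=base_model.input,
              outputs=base_model.get_layer("fc2").output)

index = AnnoyIndex(4096, "angular")  # fc2 output dimension; angular is assumed
for i, path in enumerate(sorted(glob.glob("ramen_images/*.jpg"))):
    img = image.load_img(path, target_size=(224, 224))
    x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))
    index.add_item(i, model.predict(x)[0])

index.build(10)  # the number of trees here is an arbitrary choice
index.save("./models/ramen.ann")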
Code Example #2
File: index_test.py Project: ntwarijoshua/annoy
 def test_save_twice(self):
     # Issue #100
     t = AnnoyIndex(10)
     t.save("t.ann")
     t.save("t.ann")
Code Example #3
File: index_test.py Project: ntwarijoshua/annoy
 def test_seed(self):
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     i.set_seed(42)
Code Example #4
File: index_test.py Project: ntwarijoshua/annoy
 def test_not_found_tree(self):
     i = AnnoyIndex(10)
     self.assertRaises(IOError, i.load, 'nonexists.tree')
Code Example #5
File: index_test.py Project: ntwarijoshua/annoy
 def test_construct_load_destruct(self):
     for x in range(100000):
         i = AnnoyIndex(10)
         i.load('test/test.tree')
Code Example #6
from annoy import AnnoyIndex
import pandas as pd
from v01.Utils import *

# Read feature file
df = pd.read_csv('../features/featuresplayground100-1.csv')
df.head()

# Convert the features from the DataFrame into a list of lists
copydf = df.copy()
del copydf['filename']
features = copydf.values.tolist()
features = normalize_minmax_matrix(features)

# Get number of features
feature_nums = len(df.columns) - 1

# Add each feature vector to the Annoy index
f = feature_nums
t = AnnoyIndex(f, metric='euclidean')

print(len(features))
for i in range(len(features)):
    v = features[i]
    t.add_item(i, v)

# create index tree
t.build(f)
t.save('../indextree/englishwordfeatures-minmax100-1.ann')

print("DONE")
Code Example #7
 def __init__(self, embeddings, ntrees):
     self.ntrees = ntrees
     self.index = AnnoyIndex(embeddings.shape[1], metric='angular')
     for i, embedding in enumerate(embeddings):
         self.index.add_item(i, embedding)
     # Build once, after all items have been added (items cannot be added after build()).
     self.index.build(ntrees)
Code Example #8
    hnsw.saveIndex('hnswIndex40.ann')
    del hnsw
    del rez
    del dist
    del result
    del neighbours
    gc.collect()

#______________________________________________#
# Annoy
from annoy import AnnoyIndex
for trs in [90]:#[5,15,30,45,60,80]:
    
    f = train.shape[1]
    t = AnnoyIndex(f, 'euclidean')
    
    startClock= time.clock()
    startTime = process_time()
    for i in range(train.shape[0]):
        t.add_item(i,train[i])
    t.build(trs)
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.clock()
    constructionClock= endClock - startClock
    
    
    rez = []
    dist = []
    startClock = time.clock()
Code Example #9
def create_pairs_from_unlabeled_data(x1,
                                     x2=None,
                                     y=None,
                                     p=None,
                                     k=5,
                                     tot_pairs=None,
                                     precomputed_knn_path='',
                                     use_approx=False,
                                     pre_shuffled=False,
                                     verbose=None):
    '''
    Generates positive and negative pairs for the siamese network from
    unlabeled data. Draws from the k nearest neighbors (where k is the
    provided parameter) of each point to form pairs. Number of neighbors
    to draw is determined by tot_pairs, if provided, or k if not provided.

    x1: input data array
    x2: parallel data array (pairs will exactly shadow the indices of x1,
        but be drawn from x2)
    y:  true labels (if available) purely for checking how good our pairs are
    p:  permutation vector - in cases where the array is shuffled and we
        use a precomputed knn matrix (where knn is performed on unshuffled
        data), we keep track of the permutations with p, and apply the same
        permutation to the precomputed knn matrix
    k:  the number of neighbors to use (the 'k' in knn)

    tot_pairs:              total number of pairs to produce
    precomputed_knn_path:   location of stored precomputed knn results -
                            empty string means we do not load precomputed
                            neighbors
    use_approx:             flag for running with LSH instead of KNN, in other
                            words, an approximation of KNN
    verbose:                flag for extra debugging printouts

    returns:    pairs for x1, (pairs for x2 if x2 is provided), labels
                (inferred by knn), (labels_true, the absolute truth, if y
                is provided)
    '''
    if x2 is not None and x1.shape != x2.shape:
        raise ValueError("x1 and x2 must be the same shape!")

    n = len(p) if p is not None else len(x1)

    pairs_per_pt = max(1, min(k, int(
        tot_pairs / (n * 2)))) if tot_pairs is not None else max(1, k)

    if p is not None and not pre_shuffled:
        x1 = x1[p[:n]]
        y = y[p[:n]]

    pairs = []
    pairs2 = []
    labels = []
    true = []
    verbose = True
    if len(precomputed_knn_path):
        # load precomputed weights
        if verbose:
            print('loading precomputed weights...')
            print('load path:', precomputed_knn_path)
        if precomputed_knn_path.endswith('.h5'):
            with h5py.File(precomputed_knn_path, 'r') as f:
                kn_idxs_untouched = np.asarray(f.get('kn_idxs'),
                                               dtype='uint32')
        else:
            kn_idxs_untouched = pickle.load(open(precomputed_knn_path, 'rb'))
        if isinstance(kn_idxs_untouched, tuple):
            kn_idxs_untouched = kn_idxs_untouched[1]

        assert (kn_idxs_untouched >= 0).all()
        if p is None:
            Idx = kn_idxs_untouched
        else:
            # if we have shuffled the array with p, we must convert our neighbors
            # matrix to correspond to the shuffled indices
            import pyximport
            pyximport.install()
            from core.convert_idxs import convert_idxs
            Idx = convert_idxs(kn_idxs_untouched.astype(np.int32, copy=False),
                               p.astype(np.int32, copy=False), k, n)
            print('converted all indices')

    else:
        if verbose:
            print('computing k={} nearest neighbors...'.format(k))
        if len(x1.shape) > 2:
            x1_flat = x1.reshape(x1.shape[0], np.prod(x1.shape[1:]))[:n]
        else:
            x1_flat = x1[:n]

            print('next')
        if use_approx:
            ann = AnnoyIndex(x1_flat.shape[1], metric='euclidean')
            for i, x_ in enumerate(x1_flat):
                ann.add_item(i, x_)
            ann.build(50)
            Idx = np.empty((len(x1_flat), k + 1))
            for i in range(len(x1_flat)):
                nn_i = ann.get_nns_by_item(i, k + 1, include_distances=False)
                Idx[i, :] = np.array(nn_i)
        else:
            print('start paralleling')
            nbrs = NearestNeighbors(n_neighbors=k + 1, n_jobs=-1).fit(x1_flat)
            _, Idx = nbrs.kneighbors(x1_flat)
            print('computation completed!')

    # for each row, remove the element itself from its list of neighbors
    # (we don't care that each point is its own closest neighbor)
    new_Idx = np.empty((Idx.shape[0], Idx.shape[1] - 1))
    assert (Idx >= 0).all()
    for i in range(Idx.shape[0]):
        try:
            new_Idx[i] = Idx[i, Idx[i] != i][:Idx.shape[1] - 1]
        except Exception as e:
            print(Idx[i, ...], new_Idx.shape, Idx.shape)
            raise e
    Idx = new_Idx.astype(int)
    k_max = min(Idx.shape[1], k + 1)

    if verbose:
        print('creating pairs...')
        print("ks", n, k_max, k, pairs_per_pt)

    # pair generation loop (alternates between true and false pairs)
    consecutive_fails = 0
    for i in range(n):
        # get_choices sometimes fails with precomputed results. if this happens
        # too often, we relax the constraint on k
        if consecutive_fails > 5:
            k_max = min(Idx.shape[1], int(k_max * 2))
            consecutive_fails = 0
        if verbose and i % 10000 == 0:
            print("Iter: {}/{}".format(i, n))
        # pick points from neighbors of i for positive pairs
        try:
            choices = get_choices(Idx[i, :k_max], pairs_per_pt, replace=False)
            consecutive_fails = 0
        except ValueError:
            consecutive_fails += 1
            continue
        assert i not in choices
        # form the pairs
        new_pos = [[x1[i], x1[c]] for c in choices]
        if x2 is not None:
            new_pos2 = [[x2[i], x2[c]] for c in choices]
        if y is not None:
            pos_labels = [[y[i] == y[c]] for c in choices]
        # pick points *not* in neighbors of i for negative pairs
        try:
            choices = get_choices((0, n),
                                  pairs_per_pt,
                                  not_arr=Idx[i, :k_max],
                                  replace=False)
            consecutive_fails = 0
        except ValueError:
            consecutive_fails += 1
            continue
        # form negative pairs
        new_neg = [[x1[i], x1[c]] for c in choices]
        if x2 is not None:
            new_neg2 = [[x2[i], x2[c]] for c in choices]
        if y is not None:
            neg_labels = [[y[i] == y[c]] for c in choices]

        # add pairs to our list
        labels += [1] * len(new_pos) + [0] * len(new_neg)
        pairs += new_pos + new_neg
        if x2 is not None:
            pairs2 += new_pos2 + new_neg2
        if y is not None:
            true += pos_labels + neg_labels

    # package return parameters for output
    ret = [np.array(pairs).reshape((len(pairs), 2) + x1.shape[1:])]
    if x2 is not None:
        ret.append(np.array(pairs2).reshape((len(pairs2), 2) + x2.shape[1:]))
    ret.append(np.array(labels))
    if y is not None:
        true = np.array(true).astype(int).reshape(-1, 1)
        if verbose:
            # if true vectors are provided, we can take a peek to check
            # the validity of our kNN approximation
            print("confusion matrix for pairs and approximated labels:")
            print(metrics.confusion_matrix(true, labels) / true.shape[0])
            print(metrics.confusion_matrix(true, labels))
        ret.append(true)

    return ret
Code Example #10
def predict(params):
  result = {}

  print('Predict', params)

  annoy_vector_dimension = VECTOR_SIZE
  index_filename = default_index_file

  data_file = default_csv_file_path
  use_model = default_use_model
  k = default_k
  stop_words = False

  input_sentence_id = None

  try:
    if params:
      if params.get('guid'):
        input_sentence_id = params.get('guid')
      if params.get('vector_size'):
        annoy_vector_dimension = params.get('vector_size')
      if params.get('index_filename'):
        index_filename = params.get('index_filename')
      if params.get('data_file'):
        data_file = params.get('data_file')
      if params.get('use_model'):
        use_model = params.get('use_model')
      if params.get('k'):
        k = params.get('k')
      if params.get('stop_words'):
        stop_words = params.get('stop_words')

    if not input_sentence_id:
      print_with_time('Input Sentence Id: {}'.format(input_sentence_id))
      result = {
        'error': 'Invalid Input id'
      }
      return result

    start_time = time.time()
    annoy_index = AnnoyIndex(annoy_vector_dimension, metric='angular')
    annoy_index.load(model_indexes_path + index_filename)
    end_time = time.time()
    print_with_time('Annoy Index load time: {}'.format(end_time-start_time))

    start_time = time.time()
    data_frame = read_data(data_file)
    content_array = data_frame.to_numpy()
    end_time = time.time()
    print_with_time('Time to read data file: {}'.format(end_time-start_time))

    start_time = time.time()
    embed_func = hub.Module(use_model)
    end_time = time.time()
    print_with_time('Load the module: {}'.format(end_time-start_time))

    start_time = time.time()
    sentences = tf.compat.v1.placeholder(dtype=tf.string, shape=[None])
    embedding = embed_func(sentences)
    end_time = time.time()
    print_with_time('Init sentences embedding: {}'.format(end_time-start_time))

    start_time = time.time()
    sess = tf.compat.v1.Session()
    sess.run([tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer()])
    end_time = time.time()
    print_with_time('Time to create session: {}'.format(end_time-start_time))

    print_with_time('Input Sentence id: {}'.format(input_sentence_id))
    params_filter = 'GUID == "' + input_sentence_id + '"'
    input_data_object = data_frame.query(params_filter)
    input_sentence = input_data_object['CONTENT']

    if stop_words:
      input_sentence = remove_stopwords(input_sentence)

    start_time = time.time()
    sentence_vector = sess.run(embedding, feed_dict={sentences:input_sentence})
    nns = annoy_index.get_nns_by_vector(sentence_vector[0], k)
    end_time = time.time()
    print_with_time('nns done: Time: {}'.format(end_time-start_time))

    similar_sentences = []
    similarities = [content_array[nn] for nn in nns]
    for sentence in similarities[1:]:
      similar_sentences.append({
        'guid': sentence[0],
        'content': sentence[1]
      })
      print(sentence[0])

    result = SimilarityResult(input_sentence_id, input_sentence.values[0], similar_sentences)
  
  except Exception as e:
    print('Exception in predict: {0}'.format(e))
    result = {
        'error': 'Failure'
    }

  return result    
Code Example #11
                        help='How many dimension in the vector space')
    parser.add_argument('--max_trees',
                        type=int,
                        default=50,
                        help='How many trees do you want to use for `AnnoyIndex`')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    with open(args.input_vectors) as input_file:
        content = (json.load(input_file))

    title_id = 0
    titles = []
    title_index = AnnoyIndex(args.dimensions)

    for line_items in content:
        title = line_items[0]
        titles.append(title)
        vectors = line_items[1]
        title_index.add_item(title_id, vectors)
        title_id += 1

    title_index.build(args.max_trees)

    # If you want to save index:
    # title_index.save('crawl_50_clean.ann')

    # If you want to load index:
    # u1 = AnnoyIndex(args.dimensions)
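A hypothetical continuation of the commented-out load path above, showing how query results could be mapped back to the titles list (only the two commented lines exist in the original; the rest is an assumption):

    # u1 = AnnoyIndex(args.dimensions)
    # u1.load('crawl_50_clean.ann')
    # nearest_ids = u1.get_nns_by_item(0, 10)  # the 10 titles most similar to titles[0]
    # print([titles[i] for i in nearest_ids])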
Code Example #12
File: utils.py Project: anminhhung/hackathon
        category_list = [
            learner.data.train_ds.y.reconstruct(y)
            for y in similar_images_df['label_id']
        ]
    # return learner.data.show_xys([open_image(img_id) for img_id in similar_images_df['img_path']],
    #                             category_list, figsize=fig_size)
    return similar_images_df['img_path'].tolist()


print("building tree ....")
# more trees = better approximation
ntree = 100
#"angular", "euclidean", "manhattan", "hamming", or "dot"
metric_choice = 'angular'

annoy_tree = AnnoyIndex(len(data_df_ouput['embedding'][0]),
                        metric=metric_choice)

# # takes a while to build the tree
for i, vector in enumerate(data_df_ouput['embedding']):
    annoy_tree.add_item(i, vector)
_ = annoy_tree.build(ntree)

print("finished build tree")


def centroid_embedding(outfit_embedding_list):
    number_of_outfits = outfit_embedding_list.shape[0]
    length_of_embedding = outfit_embedding_list.shape[1]
    centroid = []
    for i in range(length_of_embedding):
        centroid.append(
Code Example #13
# https://github.com/jimkang/annoy-node
# https://github.com/spotify/annoy

"""
pip install --user annoy
or
pip3 install --user annoy
"""

import random
from annoy import AnnoyIndex

DIMENSIONS = 10  # TODO: 512 for embeddings using TensorFlow JS U.S.E.
STATIC_STORAGE_PATH = "embeddings-data"
METRIC = "angular"
storage = AnnoyIndex(DIMENSIONS, METRIC)

v1 = [-5.0, -4.5, -3.2, -2.8, -2.1, -1.5, -0.34, 0, 3.7, 6]


def load_existing_storage():
    storage.load(STATIC_STORAGE_PATH)
    # storage.unbuild()  # this won't work
    # can't unbuild to add to storage if it's already saved
    # https://github.com/spotify/annoy/issues/174#issuecomment-319554923
    # https://github.com/spotify/annoy/issues/403#issuecomment-520616560

    # have to re-index all to "add" new:
    # https://github.com/spotify/annoy/issues/447#issuecomment-574836547
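A minimal sketch of the "re-index everything" workaround the comments above point to: since a built or loaded Annoy index is immutable, "adding" an item means rebuilding the index from the original vectors (the all_vectors argument and the tree count are assumptions):

def rebuild_with_new_item(all_vectors, new_vector):
    # Annoy indexes cannot be modified after build()/save(), so rebuild from scratch.
    new_storage = AnnoyIndex(DIMENSIONS, METRIC)
    for i, v in enumerate(all_vectors):
        new_storage.add_item(i, v)
    new_storage.add_item(len(all_vectors), new_vector)
    new_storage.build(10)  # arbitrary tree count
    new_storage.save(STATIC_STORAGE_PATH)
    return new_storage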

Code Example #14
from __future__ import print_function
import random, time
from annoy import AnnoyIndex

try:
    xrange
except NameError:
    # Python 3 compat
    xrange = range

n, f = 100000, 40

t = AnnoyIndex(f, 'angular')
for i in xrange(n):
    v = []
    for z in xrange(f):
        v.append(random.gauss(0, 1))
    t.add_item(i, v)

t.build(2 * f)
t.save('test.tree')

limits = [10, 100, 1000, 10000]
k = 10
prec_sum = {}
prec_n = 1000
time_sum = {}

for i in xrange(prec_n):
    j = random.randrange(0, n)
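The example is cut off at this point. A sketch of how a precision loop over the variables prepared above (limits, k, prec_sum, prec_n, time_sum) typically continues; it restates the sampling loop the snippet begins and is an assumption, not the original continuation:

for _ in xrange(prec_n):
    j = random.randrange(0, n)
    closest = set(t.get_nns_by_item(j, k, n))  # a very large search_k treated as ground truth
    for limit in limits:
        t0 = time.time()
        toplist = t.get_nns_by_item(j, k, limit)  # approximate query with search_k=limit
        time_sum[limit] = time_sum.get(limit, 0.0) + (time.time() - t0)
        prec_sum[limit] = prec_sum.get(limit, 0.0) + len(closest.intersection(toplist)) / float(k)

for limit in limits:
    print('search_k=%-7d precision=%6.2f%%  avg query time=%.6fs'
          % (limit, 100.0 * prec_sum[limit] / prec_n, time_sum[limit] / prec_n))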
Code Example #15
File: faceSearch.py Project: kyrs/search-Engine
    for (dirpath, dirnames, filenames) in walk(args.index):
        nfilenames = []
        for f in filenames:
            nfilenames.append(dirpath + '/' + f)
        onlyfiles.extend(nfilenames)
    for x in batch(onlyfiles, args.index_batch_size):
        sys.stdout.write('\r' + str(c) + '/' + str(len(onlyfiles)))
        sys.stdout.flush()
        classif = dd.post_predict(sname, x, parameters_input, parameters_mllib,
                                  parameters_output)
        print(classif)
        for p in classif['body']['predictions']:
            if c == 0:
                layer_size = len(p['vals'])
                s['layer_size'] = layer_size
                t = AnnoyIndex(layer_size, metric)  # prepare index
            t.add_item(c, p['vals'])
            s[str(c)] = p['uri']
            c = c + 1
        #if c >= 10000:
        #    break
    print('building index...\n')
    print('layer_size=', layer_size)
    t.build(ntrees)
    t.save('index.ann')
    s.close()

if args.search:
    s = shelve.open('names.bin')
    u = AnnoyIndex(s['layer_size'], metric)
    u.load('index.ann')
Code Example #16
File: main.py Project: hanabi1224/RuAnnoy
metrics = ["angular", "euclidean", "manhattan", "dot", "hamming"]
dim = 5
size = 100

for metric in metrics:
    fname = f'index.{metric}.{dim}d.ann'
    print(f'Generating index for {metric}')
    # t = AnnoyIndex(dim, metric)  # Length of item vector that will be indexed
    # for i in range(size):
    #     v = [random.gauss(0, 1) for z in range(dim)]
    #     t.add_item(i, v)

    # t.build(10)  # 10 trees
    # t.save(fname)

    # ...

    u = AnnoyIndex(dim, metric)
    u.verbose(True)
    u.load('./../tests/' + fname)  # super fast, will just mmap the file
    print(u.get_item_vector(3))
    v0 = u.get_item_vector(0)
    print(v0)
    nearests = u.get_nns_by_vector(v0, 5, include_distances=True)
    id_1 = nearests[0][1]
    print(u.get_item_vector(id_1))
    print(u.get_distance(0, id_1))
    # print(u.get_distance(0, 16))
    print(nearests[0])  # ids of the 5 nearest neighbors
    print(nearests[1])  # their corresponding distances
Code Example #17
#!/home/stefan2/anaconda/bin/python

from annoy import AnnoyIndex
#from annoy import AnnoyIndexEuclidean

num_features = 100
num_points = 42000
num_nn = 100  #number of nearest neighbors

f = '../data/index.annoy'

index = AnnoyIndex(num_features)
index.load(f)

results = open('../data/annoy-results.train.txt', 'w')
print "search for nn"

for i in range(0, num_points):
    items = index.get_nns_by_item(i, num_nn)
    #items = t.get_nns_by_vector(vectors[i], 100) #if you have the vectors
    if (items[0] != i):
        print('the top nn is expected to be the item itself')
        raise SystemExit
    as_str = " ".join(map(str, items))
    results.write(as_str + "\n")
results.close()
Code Example #18
File: server.py Project: ChaitanyaCixLive/mndm-IR
        self.out = tf.squeeze(
            self.sess.graph.get_tensor_by_name('vgg_16/avgp5/AvgPool:0'))

    def feat_(self, image_path):
        img_data = np.expand_dims(np.array(open(image_path, 'r').read()), 0)
        return self.sess.run(self.out, {self.img: img_data})

    def feat(self, feat_string):
        img_data = np.expand_dims(np.array(feat_string), 0)
        return self.sess.run(self.out, {self.img: img_data})


names = np.load('data/name.npy')
if not os.path.exists('model/inshop.ann'):
    feats = np.load('data/feats.npy')
    t = AnnoyIndex(512)
    for i, a in enumerate(feats):
        t.add_item(i, a)
    t.build(200)
    t.save('model/inshop.ann')
else:
    t = AnnoyIndex(512)
    t.load('model/inshop.ann')

worker = Feature2()


class RequestHandler(pyjsonrpc.HttpRequestHandler):
    @pyjsonrpc.rpcmethod
    def Feat(self, str):
        str = base64.b64decode(str)
Code Example #19
from annoy import AnnoyIndex

a = AnnoyIndex(3)
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
a.build(-1)
a.save('test.tree')

b = AnnoyIndex(3)
b.load('test.tree')

print(b.get_nns_by_item(0, 100))
print(b.get_nns_by_vector([1.0, 0.5, 0.5], 100))
Code Example #20
File: index_test.py Project: keithmcneill/annoy
 def test_metric_f_kwargs(self):
     i = AnnoyIndex(f=3, metric='euclidean')
Code Example #21
File: bot.py Project: VladimirNorakidze/hacaton
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
from annoy import AnnoyIndex

data = pd.read_pickle('./data/file.pkl')
w2v = KeyedVectors.load("./data/word2vec.model")
a = AnnoyIndex(300)
a.load('./data/annoy_15')


def ret_vect(x, pos):
    k = []
    for s in x:
        try:
            k.append(w2v.wv[s])
        except KeyError:
            continue
    if len(k) == 0:
        return 0
    else:
        return np.mean(k, axis=0) if pos else -np.mean(k, axis=0)


# def ret_news(x_pos, x_neg):
#     res_vec = ret_vect(x_pos, True) + ret_vect(x_neg, False)
#     if isinstance(res_vec, int):
#         return "Нет новостей"
#     best_index = a.get_nns_by_vector(res_vec, 1)[0]
#     tags = []
#     for i in data[best_index]['tokens_wo_upper']:
Code Example #22
File: index_test.py Project: keithmcneill/annoy
 def test_prefault(self):
     i = AnnoyIndex(10)
     i.load('test/test.tree', prefault=True)
     self.assertEqual(i.get_nns_by_item(0, 10),
                      [0, 85, 42, 11, 54, 38, 53, 66, 19, 31])
Code Example #23
File: index_test.py Project: ntwarijoshua/annoy
 def test_load_unload(self):
     # Issue #108
     i = AnnoyIndex(10)
     for x in range(100000):
         i.load('test/test.tree')
         i.unload()
Code Example #24
File: index_test.py Project: keithmcneill/annoy
 def test_fail_save(self):
     t = AnnoyIndex(40)
     with self.assertRaises(IOError):
         t.save('')
Code Example #25
File: index_test.py Project: ntwarijoshua/annoy
 def test_construct_destruct(self):
     for x in range(100000):
         i = AnnoyIndex(10)
         i.add_item(1000, [random.gauss(0, 1) for z in range(10)])
Code Example #26
File: index_test.py Project: keithmcneill/annoy
 def test_get_n_trees(self):
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     self.assertEqual(i.get_n_trees(), 10)
Code Example #27
File: index_test.py Project: ntwarijoshua/annoy
 def test_unbuild_with_loaded_tree(self):
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     i.unbuild()
Code Example #28
                                         hyper_overrides={})

    predictions = []
    for language in ('python', 'go', 'javascript', 'java', 'php', 'ruby'):
        print("Evaluating language: %s" % language)
        definitions = pickle.load(
            open(
                '../resources/data/{}_dedupe_definitions_v2.pkl'.format(
                    language), 'rb'))
        indexes = [{
            'code_tokens': d['function_tokens'],
            'language': d['language']
        } for d in tqdm(definitions)]
        code_representations = model.get_code_representations(indexes)
        print(code_representations[0].shape)
        indices = AnnoyIndex(code_representations[0].shape[0], 'angular')
        for index, vector in tqdm(enumerate(code_representations)):
            assert vector is not None
            indices.add_item(index, vector)
        indices.build(1000)
        # indices.build(10)
        # indices.build(200)

        for query in tqdm(queries):
            for idx, _ in zip(*query_model(query, model, indices, language)):
                predictions.append(
                    (query, language, definitions[idx]['identifier'],
                     definitions[idx]['url']))

    df = pd.DataFrame(predictions,
                      columns=['query', 'language', 'identifier', 'url'])
Code Example #29
 def __init__(self):
     self.connection = sqlite3.connect('junction17.db')
     self.annoy_index = AnnoyIndex(128, metric='angular')
     self._populate_index()
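The referenced _populate_index is not shown; a hypothetical sketch of what such a method could look like, assuming numpy is imported and the sqlite table stores 128-dimensional float32 embeddings as BLOBs (table and column names are made up):

 def _populate_index(self):
     # Hypothetical: read stored embeddings and index them under their row ids.
     cursor = self.connection.execute('SELECT id, embedding FROM faces')
     for row_id, blob in cursor:
         self.annoy_index.add_item(row_id, np.frombuffer(blob, dtype=np.float32))
     self.annoy_index.build(10)  # arbitrary tree count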
Code Example #30
File: _annoy_test.py Project: mindis/annoy
 def test_not_found_tree(self):
     i = AnnoyIndex(10)
     with self.assertRaises(IOError):
         i.load("nonexists.tree")