import numpy as np
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model
from keras.preprocessing import image
from annoy import AnnoyIndex

ANNOY_MODEL_PATH = "./models/ramen.ann"
ANNOY_DIMENSION = 4096
SEARCH_IMAGE_PATH = "ramen_images/ramen1001.jpg"

# Extract an intermediate layer from VGG16
base_model = VGG16(weights="imagenet")
model = Model(inputs=base_model.input, outputs=base_model.get_layer("fc2").output)

# Load the Annoy index
loaded_model = AnnoyIndex(ANNOY_DIMENSION)
loaded_model.load(ANNOY_MODEL_PATH)

# Load the query image and convert it to a vector
img_path = SEARCH_IMAGE_PATH
img = image.load_img(img_path, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
fc2_features = model.predict(x)

# Search with Annoy
items = loaded_model.get_nns_by_vector(fc2_features[0], 3)
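# Hedged sketch (not from the original source): the snippet above assumes
# "./models/ramen.ann" was built beforehand. A minimal build step might look
# like this; the glob pattern and tree count are illustrative assumptions.
import glob

build_index = AnnoyIndex(ANNOY_DIMENSION)
for item_id, path in enumerate(sorted(glob.glob("ramen_images/*.jpg"))):
    img = image.load_img(path, target_size=(224, 224))
    vec = model.predict(preprocess_input(np.expand_dims(image.img_to_array(img), axis=0)))[0]
    build_index.add_item(item_id, vec)
build_index.build(10)  # more trees -> better recall, larger index file
build_index.save(ANNOY_MODEL_PATH)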
def test_save_twice(self):
    # Issue #100
    t = AnnoyIndex(10)
    t.save("t.ann")
    t.save("t.ann")
def test_seed(self):
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    i.set_seed(42)
def test_not_found_tree(self):
    i = AnnoyIndex(10)
    self.assertRaises(IOError, i.load, 'nonexists.tree')
def test_construct_load_destruct(self):
    for x in range(100000):
        i = AnnoyIndex(10)
        i.load('test/test.tree')
from annoy import AnnoyIndex
import pandas as pd
from v01.Utils import *

# Read the feature file
df = pd.read_csv('../features/featuresplayground100-1.csv')
df.head()

# Convert the features from df to a list of lists
copydf = df.copy()
del copydf['filename']
features = copydf.values.tolist()
features = normalize_minmax_matrix(features)

# Get the number of features
feature_nums = len(df.columns) - 1

# Add feature data to the Annoy index
f = feature_nums
t = AnnoyIndex(f, metric='euclidean')
print(len(features))
for i in range(len(features)):
    v = features[i]
    t.add_item(i, v)

# Create the index tree
t.build(f)
t.save('../indextree/englishwordfeatures-minmax100-1.ann')
print("DONE")
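# Hedged sketch (not from the original source): reloading the saved index and
# querying it. The dimensionality and metric must match the build above; item
# id 0 is just an illustrative query.
u = AnnoyIndex(feature_nums, metric='euclidean')
u.load('../indextree/englishwordfeatures-minmax100-1.ann')
ids, dists = u.get_nns_by_item(0, 10, include_distances=True)
for i, d in zip(ids, dists):
    print(df['filename'][i], d)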
def __init__(self, embeddings, ntrees):
    self.ntrees = ntrees
    self.index = AnnoyIndex(embeddings.shape[1], metric='angular')
    for i, embedding in enumerate(embeddings):
        self.index.add_item(i, embedding)
    self.index.build(ntrees)
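# Hedged sketch (not from the original source): a companion lookup method for
# the wrapper class above. The method name `nearest` is an illustrative
# assumption, not the author's API.
def nearest(self, embedding, n=10):
    # returns (ids, distances); search_k=-1 uses Annoy's default search budget
    return self.index.get_nns_by_vector(embedding, n, search_k=-1, include_distances=True)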
hnsw.saveIndex('hnswIndex40.ann')
del hnsw
del rez
del dist
del result
del neighbours
gc.collect()

#______________________________________________#
# Annoy
from annoy import AnnoyIndex

for trs in [90]:  # [5,15,30,45,60,80]:
    f = train.shape[1]
    t = AnnoyIndex(f, 'euclidean')
    startClock = time.perf_counter()  # time.clock() was removed in Python 3.8
    startTime = process_time()
    for i in range(train.shape[0]):
        t.add_item(i, train[i])
    t.build(trs)
    end_time = process_time()
    constructionTime = end_time - startTime
    endClock = time.perf_counter()
    constructionClock = endClock - startClock
    rez = []
    dist = []
    startClock = time.perf_counter()
# Module-level imports assumed by this excerpt; get_choices is a
# project-local helper (not reproduced here).
import pickle

import h5py
import numpy as np
from annoy import AnnoyIndex
from sklearn import metrics
from sklearn.neighbors import NearestNeighbors


def create_pairs_from_unlabeled_data(x1, x2=None, y=None, p=None, k=5, tot_pairs=None,
                                     precomputed_knn_path='', use_approx=False,
                                     pre_shuffled=False, verbose=None):
    '''
    Generates positive and negative pairs for the siamese network from unlabeled data.
    Draws from the k nearest neighbors (where k is the provided parameter) of each point
    to form pairs. Number of neighbors to draw is determined by tot_pairs, if provided,
    or k if not provided.

    x1: input data array
    x2: parallel data array (pairs will exactly shadow the indices of x1, but be drawn from x2)
    y: true labels (if available) purely for checking how good our pairs are
    p: permutation vector - in cases where the array is shuffled and we use a precomputed
       knn matrix (where knn is performed on unshuffled data), we keep track of the
       permutations with p, and apply the same permutation to the precomputed knn matrix
    k: the number of neighbors to use (the 'k' in knn)
    tot_pairs: total number of pairs to produce
    precomputed_knn_path: location of stored precomputed knn results - empty string
       means we do not load precomputed neighbors
    use_approx: flag for running with LSH instead of KNN, in other words, an approximation of KNN
    verbose: flag for extra debugging printouts

    returns: pairs for x1, (pairs for x2 if x2 is provided), labels (inferred by knn),
       (labels_true, the absolute truth, if y is provided)
    '''
    if x2 is not None and x1.shape != x2.shape:
        raise ValueError("x1 and x2 must be the same shape!")

    n = len(p) if p is not None else len(x1)
    pairs_per_pt = max(1, min(k, int(tot_pairs / (n * 2)))) if tot_pairs is not None else max(1, k)

    if p is not None and not pre_shuffled:
        x1 = x1[p[:n]]
        y = y[p[:n]]

    pairs = []
    pairs2 = []
    labels = []
    true = []
    verbose = True

    if len(precomputed_knn_path):
        # load precomputed weights
        if verbose:
            print('loading precomputed weights...')
            print('load path:', precomputed_knn_path)
        if precomputed_knn_path.endswith('.h5'):
            with h5py.File(precomputed_knn_path, 'r') as f:
                kn_idxs_untouched = np.asarray(f.get('kn_idxs'), dtype='uint32')
        else:
            kn_idxs_untouched = pickle.load(open(precomputed_knn_path, 'rb'))
        if isinstance(kn_idxs_untouched, tuple):
            kn_idxs_untouched = kn_idxs_untouched[1]
        assert (kn_idxs_untouched >= 0).all()

        if p is None:
            Idx = kn_idxs_untouched
        else:
            # if we have shuffled the array with p, we must convert our neighbors
            # matrix to correspond to the shuffled indices
            import pyximport
            pyximport.install()
            from core.convert_idxs import convert_idxs
            Idx = convert_idxs(kn_idxs_untouched.astype(np.int32, copy=False),
                               p.astype(np.int32, copy=False), k, n)
            print('converted all indices')
    else:
        if verbose:
            print('computing k={} nearest neighbors...'.format(k))
        if len(x1.shape) > 2:
            x1_flat = x1.reshape(x1.shape[0], np.prod(x1.shape[1:]))[:n]
        else:
            x1_flat = x1[:n]
        print('next')
        if use_approx:
            ann = AnnoyIndex(x1_flat.shape[1], metric='euclidean')
            for i, x_ in enumerate(x1_flat):
                ann.add_item(i, x_)
            ann.build(50)
            Idx = np.empty((len(x1_flat), k + 1))
            for i in range(len(x1_flat)):
                nn_i = ann.get_nns_by_item(i, k + 1, include_distances=False)
                Idx[i, :] = np.array(nn_i)
        else:
            print('start paralleling')
            nbrs = NearestNeighbors(n_neighbors=k + 1, n_jobs=-1).fit(x1_flat)
            _, Idx = nbrs.kneighbors(x1_flat)
            print('computation completed!')

    # for each row, remove the element itself from its list of neighbors
    # (we don't care that each point is its own closest neighbor)
    new_Idx = np.empty((Idx.shape[0], Idx.shape[1] - 1))
    assert (Idx >= 0).all()
    for i in range(Idx.shape[0]):
        try:
            new_Idx[i] = Idx[i, Idx[i] != i][:Idx.shape[1] - 1]
        except Exception as e:
            print(Idx[i, ...], new_Idx.shape, Idx.shape)
            raise e

    Idx = new_Idx.astype(int)
    k_max = min(Idx.shape[1], k + 1)

    if verbose:
        print('creating pairs...')
        print("ks", n, k_max, k, pairs_per_pt)

    # pair generation loop (alternates between true and false pairs)
    consecutive_fails = 0
    for i in range(n):
        # get_choices sometimes fails with precomputed results. if this happens
        # too often, we relax the constraint on k
        if consecutive_fails > 5:
            k_max = min(Idx.shape[1], int(k_max * 2))
            consecutive_fails = 0
        if verbose and i % 10000 == 0:
            print("Iter: {}/{}".format(i, n))
        # pick points from neighbors of i for positive pairs
        try:
            choices = get_choices(Idx[i, :k_max], pairs_per_pt, replace=False)
            consecutive_fails = 0
        except ValueError:
            consecutive_fails += 1
            continue
        assert i not in choices
        # form the pairs
        new_pos = [[x1[i], x1[c]] for c in choices]
        if x2 is not None:
            new_pos2 = [[x2[i], x2[c]] for c in choices]
        if y is not None:
            pos_labels = [[y[i] == y[c]] for c in choices]
        # pick points *not* in neighbors of i for negative pairs
        try:
            choices = get_choices((0, n), pairs_per_pt, not_arr=Idx[i, :k_max], replace=False)
            consecutive_fails = 0
        except ValueError:
            consecutive_fails += 1
            continue
        # form negative pairs
        new_neg = [[x1[i], x1[c]] for c in choices]
        if x2 is not None:
            new_neg2 = [[x2[i], x2[c]] for c in choices]
        if y is not None:
            neg_labels = [[y[i] == y[c]] for c in choices]
        # add pairs to our list
        labels += [1] * len(new_pos) + [0] * len(new_neg)
        pairs += new_pos + new_neg
        if x2 is not None:
            pairs2 += new_pos2 + new_neg2
        if y is not None:
            true += pos_labels + neg_labels

    # package return parameters for output
    ret = [np.array(pairs).reshape((len(pairs), 2) + x1.shape[1:])]
    if x2 is not None:
        ret.append(np.array(pairs2).reshape((len(pairs2), 2) + x2.shape[1:]))
    ret.append(np.array(labels))
    if y is not None:
        true = np.array(true).astype(int).reshape(-1, 1)
        if verbose:
            # if true vectors are provided, we can take a peek to check
            # the validity of our kNN approximation
            print("confusion matrix for pairs and approximated labels:")
            print(metrics.confusion_matrix(true, labels) / true.shape[0])
            print(metrics.confusion_matrix(true, labels))
        ret.append(true)
    return ret
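# Hedged sketch (not from the original source): a minimal call of the function
# above on random data, taking the approximate (Annoy) branch. Note that it
# still relies on the project-local get_choices helper being importable.
x = np.random.randn(1000, 32).astype(np.float32)
pairs, labels = create_pairs_from_unlabeled_data(x, k=5, use_approx=True)
print(pairs.shape)    # (n_pairs, 2, 32): stacked positive/negative pairs
print(labels.mean())  # roughly 0.5, since positives and negatives are balanced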
def predict(params):
    result = {}
    print('Predict', params)

    annoy_vector_dimension = VECTOR_SIZE
    index_filename = default_index_file
    data_file = default_csv_file_path
    use_model = default_use_model
    k = default_k
    stop_words = False
    input_sentence_id = None

    try:
        if params:
            if params.get('guid'):
                input_sentence_id = params.get('guid')
            if params.get('vector_size'):
                annoy_vector_dimension = params.get('vector_size')
            if params.get('index_filename'):
                index_filename = params.get('index_filename')
            if params.get('data_file'):
                data_file = params.get('data_file')
            if params.get('use_model'):
                use_model = params.get('use_model')
            if params.get('k'):
                k = params.get('k')
            if params.get('stop_words'):
                stop_words = params.get('stop_words')

        # guard against a missing id as well as an empty one
        # (len() on the None default would raise a TypeError)
        if not input_sentence_id:
            print_with_time('Input Sentence Id: {}'.format(input_sentence_id))
            result = {'error': 'Invalid Input id'}
            return result

        start_time = time.time()
        annoy_index = AnnoyIndex(annoy_vector_dimension, metric='angular')
        annoy_index.load(model_indexes_path + index_filename)
        end_time = time.time()
        print_with_time('Annoy Index load time: {}'.format(end_time - start_time))

        start_time = time.time()
        data_frame = read_data(data_file)
        content_array = data_frame.to_numpy()
        end_time = time.time()
        print_with_time('Time to read data file: {}'.format(end_time - start_time))

        start_time = time.time()
        embed_func = hub.Module(use_model)
        end_time = time.time()
        print_with_time('Load the module: {}'.format(end_time - start_time))

        start_time = time.time()
        sentences = tf.compat.v1.placeholder(dtype=tf.string, shape=[None])
        embedding = embed_func(sentences)
        end_time = time.time()
        print_with_time('Init sentences embedding: {}'.format(end_time - start_time))

        start_time = time.time()
        sess = tf.compat.v1.Session()
        sess.run([tf.compat.v1.global_variables_initializer(), tf.compat.v1.tables_initializer()])
        end_time = time.time()
        print_with_time('Time to create session: {}'.format(end_time - start_time))

        print_with_time('Input Sentence id: {}'.format(input_sentence_id))
        params_filter = 'GUID == "' + input_sentence_id + '"'
        input_data_object = data_frame.query(params_filter)
        input_sentence = input_data_object['CONTENT']
        if stop_words:
            input_sentence = remove_stopwords(input_sentence)

        start_time = time.time()
        sentence_vector = sess.run(embedding, feed_dict={sentences: input_sentence})
        nns = annoy_index.get_nns_by_vector(sentence_vector[0], k)
        end_time = time.time()
        print_with_time('nns done: Time: {}'.format(end_time - start_time))

        similar_sentences = []
        similarities = [content_array[nn] for nn in nns]
        # skip the first hit: it is the query sentence itself
        for sentence in similarities[1:]:
            similar_sentences.append({'guid': sentence[0], 'content': sentence[1]})
            print(sentence[0])

        result = SimilarityResult(input_sentence_id, input_sentence.values[0], similar_sentences)
    except Exception as e:
        print('Exception in predict: {0}'.format(e))
        result = {'error': 'Failure'}

    return result
                        help='How many dimensions in the vector space')
    parser.add_argument('--max_trees', type=int, default=50,
                        help='How many trees to use for `AnnoyIndex`')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    with open(args.input_vectors) as input_file:
        content = json.load(input_file)

    title_id = 0
    titles = []
    title_index = AnnoyIndex(args.dimensions)
    for line_items in content:
        title = line_items[0]
        titles.append(title)
        vectors = line_items[1]
        # Annoy item ids must be integers, so index by title_id;
        # the title itself is recovered via the titles list
        title_index.add_item(title_id, vectors)
        title_id += 1
    title_index.build(args.max_trees)

    # If you want to save the index:
    # title_index.save('crawl_50_clean.ann')
    # If you want to load the index:
    # u1 = AnnoyIndex(args.dimensions)
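# Hedged sketch (not from the original source): completing the commented-out
# load above and mapping neighbor ids back to titles via the titles list.
u1 = AnnoyIndex(args.dimensions)
u1.load('crawl_50_clean.ann')
for neighbor_id in u1.get_nns_by_item(0, 10):
    print(titles[neighbor_id])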
    category_list = [
        learner.data.train_ds.y.reconstruct(y)
        for y in similar_images_df['label_id']
    ]
    # return learner.data.show_xys([open_image(img_id) for img_id in similar_images_df['img_path']],
    #                              category_list, figsize=fig_size)
    return similar_images_df['img_path'].tolist()


print("building tree ....")
# more trees = better approximation
ntree = 100
# "angular", "euclidean", "manhattan", "hamming", or "dot"
metric_choice = 'angular'

annoy_tree = AnnoyIndex(len(data_df_ouput['embedding'][0]), metric=metric_choice)

# takes a while to build the tree
for i, vector in enumerate(data_df_ouput['embedding']):
    annoy_tree.add_item(i, vector)
_ = annoy_tree.build(ntree)
print("finished build tree")


def centroid_embedding(outfit_embedding_list):
    number_of_outfits = outfit_embedding_list.shape[0]
    length_of_embedding = outfit_embedding_list.shape[1]
    centroid = []
    for i in range(length_of_embedding):
        centroid.append(
# https://github.com/jimkang/annoy-node
# https://github.com/spotify/annoy
"""
pip install --user annoy
or
pip3 install --user annoy
"""
import random

from annoy import AnnoyIndex

DIMENSIONS = 10  # TODO: 512 for embeddings using TensorFlow JS U.S.E.
STATIC_STORAGE_PATH = "embeddings-data"
METRIC = "angular"

storage = AnnoyIndex(DIMENSIONS, METRIC)

v1 = [-5.0, -4.5, -3.2, -2.8, -2.1, -1.5, -0.34, 0, 3.7, 6]


def load_existing_storage():
    storage.load(STATIC_STORAGE_PATH)
    # storage.unbuild()  # this won't work
    # can't unbuild to add to storage if it's already saved
    # https://github.com/spotify/annoy/issues/174#issuecomment-319554923
    # https://github.com/spotify/annoy/issues/403#issuecomment-520616560
    # have to re-index all to "add" new:
    # https://github.com/spotify/annoy/issues/447#issuecomment-574836547
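# Hedged sketch (not from the original source): per the issues linked above, a
# built or loaded index is immutable, so "adding" an item means copying every
# stored vector into a fresh index and rebuilding. The helper name and tree
# count are illustrative assumptions.
def rebuild_with_new_item(old_index, new_vector):
    n = old_index.get_n_items()
    new_index = AnnoyIndex(DIMENSIONS, METRIC)
    for item_id in range(n):
        new_index.add_item(item_id, old_index.get_item_vector(item_id))
    new_index.add_item(n, new_vector)
    new_index.build(10)
    return new_index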
from __future__ import print_function

import random
import time

from annoy import AnnoyIndex

try:
    xrange
except NameError:
    # Python 3 compat
    xrange = range

n, f = 100000, 40

t = AnnoyIndex(f, 'angular')
for i in xrange(n):
    v = []
    for z in xrange(f):
        v.append(random.gauss(0, 1))
    t.add_item(i, v)

t.build(2 * f)
t.save('test.tree')

limits = [10, 100, 1000, 10000]
k = 10
prec_sum = {}
prec_n = 1000
time_sum = {}

for i in xrange(prec_n):
    j = random.randrange(0, n)
for (dirpath, dirnames, filenames) in walk(args.index):
    nfilenames = []
    for f in filenames:
        nfilenames.append(dirpath + '/' + f)
    onlyfiles.extend(nfilenames)

for x in batch(onlyfiles, args.index_batch_size):
    sys.stdout.write('\r' + str(c) + '/' + str(len(onlyfiles)))
    sys.stdout.flush()
    classif = dd.post_predict(sname, x, parameters_input, parameters_mllib, parameters_output)
    print(classif)
    for p in classif['body']['predictions']:
        if c == 0:
            layer_size = len(p['vals'])
            s['layer_size'] = layer_size
            t = AnnoyIndex(layer_size, metric)  # prepare index
        t.add_item(c, p['vals'])
        s[str(c)] = p['uri']
        c = c + 1
    #if c >= 10000:
    #    break

print('building index...\n')
print('layer_size=', layer_size)
t.build(ntrees)
t.save('index.ann')
s.close()

if args.search:
    s = shelve.open('names.bin')
    u = AnnoyIndex(s['layer_size'], metric)
    u.load('index.ann')
metrics = ["angular", "euclidean", "manhattan", "dot", "hamming"] dim = 5 size = 100 for metric in metrics: fname = f'index.{metric}.{dim}d.ann' print(f'Generating index for {metric}') # t = AnnoyIndex(dim, metric) # Length of item vector that will be indexed # for i in range(size): # v = [random.gauss(0, 1) for z in range(dim)] # t.add_item(i, v) # t.build(10) # 10 trees # t.save(fname) # ... u = AnnoyIndex(dim, metric) u.verbose(True) u.load('./../tests/' + fname) # super fast, will just mmap the file print(u.get_item_vector(3)) v0 = u.get_item_vector(0) print(v0) nearests = u.get_nns_by_vector(v0, 5, include_distances=True) id_1 = nearests[0][1] print(u.get_item_vector(id_1)) print(u.get_distance(0, id_1)) # print(u.get_distance(0, 16)) print(nearests[0]) # will find the 1000 nearest neighbors print(nearests[1])
#!/home/stefan2/anaconda/bin/python
from annoy import AnnoyIndex
#from annoy import AnnoyIndexEuclidean

num_features = 100
num_points = 42000
num_nn = 100  # number of nearest neighbors

f = '../data/index.annoy'
index = AnnoyIndex(num_features)
index.load(f)

results = open('../data/annoy-results.train.txt', 'w')

print("search for nn")
for i in range(0, num_points):
    items = index.get_nns_by_item(i, num_nn)
    #items = t.get_nns_by_vector(vectors[i], 100)  # if you have the vectors
    if items[0] != i:
        print('the top nn is expected to be the item itself')
        raise SystemExit
    as_str = " ".join(map(str, items))
    results.write(as_str + "\n")

results.close()
        self.out = tf.squeeze(
            self.sess.graph.get_tensor_by_name('vgg_16/avgp5/AvgPool:0'))

    def feat_(self, image_path):
        # read the raw image bytes ('rb': text mode would corrupt binary data)
        img_data = np.expand_dims(np.array(open(image_path, 'rb').read()), 0)
        return self.sess.run(self.out, {self.img: img_data})

    def feat(self, feat_string):
        img_data = np.expand_dims(np.array(feat_string), 0)
        return self.sess.run(self.out, {self.img: img_data})


names = np.load('data/name.npy')

if not os.path.exists('model/inshop.ann'):
    feats = np.load('data/feats.npy')
    t = AnnoyIndex(512)
    for i, a in enumerate(feats):
        t.add_item(i, a)
    t.build(200)
    t.save('model/inshop.ann')
else:
    t = AnnoyIndex(512)
    t.load('model/inshop.ann')

worker = Feature2()


class RequestHandler(pyjsonrpc.HttpRequestHandler):

    @pyjsonrpc.rpcmethod
    def Feat(self, str):
        str = base64.b64decode(str)
from annoy import AnnoyIndex

a = AnnoyIndex(3)
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
a.build(-1)  # n_trees=-1 lets Annoy decide how many trees to build
a.save('test.tree')

b = AnnoyIndex(3)
b.load('test.tree')

print(b.get_nns_by_item(0, 100))
print(b.get_nns_by_vector([1.0, 0.5, 0.5], 100))
def test_metric_f_kwargs(self):
    i = AnnoyIndex(f=3, metric='euclidean')
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
from annoy import AnnoyIndex

data = pd.read_pickle('./data/file.pkl')
w2v = KeyedVectors.load("./data/word2vec.model")
a = AnnoyIndex(300)
a.load('./data/annoy_15')


def ret_vect(x, pos):
    k = []
    for s in x:
        try:
            k.append(w2v.wv[s])
        except KeyError:
            continue
    if len(k) == 0:
        return 0
    else:
        return np.mean(k, axis=0) if pos else -np.mean(k, axis=0)


# def ret_news(x_pos, x_neg):
#     res_vec = ret_vect(x_pos, True) + ret_vect(x_neg, False)
#     if isinstance(res_vec, int):
#         return "No news"
#     best_index = a.get_nns_by_vector(res_vec, 1)[0]
#     tags = []
#     for i in data[best_index]['tokens_wo_upper']:
def test_prefault(self):
    i = AnnoyIndex(10)
    i.load('test/test.tree', prefault=True)
    self.assertEqual(i.get_nns_by_item(0, 10), [0, 85, 42, 11, 54, 38, 53, 66, 19, 31])
def test_load_unload(self):
    # Issue #108
    i = AnnoyIndex(10)
    for x in range(100000):
        i.load('test/test.tree')
        i.unload()
def test_fail_save(self):
    t = AnnoyIndex(40)
    with self.assertRaises(IOError):
        t.save('')
def test_construct_destruct(self):
    for x in range(100000):
        i = AnnoyIndex(10)
        i.add_item(1000, [random.gauss(0, 1) for z in range(10)])
def test_get_n_trees(self):
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    self.assertEqual(i.get_n_trees(), 10)
def test_unbuild_with_loaded_tree(self):
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    i.unbuild()
    hyper_overrides={})

predictions = []
for language in ('python', 'go', 'javascript', 'java', 'php', 'ruby'):
    print("Evaluating language: %s" % language)
    definitions = pickle.load(
        open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb'))
    indexes = [{
        'code_tokens': d['function_tokens'],
        'language': d['language']
    } for d in tqdm(definitions)]

    code_representations = model.get_code_representations(indexes)
    print(code_representations[0].shape)

    indices = AnnoyIndex(code_representations[0].shape[0], 'angular')
    for index, vector in tqdm(enumerate(code_representations)):
        assert vector is not None
        indices.add_item(index, vector)
    indices.build(1000)
    # indices.build(10)
    # indices.build(200)

    for query in tqdm(queries):
        for idx, _ in zip(*query_model(query, model, indices, language)):
            predictions.append(
                (query, language, definitions[idx]['identifier'], definitions[idx]['url']))

df = pd.DataFrame(predictions, columns=['query', 'language', 'identifier', 'url'])
def __init__(self):
    self.connection = sqlite3.connect('junction17.db')
    self.annoy_index = AnnoyIndex(128, metric='angular')
    self._populate_index()
def test_not_found_tree(self):
    i = AnnoyIndex(10)
    with self.assertRaises(IOError):
        i.load("nonexists.tree")