def test_dist(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    self.assertAlmostEqual(i.get_distance(0, 1), 1.0)
def test_metric_kwarg(self):
    # Issue 211
    i = AnnoyIndex(2, metric='euclidean')
    i.add_item(0, [1, 0])
    i.add_item(1, [9, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 8)
    self.assertEqual(i.f, 2)
class ImageSearchAnnoyCombo:
    '''
    Load an Annoy index for approximate nearest neighbor computation.
    Annoy's angular distance is dist(u, v) = sqrt(2 * (1 - cos(u, v))).
    '''
    def __init__(self, h5fname='X_ILSVRC2015.hdf5', annf='ILSVRC2015.ann',
                 imageListPath='/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt',
                 dset='fc6fc7'):
        # load h5 data
        h5f = h5py.File(h5fname, 'r')
        self.X = h5f[dset]
        # load filenames
        with open(imageListPath, 'r') as f:
            self.line_to_file = {i: line.rstrip() for i, line in enumerate(f)}
        self.A = AnnoyIndex(self.X.shape[1], 'angular')
        self.A.load(annf)

    def run_query_approx(self, query, n=100, accuracy_factor=5):
        nearest, scores = self.A.get_nns_by_vector(query, n,
                                                   search_k=n * int(accuracy_factor) * 128,
                                                   include_distances=True)
        return zip((self.line_to_file[i] for i in nearest), scores)

    def run_query_exact(self, query, n=1000, nsmall=100):
        # Retrieve approximate nearest neighbors using Annoy, then do exact ranking
        # by loading the candidate vectors from h5 into memory.
        if n < nsmall:
            n = nsmall
        indexes = self.A.get_nns_by_vector(query, n, search_k=-1, include_distances=False)
        indexes_sorted = sorted(indexes)
        # Use scipy cdist (or normalize first and use a dot product for faster computation).
        # Fancy-indexing X from disk is very slow.
        distance = cdist(self.X[indexes_sorted], query.reshape((1, query.shape[0])), 'cosine')[:, 0]
        ind = np.argpartition(distance, nsmall)[:nsmall]  # partial sort, indices for top nsmall
        s_ind = np.argsort(distance[ind])  # sort
        nearest = ind[s_ind]
        scoresorted = distance[ind][s_ind]
        return zip((self.line_to_file[indexes_sorted[i]] for i in nearest), scoresorted)
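# A minimal usage sketch for ImageSearchAnnoyCombo above. This is an assumption, not part
# of the original file: it presumes the default hdf5/ann/file-list paths exist and simply
# reuses a stored feature row as an example query vector.
searcher = ImageSearchAnnoyCombo()
query = np.asarray(searcher.X[0])  # example query: the first stored fc6/fc7 feature
for filename, score in searcher.run_query_approx(query, n=10):
    print(filename, score)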
def test_tuple(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n_points):
        i.add_item(j, (random.gauss(0, 1) for x in xrange(f)))
    i.build(n_trees)
def do(indextype):
    a = AnnoyIndex(8, indextype[0])
    a.load('points.%s.annoy' % indextype)
    with open('points.%s.ann.txt' % indextype, 'w') as out:
        for q_index in [1443, 1240, 818, 1725, 1290, 2031, 1117, 1211, 1902, 603]:
            nns = a.get_nns_by_item(q_index, 10)
            print >> out, '%s\t%s' % (q_index, ','.join([str(n) for n in nns]))
def test_dist(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    # angular distance is sqrt(2 * (1 - cos)); here cos([0, 1], [1, 1]) = 1 / sqrt(2)
    self.assertAlmostEqual(i.get_distance(0, 1), (2 * (1.0 - 2 ** -0.5)) ** 0.5)
def _get_index(self, f, distance):
    input = 'test/glove.twitter.27B.%dd.txt.gz' % f
    output = 'test/glove.%d.%s.annoy' % (f, distance)
    if not os.path.exists(output):
        if not os.path.exists(input):
            # Download GloVe pretrained vectors: http://nlp.stanford.edu/projects/glove/
            # Hosting them on my own S3 bucket since the original files changed format
            url = 'https://s3-us-west-1.amazonaws.com/annoy-vectors/glove.twitter.27B.%dd.txt.gz' % f
            print('downloading', url, '->', input)
            urlretrieve(url, input)
    print('building index', distance, f)
    annoy = AnnoyIndex(f, 12, "test_db", 10, 1000, 3048576000, 0)
    v_v = []
    items = []
    for i, line in enumerate(gzip.open(input, 'rb')):
        v = [float(x) for x in line.strip().split()[1:]]
        v_v.append(v)
        items.append(i)
        if (i + 1) % 10000 == 0:
            print(i + 1)
            annoy.add_item_batch(items, v_v)
            v_v = []
            items = []
    if v_v:
        annoy.add_item_batch(items, v_v)
    return annoy
def ANN(searchSpace):
    dimension = searchSpace[0].shape[0]
    t = AnnoyIndex(dimension, metric='euclidean')
    for i in range(len(searchSpace)):
        t.add_item(i, searchSpace[i])
    t.build(10)
    return t
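# A short usage sketch for ANN() above, assuming numpy is available; the search space
# and the query vector here are made-up example data, not from the original code.
import numpy as np
from annoy import AnnoyIndex

searchSpace = [np.random.rand(16) for _ in range(100)]  # hypothetical 16-d points
index = ANN(searchSpace)
# indices of the 5 approximate nearest neighbours of an arbitrary query vector
neighbours = index.get_nns_by_vector(np.random.rand(16), 5)
print(neighbours)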
def test_dist_degen(self):
    f = 2
    i = AnnoyIndex(f)
    i.add_item(0, [1, 0])
    i.add_item(1, [0, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 2.0 ** 0.5)
def retrieve(self):
    print 'Loading necessary files..'
    u = AnnoyIndex(self.dim, metric='angular')
    u.load(index_file)
    print 'ANN Retrieval..'
    for n_neighbors in knns:
        print 'Number of neighbors: ' + str(n_neighbors)
        for mult in self.multipliers:
            print 'Multiplier: ' + str(mult)
            search_k = self.n_trees * n_neighbors * mult
            filename = '.'.join((self.test_file.split('/')[-1].split('.')[:-1]))
            with open(self.test_file, 'r') as data_file:
                data = json.load(data_file)
            qArray = []
            for i in range(len(data["questions"])):
                question_body = data["questions"][i]["body"]
                question_id = data["questions"][i]["id"]
                qcentroid = np.transpose(np.array(get_centroid_idf(
                    question_body, self.emb, self.idf, self.stopwords, self.dim)))
                anns = u.get_nns_by_vector(qcentroid, n_neighbors, search_k)
                doc_anns = []
                for n in anns:
                    doc_anns.append(self.idmap[n])
                q = Question(question_body, question_id, doc_anns)
                qArray.append(q)
            directory = "system_results/"
            if not os.path.exists(directory):
                os.makedirs(directory)
            with open(str(directory) + "/" + "CentIDF_annoy_" + str(self.n_trees) + "_" +
                      str(n_neighbors) + "_" + str(mult) + ".json", "w+") as outfile:
                outfile.write(json.dumps({"questions": [ob.__dict__ for ob in qArray]}, indent=2))
def test_no_items(self):
    idx = AnnoyIndex(100)
    idx.build(n_trees=10)
    idx.save('foo.idx')
    idx = AnnoyIndex(100)
    idx.load('foo.idx')
    self.assertEquals(idx.get_n_items(), 0)
    self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50,
                                            include_distances=False), [])
def test_save_without_build(self):
    # Issue #61
    i = AnnoyIndex(10)
    i.add_item(1000, [random.gauss(0, 1) for z in xrange(10)])
    i.save('x.tree')
    j = AnnoyIndex(10)
    j.load('x.tree')
    j.build(10)
def fit_annoy(data, n_trees=-1):
    logger.info('Fitting Annoy Matcher...')
    from annoy import AnnoyIndex
    matcher = AnnoyIndex(data.shape[1], metric='euclidean')
    for i, d in enumerate(data):
        matcher.add_item(i, d)
    matcher.build(n_trees)
    return matcher
def test_get_item_vector(self):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
    for j in xrange(100):
        print(j, '...')
        for k in xrange(1000 * 1000):
            i.get_item_vector(0)
def build_index(counts, label_to_id, dimension):
    index = AnnoyIndex(dimension, metric='angular')
    for label, cnt_list in counts.items():
        id = label_to_id[label]
        index.add_item(id, cnt_list)
    index.build(100)
    return index
def test_wrong_length(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
    self.assertRaises(IndexError, i.add_item, 1, [random.gauss(0, 1) for x in xrange(f + 1000)])
    self.assertRaises(IndexError, i.add_item, 2, [])
    i.build(n_trees)
def __init__(self, fn_word, model_name, model_path):
    self.model = QueryModel(fn_word, model_name, model_path)
    self.queries = []
    self.titles = []
    self.query_index = 0
    self.title_index = 0
    self.query_ann = AnnoyIndex(self.model.dim, metric='euclidean')
    self.title_ann = AnnoyIndex(self.model.dim, metric='euclidean')
def create_index_tree(clusters):
    features = clusters.shape[1]
    tree = AnnoyIndex(features, metric='euclidean')
    for i, v in enumerate(clusters):
        tree.add_item(i, v.tolist())
    tree.build(features * 2)
    return tree
def _build_from_model(self, vectors, labels, num_features):
    index = AnnoyIndex(num_features)
    for vector_num, vector in enumerate(vectors):
        index.add_item(vector_num, vector)
    index.build(self.num_trees)
    self.index = index
    self.labels = labels
class FeatureNN:
    tree = None

    def __init__(self, features, tree_file):
        self.tree = AnnoyIndex(features, metric='euclidean')
        self.tree.load(str(tree_file))

    def nn(self, x):
        return self.tree.get_nns_by_vector(x.tolist(), 1)[0]
def test_numpy(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n_points):
        a = numpy.random.normal(size=f)
        a = a.astype(random.choice([numpy.float64, numpy.float32, numpy.uint8, numpy.int16]))
        i.add_item(j, a)
    i.build(n_trees)
def test_dist_degen(self):
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 2
    i = AnnoyIndex(f, 2, "test_db", 64, 1000, 3048576000, 0)
    i.add_item(0, [1, 0])
    i.add_item(1, [0, 0])
    self.assertAlmostEqual(i.get_distance(0, 1), 2.0)
def _get_index(self, dataset):
    url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
    vectors_fn = os.path.join('test', dataset + '.hdf5')
    index_fn = os.path.join('test', dataset + '.annoy')
    if not os.path.exists(vectors_fn):
        print('downloading', url, '->', vectors_fn)
        urlretrieve(url, vectors_fn)
    dataset_f = h5py.File(vectors_fn)
    distance = dataset_f.attrs['distance']
    f = dataset_f['train'].shape[1]
    annoy = AnnoyIndex(f, distance)
    if not os.path.exists(index_fn):
        print('adding items', distance, f)
        for i, v in enumerate(dataset_f['train']):
            annoy.add_item(i, v)
        print('building index')
        annoy.build(10)
        annoy.save(index_fn)
    else:
        annoy.load(index_fn)
    return annoy, dataset_f
def test_write_failed(self):
    f = 40

    # Build the initial index
    t = AnnoyIndex(f)
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)
    t.build(10)

    if sys.platform == "linux" or sys.platform == "linux2":
        # linux
        try:
            t.save("/dev/full")
            self.fail("didn't get expected exception")
        except Exception as e:
            self.assertTrue(str(e).find("No space left on device") > 0)
    elif sys.platform == "darwin":
        volume = "FULLDISK"
        device = os.popen('hdiutil attach -nomount ram://64').read()
        os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
        os.popen('touch "/Volumes/%s/full"' % volume)
        try:
            t.save('/Volumes/%s/annoy.tree' % volume)
            self.fail("didn't get expected exception")
        except Exception as e:
            self.assertTrue(str(e).find("No space left on device") > 0)
        finally:
            os.popen("hdiutil detach %s" % device)
def precision(f=40, n=1000000):
    t = AnnoyIndex(f)
    for i in xrange(n):
        v = []
        for z in xrange(f):
            v.append(random.gauss(0, 1))
        t.add_item(i, v)
    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in xrange(prec_n):
        j = random.randrange(0, n)
        print 'finding nbs for', j
        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0
            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T
        for limit in limits:
            print 'limit: %-9d precision: %6.2f%% avg time: %.6fs' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1))
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
        # print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1
    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES}
    cache_filename = "annoy_index_" + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) + ".tree"
    if not os.path.exists(cache_filename):
        print "Building Annoy index with %d trees" % ANN_NTREES
        # index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print "\tWrote cache to %s" % cache_filename
    else:
        print "\tReading cache from %s" % cache_filename
        index.load(cache_filename)
    return index, mfcc_list
def test_range_errors(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n_points):
        i.add_item(j, [random.gauss(0, 1) for x in xrange(f)])
    self.assertRaises(IndexError, i.add_item, -1, [random.gauss(0, 1) for x in xrange(f)])
    i.build(n_trees)
    for bad_index in [-1000, -1, n_points, n_points + 1000]:
        self.assertRaises(IndexError, i.get_distance, 0, bad_index)
        self.assertRaises(IndexError, i.get_nns_by_item, bad_index, 1)
        self.assertRaises(IndexError, i.get_item_vector, bad_index)
def test_threads(self):
    n, f = 10000, 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    pool = multiprocessing.pool.ThreadPool()

    def query_f(j):
        i.get_nns_by_item(1, 1000)

    pool.map(query_f, range(n))
def test_dist(self):
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 2
    i = AnnoyIndex(f, 2, "test_db", 64, 1000, 3048576000, 0)
    # i.verbose(True)
    i.add_item(0, [0, 1])
    i.add_item(1, [1, 1])
    self.assertAlmostEqual(i.get_distance(0, 1), 2 * (1.0 - 2 ** -0.5))
def test_get_nns_by_item_batch(self):
    print "test_get_nns_by_item_batch "
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 3
    i = AnnoyIndex(f, 3, "test_db", 10, 1000, 3048576000, 0)
    i.add_item_batch([0, 1, 2], [[2, 1, 0], [1, 2, 0], [0, 0, 1]])
    self.assertEqual(i.get_nns_by_item(0, 3), [0, 1, 2])
    self.assertEqual(i.get_nns_by_item(1, 3), [1, 0, 2])
    self.assertTrue(i.get_nns_by_item(2, 3) in [[2, 0, 1], [2, 1, 0]])  # could be either
# coding: utf-8
from annoy import AnnoyIndex
import json
import random
import redis

redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)
illust_vector_list = []
dim = 512
t = AnnoyIndex(dim)  # Length of item vector that will be indexed

path = '../keras/pixiv-ranking-features.txt'
with open(path, 'r') as f:
    i = 0
    line = f.readline()
    while line:
        js = json.loads(line)
        print(js['illust_id'])
        illust_vector_list.append(js)
        if i > 1e3:
            break
        i = i + 1
        line = f.readline()

for i, illust_vector in enumerate(illust_vector_list):
    illust_id = illust_vector['illust_id']
    line2illustId = f"line2illustId_{i}"
    illustId2line = f"illustId2line_{illust_id}"
    print(f"{line2illustId} -> {illustId2line}")
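# The snippet above creates `redis_client` and an AnnoyIndex but stops at printing key
# names. A minimal sketch of how the pieces could be wired together follows; this is an
# assumption, and the 'feature' field on each JSON record is hypothetical.
for i, illust_vector in enumerate(illust_vector_list):
    illust_id = illust_vector['illust_id']
    # store the line <-> illust_id mapping in Redis
    redis_client.set(f"line2illustId_{i}", illust_id)
    redis_client.set(f"illustId2line_{illust_id}", i)
    # add the feature vector to the Annoy index
    t.add_item(i, illust_vector['feature'])
t.build(10)  # tree count is an arbitrary choice here
t.save('pixiv-ranking.ann')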
def main():
    # load face features
    u = AnnoyIndex(512, metric="euclidean")
    u.load(config.FACE_FEATURES)

    video = args.video_path
    f = open(args.output_path, "w")
    fvs = FileVideoStream(video).start()
    time.sleep(1.0)
    fps = FPS().start()

    count = 0
    frame_count = 0
    while fvs.more():
        img = fvs.read()
        f.write("frame_{}".format(frame_count))
        img = cv2.resize(img, None, fx=config.RESIZE_IMAGE, fy=config.RESIZE_IMAGE)
        bboxlist = detector.detect(img)
        for b in bboxlist:
            x1, y1, x2, y2, s = b
            if (s > config.DETECTION_THRESHOLD):
                x1 = int(x1)
                x2 = int(x2)
                y1 = int(y1)
                y2 = int(y2)
                width = x2 - x1
                height = y2 - y1
                if width >= config.MIN_SIZE and height >= config.MIN_SIZE:
                    ret = mtcnn_detector.detect_face(img[y1:y2, x1:x2], det_type=1)
                    if ret is None:
                        continue
                    face_image = "frame_" + str(frame_count) + "_face_" + str(count) + ".jpg"
                    count += 1
                    f.write(",object_{},position,{},{},{},{}".format(count, x1, x2, y1, y2))
                    bbox, landmarks = ret
                    if landmarks is None:
                        continue
                    pointx = landmarks[0][:5]
                    pointy = landmarks[0][5:]
                    pointx_img_space = map(lambda x: x + x1, pointx)
                    pointy_img_space = map(lambda y: y + y1, pointy)
                    landmarks_img_space = list(pointx_img_space) + list(pointy_img_space)
                    bbox_process = np.array([x1, y1, x2, y2])
                    landmarks_process = np.array(landmarks_img_space).reshape((2, 5)).T
                    nimg = preprocess(img, bbox_process, landmarks_process, image_size="112,112")
                    e = embedding.get_feature(nimg)
                    match = u.get_nns_by_vector(e, config.TOP_ACC_NUM, include_distances=True)
                    pose = headpose.get_pose(nimg)
                    cv2.imwrite(os.path.join(FACE_IMAGE_PATH, "aligned_" + face_image), nimg)
                    f.write(",pose,{},{},{}".format(pose[0], pose[1], pose[2]))
                    identified_face_top_1 = face_db_pd[(match[0][0])]
                    score_top_1 = match[1][0]
                    if score_top_1 < config.MATCH_THRESHOLD:
                        cv2.putText(img, identified_face_top_1, (x1, y1),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0))
                    for i in range(config.TOP_ACC_NUM):
                        identified_face = face_db_pd[(match[0][i])]
                        if (abs(pose[0]) > config.POSE_THRESHOLD or
                                abs(pose[1]) > config.POSE_THRESHOLD or
                                abs(pose[2]) > config.POSE_THRESHOLD):
                            continue
                        if (match[1][i] < config.MATCH_THRESHOLD):
                            f.write(",{},{}".format(identified_face, match[1][i]))
                    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 1)
        frame_count += 1
        f.write("\n")
        cv2.imshow("img_window", img)
        fps.update()
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    fps.stop()
    cv2.destroyAllWindows()
    fvs.stop()
    f.close()
def initialiseAnnoy(part_of_speech):
    # Loads Annoy ADJ vectors
    if part_of_speech == "adjective":
        adj_annoy = AnnoyIndex(50, metric='angular')
        adjectives = list()
        adj_lookup = dict()
        print("\n\tLoading adjective vectors...")
        for i, line in enumerate(open("./vectors/adjVectors", "r")):
            line = line.strip()
            word, vec_s = line.split(" ")
            vec = [float(n) for n in vec_s.split()]
            adj_annoy.add_item(i, vec)
            adj_lookup[word] = vec
            adjectives.append(word)
        adj_annoy.build(50)
        return adj_annoy, adjectives, adj_lookup

    # Loads Annoy VERB vectors
    elif part_of_speech == "verb":
        verb_annoy = AnnoyIndex(50, metric='angular')
        verbs = list()
        verb_lookup = dict()
        print("\n\tLoading verb vectors...")
        for i, line in enumerate(open("./vectors/verbVectors", "r")):
            line = line.strip()
            word, vec_s = line.split(" ")
            vec = [float(n) for n in vec_s.split()]
            verb_annoy.add_item(i, vec)
            verb_lookup[word] = vec
            verbs.append(word)
        verb_annoy.build(50)
        return verb_annoy, verbs, verb_lookup

    # Loads Annoy NOUN vectors
    elif part_of_speech == "noun":
        noun_annoy = AnnoyIndex(50, metric='angular')
        nouns = list()
        noun_lookup = dict()
        print("\n\tLoading noun vectors...")
        for i, line in enumerate(open("./vectors/nounVectors", "r")):
            line = line.strip()
            word, vec_s = line.split(" ")
            vec = [float(n) for n in vec_s.split()]
            noun_annoy.add_item(i, vec)
            noun_lookup[word] = vec
            nouns.append(word)
        noun_annoy.build(50)
        return noun_annoy, nouns, noun_lookup

    else:
        raise Exception("Part of speech must either be 'adjective', 'verb' or 'noun'.")
def main(project_name):
    tic = time.time()

    logger = Logger('_05_make_submission_1000_{}'.format(project_name))
    logger.info('=' * 50)

    model_path = '_model/embedding_model_{}.pt'.format(project_name)
    logger.info('load model from {}'.format(model_path))
    model = torch.load(model_path)
    model.eval()

    dir_target = '../../input/test'
    embedder = ImgEmbedder(model, dir_target)

    sample_submission = pd.read_csv('../../dataset/sample_submission.csv')
    images = list()

    with open(os.path.join('_embed_index', 'index_names_{}.json'.format(project_name)), 'r') as f:
        index_names = json.load(f)

    test_id_list = sample_submission.id

    f = 512
    u = AnnoyIndex(f, metric='euclidean')
    u.load(os.path.join('_embed_index', 'index_features_{}.ann'.format(project_name)))

    logger.info('===> embed test images and get nearest neighbors')

    search_k = 1_000_000

    for test_id in tqdm(test_id_list):
        target_file = '{}.jpg'.format(test_id)
        try:
            img_feature = embedder.get_vector(target_file)
            indeces = u.get_nns_by_vector(img_feature.tolist(), n=1000, search_k=search_k)
        except:
            indeces = list(range(1000))
        names = [index_names[index] for index in indeces]
        images.append(' '.join(names))

    submission = pd.DataFrame(test_id_list, columns=['id'])
    submission['images'] = images

    output_path = '../../submission/submission_1000_{}.csv'.format(project_name)
    submission.to_csv(output_path, index=False)

    toc = time.time() - tic
    logger.info('Elapsed time: {:.1f} [min]'.format(toc / 60.0))
def test_item_vector_after_save(self):
    # Issue #279
    a = AnnoyIndex(3)
    a.verbose(True)
    a.add_item(1, [1, 0, 0])
    a.add_item(2, [0, 1, 0])
    a.add_item(3, [0, 0, 1])
    a.build(-1)
    self.assertEquals(a.get_n_items(), 4)
    self.assertEquals(a.get_item_vector(3), [0, 0, 1])
    self.assertEquals(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
    a.save('something.annoy')
    self.assertEquals(a.get_n_items(), 4)
    self.assertEquals(a.get_item_vector(3), [0, 0, 1])
    self.assertEquals(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
def test_seed(self):
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    i.set_seed(42)
def test_metric_f_kwargs(self):
    i = AnnoyIndex(f=3, metric='euclidean')
def get_subsampling_index2(data_process, standard_scale = True, cutoff_sig = 0.02, rate = 0.3, \ method = "pykdtree", verbose = 1, image_index = []): """ Using Nearest-Neighbor search based algorithm, find the list of indices of the subsampled dataset Parameters ------------- data_process: List. the list of datapoints, with selected features standard_scale [True]: Boolean. Whether to apply standard scaler to the dataset prior to subsampling cutoff_sig [0.02]: Float. cutoff significance. the cutoff distance equals to the Euclidean norm of the standard deviations in all dimensions of the data points rate [0.3]: Float. possibility of deletion method ["pykdtree"]: String. which backend nearest neighbour model to use. possible choices: ["pykdtree", "nmslib", "sklearn", "scipy", "annoy", "flann"] verbose [1]: integer. level of verbosity Return ------------- overall_keep_list: The list of indices of the final subsampled entries """ if verbose >= 1: print("Started NN-subsampling, original length: {}".format( len(data_process))) method = method.lower() start = time.time() if method == "flann": if verbose >= 1: print("use flann backend") elif method == "pykdtree": if verbose >= 1: print("use pykdtree backend") elif method == "sklearn": if verbose >= 1: print("use slearn nearest neighbors backend") elif method == "scipy": if verbose >= 1: print("use scipy cKDTree backend") elif method == "annoy": if verbose >= 1: print("use annoy backend") elif method == "nmslib": if verbose >= 1: print("use nmslib backend") else: print("method {} not impletemented".format(method)) raise NotImplemented # apply standard scaling if standard_scale: if verbose >= 2: print("Subample with standard scaled data") data_process = StandardScaler().fit_transform( np.asarray(data_process).copy()) else: if verbose >= 2: print("Subample with original data") data_process = np.asarray(data_process).copy() #set cutoff distance list_of_descs = zip(*data_process) sum_std2 = 0. 
for descs in list_of_descs: temp_std = np.std(descs) sum_std2 += temp_std**2 cutoff = cutoff_sig * np.sqrt(sum_std2) #initialize the index overall_keep_list = np.arange(len(data_process)).tolist() keep_going = True iter_count = 1 while keep_going: if verbose >= 2: print('start iteration {}, total length: {}'.format( iter_count, len(overall_keep_list))) start_cycle = time.time() temp_data_process = get_array_based_on_index(data_process.copy(), overall_keep_list) temp_image_index = get_array_based_on_index(image_index, overall_keep_list) #build and query nearest neighbour model if method == "flann": flann = FLANN() indices, distances = flann.nn(temp_data_process, temp_data_process, 2, algorithm="kmeans") elif method == "scipy": kd_tree = cKDTree(temp_data_process) distances, indices = kd_tree.query(temp_data_process, k=2) elif method == "pykdtree": kd_tree = KDTree(temp_data_process, leafsize=6) distances, indices = kd_tree.query(temp_data_process, k=2) elif method == "sklearn": nbrs = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', n_jobs=-1).fit(temp_data_process) distances, indices = nbrs.kneighbors(temp_data_process) elif method == "annoy": annoy = AnnoyIndex(len(temp_data_process[0]), metric='euclidean') for i in range(len(temp_data_process)): annoy.add_item(i, temp_data_process[i]) annoy.build(1) distances = [] indices = [] for i in range(len(temp_data_process)): temp_index, temp_dist = annoy.get_nns_by_vector( temp_data_process[i], 2, include_distances=True) indices.append([i, temp_index[1]]) distances.append([0.0, temp_dist[1]]) elif method == "nmslib": index = nmslib.init(method='hnsw', space='l2') index.addDataPointBatch(temp_data_process) index.createIndex(print_progress=False) neighbours = index.knnQueryBatch(temp_data_process, k=2) distances = [] indices = [] for item in neighbours: indices.append(item[0]) distances.append(item[1]) else: raise NotImplemented # if distance between each point and its nearest neighbor is below cutoff distance, # add the nearest neighbout to the candidate removal list remove_index_li = [] index_li = [] for index, distance in zip(indices, distances): index_li.append(index[0]) if distance[1] <= cutoff: remove_index_li.append(index[1]) # randomly select datapoints in the candidate removal list (based on rate) # and form the final removal list of this iteration # stop the cycle if the final removal list is empty temp_num = int(ceil(float(len(remove_index_li)) * rate)) if temp_num == 0: keep_going = False #remove_index_li = random_subsampling(remove_index_li,temp_num) remove_index_li = rank_subsampling(remove_index_li, temp_num, temp_image_index) temp_keep_list = remove_list_from_list(index_li, remove_index_li) overall_keep_list = [overall_keep_list[i] for i in temp_keep_list] try: if len(overall_keep_list) == old_overall_keep_list_len: keep_going = False print("stopped because length didn't change") except: pass if verbose >= 2: print('end iteration {}. length: {}\t time:{}'.format( iter_count, len(overall_keep_list), time.time() - start_cycle)) iter_count += 1 old_overall_keep_list_len = len(overall_keep_list) if verbose >= 1: print('end NN-subsampling. length: {}\t time:{}'.format( len(overall_keep_list), time.time() - start)) return overall_keep_list
def test_save_twice(self):
    # Issue #100
    t = AnnoyIndex(10)
    t.save("t.ann")
    t.save("t.ann")
def test_construct_destruct(self):
    for x in range(100000):
        i = AnnoyIndex(10)
        i.add_item(1000, [random.gauss(0, 1) for z in range(10)])
def test_construct_load_destruct(self):
    for x in range(100000):
        i = AnnoyIndex(10)
        i.load('test/test.tree')
def test_load_unload(self):
    # Issue #108
    i = AnnoyIndex(10)
    for x in range(100000):
        i.load('test/test.tree')
        i.unload()
from nltk import ngrams
import random, json, glob, os, codecs, random
import numpy as np
from annoy import AnnoyIndex

# data structures
file_index_to_file_name = {}
file_index_to_file_vector = {}

# config
dims = 2048
n_nearest_neighbors = 200
trees = 100
infiles = glob.glob('feature_vectors/*.npz')

# build ann index
t = AnnoyIndex(dims)
for file_index, i in enumerate(infiles):
    file_vector = np.loadtxt(i)
    file_name = os.path.basename(i).split('.')[0]
    print("file_name = %s" % file_name)
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    t.add_item(file_index, file_vector)
t.build(trees)

# create a nearest neighbors json file for each input
if not os.path.exists('bran_neighbors'):
    os.makedirs('bran_neighbors')

# do the similarity search for the feature vectors
# this is a <filename*.npz> file
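# The comments above describe writing one nearest-neighbours JSON file per input, but that
# loop is not included in the snippet. Below is a minimal sketch under that assumption;
# the output layout and JSON field names are hypothetical, not taken from the original script.
for file_index, file_name in file_index_to_file_name.items():
    neighbors = t.get_nns_by_item(file_index, n_nearest_neighbors)
    named_neighbors = [file_index_to_file_name[n] for n in neighbors]
    with open(os.path.join('bran_neighbors', file_name + '.json'), 'w') as out:
        json.dump({'filename': file_name, 'nearest_neighbors': named_neighbors}, out)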
def generate_triplets_from_ANN(model, sequences, entity2unique, entity2same, unique_text, test): predictions = model.predict(sequences) t = AnnoyIndex(len(predictions[0]), metric='euclidean') # Length of item vector that will be indexed t.set_seed(123) for i in range(len(predictions)): # print(predictions[i]) v = predictions[i] t.add_item(i, v) t.build(100) # 100 trees match = 0 no_match = 0 ann_accuracy = 0 total = 0 precise = 0 triplets = {} closest_positive_counts = [] pos_distances = [] neg_distances = [] all_pos_distances = [] all_neg_distances = [] triplets['anchor'] = [] triplets['positive'] = [] triplets['negative'] = [] if test: NNlen = TEST_NEIGHBOR_LEN else: NNlen = TRAIN_NEIGHBOR_LEN for key in entity2same: index = entity2unique[key] nearest = t.get_nns_by_vector(predictions[index], NNlen) nearest_text = set([unique_text[i] for i in nearest]) expected_text = set(entity2same[key]) # annoy has this annoying habit of returning the queried item back as a nearest neighbor. Remove it. if key in nearest_text: nearest_text.remove(key) # print("query={} names = {} true_match = {}".format(unique_text[index], nearest_text, expected_text)) overlap = expected_text.intersection(nearest_text) # collect up some statistics on how well we did on the match m = len(overlap) match += m # since we asked for only x nearest neighbors, and we get at most x-1 neighbors that are not the same as key (!) # make sure we adjust our estimate of no match appropriately no_match += min(len(expected_text), NNlen - 1) - m # sample only the negatives that are true negatives # that is, they are not in the expected set - sampling only 'semi-hard negatives is not defined here' # positives = expected_text - nearest_text positives = overlap negatives = nearest_text - expected_text # print(key + str(expected_text) + str(nearest_text)) for i in negatives: for j in positives: dist_pos = t.get_distance(index, entity2unique[j]) pos_distances.append(dist_pos) dist_neg = t.get_distance(index, entity2unique[i]) neg_distances.append(dist_neg) if dist_pos < dist_neg: ann_accuracy += 1 total += 1 # print(key + "|" + j + "|" + i) # print(dist_pos) # print(dist_neg) min_neg_distance = 1000000 for i in negatives: dist_neg = t.get_distance(index, entity2unique[i]) all_neg_distances.append(dist_neg) if dist_neg < min_neg_distance: min_neg_distance = dist_neg for j in expected_text: dist_pos = t.get_distance(index, entity2unique[j]) all_pos_distances.append(dist_pos) closest_pos_count = 0 for p in overlap: dist_pos = t.get_distance(index, entity2unique[p]) if dist_pos < min_neg_distance: closest_pos_count+=1 if closest_pos_count > 0: precise+=1 closest_positive_counts.append(closest_pos_count / min(len(expected_text), NNlen - 1)) for i in negatives: for j in expected_text: triplets['anchor'].append(key) triplets['positive'].append(j) triplets['negative'].append(i) print("mean closest positive count:" + str(statistics.mean(closest_positive_counts))) print("mean positive distance:" + str(statistics.mean(pos_distances))) print("stdev positive distance:" + str(statistics.stdev(pos_distances))) print("max positive distance:" + str(max(pos_distances))) print("mean neg distance:" + str(statistics.mean(neg_distances))) print("stdev neg distance:" + str(statistics.stdev(neg_distances))) print("max neg distance:" + str(max(neg_distances))) print("mean all positive distance:" + str(statistics.mean(all_pos_distances))) print("stdev all positive distance:" + str(statistics.stdev(all_pos_distances))) print("max all positive distance:" + 
str(max(all_pos_distances))) print("mean all neg distance:" + str(statistics.mean(all_neg_distances))) print("stdev all neg distance:" + str(statistics.stdev(all_neg_distances))) print("max all neg distance:" + str(max(all_neg_distances))) print("Accuracy in the ANN for triplets that obey the distance func:" + str(ann_accuracy / total)) print("Precision at 1: " + str(precise / len(entity2same))) obj = {} obj['accuracy'] = ann_accuracy / total obj['steps'] = 1 with open(output_file_name_for_hpo, 'w', encoding='utf8') as out: json.dump(obj, out) if test: return match/(match + no_match) else: return triplets, match/(match + no_match)
for p in classif['body']['predictions']:
    uri = p['uri']
    rois = p['rois']
    sys.stdout.write('\rIndexing image ' + str(d) + '/' + str(len(onlyfiles)) + ' : ' +
                     str(len(rois)) + ' rois total:' + str(c) + ' ')
    sys.stdout.flush()
    for roi in rois:
        bbox = roi['bbox']
        cat = roi['cat']
        prob = roi['prob']
        vals = roi['vals']
        if c == 0:
            layer_size = len(vals)
            s['layer_size'] = layer_size
            t = AnnoyIndex(layer_size, metric)  # prepare index
        t.add_item(c, vals)
        s[str(c)] = {'uri': uri, 'bbox': bbox, 'cat': cat, 'prob': prob}
        c = c + 1
    d = d + 1
    #if c >= 10000:
    #    break

print 'building index...\n'
print 'layer_size=', layer_size
t.build(ntrees)
t.save('index.ann')
s.close()

if args.search:
    s = shelve.open('data.bin')
    u = AnnoyIndex(s['layer_size'], metric)
#!/bin/env python3
import sys, argparse
from annoy import AnnoyIndex
import re
import statistics

parser = argparse.ArgumentParser()
parser.add_argument('--file', help='Input file')
parser.add_argument('--out', help='Outfile base')
parser.add_argument('--L', help='Fingerprint length')
parser.add_argument('--norm', help='Normalize')
args = parser.parse_args()

a = AnnoyIndex(int(args.L))
i = 0
names = []
with open(args.file, 'r') as f:
    for line in f:
        id, statements, *v = line.split("\t")
        id = re.sub('.json.gz', '', id)
        id = re.sub('\.', '|', id)
        names.append(id)
        v = [float(j) for j in v]
        if args.norm:
            avg = statistics.mean(v)
            std = statistics.stdev(v)
            v = [(j - avg) / std for j in v]
        a.add_item(i, v)
        i = i + 1
class FakeDetector:
    """
    #TODO: Complete the description
    """

    def __init__(self):
        """ """
        self.tree = None
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        # self.embedder = EmbedSentence()

    def build(self, brand_names: str, n_tree: int = 100, embedding_size: int = 100):
        """
        #TODO: fill details of param
        :param brand_names:
        :param n_tree:
        :param embedding_size:
        :return:
        """
        brand_names_sentence = sentence_char2vec(brand_names)
        self.tree = AnnoyIndex(embedding_size, 'angular')
        for value, (_, _token) in enumerate(brand_names_sentence.items()):
            self.tree.add_item(value, _token)
        self.tree.build(n_tree)

    def fake_detector(self, text: str, embedding_size: int = 100,
                      detection_range: Tuple = (0.97, 0.99)) -> bool:
        """
        #TODO: fill details of params
        :param text:
        :param embedding_size:
        :param detection_range:
        :return:
        """
        found_match = False
        text_sentence = sentence_char2vec(text)
        for _, (_, _token) in enumerate(text_sentence.items()):
            match = self.tree.get_nns_by_vector(_token, 1)[0]
            sim_score = round(
                float(self.cos(
                    _token.view(-1, embedding_size),
                    torch.tensor(self.tree.get_item_vector(match)).view(-1, embedding_size))), 2)
            if detection_range[0] <= sim_score <= detection_range[1]:
                found_match = True
            if found_match:
                break
        return found_match
# In[27]:

from annoy import AnnoyIndex

# In[28]:

# Choose a random image to experiment
random_image_index = random.randint(0, num_items)

# Note: the results may change if the image is changed

# First, we build a search index with two hyperparameters - the number of dimensions
# of the dataset, and the number of trees.

# In[29]:

annoy_index = AnnoyIndex(num_dimensions)  # Length of item vector that will be indexed
for i in range(num_items):
    annoy_index.add_item(i, dataset[i])
annoy_index.build(40)  # 40 trees

# Now let's find out the time it takes to search the 5 nearest neighbors of one image.

# In[30]:

#u = AnnoyIndex(num_dimensions)
# Time the search for one image for Annoy
get_ipython().run_line_magic(
    'timeit', 'annoy_index.get_nns_by_vector(query, 5, include_distances=True )')

# Now THAT is blazing fast! To put this in perspective, for such a modestly sized dataset,
# this can serve almost 15000 requests on a single CPU core. Considering most CPUs have
# multiple cores, it should be able to handle 100K+ requests on a single system. The best
# part is that it lets you share the same index in memory between multiple processes.
# Hence, the biggest index can be equivalent to the size of your overall RAM, making it
# possible to serve multiple requests on a single system.
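# A small sketch of the index-sharing point above: once the index is saved, load() will
# memory-map the file, so several worker processes can reuse the same physical pages.
# The file name here is an arbitrary placeholder.
annoy_index.save('image_index.ann')

worker_index = AnnoyIndex(num_dimensions)
worker_index.load('image_index.ann')  # mmap: cheap per process, pages shared by the OS
print(worker_index.get_nns_by_vector(query, 5))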
def test_load_save(self):
    # Issue #61
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    u = i.get_item_vector(99)
    i.save('i.tree')
    v = i.get_item_vector(99)
    self.assertEqual(u, v)
    j = AnnoyIndex(10)
    j.load('test/test.tree')
    w = i.get_item_vector(99)
    self.assertEqual(u, w)
    # Ensure specifying if prefault is allowed does not impact result
    j.save('j.tree', True)
    k = AnnoyIndex(10)
    k.load('j.tree', True)
    x = k.get_item_vector(99)
    self.assertEqual(u, x)
    k.save('k.tree', False)
    l = AnnoyIndex(10)
    l.load('k.tree', False)
    y = l.get_item_vector(99)
    self.assertEqual(u, y)
        vectorArr.append(vector)
    except Exception as e:
        print(per)
        print(e)

np.save('vectors', vectorArr)
np.save('persons', nameArr)

###########################################
##              ANNOY SIDE               ##
###########################################

file = np.load('vectors.npy')

f = 512
t = AnnoyIndex(f, metric="euclidean")  # Length of item vector that will be indexed
for i in range(len(file)):
    v = file[i]
    t.add_item(i, v)

t.build(10)  # 10 trees
t.save('test.ann')

u = AnnoyIndex(f, metric="euclidean")
u.load('test.ann')  # super fast, will just mmap the file

av = np.load('vectors.npy')
pr = np.load('persons.npy')

arr = u.get_nns_by_vector(av[0], 3, include_distances=True)
def test_load_save(self):
    # Issue #61
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    u = i.get_item_vector(99)
    i.save('x.tree')
    v = i.get_item_vector(99)
    self.assertEqual(u, v)
    j = AnnoyIndex(10)
    j.load('test/test.tree')
    w = i.get_item_vector(99)
    self.assertEqual(u, w)
class EntityType(object):
    """Convenience wrapper around Annoy.

    More generally a way to collect vectors within the same entity type
    and quickly find similar vectors.

    * Helps deal with non-contiguous ids through an id map.
    * Checks for 0 vectors before returning matches.
    """

    def __init__(self, nfactor, ntrees, metric='angular', entity_type_id=None, entity_type=None):
        """Initialize EntityType."""
        # metadata
        self._nfactor = nfactor
        self._metric = metric
        # object is accessed using this id. e.g. 'user'
        self._entity_type = entity_type
        # data is loaded in using this id. This can be more compact than the
        # entity_type, depending on the data source
        self._entity_type_id = entity_type_id
        self._ntrees = ntrees
        # data
        self._ann_obj = AnnoyIndex(nfactor, metric)
        # maps entity id to internal representation of id
        self._ann_map = {}
        # maps internal representation of id to entity id
        self._ann_map_inv = {}
        self._nitems = 0

    def add_item(self, entity_id, factors):
        """Add item, populating id map."""
        if entity_id in self._ann_map:
            raise ValueError('Duplicate entity: type = {0}, id = {1}'.format(
                self._entity_type, entity_id))
        self._ann_obj.add_item(self._nitems, factors)
        self._ann_map[entity_id] = self._nitems
        self._nitems = self._nitems + 1

    def build(self, verbose=False):
        """Build annoy model, create invert dictionary for future lookups."""
        self._ann_obj.verbose(verbose)
        self._ann_obj.build(self._ntrees)
        # this is only necessary after build, so we'll create it here
        self._ann_map_inv = {v: k for k, v in self._ann_map.items()}

    def get_nns_by_vector(self, vec, n, search_k):
        """Get nearest neighbors from an input vector."""
        nns = self._ann_obj.get_nns_by_vector(vec, n, search_k)
        return [self._ann_map_inv[x] for x in nns]

    def get_item_vector(self, entity_id):
        """Get a vector for an entity."""
        if entity_id in self._ann_map:
            return self._ann_obj.get_item_vector(self._ann_map[entity_id])
        else:
            return []

    def __iter__(self):
        """Iterate over object, return (entity_id, vector) tuples."""
        return (EntityVector(entity_id=entity_id, vector=self.get_item_vector(entity_id))
                for entity_id in self._ann_map.keys())

    def get_nfactor(self):
        return self._nfactor

    def load(self, pkl, filepath):
        entity_type = pkl.get_entity_type(self._entity_type_id)
        self.__dict__ = entity_type.__dict__
        # initialize index
        self._ann_obj = AnnoyIndex(pkl.get_nfactor(), entity_type._metric)
        # mmap the file
        self._ann_obj.load(filepath)
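# A brief usage sketch for the EntityType wrapper above. The entity ids and vectors are
# made-up example data, and search_k=-1 just falls through to Annoy's default search.
users = EntityType(nfactor=3, ntrees=10, metric='angular', entity_type='user')
users.add_item('user_42', [0.1, 0.9, 0.0])
users.add_item('user_99', [0.2, 0.8, 0.1])
users.build()
# returns the original (possibly non-contiguous) entity ids, not internal Annoy indices
print(users.get_nns_by_vector([0.1, 0.9, 0.0], n=2, search_k=-1))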
words = []
words_to_wvs = {}
import numpy as np
from annoy import AnnoyIndex

with open("wvs.txt") as rf:
    for line in rf:
        split_line = line.strip().split("\t")
        if len(split_line) == 2:
            word = split_line[0]
            #print(word)
            vec = [float(i) for i in split_line[1].split(" ")]
            words.append(word)
            words_to_wvs[word] = vec

w1 = "pikachu"
w2 = 'kanto'
w3 = "sinnoh"
w4 = "pachirisu"

words.append("vec({}) - vec({}) + vec({}) =".format(w1, w2, w3))
words_to_wvs["vec({}) - vec({}) + vec({}) =".format(w1, w2, w3)] = list(
    np.asarray(words_to_wvs[w1]) - np.asarray(words_to_wvs[w2]) + np.asarray(words_to_wvs[w3]))

t = AnnoyIndex(100, 'angular')  # Length of item vector that will be indexed
for ct, i in enumerate(words):
    t.add_item(ct, words_to_wvs[i])
t.build(100)  # 100 trees

wwca = [w1, w2, w3, w4, "vec({}) - vec({}) + vec({}) =".format(w1, w2, w3)]
for word in wwca:
    print("10 NEAREST NEIGHBORS OF {}".format(word))
    nearest_neighbors = t.get_nns_by_item(words.index(word), 10)
    for nn in nearest_neighbors:
        print(words[nn])
def test_save_without_build(self):
    # Issue #61
    i = AnnoyIndex(10)
    i.add_item(1000, [random.gauss(0, 1) for z in range(10)])
    i.save('x.tree')
    j = AnnoyIndex(10)
    j.load('x.tree')
    j.build(10)
def main(_): parser = argparse.ArgumentParser(description='TransE.') parser.add_argument('--data', dest='data_dir', type=str, help="Data folder", default='./data/FB15k/') parser.add_argument('--lr', dest='lr', type=float, help="Learning rate", default=1e-2) parser.add_argument("--dim", dest='dim', type=int, help="Embedding dimension", default=256) parser.add_argument("--batch", dest='batch', type=int, help="Batch size", default=32) parser.add_argument("--worker", dest='n_worker', type=int, help="Evaluation worker", default=3) parser.add_argument("--generator", dest='n_generator', type=int, help="Data generator", default=10) parser.add_argument("--eval_batch", dest="eval_batch", type=int, help="Evaluation batch size", default=32) parser.add_argument("--save_dir", dest='save_dir', type=str, help="Model path", default='./transE') parser.add_argument("--load_model", dest='load_model', type=str, help="Model file", default="") parser.add_argument("--save_per", dest='save_per', type=int, help="Save per x iteration", default=1) parser.add_argument("--eval_per", dest='eval_per', type=int, help="Evaluate every x iteration", default=3) parser.add_argument("--max_iter", dest='max_iter', type=int, help="Max iteration", default=30) parser.add_argument("--summary_dir", dest='summary_dir', type=str, help="summary directory", default='./transE_summary/') parser.add_argument("--keep", dest='drop_out', type=float, help="Keep prob (1.0 keep all, 0. drop all)", default=0.5) parser.add_argument("--optimizer", dest='optimizer', type=str, help="Optimizer", default='gradient') parser.add_argument("--prefix", dest='prefix', type=str, help="model_prefix", default='DEFAULT') parser.add_argument("--loss_weight", dest='loss_weight', type=float, help="Weight on parameter loss", default=1e-2) parser.add_argument("--neg_weight", dest='neg_weight', type=float, help="Sampling weight on negative examples", default=0.5) parser.add_argument("--save_per_batch", dest='save_per_batch', type=int, help='evaluate and save after every x batches', default=1000) parser.add_argument( "--outfile_prefix", dest='outfile_prefix', type=str, help='The filename of output file is outfile_prefix.txt', default='test_output') parser.add_argument("--neg_sample", dest='neg_sample', type=int, help='No. of neg. samples per (h,r) or (t,r) pair', default=5) parser.add_argument( "--fanout_thresh", dest='fanout_thresh', type=int, help='threshold on fanout of entities to be considered', default=2) parser.add_argument('--annoy_n_trees', dest='annoy_n_trees', type=int, help='builds a forest of n_trees trees', default=10) parser.add_argument( '--annoy_search_k', dest='annoy_search_k', type=int, help='During the query it will inspect up to search_k nodes', default=-1) parser.add_argument('--eval_after', dest='eval_after', type=int, help='Evaluate after this many no. 
of epochs', default=4) args = parser.parse_args() if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) print(args) model = TransE(args.data_dir, embed_dim=args.dim, fanout_thresh=args.fanout_thresh, eval_batch=args.eval_batch) train_pos_neg_list, \ train_loss, train_op = train_ops(model, learning_rate=args.lr, optimizer_str=args.optimizer, regularizer_weight=args.loss_weight) get_embedding_op = embedding_ops(model) # test_input, test_head, test_tail = test_ops(model) f1 = open('%s/%s.txt' % (args.save_dir, args.outfile_prefix), 'w') with tf.Session() as session: tf.global_variables_initializer().run() all_var = tf.all_variables() print 'printing all', len(all_var), ' TF variables:' for var in all_var: print var.name, var.get_shape() saver = tf.train.Saver(restore_sequentially=True) iter_offset = 0 if args.load_model is not None and os.path.exists(args.load_model): saver.restore(session, args.load_model) iter_offset = int( args.load_model.split('.')[-2].split('_')[-1]) + 1 f1.write("Load model from %s, iteration %d restored.\n" % (args.load_model, iter_offset)) total_inst = model.n_train best_filtered_mean_rank = float("inf") f1.write("preparing training data...\n") nbatches_count = 0 # training_data_list = [] training_data_pos_neg_list = [] for dat in model.raw_training_data(batch_size=args.batch): # raw_training_data_queue.put(dat) # training_data_list.append(dat) ps_list = data_generator_func(dat, model.tr_h, model.hr_t, model.n_entity, args.neg_sample, model.n_relation) assert ps_list is not None training_data_pos_neg_list.append(ps_list) nbatches_count += 1 f1.write("training data prepared.\n") f1.write("No. of batches : %d\n" % nbatches_count) f1.close() start_time = timeit.default_timer() for n_iter in range(iter_offset, args.max_iter): accu_loss = 0. ninst = 0 # f1.close() for batch_id in range(nbatches_count): f1 = open('%s/%s.txt' % (args.save_dir, args.outfile_prefix), 'a') pos_neg_list = training_data_pos_neg_list[batch_id] #print data_e l, _ = session.run([train_loss, train_op], {train_pos_neg_list: pos_neg_list}) accu_loss += l ninst += len(pos_neg_list) # print('len(pos_neg_list) = %d\n' % len(pos_neg_list)) if ninst % (5000) is not None: f1.write('[%d sec](%d/%d) : %.2f -- loss : %.5f \n' % (timeit.default_timer() - start_time, ninst, total_inst, float(ninst) / total_inst, l)) f1.close() f1 = open('%s/%s.txt' % (args.save_dir, args.outfile_prefix), 'a') f1.write("") f1.write("iter %d avg loss %.5f, time %.3f\n" % (n_iter, accu_loss / ninst, timeit.default_timer() - start_time)) # if n_iter == args.max_iter - 1: # save_path = saver.save(session, # os.path.join(args.save_dir, # "TransE_" + str(args.prefix) + "_" + str(n_iter) + ".ckpt")) # f1.write("Model saved at %s\n" % save_path) with tf.device('/cpu'): if n_iter > args.eval_after and (n_iter % args.eval_per == 0 or n_iter == args.max_iter - 1): t = AnnoyIndex(model.embed_dim, metric='euclidean') ent_embedding, rel_embedding = session.run( get_embedding_op, {train_pos_neg_list: pos_neg_list}) # sess = tf.InteractiveSession() # with sess.as_default(): # ent_embedding = model.ent_embeddings.eval() print np.asarray(ent_embedding).shape print np.asarray(rel_embedding).shape # print ent_embedding[10,:] # print rel_embedding[10,:] print 'Index creation started' for i in xrange(model.n_entity): v = ent_embedding[i, :] t.add_item(i, v) t.build(args.annoy_n_trees) print 'Index creation completed' # n = int(0.0005 * model.n_entity) n = 1000 # search_k = int(n * args.annoy_n_trees/100.0) search_k = 1000 print 'No. 
of items = %d' % t.get_n_items() print sum(t.get_item_vector(0)) print sum(ent_embedding[0, :]) assert sum(t.get_item_vector(0)) == sum( ent_embedding[0, :]) if n_iter == args.max_iter - 1: eval_dict = zip( [model.validation_data, model.testing_data], ['VALID', 'TEST']) else: eval_dict = zip([model.validation_data], ['VALID']) for data_func, test_type in eval_dict: accu_mean_rank_h = list() accu_mean_rank_t = list() accu_filtered_mean_rank_h = list() accu_filtered_mean_rank_t = list() evaluation_count = 0 evaluation_batch = [] batch_id = 0 for testing_data in data_func( batch_size=args.eval_batch): batch_id += 1 print 'test_type: %s, batch id: %d' % (test_type, batch_id) head_ids = list() tail_ids = list() for i in xrange(testing_data.shape[0]): # try: # print (ent_embedding[testing_data[i,0],:] + rel_embedding[testing_data[i,2],:]) tail_ids.append( t.get_nns_by_vector( (ent_embedding[testing_data[i, 0], :] + rel_embedding[testing_data[i, 2], :]), n, search_k)) head_ids.append( t.get_nns_by_vector( (ent_embedding[testing_data[i, 1], :] - rel_embedding[testing_data[i, 2], :]), n, search_k)) # except: # print 'i = %d' % i # print 'testing_data[i,0] = %d' % testing_data[i,0] # print 'testing_data[i,1] = %d' % testing_data[i,1] # print 'testing_data[i,2] = %d' % testing_data[i,2] # print head_ids # print tail_ids evaluation_batch.append( (testing_data, head_ids, tail_ids)) evaluation_count += 1 while evaluation_count > 0: evaluation_count -= 1 # (mrh, fmrh), (mrt, fmrt) = result_queue.get() (mrh, fmrh), (mrt, fmrt) = worker_func( evaluation_batch[evaluation_count - 1], model.hr_t, model.tr_h) accu_mean_rank_h += mrh accu_mean_rank_t += mrt accu_filtered_mean_rank_h += fmrh accu_filtered_mean_rank_t += fmrt f1.write( "[%s] ITER %d [HEAD PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f\n" % (test_type, n_iter, np.mean(accu_mean_rank_h), np.mean(accu_filtered_mean_rank_h), np.mean( np.asarray(accu_mean_rank_h, dtype=np.int32) < 10), np.mean( np.asarray(accu_filtered_mean_rank_h, dtype=np.int32) < 10))) f1.write( "[%s] ITER %d [TAIL PREDICTION] MEAN RANK: %.1f FILTERED MEAN RANK %.1f HIT@10 %.3f FILTERED HIT@10 %.3f\n" % (test_type, n_iter, np.mean(accu_mean_rank_t), np.mean(accu_filtered_mean_rank_t), np.mean( np.asarray(accu_mean_rank_t, dtype=np.int32) < 10), np.mean( np.asarray(accu_filtered_mean_rank_t, dtype=np.int32) < 10))) if test_type == 'VALID': filtered_mean_rank = ( np.mean(accu_filtered_mean_rank_t) + np.mean(accu_mean_rank_h)) / 2.0 if filtered_mean_rank < best_filtered_mean_rank: save_path = saver.save( session, os.path.join( args.save_dir, "TransE_" + str(args.prefix) + "_" + str(n_iter) + ".ckpt")) f1.write("Model saved at %s\n" % save_path) best_filtered_mean_rank = filtered_mean_rank f1.close()
def test_unbuild_with_loaded_tree(self):
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    i.unbuild()
import numpy as np
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input, VGG16
from keras.models import Model
from annoy import AnnoyIndex

# img_dir_path = 'dataset/All/'
img_dir_path = 'dataDrivenArt/bin/data/images/'
annoy_model_path = 'model/x-fresh-flatten.ann'
# annoy_dim = 4096  # fc2
annoy_dim = 25088

base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('flatten').output)

annoy_model = AnnoyIndex(annoy_dim)

for i in range(1, 3988):
    # img_path = img_dir_path + str(i) + '.jpg'
    img_path = "C:\\Users\\santa\\Desktop\\Python\\x-fresh\\dataDrivenArt\\bin\\data\\images\\{0:04d}.jpg".format(i + 0)
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    flatten_features = model.predict(x)
    annoy_model.add_item(i, flatten_features[0])
    print(img_path, 'saved')
            definitions[definitions_index]['language'],
        }
        for resource in [data_pipeline.TREE_LABEL, data_pipeline.GRAPH_LABEL,
                         data_pipeline.CODE_TOKENS_LABEL]:
            sample[resource] = loaded_sample[resource]
        samples.append(sample)
        definitions_index += 1

    data_file_code_representations = model.get_code_representations(samples)
    code_representations_all.extend(data_file_code_representations)

print('len(code_representations_all)', len(code_representations_all))

indices = AnnoyIndex(code_representations_all[0].shape[0], 'angular')
for index, vector in tqdm(enumerate(code_representations_all)):
    if vector is not None:
        indices.add_item(index, vector)
indices.build(200)
print('Index is built')

for query in queries:
    for idx, _ in zip(*query_model(query, model, indices, language)):
        predictions.append((query, language, definitions[idx]['identifier'], definitions[idx]['url']))

df = pd.DataFrame(predictions, columns=['query', 'language', 'identifier', 'url'])
df.to_csv(predictions_csv, index=False)
def test_not_found_tree(self):
    i = AnnoyIndex(10)
    self.assertRaises(IOError, i.load, 'nonexists.tree')
def load_annoy_tree(model_file_name, vector_dims):
    tree = AnnoyIndex(vector_dims)
    tree.load(model_file_name)
    return tree
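# A short usage sketch for load_annoy_tree(); the file name and dimensionality are
# placeholders and assume an index that was previously built and saved with the same
# number of dimensions.
tree = load_annoy_tree('word_vectors.ann', vector_dims=100)
# ten approximate nearest neighbours of item 0, with distances
ids, dists = tree.get_nns_by_item(0, 10, include_distances=True)
print(list(zip(ids, dists)))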