def retrieve_and_build(self):
    """
    Retrieve embeddings stored in HDF5 files and build and save the graph.

    :return: None
    """
    try:
        embs = self.retrieve_embeddings()
        self.logger.debug("Creating annoy index. Embeddings count: %s.", len(embs))
        t = AnnoyIndex(self.f_dim)
        for i in range(len(embs)):
            t.add_item(i, embs[i])
        trees_num = 10  # Number of trees in the index
        t.build(trees_num)
        self.logger.debug("Building annoy index.")
        t.save(os.path.join(self.organization_dir,
                            "lshforest_" + self.embedding_type + ".ann"))
        self.logger.debug("Saved annoy index.")
        t.unload()
    except Exception:
        message = "Failed to build recommendation model."
        self.logger.exception(message)
def combine_labels(names_list, annoyfeature):
    """Load each candidate .ann index and record the distance of the query
    feature to its nearest neighbour in that index."""
    ann_considered = []
    ret_list = []
    distances_considered = {}
    distances = []
    print('in function', annoyfeature.shape)
    f = 360 + 512  # combined feature dimensionality
    ann_names = os.listdir(ann_add)
    for ann_n in ann_names:
        ann_considered.append(ann_n.split('.')[0])
    for ann_name in ann_considered:
        if ann_name in names_list:
            ret_list.append(ann_name)
    print('list is', ret_list)
    for rets in ret_list:
        ann_load = os.path.join(ann_add, rets + ".ann")
        print('loading is', ann_load)
        t2 = AnnoyIndex(f, metric='angular')
        t2.load(ann_load)
        # get_nns_by_item expects an integer item id; for a query vector
        # the right call is get_nns_by_vector (the original passed the
        # feature array to get_nns_by_item).
        temp_dist = t2.get_nns_by_vector(annoyfeature.astype(np.int32), 1,
                                         search_k=-1, include_distances=True)
        print('annoy dist', temp_dist)
        t2.unload()
        distances_considered[rets] = temp_dist
        distances.append(temp_dist)
    print('list and dic is')
    print(distances_considered, distances)
    return distances_considered, distances
def annoy(als_model, user_truth, test_user, sc, n_trees=10, search_k=-1):
    print('creating annoy baseline with n_trees: ' + str(n_trees),
          'search_k: ' + str(search_k))
    sc = SparkContext.getOrCreate()
    factors = als_model.userFactors
    size = factors.limit(1).select(
        F.size('features').alias('calculation')).collect()[0].calculation
    time_start = time()
    annoy_list = AnnoyIndex(size)
    for row in factors.collect():
        annoy_list.add_item(row.id, row.features)
    annoy_list.build(n_trees)
    annoy_list.save('./home/hj1325/final-project-final-project/annoy_list'
                    + str(n_trees) + '_k_' + str(search_k) + '.ann')
    recommend_list = [(user.user_label,
                       annoy_list.get_nns_by_item(int(user.user_label), 500))
                      for user in test_user.collect()]
    temp = sc.parallelize(recommend_list)
    print('recommendations have been created')
    recommend = spark.createDataFrame(temp, ['user_label', 'recommendation'])
    predictions = recommend.join(user_truth,
                                 recommend.user_label == user_truth.user_label,
                                 'inner')
    score = predictions.select('recommendation', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(score)
    precision = metrics.precisionAt(500)
    mean_average_precision = metrics.meanAveragePrecision
    print('time taken: ' + str(time() - time_start))
    print('precision at 500: ' + str(precision))
    print('mean average precision: ' + str(mean_average_precision))
    annoy_list.unload()
def annoy(alsmodel, groundTruth, testUsers, sc, n_trees=10, search_k=-1):
    print(f"annoy index version with n_trees: {n_trees}, search_k: {search_k}")
    sc = SparkContext.getOrCreate()
    userfactors = alsmodel.userFactors
    size = userfactors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    a = AnnoyIndex(size)
    for row in userfactors.collect():
        a.add_item(row.id, row.features)
    a.build(n_trees)
    a.save("./anns/annoy_t" + str(n_trees) + "_k_" + str(search_k) + ".ann")
    rec_list = [(u.userNew, a.get_nns_by_item(int(u.userNew), 500))
                for u in testUsers.collect()]
    temp = sc.parallelize(rec_list)
    print("created recs")
    rec = spark.createDataFrame(temp, ["userNew", "recs"])
    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew, 'inner')
    scoreAndLabels = predictions.select('recs', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    MAP = metrics.meanAveragePrecision
    print(f"time elapsed: {time() - start_time}s")
    print(f"precision at 500: {precision}")
    print(f"MAP: {MAP}")
    a.unload()
def updateAnnoy(self):
    annoy = AnnoyIndex(300)
    self_annoy = AnnoyIndex(300)
    i_counter = 0
    for group in Group.select():
        if i_counter % 100 == 0:
            logging.info("processed {}".format(i_counter))
        for sentence in group.sentences:
            try:
                annoy.add_item(group.id, self.getSentenceVec(sentence.text))
            except Exception as e:
                # logging.exception(e)
                # logging.info(sentence.text)
                pass
        try:
            arr = [self.getSentenceVec(reply.text) for reply in group.replys]
            self_annoy.add_item(group.id, np.mean(arr, axis=0))
        except Exception as e:
            # logging.exception(e)
            pass
        i_counter += 1
    annoy.build(50)
    annoy.save(self.SENTENCE_ANN + ".new")
    annoy.unload()
    self_annoy.build(50)
    self_annoy.save(self.SELF_SENTENCE_ANN + ".new")
    # The original called annoy.unload() a second time here; it is the
    # second index that still needs releasing.
    self_annoy.unload()
def load_embeddings(index_path, embedding_size, num_nodes):
    # Load Annoy index which stores the embedded vectors
    index = AnnoyIndex(embedding_size)
    index.load(index_path)
    embeddings = [index.get_item_vector(i) for i in range(num_nodes)]
    # Unload the index to save memory (loading mmaps the index file)
    index.unload()
    # V x D matrix of embeddings
    return np.array(embeddings)
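# Usage sketch for load_embeddings above: build a small index with matching
# dimensionality, save it, then read the vectors back as a V x D matrix.
# "toy.ann" and the sizes are illustrative; like the function itself, this
# assumes an annoy version where the metric argument defaults to angular.
import numpy as np
from annoy import AnnoyIndex

def build_toy_index(path, embedding_size=8, num_nodes=100):
    index = AnnoyIndex(embedding_size)
    for i in range(num_nodes):
        index.add_item(i, np.random.rand(embedding_size))
    index.build(10)
    index.save(path)

build_toy_index('toy.ann')
embeddings = load_embeddings('toy.ann', embedding_size=8, num_nodes=100)
assert embeddings.shape == (100, 8)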
def test_on_disk(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.on_disk_build('on_disk.ann')
    self.add_items(i)
    i.build(10)
    self.check_nns(i)
    i.unload()
    i.load('on_disk.ann')
    self.check_nns(i)
    j = AnnoyIndex(f, 'euclidean')
    j.load('on_disk.ann')
    self.check_nns(j)
def build(self, index_file, vectors, sender_urn):
    logger.info("Building {0}".format(index_file))
    logger.info("Vectors {0}".format(vectors))
    new_index = AnnoyIndex(self.feat_size, metric='euclidean')
    for idx, v in enumerate(vectors):
        logger.info("Adding item {0} with id {1}".format(v, idx))
        new_index.add_item(idx, v)
    new_index.build(self.n_trees)
    logger.info("Saving index file {0}".format(index_file))
    new_index.save(index_file)
    new_index.unload()
    pykka.ActorRegistry.get_by_urn(actor_urn=sender_urn).proxy().load()
    logger.info("Sent load command to worker")
def get_nns(annoyindex_tempfile, img_list):
    # Calculates the nearest neighbors of the master item
    t = AnnoyIndex(DIMS, metric='angular')
    t.load(annoyindex_tempfile)
    list_of_thumb_nearest_neighbors = []
    for item in img_list:
        nearest_neighbors = t.get_nns_by_item(item, n_nearest_neighbors)
        thumb_nearest_neighbors = []
        for j in nearest_neighbors:
            thumb_nearest_neighbors.append(j)
        list_of_thumb_nearest_neighbors.append(thumb_nearest_neighbors)
    t.unload()
    return list_of_thumb_nearest_neighbors
def build_empty_index(self) -> None:
    """
    Build empty index - used when building the index after deleting takes time.

    :return: None
    """
    t = AnnoyIndex(self.f_dim)
    trees_num = 10  # Number of trees in the index
    t.build(trees_num)
    t.save(os.path.join(self.organization_dir,
                        "lshforest_" + self.embedding_type + ".ann"))
    print('organization ann has been created')
    t.unload()
def retrieve_and_build(self):
    """
    Build empty index - used when building the index after deleting takes time.

    :return: None
    """
    t = AnnoyIndex(self.f_dim)
    trees_num = 10  # Number of trees in the index
    t.build(trees_num)
    t.save(os.path.join(self.organization_dir, "lshforest.ann"))
    t.unload()
def test_dense_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
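# The test above relies on a build_annoy_index helper that is not shown
# here. A minimal sketch of what such a helper presumably does, with the
# signature inferred from the test (the real implementation may differ):
from annoy import AnnoyIndex

def build_annoy_index(data, index_file, n_trees=10):
    index = AnnoyIndex(data.shape[1], metric='angular')
    for i, row in enumerate(data):
        index.add_item(i, row)
    index.build(n_trees)
    index.save(index_file)
    return index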
def annoy_model(als_model, sc, groundTruth_test, test_users, n_trees=10, search_k=-1):
    print(f"annoy model with n_trees: {n_trees}, search_k: {search_k}")
    sc = SparkContext.getOrCreate()
    user_factors = als_model.userFactors
    size = user_factors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    index_size = AnnoyIndex(size)
    for row in user_factors.collect():
        index_size.add_item(row.id, row.features)
    index_size.build(n_trees)
    index_size.save("./annoy_result/annoy_t" + str(n_trees) + "_k_" +
                    str(search_k) + ".ann")
    rec_list = [(user.user_id, index_size.get_nns_by_item(int(user.user_id), 500))
                for user in test_users.collect()]
    temp = sc.parallelize(rec_list)
    print("Annoy-Recommendations (500) created for test users")
    rec = spark.createDataFrame(temp, ["user_id", "recommendations"])
    pred_test = rec.join(groundTruth_test,
                         rec.user_id == groundTruth_test.user_id, 'inner')
    predAndLabels_test_annoy = pred_test.select('recommendations',
                                                'test_truth').rdd.map(tuple)
    metrics_test_annoy = RankingMetrics(predAndLabels_test_annoy)
    precision_test_annoy = metrics_test_annoy.precisionAt(500)
    map_test_annoy = metrics_test_annoy.meanAveragePrecision
    print(f"Time taken: {time() - start_time}s")
    print(f"Precision at 500: {precision_test_annoy}")
    print(f"Mean Average Precision: {map_test_annoy}")
    index_size.unload()
def test_on_disk(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.on_disk_build('test.ann')
    i.add_item(0, [2, 2])
    i.add_item(1, [3, 2])
    i.add_item(2, [3, 3])
    i.build(10)
    i.unload()
    i.load('test.ann')
    self.assertEqual(i.get_nns_by_vector([4, 4], 3), [2, 1, 0])
    self.assertEqual(i.get_nns_by_vector([1, 1], 3), [0, 1, 2])
    self.assertEqual(i.get_nns_by_vector([4, 2], 3), [1, 2, 0])
def build_index(embedding_files_pattern, index_filename, num_trees=100):
    annoy_index = AnnoyIndex(VECTOR_LENGTH, metric=METRIC)
    mapping = {}

    embed_files = tf.gfile.Glob(embedding_files_pattern)[:250]
    logging.info('{} embedding files are found.'.format(len(embed_files)))

    item_counter = 0
    for f, embed_file in enumerate(embed_files):
        logging.info('Loading embeddings in file {} of {}...'.format(
            f, len(embed_files)))
        record_iterator = tf.python_io.tf_record_iterator(path=embed_file)
        for string_record in record_iterator:
            example = tf.train.Example()
            example.ParseFromString(string_record)
            string_identifier = example.features.feature['id'].bytes_list.value[0]
            mapping[item_counter] = string_identifier
            embedding = np.array(
                example.features.feature['embedding'].float_list.value)
            annoy_index.add_item(item_counter, embedding)
            item_counter += 1
        logging.info('Loaded {} items to the index'.format(item_counter))

    logging.info('Start building the index with {} trees...'.format(num_trees))
    annoy_index.build(n_trees=num_trees)
    logging.info('Index is successfully built.')

    logging.info('Saving index to disk...')
    annoy_index.save(index_filename)
    logging.info('Index is saved to disk.')
    logging.info("Index file size: {} GB".format(
        round(os.path.getsize(index_filename) / float(1024 ** 3), 2)))
    annoy_index.unload()

    logging.info('Saving mapping to disk...')
    with open(index_filename + '.mapping', 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info('Mapping is saved to disk.')
    logging.info("Mapping file size: {} MB".format(
        round(os.path.getsize(index_filename + '.mapping') / float(1024 ** 2), 2)))
class face_annoy:

    def __init__(self):
        self.f = int(face_comm.get_conf('annoy', 'face_vector'))
        self.annoy_index_path = os.path.abspath(
            face_comm.get_conf('annoy', 'index_path'))
        self.lmdb_file = os.path.abspath(
            face_comm.get_conf('lmdb', 'lmdb_path'))
        self.num_trees = int(face_comm.get_conf('annoy', 'num_trees'))

        self.annoy = AnnoyIndex(self.f)
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # Build the annoy index from the lmdb file
    def create_index_from_lmdb(self):
        # Iterate over the database
        lmdb_file = self.lmdb_file
        if os.path.isdir(lmdb_file):
            evn = lmdb.open(lmdb_file)
            wfp = evn.begin()
            annoy = AnnoyIndex(self.f)
            for key, value in wfp.cursor():
                str_list = key.split(',')
                key = int(str_list[0])
                name = str_list[1]
                value = face_comm.str_to_embed(value)
                annoy.add_item(key, value)
            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # Reload the index
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # Find similar entries by face feature vector
    def query_vector(self, face_vector):
        n = int(face_comm.get_conf('annoy', 'num_nn_nearst'))
        return self.annoy.get_nns_by_vector(face_vector, n,
                                            include_distances=True)
def evaluate_set(prefix, tails, annoy_tree_file, vector_dims, lock,
                 rank_threshold=100, sample_size=1000):
    # fname = ''.join(annoy_tree_file)
    lock.acquire()
    try:
        annoy_tree = AnnoyIndex(vector_dims)
        annoy_tree.load(annoy_tree_file)
    finally:
        lock.release()
    # annoy_tree = load_annoy_tree(annoy_tree_file, vector_dims)

    print(mp.current_process().name, id(annoy_tree), prefix.encode('utf-8'))
    sys.stdout.flush()

    counts = {True: 0, False: 0}

    if len(tails) > sample_size:
        tails = random.sample(tails, sample_size)

    for (comp1, tail1), (comp2, tail2) in itertools.combinations(tails, 2):
        diff = np.array(annoy_tree.get_item_vector(comp2)) - np.array(
            annoy_tree.get_item_vector(tail2))
        predicted = np.array(annoy_tree.get_item_vector(tail1)) + diff
        result = annoy_knn(annoy_tree, predicted, comp1, rank_threshold)
        counts[result] += 1

    # unload() takes no arguments; the original passed the file name.
    annoy_tree.unload()

    return (prefix, float(counts[True]) / (counts[True] + counts[False])) \
        if counts[True] + counts[False] > 0 else (prefix, 0.0)
class face_annoy:

    def __init__(self):
        self.f = int(face_comm.get_conf('annoy', 'face_vector'))
        self.annoy_index_path = os.path.abspath(
            face_comm.get_conf('annoy', 'index_path'))
        self.lmdb_file = os.path.abspath(face_comm.get_conf('lmdb', 'lmdb_path'))
        self.num_trees = int(face_comm.get_conf('annoy', 'num_trees'))

        self.annoy = AnnoyIndex(self.f)
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # Build the annoy index from the lmdb file
    def create_index_from_lmdb(self):
        # Iterate over the database
        lmdb_file = self.lmdb_file
        if os.path.isdir(lmdb_file):
            evn = lmdb.open(lmdb_file)
            wfp = evn.begin()
            annoy = AnnoyIndex(self.f)
            for key, value in wfp.cursor():
                key = int(key)
                print(type(value))
                # np.fromstring is deprecated for binary input;
                # np.frombuffer reads the same raw bytes.
                value = np.frombuffer(value, dtype=np.float32)
                print(value.shape)
                annoy.add_item(key, value)
            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # Reload the index
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # Find similar entries by face feature vector
    def query_vector(self, face_vector):
        n = int(face_comm.get_conf('annoy', 'num_nn_nearst'))
        print(face_vector.shape)
        return self.annoy.get_nns_by_vector(face_vector, n, include_distances=True)
class KNNIndex(object):
    annoy = None
    vec_len = -1
    metric = 'euclidean'
    is_loaded = False

    def __init__(self, vec_len, metric='euclidean', index_file=None):
        self.vec_len = vec_len
        self.metric = metric
        self.annoy = AnnoyIndex(self.vec_len, self.metric)
        if index_file:
            self.load(index_file)

    def get_nns_by_item(self, i, n, search_k=-1, include_distances=False):
        if self.is_loaded:
            return self.annoy.get_nns_by_item(i, n, search_k, include_distances)
        else:
            raise RuntimeError("Annoy index file is not loaded!")

    def get_nns_by_vector(self, v, n, search_k=-1, include_distances=False,
                          n_propagation=0):
        if self.is_loaded:
            return self.annoy.get_nns_by_vector(v, n, search_k, include_distances)
        else:
            raise RuntimeError("Annoy index file is not loaded!")

    def load(self, index_file):
        self.annoy.load(index_file)
        self.is_loaded = True

    def unload(self):
        self.annoy.unload()
        self.is_loaded = False
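# A short usage sketch for KNNIndex above. It assumes "vectors.ann" was
# built elsewhere with matching dimensionality and metric; the 128-d query
# is illustrative.
knn = KNNIndex(vec_len=128, metric='euclidean', index_file='vectors.ann')
ids, dists = knn.get_nns_by_vector([0.0] * 128, n=10, include_distances=True)
knn.unload()
# After unload(), further queries raise RuntimeError until load() is called again.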
def searchNNSentence(self, target, cascading=False):
    annoy = AnnoyIndex(300)
    annoy.load(self.SENTENCE_ANN)
    self_annoy = AnnoyIndex(300)
    self_annoy.load(self.SELF_SENTENCE_ANN)

    # normal search
    target_vec = self.getSentenceVec(target)
    _vecs = annoy.get_nns_by_vector(target_vec, 50)
    _self_vecs = self_annoy.get_nns_by_vector(target_vec, 50)
    _vecs = self.NNfilter(annoy, self_annoy, _vecs, _self_vecs,
                          target_vec, cascading)

    _replys = []
    for i in Sentence.select().where(Sentence.id.in_(_vecs)):
        for reply in i.replys:
            if len(reply.reply) < 1:
                continue
            _replys.append(reply.reply)

    annoy.unload()
    self_annoy.unload()

    if len(_replys) > 0:
        return _replys
    return None
class face_annoy:

    def __init__(self):
        self.f = 512
        self.annoy_index_path = os.path.abspath(
            os.path.expanduser('~') + "/acs/data/face_vector.nn")
        self.num_trees = 100

        self.annoy = AnnoyIndex(self.f)
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # Build the annoy index from the embeddings stored in the database
    def create_index_from_lmdb(self):
        # Iterate over the rows
        # lmdb_file = self.lmdb_file
        rows = dbsql.getallem()
        if len(rows) > 0:
            annoy = AnnoyIndex(self.f)
            for row in rows:
                key = row[0]
                value = str2embed(row[1])
                annoy.add_item(key, value)
            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # Reload the index
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # Find the most similar face by feature vector
    def query_vector(self, face_vector):
        n = 1
        return self.annoy.get_nns_by_vector(face_vector, n, include_distances=True)
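# Usage sketch for this face_annoy variant. It assumes dbsql.getallem()
# returns (id, embedding-string) rows and that str2embed decodes them to
# 512-d vectors; the zero query vector is illustrative.
fa = face_annoy()
fa.create_index_from_lmdb()  # builds ~/acs/data/face_vector.nn from the DB rows
fa.reload()                  # swap in the freshly written index file
ids, distances = fa.query_vector([0.0] * 512)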
def merge_indicies(self, index_file_a, index_file_b, sender_urn):
    logger.info("Merging {0} and {1} for {2} index".format(
        index_file_a, index_file_b, sender_urn))
    index_a = AnnoyIndex(self.feat_size, metric='euclidean')
    index_b = AnnoyIndex(self.feat_size, metric='euclidean')
    new_index = AnnoyIndex(self.feat_size, metric='euclidean')
    index_a.load(index_file_a)
    index_b.load(index_file_b)

    cnt = 0
    for i in range(index_a.get_n_items()):
        new_index.add_item(cnt, index_a.get_item_vector(i))
        cnt += 1
    for i in range(index_b.get_n_items()):
        new_index.add_item(cnt, index_b.get_item_vector(i))
        cnt += 1

    new_index_file = index_file_a + ".merged"
    index_a.unload()
    index_b.unload()
    new_index.build(self.n_trees)
    new_index.save(new_index_file)
    logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
        index_file_a, index_file_b, sender_urn, cnt))
    new_index.unload()
    pykka.ActorRegistry.get_by_urn(sender_urn).proxy().complete_compaction(
        new_index_file=new_index_file,
        index_file_a=index_file_a,
        index_file_b=index_file_b
    )
class Index:
    """Procedures over multidimensional spaces."""

    def __init__(self, size, data_dir=None, trees=.001, volatile=False):
        """
        Indexing tensors operations and nearest neighbours search.

        Parameters
        ----------
        size: int
            Shape of unidimensional vectors which will be indexed
        data_dir: str
            Location where to load or save the index
        trees (optional): float
            Defines the number of trees to create based on the dataset
            size. Should be a number between 0 and 1.
        volatile (optional): bool
            If the index will be temporary or not.
        """
        self._position = -1
        self._size = size
        self._data_dir = data_dir
        self._trees = trees
        self._volatile = volatile

        if self._data_dir and not self._volatile:
            if os.path.isfile(self._data_dir):
                raise OSError('data_dir parameter is not a directory')
            os.makedirs(self._data_dir, exist_ok=True)
            self._path = os.path.join(self._data_dir, self.index_name)
        elif not self._data_dir and not self._volatile:
            raise NoDataDirForPermanentIndex
        elif not self._data_dir and self._volatile:
            _temp_file = FileIO.safe_temp_file()
            self._data_dir = os.path.dirname(_temp_file)
            self._path = _temp_file
        else:
            raise DataDirDefinedForVolatileIndex

        if os.path.isfile(self._path):
            try:
                self.tree = AnnoyIndex(size, metric='angular')
                self.tree.load(self._path)
                self._is_new_index = False
            except OSError as os_error:
                raise FileIsNotAnIndex from os_error
        else:
            self.tree = AnnoyIndex(size, metric='angular')
            self._is_new_index = True

        self._image_database = ImageDatabase(
            import_images=True,
            data_dir=self._data_dir,
        )

    @property
    def size(self):
        """Getter for property size."""
        return self._size

    @property
    def path(self):
        """Getter for property path."""
        return self._path

    @property
    def index_name(self):
        """Getter for property index_name."""
        return 'pupyl.index'

    @property
    def trees(self):
        """Getter for property trees."""
        return self._trees

    @property
    def volatile(self):
        """Getter for property volatile."""
        return self._volatile

    @trees.setter
    def trees(self, trees):
        """Setter for property trees."""
        self._trees = trees

    def __enter__(self):
        """Context opening index."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context closing index."""
        if not exc_type:
            if self._is_new_index:
                self.tree.build(self.size << intmul >> self.trees)
                self.tree.save(self.path)
            self.tree.unload()

    def items(self):
        """Return the indexed items."""
        for item in range(len(self)):
            yield item

    def values(self):
        """Return the indexed values."""
        for item in self.items():
            yield self.tree.get_item_vector(item)

    def items_values(self):
        """Return tuples with all items and values."""
        for item, value in zip(self.items(), self.values()):
            yield item, value

    def __getitem__(self, position):
        """Return item at index. Supports negative slicing."""
        if position >= 0:
            return self.tree.get_item_vector(position)

        return self.tree.get_item_vector(len(self) - abs(position))

    def refresh(self):
        """Update all information regarding index file."""
        self.tree.unload()
        self.tree.load(self.path)

    def append(self, tensor, check_unique=False):
        """
        Insert a new tensor at the end of the index.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to insert into index.
        check_unique (optional, default: False): bool
            Defines if append method should verify the existence of a
            really similar tensor on the current index. In other words,
            it checks for the uniqueness of the value. Be advised that
            this check creates an overhead on the append process.
        """
        if sum(tensor) == 0.:
            raise NullTensorError

        if self._is_new_index:
            index_it = True

            if check_unique and len(self) > 1:
                self.tree.build(self.size << intmul >> self.trees)
                result = self.item(self.index(tensor), top=1, distances=True)
                if result[1][0] <= .05:
                    warning(
                        'Tensor being indexed already exists in '
                        'the database and the check for duplicates '
                        'is on. Refusing to store this tensor again.'
                    )
                    index_it = False
                self.tree.unbuild()

            if index_it:
                self.tree.add_item(len(self), tensor)
        else:
            with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
                for value in self.values():
                    tmp_idx.append(value, check_unique)
                tmp_idx.append(tensor, check_unique)
                _temp_file = tmp_idx.path
            move(_temp_file, self.path)
            self.refresh()

    def remove(self, position):
        """
        Remove the tensor at index from the database.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position: int
            The index which must be removed
        """
        if self._is_new_index:
            raise IndexNotBuildYet
        if position > len(self):
            raise IndexError

        with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
            shrink = False
            for item, value in self.items_values():
                if item == position:
                    shrink = True
                else:
                    if shrink:
                        item -= 1
                    tmp_idx.tree.add_item(item, value)
            _temp_file = tmp_idx.path
        move(_temp_file, self.path)
        self.refresh()

    def pop(self, position=None):
        """
        Pop-out the index at position, returning it.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position (optional) (default: last position): int
            Removes and returns the value at position.

        Returns
        ----------
        int:
            With the popped item.
        """
        if position:
            value = self[position]
        else:
            inverse_index = -1
            value = self[inverse_index]
            position = len(self) + inverse_index

        self.remove(position)
        return value

    def index(self, tensor):
        """
        Search for the first most similar image compared to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar.

        Returns
        ----------
        int:
            Describing the most similar resulting index.
        """
        return self.tree.get_nns_by_vector(tensor, n=1)[0]

    def item(self, position, top=10, distances=False):
        """
        Search the index using an internal position.

        Parameters
        ----------
        position: int
            The item id within index.
        top (optional, default 10): int
            How many similar items should be returned.
        distances (optional, default False): bool
            If the distances between items should also be returned.

        Returns
        -------
        if distances is True:
            list of tuples: Containing pairs of item and distances
        else:
            list: Containing similar items.
        """
        return self.tree.get_nns_by_item(position, top,
                                         include_distances=distances)

    def search(self, tensor, results=16):
        """
        Search for the most similar images compared to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar images.
        results: int
            How many results to return. If similar images are fewer
            than results, the search exhausts and the actual total is
            returned.
        """
        for result in self.tree.get_nns_by_vector(tensor, n=results):
            yield result

    def __len__(self):
        """Return how many items are indexed."""
        return self.tree.get_n_items()

    def __iter__(self):
        """Return an iterable."""
        for value in self.values():
            yield value

    def __next__(self):
        """Iterate over the iterable."""
        self._position += 1
        all_values = list(self.values())
        if self._position < len(all_values):
            return all_values[self._position]
        raise StopIteration

    def group_by(self, top=10, **kwargs):
        """
        Return all (or some position) on the index that are similar
        to other elements inside the index.

        Parameters
        ----------
        top (optional, default 10): int
            How many similar internal images should be returned
        position (optional): int
            Returns the groups based on a specified position.

        Returns
        -------
        list: If a position is defined
        or
        dict: Generator with a dictionary containing internal ids as key
            and a list of similar images as values.
        """
        position = kwargs.get('position')

        if len(self) <= 1:
            raise EmptyIndexError

        if top >= 1:
            if isinstance(position, int):
                results = self.item(position, top + 1)
                if len(results) > 1:
                    yield results[1:]
            else:
                for item in self.items():
                    yield {item: self.item(item, top + 1)[1:]}
        else:
            raise TopNegativeOrZero

    def export_by_group_by(self, path, top=10, **kwargs):
        """
        Save images, creating directories, based on their groups.

        Parameters
        ----------
        path: str
            Place to create the directories and export images
        top (optional, default 10):
            How many similar internal images should be returned
        position (optional): int
            Returns the groups based on a specified position.
        """
        for element in FileIO.progress(
                self.group_by(top=top, position=kwargs.get('position'))):
            if isinstance(element, dict):
                item = [*element.keys()][0]
                similars = element[item]
            elif isinstance(element, list):
                item = kwargs['position']
                similars = element

            save_path = os.path.join(path, str(item))
            os.makedirs(save_path, exist_ok=True)

            try:
                copyfile(
                    self._image_database.mount_file_name(item, 'jpg'),
                    os.path.join(save_path, 'group.jpg')
                )
            except FileNotFoundError:
                continue

            for rank, similar in enumerate(similars):
                original_file_path = self._image_database.mount_file_name(
                    similar, 'jpg')
                try:
                    copyfile(
                        original_file_path,
                        os.path.join(save_path, f'{rank + 1}.jpg')
                    )
                except FileNotFoundError:
                    continue
def test_load_unload(self):
    # Issue #108
    i = AnnoyIndex(10)
    for x in range(100000):
        i.load('test/test.tree')
        i.unload()
class AnnoySearch:

    def __init__(self, vec_dim=2048, lmdb_file="static/lmdb",
                 ann_file="static/annoy_file/tree.ann",
                 metric='angular', num_trees=10):
        self.vec_dim = vec_dim  # dimensionality of the vectors to index
        # the metric can be "angular", "euclidean", "manhattan", "hamming" or "dot"
        self.metric = metric
        self.annoy_instance = AnnoyIndex(self.vec_dim, self.metric)
        self.lmdb_file = lmdb_file
        self.ann_file = ann_file
        self.num_trees = num_trees
        self.logger = logging.getLogger('AnnoySearch')

    def save_annoy(self):
        self.annoy_instance.save(self.ann_file)
        self.logger.info('save annoy SUCCESS !')

    def unload_annoy(self):
        self.annoy_instance.unload()

    def load_annoy(self):
        try:
            self.annoy_instance.unload()
            self.annoy_instance.load(self.ann_file)
            self.logger.info('load annoy SUCCESS !')
        except FileNotFoundError:
            self.logger.error(
                'annoy file DOES NOT EXIST , load annoy FAILURE !',
                exc_info=True)

    # Build the annoy index from the lmdb file
    def create_index_from_lmdb(self):
        # Iterate over the database
        lmdb_file = self.lmdb_file
        if os.path.isdir(lmdb_file):
            evn = lmdb.open(lmdb_file)
            wfp = evn.begin()
            for key, value in wfp.cursor():
                key = int(key)
                value = str2embed(value)
                print(len(value))
                self.annoy_instance.add_item(key, value)
            self.annoy_instance.build(self.num_trees)
            self.annoy_instance.save(self.ann_file)

    def build_annoy(self):
        self.annoy_instance.build(self.num_trees)

    def get_nns_by_item(self, index, nn_num, search_k=-1, include_distances=False):
        return self.annoy_instance.get_nns_by_item(index, nn_num, search_k,
                                                   include_distances)

    def get_nns_by_vector(self, vec, nn_num, search_k=-1, include_distances=False):
        return self.annoy_instance.get_nns_by_vector(vec, nn_num, search_k,
                                                     include_distances)

    def get_n_items(self):
        return self.annoy_instance.get_n_items()

    def get_n_trees(self):
        return self.annoy_instance.get_n_trees()

    def get_vec_dim(self):
        return self.vec_dim

    def add_item(self, index, vec):
        self.annoy_instance.add_item(index, vec)

    def get_item_vector(self, index):
        return self.annoy_instance.get_item_vector(index)
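# Minimal round trip with the lmdb-backed AnnoySearch above, using the class
# defaults (2048-d vectors, static/lmdb, static/annoy_file/tree.ann).
# str2embed is assumed to decode the values stored in lmdb.
searcher = AnnoySearch()
searcher.create_index_from_lmdb()   # reads static/lmdb and writes tree.ann
searcher.load_annoy()               # mmap the saved index file
print(searcher.get_nns_by_item(0, 5, include_distances=True))
searcher.unload_annoy()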
def expertDocsSort(self, expertId, txt, topN):
    def enc(value):
        # Encode non-None text fields as UTF-8 (the original repeated
        # this None check field by field).
        return value.encode('utf8') if value is not None else value

    vec = self.t2v.text2v(txt, self.cuttor)
    db = DB()

    # Papers: distance of each paper to the query text.
    annoy = AnnoyIndex(200)
    count = 0
    annoy.add_item(count, vec)
    count += 1
    papers = db.getPapers(expertId)
    for p in papers:
        p[3] = self.t2v.text2v(p[1] + p[2], self.cuttor)
        if p[3] is not None:
            annoy.add_item(count, p[3])
            p[3] = annoy.get_distance(0, count)
            count += 1
    papers = sorted(papers, key=lambda p: p[3])
    papersFormated = []
    for p in papers[:topN]:
        papersFormated.append({
            'paperId': enc(p[0]),
            'name': enc(p[1]),
            'authors': enc(p[4]),
            'journalName': enc(p[5]),
            'year': enc(p[6]),
        })
    annoy.unload()

    # Patents. unload() does not clear items that were added in memory,
    # so a fresh index is created instead of reusing the previous one.
    annoy = AnnoyIndex(200)
    count = 0
    annoy.add_item(count, vec)
    count += 1
    patents = db.getPatents(expertId)
    for p in patents:
        p[3] = self.t2v.text2v(p[1] + p[2], self.cuttor)
        if p[3] is not None:
            annoy.add_item(count, p[3])
            p[3] = annoy.get_distance(0, count)
            count += 1
    patents = sorted(patents, key=lambda p: p[3])
    patentsFormated = []
    for p in patents[:topN]:
        patentsFormated.append({
            'patentId': enc(p[0]),
            'publicationNo': enc(p[4]),
            'name': enc(p[1]),
            'inventors': enc(p[5]),
            'applicant': enc(p[6]),
            'year': enc(p[7]),
        })
    annoy.unload()

    # Projects, same pattern.
    annoy = AnnoyIndex(200)
    count = 0
    annoy.add_item(count, vec)
    count += 1
    projects = db.getProjects(expertId)
    for p in projects:
        p[3] = self.t2v.text2v(p[1] + p[2], self.cuttor)
        if p[3] is not None:
            annoy.add_item(count, p[3])
            p[3] = annoy.get_distance(0, count)
            count += 1
    projects = sorted(projects, key=lambda p: p[3])
    projectsFormated = []
    for p in projects[:topN]:
        projectsFormated.append({
            'projectId': enc(p[0]),
            'name': enc(p[1]),
            'member': enc(p[4]),
            'unit': enc(p[5]),
            'year': enc(p[6]),
            'type': enc(p[7]),
        })
    annoy.unload()

    result = {}
    result['papers'] = papersFormated
    result['patents'] = patentsFormated
    result['projects'] = projectsFormated
    return result
class Annoy(VectorIndex):

    def __init__(self, path, dims=None, metric='angular', build_on_disk=True):
        self.path = path
        self.is_mutable = None
        self.is_built = None
        self.build_on_disk = build_on_disk
        self.metric = metric

        if os.path.isfile(self.path):
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            assert self.dims == dims or not dims, \
                'Passed path to existing index but dims do not match'
            assert self.metric == metric or not metric, \
                'Passed path to existing index but metrics do not match'
            self.index = AnnoyIndex(self.dims, metric=self.metric)
        elif dims:
            logging.debug(
                f'Creating new index with {dims} dimensions and {self.metric} metric')
            self.dims = dims
            self.index = AnnoyIndex(self.dims, metric=self.metric)
            if build_on_disk:
                self.index.on_disk_build(self.path)
        else:
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            self.index = AnnoyIndex(self.dims, metric=self.metric)

    @property
    def meta_path(self):
        return self.path + '.meta.json'

    @property
    def files(self):
        return [self.path, self.meta_path]

    def load_meta(self):
        self.__dict__.update(load_json(self.meta_path))

    def save_meta(self):
        d = {**self.__dict__}
        d.pop('index')
        save_json(d, self.meta_path)

    def build(self, num_trees=10):
        logging.debug(f'staring to build index: {self.path}')
        self.index.build(num_trees)
        logging.debug(f'finished building index: {self.path}')
        self.is_mutable = False
        self.is_built = True
        self.save_meta()

    def save(self):
        self.index.save(self.path)
        self.is_mutable = False
        self.save_meta()

    def load(self, memory=False):
        self.index.load(self.path, prefault=memory)
        self.is_mutable = False

    def unload(self):
        self.index.unload()

    def __del__(self):
        self.unload()

    def __setitem__(self, idx, vector):
        self.index.add_item(idx, vector)

    def __getitem__(self, idx):
        return self.index.get_item_vector(idx)

    def __len__(self):
        return self.index.get_n_items()

    def add(self, vector):
        idx = len(self)
        self[idx] = vector
        return idx

    def add_bulk(self, vectors):
        start = len(self)
        for n, v in enumerate(vectors):
            self[start + n] = v
        return self

    def set_bulk(self, indices, vectors):
        for idx, vector in zip(indices, vectors):
            self[idx] = vector

    def search(self, vector, num=10, depth=None, distances=True):
        return self.index.get_nns_by_vector(vector, num, depth or -1, distances)

    def search_index(self, idx, num=10, depth=None, distances=True):
        return self.index.get_nns_by_item(idx, num, depth or -1, distances)

    def distance(self, i, j):
        return self.index.get_distance(i, j)
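# Usage sketch for the Annoy wrapper above. "vectors.ann" and the 64-d
# vectors are illustrative; VectorIndex, load_json and save_json come from
# the host project, so this assumes they are importable alongside the class.
idx = Annoy('vectors.ann', dims=64)
idx.add([0.1] * 64)                                 # single insert, returns the new id
idx.add_bulk([[float(i)] * 64 for i in range(10)])
idx.build(num_trees=10)                             # freezes the index and writes metadata
items, dists = idx.search([0.1] * 64, num=5)
idx.unload()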
# `tree` is assumed to be an AnnoyIndex created elsewhere with the
# dimensionality that test_tree.ann was built with.
tree.load("test_tree.ann")
raw_input("<Enter> to load 10,000 vectors.")
q = True
while q:
    for i in xrange(10000):
        tree.get_item_vector(i)
    resp = raw_input("<Enter> to load 10,000 vectors.")
    if resp.strip() == "q":
        q = False

raw_input("<Enter> to unload tree.")
# unload() takes no arguments; the original passed the file name.
tree.unload()
raw_input("done.")

tree.load("test_tree.ann")
raw_input("<Enter> to load 10,000 vectors.")
q = True
while q:
    for i in xrange(10000):
        tree.get_item_vector(i)
    resp = raw_input("<Enter> to load 10,000 vectors.")
    if resp.strip() == "q":
        q = False
class ProcessAnnoyWorkerThread(Thread):
    """Worker Thread Class."""

    def __init__(self, notify_window, imagedata):
        """Init Worker Thread Class."""
        Thread.__init__(self)
        # Configuring annoy parameters
        self._n_nearest_neighbors = 10
        self._trees = 10000
        self._t = AnnoyIndex(DIMS, metric='angular')
        self._notify_window = notify_window
        self._imagedata = imagedata
        self._want_abort = 0
        # This starts the thread running on creation, but you could
        # also make the GUI thread responsible for calling this
        self.start()

    def run(self):
        """Run Worker Thread."""
        # Definition of module with using tfhub.dev handle
        logging.error("Start loading mobilenet...")
        module_handle = "https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/feature_vector/4"
        # module_handle = "4/"

        # Load the module
        import tensorflow_hub as hub
        module = hub.load(module_handle)

        starttime = datetime.now()

        ################################################
        logging.error("Start processing and adding images to AnnoyIndex...")
        for file_index, item in enumerate(list(self._imagedata.getKeys())):
            # Loads and pre-process the image
            btmp = self._imagedata.getThumbnail(item)
            # pil_img = wx2PIL(wx_img)
            # np.fromstring is deprecated for binary input; np.frombuffer
            # reads the same raw bytes.
            image_array = np.frombuffer(bytes(btmp), dtype=np.uint8).reshape(
                (THUMBNAIL_MAX_SIZE, THUMBNAIL_MAX_SIZE, 3))
            # image_array = tf.keras.preprocessing.image.img_to_array(pil_img)
            image_array = tf.convert_to_tensor(image_array)
            tf_image_array = tf.image.convert_image_dtype(
                image_array, tf.float32)[tf.newaxis, ...]

            # Calculate the image feature vector of the img
            try:
                features = module(tf_image_array)
            except ValueError as err:
                logging.error("Image too small for feature processing " + item)
                continue

            # Remove single-dimensional entries from the 'features' array
            feature_set = np.squeeze(features)

            # Adds image feature vectors into annoy index
            self._t.add_item(file_index, feature_set)
            # self._imagedata.addFeatureSetAnnoy(filename, feature_set)
            wx.PostEvent(
                self._notify_window,
                ResultEvent((file_index / self._imagedata.getSize()) * 0.5,
                            EVT_RESULT_PROGRESS))

        # Builds annoy index
        logging.error("Building trees...")
        self._t.build(self._trees, n_jobs=-1)
        # save annoy index to file for multiprocessing
        self._t.save(self._imagedata.getAnnoyIndexTempFile())
        wx.PostEvent(self._notify_window, ResultEvent(0.75, EVT_RESULT_PROGRESS))

        # Loops through all indexed items
        returndata = []
        logging.error("Fetching nearest neighbors...")
        img_list_of_lists = []
        img_list = []
        window_size = math.floor(len(self._imagedata.getKeys()) / cpu_count())
        i = 1
        for file_index, filename in enumerate(list(self._imagedata.getKeys())):
            if file_index < (window_size * i):
                img_list.append(file_index)
            else:
                img_list_of_lists.append(img_list)
                img_list = []
                i = i + 1
                img_list.append(file_index)
        img_list_of_lists.append(img_list)

        # read files separately using multithreaded pool
        pool = Pool(cpu_count())
        func = partial(get_nns, self._imagedata.getAnnoyIndexTempFile())
        load_results = pool.map_async(func, img_list_of_lists)
        pool.close()  # 'TERM'

        # maintain status gauge here
        while True:
            if load_results.ready() or self._want_abort == 1:
                break
            time.sleep(0.3)
            remaining = min(
                load_results._number_left * load_results._chunksize,
                len(img_list_of_lists))
            # print("Waiting for", remaining, "tasks to complete...")
            wx.PostEvent(
                self._notify_window,
                ResultEvent(
                    0.75 + ((len(img_list_of_lists) - remaining) /
                            len(img_list_of_lists)) * 0.25,
                    EVT_RESULT_PROGRESS))

        if self._want_abort == 1:
            pool.terminate()
            pool.join()  # 'KILL'

        self._t.unload()

        for list_of_thumb_nearest_neighbors in load_results.get():
            for thumb_nearest_neighbors in list_of_thumb_nearest_neighbors:
                names_list = []
                for item in thumb_nearest_neighbors:
                    name = list(self._imagedata.getKeys())[item]
                    names_list.append(name)
                returndata.append(names_list)
                # wx.PostEvent(self._notify_window,
                #     ResultEvent((file_index / self._imagedata.getSize()) * 0.25 + 0.75, EVT_RESULT_PROGRESS))

        wx.PostEvent(self._notify_window, ResultEvent(returndata, EVT_RESULT_NEIGHBORS))
        wx.PostEvent(self._notify_window, ResultEvent(None, EVT_RESULT_NEIGHBORS))
        wx.PostEvent(self._notify_window, ResultEvent(0.0, EVT_RESULT_PROGRESS))
        ################################################

        stoptime = datetime.now()
        logging.error("ProcessAnnoyWorkerThread took " + str(stoptime - starttime)
                      + " to process " + str(len(returndata)) + " image files")
class AnnoySearch:

    def __init__(self, vec_dim=100, metric='angular'):
        self.vec_dim = vec_dim  # dimensionality of the vectors to index
        # the metric can be "angular", "euclidean", "manhattan", "hamming" or "dot"
        self.metric = metric
        self.annoy_instance = AnnoyIndex(self.vec_dim, self.metric)
        self.logger = logging.getLogger('AnnoySearch')

    def save_annoy(self, annoy_file, prefault=False):
        self.annoy_instance.save(annoy_file, prefault=prefault)
        self.logger.info('save annoy SUCCESS !')

    def unload_annoy(self):
        self.annoy_instance.unload()

    def load_annoy(self, annoy_file, prefault=False):
        try:
            self.annoy_instance.unload()
            self.annoy_instance.load(annoy_file, prefault=prefault)
            self.logger.info('load annoy SUCCESS !')
        except FileNotFoundError:
            self.logger.error(
                'annoy file DOES NOT EXIST , load annoy FAILURE !',
                exc_info=True)

    # Build the annoy index
    def build_annoy(self, n_trees):
        self.annoy_instance.build(n_trees)

    # Query nearest neighbours by item index
    def get_nns_by_item(self, index, nn_num, search_k=-1, include_distances=False):
        return self.annoy_instance.get_nns_by_item(index, nn_num, search_k,
                                                   include_distances)

    # Query nearest neighbours by vector
    def get_nns_by_vector(self, vec, nn_num, search_k=-1, include_distances=False):
        return self.annoy_instance.get_nns_by_vector(vec, nn_num, search_k,
                                                     include_distances)

    def get_n_items(self):
        return self.annoy_instance.get_n_items()

    def get_n_trees(self):
        return self.annoy_instance.get_n_trees()

    def get_vec_dim(self):
        return self.vec_dim

    # Add an item
    def add_item(self, index, vec):
        self.annoy_instance.add_item(index, vec)

    def get_item_vector(self, index):
        return self.annoy_instance.get_item_vector(index)
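# A small end-to-end run of the in-memory AnnoySearch above; the random
# vectors and the "demo.ann" file name are illustrative assumptions.
import random

searcher = AnnoySearch(vec_dim=100)
for i in range(1000):
    searcher.add_item(i, [random.random() for _ in range(100)])
searcher.build_annoy(n_trees=10)
searcher.save_annoy('demo.ann')
searcher.load_annoy('demo.ann')     # reload the saved copy (mmap)
print(searcher.get_nns_by_vector([0.5] * 100, 5, include_distances=True))
searcher.unload_annoy()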
for i in locn:
    print('annoy')
    counter = 0
    f = 360 + 512
    t = AnnoyIndex(f, metric='angular')
    feat = loc + i
    feat2 = loc2 + i
    annl = i.split('.')[0]
    feat1 = pd.read_csv(feat, sep=',', header=None)
    feat1 = np.asarray(feat1)
    feat3 = pd.read_csv(feat2, sep=',', header=None)
    feat3 = np.asarray(feat3)
    hogx, hogy = feat3.shape
    feat1 = feat1[:hogx, :]
    print('shapes are', feat1.shape, feat3.shape)
    feat1 = np.concatenate((feat1, feat3), axis=1)
    print(feat1.shape)
    row, col = feat1.shape
    feat1 = feat1[:, :col - 2]
    for count2 in range(row):
        count = feat1[count2, :]
        t.add_item(counter, count.astype(np.int32))
        counter = counter + 1
    t.build(100)
    t.save(save_loc + annl + '.ann')
    t.unload()
    del t
    print("Done with " + str(i))
class EmbeddingsIndex:

    def __init__(self, organization_dir, logger, embedding_type, load=False):
        """
        :param organization_dir: str - path to resources
        :param logger: logging object
        :param load: bool - True if used for generating content
        """
        self.organization_dir = organization_dir
        self.logger = logger
        self.embedding_type = embedding_type
        if 'use' in self.embedding_type:
            self.f_dim = 1024
        elif 'bert' in self.embedding_type:
            self.f_dim = 1536
        if load:
            self.logger.debug("Loading model.")
            self.t = AnnoyIndex(self.f_dim)
            self.t.load(os.path.join(self.organization_dir,
                                     "lshforest_" + self.embedding_type + ".ann"))
            self.logger.debug("Loading ids.")
            with h5py.File(os.path.join(self.organization_dir,
                                        "data_" + self.embedding_type + ".hdf5"),
                           "r") as f:
                self.ids = numpy.array(f["id"])
            if len(self.ids) != self.t.get_n_items():
                raise EmbeddingsDbNotSynced

    def build_index(self) -> None:
        """
        Load embeddings and build a new index. Run it as a daemon because
        it might take more time with huge content databases.

        :return: None
        """
        self.logger.debug("Starting recommendation model building daemon.")
        self.retrieve_and_build()
        # t = Thread(target=self.retrieve_and_build, daemon=True)
        # t.start()

    def build_empty_index(self) -> None:
        """
        Build empty index - used when building the index after deleting takes time.

        :return: None
        """
        t = AnnoyIndex(self.f_dim)
        trees_num = 10  # Number of trees in the index
        t.build(trees_num)
        t.save(os.path.join(self.organization_dir,
                            "lshforest_" + self.embedding_type + ".ann"))
        print('organization ann has been created')
        t.unload()

    # NOTE: retrieve_and_build is defined twice; this first, empty-index
    # variant is shadowed by the full definition below.
    def retrieve_and_build(self):
        """
        Build empty index - used when building the index after deleting takes time.

        :return: None
        """
        t = AnnoyIndex(self.f_dim)
        trees_num = 10  # Number of trees in the index
        t.build(trees_num)
        t.save(os.path.join(self.organization_dir, "lshforest.ann"))
        t.unload()

    def retrieve_and_build(self):
        """
        Retrieve embeddings stored in HDF5 files and build and save the graph.

        :return: None
        """
        try:
            embs = self.retrieve_embeddings()
            self.logger.debug("Creating annoy index. Embeddings count: %s.",
                              len(embs))
            t = AnnoyIndex(self.f_dim)
            for i in range(len(embs)):
                t.add_item(i, embs[i])
            trees_num = 10  # Number of trees in the index
            t.build(trees_num)
            self.logger.debug("Building annoy index.")
            t.save(os.path.join(self.organization_dir,
                                "lshforest_" + self.embedding_type + ".ann"))
            self.logger.debug("Saved annoy index.")
            t.unload()
        except Exception:
            message = "Failed to build recommendation model."
            self.logger.exception(message)

    def get_knn(self, vector, k=3):
        """
        Get k closest neighbors to a vector

        :param vector: embedding of a query
        :param k: parameter specifying how many neighbors we want
        :return: dict - dict with results
        """
        self.logger.debug("Getting nearest neighbors.")
        # Get closest neighbors
        ind, dist = self.t.get_nns_by_vector(vector, n=k, include_distances=True)
        ret = {"hits": []}
        if len(self.ids) != self.t.get_n_items():
            raise EmbeddingsDbNotSynced
        for d, i in zip(dist, ind):
            ret["hits"].append({"id": self.ids[i], "score": d})
        self.logger.debug("Nearest neighbors found.")
        return ret

    def retrieve_embeddings(self) -> numpy.ndarray:
        """
        Retrieve embeddings from a pandas dataframe

        :return: numpy.ndarray
        """
        self.logger.debug("Retrieving embeddings.")
        with h5py.File(os.path.join(self.organization_dir,
                                    "data_" + self.embedding_type + ".hdf5"),
                       "r") as f:
            contexts = numpy.array(f["context"])
            contents = numpy.array(f["content"])
        ret = numpy.hstack((contexts, contents))
        self.logger.debug("Retrieved embeddings.")
        return ret

    def unload(self) -> None:
        """
        Unload index from memory

        :return: None
        """
        self.t.unload()
        del self.t
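# Usage sketch for EmbeddingsIndex above. The directory is a placeholder
# and is assumed to already contain data_use.hdf5 with "id", "context" and
# "content" datasets, as the class itself expects.
import logging

logger = logging.getLogger('embeddings')
builder = EmbeddingsIndex('/path/to/org', logger, 'use')
builder.build_index()      # retrieve embeddings, build and save the .ann file

index = EmbeddingsIndex('/path/to/org', logger, 'use', load=True)
hits = index.get_knn([0.0] * 1024, k=5)   # f_dim is 1024 for 'use' embeddings
index.unload()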
# `tree` is assumed to be an AnnoyIndex created elsewhere with the
# dimensionality that test_tree.ann was built with.
tree.load("test_tree.ann")
input("<Enter> to load 10,000 vectors.")
q = True
while q:
    for i in range(10000):
        tree.get_item_vector(i)
    resp = input("<Enter> to load 10,000 vectors.")
    if resp.strip() == "q":
        q = False

input("<Enter> to unload tree.")
# unload() takes no arguments; the original passed the file name.
tree.unload()
input("done.")

tree.load("test_tree.ann")
input("<Enter> to load 10,000 vectors.")
q = True
while q:
    for i in range(10000):
        tree.get_item_vector(i)
    resp = input("<Enter> to load 10,000 vectors.")
    if resp.strip() == "q":
        q = False