def test_build_unbuild(self):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(1000):
        i.add_item(j, [random.gauss(0, 1) for x in xrange(f)])
    i.build(10)
    for j in xrange(100):
        i.unbuild()
        i.build(10)
    self.assertEqual(i.get_n_items(), 1000)
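# A minimal sketch (not part of the original corpus) of the pattern the test
# above exercises: a built Annoy index is frozen, so adding items requires
# unbuild() followed by a fresh build(). Assumes the `annoy` package.
import random

from annoy import AnnoyIndex

f = 10
index = AnnoyIndex(f, 'euclidean')
for j in range(100):
    index.add_item(j, [random.gauss(0, 1) for _ in range(f)])
index.build(10)      # index is now queryable, but frozen
index.unbuild()      # drop the trees, keep the items
index.add_item(100, [random.gauss(0, 1) for _ in range(f)])
index.build(10)      # rebuild with the new item included
assert index.get_n_items() == 101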
from collections import deque

import numpy as np
from annoy import AnnoyIndex

# NOTE: LRUCache is assumed to be provided by the surrounding module.


class DND(object):
    def __init__(self, capacity=100000, key_size=128, cache_size=32, alpha=0.1):
        self.alpha = alpha
        self.capacity = capacity
        self.lru_cache = LRUCache(capacity)
        self.dup_cache = deque(maxlen=cache_size)
        self.index = AnnoyIndex(key_size, metric='euclidean')
        self.keys = np.zeros((capacity, key_size), dtype=np.float32)
        self.values = np.zeros((capacity,), dtype=np.float32)
        self.insert_idx = 0
        self.insertions = 0

    def add(self, key, value):
        if not self.cache_lookup(key, value):
            self.keys[self.insert_idx] = key
            self.values[self.insert_idx] = value
            self.dup_cache.append(key)
            self.index.add_item(self.insert_idx, key)
            # advance insert position to the least-recently-used key
            new_idx = self.lru_cache.update(self.insert_idx)
            if new_idx:
                self.insert_idx = new_idx
            self.insertions += 1
            # rebuilding the index is expensive, so we don't want to do it too often
            if self.insertions % 1000 == 0:
                self.rebuild_index()

    def cache_lookup(self, key, value):
        for i, e in enumerate(self.dup_cache):
            if np.allclose(key, e):
                # NOTE: the original referenced an undefined `self.size` here;
                # `self.insert_idx` is assumed, which holds as long as recent
                # inserts landed in consecutive slots
                idx = self.insert_idx - len(self.dup_cache) + i
                self.values[idx] += self.alpha * (value - self.values[idx])
                return True
        return False

    def rebuild_index(self):
        self.index.unbuild()
        self.index.build(50)

    def query(self, key, k_neighbors=40):
        indices, distances = self.index.get_nns_by_vector(
            key, k_neighbors, include_distances=True)
        for idx in indices:
            self.lru_cache.update(idx)
        return self.values[indices], distances
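# Illustrative sketch (an addition, with hypothetical names): the dedup idea
# behind DND.cache_lookup above, isolated. A small deque of recent keys is
# scanned with np.allclose; a near-duplicate updates its stored value in
# place instead of consuming a new slot.
from collections import deque

import numpy as np

dup_cache = deque(maxlen=4)
values = {}


def add(key, value, alpha=0.1):
    for cached_key in dup_cache:
        if np.allclose(key, cached_key):
            k = tuple(cached_key)
            # move the stored value toward the new one (exponential average)
            values[k] += alpha * (value - values[k])
            return
    dup_cache.append(key)
    values[tuple(key)] = value


add(np.array([1.0, 2.0]), 5.0)
add(np.array([1.0, 2.0]), 7.0)   # near-duplicate: value shifts toward 7.0
print(values[(1.0, 2.0)])        # 5.2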
def test_unbuild_with_loaded_tree(self):
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    i.unbuild()
import numpy as np
from annoy import AnnoyIndex


class AnnoyDictionary(object):
    def __init__(self, dict_size, key_width, new_value_shift_coefficient=0.1,
                 batch_size=100, key_error_threshold=0.01):
        self.max_size = dict_size
        self.curr_size = 0
        self.new_value_shift_coefficient = new_value_shift_coefficient
        self.index = AnnoyIndex(key_width, metric='euclidean')
        self.index.set_seed(1)
        self.embeddings = np.zeros((dict_size, key_width))
        self.values = np.zeros(dict_size)
        self.lru_timestamps = np.zeros(dict_size)
        self.current_timestamp = 0.0
        # keys within this distance are considered the same key
        self.key_error_threshold = key_error_threshold
        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.key_dimension = key_width
        self.value_dimension = 1
        self._reset_buffer()
        self.built_capacity = 0

    def add(self, keys, values):
        """Add new embeddings and values to the dictionary."""
        indices = []
        indices_to_remove = []
        for i in range(keys.shape[0]):
            index = self._lookup_key_index(keys[i])
            if index is not None:
                # update existing value
                self.values[index] += self.new_value_shift_coefficient * (
                    values[i] - self.values[index])
                self.lru_timestamps[index] = self.current_timestamp
                indices_to_remove.append(i)
            else:
                # add new entry; if full, overwrite the LRU entry
                if self.curr_size >= self.max_size:
                    index = np.argmin(self.lru_timestamps)
                else:
                    index = self.curr_size
                    self.curr_size += 1
                self.lru_timestamps[index] = self.current_timestamp
                indices.append(index)

        for i in reversed(indices_to_remove):
            keys = np.delete(keys, i, 0)
            values = np.delete(values, i, 0)

        self.buffered_keys = np.vstack((self.buffered_keys, keys))
        self.buffered_values = np.vstack((self.buffered_values, values))
        self.buffered_indices = self.buffered_indices + indices

        if len(self.buffered_indices) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       int(self.curr_size * 0.02))
            self._rebuild_index()

        self.current_timestamp += 1

    def query(self, keys, k):
        """Return the stored embeddings and values of the closest embeddings."""
        if not self.has_enough_entries(k):
            # this only happens before the DND is populated with enough
            # entries (i.e. during heatup); these values won't be used and
            # are therefore meaningless
            return [0.0], [0.0], [0]
        _, indices = self._get_k_nearest_neighbors_indices(keys, k)
        embeddings = []
        values = []
        for ind in indices:
            self.lru_timestamps[ind] = self.current_timestamp
            embeddings.append(self.embeddings[ind])
            values.append(self.values[ind])
        self.current_timestamp += 1
        return embeddings, values, indices

    def has_enough_entries(self, k):
        return self.curr_size > k and self.built_capacity > k

    def _get_k_nearest_neighbors_indices(self, keys, k):
        distances = []
        indices = []
        for key in keys:
            index, distance = self.index.get_nns_by_vector(
                key, k, include_distances=True)
            distances.append(distance)
            indices.append(index)
        return distances, indices

    def _rebuild_index(self):
        self.index.unbuild()
        self.embeddings[self.buffered_indices] = self.buffered_keys
        self.values[self.buffered_indices] = np.squeeze(self.buffered_values)
        for idx, key in zip(self.buffered_indices, self.buffered_keys):
            self.index.add_item(idx, key)
        self._reset_buffer()
        self.index.build(50)
        self.built_capacity = self.curr_size

    def _reset_buffer(self):
        self.buffered_keys = np.zeros((0, self.key_dimension))
        self.buffered_values = np.zeros((0, self.value_dimension))
        self.buffered_indices = []

    def _lookup_key_index(self, key):
        distance, index = self._get_k_nearest_neighbors_indices([key], 1)
        if distance != [[]] and distance[0][0] <= self.key_error_threshold:
            return index
        return None
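# Hedged usage sketch for AnnoyDictionary above; sizes and data are
# illustrative, not from the original source.
import numpy as np

dnd = AnnoyDictionary(dict_size=1000, key_width=8, batch_size=100)
keys = np.random.randn(200, 8).astype(np.float32)
values = np.random.randn(200, 1)
dnd.add(keys, values)   # 200 buffered entries >= batch_size, so the index builds
embeddings, vals, indices = dnd.query(np.random.randn(2, 8), 5)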
import os
import queue
import threading
import time

import numpy as np
import yaml
from annoy import AnnoyIndex

# NOTE: `model_location` is assumed to be defined at module level.


class Annoy:
    def __init__(self):
        # to keep the thread & queue running
        self.process_flag = True
        self.q_maxsize = 10100
        self.process_thread = None
        self._lock = threading.Lock()
        self.process_timeout_sec = 5  # seconds
        # keeps track of all vectors inserted,
        # for saving to disk and retrieving later
        self.index_disk = None
        try:
            with open('DB_config.yml', 'r') as stream:
                DB_config = yaml.safe_load(stream)
            # make sure to parse env variables to their expected type
            if os.getenv('FIXED_VEC_DIMENSION', None) is not None:
                self.dim = int(os.getenv('FIXED_VEC_DIMENSION'))
            else:
                self.dim = DB_config['annoy']['init']['vd']
            # (the original assigned these to self.dim, clobbering the
            # vector dimension with the tree count)
            if os.getenv('ANNOY_NTREES', None) is not None:
                self.n_trees = int(os.getenv('ANNOY_NTREES'))
            else:
                self.n_trees = DB_config['annoy']['init']['ntrees']
            self.sim_metric = os.getenv(
                'ANNOY_SIM_METRIC', DB_config['annoy']['init']['smetric'])
            # search_k must be an int as well
            self.search_k = int(os.getenv(
                'ANNOY_SEARCHK', DB_config['annoy']['init']['search_k']))
            self.modelLoaded = self.loadModelFromDisk()
        except Exception as e:
            print('Error initializing Annoy: ', e)
        # spawn process thread
        self.spawn()

    def __del__(self):
        self.process_flag = False
        if self.process_thread:
            self.process_thread.join()

    def spawn(self):
        # create pipeline to add documents
        self.pipeline = queue.Queue(maxsize=self.q_maxsize)
        # create and start process thread
        self.process_thread = threading.Thread(target=self.process, args=(),
                                               daemon=True)
        self.process_thread.start()

    def initAnnoy(self):
        # only do this if no index was loaded from disk
        if not self.modelLoaded:
            print('Annoy init index')
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # lock index read/write until it is built
            with self._lock:
                build_ = self.a_index.build(self.n_trees)
            if build_:
                self.modelLoaded = self.saveModelToDisk()
        return self.modelLoaded

    def addVectors(self, documents):
        ids = []
        for document in documents:
            # add document to queue
            self.pipeline.put_nowait(document)
            ids.append(document._id)
        return True, ids

    def process(self):
        while self.process_flag:
            # sleep until the next indexing pass
            time.sleep(self.process_timeout_sec)
            # check if queue is not empty
            if self.pipeline.qsize() > 0:
                # lock index read/write until it is built
                with self._lock:
                    # unbuild index first
                    self.a_index.unbuild()
                    # fetch all currently available documents from queue
                    while not self.pipeline.empty():
                        # extract document & contents
                        document = self.pipeline.get_nowait()
                        _id = document._id
                        vector_e = document.vector.e
                        # if the vector is shorter than the dimension limit,
                        # pad it with zeros
                        vector_e_l = len(vector_e)
                        if vector_e_l < self.dim:
                            vector_e.extend([0] * (self.dim - vector_e_l))
                        # make sure vector length doesn't exceed the limit
                        vector_e = vector_e[:self.dim]
                        # add vector to index
                        self.a_index.add_item(int(_id), vector_e)
                        # keep a copy for disk storage
                        list_ = vector_e
                        list_.append(int(_id))
                        # append to disk proxy
                        if self.index_disk is None:
                            self.index_disk = np.array([list_], dtype=float)
                        else:
                            self.index_disk = np.append(self.index_disk,
                                                        [list_], axis=0)
                    # build index
                    build_ = self.a_index.build(self.n_trees)
                # write to disk
                if build_:
                    self.modelLoaded = self.saveModelToDisk()

    def deleteVectors(self, ids):
        return True, ids

    def getNearest(self, matrix, k):
        ids = []
        dists = []
        # lock index for reading during nearest-neighbour search
        with self._lock:
            for vec_data in matrix:
                if self.search_k != -1:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, k, self.search_k, include_distances=True)
                else:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, k, include_distances=True)
                ids.append(_id)
                dists.append(_dist)
        return True, ids, dists

    def loadModelFromDisk(self):
        try:
            # prepare new index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # read stored vectors and rebuild the index
            self.index_disk = np.load(model_location + '.npy')
            for vec_ in self.index_disk.tolist():
                self.a_index.add_item(int(vec_[-1]), vec_[0:-1])
            build_ = self.a_index.build(self.n_trees)
            print('Annoy index loading success')
            return True
        except Exception:
            print('Annoy index loading failed')
            return False

    def saveModelToDisk(self):
        try:
            # write index
            np.save(model_location, self.index_disk)
            print('Annoy index writing success')
            return True
        except Exception:
            print('Annoy index writing failed')
            return False
import os

import numpy as np
import yaml
from annoy import AnnoyIndex

# NOTE: `model_location` is assumed to be defined at module level.


class Annoy:
    def __init__(self):
        self.total = 0
        # keeps track of all vectors inserted,
        # for saving to disk and retrieving later
        self.index_disk = None
        try:
            with open('DB_config.yml', 'r') as stream:
                DB_config = yaml.safe_load(stream)
            # env variables arrive as strings, so parse the numeric ones
            self.dim = int(os.getenv('FIXED_VEC_DIMENSION',
                                     DB_config['annoy']['init']['vd']))
            self.sim_metric = os.getenv(
                'ANNOY_SIM_METRIC', DB_config['annoy']['init']['smetric'])
            self.n_trees = int(os.getenv('ANNOY_NTREES',
                                         DB_config['annoy']['init']['ntrees']))
            self.modelLoaded = self.loadModelFromDisk()
        except Exception as e:
            print('Error initializing Annoy: ', e)

    def initAnnoy(self):
        # only do this if no index was loaded from disk
        if not self.modelLoaded:
            print('Annoy init index')
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # build index
            build_ = self.a_index.build(self.n_trees)
            if build_:
                self.modelLoaded = self.saveModelToDisk()
        return self.modelLoaded

    def addVectors(self, documents):
        # unbuild index first
        self.a_index.unbuild()
        self.total = self.total + len(documents)
        ids = []
        # add vectors
        for document in documents:
            _id = document._id
            vector_e = document.vector.e
            ids.append(_id)
            # if the vector is shorter than the dimension limit,
            # pad it with zeros
            vector_e_l = len(vector_e)
            if vector_e_l < self.dim:
                vector_e.extend([0] * (self.dim - vector_e_l))
            # make sure vector length doesn't exceed the limit
            vector_e = vector_e[:self.dim]
            # add vector
            self.a_index.add_item(int(_id), vector_e)
            # keep a copy for disk storage
            list_ = vector_e
            list_.append(int(_id))
            if self.index_disk is None:
                self.index_disk = np.array([list_], dtype=float)
            else:
                self.index_disk = np.append(self.index_disk, [list_], axis=0)
        # rebuild index
        build_ = self.a_index.build(self.n_trees)
        if build_:
            self.modelLoaded = self.saveModelToDisk()
        return self.modelLoaded, ids

    def deleteVectors(self, ids):
        return True, ids

    def getNearest(self, matrix, k):
        ids = []
        dists = []
        for vec_data in matrix:
            _id, _dist = self.a_index.get_nns_by_vector(
                vec_data, k, include_distances=True)
            ids.append(_id)
            dists.append(_dist)
        return True, ids, dists

    def loadModelFromDisk(self):
        try:
            # prepare new index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # read stored vectors and rebuild the index
            self.index_disk = np.load(model_location + '.npy')
            for vec_ in self.index_disk.tolist():
                self.a_index.add_item(int(vec_[-1]), vec_[0:-1])
            build_ = self.a_index.build(self.n_trees)
            print('Annoy index loading success')
            return True
        except Exception:
            print('Annoy index loading failed')
            return False

    def saveModelToDisk(self):
        try:
            # write index
            np.save(model_location, self.index_disk)
            print('Annoy index writing success')
            return True
        except Exception:
            print('Annoy index writing failed')
            return False
from annoy import AnnoyIndex


class Annoy:
    def __init__(self):
        self.dim = 300
        self.sim_metric = 'angular'
        self.n_trees = 10
        self.search_k = 1
        self.modelLoaded = False
        # self.loadModelFromDisk(model_location)

    def initAnnoy(self, dim, metric, matrix):
        self.sim_metric = metric
        self.dim = dim
        print('Annoy init index')
        self.a_index = AnnoyIndex(self.dim, self.sim_metric)
        build_ = self.a_index.build(self.n_trees)
        # if build_:
        #     self.modelLoaded = self.saveModelToDisk(model_location, self.a_index)
        return build_  # self.modelLoaded

    def addVectors(self, documents):
        ids = []
        # unbuild annoy index before adding new data
        self.a_index.unbuild()
        # add vectors
        for document in documents:
            _id = document._id
            vector_e = document.vector.e
            ids.append(_id)
            # if the vector is shorter than the dimension limit,
            # pad it with zeros
            vector_e_l = len(vector_e)
            if vector_e_l < self.dim:
                vector_e.extend([0] * (self.dim - vector_e_l))
            # make sure vector length doesn't exceed the limit
            vector_e = vector_e[:self.dim]
            # add vector
            self.a_index.add_item(int(_id), vector_e)
        # rebuild index
        build_ = self.a_index.build(self.n_trees)
        # if build_:
        #     self.modelLoaded = self.saveModelToDisk(model_location, self.a_index)
        return build_, ids

    def deleteVectors(self, ids):
        return True, ids

    def getNearest(self, matrix, k):
        ids = []
        dists = []
        for vec_data in matrix:
            _id, _dist = self.a_index.get_nns_by_vector(
                vec_data, k, search_k=self.search_k, include_distances=True)
            ids.append(_id)
            dists.append(_dist)
        return True, ids, dists

    def loadModelFromDisk(self, location):
        try:
            # read index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            self.a_index.load(location)
            print('Annoy index loading success')
            return True
        except Exception:
            print('Annoy index loading failed')
            return False

    def saveModelToDisk(self, location, index):
        try:
            # write index
            index.save(location)
            print('Annoy index writing success')
            return True
        except Exception:
            print('Annoy index writing failed')
            return False
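# Illustrative sketch (an addition): the pad-or-truncate step used by
# addVectors above, isolated. Vectors shorter than the index dimension are
# zero-padded and longer ones are cut, so every item handed to Annoy has the
# same length.
dim = 5
vector_e = [0.1, 0.2, 0.3]
if len(vector_e) < dim:
    vector_e.extend([0] * (dim - len(vector_e)))
vector_e = vector_e[:dim]
print(vector_e)  # [0.1, 0.2, 0.3, 0, 0]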
import csv
import sys
import time

# NOTE: `annoy` (an AnnoyIndex with items already added), `query_vecs`,
# `result_csv_file`, `num_trees_vals` and `mem_usage_gb` are assumed to be
# defined earlier in the original script.

# query vectors count
num_queries = query_vecs.shape[0]

# create report file
with open(result_csv_file, 'w') as csvOut:
    writer = csv.writer(csvOut)
    # write header
    writer.writerow(['Number of trees', 'Indexing + Build time(sec)',
                     'Indexing + Build memory(GB)', 'Test time(sec)',
                     'Correct(Percent)'])
    for num_trees in num_trees_vals:
        # drop the previous build before rebuilding with a new tree count
        annoy.unbuild()
        print('Building annoy with %d tree(s)...' % num_trees)
        sys.stdout.flush()
        # record used memory at the start time
        memStart = mem_usage_gb()
        # record the start time
        tmStart = time.time()
        # build annoy tree
        annoy.build(num_trees)
        # calculate build time
        tmBuild = time.time() - tmStart
        # calculate used memory for build
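# Hedged sketch: `mem_usage_gb` is not defined in the fragment above. A
# plausible stand-in (an assumption, not the original helper) reports the
# resident set size of the current process in gigabytes via psutil.
import psutil


def mem_usage_gb():
    """Return this process's resident memory usage in GB."""
    return psutil.Process().memory_info().rss / (1024 ** 3)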
import numpy as np
from annoy import AnnoyIndex


class alpha_KNN:
    def __init__(self, capacity, key_dimension, delta=0.001, alpha=0.1,
                 batch_size=1000):
        self.capacity = capacity
        self.curr_capacity = 0
        self.delta = delta
        self.alpha = alpha  # the original hard-coded 0.001, ignoring the argument
        self.embeddings = np.zeros((capacity, key_dimension))
        self.values = np.zeros(capacity)
        self.weights = np.zeros(capacity)
        # items are stored as key + [weight], hence the extra dimension
        # (the original passed key_dimension, which cannot hold such items)
        self.index = AnnoyIndex(key_dimension + 1, metric='euclidean')
        self.index.set_seed(123)
        self.min_update_size = batch_size
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []
        self.built_capacity = 0

    def _nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(
                key + [1.0], k, include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, indices):
        self.cached_keys = self.cached_keys + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_indices) >= self.min_update_size:
            self._rebuild_index()

    def _rebuild_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_key = self.cached_keys[i]
            new_value = self.cached_values[i]
            # NOTE: `new_weight` was undefined in the original; the weight
            # already set in add() is assumed here
            new_weight = self.weights[ind]
            self.embeddings[ind] = new_key
            self.values[ind] = new_value
            self.index.add_item(ind, new_key + [new_weight])
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []
        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def queryable(self, k):
        return self.built_capacity > k

    def query(self, keys, k):
        """Return the stored embeddings, values and weights of the closest embeddings."""
        _, indices = self._nn(keys, k)
        embs = []
        values = []
        weights = []
        for ind in indices:
            embs.append(self.embeddings[ind])
            values.append(self.values[ind])
            weights.append(self.weights[ind])
        return embs, values, weights

    def add(self, keys, values):
        """Add new embeddings (and values) to the dictionary."""
        if self.queryable(5):
            # decay the weights of close neighbours of the incoming keys
            dists, inds = self._nn(keys, k=5)
            for ind, dist in enumerate(dists):
                for i, d in enumerate(dist):
                    index = inds[ind][i]
                    self.weights[index] *= (1 - self.alpha)
        indices, keys_, values_ = [], [], []
        for i, _ in enumerate(keys):
            if self.curr_capacity >= self.capacity:
                # evict the lowest-weight entry
                index = np.argmin(self.weights)
            else:
                index = self.curr_capacity
                self.curr_capacity += 1
            self.weights[index] = 1.0
            indices.append(index)
            keys_.append(keys[i])
            values_.append(values[i])
        self._insert(keys_, values_, indices)
import numpy as np
from annoy import AnnoyIndex

# NOTE: LRU_KNN is assumed to come from the surrounding module.


class annoy_dict(LRU_KNN):
    def __init__(self, capacity, key_dimension, delta=0.001, alpha=0.1,
                 batch_size=100):
        LRU_KNN.__init__(self, capacity, key_dimension, delta, alpha)
        self.index = AnnoyIndex(key_dimension, metric='euclidean')
        self.index.set_seed(123)
        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []
        self.built_capacity = 0

    def _nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(
                key, k, include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, indices):
        self.cached_keys = self.cached_keys + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_indices) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_key = self.cached_keys[i]
            new_value = self.cached_values[i]
            self.embeddings[ind] = new_key
            self.values[ind] = new_value
            self.index.add_item(ind, new_key)
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []
        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, emb in enumerate(self.embeddings[:self.curr_capacity]):
            self.index.add_item(ind, emb)
        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def queryable(self, k):
        return LRU_KNN.queryable(self, k) and (self.built_capacity > k)
import os
from logging import warning
from shutil import copyfile, move

from annoy import AnnoyIndex

# NOTE: FileIO, ImageDatabase, intmul and the custom exceptions used below
# are assumed to come from the surrounding pupyl package.


class Index:
    """Procedures over multidimensional spaces."""

    def __init__(self, size, data_dir=None, trees=.001, volatile=False):
        """
        Indexing tensors operations and nearest neighbours search.

        Parameters
        ----------
        size: int
            Shape of unidimensional vectors which will be indexed.
        data_dir: str
            Location where to load or save the index.
        trees (optional): float
            Defines the number of trees to create based on the dataset
            size. Should be a number between 0 and 1.
        volatile (optional): bool
            If the index will be temporary or not.
        """
        self._position = -1
        self._size = size
        self._data_dir = data_dir
        self._trees = trees
        self._volatile = volatile

        if self._data_dir and not self._volatile:
            if os.path.isfile(self._data_dir):
                raise OSError('data_dir parameter is not a directory')
            os.makedirs(self._data_dir, exist_ok=True)
            self._path = os.path.join(self._data_dir, self.index_name)
        elif not self._data_dir and not self._volatile:
            raise NoDataDirForPermanentIndex
        elif not self._data_dir and self._volatile:
            _temp_file = FileIO.safe_temp_file()
            self._data_dir = os.path.dirname(_temp_file)
            self._path = _temp_file
        else:
            raise DataDirDefinedForVolatileIndex

        if os.path.isfile(self._path):
            try:
                self.tree = AnnoyIndex(size, metric='angular')
                self.tree.load(self._path)
                self._is_new_index = False
            except OSError as os_error:
                raise FileIsNotAnIndex from os_error
        else:
            self.tree = AnnoyIndex(size, metric='angular')
            self._is_new_index = True

        self._image_database = ImageDatabase(
            import_images=True,
            data_dir=self._data_dir,
        )

    @property
    def size(self):
        """Getter for property size."""
        return self._size

    @property
    def path(self):
        """Getter for property path."""
        return self._path

    @property
    def index_name(self):
        """Getter for property index_name."""
        return 'pupyl.index'

    @property
    def trees(self):
        """Getter for property trees."""
        return self._trees

    @property
    def volatile(self):
        """Getter for property volatile."""
        return self._volatile

    @trees.setter
    def trees(self, trees):
        """Setter for property trees."""
        self._trees = trees

    def __enter__(self):
        """Context opening index."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context closing index."""
        if not exc_type:
            if self._is_new_index:
                self.tree.build(self.size << intmul >> self.trees)
                self.tree.save(self.path)
            self.tree.unload()

    def items(self):
        """Return the indexed items."""
        for item in range(len(self)):
            yield item

    def values(self):
        """Return the indexed values."""
        for item in self.items():
            yield self.tree.get_item_vector(item)

    def items_values(self):
        """Return tuples with all items and values."""
        for item, value in zip(self.items(), self.values()):
            yield item, value

    def __getitem__(self, position):
        """Return item at index. Supports negative indexing."""
        if position >= 0:
            return self.tree.get_item_vector(position)
        return self.tree.get_item_vector(
            len(self) - abs(position)
        )

    def refresh(self):
        """Update all information regarding index file."""
        self.tree.unload()
        self.tree.load(self.path)

    def append(self, tensor, check_unique=False):
        """
        Insert a new tensor at the end of the index.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to insert into index.
        check_unique (optional, default: False): bool
            Defines if the append method should verify the existence of a
            really similar tensor on the current index. In other words, it
            checks for the unicity of the value. Be advised that this check
            creates an overhead on the append process.
        """
        if sum(tensor) == 0.:
            raise NullTensorError

        if self._is_new_index:
            index_it = True

            if check_unique and len(self) > 1:
                self.tree.build(self.size << intmul >> self.trees)

                result = self.item(
                    self.index(tensor),
                    top=1,
                    distances=True
                )

                if result[1][0] <= .05:
                    warning(
                        'Tensor being indexed already exists in '
                        'the database and the check for duplicates '
                        'is on. Refusing to store this tensor again.'
                    )
                    index_it = False

                self.tree.unbuild()

            if index_it:
                self.tree.add_item(len(self), tensor)
        else:
            with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
                for value in self.values():
                    tmp_idx.append(value, check_unique)

                tmp_idx.append(tensor, check_unique)

                _temp_file = tmp_idx.path

            move(_temp_file, self.path)

            self.refresh()

    def remove(self, position):
        """
        Remove the tensor at position from the database.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position: int
            The index which must be removed.
        """
        if self._is_new_index:
            raise IndexNotBuildYet

        if position > len(self):
            raise IndexError

        with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
            shrink = False

            for item, value in self.items_values():
                if item == position:
                    shrink = True
                else:
                    if shrink:
                        item -= 1
                    tmp_idx.tree.add_item(item, value)

            _temp_file = tmp_idx.path

        move(_temp_file, self.path)

        self.refresh()

    def pop(self, position=None):
        """
        Pop out the item at position, returning it.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position (optional) (default: last position): int
            Removes and returns the value at position.

        Returns
        -------
        int:
            The popped item.
        """
        if position is not None:  # `position` may legitimately be 0
            value = self[position]
        else:
            inverse_index = -1
            value = self[inverse_index]
            position = len(self) + inverse_index

        self.remove(position)

        return value

    def index(self, tensor):
        """
        Search for the single most similar image compared to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar.

        Returns
        -------
        int:
            Describing the most similar resulting index.
        """
        return self.tree.get_nns_by_vector(tensor, n=1)[0]

    def item(self, position, top=10, distances=False):
        """
        Search the index using an internal position.

        Parameters
        ----------
        position: int
            The item id within index.
        top (optional, default 10): int
            How many similar items should be returned.
        distances (optional, default False): bool
            If the distances between items should also be returned.

        Returns
        -------
        if distances is True:
            list of tuples: Containing pairs of items and distances.
        else:
            list: Containing similar items.
        """
        return self.tree.get_nns_by_item(
            position,
            top,
            include_distances=distances
        )

    def search(self, tensor, results=16):
        """
        Search for the most similar images compared to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar images.
        results: int
            How many results to return. If fewer similar images exist than
            results, the search exhausts and the actual total is returned.
        """
        for result in self.tree.get_nns_by_vector(tensor, n=results):
            yield result

    def __len__(self):
        """Return how many items are indexed."""
        return self.tree.get_n_items()

    def __iter__(self):
        """Return an iterable."""
        for value in self.values():
            yield value

    def __next__(self):
        """Iterate over the iterable."""
        self._position += 1

        all_values = list(self.values())

        if self._position < len(all_values):
            return all_values[self._position]

        raise StopIteration

    def group_by(self, top=10, **kwargs):
        """
        Return all positions (or one position) on the index that are similar
        to other elements inside the index.

        Parameters
        ----------
        top (optional, default 10): int
            How many similar internal images should be returned.
        position (optional): int
            Returns the groups based on a specified position.

        Returns
        -------
        list: if a position is defined
        or
        dict: Generator with a dictionary containing internal ids as keys
            and lists of similar images as values.
        """
        position = kwargs.get('position')

        if len(self) <= 1:
            raise EmptyIndexError

        if top >= 1:
            if isinstance(position, int):
                results = self.item(position, top + 1)
                if len(results) > 1:
                    yield results[1:]
            else:
                for item in self.items():
                    yield {
                        item: self.item(
                            item,
                            top + 1
                        )[1:]
                    }
        else:
            raise TopNegativeOrZero

    def export_by_group_by(self, path, top=10, **kwargs):
        """
        Save images, creating directories, based on their groups.

        Parameters
        ----------
        path: str
            Place to create the directories and export images.
        top (optional, default 10): int
            How many similar internal images should be returned.
        position (optional): int
            Returns the groups based on a specified position.
        """
        for element in FileIO.progress(
                self.group_by(
                    top=top,
                    position=kwargs.get('position')
                )
        ):
            if isinstance(element, dict):
                item = [*element.keys()][0]
                similars = element[item]
            elif isinstance(element, list):
                item = kwargs['position']
                similars = element

            save_path = os.path.join(path, str(item))

            os.makedirs(save_path, exist_ok=True)

            try:
                copyfile(
                    self._image_database.mount_file_name(item, 'jpg'),
                    os.path.join(save_path, 'group.jpg')
                )
            except FileNotFoundError:
                continue

            for rank, similar in enumerate(similars):
                original_file_path = self._image_database.mount_file_name(
                    similar, 'jpg'
                )

                try:
                    copyfile(
                        original_file_path,
                        os.path.join(save_path, f'{rank + 1}.jpg')
                    )
                except FileNotFoundError:
                    continue
import numpy as np
from annoy import AnnoyIndex


class LRU_KNN:
    def __init__(self, capacity, key_dim, value_dim, batch_size):
        self.capacity = capacity
        self.curr_capacity = 0
        self.states = np.zeros((capacity, key_dim))
        self.values = np.zeros((capacity, value_dim))
        self.lru = np.zeros(capacity)
        self.tm = 0.0
        self.index = AnnoyIndex(key_dim, metric="euclidean")
        self.index.set_seed(123)
        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []
        self.built_capacity = 0  # not initialized in the original

    def nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(
                key, k, include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def query(self, keys, k):
        _, indices = self.nn(keys, k)
        states = []
        values = []
        for ind in indices:
            self.lru[ind] = self.tm
            states.append(self.states[ind])
            values.append(self.values[ind])
        self.tm += 0.001
        return states, values

    def _insert(self, keys, values, indices):
        self.cached_states = self.cached_states + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_states) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_state = self.cached_states[i]
            new_value = self.cached_values[i]
            self.states[ind] = new_state
            self.values[ind] = new_value
            self.index.add_item(ind, new_state)
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []
        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.built_capacity = self.curr_capacity
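# Hedged usage sketch for LRU_KNN above; values are illustrative. The Annoy
# index is only (re)built once `batch_size` entries have been cached, which
# subclasses normally drive through _insert().
import numpy as np

mem = LRU_KNN(capacity=100, key_dim=4, value_dim=1, batch_size=2)
keys = [list(np.random.rand(4)) for _ in range(2)]
mem._insert(keys, [[1.0], [2.0]], indices=[0, 1])  # triggers _update_index()
states, vals = mem.query([keys[0]], 1)             # nearest stored state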
import numpy as np
from annoy import AnnoyIndex


class Memory:
    def __init__(self, capacity, state_dim, value_dim):
        self.capacity = capacity
        print("state_dim:", state_dim)
        self.states = np.zeros((capacity, state_dim))
        self.values = np.zeros((capacity, value_dim))
        self.curr_capacity = 0
        self.curr_ = 0
        self.lru = np.zeros(capacity)
        self.tm = 0
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []
        # 'angular' was the implicit default metric in older Annoy versions;
        # the original omitted it
        self.index = AnnoyIndex(state_dim, 'angular')
        self.index.set_seed(123)
        self.update_size = 1
        self.build_capacity = 0

    def sample_knn_test(self, state, k):
        inds, dists = self.index.get_nns_by_vector(
            state, k, include_distances=True)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample_knn(self, states, k):
        dists = []
        inds = []
        for state in states:
            ind, dist = self.index.get_nns_by_vector(
                state, k, include_distances=True)
            inds.append(ind)
            dists.append(dist)
        # inds = np.reshape(np.array(inds), -1)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample(self, n_samples):
        if self.curr_capacity < n_samples or n_samples == 0:
            idx = np.random.choice(np.arange(len(self.states)), n_samples,
                                   replace=False)
        else:
            idx = np.random.choice(np.arange(self.curr_capacity), n_samples,
                                   replace=False)
        self.tm += 0.01
        self.lru[idx] = self.tm
        embs = self.states[idx]
        values = self.values[idx]
        return embs, values

    def add_knn(self, states, values):
        self._add_knn(states, values)

    def add_knn_lru(self, states, values):
        self._add_knn(states, values, lru=True)

    def add(self, states, values):
        self._add(states, values)

    def add_lru(self, states, values):
        self._add(states, values, lru=True)

    def add_rand(self, states, values):
        self._add(states, values, rand=True)

    def _insert(self, states, values, indices):
        self.cached_states = self.cached_states + states
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_states) >= self.update_size:
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            self.states[ind] = self.cached_states[i]
            self.values[ind] = self.cached_values[i]
            self.index.add_item(ind, self.cached_states[i])
        self.index.build(50)
        self.build_capacity = self.curr_capacity
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _add_knn(self, states, values, lru=False):
        # print(states)
        indices = []
        states_ = []
        values_ = []
        for i, _ in enumerate(states):
            if lru:
                if self.curr_capacity >= self.capacity:
                    ind = np.argmin(self.lru)
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1
            else:
                if self.curr_capacity >= self.capacity:
                    self.curr_ = (self.curr_ + 1) % self.capacity
                    ind = self.curr_
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1
            self.lru[ind] = self.tm
            indices.append(ind)
            states_.append(states[i])
            values_.append(values[i])
        self._insert(states_, values_, indices)

    def _add(self, states, values, rand=False, lru=False):
        # NOTE: the nesting below is a reconstruction; the original was
        # flattened onto a single line
        for i, state in enumerate(states):
            if self.curr_capacity < self.capacity:
                self.curr_ = (self.curr_ + 1) % self.capacity
                # self.states[self.curr_] = state
                # self.values[self.curr_] = values[i]
            if self.curr_capacity < self.capacity:
                self.curr_capacity += 1
            else:
                if lru:
                    self.curr_ = np.argmin(self.lru)
                if rand:
                    self.curr_ = np.random.choice(
                        np.arange(self.curr_capacity), 1, replace=False)
                if not lru and not rand:
                    self.curr_ = (self.curr_ + 1) % self.capacity
            self.states[self.curr_] = state
            self.values[self.curr_] = values[i]

    @property
    def length(self):
        # assert self.index.get_n_items() == self.curr_capacity
        # return self.curr_capacity
        return self.index.get_n_items()
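# Hedged usage sketch for Memory above; shapes are illustrative. With
# update_size == 1, every add_knn() call unbuilds and rebuilds the index.
import numpy as np

memory = Memory(capacity=50, state_dim=4, value_dim=1)
memory.add_knn([list(np.random.rand(4)) for _ in range(3)],
               [[0.1], [0.2], [0.3]])
states, values, dists = memory.sample_knn([list(np.random.rand(4))], k=2)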
import logging
import os
import queue
import threading
import time

import numpy as np
from annoy import AnnoyIndex


class Annoy:
    def __init__(self, model_location):
        # set model location
        self.model_location = model_location
        # to keep the thread & queue running
        self.process_flag = True
        self.q_maxsize = int(os.environ["FIXED_Q_LEN"])
        self.build_batch_size = int(os.environ["FIXED_Q_LEN"])
        self.process_thread = None
        self._lock = threading.Lock()
        self.process_timeout_sec = int(os.environ["THREAD_SLEEP"])  # seconds
        # keeps track of all vectors inserted,
        # for saving to disk and retrieving later
        self.index_disk = None
        try:
            # make sure to parse env variables to their expected type
            self.dim = int(os.environ["FIXED_VEC_DIMENSION"])
            self.sim_metric = str(os.environ["ANNOY_SIM_METRIC"])
            self.n_trees = int(os.environ["ANNOY_NTREES"])
            self.search_k = int(os.environ["ANNOY_SK"])
            self.model_loaded = self.load_model_from_disk()
            if not self.model_loaded:
                if self.init_annoy():
                    logging.debug("Annoy Init done")
                else:
                    logging.debug("Annoy Init Failed")
        except Exception as e:
            # (the original concatenated the exception onto the string,
            # which raises TypeError)
            logging.error('Error initializing Annoy: %s', e)
        # spawn process thread
        self.spawn()

    def __del__(self):
        self.process_flag = False
        if self.process_thread:
            self.process_thread.join()

    def spawn(self):
        # create pipeline to add documents
        self.pipeline = queue.Queue(maxsize=self.q_maxsize)
        # create and start process thread
        self.process_thread = threading.Thread(target=self.process, args=(),
                                               daemon=True)
        self.process_thread.start()

    def init_annoy(self):
        # only do this if no index was loaded from disk
        if not self.model_loaded:
            logging.debug('Annoy init index')
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # lock index read/write until it is built
            with self._lock:
                build_ = self.a_index.build(self.n_trees)
            if build_:
                self.model_loaded = self.save_model_to_disk()
        return self.model_loaded

    def add_vectors(self, documents):
        # add documents to queue
        self.pipeline.put({"action": "add", "docs": documents})
        return True

    def process(self):
        while self.process_flag:
            # sleep until the next indexing pass
            time.sleep(self.process_timeout_sec)
            # check if queue is not empty
            if self.pipeline.qsize() > 0:
                # lock index read/write until it is built
                with self._lock:
                    # unbuild index first
                    self.a_index.unbuild()
                    len_documents = 0
                    # fetch all currently available documents from queue
                    while not self.pipeline.empty():
                        # extract document & contents
                        qitem = self.pipeline.get_nowait()
                        if qitem["action"] == "add":
                            documents = qitem["docs"]
                            len_documents += len(documents)
                            for document in documents:
                                _id = document["_id"]
                                vector_e = document["code"]
                                # add vector to index
                                self.a_index.add_item(int(_id), vector_e)
                                # append to disk proxy
                                if self.index_disk is None:
                                    self.index_disk = np.array(
                                        [vector_e + [int(_id)]], dtype=float)
                                else:
                                    self.index_disk = np.append(
                                        self.index_disk,
                                        [vector_e + [int(_id)]], axis=0)
                        elif qitem["action"] == "delete":
                            ids = qitem["ids"]
                            len_documents += len(ids)
                            # reset deleted ids to zero vectors
                            zero_ = np.zeros(self.dim + 1)
                            for id_ in ids:
                                # overwrite with a zero vector in the index
                                self.a_index.add_item(int(id_),
                                                      zero_[:-1].tolist())
                            # reset npy disk array
                            if self.index_disk is not None:
                                self.index_disk[ids] = zero_
                        # take a rest if doc length exceeds the batch size
                        if len_documents > self.build_batch_size:
                            break
                    # build index
                    build_ = self.a_index.build(self.n_trees, n_jobs=-1)
                # write to disk
                if build_:
                    self.model_loaded = self.save_model_to_disk()

    def delete_vectors(self, ids):
        # add ids to queue
        self.pipeline.put({"action": "delete", "ids": ids})
        return True, ids

    def get_nearest_k(self, matrix, k):
        ids = []
        dists = []
        # lock index for reading during nearest-neighbour search
        with self._lock:
            for vec_data in matrix:
                if self.search_k != -1:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, k, self.search_k, include_distances=True)
                else:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, k, include_distances=True)
                ids.append(_id)
                dists.append(_dist)
        return ids, dists

    def get_nearest_rad(self, matrix, rad):
        ids = []
        dists = []
        # lock index for reading during nearest-neighbour search.
        # NOTE: Annoy has no native radius query and the original referenced
        # an undefined `k` here; as an assumption, a fixed number of
        # candidates is fetched and filtered by distance <= rad.
        candidates = 100
        with self._lock:
            for vec_data in matrix:
                if self.search_k != -1:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, candidates, self.search_k,
                        include_distances=True)
                else:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, candidates, include_distances=True)
                ids.append([i for i, d in zip(_id, _dist) if d <= rad])
                dists.append([d for d in _dist if d <= rad])
        return ids, dists

    def load_model_from_disk(self):
        try:
            # prepare new index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # read stored vectors and rebuild the index
            self.index_disk = np.load(self.model_location + '.npy')
            for vec_ in self.index_disk.tolist():
                self.a_index.add_item(int(vec_[-1]), vec_[0:-1])
            build_ = self.a_index.build(self.n_trees)
            logging.debug('Annoy index loading success')
            return True
        except Exception:
            logging.debug('Annoy index loading failed. Creating new index..')
            return False

    def save_model_to_disk(self):
        try:
            # write index
            np.save(self.model_location, self.index_disk)
            logging.debug('Annoy index writing success')
            return True
        except Exception as e:
            logging.error('Annoy index writing failed: %s', e)
            return False
import numpy as np
from annoy import AnnoyIndex

# NOTE: LRU_KNN_ANNOY is assumed to come from the surrounding module.


class Annoy_Dict(LRU_KNN_ANNOY):
    def __init__(self, config):
        super(Annoy_Dict, self).__init__(config)
        self.config = config
        self.key_dim = self.config.knn_key_dim
        self.index = AnnoyIndex(self.key_dim, metric='euclidean')
        self.index.set_seed(123)
        self.initial_update_size = self.config.knn_dict_update_step
        self.min_update_size = self.initial_update_size
        self.cached_embs = []
        self.cached_vals = []
        self.cached_terminals = []
        self.cached_embs_next = []
        self.cached_indices = []
        self.build_capacity = 0

    def _nn(self, keys, k):
        assert np.ndim(keys) == 2
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(
                key, k, include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, terminal, keys_next, indices):
        self.cached_embs = self.cached_embs + keys
        self.cached_vals = self.cached_vals + values
        self.cached_terminals = self.cached_terminals + terminal
        self.cached_embs_next = self.cached_embs_next + keys_next
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_indices) >= self.min_update_size:
            # self.min_update_size = max(self.initial_update_size,
            #                            self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_emb = self.cached_embs[i]
            new_val = self.cached_vals[i]
            new_t = self.cached_terminals[i]
            new_emb_next = self.cached_embs_next[i]
            self.embs[ind] = new_emb
            self.values[ind] = new_val
            self.terminal[ind] = new_t
            self.embs_next[ind] = new_emb_next
            self.index.add_item(ind, new_emb)
        self.cached_embs = []
        self.cached_vals = []
        self.cached_terminals = []
        self.cached_embs_next = []
        self.cached_indices = []
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _rebuild(self):
        self.index.unbuild()
        for ind, emb in enumerate(self.embs[:self.curr_capacity]):
            self.index.add_item(ind, emb)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def queryable(self, k):
        return (LRU_KNN_ANNOY.queryable(self, k)
                and (self.build_capacity > k))

    @property
    def capacity_(self):
        # print("self.index.get_n_items: ", self.index.get_n_items())
        return self.index.get_n_items()