Example #1
 def test_build_unbuild(self):
     f = 10
     i = AnnoyIndex(f, 'euclidean')
     for j in range(1000):
         i.add_item(j, [random.gauss(0, 1) for x in range(f)])
     i.build(10)

     # unbuild() returns the index to a writable state; build() makes it
     # searchable again, and previously added items are preserved
     for j in range(100):
         i.unbuild()
         i.build(10)

     self.assertEqual(i.get_n_items(), 1000)
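
The pattern above is the one every snippet on this page revolves around: an Annoy index is read-only once build() has been called, so unbuild() must be used to return it to a writable state before adding items, followed by a fresh build(). A minimal self-contained sketch (dimension and counts are illustrative):

import random
from annoy import AnnoyIndex

dim = 16
index = AnnoyIndex(dim, 'euclidean')
for item in range(100):
    index.add_item(item, [random.gauss(0, 1) for _ in range(dim)])
index.build(10)            # the index is now searchable but read-only

index.unbuild()            # back to a writable state
index.add_item(100, [random.gauss(0, 1) for _ in range(dim)])
index.build(10)            # rebuild so the new item is searchable

print(index.get_nns_by_item(100, 5))  # 5 approximate nearest neighbours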
Example #3
class DND(object):
    def __init__(self,
                 capacity=100000,
                 key_size=128,
                 cache_size=32,
                 alpha=0.1):
        self.alpha = alpha
        self.capacity = capacity
        self.lru_cache = LRUCache(capacity)
        self.dup_cache = deque(maxlen=cache_size)
        self.index = AnnoyIndex(key_size, metric='euclidean')
        self.keys = np.zeros((capacity, key_size), dtype=np.float32)
        self.values = np.zeros((capacity, ), dtype=np.float32)
        self.insert_idx = 0
        self.insertions = 0

    def add(self, key, value):
        if not self.cache_lookup(key, value):
            self.keys[self.insert_idx] = key
            self.values[self.insert_idx] = value
            self.dup_cache.append((key, self.insert_idx))  # remember the slot
            self.index.add_item(self.insert_idx, key)
            # advance the insert position to the least-recently-used slot
            new_idx = self.lru_cache.update(self.insert_idx)
            if new_idx is not None:
                self.insert_idx = new_idx

        self.insertions += 1
        # rebuilding the index is expensive, so we don't do it too often
        if self.insertions % 1000 == 0:
            self.rebuild_index()

    def cache_lookup(self, key, value):
        # if this key was inserted recently, fold the new value into the
        # stored one instead of adding a duplicate entry
        for cached_key, idx in self.dup_cache:
            if np.allclose(key, cached_key):
                self.values[idx] += self.alpha * (value - self.values[idx])
                return True
        return False

    def rebuild_index(self):
        self.index.unbuild()
        self.index.build(50)

    def query(self, key, k_neighbors=40):
        indices, distances = self.index.get_nns_by_vector(
            key, k_neighbors, include_distances=True)

        for idx in indices:
            self.lru_cache.update(idx)

        return self.values[indices], distances
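
The LRUCache used by DND is not shown in this snippet. A minimal sketch of the interface that add() and query() appear to assume (hypothetical; the original implementation may differ):

from collections import OrderedDict

class LRUCache:
    """Hypothetical stand-in for the LRUCache the DND above expects.

    update(idx) marks `idx` as most recently used and returns the slot
    the caller should write to next: the next empty slot while the
    cache is filling up, or the least-recently-used slot once full.
    """
    def __init__(self, capacity):
        self.capacity = capacity
        self.order = OrderedDict()  # slot -> True, kept in recency order

    def update(self, idx):
        self.order.pop(idx, None)
        self.order[idx] = True            # move idx to most-recent
        if len(self.order) < self.capacity:
            return len(self.order)        # next unused slot
        return next(iter(self.order))     # least-recently-used slot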
Example #4
 def test_unbuild_with_loaded_tree(self):
     i = AnnoyIndex(10)
     i.load('test/test.tree')
     i.unbuild()
Example #5
class AnnoyDictionary(object):
    def __init__(self,
                 dict_size,
                 key_width,
                 new_value_shift_coefficient=0.1,
                 batch_size=100,
                 key_error_threshold=0.01):
        self.max_size = dict_size
        self.curr_size = 0
        self.new_value_shift_coefficient = new_value_shift_coefficient

        self.index = AnnoyIndex(key_width, metric='euclidean')
        self.index.set_seed(1)

        self.embeddings = np.zeros((dict_size, key_width))
        self.values = np.zeros(dict_size)

        self.lru_timestamps = np.zeros(dict_size)
        self.current_timestamp = 0.0

        # keys within this distance of each other are treated as the same key
        self.key_error_threshold = key_error_threshold

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.key_dimension = key_width
        self.value_dimension = 1
        self._reset_buffer()

        self.built_capacity = 0

    def add(self, keys, values):
        # Adds new embeddings and values to the dictionary
        indices = []
        indices_to_remove = []
        for i in range(keys.shape[0]):
            index = self._lookup_key_index(keys[i])
            if index is not None:
                # update existing value
                self.values[index] += self.new_value_shift_coefficient * (
                    values[i] - self.values[index])
                self.lru_timestamps[index] = self.current_timestamp
                indices_to_remove.append(i)
            else:
                # add new
                if self.curr_size >= self.max_size:
                    # find the LRU entry
                    index = np.argmin(self.lru_timestamps)
                else:
                    index = self.curr_size
                    self.curr_size += 1
                self.lru_timestamps[index] = self.current_timestamp
                indices.append(index)

        for i in reversed(indices_to_remove):
            keys = np.delete(keys, i, 0)
            values = np.delete(values, i, 0)

        self.buffered_keys = np.vstack((self.buffered_keys, keys))
        self.buffered_values = np.vstack((self.buffered_values, values))
        self.buffered_indices = self.buffered_indices + indices

        if len(self.buffered_indices) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       int(self.curr_size * 0.02))
            self._rebuild_index()

        self.current_timestamp += 1

    # Returns the stored embeddings and values of the closest embeddings
    def query(self, keys, k):
        if not self.has_enough_entries(k):
            # this will only happen when the DND is not yet populated with enough entries, which is only during heatup
            # these values won't be used and therefore they are meaningless
            return [0.0], [0.0], [0]

        _, indices = self._get_k_nearest_neighbors_indices(keys, k)

        embeddings = []
        values = []
        for ind in indices:
            self.lru_timestamps[ind] = self.current_timestamp
            embeddings.append(self.embeddings[ind])
            values.append(self.values[ind])

        self.current_timestamp += 1

        return embeddings, values, indices

    def has_enough_entries(self, k):
        return self.curr_size > k and (self.built_capacity > k)

    def _get_k_nearest_neighbors_indices(self, keys, k):
        distances = []
        indices = []
        for key in keys:
            index, distance = self.index.get_nns_by_vector(
                key, k, include_distances=True)
            distances.append(distance)
            indices.append(index)
        return distances, indices

    def _rebuild_index(self):
        self.index.unbuild()
        self.embeddings[self.buffered_indices] = self.buffered_keys
        self.values[self.buffered_indices] = np.squeeze(self.buffered_values)
        for idx, key in zip(self.buffered_indices, self.buffered_keys):
            self.index.add_item(idx, key)

        self._reset_buffer()

        self.index.build(50)
        self.built_capacity = self.curr_size

    def _reset_buffer(self):
        self.buffered_keys = np.zeros((0, self.key_dimension))
        self.buffered_values = np.zeros((0, self.value_dimension))
        self.buffered_indices = []

    def _lookup_key_index(self, key):
        distance, index = self._get_k_nearest_neighbors_indices([key], 1)
        if distance != [[]] and distance[0][0] <= self.key_error_threshold:
            return index[0][0]
        return None
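
A hypothetical usage sketch for the class above (all sizes and values are illustrative):

import numpy as np

dnd = AnnoyDictionary(dict_size=1000, key_width=8, batch_size=4)

keys = np.random.randn(8, 8)    # 8 new keys of width 8
values = np.random.randn(8, 1)
dnd.add(keys, values)           # buffers the keys, then rebuilds the
                                # index once the buffer reaches batch_size

embeddings, vals, indices = dnd.query(np.random.randn(2, 8), k=2)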
Example #6
class Annoy:
    def __init__(self):
        # to keep the thread & queue running
        self.process_flag = True
        self.q_maxsize = 10100
        self.process_thread = None
        self._lock = threading.Lock()
        self.process_timeout_sec = 5  # seconds
        # keeps track of all inserted vectors
        # so they can be saved to disk and retrieved later
        self.index_disk = None
        try:
            with open('DB_config.yml', 'r') as stream:
                DB_config = yaml.safe_load(stream)

                # make sure to parse env variables to their expected type
                if os.getenv('FIXED_VEC_DIMENSION', None) is not None:
                    self.dim = int(os.getenv('FIXED_VEC_DIMENSION'))
                else:
                    self.dim = DB_config['annoy']['init']['vd']

                if os.getenv('ANNOY_NTREES', None) is not None:
                    self.n_trees = int(os.getenv('ANNOY_NTREES'))
                else:
                    self.n_trees = DB_config['annoy']['init']['ntrees']

                self.sim_metric = os.getenv(
                    'ANNOY_SIM_METRIC', DB_config['annoy']['init']['smetric'])
                self.search_k = int(
                    os.getenv('ANNOY_SEARCHK',
                              DB_config['annoy']['init']['search_k']))
                self.modelLoaded = self.loadModelFromDisk()
        except Exception as e:
            print('Error initializing Annoy: ', e)

        # spawn process thread
        self.spawn()

    def __del__(self):
        self.process_flag = False
        if self.process_thread:
            self.process_thread.join()

    def spawn(self):
        # create pipeline to add documents
        self.pipeline = queue.Queue(maxsize=self.q_maxsize)
        # create process thread
        self.process_thread = threading.Thread(target=self.process,
                                               args=(),
                                               daemon=True)
        # start process thread
        self.process_thread.start()
        # return self.pipeline

    def initAnnoy(self):
        # only do if no index loaded from disk
        if not self.modelLoaded:
            print('Annoy init index')
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)

        # Lock index reads/writes until it is built
        with self._lock:
            # build index
            build_ = self.a_index.build(self.n_trees)

            if build_:
                self.modelLoaded = self.saveModelToDisk()

        return self.modelLoaded

    def addVectors(self, documents):
        ids = []
        # add vectors
        for document in documents:
            # add document to queue
            self.pipeline.put_nowait(document)
            ids.append(document._id)
        return True, ids

    def process(self):
        while (self.process_flag):
            # print(list(self.pipeline.queue))

            # set a timeout till next vector indexing
            time.sleep(self.process_timeout_sec)

            # check if queue is not empty
            if self.pipeline.qsize() > 0:
                # Lock index reads/writes until it is built
                with self._lock:

                    # unbuild index first
                    self.a_index.unbuild()

                    # fetch all currently available documents from queue
                    while not self.pipeline.empty():
                        # extract document & contents
                        document = self.pipeline.get_nowait()
                        _id = document._id
                        vec = document.vector
                        vector_e = vec.e

                        # resize vectors
                        vector_e_l = len(vector_e)
                        # if the vector is shorter than the dimension limit,
                        # pad it with zeros up to the dimension
                        if vector_e_l < self.dim:
                            vector_e.extend([0] * (self.dim - vector_e_l))
                        # make sure vector length doesn't exceed dimension limit
                        vector_e = vector_e[:self.dim]

                        # add vector to index
                        self.a_index.add_item(int(_id), vector_e)
                        # keep a copy for disk storage
                        list_ = vector_e
                        list_.append(int(_id))
                        # append to disk proxy
                        if self.index_disk is None:
                            self.index_disk = np.array([list_], dtype=float)
                        else:
                            self.index_disk = np.append(self.index_disk,
                                                        [list_],
                                                        axis=0)

                    # rebuild the index
                    build_ = self.a_index.build(self.n_trees)

                # write to disk
                if build_:
                    self.modelLoaded = self.saveModelToDisk()

    def deleteVectors(self, ids):

        return True, ids

    def getNearest(self, matrix, k):
        ids = []
        dists = []

        # Lock index reads/writes during nearest-neighbour search
        with self._lock:
            for vec_data in matrix:
                if self.search_k != -1:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, k, self.search_k, include_distances=True)
                else:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, k, include_distances=True)
                ids.append(_id)
                dists.append(_dist)

        return True, ids, dists

    def loadModelFromDisk(self):
        try:
            # prepare new index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # read index
            self.index_disk = np.load(model_location + '.npy')
            # build Annoy Index
            for vec_ in self.index_disk.tolist():
                self.a_index.add_item(int(vec_[-1]), vec_[0:-1])
            # build index
            build_ = self.a_index.build(self.n_trees)
            print('Annoy index loading success')
            return True
        except Exception as e:
            print('Annoy index loading failed: ', e)
            return False

    def saveModelToDisk(self):
        try:
            # write index
            np.save(model_location, self.index_disk)
            print('Annoy index writing success')
            return True
        except Exception as e:
            print('Annoy index writing failed: ', e)
            return False
Example #7
class Annoy:
    def __init__(self):
        self.total = 0
        # keeps track of all inserted vectors
        # so they can be saved to disk and retrieved later
        self.index_disk = None
        try:
            with open('DB_config.yml', 'r') as stream:
                DB_config = yaml.safe_load(stream)
                self.dim = int(
                    os.getenv('FIXED_VEC_DIMENSION',
                              DB_config['annoy']['init']['vd']))
                self.sim_metric = os.getenv(
                    'ANNOY_SIM_METRIC', DB_config['annoy']['init']['smetric'])
                self.n_trees = int(
                    os.getenv('ANNOY_NTREES',
                              DB_config['annoy']['init']['ntrees']))
                self.modelLoaded = self.loadModelFromDisk()
        except Exception as e:
            print('Error initializing Annoy: ', e)

    def initAnnoy(self):
        # only do if no index loaded from disk
        if not self.modelLoaded:
            print('Annoy init index')
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)

        # build index
        build_ = self.a_index.build(self.n_trees)

        if build_:
            self.modelLoaded = self.saveModelToDisk()
        return self.modelLoaded

    def addVectors(self, documents):
        # unbuild index first
        self.a_index.unbuild()
        self.total = self.total + len(documents)
        ids = []
        # add vectors
        for document in documents:
            _id = document._id
            vec = document.vector
            ids.append(_id)
            vector_e = vec.e
            vector_e_l = len(vector_e)
            # if the vector is shorter than the dimension limit,
            # pad it with zeros up to the dimension
            if vector_e_l < self.dim:
                vector_e.extend([0] * (self.dim - vector_e_l))
            # make sure vector length doesn't exceed dimension limit
            vector_e = vector_e[:self.dim]

            # add vector
            self.a_index.add_item(int(_id), vector_e)
            # keep a copy for disk storage
            list_ = vector_e
            list_.append(int(_id))
            if self.index_disk is None:
                self.index_disk = np.array([list_], dtype=float)
            else:
                self.index_disk = np.append(self.index_disk, [list_], axis=0)

        # rebuild the index
        build_ = self.a_index.build(self.n_trees)
        if build_:
            self.modelLoaded = self.saveModelToDisk()
        return self.modelLoaded, ids

    def deleteVectors(self, ids):

        return True, ids

    def getNearest(self, matrix, k):
        ids = []
        dists = []

        for vec_data in matrix:
            _id, _dist = self.a_index.get_nns_by_vector(vec_data,
                                                        k,
                                                        include_distances=True)
            ids.append(_id)
            dists.append(_dist)

        return True, ids, dists

    def loadModelFromDisk(self):
        try:
            # prepare new index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # read index
            self.index_disk = np.load(model_location + '.npy')
            # build Annoy Index
            for vec_ in self.index_disk.tolist():
                self.a_index.add_item(int(vec_[-1]), vec_[0:-1])
            # build index
            build_ = self.a_index.build(self.n_trees)
            print('Annoy index loading success')
            return True
        except Exception as e:
            print('Annoy index loading failed')
            return False

    def saveModelToDisk(self):
        try:
            # write index
            np.save(model_location, self.index_disk)
            print('Annoy index writing success')
            return True
        except Exception as e:
            print('Annoy index writing failed: ', e)
            return False
Example #8
class Annoy:
    def __init__(self):
        self.dim = 300
        self.sim_metric = 'angular'
        self.n_trees = 10
        self.search_k = 1
        self.modelLoaded = False # self.loadModelFromDisk(model_location)

    def initAnnoy(self, dim, metric, matrix):
        self.sim_metric = metric
        self.dim = dim

        print('Annoy init index')
        self.a_index = AnnoyIndex(self.dim, self.sim_metric)
        build_ = self.a_index.build(self.n_trees)

        #if build_:
        #    self.modelLoaded = self.saveModelToDisk(model_location, self.a_index)
        return build_ #self.modelLoaded

    def addVectors(self, documents):
        ids = []
        # unbuild annoy index before adding new data
        self.a_index.unbuild()
        # add vectors
        for document in documents:
            _id = document._id
            vec = document.vector
            ids.append(_id)
            vector_e = vec.e
            vector_e_l = len(vector_e)
            # if the vector is shorter than the dimension limit,
            # pad it with zeros up to the dimension
            if vector_e_l < self.dim:
                vector_e.extend([0]*(self.dim-vector_e_l))
            # make sure vector length doesn't exceed dimension limit
            vector_e = vector_e[:self.dim]
        
            # add vector
            self.a_index.add_item(int(_id), vector_e)
            
        # rebuild the index
        build_ = self.a_index.build(self.n_trees)
        # if build_:
            # self.modelLoaded = self.saveModelToDisk(model_location, self.a_index)
        return build_, ids

    def deleteVectors(self, ids):

        return True, ids

    def getNearest(self, matrix, k):
        ids = []
        dists = []

        for vec_data in matrix:
            _id, _dist = self.a_index.get_nns_by_vector(vec_data, k, search_k=self.search_k, include_distances=True)
            ids.append(_id)
            dists.append(_dist)

        return True, ids, dists

    def loadModelFromDisk(self, location):
        try:
            # read index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            self.a_index.load(location)
            print('Annoy index loading success')
            return True
        except Exception as e:
            print('Annoy index loading failed: ', e)
            return False

    def saveModelToDisk(self, location, index):
        try:
            # write index
            index.save(location)
            print('Annoy index writing success')
            return True
        except Exception as e:
            print('Annoy index writing failed: ', e)
            return False
Example #9
# query vectors count
num_queries = query_vecs.shape[0]

# create report file
with open(result_csv_file, 'w') as csvOut:

    writer = csv.writer(csvOut)
    # write header
    writer.writerow(['Number of trees',
                     'Indexing + Build time(sec)', 'Indexing + Build memory(GB)',
                     'Test time(sec)', 'Correct(Percent)'])

    for num_trees in num_trees_vals:

        annoy.unbuild()

        print('Building annoy with %d tree(s)...' % num_trees)
        sys.stdout.flush()

        # Record used memory at the start time
        memStart = mem_usage_gb()
        # Record the start time
        tmStart = time.time()

        # Build annoy tree
        annoy.build(num_trees)

        # Calculate build time
        tmBuild = time.time() - tmStart
        # Calculate used memory for build
Example #10
class alpha_KNN:
    def __init__(self,
                 capacity,
                 key_dimension,
                 delta=0.001,
                 alpha=0.1,
                 batch_size=1000):
        self.capacity = capacity
        self.curr_capacity = 0
        self.delta = delta
        self.alpha = alpha

        self.embeddings = np.zeros((capacity, key_dimension))
        self.values = np.zeros(capacity)

        self.weights = np.zeros(capacity)

        from annoy import AnnoyIndex
        # keys are indexed with their weight appended as one extra component
        self.index = AnnoyIndex(key_dimension + 1, metric='euclidean')
        self.index.set_seed(123)

        self.min_update_size = batch_size
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.built_capacity = 0

    def _nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key + [1.0],
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, indices):
        self.cached_keys = self.cached_keys + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            self._rebuild_index()

    def _rebuild_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_key = self.cached_keys[i]
            new_value = self.cached_values[i]
            new_weight = self.weights[ind]  # set when the key was added
            self.embeddings[ind] = new_key
            self.values[ind] = new_value
            self.index.add_item(ind, new_key + [new_weight])

        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def queryable(self, k):
        return (self.built_capacity > k)

    # Returns the stored embeddings and values of the closest embeddings
    def query(self, keys, k):
        _, indices = self._nn(keys, k)

        embs = []
        values = []
        weights = []
        for ind in indices:
            embs.append(self.embeddings[ind])
            values.append(self.values[ind])
            weights.append(self.weights[ind])

        return embs, values, weights

    # Adds new embeddings (and values) to the dictionary
    def add(self, keys, values):

        if self.queryable(5):
            dists, inds = self._nn(keys, k=5)
            for ind, dist in enumerate(dists):
                for i, d in enumerate(dist):
                    index = inds[ind][i]
                    self.weights[index] *= (1 - self.alpha)

        indices, keys_, values_ = [], [], []
        for i, _ in enumerate(keys):
            if self.curr_capacity >= self.capacity:
                # evict the lowest-weight entry
                index = np.argmin(self.weights)
            else:
                index = self.curr_capacity
                self.curr_capacity += 1
            self.weights[index] = 1.0
            indices.append(index)
            keys_.append(keys[i])
            values_.append(values[i])

        self._insert(keys_, values_, indices)
Example #11
class annoy_dict(LRU_KNN):
    def __init__(self,
                 capacity,
                 key_dimension,
                 delta=0.001,
                 alpha=0.1,
                 batch_size=100):
        LRU_KNN.__init__(self, capacity, key_dimension, delta, alpha)

        from annoy import AnnoyIndex
        self.index = AnnoyIndex(key_dimension, metric='euclidean')
        self.index.set_seed(123)

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.built_capacity = 0

    def _nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key,
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, indices):
        self.cached_keys = self.cached_keys + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_key = self.cached_keys[i]
            new_value = self.cached_values[i]
            self.embeddings[ind] = new_key
            self.values[ind] = new_value
            self.index.add_item(ind, new_key)

        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, emb in enumerate(self.embeddings[:self.curr_capacity]):
            self.index.add_item(ind, emb)
        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def queryable(self, k):
        return (LRU_KNN.queryable(self, k) and (self.built_capacity > k))
Example #12
class Index:
    """Procedures over multidimensional spaces."""

    def __init__(self, size, data_dir=None, trees=.001, volatile=False):
        """
        Indexing tensors operations and nearest neighbours search.

        Parameters
        ----------
        size: int
            Shape of unidimensional vectors which will be indexed

        data_dir: str
            Location where to load or save the index

        trees (optional): float
            Defines the number of trees to create based on the dataset
            size. Should be a number between 0 and 1.

        volatile (optional): bool
            If the index will be temporary or not.
        """
        self._position = -1
        self._size = size
        self._data_dir = data_dir
        self._trees = trees
        self._volatile = volatile

        if self._data_dir and not self._volatile:
            if os.path.isfile(self._data_dir):
                raise OSError('data_dir parameter is not a directory')

            os.makedirs(self._data_dir, exist_ok=True)
            self._path = os.path.join(self._data_dir, self.index_name)
        elif not self._data_dir and not self._volatile:
            raise NoDataDirForPermanentIndex
        elif not self._data_dir and self._volatile:
            _temp_file = FileIO.safe_temp_file()
            self._data_dir = os.path.dirname(_temp_file)
            self._path = _temp_file

        else:
            raise DataDirDefinedForVolatileIndex

        if os.path.isfile(self._path):
            try:
                self.tree = AnnoyIndex(size, metric='angular')

                self.tree.load(self._path)

                self._is_new_index = False
            except OSError as os_error:
                raise FileIsNotAnIndex from os_error
        else:
            self.tree = AnnoyIndex(size, metric='angular')
            self._is_new_index = True

        self._image_database = ImageDatabase(
            import_images=True,
            data_dir=self._data_dir,
        )

    @property
    def size(self):
        """Getter for property size."""
        return self._size

    @property
    def path(self):
        """Getter for property path."""
        return self._path

    @property
    def index_name(self):
        """Getter for property index_name."""
        return 'pupyl.index'

    @property
    def trees(self):
        """Getter for property trees."""
        return self._trees

    @property
    def volatile(self):
        """Getter for property volatile."""
        return self._volatile

    @trees.setter
    def trees(self, trees):
        """Setter for property trees."""
        self._trees = trees

    def __enter__(self):
        """Context opening index."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context closing index."""
        if not exc_type:

            if self._is_new_index:
                self.tree.build(self.size << intmul >> self.trees)

                self.tree.save(self.path)

            self.tree.unload()

    def items(self):
        """Return the indexed items."""
        for item in range(len(self)):
            yield item

    def values(self):
        """Return the indexed values."""
        for item in self.items():
            yield self.tree.get_item_vector(item)

    def items_values(self):
        """Return tuples with all items and values."""
        for item, value in zip(self.items(), self.values()):
            yield item, value

    def __getitem__(self, position):
        """Return item at index. Supports negative slicing."""
        if position >= 0:
            return self.tree.get_item_vector(position)

        return self.tree.get_item_vector(
            len(self) - abs(position)
        )

    def refresh(self):
        """Update all information regarding index file."""
        self.tree.unload()
        self.tree.load(self.path)

    def append(self, tensor, check_unique=False):
        """
        Insert a new tensor at the end of the index.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to insert into index.

        check_unique (optional, default: False): bool
            Defines if the append method should verify the existence
            of a very similar tensor in the current index; in other
            words, it checks the uniqueness of the value. Be advised
            that this check adds overhead to the append process.
        """
        if sum(tensor) == 0.:
            raise NullTensorError

        if self._is_new_index:

            index_it = True

            if check_unique and len(self) > 1:

                self.tree.build(self.size << intmul >> self.trees)

                result = self.item(
                    self.index(tensor),
                    top=1,
                    distances=True
                )

                if result[1][0] <= .05:
                    warning(
                        'Tensor being indexed already exists in '
                        'the database and the check for duplicates '
                        'is on. Refusing to store this tensor again.'
                    )

                    index_it = False

                self.tree.unbuild()

            if index_it:
                self.tree.add_item(len(self), tensor)

        else:

            with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
                for value in self.values():
                    tmp_idx.append(value, check_unique)

                tmp_idx.append(tensor, check_unique)

                _temp_file = tmp_idx.path

            move(_temp_file, self.path)

            self.refresh()

    def remove(self, position):
        """
        Remove the tensor at index from the database.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position: int
            The index which must be removed
        """
        if self._is_new_index:
            raise IndexNotBuildYet

        if position >= len(self):
            raise IndexError

        with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
            shrink = False

            for item, value in self.items_values():
                if item == position:
                    shrink = True
                else:
                    if shrink:
                        item -= 1

                    tmp_idx.tree.add_item(item, value)

            _temp_file = tmp_idx.path

        move(_temp_file, self.path)

        self.refresh()

    def pop(self, position=None):
        """
        Pop-out the index at position, returning it.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position (optional) (default: last position): int
            Removes and returns the value at position.

        Returns
        ----------
        list:
            The popped item's vector.
        """
        if position is not None:
            value = self[position]
        else:
            inverse_index = -1
            value = self[inverse_index]
            position = len(self) + inverse_index

        self.remove(position)

        return value

    def index(self, tensor):
        """
        Search for the single most similar image to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar.

        Returns
        ----------
        int:
            The index of the most similar item.
        """
        return self.tree.get_nns_by_vector(tensor, n=1)[0]

    def item(self, position, top=10, distances=False):
        """
        Search the index using an internal position

        Parameters
        ----------
        position: int
            The item id within index.

        top (optional, default 10): int
            How many similar items should be returned.

        distances (optional, default False): bool
            If True, the distances between items are
            also returned.

        Returns
        -------
        if distances is True:
            list of tuples:
                Containing pairs of item and distances
        else:
            list:
                Containing similar items.
        """
        return self.tree.get_nns_by_item(
            position,
            top,
            include_distances=distances
        )

    def search(self, tensor, results=16):
        """
        Search for the most similar images to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar images.

        results: int
            How many results to return. If fewer similar images exist
            than requested, only the actual total is returned.
        """
        for result in self.tree.get_nns_by_vector(tensor, n=results):
            yield result

    def __len__(self):
        """Return how many items are indexed."""
        return self.tree.get_n_items()

    def __iter__(self):
        """Return an iterable."""
        for value in self.values():
            yield value

    def __next__(self):
        """Iterate over the iterable."""
        self._position += 1

        all_values = list(self.values())

        if self._position < len(all_values):
            return all_values[self._position]

        raise StopIteration

    def group_by(self, top=10, **kwargs):
        """
        Yield, for every item (or for one given position), the other
        elements in the index that are most similar to it.

        Parameters
        ----------
        top (optional, default 10): int
            How many similar internal images should be returned

        position (optional): int
            Returns the groups based on a specified position.

        Returns
        -------
        list:
            If a position is defined

        or

        dict:
            Generator with a dictionary containing internal ids
            as key and a list of similar images as values.
        """
        position = kwargs.get('position')

        if len(self) <= 1:
            raise EmptyIndexError

        if top >= 1:
            if isinstance(position, int):

                results = self.item(position, top + 1)

                if len(results) > 1:

                    yield results[1:]

            else:

                for item in self.items():

                    yield {
                        item: self.item(
                            item,
                            top + 1
                        )[1:]
                    }
        else:

            raise TopNegativeOrZero

    def export_by_group_by(self, path, top=10, **kwargs):
        """
        Saves images, creating directories, based on their groups.

        Parameters
        ----------
        path: str
            Place to create the directories and export images

        top (optional, default 10):
            How many similar internal images should be returned

        position (optional): int
            Returns the groups based on a specified position.
        """
        for element in FileIO.progress(
            self.group_by(
                top=top,
                position=kwargs.get('position')
            )
        ):
            if isinstance(element, dict):
                item = [*element.keys()][0]
                similars = element[item]
            elif isinstance(element, list):
                item = kwargs['position']
                similars = element

            save_path = os.path.join(
                path,
                str(item)
            )

            os.makedirs(
                save_path,
                exist_ok=True
            )

            try:
                copyfile(
                    self._image_database.mount_file_name(
                        item,
                        'jpg'
                    ),
                    os.path.join(
                        save_path,
                        'group.jpg'
                    )
                )
            except FileNotFoundError:
                continue

            for rank, similar in enumerate(similars):

                original_file_path = self._image_database.mount_file_name(
                    similar,
                    'jpg'
                )

                try:
                    copyfile(
                        original_file_path,
                        os.path.join(
                            save_path,
                            f'{rank + 1}.jpg'
                        )
                    )
                except FileNotFoundError:
                    continue
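
The `size << intmul >> trees` expression used in __exit__ and append relies on a custom infix operator defined elsewhere in pupyl; it effectively computes int(size * trees). A simplified sketch of how such a helper can be written (names and details here are assumptions, not pupyl's actual code):

class Infix:
    """Lets a two-argument function be used as `a << op >> b` (sketch)."""

    def __init__(self, func):
        self.func = func

    def __rlshift__(self, left):
        # `left << op` binds the left operand
        return Infix(lambda right: self.func(left, right))

    def __rshift__(self, right):
        # `op >> right` applies the bound function
        return self.func(right)


intmul = Infix(lambda x, y: int(x * y))

assert 1000 << intmul >> .01 == 10  # e.g. 1000 items, trees=.01 -> 10 trees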
Example #13
class LRU_KNN:
    def __init__(self, capacity, key_dim, value_dim, batch_size):
        self.capacity = capacity
        self.curr_capacity = 0

        self.states = np.zeros((capacity, key_dim))
        self.values = np.zeros((capacity, value_dim))
        self.lru = np.zeros(capacity)
        self.tm = 0.0

        self.index = AnnoyIndex(key_dim, metric="euclidean")
        self.index.set_seed(123)

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key,
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def query(self, keys, k):
        _, indices = self.nn(keys, k)
        states = []
        values = []

        for ind in indices:
            self.lru[ind] = self.tm
            states.append(self.states[ind])
            values.append(self.values[ind])
        self.tm += 0.001
        return states, values

    def _insert(self, keys, values, indices):
        self.cached_states = self.cached_states + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_states) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_state = self.cached_states[i]
            new_value = self.cached_values[i]

            self.states[ind] = new_state
            self.values[ind] = new_value
            self.index.add_item(ind, new_state)

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.built_capacity = self.curr_capacity
Example #14
class Memory:
    def __init__(self, capacity, state_dim, value_dim):
        self.capacity = capacity
        print("state_dim:", state_dim)
        self.states = np.zeros((capacity, state_dim))
        self.values = np.zeros((capacity, value_dim))

        self.curr_capacity = 0
        self.curr_ = 0
        self.lru = np.zeros(capacity)
        self.tm = 0

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index = AnnoyIndex(state_dim)
        self.index.set_seed(123)
        self.update_size = 1
        self.build_capacity = 0

    def sample_knn_test(self, state, k):
        inds, dists = self.index.get_nns_by_vector(state,
                                                   k,
                                                   include_distances=True)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample_knn(self, states, k):
        dists = []
        inds = []
        for state in states:
            ind, dist = self.index.get_nns_by_vector(state,
                                                     k,
                                                     include_distances=True)
            inds.append(ind)
            dists.append(dist)
        # inds = np.reshape(np.array(inds), -1)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample(self, n_samples):
        if self.curr_capacity < n_samples or n_samples == 0:
            idx = np.random.choice(np.arange(len(self.states)),
                                   n_samples,
                                   replace=False)
        else:
            idx = np.random.choice(np.arange(self.curr_capacity),
                                   n_samples,
                                   replace=False)
        self.tm += 0.01
        self.lru[idx] = self.tm
        embs = self.states[idx]
        values = self.values[idx]

        return embs, values

    def add_knn(self, states, values):
        self._add_knn(states, values)

    def add_knn_lru(self, states, values):
        self._add_knn(states, values, lru=True)

    def add(self, states, values):
        self._add(states, values)

    def add_lru(self, states, values):
        self._add(states, values, lru=True)

    def add_rand(self, states, values):
        self._add(states, values, rand=True)

    def _insert(self, states, values, indices):
        self.cached_states = self.cached_states + states
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_states) >= self.update_size:
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            self.states[ind] = self.cached_states[i]
            self.values[ind] = self.cached_values[i]
            self.index.add_item(ind, self.cached_states[i])

        self.index.build(50)
        self.build_capacity = self.curr_capacity

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _add_knn(self, states, values, lru=False):
        # print(states)
        indices = []
        states_ = []
        values_ = []
        for i, _ in enumerate(states):
            if lru:
                if self.curr_capacity >= self.capacity:
                    ind = np.argmin(self.lru)
                else:

                    ind = self.curr_capacity
                    self.curr_capacity += 1
            else:
                if self.curr_capacity >= self.capacity:
                    self.curr_ = (self.curr_ + 1) % self.capacity
                    ind = self.curr_
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1

            self.lru[ind] = self.tm
            indices.append(ind)
            states_.append(states[i])
            values_.append(values[i])
        self._insert(states_, values_, indices)

    def _add(self, states, values, rand=False, lru=False):
        for i, state in enumerate(states):
            if self.curr_capacity < self.capacity:
                self.curr_ = (self.curr_ + 1) % self.capacity
                self.curr_capacity += 1
            else:
                if lru:
                    self.curr_ = np.argmin(self.lru)
                if rand:
                    self.curr_ = np.random.choice(np.arange(
                        self.curr_capacity),
                                                  1,
                                                  replace=False)

                if not lru and not rand:
                    self.curr_ = (self.curr_ + 1) % self.capacity
            self.states[self.curr_] = state
            self.values[self.curr_] = values[i]

    @property
    def length(self):
        # assert self.index.get_n_items() == self.curr_capacity
        # return self.curr_capacity
        return self.index.get_n_items()
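
A hypothetical usage sketch of Memory (sizes are illustrative; note that update_size is 1, so every add_knn call triggers an unbuild/rebuild, and the constructor relies on an Annoy version that still accepts AnnoyIndex without an explicit metric, as the class above does):

import numpy as np

mem = Memory(capacity=100, state_dim=4, value_dim=1)

states = [np.random.randn(4).tolist() for _ in range(3)]
values = [[float(i)] for i in range(3)]
mem.add_knn(states, values)            # inserts and rebuilds the index

neighbours, vals, dists = mem.sample_knn([states[0]], k=2)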
Example #16
class Annoy:
    def __init__(self, model_location):
        # set model location
        self.model_location = model_location

        # to keep the thread & queue running
        self.process_flag = True
        self.q_maxsize = int(os.environ["FIXED_Q_LEN"])
        self.build_batch_size = int(os.environ["FIXED_Q_LEN"])
        self.process_thread = None
        self._lock = threading.Lock()
        self.process_timeout_sec = int(os.environ["THREAD_SLEEP"])  # seconds
        # keeps track of all inserted vectors
        # so they can be saved to disk and retrieved later
        self.index_disk = None
        try:
            # make sure to parse env variables to their expected type
            self.dim = int(os.environ["FIXED_VEC_DIMENSION"])
            self.sim_metric = str(os.environ["ANNOY_SIM_METRIC"])
            self.n_trees = int(os.environ["ANNOY_NTREES"])
            self.search_k = int(os.environ["ANNOY_SK"])
            self.model_loaded = self.load_model_from_disk()
            if not self.model_loaded:
                if self.init_annoy():
                    logging.debug("Annoy Init done")
                else:
                    logging.debug("Annoy Init Failed")
        except Exception as e:
            logging.error('Error initializing Annoy: %s', e)

        # spawn process thread
        self.spawn()

    def __del__(self):
        self.process_flag = False
        if self.process_thread:
            self.process_thread.join()

    def spawn(self):
        # create pipeline to add documents
        self.pipeline = queue.Queue(maxsize=self.q_maxsize)
        # create process thread
        self.process_thread = threading.Thread(target=self.process,
                                               args=(),
                                               daemon=True)
        # start process thread
        self.process_thread.start()
        # return self.pipeline

    def init_annoy(self):
        # only do if no index loaded from disk
        if not self.model_loaded:
            logging.debug('Annoy init index')
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)

        # Lock index reads/writes until it is built
        with self._lock:
            # build index
            build_ = self.a_index.build(self.n_trees)

            if build_:
                self.model_loaded = self.save_model_to_disk()

        return self.model_loaded

    def add_vectors(self, documents):
        # add documents to queue
        self.pipeline.put({"action": "add", "docs": documents})

        return True

    def process(self):
        while (self.process_flag):
            # set a timeout till next vector indexing
            time.sleep(self.process_timeout_sec)

            # check if queue is not empty
            if self.pipeline.qsize() > 0:
                # Lock index reads/writes until it is built
                with self._lock:

                    # unbuild index first
                    self.a_index.unbuild()
                    len_documents = 0

                    # fetch all currently available documents from queue
                    while not self.pipeline.empty():
                        # extract document & contents
                        qitem = self.pipeline.get_nowait()
                        if qitem["action"] == "add":
                            documents = qitem["docs"]
                            len_documents += len(documents)
                            for document in documents:
                                _id = document["_id"]
                                vector_e = document["code"]

                                # add vector to index
                                self.a_index.add_item(int(_id), vector_e)
                                # append to disk proxy
                                if self.index_disk is None:
                                    self.index_disk = np.array(
                                        [vector_e + [int(_id)]], dtype=float)
                                else:
                                    self.index_disk = np.append(
                                        self.index_disk,
                                        [vector_e + [int(_id)]],
                                        axis=0)
                        elif qitem["action"] == "delete":
                            ids = qitem["ids"]
                            len_documents += len(ids)
                            # reset
                            zero_ = np.zeros(self.dim + 1)
                            for id_ in ids:
                                # add zero vector to index
                                self.a_index.add_item(int(id_),
                                                      zero_[:-1].tolist())
                            # reset npy disk array
                            if self.index_disk is not None:
                                self.index_disk[ids] = zero_

                        # stop draining the queue once a full batch has been indexed
                        if len_documents > self.build_batch_size:
                            break

                    # build vector
                    build_ = self.a_index.build(self.n_trees, n_jobs=-1)

                # write to disk
                if build_:
                    self.model_loaded = self.save_model_to_disk()

    def delete_vectors(self, ids):
        # add documents to queue
        self.pipeline.put({"action": "delete", "ids": ids})

        return True, ids

    def get_nearest_k(self, matrix, k):
        ids = []
        dists = []

        # Lock index reads/writes during nearest-neighbour search
        with self._lock:
            for vec_data in matrix:
                if self.search_k != -1:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, k, self.search_k, include_distances=True)
                else:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, k, include_distances=True)
                ids.append(_id)
                dists.append(_dist)

        return ids, dists

    def get_nearest_rad(self, matrix, rad):
        ids = []
        dists = []

        # Annoy has no native radius query; as a stop-gap this treats
        # `rad` as a neighbour count, mirroring get_nearest_k (see the
        # distance-filtering sketch after this class)
        # Lock index reads/writes during nearest-neighbour search
        with self._lock:
            for vec_data in matrix:
                if self.search_k != -1:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, rad, self.search_k, include_distances=True)
                else:
                    _id, _dist = self.a_index.get_nns_by_vector(
                        vec_data, rad, include_distances=True)
                ids.append(_id)
                dists.append(_dist)

        return ids, dists

    def load_model_from_disk(self):
        try:
            # prepare new index
            self.a_index = AnnoyIndex(self.dim, self.sim_metric)
            # read index
            self.index_disk = np.load(self.model_location + '.npy')
            # build Annoy Index
            for vec_ in self.index_disk.tolist():
                self.a_index.add_item(int(vec_[-1]), vec_[0:-1])
            # build index
            build_ = self.a_index.build(self.n_trees)
            logging.debug('Annoy index loading success')
            return True
        except Exception as e:
            logging.debug('Annoy index loading failed. Creating new index..')
            return False

    def save_model_to_disk(self):
        try:
            # write index
            np.save(self.model_location, self.index_disk)
            logging.debug('Annoy index writing success')
            return True
        except Exception as e:
            logging.error('Annoy index writing failed: %s', e)
            return False
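
Annoy only supports k-nearest-neighbour queries, so a true radius query has to be emulated by over-fetching and filtering on distance. A hedged sketch of such a helper (hypothetical; not part of the class above):

def knn_within_radius(a_index, vector, rad, overfetch=100, search_k=-1):
    """Approximate a radius query: fetch `overfetch` nearest
    neighbours, then keep only those within distance `rad`."""
    ids, dists = a_index.get_nns_by_vector(
        vector, overfetch, search_k=search_k, include_distances=True)
    within = [(i, d) for i, d in zip(ids, dists) if d <= rad]
    return [i for i, _ in within], [d for _, d in within]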
Example #17
class Annoy_Dict(LRU_KNN_ANNOY):
    def __init__(self, config):
        super(Annoy_Dict, self).__init__(config)
        self.config = config
        self.key_dim = self.config.knn_key_dim

        self.index = AnnoyIndex(self.key_dim, metric='euclidean')
        self.index.set_seed(123)

        self.initial_update_size = self.config.knn_dict_update_step
        self.min_update_size = self.initial_update_size

        self.cached_embs = []
        self.cached_vals = []
        self.cached_terminals = []
        self.cached_embs_next = []
        self.cached_indices = []

        self.build_capacity = 0

    def _nn(self, keys, k):
        assert np.ndim(keys) == 2
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key,
                                                     k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, terminal, keys_next, indices):
        self.cached_embs = self.cached_embs + keys
        self.cached_vals = self.cached_vals + values
        self.cached_terminals = self.cached_terminals + terminal
        self.cached_embs_next = self.cached_embs_next + keys_next
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            # self.min_update_size = max(self.initial_update_size, self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_emb = self.cached_embs[i]
            new_val = self.cached_vals[i]
            new_t = self.cached_terminals[i]
            new_emb_next = self.cached_embs_next[i]
            self.embs[ind] = new_emb
            self.values[ind] = new_val
            self.terminal[ind] = new_t
            self.embs_next[ind] = new_emb_next
            self.index.add_item(ind, new_emb)
        self.cached_embs = []
        self.cached_vals = []
        self.cached_terminals = []
        self.cached_embs_next = []
        self.cached_indices = []

        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _rebuild(self):
        self.index.unbuild()
        for ind, emb in enumerate(self.embs[:self.curr_capacity]):
            self.index.add_item(ind, emb)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def queryable(self, k):
        return (LRU_KNN_ANNOY.queryable(self, k) and (self.build_capacity > k))

    @property
    def capacity_(self):
        # print("self.index.get_n_items: ", self.index.get_n_items())
        return self.index.get_n_items()