Example #1
def retrieve_and_build(self):

        """
        Retrieve embeddings stored in HDF5 files, then build and save the index.
        :return: None
        """

        try:
            embs = self.retrieve_embeddings()

            self.logger.debug("Creating annoy index. Embeddings count: %s.", len(embs))
            t = AnnoyIndex(self.f_dim)
            for i in range(0, len(embs)):
                t.add_item(i, embs[i])

            trees_num = 10  # Number of trees in the index
            t.build(trees_num)

            self.logger.debug("Building annoy index.")

            t.save(os.path.join(self.organization_dir, "lshforest_" + self.embedding_type + ".ann"))

            self.logger.debug("Saved annoy index.")
            t.unload()

        except Exception:
            message = "Failed to build recommendation model."
            self.logger.exception(message)
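For reference, the core Annoy lifecycle used throughout these examples is: add items, build, save, load (which memory-maps the file), query, unload. A minimal self-contained sketch (the dimension, file name and item count are arbitrary):

import random

from annoy import AnnoyIndex

f_dim = 40  # arbitrary vector dimensionality
index = AnnoyIndex(f_dim, 'angular')
for i in range(1000):
    index.add_item(i, [random.gauss(0, 1) for _ in range(f_dim)])

index.build(10)          # 10 trees; more trees give better recall but a bigger file
index.save('demo.ann')   # write the index to disk
index.unload()           # release the index memory

query = AnnoyIndex(f_dim, 'angular')
query.load('demo.ann')   # mmaps the file, so loading is cheap
print(query.get_nns_by_item(0, 10))  # the 10 nearest neighbours of item 0
query.unload()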
Example #2
import os

from annoy import AnnoyIndex

# ann_add: directory containing the per-label .ann files (defined elsewhere)

def combine_labels(names_list, annoyfeature):
    ann_considered = []
    ret_list = []
    distances_considered = {}
    distances = []
    print('in function', annoyfeature.shape)
    f = 360 + 512  # combined feature dimensionality

    print('annoy made')
    ann_names = os.listdir(ann_add)
    for ann_n in ann_names:
        ann_considered.append(ann_n.split('.')[0])
    print('considered')
    for ann_names3 in ann_considered:
        if ann_names3 in names_list:
            ret_list.append(ann_names3)
    print('list is', ret_list)
    for rets in ret_list:
        ann_load = os.path.join(ann_add, rets + ".ann")
        print('loading is', ann_load)
        t2 = AnnoyIndex(f, metric='angular')
        t2.load(ann_load)
        # The query is a feature vector, so search by vector rather than by
        # item id (get_nns_by_item expects an integer index).
        temp_dist = t2.get_nns_by_vector(annoyfeature, 1, search_k=-1,
                                         include_distances=True)
        print('annoy dist', temp_dist)
        t2.unload()
        distances_considered[rets] = temp_dist
        distances.append(temp_dist)
    print('list and dic is')
    print(distances_considered, distances)
    return distances_considered, distances
Example #3
def annoy(als_model, user_truth, test_user, sc, n_trees=10, search_k=-1):
    print('creating annoy baseline with n_trees: ' + str(n_trees), 'search_k: ' + str(search_k))
    sc = SparkContext.getOrCreate()
    factors = als_model.userFactors
    size = factors.limit(1).select(F.size('features').alias('calculation')).collect()[0].calculation
    time_start = time()
    annoy_list = AnnoyIndex(size)
    for row in factors.collect():
        annoy_list.add_item(row.id, row.features)
    annoy_list.build(n_trees)
    annoy_list.save('./home/hj1325/final-project-final-project/annoy_list' + str(n_trees) + '_k_' + str(search_k) +
                    '.ann')
    recommend_list = [(user.user_label, annoy_list.get_nns_by_item(int(user.user_label), 500)) for user in
                      test_user.collect()]
    temp = sc.parallelize(recommend_list)
    print('recommendations has been created')
    recommend = spark.createDataFrame(temp, ['user_label', 'recommendation'])
    predictions = recommend.join(user_truth, recommend.user_label == user_truth.user_label, 'inner')

    score = predictions.select('recommendation', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(score)
    precision = metrics.precisionAt(500)
    mean_average_precision = metrics.meanAveragePrecision
    print('time taken: ' + str(time() - time_start))
    print('precision at 500: ' + str(precision))
    print('mean average precision: ' + str(mean_average_precision))
    annoy_list.unload()
Example #4
def annoy(alsmodel, groundTruth, testUsers, sc, n_trees=10, search_k=-1):
    print(f"annoy index version with n_trees: {n_trees}, search_k: {search_k}")
    sc = SparkContext.getOrCreate()
    userfactors = alsmodel.userFactors
    size = userfactors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    a = AnnoyIndex(size)
    for row in userfactors.collect():
        a.add_item(row.id, row.features)
    a.build(n_trees)
    a.save("./anns/annoy_t" + str(n_trees) + "_k_" + str(search_k) + ".ann")
    rec_list = [(u.userNew, a.get_nns_by_item(int(u.userNew), 500))
                for u in testUsers.collect()]
    temp = sc.parallelize(rec_list)
    print("created recs")
    rec = spark.createDataFrame(temp, ["userNew", "recs"])
    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew,
                           'inner')

    scoreAndLabels = predictions.select('recs', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    MAP = metrics.meanAveragePrecision
    print(f"time elapsed: {time()-start_time}s")
    print(f"precision at 500: {precision}")
    print(f"MAP: {MAP}")
    a.unload()
Example #5
 def updateAnnoy(self):
     annoy = AnnoyIndex(300)
     self_annoy = AnnoyIndex(300)
     i_counter = 0
     for group in Group.select():
         if i_counter % 100 == 0:
             logging.info("processed {}".format(i_counter))
         for sentence in group.sentences:
             try:
                 annoy.add_item(group.id,
                                self.getSentenceVec(sentence.text))
             except Exception as e:
                 # logging.exception(e)
                 # logging.info(sentence.text)
                 pass
         try:
             arr = [
                 self.getSentenceVec(reply.text) for reply in group.replys
             ]
             self_annoy.add_item(group.id, np.mean(arr, axis=0))
         except Exception as e:
             # logging.exception(e)
             pass
         i_counter += 1
     annoy.build(50)
     annoy.save(self.SENTENCE_ANN + ".new")
     annoy.unload()
     self_annoy.build(50)
     self_annoy.save(self.SELF_SENTENCE_ANN + ".new")
     self_annoy.unload()  # the second index must be unloaded too (was annoy.unload())
Example #6
def load_embeddings(index_path, embedding_size, num_nodes):
    # Load Annoy index which stores the embedded vectors
    index = AnnoyIndex(embedding_size)
    index.load(index_path)

    embeddings = [index.get_item_vector(i) for i in range(num_nodes)]

    # Unload the index to save memory (loading mmaps the index file)
    index.unload()

    # V x D matrix of embeddings
    return np.array(embeddings)
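A minimal usage sketch for load_embeddings, assuming the index on disk was built with 128-dimensional vectors for 10,000 nodes (both values are illustrative):

embeddings = load_embeddings('embeddings.ann', embedding_size=128, num_nodes=10000)
print(embeddings.shape)  # (10000, 128)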
Example #7
 def test_on_disk(self):
     f = 2
     i = AnnoyIndex(f, 'euclidean')
     i.on_disk_build('on_disk.ann')
     self.add_items(i)
     i.build(10)
     self.check_nns(i)
     i.unload()
     i.load('on_disk.ann')
     self.check_nns(i)
     j = AnnoyIndex(f, 'euclidean')
     j.load('on_disk.ann')
     self.check_nns(j)
Example #8
 def build(self, index_file, vectors, sender_urn):
     logger.info("Building {0}".format(index_file))
     logger.info("Vectors {0}".format(vectors))
     new_index = AnnoyIndex(self.feat_size, metric='euclidean')
     for idx, v in enumerate(vectors):
         logger.info("Adding item {0} with id {1}".format(v, idx))
         new_index.add_item(idx, v)
     new_index.build(self.n_trees)
     logger.info("Saving index file {0}".format(index_file))
     new_index.save(index_file)
     new_index.unload()
     pykka.ActorRegistry.get_by_urn(actor_urn=sender_urn).proxy().load()
     logger.info("Sent load command to worker")
Example #9
def get_nns(annoyindex_tempfile, img_list):
    # Calculates the nearest neighbors of the master item
    t = AnnoyIndex(DIMS, metric='angular')
    t.load(annoyindex_tempfile)
    list_of_thumb_nearest_neighbors = []
    for item in img_list:
        nearest_neighbors = t.get_nns_by_item(item, n_nearest_neighbors)
        thumb_nearest_neighbors = []
        for j in nearest_neighbors:
            thumb_nearest_neighbors.append(j)
        list_of_thumb_nearest_neighbors.append(thumb_nearest_neighbors)
    t.unload()
    return list_of_thumb_nearest_neighbors
Example #10
    def build_empty_index(self) -> None:

        """
        Build an empty placeholder index - used because rebuilding the index after deletions can take time.
        :return: None
        """

        t = AnnoyIndex(self.f_dim)

        trees_num = 10  # Number of trees in the index
        t.build(trees_num)
        t.save(os.path.join(self.organization_dir, "lshforest_" + self.embedding_type + ".ann"))
        print('organization ann has been created')
        t.unload()
Example #11
    def retrieve_and_build(self):

        """
        Build and save an empty index (no embeddings are retrieved here).
        :return: None
        """

        t = AnnoyIndex(self.f_dim)

        trees_num = 10  # Number of trees in the index
        t.build(trees_num)

        t.save(os.path.join(self.organization_dir, "lshforest.ann"))
        t.unload()
Example #12
def test_dense_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example #13
def annoy_model(als_model,
                sc,
                groundTruth_test,
                test_users,
                n_trees=10,
                search_k=-1):
    print(f"annoy model with n_trees: {n_trees}, search_k: {search_k}")

    sc = SparkContext.getOrCreate()

    user_factors = als_model.userFactors
    size = user_factors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    index_size = AnnoyIndex(size)

    for row in user_factors.collect():
        index_size.add_item(row.id, row.features)

    index_size.build(n_trees)
    index_size.save("./annoy_result/annoy_t" + str(n_trees) + "_k_" +
                    str(search_k) + ".ann")

    rec_list = [(user.user_id,
                 index_size.get_nns_by_item(int(user.user_id), 500))
                for user in test_users.collect()]

    temp = sc.parallelize(rec_list)

    print("Annoy-Recommendations (500) created for test users")

    rec = spark.createDataFrame(temp, ["user_id", "recommendations"])

    pred_test = rec.join(groundTruth_test,
                         rec.user_id == groundTruth_test.user_id, 'inner')

    predAndLabels_test_annoy = pred_test.select('recommendations',
                                                'test_truth').rdd.map(tuple)

    metrics_test_annoy = RankingMetrics(predAndLabels_test_annoy)
    precision_test_annoy = metrics_test_annoy.precisionAt(500)
    map_test_annoy = metrics_test_annoy.meanAveragePrecision

    print(f"Time taken: {time() - start_time}s")
    print(f"Precision at 500: {precision_test_annoy}")
    print(f"Mean Average Precision: {map_test_annoy}")

    index_size.unload()
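All three Spark variants above assume an ambient SparkSession named spark and a trained ALS model. A hedged setup sketch (the paths, column layouts and app name are placeholders, not from the original code):

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALSModel

spark = SparkSession.builder.appName('annoy-eval').getOrCreate()
sc = spark.sparkContext

als_model = ALSModel.load('path/to/als_model')          # previously trained ALS model
test_users = spark.read.parquet('path/to/test_users')   # one row per test user
ground_truth = spark.read.parquet('path/to/truth')      # user id -> list of true items

annoy_model(als_model, sc, ground_truth, test_users, n_trees=10, search_k=-1)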
Example #14
    def test_on_disk(self):
        f = 2
        i = AnnoyIndex(f, 'euclidean')
        i.on_disk_build('test.ann')
        i.add_item(0, [2, 2])
        i.add_item(1, [3, 2])
        i.add_item(2, [3, 3])

        i.build(10)
        i.unload()

        i.load('test.ann')

        self.assertEqual(i.get_nns_by_vector([4, 4], 3), [2, 1, 0])
        self.assertEqual(i.get_nns_by_vector([1, 1], 3), [0, 1, 2])
        self.assertEqual(i.get_nns_by_vector([4, 2], 3), [1, 2, 0])
Example #15
def build_index(embedding_files_pattern, index_filename, num_trees=100):

    annoy_index = AnnoyIndex(VECTOR_LENGTH, metric=METRIC)
    mapping = {}

    embed_files = tf.gfile.Glob(embedding_files_pattern)[:250]
    logging.info('{} embedding files are found.'.format(len(embed_files)))

    item_counter = 0
    for f, embed_file in enumerate(embed_files):
        logging.info('Loading embeddings in file {} of {}...'.format(
            f + 1, len(embed_files)))
        record_iterator = tf.python_io.tf_record_iterator(path=embed_file)

        for string_record in record_iterator:
            example = tf.train.Example()
            example.ParseFromString(string_record)
            string_identifier = example.features.feature[
                'id'].bytes_list.value[0]
            mapping[item_counter] = string_identifier
            embedding = np.array(
                example.features.feature['embedding'].float_list.value)
            annoy_index.add_item(item_counter, embedding)
            item_counter += 1

        logging.info('Loaded {} items to the index'.format(item_counter))

    logging.info('Start building the index with {} trees...'.format(num_trees))
    annoy_index.build(n_trees=num_trees)
    logging.info('Index is successfully built.')
    logging.info('Saving index to disk...')
    annoy_index.save(index_filename)
    logging.info('Index is saved to disk.')
    logging.info("Index file size: {} GB".format(
        round(os.path.getsize(index_filename) / float(1024**3), 2)))
    annoy_index.unload()
    logging.info('Saving mapping to disk...')
    with open(index_filename + '.mapping', 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info('Mapping is saved to disk.')
    logging.info("Mapping file size: {} MB".format(
        round(
            os.path.getsize(index_filename + '.mapping') / float(1024**2), 2)))
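A usage sketch for the TFRecord-based builder above; the glob pattern and output name are placeholders, and VECTOR_LENGTH and METRIC are the module constants the snippet already assumes:

import pickle

from annoy import AnnoyIndex

build_index('embeddings/embed-*.tfrecord', 'items.ann', num_trees=100)

# Query the saved index and translate internal positions back to string ids.
index = AnnoyIndex(VECTOR_LENGTH, metric=METRIC)
index.load('items.ann')
with open('items.ann.mapping', 'rb') as handle:
    mapping = pickle.load(handle)

neighbours = index.get_nns_by_item(0, 10)
print([mapping[i] for i in neighbours])
index.unload()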
Example #16
class face_annoy:
    def __init__(self):
        self.f = int(face_comm.get_conf('annoy', 'face_vector'))
        self.annoy_index_path = os.path.abspath(
            face_comm.get_conf('annoy', 'index_path'))
        self.lmdb_file = os.path.abspath(
            face_comm.get_conf('lmdb', 'lmdb_path'))
        self.num_trees = int(face_comm.get_conf('annoy', 'num_trees'))

        self.annoy = AnnoyIndex(self.f)
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # Build the annoy index from the lmdb file
    def create_index_from_lmdb(self):
        # Iterate over all lmdb entries
        lmdb_file = self.lmdb_file
        if os.path.isdir(lmdb_file):
            evn = lmdb.open(lmdb_file)
            wfp = evn.begin()
            annoy = AnnoyIndex(self.f)
            for key, value in wfp.cursor():
                str_list = key.split(',')
                key = int(str_list[0])
                name = str_list[1]
                value = face_comm.str_to_embed(value)
                annoy.add_item(key, value)

            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # Reload the index
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # Find similar faces by feature vector
    def query_vector(self, face_vector):
        n = int(face_comm.get_conf('annoy', 'num_nn_nearst'))
        return self.annoy.get_nns_by_vector(face_vector,
                                            n,
                                            include_distances=True)
Example #17
    def evaluate_set(prefix,
                     tails,
                     annoy_tree_file,
                     vector_dims,
                     lock,
                     rank_threshold=100,
                     sample_size=1000):

        #fname = ''.join(annoy_tree_file)
        lock.acquire()
        try:
            annoy_tree = AnnoyIndex(vector_dims)
            annoy_tree.load(annoy_tree_file)
        finally:
            lock.release()

        # annoy_tree = load_annoy_tree(annoy_tree_file, vector_dims)

        print(mp.current_process().name, id(annoy_tree),
              prefix.encode('utf-8'))
        sys.stdout.flush()

        counts = dict()
        counts[True] = 0
        counts[False] = 0

        if len(tails) > sample_size:
            tails = random.sample(tails, sample_size)
        for (comp1, tail1), (comp2, tail2) in itertools.combinations(tails, 2):

            diff = np.array(annoy_tree.get_item_vector(comp2)) - np.array(
                annoy_tree.get_item_vector(tail2))
            predicted = np.array(annoy_tree.get_item_vector(tail1)) + diff

            result = annoy_knn(annoy_tree, predicted, comp1, rank_threshold)

            counts[result] += 1

        annoy_tree.unload()  # unload() takes no arguments

        return (prefix, float(counts[True]) / (counts[True] + counts[False])
                ) if counts[True] + counts[False] > 0 else (prefix, 0.0)
Example #18
class face_annoy:

    def __init__(self):
        self.f                = int(face_comm.get_conf('annoy', 'face_vector'))
        self.annoy_index_path = os.path.abspath(face_comm.get_conf('annoy', 'index_path'))
        self.lmdb_file        = os.path.abspath(face_comm.get_conf('lmdb', 'lmdb_path'))
        self.num_trees        = int(face_comm.get_conf('annoy', 'num_trees'))

        self.annoy = AnnoyIndex(self.f)
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # Build the annoy index from the lmdb file
    def create_index_from_lmdb(self):
        # Iterate over all lmdb entries
        lmdb_file = self.lmdb_file
        if os.path.isdir(lmdb_file):
            evn = lmdb.open(lmdb_file)
            wfp = evn.begin()
            annoy = AnnoyIndex(self.f)
            for key, value in wfp.cursor():
                key = int(key)
                print(type(value))
                value = np.frombuffer(value, dtype=np.float32)  # np.fromstring is deprecated
                print(value.shape)
                annoy.add_item(key,value)

            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # Reload the index
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # Find similar faces by feature vector
    def query_vector(self, face_vector):
        n = int(face_comm.get_conf('annoy', 'num_nn_nearst'))
        print(face_vector.shape)
        return self.annoy.get_nns_by_vector(face_vector, n, include_distances=True)
Example #19
class KNNIndex(object):
    annoy = None
    vec_len = -1
    metric = 'euclidean'
    is_loaded = False

    def __init__(self, vec_len, metric='euclidean', index_file=None):
        self.vec_len = vec_len
        self.metric = metric
        self.annoy = AnnoyIndex(self.vec_len, self.metric)
        if index_file:
            self.load(index_file)

    def get_nns_by_item(self, i, n, search_k=-1, include_distances=False):
        if self.is_loaded:
            return self.annoy.get_nns_by_item(i, n, search_k,
                                              include_distances)
        else:
            raise RuntimeError("Annoy index file is not loaded!")

    def get_nns_by_vector(self,
                          v,
                          n,
                          search_k=-1,
                          include_distances=False,
                          n_propagation=0):
        if self.is_loaded:
            return self.annoy.get_nns_by_vector(v, n, search_k,
                                                include_distances)
        else:
            raise RuntimeError("Annoy index file is not loaded!")

    def load(self, index_file):
        self.annoy.load(index_file)
        self.is_loaded = True

    def unload(self):
        self.annoy.unload()
        self.is_loaded = False
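A minimal usage sketch for KNNIndex, assuming a 128-dimensional index file already exists on disk (names are illustrative):

knn = KNNIndex(128, metric='euclidean', index_file='vectors.ann')
ids, dists = knn.get_nns_by_item(0, 5, include_distances=True)
knn.unload()  # further queries now raise RuntimeError until load() is called again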
Example #20
 def searchNNSentence(self, target, cascading=False):
     annoy = AnnoyIndex(300)
     annoy.load(self.SENTENCE_ANN)
     self_annoy = AnnoyIndex(300)
     self_annoy.load(self.SELF_SENTENCE_ANN)
     # normal search
     target_vec = self.getSentenceVec(target)
     _vecs = annoy.get_nns_by_vector(target_vec, 50)
     _self_vecs = self_annoy.get_nns_by_vector(target_vec, 50)
     _vecs = self.NNfilter(annoy, self_annoy, _vecs, _self_vecs, target_vec,
                           cascading)
     _replys = []
     for i in Sentence.select().where(Sentence.id.in_(_vecs)):
         for reply in i.replys:
             if len(reply.reply) < 1:
                 continue
             _replys.append(reply.reply)
     annoy.unload()
     self_annoy.unload()
     if len(_replys) > 0:
         return _replys
     return None
Example #21
class face_annoy:
    def __init__(self):
        self.f = 512
        self.annoy_index_path = os.path.abspath(
            os.path.expanduser('~') + "/acs/data/face_vector.nn")
        self.num_trees = 100

        self.annoy = AnnoyIndex(self.f)
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # Build the annoy index from stored embeddings
    def create_index_from_lmdb(self):
        # Iterate over all rows
        # lmdb_file = self.lmdb_file
        rows = dbsql.getallem()
        if len(rows) > 0:

            annoy = AnnoyIndex(self.f)
            for row in rows:
                key = row[0]
                value = str2embed(row[1])
                annoy.add_item(key, value)

            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # Reload the index
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # Find similar faces by feature vector
    def query_vector(self, face_vector):
        n = 1
        return self.annoy.get_nns_by_vector(face_vector,
                                            n,
                                            include_distances=True)
Example #22
    def merge_indicies(self, index_file_a, index_file_b, sender_urn):
        logger.info("Merging {0} and {1} for {2} index".format(index_file_a, index_file_b, sender_urn))
        index_a = AnnoyIndex(self.feat_size, metric='euclidean')
        index_b = AnnoyIndex(self.feat_size, metric='euclidean')
        new_index = AnnoyIndex(self.feat_size, metric='euclidean')

        index_a.load(index_file_a)
        index_b.load(index_file_b)

        cnt = 0
        for i in range(index_a.get_n_items()):
            new_index.add_item(cnt, index_a.get_item_vector(i))
            cnt += 1

        for i in range(index_b.get_n_items()):
            new_index.add_item(cnt, index_b.get_item_vector(i))
            cnt += 1


        new_index_file = index_file_a + ".merged"

        index_a.unload()
        index_b.unload()

        new_index.build(self.n_trees)
        new_index.save(new_index_file)
        logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
                index_file_a,
                index_file_b,
                sender_urn,
                cnt))

        new_index.unload()
        pykka.ActorRegistry.get_by_urn(sender_urn).proxy().complete_compaction(
                new_index_file=new_index_file,
                index_file_a=index_file_a,
                index_file_b=index_file_b
        )
Example #23
class Index:
    """Procedures over multidimensional spaces."""

    def __init__(self, size, data_dir=None, trees=.001, volatile=False):
        """
        Indexing tensor operations and nearest-neighbour search.

        Parameters
        ----------
        size: int
            Shape of unidimensional vectors which will be indexed

        data_dir: str
            Location where to load or save the index

        trees (optional): float
            Defines the number of trees to create based on the dataset
            size. Should be a number between 0 and 1.

        volatile (optional): bool
            If the index will be temporary or not.
        """
        self._position = -1
        self._size = size
        self._data_dir = data_dir
        self._trees = trees
        self._volatile = volatile

        if self._data_dir and not self._volatile:
            if os.path.isfile(self._data_dir):
                raise OSError('data_dir parameter is not a directory')

            os.makedirs(self._data_dir, exist_ok=True)
            self._path = os.path.join(self._data_dir, self.index_name)
        elif not self._data_dir and not self._volatile:
            raise NoDataDirForPermanentIndex
        elif not self._data_dir and self._volatile:
            _temp_file = FileIO.safe_temp_file()
            self._data_dir = os.path.dirname(_temp_file)
            self._path = _temp_file

        else:
            raise DataDirDefinedForVolatileIndex

        if os.path.isfile(self._path):
            try:
                self.tree = AnnoyIndex(size, metric='angular')

                self.tree.load(self._path)

                self._is_new_index = False
            except OSError as os_error:
                raise FileIsNotAnIndex from os_error
        else:
            self.tree = AnnoyIndex(size, metric='angular')
            self._is_new_index = True

        self._image_database = ImageDatabase(
            import_images=True,
            data_dir=self._data_dir,
        )

    @property
    def size(self):
        """Getter for property size."""
        return self._size

    @property
    def path(self):
        """Getter for property path."""
        return self._path

    @property
    def index_name(self):
        """Getter for property index_name."""
        return 'pupyl.index'

    @property
    def trees(self):
        """Getter for property trees."""
        return self._trees

    @property
    def volatile(self):
        """Getter for property volatile."""
        return self._volatile

    @trees.setter
    def trees(self, trees):
        """Setter for property trees."""
        self._trees = trees

    def __enter__(self):
        """Context opening index."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context closing index."""
        if not exc_type:

            if self._is_new_index:
                self.tree.build(self.size << intmul >> self.trees)

                self.tree.save(self.path)

            self.tree.unload()

    def items(self):
        """Return the indexed items."""
        for item in range(len(self)):
            yield item

    def values(self):
        """Return the indexed values."""
        for item in self.items():
            yield self.tree.get_item_vector(item)

    def items_values(self):
        """Return tuples with all items and values."""
        for item, value in zip(self.items(), self.values()):
            yield item, value

    def __getitem__(self, position):
        """Return item at index. Supports negative slicing."""
        if position >= 0:
            return self.tree.get_item_vector(position)

        return self.tree.get_item_vector(
            len(self) - abs(position)
        )

    def refresh(self):
        """Update all information regarding index file."""
        self.tree.unload()
        self.tree.load(self.path)

    def append(self, tensor, check_unique=False):
        """
        Insert a new tensor at the end of the index.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to insert into index.

        check_unique (optional, default: False): bool
            Defines whether append should verify that a very similar tensor
            already exists in the index, i.e. it checks the value's
            uniqueness. Be advised that this check adds overhead to the
            append process.
        """
        if sum(tensor) == 0.:
            raise NullTensorError

        if self._is_new_index:

            index_it = True

            if check_unique and len(self) > 1:

                self.tree.build(self.size << intmul >> self.trees)

                result = self.item(
                    self.index(tensor),
                    top=1,
                    distances=True
                )

                if result[1][0] <= .05:
                    warning(
                        'Tensor being indexed already exists in '
                        'the database and the check for duplicates '
                        'is on. Refusing to store this tensor again.'
                    )

                    index_it = False

                self.tree.unbuild()

            if index_it:
                self.tree.add_item(len(self), tensor)

        else:

            with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
                for value in self.values():
                    tmp_idx.append(value, check_unique)

                tmp_idx.append(tensor, check_unique)

                _temp_file = tmp_idx.path

            move(_temp_file, self.path)

            self.refresh()

    def remove(self, position):
        """
        Remove the tensor at index from the database.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position: int
            The index which must be removed
        """
        if self._is_new_index:
            raise IndexNotBuildYet

        if position >= len(self):  # valid positions run from 0 to len(self) - 1
            raise IndexError

        with Index(self.size, volatile=True, trees=self.trees) as tmp_idx:
            shrink = False

            for item, value in self.items_values():
                if item == position:
                    shrink = True
                else:
                    if shrink:
                        item -= 1

                    tmp_idx.tree.add_item(item, value)

            _temp_file = tmp_idx.path

        move(_temp_file, self.path)

        self.refresh()

    def pop(self, position=None):
        """
        Pop-out the index at position, returning it.
        Be advised that this operation is linear on index size ($O(n)$).

        Parameters
        ----------
        position (optional) (default: last position): int
            Removes and returns the value at position.

        Returns
        ----------
        list:
            The popped tensor value.
        """
        if position is not None:  # 'if position:' would wrongly treat position 0 as "use last"
            value = self[position]
        else:
            inverse_index = -1
            value = self[inverse_index]
            position = len(self) + inverse_index

        self.remove(position)

        return value

    def index(self, tensor):
        """
        Search for the single most similar image to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar.

        Returns
        ----------
        int:
            Describing the most similar resulting index.
        """
        return self.tree.get_nns_by_vector(tensor, n=1)[0]

    def item(self, position, top=10, distances=False):
        """
        Search the index using an internal position

        Parameters
        ----------
        position: int
            The item id within index.

        top (optional, default 10): int
            How many similar items should be returned.

        distances (optional, default False): bool
            Whether the distances between items should also
            be returned.

        Returns
        -------
        if distances is True:
            tuple of lists:
                Containing the similar items and their distances
        else:
            list:
                Containing similar items.
        """
        return self.tree.get_nns_by_item(
            position,
            top,
            include_distances=distances
        )

    def search(self, tensor, results=16):
        """
        Search for the images most similar to the query.

        Parameters
        ----------
        tensor: numpy.ndarray or list
            A vector to search for the most similar images.

        results: int
            How many results to return. If fewer similar images than
            results exist, only the available ones are returned.
        """
        for result in self.tree.get_nns_by_vector(tensor, n=results):
            yield result

    def __len__(self):
        """Return how many items are indexed."""
        return self.tree.get_n_items()

    def __iter__(self):
        """Return an iterable."""
        for value in self.values():
            yield value

    def __next__(self):
        """Iterate over the iterable."""
        self._position += 1

        all_values = list(self.values())

        if self._position < len(all_values):
            return all_values[self._position]

        raise StopIteration

    def group_by(self, top=10, **kwargs):
        """
        Return, for every item (or for one given position), the other
        elements inside the index that are similar to it.

        Parameters
        ----------
        top (optional, default 10): int
            How many similar internal images should be returned

        position (optional): int
            Returns the groups based on a specified position.

        Returns
        -------
        list:
            If a position is defined

        or

        dict:
            Generator with a dictionary containing internal ids
            as key and a list of similar images as values.
        """
        position = kwargs.get('position')

        if len(self) <= 1:
            raise EmptyIndexError

        if top >= 1:
            if isinstance(position, int):

                results = self.item(position, top + 1)

                if len(results) > 1:

                    yield results[1:]

            else:

                for item in self.items():

                    yield {
                        item: self.item(
                            item,
                            top + 1
                        )[1:]
                    }
        else:

            raise TopNegativeOrZero

    def export_by_group_by(self, path, top=10, **kwargs):
        """
        Saves images, creating directories, based on their groups.

        Parameters
        ----------
        path: str
            Place to create the directories and export images

        top (optional, default 10): int
            How many similar internal images should be returned

        position (optional): int
            Returns the groups based on a specified position.
        """
        for element in FileIO.progress(
            self.group_by(
                top=top,
                position=kwargs.get('position')
            )
        ):
            if isinstance(element, dict):
                item = [*element.keys()][0]
                similars = element[item]
            elif isinstance(element, list):
                item = kwargs['position']
                similars = element

            save_path = os.path.join(
                path,
                str(item)
            )

            os.makedirs(
                save_path,
                exist_ok=True
            )

            try:
                copyfile(
                    self._image_database.mount_file_name(
                        item,
                        'jpg'
                    ),
                    os.path.join(
                        save_path,
                        'group.jpg'
                    )
                )
            except FileNotFoundError:
                continue

            for rank, similar in enumerate(similars):

                original_file_path = self._image_database.mount_file_name(
                    similar,
                    'jpg'
                )

                try:
                    copyfile(
                        original_file_path,
                        os.path.join(
                            save_path,
                            f'{rank + 1}.jpg'
                        )
                    )
                except FileNotFoundError:
                    continue
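A short usage sketch for the context-managed Index above, assuming the pupyl helpers it depends on (FileIO, ImageDatabase, the intmul operator and the custom exceptions) are importable; the directory and tensors are illustrative:

# Creating a new persistent index: build() and save() run on context exit.
with Index(256, data_dir='/tmp/pupyl-demo') as index:
    index.append([0.1] * 256)
    index.append([0.2] * 256)

# Reopening the same directory loads the saved tree for querying.
with Index(256, data_dir='/tmp/pupyl-demo') as index:
    print(index.item(0, top=1))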
Example #24
 def test_load_unload(self):
     # Issue #108
     i = AnnoyIndex(10)
     for x in range(100000):
         i.load('test/test.tree')
         i.unload()
Example #25
class AnnoySearch:
    def __init__(self,
                 vec_dim=2048,
                 lmdb_file="static/lmdb",
                 ann_file="static/annoy_file/tree.ann",
                 metric='angular',
                 num_trees=10):
        self.vec_dim = vec_dim  # dimensionality of the vectors to index
        self.metric = metric  # one of "angular", "euclidean", "manhattan", "hamming", or "dot"
        self.annoy_instance = AnnoyIndex(self.vec_dim, self.metric)
        self.lmdb_file = lmdb_file
        self.ann_file = ann_file
        self.num_trees = num_trees
        self.logger = logging.getLogger('AnnoySearch')

    def save_annoy(self):
        self.annoy_instance.save(self.ann_file)
        self.logger.info('save annoy SUCCESS !')

    def unload_annoy(self):
        self.annoy_instance.unload()

    def load_annoy(self):
        try:
            self.annoy_instance.unload()
            self.annoy_instance.load(self.ann_file)
            self.logger.info('load annoy SUCCESS !')
        except FileNotFoundError:
            self.logger.error(
                'annoy file DOES NOT EXIST , load annoy FAILURE !',
                exc_info=True)
    # Build the annoy index

    def create_index_from_lmdb(self):
        # Iterate over all lmdb entries
        lmdb_file = self.lmdb_file
        if os.path.isdir(lmdb_file):
            evn = lmdb.open(lmdb_file)
            wfp = evn.begin()
            for key, value in wfp.cursor():
                key = int(key)
                value = str2embed(value)
                print(len(value))
                self.annoy_instance.add_item(key, value)

            self.annoy_instance.build(self.num_trees)
            self.annoy_instance.save(self.ann_file)

    def build_annoy(self):
        self.annoy_instance.build(self.num_trees)

    def get_nns_by_item(self,
                        index,
                        nn_num,
                        search_k=-1,
                        include_distances=False):
        return self.annoy_instance.get_nns_by_item(index, nn_num, search_k,
                                                   include_distances)

    def get_nns_by_vector(self,
                          vec,
                          nn_num,
                          search_k=-1,
                          include_distances=False):
        return self.annoy_instance.get_nns_by_vector(vec, nn_num, search_k,
                                                     include_distances)

    def get_n_items(self):
        return self.annoy_instance.get_n_items()

    def get_n_trees(self):
        return self.annoy_instance.get_n_trees()

    def get_vec_dim(self):
        return self.vec_dim

    def add_item(self, index, vec):
        self.annoy_instance.add_item(index, vec)

    def get_item_vector(self, index):
        return self.annoy_instance.get_item_vector(index)
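A usage sketch for the wrapper above that skips the LMDB path and indexes vectors directly; the tiny dimension is illustrative, and the directory for the default ann_file is assumed to exist:

searcher = AnnoySearch(vec_dim=4)
searcher.add_item(0, [1.0, 0.0, 0.0, 0.0])
searcher.add_item(1, [0.0, 1.0, 0.0, 0.0])
searcher.build_annoy()    # builds self.num_trees trees
searcher.save_annoy()     # writes to self.ann_file
print(searcher.get_nns_by_item(0, 2, include_distances=True))
searcher.unload_annoy()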
Example #27
    def expertDocsSort(self, expertId, txt, topN):
        vec = self.t2v.text2v(txt, self.cuttor)
        annoy = AnnoyIndex(200)
        count = 0
        annoy.add_item(count, vec)
        count = count + 1
        db = DB()
        papers = db.getPapers(expertId)
        for p in papers:
            p[3] = self.t2v.text2v(p[1] + p[2], self.cuttor)
            if p[3] is not None:
                annoy.add_item(count, p[3])
                p[3] = annoy.get_distance(0, count)
                count = count + 1
        papers = sorted(papers, key=lambda p: p[3])
        papersFormated = []
        for p in papers:
            if len(papersFormated) == topN:
                break
            map = {}
            if p[0] is not None:
                map['paperId'] = p[0].encode('utf8')
            else:
                map['paperId'] = p[0]
            if p[1] is not None:
                map['name'] = p[1].encode('utf8')
            else:
                map['name'] = p[1]
            if p[4] is not None:
                map['authors'] = p[4].encode('utf8')
            else:
                map['authors'] = p[4]
            if p[5] is not None:
                map['journalName'] = p[5].encode('utf8')
            else:
                map['journalName'] = p[5]
            if p[6] is not None:
                map['year'] = p[6].encode('utf8')
            else:
                map['year'] = p[6]
            papersFormated.append(map)

        count = 0
        annoy.unload()
        annoy.add_item(count, vec)
        count = count + 1
        patents = db.getPatents(expertId)
        for p in patents:
            p[3] = self.t2v.text2v(p[1] + p[2], self.cuttor)
            if p[3] is not None:
                annoy.add_item(count, p[3])
                p[3] = annoy.get_distance(0, count)
                count = count + 1
        patents = sorted(patents, key=lambda p: p[3])
        patentsFormated = []
        for p in patents:
            if len(patentsFormated) == topN:
                break
            map = {}
            if p[0] is not None:
                map['patentId'] = p[0].encode('utf8')
            else:
                map['patentId'] = p[0]
            if p[4] is not None:
                map['publicationNo'] = p[4].encode('utf8')
            else:
                map['publicationNo'] = p[4]
            if p[1] is not None:
                map['name'] = p[1].encode('utf8')
            else:
                map['name'] = p[1]
            if p[5] is not None:
                map['inventors'] = p[5].encode('utf8')
            else:
                map['inventors'] = p[5]
            if p[6] is not None:
                map['applicant'] = p[6].encode('utf8')
            else:
                map['applicant'] = p[6]
            if p[7] is not None:
                map['year'] = p[7].encode('utf8')
            else:
                map['year'] = p[7]
            patentsFormated.append(map)

        count = 0
        annoy.unload()
        annoy.add_item(count, vec)
        count = count + 1
        projects = db.getProjects(expertId)
        for p in projects:
            p[3] = self.t2v.text2v(p[1] + p[2], self.cuttor)
            if p[3] is not None:
                annoy.add_item(count, p[3])
                p[3] = annoy.get_distance(0, count)
                count = count + 1
        projects = sorted(projects, key=lambda p: p[3])
        projectsFormated = []
        for p in projects:
            if len(projectsFormated) == topN:
                break
            map = {}
            if p[0] is not None:
                map['projectId'] = p[0].encode('utf8')
            else:
                map['projectId'] = p[0]
            if p[1] is not None:
                map['name'] = p[1].encode('utf8')
            else:
                map['name'] = p[1]
            if p[4] is not None:
                map['member'] = p[4].encode('utf8')
            else:
                map['member'] = p[4]
            if p[5] is not None:
                map['unit'] = p[5].encode('utf8')
            else:
                map['unit'] = p[5]
            if p[6] is not None:
                map['year'] = p[6].encode('utf8')
            else:
                map['year'] = p[6]
            if p[7] is not None:
                map['type'] = p[7].encode('utf8')
            else:
                map['type'] = p[7]
            projectsFormated.append(map)
        result = {}
        result['papers'] = papersFormated
        result['patents'] = patentsFormated
        result['projects'] = projectsFormated
        return result
Example #28
class Annoy(VectorIndex):
    def __init__(self, path, dims=None, metric='angular', build_on_disk=True):
        self.path = path
        self.is_mutable = None
        self.is_built = None
        self.build_on_disk = build_on_disk
        self.metric = metric

        if os.path.isfile(self.path):
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            assert self.dims == dims or not dims, \
              'Passed path to existing index but dims do not match'
            assert self.metric == metric or not metric, \
              'Passed path to existing index but metrics do not match'
            self.index = AnnoyIndex(self.dims, metric=self.metric)
        elif dims:
            logging.debug(
                f'Creating new index with {dims} dimensions and {self.metric} metric'
            )
            self.dims = dims
            self.index = AnnoyIndex(self.dims, metric=self.metric)
            if build_on_disk:
                self.index.on_disk_build(self.path)
        else:
            logging.debug(f'Loading existing index: {self.path}')
            self.load_meta()
            self.index = AnnoyIndex(self.dims, metric=self.metric)

    @property
    def meta_path(self):
        return self.path + '.meta.json'

    @property
    def files(self):
        return [self.path, self.meta_path]

    def load_meta(self):
        self.__dict__.update(load_json(self.meta_path))

    def save_meta(self):
        d = {**self.__dict__}
        d.pop('index')
        save_json(d, self.meta_path)

    def build(self, num_trees=10):
        logging.debug(f'starting to build index: {self.path}')
        self.index.build(num_trees)
        logging.debug(f'finished building index: {self.path}')
        self.is_mutable = False
        self.is_built = True
        self.save_meta()

    def save(self):
        self.index.save(self.path)
        self.is_mutable = False
        self.save_meta()

    def load(self, memory=False):
        self.index.load(self.path, prefault=memory)
        self.is_mutable = False

    def unload(self):
        self.index.unload()

    def __del__(self):
        self.unload()

    def __setitem__(self, idx, vector):
        self.index.add_item(idx, vector)

    def __getitem__(self, idx):
        return self.index.get_item_vector(idx)

    def __len__(self):
        return self.index.get_n_items()

    def add(self, vector):
        idx = len(self)
        self[idx] = vector
        return idx

    def add_bulk(self, vectors):
        start = len(self)
        for n, v in enumerate(vectors):
            self[start + n] = v
        return self

    def set_bulk(self, indices, vectors):
        for idx, vector in zip(indices, vectors):
            self[idx] = vector

    def search(self, vector, num=10, depth=None, distances=True):
        return self.index.get_nns_by_vector(vector, num, depth or -1,
                                            distances)

    def search_index(self, idx, num=10, depth=None, distances=True):
        return self.index.get_nns_by_item(idx, num, depth or -1, distances)

    def distance(self, i, j):
        return self.index.get_distance(i, j)
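A short usage sketch for the Annoy wrapper above (the path, dimension and vectors are placeholders; the load_json/save_json helpers it relies on are assumed to be available):

idx = Annoy('demo.ann', dims=64)   # new index, written straight to disk
idx.add_bulk([[float(i)] * 64 for i in range(100)])
idx.build(num_trees=10)            # freezes the index and saves metadata
ids, dists = idx.search([1.0] * 64, num=5)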
tree.load("test_tree.ann")

raw_input("<Enter> to load 10,000 vectors.")
q = True
while q:

    for i in xrange(10000):
        tree.get_item_vector(i)

    resp = raw_input("<Enter> to load 10,000 vectors.")
    if resp.strip() == "q":
        q = False

raw_input("<Enter> to unload tree.")

tree.unload("test_tree.ann")

raw_input("done.")
Example #30
class ProcessAnnoyWorkerThread(Thread):
    """Worker Thread Class."""
    def __init__(self, notify_window, imagedata):
        """Init Worker Thread Class."""
        Thread.__init__(self)

        # Configuring annoy parameters

        self._n_nearest_neighbors = 10
        self._trees = 10000
        self._t = AnnoyIndex(DIMS, metric='angular')

        self._notify_window = notify_window
        self._imagedata = imagedata
        self._want_abort = 0
        # This starts the thread running on creation, but you could
        # also make the GUI thread responsible for calling this
        self.start()

    def run(self):
        """Run Worker Thread."""
        # Definition of module with using tfhub.dev handle
        logging.error("Start loading mobilenet...")
        module_handle = "https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/feature_vector/4"
        # module_handle = "4/"
        # Load the module
        import tensorflow_hub as hub
        module = hub.load(module_handle)
        starttime = datetime.now()
        ################################################
        logging.error("Start processing and adding images to AnnoyIndex...")
        for file_index, item in enumerate(list(self._imagedata.getKeys())):
            # Loads and pre-process the image
            btmp = self._imagedata.getThumbnail(item)
            # pil_img = wx2PIL(wx_img)
            image_array = np.frombuffer(bytes(btmp), dtype=np.uint8).reshape(
                (THUMBNAIL_MAX_SIZE, THUMBNAIL_MAX_SIZE, 3))
            # image_array = tf.keras.preprocessing.image.img_to_array(pil_img)
            image_array = tf.convert_to_tensor(image_array)
            tf_image_array = tf.image.convert_image_dtype(
                image_array, tf.float32)[tf.newaxis, ...]
            # Calculate the image feature vector of the img
            try:
                features = module(tf_image_array)
            except ValueError:
                logging.error("Image too small for feature processing " + item)
                continue
            # Remove single-dimensional entries from the 'features' array
            feature_set = np.squeeze(features)
            # Adds image feature vectors into annoy index
            self._t.add_item(file_index, feature_set)
            # self._imagedata.addFeatureSetAnnoy(filename,feature_set)
            wx.PostEvent(
                self._notify_window,
                ResultEvent((file_index / self._imagedata.getSize()) * 0.5,
                            EVT_RESULT_PROGRESS))
        # Builds annoy index
        logging.error("Building trees...")
        self._t.build(self._trees, n_jobs=-1)
        # save annoy index to file for multiprocessing
        self._t.save(self._imagedata.getAnnoyIndexTempFile())
        wx.PostEvent(self._notify_window, ResultEvent(0.75,
                                                      EVT_RESULT_PROGRESS))
        # Loops through all indexed items
        returndata = []
        logging.error("Fetching nearest neighbors...")
        img_list_of_lists = []
        img_list = []
        window_size = math.floor(len(self._imagedata.getKeys()) / cpu_count())
        i = 1
        for file_index, filename in enumerate(list(self._imagedata.getKeys())):
            if file_index < (window_size * i):
                img_list.append(file_index)
            else:
                img_list_of_lists.append(img_list)
                img_list = []
                i = i + 1
                img_list.append(file_index)
        img_list_of_lists.append(img_list)
        # read files separately using multithreaded pool
        pool = Pool(cpu_count())
        func = partial(get_nns, self._imagedata.getAnnoyIndexTempFile())
        load_results = pool.map_async(func, img_list_of_lists)
        pool.close()  # 'TERM'
        # maintain status gauge here
        while True:
            if load_results.ready() or self._want_abort == 1: break
            time.sleep(0.3)
            remaining = min(
                load_results._number_left * load_results._chunksize,
                len(img_list_of_lists))
            # print("Waiting for", remaining, "tasks to complete...")
            wx.PostEvent(
                self._notify_window,
                ResultEvent(
                    0.75 + ((len(img_list_of_lists) - remaining) /
                            len(img_list_of_lists)) * 0.25,
                    EVT_RESULT_PROGRESS))
        if self._want_abort == 1:
            pool.terminate()
        pool.join()  # 'KILL'

        self._t.unload()

        for list_of_thumb_nearest_neighbors in load_results.get():
            for thumb_nearest_neighbors in list_of_thumb_nearest_neighbors:
                names_list = []
                for item in thumb_nearest_neighbors:
                    name = list(self._imagedata.getKeys())[item]
                    names_list.append(name)
                returndata.append(names_list)
            # wx.PostEvent(self._notify_window,
            #              ResultEvent((file_index / self._imagedata.getSize()) * 0.25 + 0.75, EVT_RESULT_PROGRESS))
        wx.PostEvent(self._notify_window,
                     ResultEvent(returndata, EVT_RESULT_NEIGHBORS))
        wx.PostEvent(self._notify_window,
                     ResultEvent(None, EVT_RESULT_NEIGHBORS))
        wx.PostEvent(self._notify_window, ResultEvent(0.0,
                                                      EVT_RESULT_PROGRESS))
        ################################################
        stoptime = datetime.now()
        logging.error("ProcessAnnoyWorkerThread took " +
                      str(stoptime - starttime) + " to process " +
                      str(len(returndata)) + " image files")
Example #31
class AnnoySearch:
    def __init__(self, vec_dim=100, metric='angular'):
        self.vec_dim = vec_dim  # dimensionality of the vectors to index
        self.metric = metric  # one of "angular", "euclidean", "manhattan", "hamming", or "dot"
        self.annoy_instance = AnnoyIndex(self.vec_dim, self.metric)
        self.logger = logging.getLogger('AnnoySearch')

    def save_annoy(self, annoy_file, prefault=False):
        self.annoy_instance.save(annoy_file, prefault=prefault)
        self.logger.info('save annoy SUCCESS !')

    def unload_annoy(self):
        self.annoy_instance.unload()

    def load_annoy(self, annoy_file, prefault=False):
        try:
            self.annoy_instance.unload()
            self.annoy_instance.load(annoy_file, prefault=prefault)
            self.logger.info('load annoy SUCCESS !')
        except FileNotFoundError:
            self.logger.error(
                'annoy file DOES NOT EXIST , load annoy FAILURE !',
                exc_info=True)

    # Build the annoy index
    def build_annoy(self, n_trees):
        self.annoy_instance.build(n_trees)

    # Query nearest neighbours by item index
    def get_nns_by_item(self,
                        index,
                        nn_num,
                        search_k=-1,
                        include_distances=False):
        return self.annoy_instance.get_nns_by_item(index, nn_num, search_k,
                                                   include_distances)

    # Query nearest neighbours by vector
    def get_nns_by_vector(self,
                          vec,
                          nn_num,
                          search_k=-1,
                          include_distances=False):
        return self.annoy_instance.get_nns_by_vector(vec, nn_num, search_k,
                                                     include_distances)

    def get_n_items(self):
        return self.annoy_instance.get_n_items()

    def get_n_trees(self):
        return self.annoy_instance.get_n_trees()

    def get_vec_dim(self):
        return self.vec_dim

    # Add an item to the index
    def add_item(self, index, vec):
        self.annoy_instance.add_item(index, vec)

    def get_item_vector(self, index):
        return self.annoy_instance.get_item_vector(index)
Example #32
for i in locn:
    print('annoy')
    counter = 0
    f = 360 + 512
    t = AnnoyIndex(f, metric='angular')
    feat = loc + i
    feat2 = loc2 + i
    annl = i.split('.')[0]
    feat1 = pd.read_csv(feat, sep=',', header=None)
    feat1 = np.asarray(feat1)
    feat3 = pd.read_csv(feat2, sep=',', header=None)
    feat3 = np.asarray(feat3)
    hogx, hogy = feat3.shape
    feat1 = feat1[:hogx, :]

    print('shapes are', feat1.shape, feat3.shape)
    feat1 = np.concatenate((feat1, feat3), axis=1)
    print(feat1.shape)
    row, col = feat1.shape
    feat1 = feat1[:, :col - 2]
    for count2 in range(row):
        count = feat1[count2, :]

        t.add_item(counter, count.astype(np.int32))
        counter = counter + 1
    t.build(100)
    t.save(save_loc + annl + '.ann')
    t.unload()
    del t
    print "Done with" + str(i)
Example #33
class EmbeddingsIndex:

    def __init__(self, organization_dir, logger, embedding_type, load=False):

        """
        :param organization_dir: str - path to resources
        :param logger: logging object
        :param load: bool - True if used for generating content
        """

        self.organization_dir = organization_dir
        self.logger = logger
        self.embedding_type = embedding_type
        if 'use' in self.embedding_type:
            self.f_dim = 1024
        elif 'bert' in self.embedding_type:
            self.f_dim = 1536


        if load:

            self.logger.debug("Loading model.")
            self.t = AnnoyIndex(self.f_dim)
            self.t.load(os.path.join(self.organization_dir, "lshforest_" + self.embedding_type + ".ann"))

            self.logger.debug("Loading ids.")
            with h5py.File(os.path.join(self.organization_dir, "data_" + self.embedding_type + ".hdf5"), "r") as f:
                self.ids = numpy.array(f["id"])

            if len(self.ids) != self.t.get_n_items():
                raise EmbeddingsDbNotSynced

    def build_index(self) -> None:

        """
        Load embeddings and build a new index. Run it as a daemon because it might take more time with
        huge content databases.
        :return: None
        """

        self.logger.debug("Starting recommendation model building daemon.")
        self.retrieve_and_build()
        # t = Thread(target=self.retrieve_and_build, daemon=True)
        # t.start()

    def build_empty_index(self) -> None:

        """
        Build an empty placeholder index - used because rebuilding the index after deletions can take time.
        :return: None
        """

        t = AnnoyIndex(self.f_dim)

        trees_num = 10  # Number of trees in the index
        t.build(trees_num)
        t.save(os.path.join(self.organization_dir, "lshforest_" + self.embedding_type + ".ann"))
        print('organization ann has been created')
        t.unload()

    def retrieve_and_build(self):

        """
        Build and save an empty index. Note: this definition is superseded
        by the one below, since Python keeps the last definition of a name.
        :return: None
        """

        t = AnnoyIndex(self.f_dim)

        trees_num = 10  # Number of trees in the index
        t.build(trees_num)

        t.save(os.path.join(self.organization_dir, "lshforest.ann"))
        t.unload()

    def retrieve_and_build(self):

        """
        Retrieve embeddings stored in HDF5 files, then build and save the index.
        :return: None
        """

        try:
            embs = self.retrieve_embeddings()

            self.logger.debug("Creating annoy index. Embeddings count: %s.", len(embs))
            t = AnnoyIndex(self.f_dim)
            for i in range(0, len(embs)):
                t.add_item(i, embs[i])

            trees_num = 10  # Number of trees in the index
            t.build(trees_num)

            self.logger.debug("Building annoy index.")

            t.save(os.path.join(self.organization_dir, "lshforest_" + self.embedding_type + ".ann"))

            self.logger.debug("Saved annoy index.")
            t.unload()

        except Exception:
            message = "Failed to build recommendation model."
            self.logger.exception(message)

    def get_knn(self, vector, k=3):

        """
        Get k closest neighbors to a vector
        :param vector: embedding of a query
        :param k: parameter specifying how many neighbors we want
        :return: dict - dict with results
        """

        self.logger.debug("Getting nearest neighbors.")
        # Get closest neighbors
        ind, dist = self.t.get_nns_by_vector(vector, n=k, include_distances=True)

        ret = {"hits": []}

        if len(self.ids) != self.t.get_n_items():
            raise EmbeddingsDbNotSynced

        for d, i in zip(dist, ind):
            ret["hits"].append({"id": self.ids[i], "score": d})

        self.logger.debug("Nearest neighbors found.")
        return ret

    def retrieve_embeddings(self) -> numpy.ndarray:

        """
        Retrieve embeddings from the HDF5 data file
        :return: numpy.ndarray
        """

        self.logger.debug("Retrieving embeddings.")
        with h5py.File(os.path.join(self.organization_dir, "data_" + self.embedding_type + ".hdf5"), "r") as f:

            contexts = numpy.array(f["context"])
            contents = numpy.array(f["content"])

        ret = numpy.hstack((contexts, contents))

        self.logger.debug("Retrieved embeddings.")

        return ret

    def unload(self) -> None:

        """
        Unload index from memory
        :return: None
        """

        self.t.unload()
        del self.t
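A usage sketch tying the EmbeddingsIndex pieces together, assuming data_<embedding_type>.hdf5 already holds "context", "content" and "id" datasets; the directory and query_vector are placeholders:

import logging

logger = logging.getLogger('embeddings')

# Build (or rebuild) the on-disk index from the stored embeddings.
EmbeddingsIndex('/data/org', logger, 'use').build_index()

# Load the saved index and query it.
index = EmbeddingsIndex('/data/org', logger, 'use', load=True)
hits = index.get_knn(query_vector, k=3)   # query_vector: a 1024-dim embedding for 'use'
index.unload()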
Example #34
tree.load("test_tree.ann")

input("<Enter> to load 10,000 vectors.")
q = True
while q:

    for i in range(10000):
        tree.get_item_vector(i)

    resp = input("<Enter> to load 10,000 vectors.")
    if resp.strip() == "q":
        q = False

input("<Enter> to unload tree.")

tree.unload("test_tree.ann")

input("done.")
tree.load("test_tree.ann")

input("<Enter> to load 10,000 vectors.")
q = True
while q:

    for i in range(10000):
        tree.get_item_vector(i)

    resp = input("<Enter> to load 10,000 vectors.")
    if resp.strip() == "q":
        q = False