コード例 #1
0
def load_annoy(annoypath, model):
    '''Load a cached AnnoyIndexer from disk, building and saving it first if absent.

    :param annoypath: file path where the annoy index is (or will be) stored
    :type annoypath: str
    :param model: trained word2vec model the index is built over
    :type model: Word2Vec
    :return: the freshly built or reloaded indexer
    :rtype: AnnoyIndexer
    '''
    if not os.path.exists(annoypath):
        print("开始构建annoy索引:当前时间 : " +
              time.asctime(time.localtime(time.time())))
        starttime12 = time.time()
        aindex = AnnoyIndexer(model, 200)
        print("构建索引完毕 %.2f secs" % (time.time() - starttime12))
        # Persist the index so later runs can load it instead of rebuilding.
        print("开始保存annoy索引")
        starttime13 = time.time()
        aindex.save(annoypath)
        print("保存索引完毕 %.2f secs" % (time.time() - starttime13))
    else:
        aindex = AnnoyIndexer()
        aindex.load(annoypath)
        # FIX: the saved index file does not contain the model; gensim
        # requires it to be re-attached after load() before the indexer
        # can be used for queries.
        aindex.model = model
    return aindex
コード例 #2
0
 def load_index_investment(self, path):
     """Walk *path* and load an AnnoyIndexer from each eligible file found.

     NOTE(review): every matching file overwrites ``index``, so only the
     index from the last file visited by ``os.walk`` is returned — confirm
     the directory is expected to contain a single loadable index file.

     :param path: root directory containing saved annoy index files
     :return: the loaded AnnoyIndexer (a fresh empty one if nothing matched)
     """
     index = AnnoyIndexer()
     for parent, dirnames, filenames in os.walk(path):
         for filename in filenames:
             # Annoy writes a companion "B.ind.d" file that cannot be loaded
             # directly; only the bare "B.ind" (exactly one dot) is loadable.
             if len(filename.split('.')) == 2:
                 logger.info(u'文件名为%s ,路径为:%s' %
                             (str(filename.split('.')[0]),
                              os.path.join(parent, filename)))
                 index = AnnoyIndexer()
                 index.load(os.path.join(parent, filename))
     return index
コード例 #3
0
def get_annoy(w2v, embedding_type='w2v'):
    """Return an AnnoyIndexer for *w2v*, cached on disk.

    The cache file name is keyed by dimensionality, embedding type and
    vocabulary size, so different vector sets get distinct caches.

    :param w2v: word vectors (gensim KeyedVectors-like, exposes ``.vocab``)
    :param embedding_type: tag embedded in the cache file name
    :return: the loaded or freshly built AnnoyIndexer
    """
    dims = 100
    annoy_file_name = data_dir + '/annoy_index_' + '_' + str(dims) + '_' + embedding_type + '_' + str(len(w2v.vocab))
    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        # BUG FIX: previously attached the unrelated global `word_vectors`;
        # the index must be backed by the vectors actually passed in.
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        # BUG FIX: previously built from the global `word_vectors` instead
        # of the `w2v` argument the cache file name is derived from.
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
コード例 #4
0
 def __init__(self, vec_file, pap, pat, pro):
     """Load the word2vec model, three pre-built annoy indexes and helpers.

     :param vec_file: path to the binary word2vec vector file
     :param pap: path of the saved annoy index over papers
     :param pat: path of the saved annoy index over patents
     :param pro: path of the saved annoy index over projects
     """
     # NOTE(review): Word2Vec.load_word2vec_format is a deprecated entry
     # point (newer gensim moved it to KeyedVectors, as the commented line
     # shows) — confirm the pinned gensim version still provides it.
     # self.wm = gensim.models.KeyedVectors.load_word2vec_format(vec_file,binary=True)
     self.wm = gensim.models.word2vec.Word2Vec.load_word2vec_format(
         vec_file, binary=True)
     # NOTE(review): none of these indexes gets `.model` re-attached after
     # load(); confirm queries through them do not need the backing model.
     self.paper_index = AnnoyIndexer()
     self.paper_index.load(pap)
     self.patent_index = AnnoyIndexer()
     self.patent_index.load(pat)
     self.project_index = AnnoyIndexer()
     self.project_index.load(pro)
     self.t2v = Convert2Vec(self.wm)  # text-to-vector converter built on the model
     self.cuttor = FilterCut()  # presumably a tokenizer/filter — TODO confirm
     self.db = DB()
     self.featureIndex = self.buildFeatureIndex()
コード例 #5
0
 def index_vector(self, dimensions=300, save=False):
     '''
     Build (or reload a cached) annoy_index used by 'is_word_pairs_similar'.
     Queries through the annoy index may be slower than the normal index.
     '''
     index_path = Path.cwd().parent.joinpath('preprocessed/annoy.index')
     if not index_path.exists():
         # No cache yet: build from the embedding and optionally persist.
         fresh_index = AnnoyIndexer(self.embedding, dimensions)
         if save:
             fresh_index.save(str(index_path))
         return fresh_index
     # Cached index exists: load it and re-attach the embedding model,
     # which is not stored inside the index file.
     cached_index = AnnoyIndexer()
     cached_index.load(str(index_path))
     cached_index.model = self.embedding
     return cached_index
コード例 #6
0
def get_similarity(model, args):
    """Compute top-k similar words for every vocab word, appending to a CSV.

    Results are flushed to ``args.output_file`` every ``args.save_one_time``
    words so memory stays bounded; the final partial chunk is flushed after
    the loop.

    :param model: trained gensim Word2Vec model
    :param args: namespace providing ``k``, ``save_one_time``, ``output_file``
    :return: the similarity lists of the last (post-flush) chunk
    """
    def _flush_chunk(chunk_i, item, similarity):
        # Append one chunk to the CSV; header=False so chunks concatenate.
        print("save to csv chunk no: {}".format(chunk_i))
        topk_df = pd.DataFrame({'item': item, 'topk': similarity})
        topk_df.to_csv(args.output_file,
                       mode='a',
                       header=False,
                       index=False)

    indexer = AnnoyIndexer(model, 10)
    i = 0
    chunk_i = 0
    item = []
    similarity = []
    with tqdm.tqdm(desc="get_similarity",
                   total=len(model.wv.vectors)) as progress:
        for word in np.sort(list(model.wv.vocab.keys())):
            item.append(word)
            # most_similar yields (word, score) pairs; the original code had
            # the unpacking names swapped — output "word=score" is unchanged.
            similarity.append([
                '{}={}'.format(cword, cscore) for cword, cscore in
                model.wv.most_similar(word, topn=args.k, indexer=indexer)
            ])
            i += 1
            if i % args.save_one_time == 0:
                _flush_chunk(chunk_i, item, similarity)
                i = 0
                chunk_i += 1
                item = []
                similarity = []
            progress.update(1)
        if i > 0:
            _flush_chunk(chunk_i, item, similarity)
    return similarity
コード例 #7
0
 def load_index(self, path):
     """Load every saved annoy index under *path* into a nested dict.

     Directory layout assumed: ``path/<field>/<unit_type>/<province>.<ext>``
     (the log message at the load site names the file stem "province").
     Result shape: ``index[field][unit_type][province] -> AnnoyIndexer``.

     :param path: root directory; must end with a path separator, since it
                  is concatenated with ``field + '/'`` directly
     :return: nested dict of loaded AnnoyIndexer objects
     """
     index = {}
     nn = 0  # running count of loaded indexes, used only for logging
     for field in field_list:
         logger.info(u'---------field:' + field)
         index[field] = {}
         for unit_type in unit_types:
             index[field][unit_type] = {}
             for parent, dirnames, filenames in os.walk(path + field + '/' +
                                                        unit_type + '/'):
                 for filename in filenames:
                     # if len(filename.split('.')) == 2 and (
                     #         'A5' in os.path.join(parent, filename) or 'project' in os.path.join(parent, filename)):
                     # Only "name.ext" files (exactly one dot) are loadable;
                     # annoy's companion "name.ext.d" files are skipped.
                     if len(filename.split('.')) == 2:
                         nn = nn + 1
                         logger.info(
                             u'创建AnnoyIndexer %s:field=%s,unit_type=%s,province=%s'
                             %
                             (nn, field, unit_type, filename.split('.')[0]))
                         index[field][unit_type][str(
                             filename.split('.')[0])] = AnnoyIndexer()
                         index[field][unit_type][str(
                             filename.split('.')[0])].load(
                                 os.path.join(parent, filename))
     return index
コード例 #8
0
    def _load_classifier(self, **kwargs):
        """Instantiate the nearest-neighbour classifier for this instance.

        For type 'ann', missing index files are first downloaded from S3 into
        the temporary directory, then the annoy index named
        ``<classifier_id>.index`` is loaded and wrapped in NearestNeighbors.
        Type 'knn' uses the non-indexed (exact) search instead.

        :param kwargs: forwarded to the NearestNeighbors constructor
        :return: a NearestNeighbors instance, or None for unknown types
        """
        if self.classifier_type == 'ann':
            for f in list_files(self.s3_conn, self.s3_path):
                filepath = os.path.join(self.temporary_directory, f)
                if not os.path.exists(filepath):
                    logging.warning('calling download from %s to %s',
                                    self.s3_path + f, filepath)
                    # NOTE(review): arguments are (conn, local, remote) while
                    # the log reads "from remote to local"; also `+` vs
                    # os.path.join are mixed — confirm download()'s signature.
                    download(self.s3_conn, filepath,
                             os.path.join(self.s3_path, f))
            ann_index = AnnoyIndexer()
            ann_index.load(
                os.path.join(self.temporary_directory,
                             self.classifier_id + '.index'))
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexer=ann_index,
                                    **kwargs)

        elif self.classifier_type == 'knn':
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexed=False,
                                    **kwargs)

        else:
            print('Not implemented yet!')
            return None
コード例 #9
0
def build_graph(filename,
                TOPN,
                A_name,
                indice2word_name,
                annoy=False,
                dim=100,
                tree_num=20):
    """Build a sparse nearest-neighbour adjacency matrix from a word2vec
    model, save it as CSR, and pickle the index-to-word mapping.
    """
    model = read_w2v(filename, dim)
    V = len(model.wv.vocab)
    print("Num. vocab = %i" % V)
    # Map each vocabulary word to a row/column index, and keep the inverse.
    word_indice_dic = {}
    for position, token in enumerate(model.wv.vocab):
        word_indice_dic[token] = position
    indice2word = dict((position, token)
                       for token, position in word_indice_dic.items())
    A = dok_matrix((V, V), dtype=np.float32)
    if not annoy:
        add_neighbors(A, TOPN, model, word_indice_dic)
    else:
        print("Using ANNOY...")
        from gensim.similarities.index import AnnoyIndexer
        annoy_index = AnnoyIndexer(model, tree_num)
        add_neighbors(A, TOPN, model, word_indice_dic, annoy_index=annoy_index)

    save_sparse_csr(A_name, A.tocsr())
    pickle.dump(indice2word, open(indice2word_name, "wb"))
コード例 #10
0
 def _initialize_nns_method(self, nnsmethod, annoymodelpath):
     """Lazily initialise the requested nearest-neighbour-search backend.

     :param nnsmethod: NNSMethod.KD_TREE or NNSMethod.ANNOY
     :param annoymodelpath: file path where the annoy index is cached
     """
     # Skip when this backend is flagged as already initialised.
     # NOTE(review): nothing here ever sets the flag to True — confirm the
     # caller updates it, otherwise the backend is rebuilt on every call.
     if self.nns_method_init_dict[nnsmethod]: return
     if nnsmethod == NNSMethod.KD_TREE:
         print("Building KD tree..")
         self.tree = cKDTree(self.model.docvecs.vectors_docs)
         print("Finished building KD tree.")
         self.keys = list(self.model.docvecs.doctags.keys())
     elif nnsmethod == NNSMethod.ANNOY:
         if not os.path.isfile(annoymodelpath):
             print("Generating annoy index...")
             self.annoy_indexer = AnnoyIndexer(self.model, 50)
             print("Finished generating annoy index.")
             self.annoy_indexer.save(annoymodelpath)
         else:
             self.annoy_indexer = AnnoyIndexer()
             self.annoy_indexer.load(annoymodelpath)
             # load() does not restore the model; re-attach it explicitly.
             self.annoy_indexer.model = self.model
コード例 #11
0
def create_sim_dict(file, vectors, min_sim=0.55, topn=10, num_trees=200):
    """Precompute each vocabulary word's approximate neighbours above
    *min_sim* and pickle the resulting dict to *file*."""
    annoy = AnnoyIndexer(vectors, num_trees=num_trees)
    sim_dict = {}
    for word in messages.pbar(vectors.vocab):
        neighbours = annoy.most_similar(vectors.get_vector(word), topn)
        sim_dict[word] = [pair for pair in neighbours if pair[1] > min_sim]
    with open(file, 'wb') as fileout:
        pickle.dump(sim_dict, fileout)
コード例 #12
0
ファイル: augment.py プロジェクト: nguyenvulebinh/vlsp-hsd
def similar_augment(texts,
                    labels,
                    n_increase,
                    n_word_replace,
                    model_path,
                    similar_threshold=0.5,
                    use_annoy=True,
                    annoy_path=None):
    """Augment *texts* by copying random long samples and replacing random
    words with their nearest word2vec neighbour.

    :param texts: list of token arrays; augmented copies are appended
    :param labels: labels aligned with texts; grows with the augmentation
    :param n_increase: number of augmented samples to generate
    :param n_word_replace: words replaced per sample; shorter samples are
                           never used as templates
    :param model_path: binary word2vec file used for neighbour lookup
    :param similar_threshold: minimum similarity to accept a replacement
    :param use_annoy: use an annoy indexer for approximate lookup
    :param annoy_path: optional pre-built annoy index to load
    :return: (texts, labels) with augmented samples appended
    """
    w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
    texts_long = []
    labels_long = []
    if use_annoy:
        if annoy_path is None:
            indexer = AnnoyIndexer(w2v, 100)
        else:
            indexer = AnnoyIndexer()
            indexer.load(annoy_path)

    # Only samples long enough to replace n_word_replace words qualify.
    for ind in range(len(texts)):
        if len(texts[ind]) >= n_word_replace:
            texts_long.append(texts[ind])
            labels_long.append(labels[ind])

    shuffle_ind = np.random.choice(len(texts_long), size=n_increase)
    for ind in shuffle_ind:
        text_copy = copy.deepcopy(texts_long[ind])
        # if is_hier:

        replace_inds = np.random.choice(text_copy.shape[-1],
                                        size=n_word_replace,
                                        replace=False)
        for word_ind in replace_inds:
            word = text_copy[word_ind]
            try:
                # [1] skips the top hit, which is the word itself.
                closest, score = w2v.wv.most_similar(
                    word, topn=2, indexer=indexer if use_annoy else None)[1]
                if score > similar_threshold:
                    text_copy[word_ind] = closest
            # BUG FIX: bare `except:` also swallowed KeyboardInterrupt and
            # SystemExit; Exception still skips OOV / lookup failures.
            except Exception:
                continue

        texts.append(text_copy)
        labels = np.append(labels, [labels_long[ind]])

    return texts, labels
コード例 #13
0
    def testAnnoyIndexingOfKeyedVectors(self):
        """Plain KeyedVectors (no training model) must be annoy-indexable."""
        from gensim.similarities.index import AnnoyIndexer
        vectors = KeyedVectors.load_word2vec_format(datapath('lee_fasttext.vec'))
        index = AnnoyIndexer(vectors, 10)

        self.assertEqual(index.num_trees, 10)
        self.assertVectorIsSimilarToItself(vectors, index)
        self.assertApproxNeighborsMatchExact(vectors, vectors, index)
コード例 #14
0
    def load_indexer(self, model_name):
        """Load the pre-built annoy index matching *model_name*.

        :param model_name: one of the known glove model identifiers
        :return: a loaded AnnoyIndexer, or None when annoy indexing is
                 disabled in the settings or the model name is unknown
        """
        if self.full_log is not None:
            self.full_log.info("Loading word model indexer...")

        indexer = None

        if self.settings["use_annoy_indexer"]:
            # Table replaces four duplicated if/elif branches: each entry maps
            # a model name to its debug label and saved index file name.
            prefix = (r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440"
                      + r"\MajorProject\models")
            known_indexes = {
                "glove-twitter-100": ("Loading Twitter 100",
                                      r"\glove-twitter-100-5-trees.ann"),
                "glove-twitter-200": ("Loading Twitter 200",
                                      r"\glove-twitter-200-5-trees.ann"),
                "glove-wiki-300": ("Loading Wiki 300",
                                   r"\glove-wiki-300-5-trees.ann"),
                "glove-wiki-100": ("Loading Wiki 100",
                                   r"\glove-wiki-100-5-trees.ann"),
            }
            if model_name in known_indexes:
                debug_msg, index_file = known_indexes[model_name]
                if self.full_log is not None:
                    self.full_log.debug(debug_msg)
                indexer = AnnoyIndexer()
                indexer.load(prefix + index_file)
            if self.full_log is not None:
                self.full_log.info("Done loading model indexer")
        else:
            if self.full_log is not None:
                self.full_log.warning("No indexer selected, using default")

        return indexer
コード例 #15
0
def get_indexer(fpath, model, room_id):
    """Return the cached AnnoyIndexer stored at *fpath*, or None so callers
    fall back to gensim's default (exact) search when no index file exists."""
    if not os.path.exists(fpath):
        # No index on disk: indexer default is None.
        return None
    logging.info("Use annoy_index :: room_id:%s", room_id)
    loaded = AnnoyIndexer()
    loaded.load(fpath)
    # The model is not serialised with the index, so re-attach it.
    loaded.model = model

    return loaded
コード例 #16
0
def f(process_id):
    """Worker: load the shared model, run one annoy-backed query and report
    this process's memory use (process_id is unused; kept for the pool API)."""
    pid = os.getpid()
    print('Process Id: {}'.format(pid))
    process = psutil.Process(pid)
    loaded_model = Word2Vec.load('/tmp/mymodel.pkl')
    query_vector = loaded_model.wv["science"]
    indexer = AnnoyIndexer(loaded_model, 100)
    approximate_neighbors = loaded_model.wv.most_similar([query_vector],
                                                         topn=5,
                                                         indexer=indexer)
    print('\nMemory used by process {}: {}\n---'.format(
        os.getpid(), process.memory_info()))
コード例 #17
0
ファイル: wordcalc.py プロジェクト: juvu3/WordCalc
 def train_with_annoy(self):
     """Build a 200-tree annoy index over the word vectors, save it to disk,
     and report progress over RTX while tracking ``self.status``."""
     self.status = 3
     push.push_to_rtx(push.generate_rtx_markdown("annoy向量空间开始注水"))
     self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)
     index_path = 'tc_index_genoy.index'
     self.annoy_index.save(index_path)
     # The saved index can later be restored without rebuilding:
     #   annoy_index = AnnoyIndexer()
     #   annoy_index.load(index_path)
     #   annoy_index.model = tc_wv_model
     push.push_to_rtx(push.generate_rtx_markdown("annoy向量空间注水完毕"))
     self.status = 4
コード例 #18
0
    def assertLoadedIndexEqual(self, index, model):
        """Round-trip *index* through save/load and assert it is unchanged
        (the model must be re-attached, as it is not serialised)."""
        from gensim.similarities.index import AnnoyIndexer

        index.save('index')

        reloaded = AnnoyIndexer()
        reloaded.load('index')
        reloaded.model = model

        self.assertEqual(index.index.f, reloaded.index.f)
        self.assertEqual(index.labels, reloaded.labels)
        self.assertEqual(index.num_trees, reloaded.num_trees)
コード例 #19
0
    def testSaveLoad(self):
        """An indexer saved to disk must load back with identical structure
        once its model is re-attached (the model is not serialised)."""
        from gensim.similarities.index import AnnoyIndexer

        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        saved, restored = self.index, self.index2
        self.assertEqual(saved.index.f, restored.index.f)
        self.assertEqual(saved.labels, restored.labels)
        self.assertEqual(saved.num_trees, restored.num_trees)
コード例 #20
0
 def __init__(self,
              path_to_dictionary="mipt_vecs.w2v",
              indexer=None,
              cache_dict=False,
              partition=0.5):
     """Load the word2vec dictionary, optionally with an annoy index and a
     replacement cache."""
     # (original note, translated) if the dictionary is missing, download it
     # — no download actually happens here; the path must already exist.
     self.model = gensim.models.KeyedVectors.load_word2vec_format(
         path_to_dictionary, binary=True, unicode_errors="ignore")
     if indexer == "annoy":
         self.annoy_index = AnnoyIndexer(self.model, num_trees=10)
     else:
         self.annoy_index = None
     self.replace_dict = {} if cache_dict else None
     self.partition = partition
コード例 #21
0
ファイル: test_similarities.py プロジェクト: zanderdk/gensim
    def setUp(self):
        """Skip the suite when annoy is missing; otherwise train a doc2vec
        model and build a 300-tree index over its document vectors."""
        try:
            import annoy  # noqa: F401 -- availability check only
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        model = doc2vec.Doc2Vec(sentences, min_count=1)
        model.init_sims()
        self.model = model
        self.index = AnnoyIndexer(model, 300)
        self.vector = model.docvecs.doctag_syn0norm[0]
コード例 #22
0
ファイル: test_similarities.py プロジェクト: zanderdk/gensim
    def setUp(self):
        """Skip the suite when annoy is missing; otherwise train a small
        word2vec model and build a 10-tree index over it."""
        try:
            import annoy  # noqa: F401 -- availability check only
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        model = word2vec.Word2Vec(texts, min_count=1)
        model.init_sims()
        self.model = model
        self.index = AnnoyIndexer(model, 10)
        self.vector = model.syn0norm[0]
コード例 #23
0
    def assertLoadedIndexEqual(self, index, model):
        """Save *index* to a temp file, reload it, and assert the reloaded
        copy matches (after re-attaching the model, which is not saved)."""
        from gensim.similarities.index import AnnoyIndexer

        fname = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(fname)

        restored = AnnoyIndexer()
        restored.load(fname)
        restored.model = model

        self.assertEqual(index.index.f, restored.index.f)
        self.assertEqual(index.labels, restored.labels)
        self.assertEqual(index.num_trees, restored.num_trees)
コード例 #24
0
ファイル: similarity.py プロジェクト: zhaoqinghai/harvester
def predict(text):
    """Infer a doc2vec vector for *text* and print its two nearest
    neighbours from the pre-built annoy index."""
    model = doc2vec.Doc2Vec.load('../models/doc2vec.model')
    indexer = AnnoyIndexer()
    indexer.load('../models/dv_index')
    # The model is not stored inside the index file; re-attach it.
    indexer.model = model
    word_vecs = [model[token] for token in transform_text(text, strip=False)]
    print(word_vecs)
    inferred = model.infer_vector(transform_text(text, strip=False))
    print(inferred)
    print(indexer.most_similar(inferred, 2))
コード例 #25
0
    def testSaveLoad(self):
        """A saved indexer must reload with identical structure once the
        model is manually re-attached (it is not serialised with the index)."""
        from gensim.similarities.index import AnnoyIndexer

        fname = get_tmpfile('gensim_similarities.tst.pkl')
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        saved, restored = self.index, self.index2
        self.assertEqual(saved.index.f, restored.index.f)
        self.assertEqual(saved.labels, restored.labels)
        self.assertEqual(saved.num_trees, restored.num_trees)
コード例 #26
0
def load_w2v():
    """Load the normed StackOverflow word vectors and their annoy index.

    :return: a Word2Vec wrapper over the loaded vectors and index
    """
    print("Loading gensim pre-trained model")
    # model = KeyedVectors.load_word2vec_format("SO_vectors_200.bin", binary=True)
    # Above is intolerably slow and large, normed by code found here: https://stackoverflow.com/a/56963501
    model = KeyedVectors.load("SO_vectors_normed", mmap='r')

    # Use this to load the provided AnnoyIndex
    annoy_index = AnnoyIndexer()
    annoy_index.load('SO_vectors_normed_annoy_index')
    # NOTE(review): annoy_index.model is never re-attached after load();
    # confirm the wrapper below supplies the model to the indexer itself.

    # Use this to generate a new AnnoyIndex in ram, number is n-gram size (2 is recommended and seems to work best here)
    # annoy_index = AnnoyIndexer(model, 3)

    # NOTE(review): gensim's own Word2Vec does not take (model, index=...) —
    # presumably this is a project-local Word2Vec wrapper class; verify.
    return Word2Vec(model, index=annoy_index)
コード例 #27
0
ファイル: augment.py プロジェクト: nguyenvulebinh/vlsp-hsd
def create_sim_dict(word_map,
                    model_path,
                    similar_threshold=0.5,
                    use_annoy=True,
                    annoy_path=None):
    """Map each word id in *word_map* to the id of its nearest w2v neighbour.

    Only pairs whose neighbour is itself in word_map and whose similarity
    exceeds *similar_threshold* are kept.

    :param word_map: dict mapping word -> id
    :param model_path: binary word2vec file for neighbour lookup
    :param similar_threshold: minimum similarity to record a pair
    :param use_annoy: use approximate (annoy) neighbour search
    :param annoy_path: optional pre-built annoy index to load
    :return: dict mapping word id -> neighbour word id
    """
    w2v = KeyedVectors.load_word2vec_format(model_path, binary=True)
    if use_annoy:
        if annoy_path is None:
            indexer = AnnoyIndexer(w2v, 100)
        else:
            indexer = AnnoyIndexer()
            indexer.load(annoy_path)

    sim_dict = dict()
    for word in word_map:
        try:
            # [1] skips the top hit, which is always the word itself.
            closest, score = w2v.wv.most_similar(
                word, topn=2, indexer=indexer if use_annoy else None)[1]
            if score > similar_threshold and closest in word_map:
                sim_dict[word_map[word]] = word_map[closest]
        # BUG FIX: narrowed from bare `except:` so KeyboardInterrupt and
        # SystemExit are not swallowed; OOV words are still skipped.
        except Exception:
            continue

    return sim_dict
コード例 #28
0
    def _ann_indexer(self):
        """Build and return a 200-tree annoy index over the current model.

        This function should be in the training process; it lives here for
        temporary usage.  Annoy searches for points in space close to a query
        point and builds large read-only, mmapped data structures that many
        processes can share — here it finds similarity between words or
        documents in a vector space.

        Returns:
            Annoy index object if self.indexed is True. None if we want to use
            gensim built-in index.
        """
        logging.info('indexing the model %s', self.model_name)
        self.model.init_sims()
        return AnnoyIndexer(self.model, 200)
コード例 #29
0
def train_item2vec(df=None, sessions=None, samples=None):
    """Train (or incrementally re-train) the item2vec model and build its annoy index.

    Either pass raw data (``df`` + ``sessions``) to generate room samples,
    or pass pre-built ``samples`` directly.  The model and its annoy index
    are saved under ``st.BASE_MODEL``.

    :param df: raw data used to generate rooms (when samples is None)
    :param sessions: session data paired with df
    :param samples: pre-generated training samples; overrides df/sessions
    :raises NotImplementedError: when both df and samples are None
    """
    if df is None and samples is None:
        # NOTE(review): ValueError would describe this misuse better, but
        # callers may already catch NotImplementedError.
        raise NotImplementedError(
            ">>> Must be specific no items. Can not set `df` and `samples` to None"  # noqa
        )

    if samples is None:
        gen_rooms = RoomsGenerator(df, sessions)
    else:
        gen_rooms = samples

    start_ = time.time()
    model_i2v_path = os.path.join(st.BASE_MODEL,
                                  "{}.model".format(st.ITEM2VEC_KEY))
    if os.path.exists(model_i2v_path):
        # A model already exists: extend its vocabulary and continue training.
        logging.info("Load pre-train model")
        model = Word2Vec.load(model_i2v_path)
        logging.info("Vocabulary before re-training: %d", len(model.wv.vocab))

        model.build_vocab(gen_rooms, update=True)
        logging.info("Vocabulary after re-training: %d", len(model.wv.vocab))
        model.train(gen_rooms,
                    total_examples=model.corpus_count,
                    epochs=model.iter,
                    callbacks=())
        logging.info("Pre-train model took %d's'", time.time() - start_)
    else:
        # No saved model: train from scratch with the project settings.
        model = Word2Vec(gen_rooms,
                         sg=st.SG,
                         size=st.I2V_DIM,
                         window=st.WINDOWS,
                         min_count=st.MIN_COUNT,
                         workers=st.WORKERS,
                         iter=st.EPOCHS,
                         sample=st.SAMPLE,
                         negative=st.NS,
                         compute_loss=st.COMPUTE_LOSS,
                         callbacks=[Timer(start_)])

    logging.info("Saving item2vec model")
    model.save(model_i2v_path)

    logging.info("Build annoy index for item2vec model")
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(
        os.path.join(st.BASE_MODEL, "{}.model".format(st.ANNOY_INDEX_KEY)))
コード例 #30
0
    def build_ann_indexer(self, num_trees=100):
        """Build an annoy index over the model and remember it on the instance.

        Annoy is an open source library to search for points in space that are
        close to a given query point; it builds large read-only, mmapped data
        structures many processes can share.  Here it serves word/document
        similarity lookup in a vector space.

        Args:
            num_trees (int): A positive integer which effects the build time
                and the index size.  A larger value will give more accurate
                results, but larger indexes. (https://github.com/spotify/annoy)
        Returns:
            Annoy index object (also stored as ``self.indexer``).
        """
        logging.info('indexing the model %s', self.model_name)
        self.model.init_sims()
        self.indexer = AnnoyIndexer(self.model, num_trees)
        return self.indexer