Code Example #1
def get_indexer(fpath, model, room_id):
    if os.path.exists(fpath):
        logging.info("Use annoy_index :: room_id:%s", room_id)
        annoy_index = AnnoyIndexer()
        annoy_index.load(fpath)
        annoy_index.model = model

        return annoy_index
    else:
        # indexer: default is None
        return None
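For reference, the index that get_indexer loads has to be built and saved beforehand. A minimal sketch of that step, assuming a trained gensim model and an arbitrary num_trees value (the helper name and the num_trees setting are assumptions, not taken from the original project):

from gensim.similarities.index import AnnoyIndexer

def build_indexer(fpath, model, num_trees=100):
    # Hypothetical helper: build an Annoy index over the model's vectors and
    # persist it so that get_indexer(fpath, model, room_id) can load it later.
    annoy_index = AnnoyIndexer(model, num_trees)  # num_trees=100 is an assumed setting
    annoy_index.save(fpath)
    return annoy_index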
Code Example #2
File: test_similarities.py  Project: leahic/gensim
    def assertLoadedIndexEqual(self, index, model):
        from gensim.similarities.index import AnnoyIndexer

        index.save('index')

        index2 = AnnoyIndexer()
        index2.load('index')
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
Code Example #3
File: similarity.py  Project: zhaoqinghai/harvester
def predict(text):
    model = doc2vec.Doc2Vec.load('../models/doc2vec.model')
    indexer = AnnoyIndexer()
    indexer.load('../models/dv_index')
    indexer.model = model
    # print(indexer.labels)
    new_vec = []
    for word in transform_text(text, strip=False):
        new_vec.append(model[word])
    print(new_vec)
    sv = model.infer_vector(transform_text(text, strip=False))
    print(sv)
    print(indexer.most_similar(sv, 2))
Code Example #4
def f(process_id):
    print('Process Id: {}'.format(os.getpid()))
    process = psutil.Process(os.getpid())
    new_model = Word2Vec.load('/tmp/mymodel.pkl')
    vector = new_model.wv["science"]
    annoy_index = AnnoyIndexer()
    annoy_index.load('/tmp/mymodel.index')
    annoy_index.model = new_model
    approximate_neighbors = new_model.wv.most_similar([vector],
                                                      topn=5,
                                                      indexer=annoy_index)
    print('\nMemory used by process {}: {}\n---'.format(
        os.getpid(), process.memory_info()))
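The worker above is evidently meant to be launched in separate processes so that each one loads the saved model and index and reports its own memory use. A minimal launch sketch, assuming the standard multiprocessing module (the worker count of 2 is an arbitrary choice):

from multiprocessing import Process

# Spawn two workers; each loads /tmp/mymodel.pkl and /tmp/mymodel.index independently.
workers = [Process(target=f, args=(i,)) for i in range(2)]
for p in workers:
    p.start()
for p in workers:
    p.join()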
Code Example #5
    def assertLoadedIndexEqual(self, index, model):
        from gensim.similarities.index import AnnoyIndexer

        fname = get_tmpfile('gensim_similarities.tst.pkl')
        index.save(fname)

        index2 = AnnoyIndexer()
        index2.load(fname)
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
Code Example #6
def get_annoy(w2v, embedding_type='w2v'):
    dims = 100
    annoy_file_name = data_dir + '/annoy_index_' + '_' + str(dims) + '_' + embedding_type + '_' + str(len(w2v.vocab))
    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
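A hedged usage sketch for get_annoy; the w2v object (assumed to be a gensim KeyedVectors instance), the query word and the topn value below are illustrative assumptions only:

# Assumed usage: build or load the Annoy index once, then route queries through it.
annoy_index = get_annoy(w2v, embedding_type='w2v')
neighbours = w2v.most_similar([w2v['science']], topn=5, indexer=annoy_index)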
Code Example #7
    def index_vector(self, dimensions=300, save=False):
        '''
        Build the annoy_index used by the 'is_word_pairs_similar' function.
        With annoy_index, execution may be slower than with the normal index.
        '''
        path = Path.cwd().parent.joinpath('preprocessed/annoy.index')
        if path.exists():
            annoy_index = AnnoyIndexer()
            annoy_index.load(str(path))
            annoy_index.model = self.embedding
        else:
            annoy_index = AnnoyIndexer(self.embedding, dimensions)
            if save:
                annoy_index.save(str(path))
        return annoy_index
Code Example #8
File: w2v_fcst.py  Project: MathewXJ/PycharmProjects
from gensim.models import Word2Vec
from gensim.similarities.index import AnnoyIndexer
import os
from app.common.config import model_path
from app.util.pre_model import W2V_VOCABULARY_SET, VOCABULARY_SET
from app.util.remove_utils import remove_not_sports

# Load the specified model from the model directory
path_to_model = os.path.join(model_path, 'word2vec')
model = Word2Vec.load(path_to_model)

# Load the annoy indexer from disk
path_to_indexer = os.path.join(model_path, 'annoy_indexer_100')
annoy_indexer_100 = AnnoyIndexer()
annoy_indexer_100.load(path_to_indexer)
annoy_indexer_100.model = model


# Use the model to find related words for the input terms
# 2018-03-08: use the indexer to fix the high CPU usage problem
def associate_words(words, cont_type, with_model=model, top_n=10):
    words = [w.strip() for w in words]
    words = list(filter(lambda x: x in VOCABULARY_SET, words))
    res = {}
    tops = []
    if words is None or len(words) == 0:
        return res
    for i in range(len(words)):
        try:
            tops = (with_model.most_similar(positive=words[0:(len(words) - i)],
                                            topn=top_n,
                                            # assumed completion from here on: the
                                            # source example is cut off at this point
                                            indexer=annoy_indexer_100))
            break
        except KeyError:
            continue
    res[cont_type] = tops  # assumed use of cont_type as the result key
    return res
Code Example #9
import codecs, json
from collections import defaultdict

import numpy as np
from matplotlib import pyplot as plt

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

from gensim.models import KeyedVectors
from gensim.similarities.index import AnnoyIndexer

wv_ent = KeyedVectors.load_word2vec_format('entity2vec.bin', binary=True)
annoy_index_ent = AnnoyIndexer()
annoy_index_ent.load('entity2vec.index')
annoy_index_ent.model = wv_ent

wv_rel = KeyedVectors.load_word2vec_format('relation2vec.bin', binary=True)
annoy_index_rel = AnnoyIndexer()
annoy_index_rel.load('relation2vec.index')
annoy_index_rel.model = wv_rel


def tsne_vis(X, labels, name):
    tsne = TSNE(n_components=2).fit_transform(X)
    plt.figure(figsize=(50, 50))
    for i, label in enumerate(labels):
        x, y = tsne[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     # the call is closed here and the figure saved under `name`
                     # as an assumed completion; the source example is cut off here
                     xy=(x, y))
    plt.savefig(name)
Code Example #10
    annoy_index.save(annoy_file)


# extend_glove()
# build_word2vec()

info('loading model')
model = KeyedVectors.load(w2v_model)
info(model)

info('init sims')
model.init_sims()

# build_annoy(model)
info('loading annoy indexer')
annoy_index = AnnoyIndexer()
annoy_index.load(annoy_file)
annoy_index.model = model

noise = np.random.random([DIM])
noise = np.zeros(DIM)
info('querying with Annoy')
with DisableLogger():
    val = model.most_similar([noise, noise], topn=3, indexer=annoy_index)
info(val)

info('querying with gensim')
with DisableLogger():
    val = model.most_similar([noise, noise], topn=1)
info(val)
Code Example #11
# You can save and load your indexes from/to disk to prevent having to
# construct them each time. This will create two files on disk, *fname* and
# *fname.d*. Both files are needed to correctly restore all attributes. Before
# loading an index, you will have to create an empty AnnoyIndexer object.
#
fname = '/tmp/mymodel.index'

# Persist index to disk
annoy_index.save(fname)

# Load index back
import os.path
if os.path.exists(fname):
    annoy_index2 = AnnoyIndexer()
    annoy_index2.load(fname)
    annoy_index2.model = model

# Results should be identical to above
vector = model.wv["science"]
approximate_neighbors2 = model.wv.most_similar([vector],
                                               topn=11,
                                               indexer=annoy_index2)
for neighbor in approximate_neighbors2:
    print(neighbor)

assert approximate_neighbors == approximate_neighbors2

###############################################################################
# Be sure to use the same model at load that was used originally, otherwise you
# will get unexpected behaviors.
#
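# Following that note, a small sanity check in the spirit of the test examples
# above (a sketch; it assumes ``annoy_index`` and ``annoy_index2`` from the
# preceding snippet are still in scope):
#
assert annoy_index2.index.f == annoy_index.index.f      # vector dimensionality
assert annoy_index2.labels == annoy_index.labels        # label list
assert annoy_index2.num_trees == annoy_index.num_trees  # number of Annoy trees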