def get_indexer(fpath, model, room_id):
    """Return an AnnoyIndexer restored from ``fpath``, or ``None`` if the file is absent.

    Args:
        fpath: Path of a previously saved Annoy index.
        model: Model object attached to the restored indexer as ``.model``.
        room_id: Identifier used only in the log message.

    Returns:
        The loaded ``AnnoyIndexer`` with ``model`` attached, or ``None`` when
        ``fpath`` does not exist.
    """
    if not os.path.exists(fpath):
        # No persisted index on disk: report "no indexer" to the caller.
        return None

    logging.info("Use annoy_index :: room_id:%s", room_id)
    indexer = AnnoyIndexer()
    indexer.load(fpath)
    # The loaded index does not carry the model, so it is attached explicitly.
    indexer.model = model
    return indexer
def assertLoadedIndexEqual(self, index, model):
    """Assert that ``index`` survives a save/load round trip unchanged.

    The index is saved into a private temporary directory instead of a fixed
    ``'index'`` file in the current working directory, so the test neither
    leaves artifacts behind nor collides with other tests or parallel runs.

    Args:
        index: The AnnoyIndexer under test; it is saved to disk.
        model: Model attached to the reloaded indexer as ``.model``.

    Raises:
        AssertionError: If ``index.f``, ``labels`` or ``num_trees`` differ
            between the original and the reloaded indexer.
    """
    import os
    import shutil
    import tempfile

    from gensim.similarities.index import AnnoyIndexer

    tmp_dir = tempfile.mkdtemp()
    try:
        fname = os.path.join(tmp_dir, 'index')
        index.save(fname)

        index2 = AnnoyIndexer()
        index2.load(fname)
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
    finally:
        # Best-effort cleanup: a still-open index file may refuse deletion on
        # some platforms, which must not turn a passing test into a failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)
def assertLoadedIndexEqual(self, index, model):
    """Assert that ``index`` survives a save/load round trip unchanged.

    The index is saved into a private temporary directory instead of a fixed
    ``'index'`` file in the current working directory, so the test neither
    leaves artifacts behind nor collides with other tests or parallel runs.

    Args:
        index: The AnnoyIndexer under test; it is saved to disk.
        model: Model attached to the reloaded indexer as ``.model``.

    Raises:
        AssertionError: If ``index.f``, ``labels`` or ``num_trees`` differ
            between the original and the reloaded indexer.
    """
    import os
    import shutil
    import tempfile

    from gensim.similarities.index import AnnoyIndexer

    tmp_dir = tempfile.mkdtemp()
    try:
        fname = os.path.join(tmp_dir, 'index')
        index.save(fname)

        index2 = AnnoyIndexer()
        index2.load(fname)
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
    finally:
        # Best-effort cleanup: a still-open index file may refuse deletion on
        # some platforms, which must not turn a passing test into a failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)
def predict(text):
    """Print per-token vectors, the inferred vector and the 2 nearest index hits for ``text``.

    Loads the Doc2Vec model and its Annoy index from fixed relative paths on
    every call. Nothing is returned; all results go to stdout.

    Args:
        text: Raw input passed to ``transform_text(text, strip=False)``.
    """
    d2v = doc2vec.Doc2Vec.load('../models/doc2vec.model')

    annoy = AnnoyIndexer()
    annoy.load('../models/dv_index')
    annoy.model = d2v

    # One model lookup per token produced by transform_text.
    token_vectors = [d2v[token] for token in transform_text(text, strip=False)]
    print(token_vectors)

    inferred = d2v.infer_vector(transform_text(text, strip=False))
    print(inferred)
    print(annoy.most_similar(inferred, 2))
def f(process_id):
    """Load the saved model and Annoy index, run one query, and print this process's memory use.

    Args:
        process_id: Accepted but not read by the body; the OS pid from
            ``os.getpid()`` is what gets printed.
    """
    pid = os.getpid()
    print('Process Id: {}'.format(pid))
    proc = psutil.Process(pid)

    w2v = Word2Vec.load('/tmp/mymodel.pkl')
    query = w2v.wv["science"]

    indexer = AnnoyIndexer()
    indexer.load('/tmp/mymodel.index')
    indexer.model = w2v

    # The neighbours themselves are not used; the query is still executed
    # before memory is sampled below.
    w2v.wv.most_similar([query], topn=5, indexer=indexer)

    print('\nMemory used by process {}: {}\n---'.format(pid, proc.memory_info()))
def assertLoadedIndexEqual(self, index, model):
    """Check that saving ``index`` and loading it back preserves its key attributes.

    Args:
        index: The AnnoyIndexer under test; it is saved to a temp file.
        model: Model attached to the reloaded indexer as ``.model``.

    Raises:
        AssertionError: If ``index.f``, ``labels`` or ``num_trees`` differ
            between the original and the reloaded indexer.
    """
    from gensim.similarities.index import AnnoyIndexer

    tmp_path = get_tmpfile('gensim_similarities.tst.pkl')
    index.save(tmp_path)

    reloaded = AnnoyIndexer()
    reloaded.load(tmp_path)
    reloaded.model = model

    # Compare the original against the round-tripped copy, field by field.
    self.assertEqual(index.index.f, reloaded.index.f)
    self.assertEqual(index.labels, reloaded.labels)
    self.assertEqual(index.num_trees, reloaded.num_trees)
def assertLoadedIndexEqual(self, index, model):
    """Check that saving ``index`` and loading it back preserves its key attributes.

    Args:
        index: The AnnoyIndexer under test; it is saved to a temp file.
        model: Model attached to the reloaded indexer as ``.model``.

    Raises:
        AssertionError: If ``index.f``, ``labels`` or ``num_trees`` differ
            between the original and the reloaded indexer.
    """
    from gensim.similarities.index import AnnoyIndexer

    tmp_path = get_tmpfile('gensim_similarities.tst.pkl')
    index.save(tmp_path)

    reloaded = AnnoyIndexer()
    reloaded.load(tmp_path)
    reloaded.model = model

    # Compare the original against the round-tripped copy, field by field.
    self.assertEqual(index.index.f, reloaded.index.f)
    self.assertEqual(index.labels, reloaded.labels)
    self.assertEqual(index.num_trees, reloaded.num_trees)
def get_annoy(w2v, embedding_type='w2v'):
    """Return an AnnoyIndexer for ``w2v``, reusing a cached index file when one exists.

    The cache file lives under the module-level ``data_dir``; its name encodes
    the constant below, ``embedding_type`` and the vocabulary size of ``w2v``.

    The index is now built from, and attached to, the ``w2v`` argument. The
    previous code used an unrelated free name (``word_vectors``) for both,
    so the index could disagree with the model that determined the cache
    file name, or raise ``NameError`` when no such global existed.

    Args:
        w2v: Embedding model exposing ``.vocab``; it is indexed and attached
            to the returned indexer as ``.model``.
        embedding_type: Label embedded in the cache file name.

    Returns:
        An ``AnnoyIndexer`` bound to ``w2v``.
    """
    # NOTE(review): this value is passed as AnnoyIndexer's second positional
    # argument, which gensim documents as ``num_trees`` — confirm that "dims"
    # is really meant as a tree count.
    dims = 100

    # The double underscore after "annoy_index" is kept on purpose so that
    # index files written by earlier runs are still found.
    annoy_file_name = '{}/annoy_index__{}_{}_{}'.format(
        data_dir, dims, embedding_type, len(w2v.vocab))

    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        # The model is attached explicitly after loading.
        annoy_index.model = w2v
        return annoy_index

    logging.info("Creating Annoy")
    annoy_index = AnnoyIndexer(w2v, dims)
    annoy_index.save(annoy_file_name)
    logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
def index_vector(self, dimensions=300, save=False):
    '''
    Return the annoy index used by 'is_word_pairs_similar'.

    If 'preprocessed/annoy.index' exists next to the current working
    directory's parent, it is loaded and bound to self.embedding; otherwise a
    new index is built from self.embedding and written to that path only when
    `save` is true.

    Using annoy_index, execution may be slower than with the normal index.

    NOTE(review): `dimensions` is handed to AnnoyIndexer as its second
    positional argument, which gensim documents as `num_trees` - confirm.
    '''
    index_file = Path.cwd().parent / 'preprocessed/annoy.index'

    if not index_file.exists():
        # Nothing cached: build from the embedding, optionally persisting it.
        built = AnnoyIndexer(self.embedding, dimensions)
        if save:
            built.save(str(index_file))
        return built

    cached = AnnoyIndexer()
    cached.load(str(index_file))
    cached.model = self.embedding
    return cached
from gensim.models import Word2Vec from gensim.similarities.index import AnnoyIndexer import os from app.common.config import model_path from app.util.pre_model import W2V_VOCABULARY_SET, VOCABULARY_SET from app.util.remove_utils import remove_not_sports # 加载model目录下指定模型 path_to_model = os.path.join(model_path, 'word2vec') model = Word2Vec.load(path_to_model) # 从disk加载annoy indexer path_to_indexer = os.path.join(model_path, 'annoy_indexer_100') annoy_indexer_100 = AnnoyIndexer() annoy_indexer_100.load(path_to_indexer) annoy_indexer_100.model = model # 使用模型计算输入词组 # 2018-03-08 使用indexer解决cpu占用问题 def associate_words(words, cont_type, with_model=model, top_n=10): words = [w.strip() for w in words] words = list(filter(lambda x: x in VOCABULARY_SET, words)) res = {} tops = [] if words is None or len(words) == 0: return res for i in range(len(words)): try: tops = (with_model.most_similar(positive=words[0:(len(words) - i)], topn=top_n,
import codecs, json from collections import defaultdict import numpy as np from matplotlib import pyplot as plt from sklearn.manifold import TSNE from sklearn.cluster import KMeans from gensim.models import KeyedVectors from gensim.similarities.index import AnnoyIndexer wv_ent = KeyedVectors.load_word2vec_format('entity2vec.bin', binary=True) annoy_index_ent = AnnoyIndexer() annoy_index_ent.load('entity2vec.index') annoy_index_ent.model = wv_ent wv_rel = KeyedVectors.load_word2vec_format('relation2vec.bin', binary=True) annoy_index_rel = AnnoyIndexer() annoy_index_rel.load('relation2vec.index') annoy_index_rel.model = wv_rel def tsne_vis(X, labels, name): tsne = TSNE(n_components=2).fit_transform(X) plt.figure(figsize=(50, 50)) for i, label in enumerate(labels): x, y = tsne[i, :] plt.scatter(x, y) plt.annotate(label, xy=(x, y),
annoy_index.save(annoy_file) # extend_glove() # build_word2vec() info('loading model') model = KeyedVectors.load(w2v_model) info(model) info('init sims') model.init_sims() # build_annoy(model) info('loading annoy indexer') annoy_index = AnnoyIndexer() annoy_index.load(annoy_file) annoy_index.model = model noise = np.random.random([DIM]) noise = np.zeros(DIM) info('querying with Annoy') with DisableLogger(): val = model.most_similar([noise, noise], topn=3, indexer=annoy_index) info(val) info('querying with gensim') with DisableLogger(): val = model.most_similar([noise, noise], topn=1) info(val)
# You can save and load your indexes from/to disk to prevent having to # construct them each time. This will create two files on disk, *fname* and # *fname.d*. Both files are needed to correctly restore all attributes. Before # loading an index, you will have to create an empty AnnoyIndexer object. # fname = '/tmp/mymodel.index' # Persist index to disk annoy_index.save(fname) # Load index back import os.path if os.path.exists(fname): annoy_index2 = AnnoyIndexer() annoy_index2.load(fname) annoy_index2.model = model # Results should be identical to above vector = model.wv["science"] approximate_neighbors2 = model.wv.most_similar([vector], topn=11, indexer=annoy_index2) for neighbor in approximate_neighbors2: print(neighbor) assert approximate_neighbors == approximate_neighbors2 ############################################################################### # Be sure to use the same model at load that was used originally, otherwise you # will get unexpected behaviors. #