def _load_classifier(self, **kwargs):
    """Build the classifier selected by ``self.classifier_type``.

    ``'knn'`` returns a ``NearestNeighbors`` created with ``indexed=False``.
    ``'ann'`` first downloads every file listed under ``self.s3_path`` that
    is not yet present in ``self.temporary_directory``, loads the Annoy
    index ``<classifier_id>.index`` from that directory and returns a
    ``NearestNeighbors`` wrapping it. Any other type prints a notice and
    returns ``None``.

    Extra keyword arguments are forwarded to ``NearestNeighbors``.
    """
    kind = self.classifier_type

    if kind != 'ann':
        if kind == 'knn':
            return NearestNeighbors(s3_conn=self.s3_conn, indexed=False,
                                    **kwargs)
        print('Not implemented yet!')
        return None

    # Fetch only the files that are missing from the local cache directory.
    for name in list_files(self.s3_conn, self.s3_path):
        local_path = os.path.join(self.temporary_directory, name)
        if os.path.exists(local_path):
            continue
        logging.warning('calling download from %s to %s',
                        self.s3_path + name, local_path)
        download(self.s3_conn, local_path, os.path.join(self.s3_path, name))

    index_path = os.path.join(self.temporary_directory,
                              self.classifier_id + '.index')
    ann_index = AnnoyIndexer()
    ann_index.load(index_path)
    return NearestNeighbors(s3_conn=self.s3_conn, indexer=ann_index, **kwargs)
def create_sim_dict(file, vectors, min_sim=0.55, topn=10, num_trees=200):
    """Pickle a ``{word: neighbours}`` mapping computed with an Annoy index.

    For every word in ``vectors.vocab`` the ``topn`` approximate neighbours
    are queried, and only the entries whose second element (the similarity)
    is strictly greater than ``min_sim`` are kept. The resulting dict is
    written to ``file`` with ``pickle``.
    """
    annoy = AnnoyIndexer(vectors, num_trees=num_trees)

    def close_neighbours(word):
        # Query by vector, then drop anything at or below the threshold.
        candidates = annoy.most_similar(vectors.get_vector(word), topn)
        return [entry for entry in candidates if entry[1] > min_sim]

    sim_dict = {word: close_neighbours(word)
                for word in messages.pbar(vectors.vocab)}

    with open(file, 'wb') as fileout:
        pickle.dump(sim_dict, fileout)
def get_indexer(fpath, model, room_id):
    """Load the Annoy index stored at ``fpath`` and attach ``model`` to it.

    Returns ``None`` when ``fpath`` does not exist, so callers can pass the
    result straight through as an optional indexer. ``room_id`` is only used
    in the log message.
    """
    if not os.path.exists(fpath):
        # No saved index: the default indexer is None.
        return None

    logging.info("Use annoy_index :: room_id:%s", room_id)
    indexer = AnnoyIndexer()
    indexer.load(fpath)
    indexer.model = model
    return indexer
def setUp(self):
    """Train a small Doc2Vec model and build a 300-tree Annoy index on it.

    The test is skipped when the ``annoy`` package cannot be imported.
    """
    try:
        import annoy  # noqa:F401 -- imported only to probe availability
    except ImportError:
        raise unittest.SkipTest("Annoy library is not available")

    from gensim.similarities.index import AnnoyIndexer

    self.model = doc2vec.Doc2Vec(sentences, min_count=1)
    self.model.init_sims()

    trained = self.model
    self.index = AnnoyIndexer(trained, 300)
    # First normalised document vector, used as the query in the tests.
    self.vector = trained.docvecs.doctag_syn0norm[0]
def assertLoadedIndexEqual(self, index, model):
    """Assert that ``index`` survives a save/load round trip.

    The index is saved into a fresh temporary directory (instead of the
    current working directory, which used to be left littered with
    ``index`` and ``index.d``), loaded into a new ``AnnoyIndexer`` with
    ``model`` attached, and compared on vector size, labels and tree count.
    """
    import os
    import shutil
    import tempfile

    from gensim.similarities.index import AnnoyIndexer

    tmpdir = tempfile.mkdtemp()
    try:
        fname = os.path.join(tmpdir, 'index')
        index.save(fname)

        index2 = AnnoyIndexer()
        index2.load(fname)
        index2.model = model

        self.assertEqual(index.index.f, index2.index.f)
        self.assertEqual(index.labels, index2.labels)
        self.assertEqual(index.num_trees, index2.num_trees)
    finally:
        # Best effort: the loaded index may still keep the file open on
        # some platforms, so do not let cleanup mask the test result.
        shutil.rmtree(tmpdir, ignore_errors=True)
def testSaveLoad(self):
    """Round-trip ``self.index`` through save/load and compare the result.

    The files are written into a temporary directory that is removed
    afterwards; previously ``index`` and ``index.d`` were left behind in the
    current working directory.
    """
    import os
    import shutil
    import tempfile

    from gensim.similarities.index import AnnoyIndexer

    tmpdir = tempfile.mkdtemp()
    try:
        fname = os.path.join(tmpdir, 'index')
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
    finally:
        # Best effort: the loaded index may still keep the file open on
        # some platforms, so do not let cleanup mask the test result.
        shutil.rmtree(tmpdir, ignore_errors=True)
def setUp(self):
    """Train a small Word2Vec model and build a 10-tree Annoy index on it.

    The test is skipped when the ``annoy`` package cannot be imported.
    """
    try:
        import annoy  # noqa:F401 -- imported only to probe availability
    except ImportError:
        raise unittest.SkipTest("Annoy library is not available")

    from gensim.similarities.index import AnnoyIndexer

    self.model = word2vec.Word2Vec(texts, min_count=1)
    self.model.init_sims()

    trained = self.model
    self.index = AnnoyIndexer(trained, 10)
    # First normalised word vector, used as the query in the tests.
    self.vector = trained.syn0norm[0]
def train_with_annoy(self):
    """Build a 200-tree Annoy index over ``self.tc_wv_model`` and save it.

    ``self.status`` is set to 3 while the index is being built and to 4 once
    it has been saved; a notification is pushed before and after.
    """
    def notify(text):
        push.push_to_rtx(push.generate_rtx_markdown(text))

    self.status = 3
    notify("annoy向量空间开始注水")

    self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)
    # Persist the built index so later runs can load it directly:
    #   annoy_index = AnnoyIndexer()
    #   annoy_index.load(fname)
    #   annoy_index.model = tc_wv_model
    fname = 'tc_index_genoy.index'
    self.annoy_index.save(fname)

    notify("annoy向量空间注水完毕")
    self.status = 4
def testSaveLoad(self):
    """Save ``self.index`` to a temp file, reload it and compare both."""
    from gensim.similarities.index import AnnoyIndexer

    target = get_tmpfile('gensim_similarities.tst.pkl')
    self.index.save(target)

    self.index2 = AnnoyIndexer()
    self.index2.load(target)
    self.index2.model = self.model

    saved, restored = self.index, self.index2
    self.assertEqual(saved.index.f, restored.index.f)
    self.assertEqual(saved.labels, restored.labels)
    self.assertEqual(saved.num_trees, restored.num_trees)
def assertLoadedIndexEqual(self, index, model):
    """Assert that ``index`` is unchanged after a save/load round trip.

    The index is saved to a temp file, loaded into a fresh ``AnnoyIndexer``
    with ``model`` attached, and compared on vector size, labels and tree
    count.
    """
    from gensim.similarities.index import AnnoyIndexer

    target = get_tmpfile('gensim_similarities.tst.pkl')
    index.save(target)

    restored = AnnoyIndexer()
    restored.load(target)
    restored.model = model

    self.assertEqual(index.index.f, restored.index.f)
    self.assertEqual(index.labels, restored.labels)
    self.assertEqual(index.num_trees, restored.num_trees)
def f(process_id):
    """Load the saved model and Annoy index, run one query, report memory.

    Prints the current process id, loads ``/tmp/mymodel.pkl`` and
    ``/tmp/mymodel.index``, performs a top-5 ``most_similar`` query for the
    vector of "science" through the index, and prints the process memory
    info. ``process_id`` is accepted but not used in the body.
    """
    pid = os.getpid()
    print('Process Id: {}'.format(pid))
    process = psutil.Process(pid)

    new_model = Word2Vec.load('/tmp/mymodel.pkl')
    vector = new_model.wv["science"]

    annoy_index = AnnoyIndexer()
    annoy_index.load('/tmp/mymodel.index')
    annoy_index.model = new_model

    # The result is discarded; the query is run so that the memory reading
    # below is taken after the index has actually been used.
    new_model.wv.most_similar([vector], topn=5, indexer=annoy_index)

    print('\nMemory used by process {}: {}\n---'.format(
        pid, process.memory_info()))
def load_w2v():
    """Load the pre-normalised vectors plus their saved Annoy index.

    Returns ``Word2Vec(model, index=annoy_index)``.
    NOTE(review): ``Word2Vec`` is called here with an ``index`` keyword, so
    it is presumably a project wrapper rather than gensim's class -- confirm.
    """
    print("Loading gensim pre-trained model")

    # KeyedVectors.load_word2vec_format("SO_vectors_200.bin", binary=True)
    # was intolerably slow and large; the vectors were normed and re-saved
    # following https://stackoverflow.com/a/56963501 and are memory-mapped.
    vectors = KeyedVectors.load("SO_vectors_normed", mmap='r')

    # Load the provided Annoy index. To build a new one in RAM instead use
    # AnnoyIndexer(vectors, num_trees); the second argument is the number of
    # trees in gensim's API, not an n-gram size.
    indexer = AnnoyIndexer()
    indexer.load('SO_vectors_normed_annoy_index')

    return Word2Vec(vectors, index=indexer)
def load_annoy(annoypath, model):
    '''
    Return an Annoy index for ``model``, building and saving it if needed.

    :param annoypath: file the index is loaded from, or saved to when it
        does not exist yet
    :type annoypath: str
    :param model: model to index; only used when the index must be built
    :type model: Word2Vec
    :return: the loaded or newly built index
    :rtype: AnnoyIndexer
    '''
    if os.path.exists(annoypath):
        aindex = AnnoyIndexer()
        aindex.load(annoypath)
        return aindex

    # Build a 200-tree index and report how long it took.
    print("开始构建annoy索引:当前时间 : " +
          time.asctime(time.localtime(time.time())))
    build_started = time.time()
    aindex = AnnoyIndexer(model, 200)
    print("构建索引完毕 %.2f secs" % (time.time() - build_started))

    # Persist the index so the next call can load it.
    print("开始保存annoy索引")
    save_started = time.time()
    aindex.save(annoypath)
    print("保存索引完毕 %.2f secs" % (time.time() - save_started))
    return aindex
def load_index(self, path):
    """Load every saved Annoy index found below ``path``.

    Walks ``path + field + '/' + unit_type + '/'`` for each combination of
    the module-level ``field_list`` and ``unit_types``. Every file whose name
    contains exactly one dot is loaded as an ``AnnoyIndexer`` and stored as
    ``index[field][unit_type][<name before the dot>]``.
    """
    index = {}
    loaded = 0
    for field in field_list:
        logger.info(u'---------field:' + field)
        by_unit = index[field] = {}
        for unit_type in unit_types:
            by_name = by_unit[unit_type] = {}
            root = path + field + '/' + unit_type + '/'
            for parent, dirnames, filenames in os.walk(root):
                for filename in filenames:
                    pieces = filename.split('.')
                    if len(pieces) != 2:
                        # Skip files with no dot or with several dots.
                        continue
                    stem = pieces[0]
                    loaded += 1
                    logger.info(
                        u'创建AnnoyIndexer %s:field=%s,unit_type=%s,province=%s'
                        % (loaded, field, unit_type, stem))
                    indexer = AnnoyIndexer()
                    by_name[str(stem)] = indexer
                    indexer.load(os.path.join(parent, filename))
    return index
def build_graph(filename, TOPN, A_name, indice2word_name, annoy=False,
                dim=100, tree_num=20):
    """Build and save the nearest-neighbour graph of a word2vec vocabulary.

    The model is read with ``read_w2v(filename, dim)``. A ``V x V`` sparse
    float32 matrix (``V`` = vocabulary size) is filled by ``add_neighbors``
    with ``TOPN`` as its neighbour-count argument, optionally through an
    Annoy index with ``tree_num`` trees when ``annoy`` is true.

    The matrix is saved in CSR form under ``A_name`` and the
    ``{row index: word}`` mapping is pickled to ``indice2word_name``.
    """
    model = read_w2v(filename, dim)
    V = len(model.wv.vocab)
    print("Num. vocab = %i" % V)

    word_indice_dic = {word: i for i, word in enumerate(model.wv.vocab)}
    indice2word = {i: word for word, i in word_indice_dic.items()}

    A = dok_matrix((V, V), dtype=np.float32)
    if annoy:
        print("Using ANNOY...")
        from gensim.similarities.index import AnnoyIndexer
        annoy_index = AnnoyIndexer(model, tree_num)
        add_neighbors(A, TOPN, model, word_indice_dic,
                      annoy_index=annoy_index)
    else:
        add_neighbors(A, TOPN, model, word_indice_dic)

    save_sparse_csr(A_name, A.tocsr())
    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(indice2word_name, "wb") as fileout:
        pickle.dump(indice2word, fileout)
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Checks the Annoy indexer against a small Doc2Vec model."""

    def setUp(self):
        """Train the model and build a 300-tree index; skip without annoy."""
        try:
            import annoy  # noqa:F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbour of document 0's vector is document 0."""
        nearest = self.index.most_similar(self.vector, 1)
        doc, similarity = nearest[0]

        self.assertEqual(doc, 0)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same documents."""
        docvecs = self.model.docvecs
        via_index = docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact = docvecs.most_similar(positive=[self.vector], topn=5)

        approx_words = [hit[0] for hit in via_index]
        exact_words = [hit[0] for hit in exact]
        self.assertEqual(approx_words, exact_words)
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Checks the Annoy indexer against a small Word2Vec model."""

    def setUp(self):
        """Train the model and build a 10-tree index; skip without annoy."""
        try:
            import annoy  # noqa:F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        """The nearest neighbour of word 0's vector is word 0 itself."""
        expected_word = self.model.index2word[0]
        nearest = self.index.most_similar(self.vector, 1)
        word, similarity = nearest[0]

        self.assertEqual(word, expected_word)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same words."""
        via_index = self.model.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact = self.model.most_similar(positive=[self.vector], topn=5)

        approx_words = [hit[0] for hit in via_index]
        exact_words = [hit[0] for hit in exact]
        self.assertEqual(approx_words, exact_words)
def get_similarity(model, args):
    """Append the top-``args.k`` Annoy neighbours of every word to a CSV.

    Words are processed in sorted order. Each CSV row holds the word and a
    list of ``"<first>=<second>"`` strings built from the pairs returned by
    ``most_similar``. Rows are appended to ``args.output_file`` in chunks of
    ``args.save_one_time`` words, and any remainder is written at the end.

    Returns the rows still buffered when the loop ends, i.e. the final
    partial chunk (an empty list when the last chunk was flushed in full).
    """
    indexer = AnnoyIndexer(model, 10)

    def append_chunk(chunk_no, words, rows):
        print("save to csv chunk no: {}".format(chunk_no))
        topk_df = pd.DataFrame({'item': words, 'topk': rows})
        topk_df.to_csv(args.output_file, mode='a', header=False, index=False)

    chunk_i = 0
    item, similarity = [], []
    with tqdm.tqdm(desc="get_similarity",
                   total=len(model.wv.vectors)) as progress:
        for word in np.sort(list(model.wv.vocab.keys())):
            item.append(word)
            neighbours = model.wv.most_similar(
                word, topn=args.k, indexer=indexer)
            similarity.append(
                ['{}={}'.format(first, second) for first, second in neighbours])

            # The buffers are reset after each flush, so their length is the
            # number of words collected since the last write.
            if len(item) % args.save_one_time == 0:
                append_chunk(chunk_i, item, similarity)
                chunk_i += 1
                item, similarity = [], []
            progress.update(1)

    if len(item) > 0:
        append_chunk(chunk_i, item, similarity)
    return similarity
def _initialize_nns_method(self, nnsmethod, annoymodelpath):
    """Prepare the nearest-neighbour backend selected by ``nnsmethod``.

    Does nothing when ``self.nns_method_init_dict[nnsmethod]`` is truthy.
    For ``NNSMethod.KD_TREE`` a ``cKDTree`` over the document vectors is
    built and the doctag keys are cached. For ``NNSMethod.ANNOY`` the index
    is loaded from ``annoymodelpath`` when that file exists, otherwise it is
    built with 50 trees and saved there.

    NOTE(review): this method does not update ``nns_method_init_dict``
    itself -- presumably the caller does; confirm.
    """
    if self.nns_method_init_dict[nnsmethod]:
        return

    if nnsmethod == NNSMethod.KD_TREE:
        print("Building KD tree..")
        self.tree = cKDTree(self.model.docvecs.vectors_docs)
        print("Finished building KD tree.")
        self.keys = list(self.model.docvecs.doctags.keys())
        return

    if nnsmethod == NNSMethod.ANNOY:
        if os.path.isfile(annoymodelpath):
            self.annoy_indexer = AnnoyIndexer()
            self.annoy_indexer.load(annoymodelpath)
            self.annoy_indexer.model = self.model
        else:
            print("Generating annoy index...")
            self.annoy_indexer = AnnoyIndexer(self.model, 50)
            print("Finished generating annoy index.")
            self.annoy_indexer.save(annoymodelpath)
def train_item2vec(df=None, sessions=None, samples=None):
    """Train (or continue training) the item2vec model and index it.

    The training corpus is ``samples`` when given, otherwise a
    ``RoomsGenerator(df, sessions)``. If a saved model exists under
    ``st.BASE_MODEL`` its vocabulary is extended and training continues;
    otherwise a new ``Word2Vec`` model is trained with the hyper-parameters
    from ``st``. The model is then saved and a 100-tree Annoy index is
    built and saved next to it.

    Raises ``NotImplementedError`` when both ``df`` and ``samples`` are
    ``None``.
    """
    if df is None and samples is None:
        raise NotImplementedError(
            ">>> Must be specific no items. Can not set `df` and `samples` to None"  # noqa
        )

    gen_rooms = RoomsGenerator(df, sessions) if samples is None else samples

    start_ = time.time()
    model_i2v_path = os.path.join(st.BASE_MODEL,
                                  "{}.model".format(st.ITEM2VEC_KEY))

    if not os.path.exists(model_i2v_path):
        model = Word2Vec(
            gen_rooms,
            sg=st.SG,
            size=st.I2V_DIM,
            window=st.WINDOWS,
            min_count=st.MIN_COUNT,
            workers=st.WORKERS,
            iter=st.EPOCHS,
            sample=st.SAMPLE,
            negative=st.NS,
            compute_loss=st.COMPUTE_LOSS,
            callbacks=[Timer(start_)],
        )
    else:
        logging.info("Load pre-train model")
        model = Word2Vec.load(model_i2v_path)
        logging.info("Vocabulary before re-training: %d", len(model.wv.vocab))
        model.build_vocab(gen_rooms, update=True)
        logging.info("Vocabulary after re-training: %d", len(model.wv.vocab))
        model.train(gen_rooms,
                    total_examples=model.corpus_count,
                    epochs=model.iter,
                    callbacks=())
        logging.info("Pre-train model took %d's'", time.time() - start_)

    logging.info("Saving item2vec model")
    model.save(model_i2v_path)

    logging.info("Build annoy index for item2vec model")
    annoy_path = os.path.join(st.BASE_MODEL,
                              "{}.model".format(st.ANNOY_INDEX_KEY))
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(annoy_path)
def get_annoy(w2v, embedding_type='w2v'):
    """Return an Annoy index for ``w2v``, using an on-disk cache.

    The cache file lives in the module-level ``data_dir`` and its name
    encodes ``embedding_type`` and the vocabulary size of ``w2v``. When the
    file exists the index is loaded and ``w2v`` is attached as its model;
    otherwise the index is built from ``w2v`` and saved.

    The index is now built from (and attached to) the ``w2v`` argument.
    Previously the body used an unrelated ``word_vectors`` name, so the
    cached file could describe different vectors than the ones its file
    name was derived from.
    """
    # NOTE(review): ``dims`` is passed to AnnoyIndexer as its second
    # argument, which gensim treats as the number of trees, not the vector
    # dimensionality. Kept as-is so existing cache file names stay valid.
    dims = 100
    annoy_file_name = (data_dir + '/annoy_index_' + '_' + str(dims) + '_'
                       + embedding_type + '_' + str(len(w2v.vocab)))

    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
class WordCalc:
    """Word-vector calculator with an optional Annoy-accelerated mode.

    ``status`` tracks the life cycle:
        0 -- nothing trained/loaded yet
        1 -- gensim vectors are being loaded
        2 -- gensim vectors are usable
        3 -- Annoy index is being built
        4 -- Annoy index is usable
    """

    def __init__(self):
        self.status = 0
        push.push_to_rtx(push.generate_rtx_markdown("wordcalc出仓状态良好"))

    def train_with_gensim(self):
        """Load the Tencent embedding text file into ``self.tc_wv_model``."""
        self.status = 1
        push.push_to_rtx(push.generate_rtx_markdown("gensim转子引擎开始加热"))
        self.tc_wv_model = KeyedVectors.load_word2vec_format(
            './Tencent_AILab_ChineseEmbedding.txt', binary=False)
        push.push_to_rtx(push.generate_rtx_markdown("gensim转子引擎加热完毕"))
        self.status = 2

    def train_with_annoy(self):
        """Build a 200-tree Annoy index over the loaded vectors and save it."""
        self.status = 3
        push.push_to_rtx(push.generate_rtx_markdown("annoy向量空间开始注水"))
        self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)
        # Persist the built index so later runs can load it directly:
        #   annoy_index = AnnoyIndexer()
        #   annoy_index.load(fname)
        #   annoy_index.model = tc_wv_model
        fname = 'tc_index_genoy.index'
        self.annoy_index.save(fname)
        push.push_to_rtx(push.generate_rtx_markdown("annoy向量空间注水完毕"))
        self.status = 4

    def calc(self, positive_set, negative_set):
        """Return the top-10 words for the given positive/negative sets.

        Uses the Annoy index once it is ready (status 4), the plain gensim
        query while only the vectors are available (status 2 or 3), and
        returns an empty list in any other state.
        """
        if self.status == 4:
            return self.tc_wv_model.most_similar(positive=positive_set,
                                                 negative=negative_set,
                                                 indexer=self.annoy_index,
                                                 topn=10)
        if self.status in (2, 3):
            return self.tc_wv_model.most_similar(positive=positive_set,
                                                 negative=negative_set,
                                                 topn=10)
        return []
def testAnnoyIndexingOfKeyedVectors(self):
    """An index built directly from KeyedVectors behaves like a model index."""
    from gensim.similarities.index import AnnoyIndexer

    vectors_path = datapath('lee_fasttext.vec')
    keyed_vectors = KeyedVectors.load_word2vec_format(vectors_path)
    annoy_index = AnnoyIndexer(keyed_vectors, 10)

    self.assertEqual(annoy_index.num_trees, 10)
    self.assertVectorIsSimilarToItself(keyed_vectors, annoy_index)
    self.assertApproxNeighborsMatchExact(
        keyed_vectors, keyed_vectors, annoy_index)
def index_vector(self, dimensions=300, save=False):
    '''
    Return the Annoy index used by 'is_word_pairs_similar'.

    The index is loaded from ``../preprocessed/annoy.index`` (relative to
    the current working directory) when that file exists, and
    ``self.embedding`` is attached as its model. Otherwise it is built from
    ``self.embedding`` and, when ``save`` is true, written to that path.

    Queries through annoy_index may be slower than through the normal index.

    NOTE(review): ``dimensions`` is passed as AnnoyIndexer's second
    argument, which gensim treats as the number of trees -- confirm intent.
    '''
    index_file = Path.cwd().parent.joinpath('preprocessed/annoy.index')

    if not index_file.exists():
        annoy_index = AnnoyIndexer(self.embedding, dimensions)
        if save:
            annoy_index.save(str(index_file))
        return annoy_index

    annoy_index = AnnoyIndexer()
    annoy_index.load(str(index_file))
    annoy_index.model = self.embedding
    return annoy_index
def main():
    """Train FastText and Word2Vec embeddings as described by a config file.

    The ``[word]`` section supplies the output directory and the training
    hyper-parameters; ``[general] corpus_file`` supplies the corpus. The
    output directory is wiped and recreated, then the FastText model, the
    Word2Vec model and a 100-tree Annoy index of the Word2Vec model are
    saved into it.
    """
    parser = argparse.ArgumentParser(description='Trains word embeddings')
    parser.add_argument('--config_file',
                        type=str,
                        default='configs/echoes_local.config',
                        help='location of the configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)
    word_cfg = config['word']
    model_dir = word_cfg['model_dir']
    print(model_dir)

    sentences = Sentences(input_file=config['general']['corpus_file'])

    # Start from an empty output directory.
    try:
        shutil.rmtree(model_dir)
    except FileNotFoundError:
        pass
    os.mkdir(model_dir)

    def training_params():
        # Both models are trained with the same settings from the config.
        return dict(size=int(word_cfg['size']),
                    window=int(word_cfg['window']),
                    min_count=int(word_cfg['min_count']),
                    iter=int(word_cfg['epochs']),
                    workers=int(word_cfg['workers']))

    logging.info('Building fasttext model...')
    ft_model = FastText(sentences, **training_params())
    ft_model.init_sims()
    ft_model.save(f"{model_dir}/ft_model")
    logging.info(f"Saved fasttext model under {model_dir}")

    logging.info('Building word2vec model...')
    w2v_model = Word2Vec(sentences, **training_params())
    w2v_model.init_sims()
    annoy_index = AnnoyIndexer(w2v_model, 100)
    annoy_index.save(f"{model_dir}/annoy_model")
    w2v_model.save(f"{model_dir}/w2v_model")
    logging.info(f"Saved word2vec model under {model_dir}")
class WordNeighbors:
    """Nearest-neighbour lookup backed by Word2Vec+Annoy with a FastText fallback."""

    def __init__(self, model_dir):
        """Load ``ft_model``, ``w2v_model`` and ``annoy_model`` from ``model_dir``."""
        def in_dir(name):
            return os.path.join(model_dir, name)

        self.ft_model = FastText.load(in_dir('ft_model'))
        self.w2v_model = Word2Vec.load(in_dir('w2v_model'))
        self.annoy_index = AnnoyIndexer()
        self.annoy_index.load(in_dir('annoy_model'))

    def query(self, w, topn):
        """Return the ``topn`` neighbours of ``w``.

        Words known to the Word2Vec model are answered through the Annoy
        index. Other words are passed to the FastText model, and an empty
        list is returned if that lookup raises ``KeyError``.
        """
        if w not in self.w2v_model:
            try:
                return self.ft_model.most_similar(w, topn=topn)
            except KeyError:
                return []

        vector = self.w2v_model[w]
        return self.w2v_model.most_similar([vector],
                                           topn=topn,
                                           indexer=self.annoy_index)
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Checks the Annoy indexer against a small Doc2Vec model."""

    def setUp(self):
        """Train the model and build a 300-tree index; skip without annoy."""
        try:
            import annoy  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbour of document 0's vector is document 0."""
        nearest = self.index.most_similar(self.vector, 1)
        doc, similarity = nearest[0]

        self.assertEqual(doc, 0)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same documents."""
        docvecs = self.model.docvecs
        via_index = docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact = docvecs.most_similar(positive=[self.vector], topn=5)

        approx_words = [hit[0] for hit in via_index]
        exact_words = [hit[0] for hit in exact]
        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving creates both the index file and its ``.d`` companion."""
        fname = testfile()
        self.index.save(fname)

        for path in (fname, fname + '.d'):
            self.assertTrue(os.path.exists(path))

    def testLoadNotExist(self):
        """Loading a missing file raises ``IOError``."""
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        with self.assertRaises(IOError):
            self.test_index.load(fname='test-index')

    def testSaveLoad(self):
        """A saved and reloaded index matches the original."""
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        saved, restored = self.index, self.index2
        self.assertEqual(saved.index.f, restored.index.f)
        self.assertEqual(saved.labels, restored.labels)
        self.assertEqual(saved.num_trees, restored.num_trees)
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Checks the Annoy indexer against a small Word2Vec model.

    Every test gets its own temporary directory for index files, which is
    removed afterwards. Previously the tests wrote ``index`` and ``index.d``
    into the current working directory and left them there, and
    ``testLoadNotExist`` relied on no ``test-index`` file existing in it.
    """

    def setUp(self):
        """Train the model and build a 10-tree index; skip without annoy."""
        try:
            import annoy  # noqa:F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        import shutil
        import tempfile

        from gensim.similarities.index import AnnoyIndexer

        self.tmpdir = tempfile.mkdtemp()
        # Best effort: a loaded index may still keep its file open on some
        # platforms, so cleanup errors must not fail the test.
        self.addCleanup(shutil.rmtree, self.tmpdir, ignore_errors=True)
        self.index_fname = os.path.join(self.tmpdir, 'index')

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.wv.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        """The nearest neighbour of word 0's vector is word 0 itself."""
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same words."""
        approx_neighbors = self.model.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]
        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving creates both the index file and its ``.d`` companion."""
        self.index.save(self.index_fname)
        self.assertTrue(os.path.exists(self.index_fname))
        self.assertTrue(os.path.exists(self.index_fname + '.d'))

    def testLoadNotExist(self):
        """Loading a missing file raises ``IOError``."""
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        # The temp directory is freshly created, so this path cannot exist.
        missing = os.path.join(self.tmpdir, 'test-index')
        self.assertRaises(IOError, self.test_index.load, fname=missing)

    def testSaveLoad(self):
        """A saved and reloaded index matches the original."""
        from gensim.similarities.index import AnnoyIndexer

        self.index.save(self.index_fname)
        self.index2 = AnnoyIndexer()
        self.index2.load(self.index_fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
class TestDoc2VecAnnoyIndexer(unittest.TestCase):
    """Checks the Annoy indexer against a small Doc2Vec model."""

    def setUp(self):
        """Train the model and build a 300-tree index; skip without annoy."""
        try:
            import annoy  # noqa:F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer

        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        """The nearest neighbour of document 0's vector is document 0."""
        nearest = self.index.most_similar(self.vector, 1)
        doc, similarity = nearest[0]

        self.assertEqual(doc, 0)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same documents."""
        docvecs = self.model.docvecs
        via_index = docvecs.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact = docvecs.most_similar(positive=[self.vector], topn=5)

        approx_words = [hit[0] for hit in via_index]
        exact_words = [hit[0] for hit in exact]
        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving creates both the index file and its ``.d`` companion."""
        fname = testfile()
        self.index.save(fname)

        for path in (fname, fname + '.d'):
            self.assertTrue(os.path.exists(path))

    def testLoadNotExist(self):
        """Loading a missing file raises ``IOError``."""
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        with self.assertRaises(IOError):
            self.test_index.load(fname='test-index')

    def testSaveLoad(self):
        """A saved and reloaded index matches the original."""
        from gensim.similarities.index import AnnoyIndexer

        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        saved, restored = self.index, self.index2
        self.assertEqual(saved.index.f, restored.index.f)
        self.assertEqual(saved.labels, restored.labels)
        self.assertEqual(saved.num_trees, restored.num_trees)
def build_index():
    """Build and save a 100-tree Annoy index for every vector file.

    Walks each root in the module-level ``vecs``. For every file found, the
    target path is the source path with ``'vec_new'`` replaced by ``'ind'``,
    cut at its first ``'.'`` and suffixed with ``'.ind'``. The target
    directory is created when missing.

    NOTE(review): cutting at the first ``'.'`` of the whole path also
    truncates paths whose directories contain a dot (e.g. ``./data``) --
    confirm that ``vecs`` never holds such paths.
    """
    for vec in vecs:
        # os.walk yields: 1. the parent directory, 2. the names of its
        # sub-directories (without path), 3. the names of its files.
        for parent, dirnames, filenames in os.walk(vec):
            target_dir = parent.replace('vec_new', 'ind')
            for filename in filenames:
                origin_path = os.path.join(parent, filename)
                stem = origin_path.replace('vec_new', 'ind').split('.')[0]
                target_path = stem + '.ind'

                # Log where the index is read from and written to.
                logger.info('origin path:' + origin_path)
                logger.info('target path:' + target_path)

                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)

                model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
                    origin_path, binary=False)
                AnnoyIndexer(model, 100).save(target_path)
class Predictor:
    """Word-association helper backed by a Word2Vec model and an Annoy index."""

    def __init__(self):
        """Load the model and its index from the ``data/`` directory."""
        self.model = Word2Vec.load('data/word2vec.model')
        self.vocab = self.model.wv.vocab

        indexer = AnnoyIndexer()
        indexer.load('data/word2vec_idx.ann')
        indexer.model = self.model
        self.annoy_index = indexer

    def explain(self, word, n_words):
        """Return ``n_words`` neighbours of the stemmed ``word``.

        One extra neighbour is requested and the first hit is dropped. The
        stemmed neighbours are also printed. Returns the string
        ``'Wrong word'`` when the lookup raises ``KeyError``.
        """
        try:
            hits = self.model.wv.most_similar(
                positive=[lemmatize_stemming(word)],
                topn=n_words + 1,
                indexer=self.annoy_index)
            others = hits[1:]
            print([lemmatize_stemming(hit[0]) for hit in others])
            return [hit[0] for hit in others]
        except KeyError:
            return 'Wrong word'

    def guess(self, words, n_words):
        """Return ``n_words`` neighbours of the in-vocabulary stems of ``words``.

        With exactly one input word, one extra neighbour is requested and
        the first hit is dropped; otherwise the hits are returned as they
        come. Returns the string ``'Wrong word'`` when the lookup raises
        ``ValueError``.
        """
        single = len(words) == 1
        try:
            known_stems = [
                lemmatize_stemming(word) for word in words
                if lemmatize_stemming(word) in self.vocab
            ]
            hits = self.model.wv.most_similar(
                positive=known_stems,
                topn=n_words + 1 if single else n_words,
                indexer=self.annoy_index)
            if single:
                hits = hits[1:]
            return [hit[0] for hit in hits]
        except ValueError:
            return 'Wrong word'
class TestWord2VecAnnoyIndexer(unittest.TestCase):
    """Checks the Annoy indexer against a small Word2Vec model.

    Every test gets its own temporary directory for index files, which is
    removed afterwards. Previously the tests wrote ``index`` and ``index.d``
    into the current working directory and left them there, and
    ``testLoadNotExist`` relied on no ``test-index`` file existing in it.
    """

    def setUp(self):
        """Train the model and build a 10-tree index; skip without annoy."""
        try:
            import annoy  # noqa:F401 -- imported only to probe availability
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        import shutil
        import tempfile

        from gensim.similarities.index import AnnoyIndexer

        self.tmpdir = tempfile.mkdtemp()
        # Best effort: a loaded index may still keep its file open on some
        # platforms, so cleanup errors must not fail the test.
        self.addCleanup(shutil.rmtree, self.tmpdir, ignore_errors=True)
        self.index_fname = os.path.join(self.tmpdir, 'index')

        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        """The nearest neighbour of word 0's vector is word 0 itself."""
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]

        self.assertEqual(word, label)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        """Indexed and exact top-5 queries return the same words."""
        approx_neighbors = self.model.most_similar(
            [self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(
            positive=[self.vector], topn=5)

        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]
        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        """Saving creates both the index file and its ``.d`` companion."""
        self.index.save(self.index_fname)
        self.assertTrue(os.path.exists(self.index_fname))
        self.assertTrue(os.path.exists(self.index_fname + '.d'))

    def testLoadNotExist(self):
        """Loading a missing file raises ``IOError``."""
        from gensim.similarities.index import AnnoyIndexer

        self.test_index = AnnoyIndexer()
        # The temp directory is freshly created, so this path cannot exist.
        missing = os.path.join(self.tmpdir, 'test-index')
        self.assertRaises(IOError, self.test_index.load, fname=missing)

    def testSaveLoad(self):
        """A saved and reloaded index matches the original."""
        from gensim.similarities.index import AnnoyIndexer

        self.index.save(self.index_fname)
        self.index2 = AnnoyIndexer()
        self.index2.load(self.index_fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)