import os
import time

from gensim.similarities.index import AnnoyIndexer


def load_annoy(annoypath, model):
    '''
    :param annoypath: path of the saved Annoy index
    :type annoypath: str
    :param model:
    :type model: Word2Vec
    :return:
    :rtype: AnnoyIndexer
    '''
    if not os.path.exists(annoypath):
        print("Building annoy index, current time: " + time.asctime(time.localtime(time.time())))
        starttime12 = time.time()
        aindex = AnnoyIndexer(model, 200)
        print("Index built in %.2f secs" % (time.time() - starttime12))
        # save the annoy index
        print("Saving annoy index")
        starttime13 = time.time()
        aindex.save(annoypath)
        print("Index saved in %.2f secs" % (time.time() - starttime13))
    else:
        aindex = AnnoyIndexer()
        aindex.load(annoypath)
        aindex.model = model  # re-attach the model after loading, per the gensim tutorial pattern
    return aindex
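# A minimal, hedged usage sketch for load_annoy. The toy corpus, the model,
# and the path 'w2v.annoy' are illustrative stand-ins, not from the original code:

from gensim.models import Word2Vec

sentences = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
model = Word2Vec(sentences, min_count=1, size=50)
model.init_sims()
aindex = load_annoy('w2v.annoy', model)
# query through the Annoy indexer instead of brute-force similarity
print(model.wv.most_similar('computer', topn=2, indexer=aindex))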
class TestDoc2VecAnnoyIndexer(unittest.TestCase):

    def setUp(self):
        try:
            import annoy  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer
        self.model = doc2vec.Doc2Vec(sentences, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 300)
        self.vector = self.model.docvecs.doctag_syn0norm[0]

    def testDocumentIsSimilarToItself(self):
        approx_neighbors = self.index.most_similar(self.vector, 1)
        doc, similarity = approx_neighbors[0]
        self.assertEqual(doc, 0)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        approx_neighbors = self.model.docvecs.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.docvecs.most_similar(positive=[self.vector], topn=5)
        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]
        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        fname = testfile()
        self.index.save(fname)
        self.assertTrue(os.path.exists(fname))
        self.assertTrue(os.path.exists(fname + '.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer
        fname = testfile()
        self.index.save(fname)

        self.index2 = AnnoyIndexer()
        self.index2.load(fname)
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
class TestWord2VecAnnoyIndexer(unittest.TestCase):

    def setUp(self):
        try:
            import annoy  # noqa:F401
        except ImportError:
            raise unittest.SkipTest("Annoy library is not available")

        from gensim.similarities.index import AnnoyIndexer
        self.model = word2vec.Word2Vec(texts, min_count=1)
        self.model.init_sims()
        self.index = AnnoyIndexer(self.model, 10)
        self.vector = self.model.wv.syn0norm[0]

    def testVectorIsSimilarToItself(self):
        label = self.model.index2word[0]
        approx_neighbors = self.index.most_similar(self.vector, 1)
        word, similarity = approx_neighbors[0]
        self.assertEqual(word, label)
        self.assertEqual(similarity, 1.0)

    def testApproxNeighborsMatchExact(self):
        approx_neighbors = self.model.most_similar([self.vector], topn=5, indexer=self.index)
        exact_neighbors = self.model.most_similar(positive=[self.vector], topn=5)
        approx_words = [neighbor[0] for neighbor in approx_neighbors]
        exact_words = [neighbor[0] for neighbor in exact_neighbors]
        self.assertEqual(approx_words, exact_words)

    def testSave(self):
        self.index.save('index')
        self.assertTrue(os.path.exists('index'))
        self.assertTrue(os.path.exists('index.d'))

    def testLoadNotExist(self):
        from gensim.similarities.index import AnnoyIndexer
        self.test_index = AnnoyIndexer()
        self.assertRaises(IOError, self.test_index.load, fname='test-index')

    def testSaveLoad(self):
        from gensim.similarities.index import AnnoyIndexer
        self.index.save('index')

        self.index2 = AnnoyIndexer()
        self.index2.load('index')
        self.index2.model = self.model

        self.assertEqual(self.index.index.f, self.index2.index.f)
        self.assertEqual(self.index.labels, self.index2.labels)
        self.assertEqual(self.index.num_trees, self.index2.num_trees)
def get_annoy(w2v, embedding_type='w2v'):
    dims = 100
    annoy_file_name = data_dir + '/annoy_index_' + '_' + str(dims) + '_' + embedding_type + '_' + str(len(w2v.vocab))
    if os.path.exists(annoy_file_name):
        logging.info("Loading Annoy from file: %s", annoy_file_name)
        annoy_index = AnnoyIndexer()
        annoy_index.load(annoy_file_name)
        annoy_index.model = w2v
    else:
        logging.info("Creating Annoy")
        annoy_index = AnnoyIndexer(w2v, dims)
        annoy_index.save(annoy_file_name)
        logging.info("Annoy indexing saved to %s", annoy_file_name)
    return annoy_index
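# A short usage sketch for get_annoy, assuming `data_dir` (a module-level name
# in the original snippet) points at a writable directory; the vectors file
# name 'vectors.txt' is hypothetical:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('vectors.txt')
indexer = get_annoy(kv)
# subsequent calls reuse the index saved on disk instead of rebuilding it
print(kv.most_similar('science', topn=10, indexer=indexer))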
def index_vector(self, dimensions=300, save=False):
    '''
    Build the annoy_index used by the 'is_word_pairs_similar' function.
    Queries through annoy_index may be slower than with the normal index.
    '''
    path = Path.cwd().parent.joinpath('preprocessed/annoy.index')
    if path.exists():
        annoy_index = AnnoyIndexer()
        annoy_index.load(str(path))
        annoy_index.model = self.embedding
    else:
        annoy_index = AnnoyIndexer(self.embedding, dimensions)
        if save:
            annoy_index.save(str(path))
    return annoy_index
def train_item2vec(df=None, sessions=None, samples=None):
    if df is None and samples is None:
        raise NotImplementedError(
            ">>> No items to train on: `df` and `samples` cannot both be None"  # noqa
        )
    if samples is None:
        gen_rooms = RoomsGenerator(df, sessions)
    else:
        gen_rooms = samples

    start_ = time.time()
    model_i2v_path = os.path.join(st.BASE_MODEL, "{}.model".format(st.ITEM2VEC_KEY))
    if os.path.exists(model_i2v_path):
        logging.info("Load pre-trained model")
        model = Word2Vec.load(model_i2v_path)
        logging.info("Vocabulary before re-training: %d", len(model.wv.vocab))
        model.build_vocab(gen_rooms, update=True)
        logging.info("Vocabulary after re-training: %d", len(model.wv.vocab))
        model.train(gen_rooms, total_examples=model.corpus_count,
                    epochs=model.iter, callbacks=())
        logging.info("Re-training the model took %ds", time.time() - start_)
    else:
        model = Word2Vec(gen_rooms, sg=st.SG, size=st.I2V_DIM,
                         window=st.WINDOWS, min_count=st.MIN_COUNT,
                         workers=st.WORKERS, iter=st.EPOCHS,
                         sample=st.SAMPLE, negative=st.NS,
                         compute_loss=st.COMPUTE_LOSS,
                         callbacks=[Timer(start_)])

    logging.info("Saving item2vec model")
    model.save(model_i2v_path)

    logging.info("Build annoy index for item2vec model")
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(
        os.path.join(st.BASE_MODEL, "{}.model".format(st.ANNOY_INDEX_KEY)))
class WordCalc:

    def __init__(self):
        # 0: not trained
        # 1: training the gensim version
        # 2: gensim version ready
        # 3: training the annoy version
        # 4: annoy version ready
        self.status = 0
        push.push_to_rtx(push.generate_rtx_markdown("wordcalc launched and in good shape"))

    def train_with_gensim(self):
        self.status = 1
        push.push_to_rtx(push.generate_rtx_markdown("gensim engine warming up"))
        self.tc_wv_model = KeyedVectors.load_word2vec_format(
            './Tencent_AILab_ChineseEmbedding.txt', binary=False)
        push.push_to_rtx(push.generate_rtx_markdown("gensim engine warmed up"))
        self.status = 2

    def train_with_annoy(self):
        self.status = 3
        push.push_to_rtx(push.generate_rtx_markdown("start filling the annoy vector space"))
        self.annoy_index = AnnoyIndexer(self.tc_wv_model, 200)
        fname = 'tc_index_genoy.index'
        self.annoy_index.save(fname)
        # Export the trained index; later it can be loaded directly:
        # annoy_index = AnnoyIndexer()
        # annoy_index.load(fname)
        # annoy_index.model = tc_wv_model
        push.push_to_rtx(push.generate_rtx_markdown("annoy vector space filled"))
        self.status = 4

    def calc(self, positive_set, negative_set):
        if self.status == 2 or self.status == 3:
            result = self.tc_wv_model.most_similar(positive=positive_set,
                                                   negative=negative_set,
                                                   topn=10)
            return result
        elif self.status == 4:
            result = self.tc_wv_model.most_similar(positive=positive_set,
                                                   negative=negative_set,
                                                   indexer=self.annoy_index,
                                                   topn=10)
            return result
        else:
            return []
def main():
    parser = argparse.ArgumentParser(description='Trains word embeddings')
    parser.add_argument('--config_file', type=str,
                        default='configs/echoes_local.config',
                        help='location of the configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)
    print(config['word']['model_dir'])

    sentences = Sentences(input_file=config['general']['corpus_file'])

    try:
        shutil.rmtree(config['word']['model_dir'])
    except FileNotFoundError:
        pass
    os.mkdir(config['word']['model_dir'])

    logging.info('Building fasttext model...')
    model = FastText(sentences,
                     size=int(config['word']['size']),
                     window=int(config['word']['window']),
                     min_count=int(config['word']['min_count']),
                     iter=int(config['word']['epochs']),
                     workers=int(config['word']['workers']))
    model.init_sims()
    model.save(f"{config['word']['model_dir']}/ft_model")
    logging.info(f"Saved fasttext model under {config['word']['model_dir']}")

    logging.info('Building word2vec model...')
    model = Word2Vec(sentences,
                     size=int(config['word']['size']),
                     window=int(config['word']['window']),
                     min_count=int(config['word']['min_count']),
                     iter=int(config['word']['epochs']),
                     workers=int(config['word']['workers']))
    model.init_sims()
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(f"{config['word']['model_dir']}/annoy_model")
    model.save(f"{config['word']['model_dir']}/w2v_model")
    logging.info(f"Saved word2vec model under {config['word']['model_dir']}")
def build_index():
    # for v, i in zip(vecs, indx):
    #     model = gensim.models.word2vec.Word2Vec.load_word2vec_format(v, binary=False)
    #     index = AnnoyIndexer(model, 100)
    #     index.save(i)
    for vec in vecs:
        # os.walk yields three values per directory: 1. the parent directory,
        # 2. all sub-directory names (without paths), 3. all file names
        for parent, dirnames, filenames in os.walk(vec):
            for filename in filenames:
                # log the file locations
                origin_path = os.path.join(parent, filename)
                target_path = os.path.join(parent, filename).replace(
                    'vec_new', 'ind').split('.')[0] + '.ind'
                logger.info('origin path:' + origin_path)
                logger.info('target path:' + target_path)
                if not os.path.exists(parent.replace('vec_new', 'ind')):
                    os.makedirs(parent.replace('vec_new', 'ind'))
                model = gensim.models.word2vec.Word2Vec.load_word2vec_format(
                    origin_path, binary=False)
                index = AnnoyIndexer(model, 100)
                index.save(target_path)
def train():
    documents = []
    with open('/home/ycw/tax_data.csv', 'r') as f:
        reader = csv.reader(f, dialect='excel', delimiter=',')
        for line in reader:
            print(line)
            word_list = transform_text(line[1].strip(), strip=False)
            # word_list = eval(line[2])
            documents.append(doc2vec.LabeledSentence(word_list, [line[0]]))
    model = Doc2Vec(documents, dm=1, size=DIMENSION, window=5,
                    negative=5, min_count=2, workers=4)
    model.save('../models/doc2vec.model')
    indexer = AnnoyIndexer(model, 2)
    # _, tem_fn = mkstemp()
    indexer.save('../models/dv_index')
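# A hedged sketch of loading the artifacts train() writes back for querying;
# the paths come from the snippet above. Note only 2 Annoy trees are built
# there, so approximate neighbours can be quite rough:

from gensim.models.doc2vec import Doc2Vec
from gensim.similarities.index import AnnoyIndexer

d2v = Doc2Vec.load('../models/doc2vec.model')
indexer = AnnoyIndexer()
indexer.load('../models/dv_index')
indexer.model = d2v  # must be re-attached after loading
print(d2v.docvecs.most_similar([d2v.docvecs[0]], topn=5, indexer=indexer))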
def build_annoy(w2v):
    info('building index')
    annoy_index = AnnoyIndexer(w2v, 500)
    info('saving index')
    annoy_index.save(annoy_file)
# for more details.
#
###############################################################################
# 5. Persisting indices to disk
# -----------------------------
#
# You can save and load your indexes from/to disk to prevent having to
# construct them each time. This will create two files on disk, *fname* and
# *fname.d*. Both files are needed to correctly restore all attributes. Before
# loading an index, you will have to create an empty AnnoyIndexer object.
#
fname = '/tmp/mymodel.index'

# Persist index to disk
annoy_index.save(fname)

# Load index back
import os.path
if os.path.exists(fname):
    annoy_index2 = AnnoyIndexer()
    annoy_index2.load(fname)
    annoy_index2.model = model

# Results should be identical to above
vector = model.wv["science"]
approximate_neighbors2 = model.wv.most_similar([vector], topn=11, indexer=annoy_index2)
for neighbor in approximate_neighbors2:
    print(neighbor)
#!/usr/bin/env python3

import sys

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.similarities.index import AnnoyIndexer

if len(sys.argv) != 3:
    sys.stderr.write("local/prepare_words_embedding.py <src-mdl> <dest-mdl>\n")
    sys.exit(1)

srcmdl = sys.argv[1]
dstmdl = sys.argv[2]

model = KeyedVectors.load_word2vec_format(srcmdl)
annoy_index = AnnoyIndexer(model, 200)
annoy_index.save(dstmdl)
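# A minimal sketch of consuming the index the script above writes. The paths
# 'words.vec' and 'words.index' are hypothetical stand-ins for the <src-mdl>
# and <dest-mdl> arguments:

from gensim.models import KeyedVectors
from gensim.similarities.index import AnnoyIndexer

model = KeyedVectors.load_word2vec_format('words.vec')
annoy_index = AnnoyIndexer()
annoy_index.load('words.index')
annoy_index.model = model  # re-attach the vectors after loading
print(model.most_similar('hello', topn=10, indexer=annoy_index))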
# from gensim.models.keyedvectors import KeyedVectors
import gensim

model = gensim.models.Word2Vec.load(file)
# model = gensim.models.KeyedVectors.load_word2vec_format(file, binary=True)
model.vector_size = 1000

from gensim.similarities.index import AnnoyIndexer

# 100 trees are being used in this example
annoy_index = AnnoyIndexer(model, 100)

fname = 'index.ann'

# Persist index to disk
annoy_index.save(fname)

# Load index back
# if os.path.exists(fname):
#     annoy_index2 = AnnoyIndexer()
#     annoy_index2.load(fname)
#     annoy_index2.model = model

# Derive the vector for the word "science" in our model
# vector = model["science"]

# The instance of AnnoyIndexer we just created is passed
# approximate_neighbors = model.most_similar([vector], topn=5, indexer=annoy_index)

# Neatly print the approximate_neighbors and their corresponding cosine similarity values
# for neighbor in approximate_neighbors:
#     print(neighbor)
    return ans


if __name__ == '__main__':
    data = api.load("20-newsgroups", return_path=False)

    start = time()
    all_sentences = []
    for text in data:
        all_sentences.extend(text2sentences(text['data']))
    print('Text to sentences step completed in {}s'.format(time() - start))

    start = time()
    preprocessed_sentences = []
    for sentence in all_sentences:
        preprocessed_sentences.append(preprocess(sentence))
    print('Sentences to tokens step completed in {}s'.format(time() - start))

    print('Learning...')
    start = time()
    epoch_num = 60
    w2v_model = Word2Vec(min_count=20,
                         window=5,
                         size=100,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=cpu_count())
    w2v_model.build_vocab(preprocessed_sentences, progress_per=1)
    w2v_model.train(preprocessed_sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=epoch_num,
                    report_delay=1.0)
    print('Learning completed in {}s'.format(time() - start))

    w2v_model.init_sims(replace=True)
    w2v_model.save('data/word2vec.model')

    start = time()
    print('Annoy indexing...')
    ann_model = AnnoyIndexer(w2v_model, 1000)
    ann_model.save('data/word2vec_idx.ann')
    print('Annoy indexing completed in {}s'.format(time() - start))
class Doc2VecModel:
    BASE_WIKI_QUERY = "https://en.wikipedia.org/w/api.php?action=query&format=json&pageids="
    stopword_list = stopwords.words('english')

    def __init__(self, modelname):
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        self.modelname = modelname
        if self.modelname is None:
            train_corpus = self._get_training_iterator()
            self.model = Doc2Vec(vector_size=100, min_count=5, workers=7)
            self.model.build_vocab(train_corpus, progress_per=10000)
        else:
            self.model = Doc2Vec.load(self.modelname)
        self.nns_method_init_dict = {
            NNSMethod.BRUTE: True,
            NNSMethod.KD_TREE: False,
            NNSMethod.ANNOY: False
        }

    def infer_file(self, filename, n=10):
        with open(filename, 'r') as f:
            lines = f.readlines()
        lines = ' '.join(lines)
        return self.infer(lines, n)

    def infer(self, string, n=10, nnsmethod=NNSMethod.ANNOY, annoymodelpath="gensim_annoy"):
        self._initialize_nns_method(nnsmethod, annoymodelpath)
        words = self._preprocess(string)
        # Set the random seed to make the inferred vector deterministic
        self.model.random = np.random.mtrand.RandomState(1337)
        inferred_vector = self.model.infer_vector(words)
        ids, dists = self._calculate_most_similar(inferred_vector, n, nnsmethod)
        titles = self._get_title_from_pageids(ids)
        return titles, dists

    def train(self, epochs):
        train_corpus = self._get_training_iterator()
        self.model.train(train_corpus, total_examples=self.model.corpus_count,
                         epochs=epochs, report_delay=10)
        self.model.save(self.modelname)

    def _initialize_nns_method(self, nnsmethod, annoymodelpath):
        if self.nns_method_init_dict[nnsmethod]:
            return
        if nnsmethod == NNSMethod.KD_TREE:
            print("Building KD tree..")
            self.tree = cKDTree(self.model.docvecs.vectors_docs)
            print("Finished building KD tree.")
            self.keys = list(self.model.docvecs.doctags.keys())
        elif nnsmethod == NNSMethod.ANNOY:
            if not os.path.isfile(annoymodelpath):
                print("Generating annoy index...")
                self.annoy_indexer = AnnoyIndexer(self.model, 50)
                print("Finished generating annoy index.")
                self.annoy_indexer.save(annoymodelpath)
            else:
                self.annoy_indexer = AnnoyIndexer()
                self.annoy_indexer.load(annoymodelpath)
                self.annoy_indexer.model = self.model

    def _calculate_most_similar(self, vector, n, nnsmethod):
        start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
        if nnsmethod == NNSMethod.BRUTE:
            tops = self.model.docvecs.most_similar([vector], topn=n)
            dists, indicies = [t[0] for t in tops], [t[1] for t in tops]
        if nnsmethod == NNSMethod.KD_TREE:
            dists, indicies = self.tree.query(vector, k=n)
            indicies = [self.keys[i] for i in indicies]
        if nnsmethod == NNSMethod.ANNOY:
            tops = self.model.docvecs.most_similar([vector], topn=n, indexer=self.annoy_indexer)
            dists, indicies = [t[0] for t in tops], [t[1] for t in tops]
        print(f"Time using {nnsmethod} - {time.perf_counter() - start_time}")
        return dists, indicies

    def _preprocess(self, string):
        string = string.lower()
        string = re.sub(r'[^a-z\s]+', '', string)
        words = nltk.word_tokenize(string)
        return [word for word in words if word not in self.stopword_list]

    def _get_training_iterator(self):
        home = os.path.expanduser("~")
        # Data is assumed to be in ~/Documents/text
        path = os.path.join(home, "Documents", "text")
        files = glob.glob(os.path.join(path, "**/wiki_*"), recursive=True)
        return TaggedDocumentGenerator(files)

    def _get_title_from_pageids(self, ids):
        ids = '|'.join(ids)
        query = self.BASE_WIKI_QUERY + ids
        response = urlopen(query)
        dic = json.loads(response.read())
        return [v['title'] if 'title' in v else "PageId: " + str(v['pageid'])
                for v in dic['query']['pages'].values()]
        f.write(line[0][0] + ' ' + ' '.join([str(vec) for vec in line[1]]) + '\n')

with codecs.open('./relation2vec.txt', 'a+', encoding='utf-8') as f:
    f.write(str(relation2vec.shape[0]) + ' ' + str(relation2vec.shape[1]) + '\n')
    for line in zip(relation2id.items(), relation2vec):
        f.write(line[0][0] + ' ' + ' '.join([str(vec) for vec in line[1]]) + '\n')

# word2vec bin
wv_ent = KeyedVectors.load_word2vec_format('./entity2vec.txt', binary=False)
wv_ent.save_word2vec_format('./entity2vec.bin', binary=True)

# annoy index
wv_ent = KeyedVectors.load_word2vec_format('./entity2vec.bin', binary=True)
annoy_index_ent = AnnoyIndexer(wv_ent, 200)
annoy_index_ent.save('./entity2vec.index')

# rel
wv_rel = KeyedVectors.load_word2vec_format('./relation2vec.txt', binary=False)
wv_rel.save_word2vec_format('./relation2vec.bin', binary=True)
wv_rel = KeyedVectors.load_word2vec_format('./relation2vec.bin', binary=True)
annoy_index_rel = AnnoyIndexer(wv_rel, 200)
annoy_index_rel.save('./relation2vec.index')

# tsne-plot
def tsne_vis(X, labels, name):
    tsne = TSNE(n_components=2).fit_transform(X)
    plt.figure(figsize=(50, 50))
    for i, label in enumerate(labels):
        x, y = tsne[i, :]
        plt.scatter(x, y)
# log.info("Done") # log.info("Saving as binary...") # word_model.save("word2vec-gnews-300.bin") # log.info("Done") # log.info("Loading binary model") # word_model = KeyedVectors.load( # r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440\MajorProject\word2vec-gnews-300.bin") # log.info("Done!") # for filename in os.listdir(r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440\MajorProject\models"): # if filename[-4:] == ".bin" and filename != "word2vec-gnews-300.bin": # log.info("Loading {0}".format(filename)) # wm = KeyedVectors.load(r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3" + # r"\CS39440\MajorProject\models\{0}".format(filename)) # log.info("Loaded, preprocessing L2 norms...") # wm.init_sims(replace=True) # log.info("Preprocessed, saving") # wm.save(filename) for filename in os.listdir(r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3\CS39440\MajorProject\models"): if filename[-4:] == ".bin" and filename != "word2vec-gnews-300.bin": log.info("Loading {0}".format(filename)) wm = KeyedVectors.load(r"C:\Users\benja\OneDrive\Documents\UniWork\Aberystwyth\Year3" + r"\CS39440\MajorProject\models\{0}".format(filename)) log.info("Building annoy indexer") indexer = AnnoyIndexer(wm, 5) log.info("Saving") indexer.save(r"models\{0}-5-trees.ann".format(filename.split(".")[0])) log.info("All Done")
    epochs=args.epochs,
    hs=args.hs,
    negative=args.negative,
    ns_exponent=args.ns_exponent,
    dbow_words=args.dbow_words,
    workers=multiprocessing.cpu_count()
)
end_time = time.perf_counter()
logging.info('It took {} s to train the doc2vec model.'.format(end_time - start_time))

logging.info('Saving model ...')
model.save('./models/doc2vec/d2v_{}_d{}_win{}_mc{}_hs{}.bin'.format(
    'dbow' if args.dm == 0 else 'dm',
    args.vector_dim, args.window, args.min_count, args.hs
))

logging.info('Creating Annoy index ...')
index = AnnoyIndexer(model, 300)

logging.info('Saving index ...')
index.save('./models/doc2vec/d2v_{}_d{}_win{}_mc{}_hs{}.idx'.format(
    'dbow' if args.dm == 0 else 'dm',
    args.vector_dim, args.window, args.min_count, args.hs
))