import json

from gensim.models import Doc2Vec

# Hyperparameters read from the argparse namespace; istex, wiki and
# max_nb_wiki_paragraphs are assumed to be parsed earlier in the script.
paragraphs_per_article = args.paragraphs_per_article
ucbl = args.ucbl_file
output = args.output_file
v_size = args.vec_size
window_size = args.window
min_count = args.min_count
workers = args.workers
n_iter = args.iter

corpus = LoadFileJson()

if paragraphs_per_article < 1 and max_nb_wiki_paragraphs > 0:
    print('You need to set "paragraphs_per_article" to a number greater than 0 '
          'if you want to load Wikipedia articles')
else:
    # Load the ISTEX and UCBL documents (plus optional Wikipedia paragraphs)
    # and train a Doc2Vec model on them.
    data = corpus.LoadDocumentsIstexAndUCBL(istex, ucbl, wiki,
                                            max_nb_wiki_paragraphs,
                                            paragraphs_per_article)
    model = Doc2Vec(data, min_count=min_count, size=v_size, workers=workers,
                    iter=n_iter, window=window_size)

    print("Vocabulary size after training:", len(model.vocab))
    print("Count of documents:", corpus.count + corpus.wiki_count)

    # Persist the document-key index alongside the model.
    with open(output + "keysIndex", "w") as f:
        json.dump(corpus.index, f)

    model.save(output)
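# A minimal sketch of how the artifacts saved above could be reloaded for
# querying. It assumes gensim < 4.0 (matching the size=/iter= parameters used
# in training) and that corpus.index maps document keys to the tags used
# during training; doc_key is a hypothetical key from that index.
def most_similar_to(model_path, doc_key, topn=10):
    model = Doc2Vec.load(model_path)
    with open(model_path + "keysIndex") as f:
        index = json.load(f)
    # Nearest trained documents to the tag recorded for doc_key.
    return model.docvecs.most_similar(index[doc_key], topn=topn)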
def _save_for_inference(model: Doc2Vec, path_name: str) -> None:
    """Discard training-only state and save a model meant for inference only."""
    model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)
    model.save(path_name)
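# Hedged usage sketch for an inference-only model saved above: reload it,
# embed an unseen (pre-tokenized) document with infer_vector(), and rank the
# trained documents against it. Assumes gensim < 4.0; the token list and path
# in the example call are illustrative.
def find_similar_documents(path_name: str, tokens: list, topn: int = 5):
    model = Doc2Vec.load(path_name)
    vector = model.infer_vector(tokens)  # vector for the unseen document
    return model.docvecs.most_similar([vector], topn=topn)

# e.g. find_similar_documents('istex_inference.d2v',
#                             ['gene', 'expression', 'analysis'])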
def save_word_model(model: Doc2Vec, no_classes: str = '', filename: str = '') -> None:
    """Save a model under model/<filename>.bin; if no filename is given,
    derive one from the number of classes."""
    if not filename:
        filename = 'Doc2Vec_{}classes'.format(no_classes)
    model.save('model/' + filename + '.bin')
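# Usage sketch (hypothetical values): save_word_model() writes into a local
# model/ directory, which must already exist.
#
#     save_word_model(model, no_classes='5')           # -> model/Doc2Vec_5classes.bin
#     save_word_model(model, filename='ucbl_doc2vec')  # -> model/ucbl_doc2vec.bin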