# NOTE(review): truncated fragment of a getopt-style CLI script, collapsed onto
# one line by a bad paste. It begins mid-`elif` chain, so the enclosing
# `for opt, arg in opts:` loop and the leading `if` (presumably the "-h" help
# branch, given the first three statements) are missing — do not run as-is.
# In this variant the model construction and training loop are commented out;
# only document loading would execute. Confirm against the original file
# before reconstructing the option-parsing header.
print('Convert text documents to vectors by Doc2Vec') print(sys.argv[0] + " -h for help") sys.exit() elif opt in ("-d", "--data_dir"): data_dir = arg elif opt in ("-model_size", "--model_size"): model_size = int(arg) elif opt in ("-epoch", "--epoch"): nb_epochs = int(arg) elif opt in ("-lb", "--label_file"): label_file = arg elif opt in ("--out_file"): out_filename = arg #load documents documents = load.get_doc(data_dir, label_file) print('Data Loading finished') print(len(documents), type(documents)) # build the model # model = gensim.models.Doc2Vec(documents, dm=1, alpha=0.025, size=model_size, min_alpha=0.025, min_count=0, workers=8) # # # start training # for epoch in range(epochs): # if epoch % 5 == 0: # print ('Now training epoch %s'%epoch) # model.train(documents) # model.alpha -= 0.002 # decrease the learning rate # model.min_alpha = model.alpha # fix the learning rate, no decay
# NOTE(review): second truncated variant of the same getopt-style CLI script,
# also collapsed onto one line and beginning mid-`elif` chain (the option loop
# header and the "-h" branch are outside view) — do not run as-is. Unlike the
# variant above, the Doc2Vec construction (dm=1, i.e. PV-DM) and the training
# loop are live here, and the loop correctly uses `nb_epochs` parsed from the
# command line. The alpha decrement pattern implements manual learning-rate
# decay per the old gensim API; confirm the installed gensim version before
# reviving this — gensim >= 1.0 requires total_examples/epochs in train().
print ('Convert text documents to vectors by Doc2Vec') print (sys.argv[0] + " -h for help") sys.exit () elif opt in ("-d","--data_dir"): data_dir = arg elif opt in ("-model_size","--model_size"): model_size = int (arg) elif opt in ("-epoch","--epoch"): nb_epochs = int (arg) elif opt in ("-lb","--label_file"): label_file = arg elif opt in ("--out_file"): out_filename = arg #load documents documents = load.get_doc(data_dir, label_file) print ('Data Loading finished') print (len(documents),type(documents)) # build the model model = gensim.models.Doc2Vec(documents, dm = 1, alpha=0.025, size= model_size, min_alpha=0.025, min_count=0, workers=8) # start training for epoch in range(nb_epochs): if epoch % 5 == 0: print ('Now training epoch %s'%epoch) model.train(documents) model.alpha -= 0.002 # decrease the learning rate model.min_alpha = model.alpha # fix the learning rate, no decay
# Train a DBOW Doc2Vec model (dm=0) on documents loaded from the 'docs'
# directory, then print the words most similar to 'mildew'.
import gensim
import load

documents = load.get_doc('docs')
print('Data Loading finished')
print(len(documents), type(documents))

# Build the model.
# FIX: the original also constructed a second model (`modeldm`, default dm=1)
# that was never referenced afterwards — an expensive dead construction that
# trained a whole extra model for nothing; it has been removed.
model = gensim.models.Doc2Vec(documents, dm=0, alpha=0.025, size=20,
                              min_alpha=0.025, min_count=0)

# Manual learning-rate decay: retrain repeatedly while lowering alpha.
# NOTE(review): this no-argument train() signature assumes an old gensim;
# gensim >= 1.0 requires total_examples/epochs — confirm installed version.
for epoch in range(200):
    if epoch % 20 == 0:
        print('Now training epoch %s' % epoch)
    model.train(documents)
    model.alpha -= 0.002           # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

# shows the similar words
print(model.most_similar('mildew'))
# doc2vectest.py
# Train a DBOW Doc2Vec model (dm=0) on the game-of-thrones data directory and
# print words similar to the (stemmed) token 'suppli'.
# (Removed dead commented-out sys/codecs Python-2 encoding boilerplate.)
import gensim
import load

documents = load.get_doc('/Users/lipingzhang/Desktop/program/doc2vec/word_vectors_game_of_thrones-LIVE/data')
print('Data Loading finished')
print(len(documents), type(documents))

# build the model
model = gensim.models.Doc2Vec(documents, dm=0, alpha=0.025, size=20,
                              min_alpha=0.025, min_count=0)

# start training with manual learning-rate decay
for epoch in range(200):
    if epoch % 20 == 0:
        # BUG FIX: the original used '&' (bitwise AND between a str and an
        # int — a TypeError the first time this branch runs); '%' is the
        # intended string-formatting operator, matching the other variants.
        print('Now training epoch %s' % epoch)
    model.train(documents)
    # decrease the learning rate
    model.alpha -= 0.002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# shows the similar words
print(model.most_similar('suppli'))
# Train a DBOW Doc2Vec model (dm=0) on documents from the 'books' directory,
# then query word similarities, a raw word vector, and document similarities.
import gensim
import load

documents = load.get_doc('books')
print('Data Loading finished')
print(len(documents), type(documents))

model = gensim.models.Doc2Vec(documents, dm=0, alpha=0.025, size=20,
                              min_alpha=0.025, min_count=0)

# Hoisted out of the loop: the corpus does not change between epochs.
# BUG FIX: the original summed per-document lengths (a *token* count) and
# passed it as total_examples; gensim's train() expects total_examples to be
# the number of documents (token totals go in total_words), so progress and
# alpha scheduling were computed against the wrong denominator.
total_examples = len(documents)

# Manual learning-rate decay across repeated train() calls.
for epoch in range(200):
    if epoch % 20 == 0:
        print('Now training epoch %s' % epoch)
    model.train(documents, total_examples=total_examples, epochs=model.iter)
    model.alpha -= 0.002           # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

print(model.most_similar('обман'))
print(model['обман'])
print(model.docvecs.most_similar(str('books/love.txt')))