def intersect(vectorsize):
    """Merge external vectors into the field-trained model and save the result.

    Loads 'corpus/fieldtrained<vectorsize>.model', intersects it with the
    word2vec-format file 'corpus/initindex<vectorsize>' (read with
    binary=False), then writes the merged model to
    'corpus/merged<vectorsize>.model' and, in word2vec format with
    binary=False, to 'corpus/merged<vectorsize>'.

    Args:
        vectorsize: Value whose str() form is the suffix of every corpus
            file name used here.
    """
    size_tag = str(vectorsize)
    merged_prefix = 'corpus/merged' + size_tag

    model = Word2Vec.load('corpus/fieldtrained' + size_tag + '.model')
    # setwordwindow(vectorsize) was disabled by the original author here.
    print('finish load')

    model.intersect_word2vec_format('corpus/initindex' + size_tag, binary=False)
    print('finish intersect')

    # Persist both the native model and the word2vec-format export.
    model.save(merged_prefix + '.model')
    model.save_word2vec_format(merged_prefix, binary=False)
    print('finish save')
def intersect(vectorsize):
    """Merge external vectors into the field-trained model and save the result.

    Loads 'corpus/fieldtrained<vectorsize>.model', intersects it with the
    word2vec-format file 'corpus/initindex<vectorsize>' (read with
    binary=False), then writes the merged model to
    'corpus/merged<vectorsize>.model' and, in word2vec format with
    binary=False, to 'corpus/merged<vectorsize>'.

    Args:
        vectorsize: Value whose str() form is the suffix of every corpus
            file name used here.
    """
    size_tag = str(vectorsize)
    merged_prefix = 'corpus/merged' + size_tag

    model = Word2Vec.load('corpus/fieldtrained' + size_tag + '.model')
    # setwordwindow(vectorsize) was disabled by the original author here.
    print('finish load')

    model.intersect_word2vec_format('corpus/initindex' + size_tag, binary=False)
    print('finish intersect')

    # Persist both the native model and the word2vec-format export.
    model.save(merged_prefix + '.model')
    model.save_word2vec_format(merged_prefix, binary=False)
    print('finish save')
def teword():
    """Smoke-test Word2Vec on a toy corpus, then try merging external vectors.

    First trains a tiny model on two hard-coded, whitespace-segmented
    sentences and prints the two nearest neighbours of u"好". Then loads
    'vectorseg.bin' (word2vec format, binary=False), intersects it with
    'fieldvec.bin', and attempts a skip-gram training batch.
    """
    documents = [u"今天 天气 真是 好 啊", u"明天 就要 下雨 了,伐 开心"]
    # Word2Vec expects every sentence as a list of tokens. Passing the raw
    # strings would make it iterate character by character (spaces
    # included), so split the pre-segmented text on whitespace first.
    tokenized = [doc.split() for doc in documents]
    model = Word2Vec(tokenized, size=20, window=5, min_count=1)
    sim = model.most_similar(positive=[u"好"], topn=2)
    print(sim)

    model = Word2Vec.load_word2vec_format('vectorseg.bin', binary=False)
    model.intersect_word2vec_format('fieldvec.bin', binary=False)
    # NOTE(review): `sentences` and `alpha` are not defined inside this
    # function, so this call depends on module-level names -- confirm they
    # exist. Also verify that `train_batch_sg` is really exposed on the
    # Word2Vec class; in gensim it appears to be a module-level function of
    # gensim.models.word2vec rather than a method.
    Word2Vec.train_batch_sg(model, sentences, alpha, work=None)
def teword():
    """Smoke-test Word2Vec on a toy corpus, then try merging external vectors.

    First trains a tiny model on two hard-coded, whitespace-segmented
    sentences and prints the two nearest neighbours of u"好". Then loads
    'vectorseg.bin' (word2vec format, binary=False), intersects it with
    'fieldvec.bin', and attempts a skip-gram training batch.
    """
    documents = [u"今天 天气 真是 好 啊", u"明天 就要 下雨 了,伐 开心"]
    # Word2Vec expects every sentence as a list of tokens. Passing the raw
    # strings would make it iterate character by character (spaces
    # included), so split the pre-segmented text on whitespace first.
    tokenized = [doc.split() for doc in documents]
    model = Word2Vec(tokenized, size=20, window=5, min_count=1)
    sim = model.most_similar(positive=[u"好"], topn=2)
    print(sim)

    model = Word2Vec.load_word2vec_format('vectorseg.bin', binary=False)
    model.intersect_word2vec_format('fieldvec.bin', binary=False)
    # NOTE(review): `sentences` and `alpha` are not defined inside this
    # function, so this call depends on module-level names -- confirm they
    # exist. Also verify that `train_batch_sg` is really exposed on the
    # Word2Vec class; in gensim it appears to be a module-level function of
    # gensim.models.word2vec rather than a method.
    Word2Vec.train_batch_sg(model, sentences, alpha, work=None)
def intersect(vectorsize):
    """Pre-train, merge external vectors, then continue training on field data.

    Stages, each of which saves its result under 'corpus/':
      1. Build the vocabulary from 'corpus/precorpus' and train a fresh
         Word2Vec(size=vectorsize, min_count=2, sg=1) on it
         -> 'pretrain<vectorsize>'.
      2. Call setwordwindow(vectorsize), then intersect the model with
         'corpus/initindex<vectorsize>' -> 'merged<vectorsize>'.
      3. Set model.iter to 1 and train on 'corpus/fieldcorpus'
         -> 'mergedtrained<vectorsize>iter<model.iter>'.

    Args:
        vectorsize: Passed as the Word2Vec `size`; its str() form is the
            suffix of every file name used here.
    """
    size_tag = str(vectorsize)
    pretrain_prefix = 'corpus/pretrain' + size_tag
    merged_prefix = 'corpus/merged' + size_tag

    # --- Stage 1: pre-train on the general corpus -------------------------
    model = Word2Vec(size=vectorsize, min_count=2, sg=1)
    pre_corpus = LineSentence('corpus/precorpus')
    model.build_vocab(pre_corpus)
    model.train(pre_corpus)
    print('finish pre-train')
    model.save(pretrain_prefix + '.model')
    model.save_word2vec_format(pretrain_prefix)

    # --- Stage 2: merge the external vectors -------------------------------
    # Original author's note: "intersect does not delete the binary tree,
    # but load does" -- so the in-memory model is reused here instead of
    # reloading the saved pre-trained model.
    setwordwindow(vectorsize)
    model.intersect_word2vec_format('corpus/initindex' + size_tag, binary=False)
    print('finish intersect')
    model.save(merged_prefix + '.model')
    model.save_word2vec_format(merged_prefix, binary=False)

    # --- Stage 3: one more training pass on the field corpus ---------------
    # The message below is kept verbatim even though the reload it once
    # announced is disabled.
    print("finish load")
    field_corpus = LineSentence('corpus/fieldcorpus')
    print("finish sentence building")
    # Original author's open question: "simply use train and set iter=1?"
    model.iter = 1
    model.train(field_corpus)
    print("finish training")

    trained_prefix = 'corpus/mergedtrained' + size_tag + 'iter' + str(model.iter)
    model.save(trained_prefix + '.model')
    model.save_word2vec_format(trained_prefix, binary=False)
def intersect(vectorsize):
    """Pre-train, merge external vectors, then continue training on field data.

    Stages, each of which saves its result under 'corpus/':
      1. Build the vocabulary from 'corpus/precorpus' and train a fresh
         Word2Vec(size=vectorsize, min_count=2, sg=1) on it
         -> 'pretrain<vectorsize>'.
      2. Call setwordwindow(vectorsize), then intersect the model with
         'corpus/initindex<vectorsize>' -> 'merged<vectorsize>'.
      3. Set model.iter to 1 and train on 'corpus/fieldcorpus'
         -> 'mergedtrained<vectorsize>iter<model.iter>'.

    Args:
        vectorsize: Passed as the Word2Vec `size`; its str() form is the
            suffix of every file name used here.
    """
    size_tag = str(vectorsize)
    pretrain_prefix = 'corpus/pretrain' + size_tag
    merged_prefix = 'corpus/merged' + size_tag

    # --- Stage 1: pre-train on the general corpus -------------------------
    model = Word2Vec(size=vectorsize, min_count=2, sg=1)
    pre_corpus = LineSentence('corpus/precorpus')
    model.build_vocab(pre_corpus)
    model.train(pre_corpus)
    print('finish pre-train')
    model.save(pretrain_prefix + '.model')
    model.save_word2vec_format(pretrain_prefix)

    # --- Stage 2: merge the external vectors -------------------------------
    # Original author's note: "intersect does not delete the binary tree,
    # but load does" -- so the in-memory model is reused here instead of
    # reloading the saved pre-trained model.
    setwordwindow(vectorsize)
    model.intersect_word2vec_format('corpus/initindex' + size_tag, binary=False)
    print('finish intersect')
    model.save(merged_prefix + '.model')
    model.save_word2vec_format(merged_prefix, binary=False)

    # --- Stage 3: one more training pass on the field corpus ---------------
    # The message below is kept verbatim even though the reload it once
    # announced is disabled.
    print("finish load")
    field_corpus = LineSentence('corpus/fieldcorpus')
    print("finish sentence building")
    # Original author's open question: "simply use train and set iter=1?"
    model.iter = 1
    model.train(field_corpus)
    print("finish training")

    trained_prefix = 'corpus/mergedtrained' + size_tag + 'iter' + str(model.iter)
    model.save(trained_prefix + '.model')
    model.save_word2vec_format(trained_prefix, binary=False)