Ejemplo n.º 1
0
def intersect(vectorsize):
    """Intersect the field-trained model with the init-index vectors and save it.

    Loads ``corpus/fieldtrained<vectorsize>.model``, calls
    ``intersect_word2vec_format`` on it with the text-format file
    ``corpus/initindex<vectorsize>``, then writes the result twice under
    ``corpus/merged<vectorsize>``: once as a ``.model`` file and once in
    word2vec text format.

    :param vectorsize: value whose ``str()`` form is embedded in every path.
    """
    size_tag = str(vectorsize)
    merged_path = 'corpus/merged' + size_tag

    model = Word2Vec.load('corpus/fieldtrained' + size_tag + '.model')
    print('finish load')

    model.intersect_word2vec_format('corpus/initindex' + size_tag,
                                    binary=False)
    print('finish intersect')

    model.save(merged_path + '.model')
    model.save_word2vec_format(merged_path, binary=False)
    print('finish save')
Ejemplo n.º 2
0
def intersect(vectorsize):
    """Load a saved model, intersect it with init-index vectors, and re-save.

    All file names are derived from ``str(vectorsize)``:

    * input model:   ``corpus/fieldtrained<vectorsize>.model``
    * input vectors: ``corpus/initindex<vectorsize>`` (read with binary=False)
    * outputs:       ``corpus/merged<vectorsize>.model`` and
      ``corpus/merged<vectorsize>`` (written with binary=False)

    A progress line is printed after each of the three stages.
    """
    suffix = str(vectorsize)
    source_model = 'corpus/fieldtrained' + suffix + '.model'
    init_vectors = 'corpus/initindex' + suffix
    merged_vectors = 'corpus/merged' + suffix
    merged_model = merged_vectors + '.model'

    model = Word2Vec.load(source_model)
    print('finish load')

    model.intersect_word2vec_format(init_vectors, binary=False)
    print('finish intersect')

    model.save(merged_model)
    model.save_word2vec_format(merged_vectors, binary=False)
    print('finish save')
Ejemplo n.º 3
0
def teword():
    """Scratch test: train a tiny Word2Vec model, then load and intersect vectors.

    First trains a 20-dimensional model on two pre-segmented sentences and
    prints the two nearest neighbours of one word. Then loads
    ``vectorseg.bin`` (text format), intersects it with ``fieldvec.bin``
    (text format) and attempts a skip-gram training call.
    """
    documents = [u"今天 天气 真是 好 啊", u"明天 就要 下雨 了,伐 开心"]
    # Word2Vec expects every sentence to be a list of tokens. Passing the raw
    # strings makes it iterate character by character, so the whitespace
    # segmentation is applied explicitly here.
    tokenized = [document.split() for document in documents]
    model = Word2Vec(tokenized, size=20, window=5, min_count=1)
    sim = model.most_similar(positive=[u"好"], topn=2)
    print(sim)

    model = Word2Vec.load_word2vec_format('vectorseg.bin', binary=False)
    model.intersect_word2vec_format('fieldvec.bin', binary=False)
    # NOTE(review): `sentences` and `alpha` are not defined inside this
    # function, so they must come from module scope -- confirm they exist.
    # Also confirm `train_batch_sg` is reachable as a Word2Vec attribute; in
    # gensim it appears to be a module-level function of
    # gensim.models.word2vec. Left unchanged pending that confirmation.
    Word2Vec.train_batch_sg(model, sentences, alpha, work=None)
Ejemplo n.º 4
0
def teword():
    """Exploratory check of Word2Vec training, loading and intersecting.

    Part 1 builds a small model (size=20, window=5, min_count=1) from two
    whitespace-segmented sentences and prints the top-2 ``most_similar``
    results for one word.

    Part 2 loads ``vectorseg.bin`` in text format, intersects it with
    ``fieldvec.bin`` in text format, and issues a ``train_batch_sg`` call.
    """
    raw_sentences = [u"今天 天气 真是 好 啊", u"明天 就要 下雨 了,伐 开心"]
    # Split each sentence into its tokens before training: handing Word2Vec
    # plain strings would treat every single character (spaces included) as
    # a "word" instead of the segmented words.
    token_lists = []
    for raw in raw_sentences:
        token_lists.append(raw.split())

    model = Word2Vec(token_lists, size=20, window=5, min_count=1)
    sim = model.most_similar(positive=[u"好"], topn=2)
    print(sim)

    model = Word2Vec.load_word2vec_format('vectorseg.bin', binary=False)
    model.intersect_word2vec_format('fieldvec.bin', binary=False)
    # NOTE(review): this call is kept exactly as written. `sentences` and
    # `alpha` are not bound anywhere in this function (presumably module
    # globals -- verify), and `train_batch_sg` looks like a module-level
    # gensim function rather than a Word2Vec attribute -- verify before
    # relying on this line.
    Word2Vec.train_batch_sg(model, sentences, alpha, work=None)
Ejemplo n.º 5
0
def intersect(vectorsize):
    """Pre-train, intersect with init-index vectors, then train on the field corpus.

    Steps, in order:

    1. Create ``Word2Vec(size=vectorsize, min_count=2, sg=1)``, build its
       vocabulary from ``corpus/precorpus``, train on it, and save under
       ``corpus/pretrain<vectorsize>`` (``.model`` and word2vec format).
    2. Call ``setwordwindow(vectorsize)``, intersect the in-memory model with
       ``corpus/initindex<vectorsize>`` (text format), and save under
       ``corpus/merged<vectorsize>``.
    3. Set ``model.iter`` to 1, train on ``corpus/fieldcorpus``, and save
       under ``corpus/mergedtrained<vectorsize>iter<model.iter>``.

    :param vectorsize: passed as ``size`` to Word2Vec and to
        ``setwordwindow``; its ``str()`` form is embedded in every path.
    """
    model = Word2Vec(size=vectorsize, min_count=2, sg=1)

    pre_corpus = LineSentence('corpus/precorpus')
    model.build_vocab(pre_corpus)
    model.train(pre_corpus)
    print('finish pre-train')

    size_tag = str(vectorsize)
    pretrain_path = 'corpus/pretrain' + size_tag
    model.save(pretrain_path + '.model')
    model.save_word2vec_format(pretrain_path)

    # Original author's note: intersect does not delete the binary tree, but
    # load does. Presumably that is why the in-memory model is reused here
    # instead of reloading the pre-trained file from disk.
    setwordwindow(vectorsize)
    model.intersect_word2vec_format('corpus/initindex' + size_tag,
                                    binary=False)
    print('finish intersect')

    merged_path = 'corpus/merged' + size_tag
    model.save(merged_path + '.model')
    model.save_word2vec_format(merged_path, binary=False)

    print("finish load")
    field_corpus = LineSentence('corpus/fieldcorpus')
    print("finish sentence building")

    # Single pass over the field corpus using the regular train() entry point.
    model.iter = 1
    model.train(field_corpus)
    print("finish training")

    trained_path = 'corpus/mergedtrained' + size_tag + 'iter' + str(model.iter)
    model.save(trained_path + '.model')
    model.save_word2vec_format(trained_path, binary=False)
Ejemplo n.º 6
0
def intersect(vectorsize):
    """Run the full pipeline: pre-train, intersect, then field-train a model.

    The model is created with ``size=vectorsize, min_count=2, sg=1`` and is
    kept in memory across all three phases. Each phase saves its result in
    two forms (``.model`` file and word2vec format):

    * ``corpus/pretrain<vectorsize>``: after vocabulary building and
      training on ``corpus/precorpus``
    * ``corpus/merged<vectorsize>``: after ``setwordwindow(vectorsize)`` and
      intersecting with ``corpus/initindex<vectorsize>``
    * ``corpus/mergedtrained<vectorsize>iter<model.iter>``: after setting
      ``model.iter = 1`` and training on ``corpus/fieldcorpus``
    """
    # --- Phase 1: pre-train on the general corpus -------------------------
    model = Word2Vec(size=vectorsize, min_count=2, sg=1)
    general_sentences = LineSentence('corpus/precorpus')
    model.build_vocab(general_sentences)
    model.train(general_sentences)
    print('finish pre-train')

    suffix = str(vectorsize)
    model.save('corpus/pretrain' + suffix + '.model')
    model.save_word2vec_format('corpus/pretrain' + suffix)

    # --- Phase 2: intersect with the init-index vectors -------------------
    # Author's note carried over: intersect does not delete the binary tree,
    # but load does (so the model is not reloaded from disk at this point --
    # presumably for that reason).
    setwordwindow(vectorsize)
    init_vectors = 'corpus/initindex' + suffix
    model.intersect_word2vec_format(init_vectors, binary=False)
    print('finish intersect')

    model.save('corpus/merged' + suffix + '.model')
    model.save_word2vec_format('corpus/merged' + suffix, binary=False)

    # --- Phase 3: one training iteration on the field corpus --------------
    print("finish load")
    field_sentences = LineSentence('corpus/fieldcorpus')
    print("finish sentence building")

    model.iter = 1
    model.train(field_sentences)
    print("finish training")

    output_stem = 'corpus/mergedtrained' + suffix + 'iter' + str(model.iter)
    model.save(output_stem + '.model')
    model.save_word2vec_format(output_stem, binary=False)