from collections import Counter

import utils


def main():
    sentences, index2word = utils.load_sentences_brown(800)

    # params
    nb_epoch = 3
    # learn `batch_size` words at a time
    batch_size = 128
    vec_dim = 64
    negsampling_num = 0.2
    # half of window
    window_size = 6
    vocab_size = len(index2word)
    print 'vocabulary length: ', vocab_size

    # create input
    couples, labels = utils.skip_grams(sentences, window_size, vocab_size, negsampling_num)
    print 'counter of positive samples and negative samples: ', Counter(labels)
    print 'shape of couples: ', couples.shape
    print 'shape of labels: ', labels.shape

    # metrics
    nb_batch = len(labels) // batch_size
    samples_per_epoch = batch_size * nb_batch

    # fit model
    model = make_word2vec_model(vec_dim, vocab_size)
    model.fit_generator(generator=utils.batch_generator(couples, labels, batch_size, nb_batch),
                        steps_per_epoch=nb_batch,  # one step per batch, not per sample
                        epochs=nb_epoch)

    # save weights
    utils.save_weights(model, index2word, vec_dim)

    # eval using gensim
    print 'the....'
    utils.most_similar(positive=['the'])
    print 'all....'
    utils.most_similar(positive=['all'])
    print 'baby....'
    utils.most_similar(positive=['baby'])
    print 'first....'
    utils.most_similar(positive=['first'])
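# `utils.batch_generator` is not shown here; below is a minimal sketch of what such a
# generator might look like for the index-input model variant, assuming `couples` holds
# (pivot, context) word-index pairs and `labels` the matching 0/1 targets. The name and
# signature are hypothetical, not this project's actual implementation.
import numpy as np

def batch_generator(couples, labels, batch_size, nb_batch):
    # loop forever; Keras' fit_generator expects an endless generator
    while True:
        for i in range(nb_batch):
            sl = slice(i * batch_size, (i + 1) * batch_size)
            pvt = np.array([c[0] for c in couples[sl]], dtype='int32')
            ctx = np.array([c[1] for c in couples[sl]], dtype='int32')
            yield [pvt, ctx], np.array(labels[sl], dtype='int32')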
# look up pivot and context word vectors
embedded_pvt = Embedding(input_dim=vocab_size,
                         output_dim=vec_dim,
                         input_length=1)(input_pvt)

embedded_ctx = Embedding(input_dim=vocab_size,
                         output_dim=vec_dim,
                         input_length=1)(input_ctx)

# dot product of pivot and context vectors, squashed to a probability
merged = merge(inputs=[embedded_pvt, embedded_ctx],
               mode=lambda x: (x[0] * x[1]).sum(-1),
               output_shape=(batch_size, 1))

predictions = Activation('sigmoid')(merged)

# build and train the model
model = Model(input=[input_pvt, input_ctx], output=predictions)
model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
model.fit_generator(generator=batch_generator(couples, labels),
                    samples_per_epoch=samples_per_epoch,
                    nb_epoch=nb_epoch,
                    verbose=1)

# save weights
utils.save_weights(model, index2word, vec_dim)

# eval using gensim
print 'the....'
utils.most_similar(positive=['the'])
print 'she - he + him....'
utils.most_similar(positive=['she', 'him'], negative=['he'])
                                   batch_size, vocab_size, ))

embedded_pvt = Dense(input_dim=vocab_size,
                     output_dim=vec_dim)(input_pvt)

embedded_ctx = Dense(input_dim=vocab_size,
                     output_dim=vec_dim)(input_ctx)

merged = merge(inputs=[embedded_pvt, embedded_ctx],
               mode=lambda a: (a[0] * a[1]).sum(-1).reshape((batch_size, 1)),
               output_shape=(batch_size, 1))

predictions = Activation('sigmoid')(merged)

# build and train the model
model = Model(input=[input_pvt, input_ctx], output=predictions)
model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
model.fit_generator(generator=batch_generator(couples, labels),
                    samples_per_epoch=samples_per_epoch,
                    nb_epoch=nb_epoch,
                    verbose=1)

# save weights
utils.save_weights(model, index2word, vec_dim)

# eval using gensim
print 'the....'
utils.most_similar(positive=['the'])
print 'she - he + him....'
utils.most_similar(positive=['she', 'him'], negative=['he'])
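# `utils.save_weights` and `utils.most_similar` are left to the project's utils module.
# One plausible sketch (an assumption, not necessarily what this code does) is to dump
# the learned embedding matrix in the word2vec text format and query it with gensim,
# assuming `index2word` is a list of words ordered by vocabulary index.
from gensim.models import KeyedVectors

def save_weights(model, index2word, vec_dim, filename='vectors.txt'):
    # the first weight matrix is assumed to be the pivot-word embedding
    vectors = model.get_weights()[0]
    with open(filename, 'w') as f:
        f.write('%d %d\n' % (len(index2word), vec_dim))
        for i, word in enumerate(index2word):
            f.write('%s %s\n' % (word, ' '.join(map(str, vectors[i]))))

def most_similar(positive=[], negative=[], filename='vectors.txt'):
    word_vectors = KeyedVectors.load_word2vec_format(filename, binary=False)
    for word, score in word_vectors.most_similar(positive=positive, negative=negative):
        print('%s %.4f' % (word, score))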
def found_similar(text, items):
    rate, similar_items = utils.most_similar(utils.similarity(items, text, min_ratio=0.96))
    if len(similar_items) > 0:
        return similar_items[0]
# -*- coding: utf-8 -*-
from gensim import models
from os import path

import utils
import cilin
from cilin import CilinSimilarity
from ioFile import dataFromFile


# evaluate using gensim
# test words: 乙型 ("type B"), 肝炎 ("hepatitis"), 肝癌 ("liver cancer")
test_word_list = [u'乙型', u'肝炎', u'肝癌']
for word in test_word_list:
    print word
    utils.most_similar(positive=[word], negative=[])
import utils

# eval using gensim
# Vietnamese test words: ra ("out"), chữ ("word"), tài ("talent"), mệnh ("fate")
print('tài...')
utils.most_similar(positive=['ra'])
print('chữ tài chữ mệnh...')
utils.most_similar(positive=['chữ', 'tài', 'mệnh'], negative=['tài'])
def get_similar_country_names(self, country_name):
    rate, most_similars = utils.most_similar(utils.similarity(self.normalized_country_items.keys(),
                                                              country_name, 0.7))
    return most_similars
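# In the `found_similar` and `get_similar_country_names` snippets, `utils.similarity` and
# `utils.most_similar` act as fuzzy string matchers rather than word-vector queries. A
# hypothetical sketch with difflib (assumed behaviour: score each candidate against `text`,
# keep those at or above min_ratio, and return the best ratio together with the items that
# reach it):
import difflib

def similarity(items, text, min_ratio=0.9):
    # score every candidate string against `text`
    scored = []
    for item in items:
        ratio = difflib.SequenceMatcher(None, item, text).ratio()
        if ratio >= min_ratio:
            scored.append((ratio, item))
    return scored

def most_similar(scored):
    # return the highest ratio and all items that achieve it
    if not scored:
        return 0.0, []
    best = max(ratio for ratio, _ in scored)
    return best, [item for ratio, item in scored if ratio == best]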