Example #1
0
def main():
    """Train the word2vec model on skip-gram couples and query a few words.

    Steps, in order: load sentences with ``utils.load_sentences_brown``,
    build (couple, label) training data with ``utils.skip_grams``, fit the
    model returned by ``make_word2vec_model`` from ``utils.batch_generator``,
    save the weights with ``utils.save_weights`` and finally run
    ``utils.most_similar`` for a handful of probe words.
    """
    # NOTE(review): 800 is presumably a cap on how many sentences are
    # loaded -- confirm against utils.load_sentences_brown.
    sentences, index2word = utils.load_sentences_brown(800)

    # params
    nb_epoch = 3
    # learn `batch_size words` at a time
    batch_size = 128
    vec_dim = 64
    negsampling_num = 0.2
    # half of window
    window_size = 6
    vocab_size = len(index2word)

    # Single-argument print(...) emits the same text under Python 2 and 3;
    # the doubled space reproduces the separator the old print statement
    # inserted between its two operands.
    print('vocabulary length:  %s' % (vocab_size,))

    # create input
    couples, labels = utils.skip_grams(sentences, window_size, vocab_size,
                                       negsampling_num)
    print('counter of positive samples and negative samples:  %s'
          % (Counter(labels),))

    print('shape of couples:  %s' % (couples.shape,))
    print('shape of labels:  %s' % (labels.shape,))

    # Number of full batches available; a trailing partial batch is dropped
    # by the floor division.
    nb_batch = len(labels) // batch_size

    # fit model
    model = make_word2vec_model(vec_dim, vocab_size)
    # `steps_per_epoch` counts generator batches, not samples.  Passing
    # batch_size * nb_batch here (the old Keras 1 `samples_per_epoch` value)
    # would make every epoch batch_size times longer than one pass.
    model.fit_generator(
        generator=utils.batch_generator(couples, labels, batch_size, nb_batch),
        steps_per_epoch=nb_batch,
        epochs=nb_epoch)

    # save weights
    utils.save_weights(model, index2word, vec_dim)

    # eval using gensim
    for probe in ('the', 'all', 'baby', 'first'):
        print(probe + '....')
        utils.most_similar(positive=[probe])
# Script excerpt: builds a two-input model from Embedding layers, trains it
# from a generator, saves the weights and runs two similarity queries.
# input_pvt, input_ctx, vocab_size, vec_dim, batch_size, couples, labels,
# samples_per_epoch, nb_epoch and index2word are defined outside this excerpt.

# Embed the first input into vec_dim-dimensional vectors (one index per
# sample, per input_length=1).  Presumably the pivot word -- TODO confirm.
embedded_pvt = Embedding(input_dim=vocab_size,
                         output_dim=vec_dim,
                         input_length=1)(input_pvt)

# Same embedding shape for the second input, with its own weights.
# Presumably the context word -- TODO confirm.
embedded_ctx = Embedding(input_dim=vocab_size,
                         output_dim=vec_dim,
                         input_length=1)(input_ctx)

# Combine both embeddings: element-wise product summed over the last axis,
# i.e. a dot product of the two embedded vectors.
# NOTE(review): output_shape includes batch_size here -- confirm that this is
# what the merge() in use expects for a tuple output_shape.
merged = merge(inputs=[embedded_pvt, embedded_ctx],
               mode=lambda x: (x[0] * x[1]).sum(-1),
               output_shape=(batch_size, 1))

# Squash the merged value through a sigmoid.
predictions = Activation('sigmoid')(merged)


# build and train the model
model = Model(input=[input_pvt, input_ctx], output=predictions)
# rmsprop optimizer, mean-squared-error loss, accuracy reported as a metric.
model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
# Train from the generator; verbose=1 turns on progress output.
model.fit_generator(generator=batch_generator(couples, labels),
                    samples_per_epoch=samples_per_epoch,
                    nb_epoch=nb_epoch, verbose=1)

# save weights
utils.save_weights(model, index2word, vec_dim)

# eval using gensim
# Each query is preceded by a banner line naming what is being looked up.
print 'the....'
utils.most_similar(positive=['the'])
print 'she - he + him....'
utils.most_similar(positive=['she', 'him'], negative=['he'])
    batch_size,
    vocab_size,
))

# Script excerpt: same pipeline as an embedding model, but each input goes
# through a Dense layer instead.  input_pvt, input_ctx, vocab_size, vec_dim,
# batch_size, couples, labels, samples_per_epoch, nb_epoch and index2word are
# defined outside this excerpt.

# Map the first input from vocab_size features to vec_dim outputs.
# Presumably a one-hot encoded pivot word -- TODO confirm.
embedded_pvt = Dense(input_dim=vocab_size, output_dim=vec_dim)(input_pvt)

# Same mapping for the second input, with its own Dense weights.
# Presumably a one-hot encoded context word -- TODO confirm.
embedded_ctx = Dense(input_dim=vocab_size, output_dim=vec_dim)(input_ctx)

# Combine both projections: element-wise product summed over the last axis
# (a dot product), then reshaped to (batch_size, 1).
# NOTE(review): the reshape hard-codes batch_size, so a batch of any other
# size would not fit -- confirm the generator always yields full batches.
merged = merge(inputs=[embedded_pvt, embedded_ctx],
               mode=lambda a: (a[0] * a[1]).sum(-1).reshape((batch_size, 1)),
               output_shape=(batch_size, 1))

# Squash the merged value through a sigmoid.
predictions = Activation('sigmoid')(merged)

# build and train the model
model = Model(input=[input_pvt, input_ctx], output=predictions)
# rmsprop optimizer, mean-squared-error loss, accuracy reported as a metric.
model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
# Train from the generator; verbose=1 turns on progress output.
model.fit_generator(generator=batch_generator(couples, labels),
                    samples_per_epoch=samples_per_epoch,
                    nb_epoch=nb_epoch,
                    verbose=1)

# save weights
utils.save_weights(model, index2word, vec_dim)

# eval using gensim
# Each query is preceded by a banner line naming what is being looked up.
print 'the....'
utils.most_similar(positive=['the'])
print 'she - he + him....'
utils.most_similar(positive=['she', 'him'], negative=['he'])
def found_similar(text, items):
    """Return the first entry ``utils.most_similar`` reports for ``text``.

    The result of ``utils.similarity(items, text, min_ratio=0.96)`` is handed
    to ``utils.most_similar``, which is unpacked as a ``(rate, matches)`` pair.

    Returns:
        ``matches[0]``, or ``None`` when ``matches`` is empty.
    """
    scored = utils.similarity(items, text, min_ratio=0.96)
    _rate, matches = utils.most_similar(scored)

    # Nothing matched: fall through to None, as callers already expect.
    if len(matches) == 0:
        return None
    return matches[0]
Example #5
0
# -*- coding: utf-8 -*-
from gensim import models
from os import path

import utils
import cilin
from cilin import CilinSimilarity
from ioFile import dataFromFile


# Run a similarity lookup through gensim for each sample term.
test_word_list = [
    u'乙型',
    u'肝炎',
    u'肝癌',
]
for word in test_word_list:
    # Echo the term, then query it with no negative examples.
    print(word)
    utils.most_similar(positive=[word], negative=[])


Example #6
0
import utils

# Evaluate with gensim: print a banner, then run the matching query.
# NOTE(review): the first banner reads 'tài...' but the query asks for 'ra',
# and the second query lists 'tài' as both positive and negative -- confirm
# that both are intended.
for banner, query in (
    ('tài...', {'positive': ['ra']}),
    ('chữ tài chữ mệnh...',
     {'positive': ['chữ', 'tài', 'mệnh'], 'negative': ['tài']}),
):
    print(banner)
    utils.most_similar(**query)
 def get_similar_country_names(self, country_name):
     """Return the matches ``utils.most_similar`` reports for ``country_name``.

     The keys of ``self.normalized_country_items`` are compared against
     ``country_name`` via ``utils.similarity`` (third argument 0.7), and that
     result is passed to ``utils.most_similar``.  Only the second element of
     the returned pair is handed back; the first is discarded.
     """
     known_names = self.normalized_country_items.keys()
     scored = utils.similarity(known_names, country_name, 0.7)
     _rate, matches = utils.most_similar(scored)
     return matches