def train_glove(sentences, emb_dim=50, iterations=3000, window_size=10,
                save_every=100):
    """Train GloVe vectors on ``sentences`` with periodic checkpoints.

    Builds the vocabulary and co-occurrence data from ``sentences`` and
    then runs ``glove.train_glove``. After every ``save_every``-th
    iteration the per-iteration callback merges the matrix with
    ``evaluate.merge_main_context``, saves the result under ``log/`` and
    prints the nearest neighbours of a few probe words.

    Args:
        sentences: Corpus handed to ``glove.build_vocab`` and
            ``glove.build_cooccur``.
        emb_dim: Passed to ``glove.train_glove`` as ``vector_size``; also
            embedded in the checkpoint file name.
        iterations: Number of training iterations (default 3000, the value
            that used to be hard-coded).
        window_size: Co-occurrence window passed to ``glove.build_cooccur``
            (default 10, the value that used to be hard-coded).
        save_every: Checkpoint interval in iterations (default 100, the
            value that used to be hard-coded). A falsy value disables
            checkpointing.

    Returns:
        Whatever ``glove.train_glove`` returns. The original code discarded
        this value, so callers had no access to the trained vectors.
    """
    glove.logger.setLevel(logging.INFO)

    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=window_size)
    id2word = evaluate.make_id2word(vocab)

    probe_words = ('good', 'movie', 'bad', 'worth', 'dog')

    def evaluate_word(W):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for word in probe_words:
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        # Only act on positive multiples of ``save_every``.
        if not save_every or i < save_every or i % save_every != 0:
            return
        # NOTE(review): the "log/" directory is presumably expected to
        # exist already -- verify against glove.save_model.
        filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
        # NOTE(review): confirm merge_main_context does not modify W in
        # place; if it does, checkpointing would disturb the training state.
        merged = evaluate.merge_main_context(W)
        glove.save_model(merged, filename)
        evaluate_word(merged)

    return glove.train_glove(vocab, cooccur,
                             vector_size=emb_dim,
                             iterations=iterations,
                             iter_callback=save_per)
def train_glove(sentences, emb_dim=50, iterations=3000, window_size=10,
                save_every=100):
    """Train GloVe vectors on ``sentences`` with periodic checkpoints.

    Builds the vocabulary and co-occurrence data from ``sentences`` and
    then runs ``glove.train_glove``. After every ``save_every``-th
    iteration the per-iteration callback merges the matrix with
    ``evaluate.merge_main_context``, saves the result under ``log/`` and
    prints the nearest neighbours of a few probe words.

    Args:
        sentences: Corpus handed to ``glove.build_vocab`` and
            ``glove.build_cooccur``.
        emb_dim: Passed to ``glove.train_glove`` as ``vector_size``; also
            embedded in the checkpoint file name.
        iterations: Number of training iterations (default 3000, the value
            that used to be hard-coded).
        window_size: Co-occurrence window passed to ``glove.build_cooccur``
            (default 10, the value that used to be hard-coded).
        save_every: Checkpoint interval in iterations (default 100, the
            value that used to be hard-coded). A falsy value disables
            checkpointing.

    Returns:
        Whatever ``glove.train_glove`` returns. The original code discarded
        this value, so callers had no access to the trained vectors.
    """
    glove.logger.setLevel(logging.INFO)

    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=window_size)
    id2word = evaluate.make_id2word(vocab)

    probe_words = ('good', 'movie', 'bad', 'worth', 'dog')

    def evaluate_word(W):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for word in probe_words:
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        # Only act on positive multiples of ``save_every``.
        if not save_every or i < save_every or i % save_every != 0:
            return
        # NOTE(review): the "log/" directory is presumably expected to
        # exist already -- verify against glove.save_model.
        filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
        # NOTE(review): confirm merge_main_context does not modify W in
        # place; if it does, checkpointing would disturb the training state.
        merged = evaluate.merge_main_context(W)
        glove.save_model(merged, filename)
        evaluate_word(merged)

    return glove.train_glove(vocab, cooccur,
                             vector_size=emb_dim,
                             iterations=iterations,
                             iter_callback=save_per)
# Toy corpus used as the test fixture: one entry per document.
# The documents are listed explicitly instead of splitting a multi-line
# string on "\n", so the document boundaries survive any reformatting of
# this file.
# NOTE(review): the boundaries below were reconstructed from a flattened
# literal -- confirm them against the upstream fixture.
test_corpus = [
    "human interface computer",
    "survey user computer system response time",
    "eps user interface system",
    "system human system eps",
    "user response time",
    "trees",
    "graph trees",
    "graph minors trees",
    "graph minors survey",
    "I like graph and stuff",
    "I like trees and stuff",
    "Sometimes I build a graph",
    "Sometimes I build trees",
]

# Keep the training log quiet while the fixture model is built.
glove.logger.setLevel(logging.ERROR)

vocab = glove.build_vocab(test_corpus)
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=10)
id2word = evaluate.make_id2word(vocab)

W = glove.train_glove(vocab, cooccur, vector_size=10, iterations=500)

# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    """Check that 'trees' is the top neighbour of 'graph' in the fixture model."""
    similar = evaluate.most_similar(W, vocab, id2word, 'graph')
    logging.debug(similar)

    assert_equal('trees', similar[0])