def use_glove(sentences, index2word, model_file='glove_model/glove_50_iter2900.model', emb_dim=50):
    """Build an embedding matrix from a pre-trained GloVe model.

    Parameters
    ----------
    sentences : iterable of str
        Corpus used only to rebuild the vocabulary mapping word -> (id, count).
    index2word : sequence of str
        Target word order; row i of the result corresponds to index2word[i].
    model_file : str
        Path of the saved GloVe weight matrix to load.
    emb_dim : int
        Kept for interface compatibility; the effective dimension is taken
        from the loaded matrix (W.shape[1]), not from this argument.

    Returns
    -------
    numpy.ndarray
        Shape (len(index2word), W.shape[1]). Words missing from the
        vocabulary get an all-zero row.
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    W = glove.load_model(model_file)
    # Zero row reused for out-of-vocabulary words; np.asarray below copies,
    # so sharing one array object here is safe.
    oov = np.zeros(shape=(W.shape[1],))
    embeddings = []
    for word in index2word:  # the enumerate index was unused; iterate directly
        entry = vocab.get(word)  # one lookup instead of `in` followed by `[]`
        embeddings.append(W[entry[0]] if entry is not None else oov)
    return np.asarray(embeddings)
def train_glove(sentences, emb_dim=50):
    """Train GloVe vectors on *sentences*, checkpointing every 100 iterations.

    Parameters
    ----------
    sentences : iterable of str
        Training corpus; one sentence per element.
    emb_dim : int
        Dimensionality of the trained word vectors.

    Returns
    -------
    The trained weight matrix from glove.train_glove. (Bug fix: the result
    was previously computed and then silently discarded.)
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=10)
    id2word = evaluate.make_id2word(vocab)

    def evaluate_word(W):
        # Spot-check training progress on a few probe words.
        words = ['good', 'movie', 'bad', 'worth', 'dog']
        for word in words:
            # Parenthesized single argument: valid in both Python 2 and 3.
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        # Checkpoint every 100 iterations, skipping iteration 0.
        if i % 100 == 0 and i >= 100:
            filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
            W = evaluate.merge_main_context(W)
            glove.save_model(W, filename)
            evaluate_word(W)

    W = glove.train_glove(vocab, cooccur, vector_size=emb_dim,
                          iterations=3000, iter_callback=save_per)
    return W
def use_glove(sentences, index2word, model_file='glove_model/glove_50_iter2900.model', emb_dim=50):
    """Look up a GloVe vector for every word in *index2word*.

    Rebuilds the vocabulary from *sentences*, loads the pre-trained weight
    matrix from *model_file*, and returns an array whose rows follow the
    order of *index2word*. Words absent from the vocabulary map to an
    all-zero vector of the model's width.
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    W = glove.load_model(model_file)

    def vector_for(token):
        # Out-of-vocabulary tokens fall back to a zero row of the model width.
        if token in vocab:
            return W[vocab[token][0]]
        return np.zeros(shape=(W.shape[1],))

    return np.asarray([vector_for(token) for token in index2word])
def train_glove(sentences, emb_dim=50):
    """Train *emb_dim*-dimensional GloVe vectors on *sentences*.

    A callback checkpoints the merged weight matrix to log/ every 100
    iterations and prints nearest neighbours of a few probe words.

    Returns the trained weight matrix (bug fix: it was previously never
    returned, so callers could not use the result).
    """
    glove.logger.setLevel(logging.INFO)
    vocab = glove.build_vocab(sentences)
    cooccur = glove.build_cooccur(vocab, sentences, window_size=10)
    id2word = evaluate.make_id2word(vocab)

    def evaluate_word(W):
        # Print the nearest neighbours of some probe words as a sanity check.
        words = ['good', 'movie', 'bad', 'worth', 'dog']
        for word in words:
            # Parenthesized single argument works in both Python 2 and 3.
            print(evaluate.most_similar(W, vocab, id2word, word))

    def save_per(W, i):
        # Save a checkpoint every 100 iterations (not at iteration 0).
        if i % 100 == 0 and i >= 100:
            filename = "log/glove_%d_iter%d.model" % (emb_dim, i)
            W = evaluate.merge_main_context(W)
            glove.save_model(W, filename)
            evaluate_word(W)

    W = glove.train_glove(vocab, cooccur, vector_size=emb_dim,
                          iterations=3000, iter_callback=save_per)
    return W
# Tiny in-memory corpus for smoke-testing GloVe training end to end.
# NOTE(review): the literal is split on "\n", so it is expected to be a
# multi-line string (one sentence per line); the internal newlines appear
# to have been lost in this copy of the file -- confirm against the original.
test_corpus = ("""human interface computer survey user computer system response time eps user interface system system human system eps user response time trees graph trees graph minors trees graph minors survey I like graph and stuff I like trees and stuff Sometimes I build a graph Sometimes I build trees""").split("\n")

glove.logger.setLevel(logging.ERROR)  # silence per-iteration training output
vocab = glove.build_vocab(test_corpus)
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=10)
id2word = evaluate.make_id2word(vocab)

# Small 10-d vectors; 500 iterations is enough on this toy corpus.
W = glove.train_glove(vocab, cooccur, vector_size=10, iterations=500)

# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    # On the toy corpus, 'trees' should be the nearest neighbour of 'graph'.
    similar = evaluate.most_similar(W, vocab, id2word, 'graph')
    logging.debug(similar)
    # assert_equal presumably comes from nose.tools -- verify the import
    # elsewhere in this file.
    assert_equal('trees', similar[0])
with open(path) as f: return f.read().split("\n") print("Loading corpus and lexica") #test_corpus = read_lines("../../datasets/snli_1.0/snli_sentenceA_72k_train.txt") synonyms = read_lines("../../datasets/antonym_synonym/synonym_200.txt") antonyms = read_lines("../../datasets/antonym_synonym/antonym_200.txt") glove.logger.setLevel(logging.ERROR) print("Building Vocab") vocab = glove.build_vocab(test_corpus, synonyms, antonyms) synonyms = glove.build_syncab(synonyms, vocab) antonyms = glove.build_antcab(antonyms, vocab) print("Building Cooccur") cooccur = glove.build_cooccur(vocab, test_corpus, window_size=1) id2word = evaluate.make_id2word(vocab) print("Training vectors...") W = glove.train_glove(vocab, synonyms, antonyms, cooccur,