Example #1
0
# Context size handed to word2vec_sgd_wrapper below.
C = 5

# Train word vectors (this could take a while!)

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

# Random starting matrix with 2 * nWords rows, passed through normalizeRows.
# The averaging step further down pairs row i with row nWords + i, i.e. it
# treats the two halves as two vectors for the same word.
wordVectors = normalizeRows(np.random.randn(nWords * 2, dimVectors))

# NOTE(review): 10.0 and 200000 are presumably the step size and iteration
# count, normalizeRows a post-processing hook and True a "use saved
# parameters" flag -- confirm against sgd's signature.
wordVectors0 = sgd(
    lambda wordVectors: word2vec_sgd_wrapper(
        skipgram, C, negSamplingCostAndGradient, wordVectors),
    wordVectors,
    10.0,
    200000,
    normalizeRows,
    True)

# Average the first nWords rows with the last nWords rows to get a single
# vector per word (this is NOT "output vectors only": both halves are used).
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) / 2.0

# Single-argument print(...) prints the same text under Python 2 and also
# parses under Python 3.
print("\n=== For autograder ===")
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
# tokens is indexed by word; the result is used as a row index into
# wordVectors. A word missing from tokens makes the lookup fail here.
checkIdx = [tokens[word] for word in checkWords]
checkVecs = wordVectors[checkIdx, :]
print(checkVecs)

# Visualize the word vectors you trained
# The vectors plotted here come from load_saved_params(), which replaces the
# wordVectors0 returned by sgd above.
# NOTE(review): presumably this is the latest checkpoint written during
# training -- confirm, and confirm it returns exactly two values.
_, wordVectors0 = load_saved_params()
wordVectors = (wordVectors0[:nWords,:] + wordVectors0[nWords:,:]) / 2.0
visualizeWords = [
    "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
    "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "warm", "enjoyable",
    "boring", "bad", "garbage", "waste", "disaster", "dumb",
    "embarrassment", "annoying", "disgusting",
]
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]

import visualizing as vs

# Third argument is passed through unchanged; presumably a plot/file label.
vs.visualize(visualizeVecs, visualizeWords, "word2vec")
Example #2
0
__author__ = 'dy'
from gensim.models.word2vec import Word2Vec
import numpy as np
import matplotlib.pyplot as plt
from cs224d.datasets.data_utils import *

# Baseline: train gensim's Word2Vec on the sentences of the StanfordSentiment
# dataset, print a fixed set of vectors, then plot a selection of words.
dataset = StanfordSentiment()
sentences = dataset.sentences()

# size: vector dimensionality; window: context window; min_count: words seen
# fewer than this many times are dropped from the vocabulary; workers:
# number of training threads (see the gensim Word2Vec documentation).
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
# To persist the result, model.save_word2vec_format("baseline.model") can be
# called here and the file read back with load_word2vec_format later.

# Single-argument print(...) prints the same text under Python 2 and also
# parses under Python 3.
print("\n=== For autograder ===")
checkWords = ["the", "a", "an", "movie", "ordinary", "but", "and"]
# Vectors are looked up by word and stacked into one 2-D array, one row per
# entry of checkWords, in the same order.
checkVecs = np.array([model[w] for w in checkWords])
print(checkVecs)

# Visualize the word vectors you trained
visualizeWords = [
    "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
    "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "warm", "enjoyable",
    "boring", "bad", "garbage", "waste", "disaster", "dumb",
    "embarrassment", "annoying", "disgusting",
]
# NOTE(review): any word dropped by min_count above is absent from the model
# and the lookup below will presumably raise KeyError -- confirm every word
# here occurs at least 5 times in the corpus.
visualizeVecs = np.array([model[w] for w in visualizeWords])

import visualizing as vs

# Third argument is passed through unchanged; presumably a plot/file label.
vs.visualize(visualizeVecs, visualizeWords, "baseline")