def main(args):
    """Train sentiment classifiers over a sweep of regularization values.

    Sentence features are computed with getSentenceFeatures() from word
    vectors obtained either from load_saved_params() (``args.yourvectors``)
    or from glove.loadWordVectors() (``args.pretrained``).  One
    LogisticRegression model is fit per value returned by
    getRegularizationValues(); its train/dev/test accuracies are printed and
    recorded, the model selected by chooseBestModel() is reported, and the
    error-analysis helpers are invoked with their output file names.

    Args:
        args: object exposing the truthy/falsy attributes ``yourvectors``
            and ``pretrained``.

    Raises:
        ValueError: if neither ``args.yourvectors`` nor ``args.pretrained``
            is set.
    """
    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        # Place rows [0, nWords) and rows [nWords, end) side by side.
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        # Without this guard the next statement fails with an unbound-local
        # error on wordVectors, which hides the real cause.
        raise ValueError(
            "main() requires args.yourvectors or args.pretrained to be set")
    dimVectors = wordVectors.shape[1]

    def buildFeatures(sentences):
        """Return (features, labels) arrays for (words, label) pairs."""
        features = np.zeros((len(sentences), dimVectors))
        labels = np.zeros((len(sentences),), dtype=np.int32)
        for i, (words, label) in enumerate(sentences):
            labels[i] = label
            features[i, :] = getSentenceFeatures(tokens, wordVectors, words)
        return features, labels

    # Load the train set
    trainset = dataset.getTrainSentences()
    trainFeatures, trainLabels = buildFeatures(trainset)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    devFeatures, devLabels = buildFeatures(devset)

    # Prepare test set features
    testset = dataset.getTestSentences()
    testFeatures, testLabels = buildFeatures(testset)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
    else:
        # plotRegVsAccuracy(regValues, results, "q4_reg_v_acc_your.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf_your.png")
def main(args):
    """Fit and compare sentiment classifiers for several regularization values.

    Word vectors come from load_saved_params() when ``args.yourvectors`` is
    set, otherwise from glove.loadWordVectors() when ``args.pretrained`` is
    set.  Each sentence is turned into a feature row by
    getSentenceFeatures(); a LogisticRegression model is trained for every
    value from getRegularizationValues() and scored on the train, dev and
    test splits.  A recap table and the choice made by chooseBestModel() are
    printed, and the error-analysis helpers run when ``args.pretrained`` is
    set.

    Args:
        args: object exposing the truthy/falsy attributes ``yourvectors``
            and ``pretrained``.

    Raises:
        ValueError: if neither ``args.yourvectors`` nor ``args.pretrained``
            is set.
    """
    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        # Put the first nWords rows next to the remaining rows (axis=1).
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        # Fail with a clear message instead of an unbound-local error on
        # wordVectors a few lines below.
        raise ValueError(
            "main() requires args.yourvectors or args.pretrained to be set")
    dimVectors = wordVectors.shape[1]

    def featurize(sentences):
        """Build the feature matrix and int32 label vector for one split."""
        count = len(sentences)
        features = np.zeros((count, dimVectors))
        labels = np.zeros((count, ), dtype=np.int32)
        for row, (words, label) in enumerate(sentences):
            labels[row] = label
            features[row, :] = getSentenceFeatures(tokens, wordVectors, words)
        return features, labels

    def score(clf, name, features, labels):
        """Predict one split, print its accuracy line and return the accuracy."""
        value = accuracy(labels, clf.predict(features))
        print("%s accuracy (%%): %f" % (name, value))
        return value

    # Load the train set
    trainset = dataset.getTrainSentences()
    trainFeatures, trainLabels = featurize(trainset)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    devFeatures, devLabels = featurize(devset)

    # Prepare test set features
    testset = dataset.getTestSentences()
    testFeatures, testLabels = featurize(testset)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        trainAccuracy = score(clf, "Train", trainFeatures, trainLabels)
        devAccuracy = score(clf, "Dev", devFeatures, devLabels)
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        testAccuracy = score(clf, "Test", testFeatures, testLabels)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        })

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" %
              (result["reg"], result["train"], result["dev"], result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
import random import numpy as np from utils.treebank import StanfordSentiment import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt import time from q3_word2vec import * from q3_sgd import * # Reset the random seed to make sure that everyone gets the same results random.seed(314) dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) # We are going to train 10-dimensional vectors for this assignment dimVectors = 10 # Context size C = 5 # Reset the random seed to make sure that everyone gets the same results random.seed(31415) np.random.seed(9265) startTime = time.time() wordVectors = np.concatenate( ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt import time from word2vec import * from sgd import * #check python version import sys assert sys.version_info[0] == 3 assert sys.version_info[1] >= 5 #Reset the random seed to make sure that everyone gets the same results random.seed(314) datasets = StanfordSentiment() tokens = datasets.tokens() nWords = len(tokens) #We are going to train 10-dimensional vectors for this assignment dimVectors = 10 #Context size C = 5 #Reset the random seed to make sure that everyone gets the same results random.seed(31415) np.random.seed(9265) startTime = time.time() wordVectors = np.concatenate( ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()  # ? presumably the vocabulary
nWords = len(tokens)  # vocabulary size?

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10  # 10-dimensional word vectors

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
# One vocabulary matrix + another vocabulary matrix, concatenated together:
# a matrix of shape (2 * vocabulary rows) x (word-vector dimension)
wordVectors = np.concatenate(
def main(args):
    """Train sentiment classifiers on SIF-style sentence features.

    Word vectors come from load_saved_params() (``args.yourvectors``) or
    glove.loadWordVectors() (``args.pretrained``).  Relative word
    frequencies are counted on the training sentences and passed to
    getSentenceFeaturesSIF() for every split.  The first singular vector of
    the training feature matrix (TruncatedSVD with one component) is then
    projected out of the train, dev and test features.  One
    LogisticRegression model is fit per value from
    getRegularizationValues(); accuracies are printed, chooseBestModel()
    picks the reported model, and the error-analysis helpers run when
    ``args.pretrained`` is set.

    Args:
        args: object exposing the truthy/falsy attributes ``yourvectors``
            and ``pretrained``.

    Raises:
        ValueError: if neither ``args.yourvectors`` nor ``args.pretrained``
            is set.
    """
    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        # Place rows [0, nWords) and rows [nWords, end) side by side.
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        # Fail with a clear message instead of an unbound-local error on
        # wordVectors a few lines below.
        raise ValueError(
            "main() requires args.yourvectors or args.pretrained to be set")
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()

    # Frequency counting: occurrences of each word in the training sentences,
    # normalized by the total number of word occurrences.  The result stays a
    # Counter so that lookups of unseen words keep returning 0.
    counts = Counter()
    for sen in trainset:
        counts.update(sen[0])
    # float() keeps the division exact even where "/" is integer division.
    total = float(sum(counts.values()))
    freq = Counter()
    for word, tf in counts.items():
        freq[word] = tf / total

    def buildFeatures(sentences):
        """Return (features, labels) arrays for (words, label) pairs."""
        features = np.zeros((len(sentences), dimVectors))
        labels = np.zeros((len(sentences),), dtype=np.int32)
        for i, (words, label) in enumerate(sentences):
            labels[i] = label
            features[i, :] = getSentenceFeaturesSIF(
                tokens, wordVectors, words, freq)
        return features, labels

    # Generate all training sentence features
    trainFeatures, trainLabels = buildFeatures(trainset)

    # SVD on the training set only; u is the first singular vector
    svd = TruncatedSVD(n_components=1, n_iter=5, random_state=0)
    u = svd.fit(trainFeatures).components_[0]

    def removeFirstComponent(features):
        """Subtract from every row its projection onto u (all rows at once)."""
        return features - np.outer(features.dot(u), u)

    trainFeatures = removeFirstComponent(trainFeatures)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    devFeatures, devLabels = buildFeatures(devset)
    devFeatures = removeFirstComponent(devFeatures)

    # Prepare test set features
    testset = dataset.getTestSentences()
    testFeatures, testLabels = buildFeatures(testset)
    testFeatures = removeFirstComponent(testFeatures)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_sif_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_sif_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_sif_dev_pred.txt")
import random import numpy as np from utils.treebank import StanfordSentiment import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt import time from q3_word2vec import * from q3_sgd import * # Reset the random seed to make sure that everyone gets the same results random.seed(314) dataset = StanfordSentiment() tokens = dataset.tokens() nWords = len(tokens) # We are going to train 10-dimensional vectors for this assignment dimVectors = 10 # Context size C = 5 # Reset the random seed to make sure that everyone gets the same results random.seed(31415) np.random.seed(9265) startTime=time.time() wordVectors = np.concatenate( ((np.random.rand(nWords, dimVectors) - 0.5) /
def get_glove_data():
    """Build sentence features for the thread data and split them 60/20/20.

    Texts and labels are loaded with load_data_and_labels_bow(); each text is
    converted to a feature vector by getSentenceFeatures() using the vectors
    from glove.loadWordVectors().  Texts for which getSentenceFeatures()
    returns None are dropped together with their labels.  The remaining rows
    are shuffled with a fixed NumPy seed (this reseeds the global NumPy
    generator) and sliced into train, dev and test partitions.

    Returns:
        A 6-tuple ``(train_x, dev_x, test_x, train_y, dev_y, test_y)``.
    """
    embedding_dimension = 100
    # Cumulative split points: [0, 0.6) train, [0.6, 0.8) dev, [0.8, 1] test.
    train_end_fraction = 0.6
    dev_end_fraction = 0.8

    x_text, y = load_data_and_labels_bow("thread_content.npy", "thread_labels.npy")

    dataset = StanfordSentiment()
    tokens = dataset.tokens()

    # Initialize word vectors with glove.
    embedded_vectors = glove.loadWordVectors(tokens)
    print("The shape of embedding matrix is:")
    print(embedded_vectors.shape)

    # One row per text, one column per embedding dimension.
    trainFeatures = np.zeros((len(x_text), embedding_dimension))
    toRemove = []  # indices of texts that produced no feature vector
    for i, words in enumerate(x_text):
        sentenceFeatures = getSentenceFeatures(tokens, embedded_vectors, words)
        if sentenceFeatures is None:
            toRemove.append(i)
        else:
            trainFeatures[i, :] = sentenceFeatures

    # Drop the rows (and labels) that could not be featurized.
    print(len(toRemove))
    y = np.delete(y, toRemove, axis=0)
    trainFeatures = np.delete(trainFeatures, toRemove, axis=0)

    # Randomly shuffle data
    np.random.seed(10)
    # A random ordering of the indices 0 .. len(y) - 1.
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = trainFeatures[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    total = len(x_shuffled)
    train_cutoff = int(train_end_fraction * total)
    dev_cutoff = int(dev_end_fraction * total)

    # train x, dev x, test x, train y, dev y, test y
    return (
        x_shuffled[:train_cutoff],
        x_shuffled[train_cutoff:dev_cutoff],
        x_shuffled[dev_cutoff:total],
        y_shuffled[:train_cutoff],
        y_shuffled[train_cutoff:dev_cutoff],
        y_shuffled[dev_cutoff:total],
    )
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
# Select the 'agg' backend before pyplot is imported.
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
# NOTE(review): `random` is not imported in the visible part of this script;
# confirm it is imported earlier in the file.
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Number of columns of the word-vector matrix created below.
dimVectors = 1024
# NOTE(review): duplicates SPARSENESS below and is not read in the visible
# code; confirm whether it can be removed.
sparseness = 0.03

# Context size
C = 5

SPARSENESS = 0.03
# NOTE(review): LAMBDA and GAMMA are not used in the visible code; their
# meaning cannot be determined from here.
LAMBDA = 0.05
GAMMA = 0.05

startTime=time.time()
# Uniform samples from [0, 1) shifted down by (1 - SPARSENESS), so every entry
# lies in [-(1 - SPARSENESS), SPARSENESS) and only a small fraction is positive.
wordVectors = np.random.rand(nWords, dimVectors) - (1 - SPARSENESS)
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()

# Re-key the token table so that every str key becomes latin1-encoded bytes.
# A new dict is built rather than popping and inserting keys while iterating
# over .items(), which may raise RuntimeError or skip entries.
tokens_encoded = {
    (k.encode('latin1') if isinstance(k, str) else k): v
    for k, v in dataset.tokens().items()
}
# Decode every key back to text so lookups use plain str keys.
tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())
print("SCREEN ({})".format(tokens['screenwriter']))
#print("SCRBBB ({})".format(tokens[b'screenwriter']))
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5
import random import numpy as np from utils.treebank import StanfordSentiment import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt import time import argparse from sgd import * from word2vec import * random.seed(314) dataset = StanfordSentiment() word2Ind = dataset.tokens() nWords = len(word2Ind) def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset, windowSize, word2vecLossAndGradient=naiveSoftmaxLossAndGradient): batchsize = 50 loss = 0.0 grad = np.zeros(wordVectors.shape) N = wordVectors.shape[0] centerWordVectors = wordVectors[:int(N/2),:] outsideVectors = wordVectors[int(N/2):,:] for i in range(batchsize): windowSize1 = random.randint(1, windowSize) centerWord, context = dataset.getRandomContext(windowSize1) c, gin, gout = word2vecModel( centerWord, windowSize1, context, word2Ind, centerWordVectors,
def run():
    """Train word vectors with sgd() and save a 2-D plot of selected words.

    The function seeds the random generators, builds the initial
    (2 * nWords, dimVectors) matrix, optimizes it with sgd() using
    word2vec_sgd_wrapper() configured with `skipgram` and
    `negSamplingCostAndGradient`, prints timing information, projects the
    vectors of a fixed word list onto two SVD directions of their covariance
    and saves the resulting figure as 'q3_word_vectors.png'.
    """
    ### Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    ### We are going to train 10-dimensional vectors for this assignment
    dimVectors = 10

    ### The maximum half context size
    C = 5

    ### Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)

    ### Start the clock when we begin to train this model
    startTime = time.time()

    ### The initial point to start SGD from: a small random block stacked on
    ### top of an all-zero block of the same shape
    randomBlock = (np.random.rand(nWords, dimVectors) - 0.5) / dimVectors
    zeroBlock = np.zeros((nWords, dimVectors))
    wordVectors = np.concatenate((randomBlock, zeroBlock), axis=0)

    ### Call the sgd function to train our model
    def costAndGradient(vec):
        return word2vec_sgd_wrapper(
            skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient)

    wordVectors = sgd(costAndGradient, wordVectors, 0.3, 40000, None, True,
                      PRINT_EVERY=10)
    ### Note that normalization is not called here. This is not a bug,
    ### normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")
    elapsed = time.time() - startTime
    print("training took %d seconds" % elapsed)

    ### Concatenate the input and output word vectors
    ### (with axis=0 this re-stacks the two halves in their existing order)
    upperHalf = wordVectors[:nWords, :]
    lowerHalf = wordVectors[nWords:, :]
    wordVectors = np.concatenate((upperHalf, lowerHalf), axis=0)
    ### wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

    ### Visualize word embeddings
    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
        "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
        "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb",
        "annoying"
    ]
    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]

    ### Center the selected vectors and project them onto the first two
    ### left singular vectors of their covariance matrix
    centered = visualizeVecs - visualizeVecs.mean(axis=0)
    covariance = 1.0 / len(visualizeIdx) * centered.T.dot(centered)
    U, _, _ = np.linalg.svd(covariance)
    coord = centered.dot(U[:, 0:2])

    for (x, y), word in zip(coord, visualizeWords):
        plt.text(x, y, word, bbox={'facecolor': 'green', 'alpha': 0.1})

    xs = coord[:, 0]
    ys = coord[:, 1]
    plt.xlim((xs.min(), xs.max()))
    plt.ylim((ys.min(), ys.max()))

    plt.savefig('q3_word_vectors.png')