def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
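        # wordVectors stacks the center-word ("input") vectors on top of the
        # outside-word ("output") vectors, so the slices below pick out the two
        # halves and concatenate them feature-wise (doubling the dimensionality).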
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    for i in xrange(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in xrange(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in xrange(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print "Training for reg=%f" % reg
        # Note: add a very small number to regularization to please the library
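        # (sklearn's LogisticRegression takes the *inverse* regularization
        # strength C = 1/lambda, so the 1e-12 keeps the division finite when reg == 0.)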
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print "Train accuracy (%%): %f" % trainAccuracy

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print "Dev accuracy (%%): %f" % devAccuracy

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print "Test accuracy (%%): %f" % testAccuracy

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print ""
    print "=== Recap ==="
    print "Reg\t\tTrain\tDev\tTest"
    for result in results:
        print "%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"])
    print ""

    bestResult = chooseBestModel(results)
    print "Best regularization value: %0.2E" % bestResult["reg"]
    print "Test accuracy (%%): %f" % bestResult["test"]

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
    else:
        # plotRegVsAccuracy(regValues, results, "q4_reg_v_acc_your.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf_your.png")
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
Example #4
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
from word2vec import *
from sgd import *

#check python version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

#Reset the random seed to make sure that everyone gets the same results
random.seed(314)
datasets = StanfordSentiment()
tokens = datasets.tokens()
nWords = len(tokens)

#We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

#Context size
C = 5

#Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
Example #5
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()  # presumably the word-to-index vocabulary
nWords = len(tokens)  # vocabulary size?

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10  # 10-dimensional word vectors

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
# two vocabulary-sized matrices concatenated together: a (2 * vocab size) x (vector dimension) matrix
wordVectors = np.concatenate(
Example #6
def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    
    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)

    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    
    # frequency counting (assumes: from collections import Counter)
    freq = Counter()
    Sum = 0
    for sen in trainset:
        for word in sen[0]:
            Sum += 1
            freq[word] += 1
    for word, tf in freq.items():
        freq[word] = tf / Sum
    
    #generate all sentence features
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)
    # SVD on the training features (assumes: from sklearn.decomposition import TruncatedSVD)
    svd = TruncatedSVD(n_components=1, n_iter=5, random_state=0)
    u = svd.fit(trainFeatures).components_[0]  # the first singular vector
    # remove the projections of the sentence embeddings onto their first principal component
    for i in range(trainFeatures.shape[0]):
        trainFeatures[i] = trainFeatures[i] - np.dot(trainFeatures[i], u.T) * u
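    # (This projection removal mirrors the common-component removal step of the
    # SIF sentence-embedding baseline of Arora et al., 2017.)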
    
    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq) 
    for i in range(devFeatures.shape[0]):
        devFeatures[i] = devFeatures[i] - np.dot(devFeatures[i], u.T) * u
            
    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)
    for i in range(testFeatures.shape[0]):
        testFeatures[i] = testFeatures[i] - np.dot(testFeatures[i], u.T) * u
            
    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print ("Best regularization value: %0.2E" % bestResult["reg"])
    print ("Test accuracy (%%): %f" % bestResult["test"])
    
    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_sif_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_sif_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_sif_dev_pred.txt")
Example #7
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime=time.time()
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) /
def get_glove_data():
  embedding_dimension = 100
  x_text, y = load_data_and_labels_bow("thread_content.npy", "thread_labels.npy")
  # num_recipients_features = np.array(np.load("num_recipients_features_nodup.npy"))
  #
  # avgNumRecipients = np.array(np.load("avg_num_recipients.npy"))
  # avgNumTokensPerEmail = np.array(np.load("avg_num_tokens_per_email.npy"))

  dataset = StanfordSentiment()
  tokens = dataset.tokens()
  nWords = len(tokens)

  # Initialize word vectors with glove.
  embedded_vectors = glove.loadWordVectors(tokens)
  print("The shape of embedding matrix is:")
  print(embedded_vectors.shape) # Should be number of e-mails, number of embeddings

  nTrain = len(x_text)
  trainFeatures = np.zeros((nTrain, embedding_dimension)) # extra feature slots (e.g. the word-count bucket) are disabled below, so only the embedding is used
  toRemove = []
  for i in xrange(nTrain):
    words = x_text[i]
    num_words = len(words)

    #place number of words in buckets
    if num_words < 10:
        num_words_bucket = 0
    elif num_words >= 10 and num_words < 100:
        num_words_bucket = 1
    elif num_words >= 100 and num_words < 500:
        num_words_bucket = 2
    elif num_words >= 500 and num_words < 1000:
        num_words_bucket = 3
    elif num_words >= 1000 and num_words < 2000:
        num_words_bucket = 4
    elif num_words >= 2000:
        num_words_bucket = 5

    sentenceFeatures = getSentenceFeatures(tokens, embedded_vectors, words)
    if sentenceFeatures is None:
      toRemove.append(i)
    else:
      featureVector = sentenceFeatures
      #num_words = avgNumTokensPerEmail[i]
      #place number of words in buckets
      # if num_words < 10:
      #   num_words_bucket = 0
      # elif num_words >= 10 and num_words < 100:
      #   num_words_bucket = 1
      # elif num_words >= 100 and num_words < 500:
      #   num_words_bucket = 2
      # elif num_words >= 500 and num_words < 1000:
      #   num_words_bucket = 3
      # elif num_words >= 1000 and num_words < 2000:
      #   num_words_bucket = 4
      # elif num_words >= 2000:
      #   num_words_bucket = 5
      # featureVector = np.hstack((featureVector, num_words_bucket))
      #featureVector = np.hstack((featureVector, avgNumRecipients[i]))
      trainFeatures[i, :] = featureVector

  print(len(toRemove))
  y = np.delete(y, toRemove, axis=0)
  trainFeatures = np.delete(trainFeatures, toRemove, axis=0)

  # Randomly shuffle data
  np.random.seed(10)
  shuffle_indices = np.random.permutation(np.arange(len(y)))  # Random permutation of the indices 0..len(y)-1.
  x_shuffled = trainFeatures[shuffle_indices]
  y_shuffled = y[shuffle_indices]

  train = 0.6
  dev = 0.2
  test = 0.2
  # train x, dev x, test x, train y, dev y, test y
  train_cutoff = int(0.6 * len(x_shuffled))
  dev_cutoff = int(0.8 * len(x_shuffled))
  test_cutoff = int(len(x_shuffled))
  return x_shuffled[0:train_cutoff], x_shuffled[train_cutoff:dev_cutoff], x_shuffled[dev_cutoff:test_cutoff], \
         y_shuffled[0:train_cutoff], y_shuffled[train_cutoff:dev_cutoff], y_shuffled[dev_cutoff:test_cutoff],
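The word-count bucketing in get_glove_data is easy to factor into a small helper; a sketch (the helper name is hypothetical, and the thresholds are copied from the if/elif chain above):

def bucket_num_words(num_words):
    # Map a raw word count onto one of six coarse buckets:
    # <10, 10-99, 100-499, 500-999, 1000-1999, >=2000.
    thresholds = [10, 100, 500, 1000, 2000]
    for bucket, limit in enumerate(thresholds):
        if num_words < limit:
            return bucket
    return len(thresholds)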
Example #9
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

dimVectors = 1024
sparseness = 0.03

# Context size
C = 5
SPARSENESS = 0.03
LAMBDA = 0.05
GAMMA = 0.05

startTime=time.time()
wordVectors = np.random.rand(nWords, dimVectors) - (1 - SPARSENESS)

Example #10
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens_encoded = dataset.tokens()
for k, v in list(tokens_encoded.items()):  # copy the items so the dict can be mutated while iterating
    if type(k) == str:
        tokens_encoded.pop(k)
        tokens_encoded[k.encode('latin1')] = v
tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())
print("SCREEN ({})".format(tokens['screenwriter']))
#print("SCRBBB ({})".format(tokens[b'screenwriter']))
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5
Example #11
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
import argparse
from sgd import *
from word2vec import *

random.seed(314)
dataset = StanfordSentiment()
word2Ind = dataset.tokens()
nWords = len(word2Ind)

def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset, 
                         windowSize,
                         word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    batchsize = 50
    loss = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    centerWordVectors = wordVectors[:int(N/2),:]
    outsideVectors = wordVectors[int(N/2):,:]
    for i in range(batchsize):
        windowSize1 = random.randint(1, windowSize)
        centerWord, context = dataset.getRandomContext(windowSize1)

        c, gin, gout = word2vecModel(
            centerWord, windowSize1, context, word2Ind, centerWordVectors,
Example #12
def run():
    ### Here is the main body of this file. We initialize the model and clean up the dataset
    ### Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    ### We are going to train 10-dimensional vectors for this assignment
    dimVectors = 10

    ### The maximum half context size
    C = 5

    ### Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)

    ### Start the clock when we begin to train this model
    startTime = time.time()

    ### The initial point to start SGD from
    wordVectors = np.concatenate(
        ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
         np.zeros((nWords, dimVectors))),
        axis=0)

    ### Call the sgd function to train our model,
    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient),
                      wordVectors,
                      0.3,
                      40000,
                      None,
                      True,
                      PRINT_EVERY=10)

    ### Note that normalization is not called here. This is not a bug,
    ### normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")
    print("training took %d seconds" % (time.time() - startTime))

    ### Concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=0)
    ### wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

    ### Visualize word embeddings
    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--", "good",
        "great", "cool", "brilliant", "wonderful", "well", "amazing", "worth",
        "sweet", "enjoyable", "boring", "bad", "waste", "dumb", "annoying"
    ]

    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i in range(len(visualizeWords)):
        plt.text(coord[i, 0],
                 coord[i, 1],
                 visualizeWords[i],
                 bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

    plt.savefig('q3_word_vectors.png')
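A minimal way to invoke the pipeline above when the file is executed directly (an assumption about how the script is meant to be used):

if __name__ == "__main__":
    run()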