def main(args):
    """Train sentiment classifiers over a range of regularization values.

    Sentence features are built from word vectors, one LogisticRegression
    model is fitted per value returned by ``getRegularizationValues()``, and
    the train/dev/test accuracies of every model are printed. The model
    picked by ``chooseBestModel`` is then reported and analysed on the dev
    set.

    Args:
        args: parsed command-line options. ``args.yourvectors`` selects the
            vectors returned by ``load_saved_params()``; ``args.pretrained``
            selects vectors loaded through ``glove.loadWordVectors``.

    Raises:
        ValueError: if neither ``args.yourvectors`` nor ``args.pretrained``
            is set, since there would be no word vectors to work with.
    """

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        # NOTE(review): the axis argument was missing from the truncated
        # original. axis=1 (the two halves side by side, one row per word) is
        # assumed, because axis=0 would simply rebuild the same matrix.
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    else:
        raise ValueError(
            "either args.yourvectors or args.pretrained must be set")

    # Build the feature matrix and label vector of every split
    trainset = dataset.getTrainSentences()
    trainFeatures, trainLabels = _buildFeatures(trainset, tokens, wordVectors)

    devset = dataset.getDevSentences()
    devFeatures, devLabels = _buildFeatures(devset, tokens, wordVectors)

    testset = dataset.getTestSentences()
    testFeatures, testLabels = _buildFeatures(testset, tokens, wordVectors)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    # NOTE(review): the output-file arguments of the calls below and the
    # header of the second branch were missing from the truncated original.
    # The names follow the visible "q4_reg_v_acc*.png" pattern; confirm them.
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
    elif args.yourvectors:
        # plotRegVsAccuracy(regValues, results, "q4_reg_v_acc_your.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf_your.png")


def _buildFeatures(sentences, tokens, wordVectors):
    """Return the (features, labels) arrays for a list of (words, label) pairs.

    Row ``i`` of ``features`` is ``getSentenceFeatures`` applied to the words
    of sentence ``i``; ``labels`` is an int32 vector holding its label.
    """
    nSentences = len(sentences)
    features = np.zeros((nSentences, wordVectors.shape[1]))
    labels = np.zeros((nSentences,), dtype=np.int32)
    for i, (words, label) in enumerate(sentences):
        labels[i] = label
        features[i, :] = getSentenceFeatures(tokens, wordVectors, words)
    return features, labels
def get_glove_data():
  """Build GloVe sentence features for the thread data and split them.

  Texts and labels are loaded with ``load_data_and_labels_bow`` from
  "thread_content.npy" and "thread_labels.npy". Every text is turned into a
  feature vector by ``getSentenceFeatures`` over the GloVe vectors of the
  StanfordSentiment vocabulary. Texts for which ``getSentenceFeatures``
  returns ``None`` are dropped together with their labels. The remaining
  rows are shuffled and split 60% / 20% / 20%.

  Returns:
    A 6-tuple ``(train_x, dev_x, test_x, train_y, dev_y, test_y)``.
  """
  x_text, y = load_data_and_labels_bow("thread_content.npy", "thread_labels.npy")

  dataset = StanfordSentiment()
  tokens = dataset.tokens()

  # Initialize word vectors with glove.
  embedded_vectors = glove.loadWordVectors(tokens)
  print("The shape of embedding matrix is:")
  print(embedded_vectors.shape)  # (rows, embedding dimension)

  # Take the feature width from the loaded vectors rather than hard-coding
  # it, so vectors of any dimensionality work.
  embedding_dimension = embedded_vectors.shape[1]

  # NOTE: the extra hand-crafted features (word-count bucket, recipient
  # counts) were disabled in the original code; only the sentence features
  # are stored.
  nTrain = len(x_text)
  trainFeatures = np.zeros((nTrain, embedding_dimension))
  toRemove = []
  for i, words in enumerate(x_text):
    sentenceFeatures = getSentenceFeatures(tokens, embedded_vectors, words)
    if sentenceFeatures is None:
      # No usable features for this text: mark the row for removal below.
      toRemove.append(i)
      continue
    trainFeatures[i, :] = sentenceFeatures

  # Drop the rows (and matching labels) that had no features.
  y = np.delete(y, toRemove, axis=0)
  trainFeatures = np.delete(trainFeatures, toRemove, axis=0)

  # Randomly shuffle data
  shuffle_indices = np.random.permutation(len(y))  # Random ordering of 0 .. len(y) - 1.
  x_shuffled = trainFeatures[shuffle_indices]
  y_shuffled = y[shuffle_indices]

  # 60% train, 20% dev, 20% test.
  train_cutoff = int(0.6 * len(x_shuffled))
  dev_cutoff = int(0.8 * len(x_shuffled))
  # train x, dev x, test x, train y, dev y, test y
  return (x_shuffled[:train_cutoff],
          x_shuffled[train_cutoff:dev_cutoff],
          x_shuffled[dev_cutoff:],
          y_shuffled[:train_cutoff],
          y_shuffled[train_cutoff:dev_cutoff],
          y_shuffled[dev_cutoff:])
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
import matplotlib.pyplot as plt
import time

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Load the dataset and its vocabulary
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

dimVectors = 1024
# The initialisation below referred to SPARSENESS while only the lower-case
# name was defined, which raised NameError at import time. Both spellings are
# provided so any other reference to either name keeps working.
SPARSENESS = 0.03
sparseness = SPARSENESS

# Context size
C = 5
LAMBDA = 0.05
GAMMA = 0.05

# Uniform samples from [0, 1) shifted down by (1 - SPARSENESS), so only about
# a SPARSENESS fraction of the entries ends up positive.
wordVectors = np.random.rand(nWords, dimVectors) - (1 - SPARSENESS)

def run():
    """Train skip-gram word vectors with SGD and plot a 2-D projection.

    Loads the StanfordSentiment vocabulary, initialises a (2 * nWords,
    dimVectors) matrix, optimises it with ``sgd`` on the skip-gram /
    negative-sampling objective, and places a fixed list of words on the
    current matplotlib axes. Their coordinates are the projection of the
    centred vectors onto the first two singular directions of their
    covariance.
    """
    ### Here is the main body of this file. We initialize the model and clean up the dataset
    ### NOTE(review): the original comments announced a random-seed reset, but
    ### no seeding code is present, so results are not reproducible run to run.
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    ### We are going to train 10-dimensional vectors for this assignment
    dimVectors = 10

    ### The maximum half context size
    C = 5

    ### Start the clock when we begin to train this model
    startTime = time.time()

    ### The initial point to start SGD from: small random values in the first
    ### nWords rows and zeros in the last nWords rows
    wordVectors = np.concatenate(
        ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
         np.zeros((nWords, dimVectors))),
        axis=0)

    ### Call the sgd function to train our model,
    ### NOTE(review): every argument after the objective was missing from the
    ### truncated original. The start point follows from the comment above;
    ### the step size, iteration count and remaining options are assumed
    ### values and must be checked against the signature of sgd.
    wordVectors = sgd(
        lambda vec: word2vec_sgd_wrapper(
            skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient),
        wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)

    ### Note that normalization is not called here. This is not a bug,
    ### normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")
    print("training took %d seconds" % (time.time() - startTime))

    ### Concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=0)
    ### wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

    ### Visualize word embeddings
    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--", "good",
        "great", "cool", "brilliant", "wonderful", "well", "amazing", "worth",
        "sweet", "enjoyable", "boring", "bad", "waste", "dumb", "annoying"]

    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]
    ### Centre the vectors and project them onto the first two singular
    ### directions of their covariance matrix
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U, _, _ = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i, word in enumerate(visualizeWords):
        plt.text(coord[i, 0],
                 coord[i, 1],
                 word,
                 bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
