Example #1
def initialize():
    # Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    print(tokens)
    json.dump(tokens, open("tokens.json", "w"))

    nWords = len(tokens)
    print(nWords, "words")

    # We are going to train 50-dimensional vectors for this assignment
    dimVectors = 50
    EPOCH = 100

    # Context size
    C = 5
    # Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)
    in_glove = 0
    wordVectors = np.zeros((2 * nWords, dimVectors))

    for i in range(0, nWords):
        if list(tokens.keys())[i] in wv_from_bin.vocab.keys():
            wordVectors[i] = np.array(
                wv_from_bin.word_vec(list(tokens.keys())[i]))
            in_glove += 1
        else:
            wordVectors[i] = (np.random.rand(1, dimVectors) - 0.5) / dimVectors

    for i in range(nWords, 2 * nWords):
        if list(tokens.keys())[i - nWords] in wv_from_bin.vocab.keys():
            wordVectors[i] = np.array(
                wv_from_bin.word_vec(list(tokens.keys())[i - nWords]))

    print(wordVectors)
    print(in_glove, " in GloVe")

    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingLossAndGradient),
                      wordVectors,
                      0.3,
                      EPOCH,
                      None,
                      True,
                      PRINT_EVERY=1)
    # Note that normalization is not called here. This is not a bug,
    # normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")

    # concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=0)
    print(wordVectors.shape)
    # %%
    np.save("wordVectors", wordVectors)
def run():
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'),v) for k,v in tokens_encoded.items())
    V,D = len(tokens),10
    random.seed(319)
    np.random.seed(419)
    vectors = np.concatenate((np.random.randn(V,D), np.zeros((V,D))), axis=0)
    vectors = sgd(lambda vecs: sgd_wrapper(tokens_encoded, vecs, 7, dataset), vectors, 4001, 3e-1)
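The latin1 re-encoding idiom above (snapshotting `items()` so the token dict can be rebuilt with bytes keys) reappears in several later examples. A small helper along these lines captures it once; this is an illustrative sketch, not code from the original repositories.

# Illustrative helper (not from the original examples): normalize a token dict
# whose keys may be a mix of str and latin1-encoded bytes.
def normalize_tokens(tokens_encoded):
    for k, v in list(tokens_encoded.items()):   # snapshot so the dict can be mutated
        if isinstance(k, str):
            del tokens_encoded[k]
            tokens_encoded[k.encode('latin1')] = v
    # decoded view, convenient for word lookups
    return {k.decode('latin1'): v for k, v in tokens_encoded.items()}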
Example #3
def main(args):
    """ Train a model to do sentiment analyis"""
    
    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)
    
    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
                                     (wordVectors[:nWords,:], wordVectors[nWords:,:]),
                                     axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]
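Note the axis of the concatenation: `axis=1` places each word's input and output vectors side by side, doubling the feature dimension, whereas `axis=0` (as in Example #1) merely restacks the same `(2 * nWords, d)` array. A tiny shape check with made-up sizes illustrates the difference:

# Hypothetical sizes, only to show what each axis choice produces.
import numpy as np
nWords, d = 4, 3
wv = np.arange(2 * nWords * d, dtype=float).reshape(2 * nWords, d)
side_by_side = np.concatenate((wv[:nWords, :], wv[nWords:, :]), axis=1)
restacked = np.concatenate((wv[:nWords, :], wv[nWords:, :]), axis=0)
print(side_by_side.shape)  # (4, 6): input and output vector per word
print(restacked.shape)     # (8, 3): same layout as the original array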
Example #4
def run():
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    # Train 10-dimensional vectors
    dimVectors = 10

    # Context size
    C = 5

    random.seed(31415)
    np.random.seed(9265)

    startTime = time.time()
    wordVectors = np.concatenate(((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors, np.zeros((nWords, dimVectors))), axis=0)
    wordVectors = sgd(lambda vec: test_word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient), wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)

    print "Sanity check: cost at convergence should be around or below 10"
    print "Training took %d seconds" % (time.time() - startTime)

    # Concatenate the input and output word vectors
    wordVectors = np.concatenate((wordVectors[:nWords,:], wordVectors[nWords:,:]), axis=0)
    # wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
        "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
        "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb",
        "annoying"]

    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U,S,V = np.linalg.svd(covariance)
    coord = temp.dot(U[:,0:2])

    for i in xrange(len(visualizeWords)):
        plt.text(coord[i,0], coord[i,1], visualizeWords[i], bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
    plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))

    plt.savefig('q3_word_vectors.png') # Save a visualization for the word vectors
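The visualization above projects the centered word vectors onto their top two principal components, obtained from the SVD of the covariance matrix. The same projection (up to sign flips of the axes) can be read directly off the SVD of the centered data; a small self-contained sketch:

# 2-D PCA projection via the SVD of the centered data itself; the rows of Vt
# are the principal directions, so this matches temp.dot(U[:, 0:2]) above
# up to the sign of each axis.
import numpy as np
vecs = np.random.randn(25, 10)          # stand-in for visualizeVecs
temp = vecs - np.mean(vecs, axis=0)
_, _, Vt = np.linalg.svd(temp, full_matrices=False)
coord = temp.dot(Vt[:2].T)
print(coord.shape)                      # (25, 2)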
def run():
    random.seed(319)
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())
    V, D = len(tokens), 10
    random.seed(31919)
    np.random.seed(41717)
    vectors = np.concatenate((np.random.randn(V, D), np.zeros((V, D))), axis=0)
    start_time = time.time()
    vectors = sgd(
        lambda vecs: sgd_wrapper(
            tokens_encoded, vecs, dataset, 5, w2vmodel=skipgram), vectors,
        14001, 3e-1)
    print("w2v run in (%f) seconds" % (time.time() - start_time))
Example #6
def main(args):
    print 80 * "="
    print "INITIALIZING"
    print 80 * "="
    dataset = StanfordSentiment()
    print "Done, read total %d windows" % dataset.word_count()
    print 80 * "="
    print "TRAINING"
    print 80 * "="
    print "Training %s word vectors" % args.model
    if not os.path.exists(args.vector_path):
        os.makedirs(args.vector_path)

    if args.model == 'word2vec':
        word_vectors = word2vec_model(args, dataset)
    else:
        # glove model
        vocab = dataset.tokens()
        word_freq = dataset.tokenfreq()
        cooccur = build_cooccur(vocab, word_freq, dataset, window_size=10)
        word_vectors = train_glove(vocab, cooccur, args.vector_size, args.vector_path, iterations=args.iterations)
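`build_cooccur` and `train_glove` belong to this example's own glove module and are not shown on this page. Conceptually, building the co-occurrence statistics means counting, for each token, how often every other token appears inside the window; the sketch below only illustrates that idea, with names and structure that are assumptions rather than the actual module.

# Minimal sketch of window-based co-occurrence counting (illustrative only).
from collections import defaultdict

def cooccurrence_counts(sentences, vocab, window_size=10):
    counts = defaultdict(float)
    for sent in sentences:
        ids = [vocab[w] for w in sent if w in vocab]
        for i, center in enumerate(ids):
            lo = max(0, i - window_size)
            for j in range(lo, i):
                # closer context words count more (1/distance), as in GloVe
                counts[(center, ids[j])] += 1.0 / (i - j)
                counts[(ids[j], center)] += 1.0 / (i - j)
    return counts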
def run():
    random.seed(319)
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())
    V, D = len(tokens), 10
    random.seed(31919)
    np.random.seed(419)
    vectors = np.concatenate((np.random.randn(V, D), np.zeros((V, D))), axis=0)
    st = time.time()
    vectors = sgd(
        lambda vecs: sgd_wrapper(tokens_encoded,
                                 vecs,
                                 5,
                                 dataset,
                                 w2vModel=skipgram,
                                 w2vCAG=negSamplingCAG), vectors, 5001, 3e-1)
    print("run-sgd finished in (%f) seconds" % (time.time() - st))
Example #8
def do_train(args):
    # Set up some parameters.
    config = Config(args)

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.vector == "yourvectors":
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.vector == "pretrained":
        wordVectors = glove.loadWordVectors(tokens)

    # Load the train set
    trainset = dataset.getTrainSentences()
    train_max_length, train, train_raw = word2index(tokens, trainset)
    print(train_raw[0])
    print(train[0])

    # Prepare dev set features
    devset = dataset.getDevSentences()
    _, dev, dev_raw = word2index(tokens, devset)

    # Prepare test set features
    testset = dataset.getTestSentences()
    _, test, test_raw = word2index(tokens, testset)

    config.max_length = train_max_length
    config.embed_size = wordVectors.shape[1]

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    report = None  #Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...", )
        start = time.time()
        model = RNNModel(config, wordVectors, tokens)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)

            # do some error analysis
            if args.vector == "pretrained":
                y_true, preds = model.output(session, dev_raw)
                outputConfusionMatrix(preds, y_true, "q5_dev_conf.png")
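`word2index` is a helper from this example's repository and is not reproduced here. Judging only from its call sites, it converts each `(words, label)` pair into a list of token indices, tracks the longest sentence, and keeps the raw sentences around for error analysis. A rough, hypothetical sketch under those assumptions:

# Hypothetical sketch of word2index, inferred from how it is called above.
def word2index_sketch(tokens, sentences, unk="UNK"):
    data, max_len = [], 0
    for words, label in sentences:
        idxs = [tokens.get(w, tokens.get(unk, 0)) for w in words]
        max_len = max(max_len, len(idxs))
        data.append((idxs, label))
    return max_len, data, sentences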
def run():
    random.seed(319)
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())
    V, D = len(tokens), 10
    random.seed(31919)
    np.random.seed(41717)
    vectors = np.concatenate((np.random.randn(V, D), np.zeros((V, D))), axis=0)
    start_time = time.time()
    vectors = sgd(
        lambda vecs: sgd_wrapper(
            tokens_encoded, vecs, dataset, 5, w2vmodel=skipgram), vectors,
        24001, 3e-1)
    print("w2v run in (%f) seconds" % (time.time() - start_time))
    visualize_words = [
        'smart', 'dumb', 'tall', 'short', 'good', 'bad', 'king', 'queen',
        'man', 'woman'
    ]
    visualize_indices = [tokens[w] for w in visualize_words]
    visualize_vecs = vectors[visualize_indices, :]
    temp = (visualize_vecs - np.mean(visualize_vecs, axis=0))
    covariance = 1.0 / len(visualize_indices) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])
    for i in range(len(visualize_words)):
        plt.text(coord[i, 0],
                 coord[i, 1],
                 visualize_words[i],
                 bbox=dict(facecolor='green', alpha=0.1))
    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))
    plt.savefig('q3_word_vectors.png')
def run():
    random.seed(319)
    dataset = StanfordSentiment()
    tokens_encoded = dataset.tokens()
    for k, v in list(tokens_encoded.items()):
        if type(k) == str:
            tokens_encoded.pop(k)
            tokens_encoded[k.encode('latin1')] = v
    tokens = dict((k.decode('latin1'), v) for k, v in tokens_encoded.items())
    nWords = len(tokens)
    dimVectors = 10
    C = 5
    random.seed(31919)
    np.random.seed(41717)
    start_time = time.time()
    vectors = np.concatenate(
        (np.random.randn(nWords, dimVectors), np.zeros((nWords, dimVectors))),
        axis=0)
    vectors, cost = sgd(
        lambda vecs: sgd_wrapper(
            tokens_encoded, vecs, C, dataset, soc=skipgram), vectors, 40000,
        3e-1)
    print("SGD finished in ({}) seconds with cost ({})".format(
        time.time() - start_time, cost))
def main(args):
	dataset = StanfordSentiment()
	tokens = dataset.tokens()
	nWords = len(tokens)

	if args.yourvectors:
		_, wordVectors, _ = load_saved_params()
		wordVectors = np.concatenate(
			(wordVectors[:nWords,:], wordVectors[nWords:,:]),
			axis=1)
	elif args.pretrained:
		wordVectors = glove.loadWordVectors(tokens)
	dimVectors = wordVectors.shape[1]
	print dimVectors

	trainset = dataset.getTrainSentences()
	nTrain = len(trainset)
	trainFeatures = np.zeros((nTrain, dimVectors))
	trainLabels = np.zeros((nTrain,), dtype=np.int32)
	for i in xrange(nTrain):
		words, trainLabels[i] = trainset[i]
		trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

	# Prepare dev set features
	devset = dataset.getDevSentences()
	nDev = len(devset)
	devFeatures = np.zeros((nDev, dimVectors))
	devLabels = np.zeros((nDev,), dtype=np.int32)
	for i in xrange(nDev):
		words, devLabels[i] = devset[i]
		devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

	# Prepare test set features
	testset = dataset.getTestSentences()
	nTest = len(testset)
	testFeatures = np.zeros((nTest, dimVectors))
	testLabels = np.zeros((nTest,), dtype=np.int32)
	for i in xrange(nTest):
		words, testLabels[i] = testset[i]
		testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)
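`getSentenceFeatures` is defined elsewhere in these assignments; the usual implementation averages the vectors of the sentence's in-vocabulary words. A hedged sketch of that behaviour:

# Sketch of a typical getSentenceFeatures: average the word vectors of the
# words found in the vocabulary (assumption; the actual helper may treat
# missing words differently).
import numpy as np

def getSentenceFeatures_sketch(tokens, wordVectors, words):
    sentVector = np.zeros((wordVectors.shape[1],))
    known = [w for w in words if w in tokens]
    for w in known:
        sentVector += wordVectors[tokens[w], :]
    if known:
        sentVector /= len(known)
    return sentVector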
Example #12
import random
import sys
import time

import numpy as np
# Assumed import path for StanfordSentiment, matching the other examples on this page.
from utils.treebank import StanfordSentiment
from utils.utils import get_relative_path
from loguru import logger
from knn import run_knn
from matplotlib import use as use_matplotlib
from typing import Dict, cast, List

use_matplotlib('agg')


# Check Python Version
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment(
    path=get_relative_path('data/stanfordSentimentTreebank'))
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *
from itertools import islice


def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
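

# Illustrative usage of take(): take(3, range(10)) returns [0, 1, 2].
print(take(3, range(10)))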


# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens_encoded = dataset.tokens()
strcnt, bytcnt, othcnt, unks = 0, 0, 0, 0
for k, v in list(tokens_encoded.items()):
    if type(k) == str:
        strcnt += 1
        print("the string is (%s)" % (k))
        tokens_encoded.pop(k)
        tokens_encoded[k.encode('latin1')] = v
    elif type(k) == bytes:
        bytcnt += 1
        if k == b'unk': print("UNKUNKUNKUNKUNKUNKUNKUNKUNKUNKUNK")
    else: othcnt += 1
print("str(%d)byt(%d)oth(%d)" % (strcnt, bytcnt, othcnt))
tokens = dict((k.decode('latin1'), v) for (k, v) in tokens_encoded.items())
nWords = len(tokens)
Example #14
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment

dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# print(dataset.type)
print(dataset.getRandomContext(5))
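`getRandomContext(C)` returns a pair of a sampled center word and a list of context words, which is how the word2vec wrappers further down this page (e.g. Example #23) unpack it. A small illustrative call:

# Unpack the (center word, context words) pair instead of printing the tuple.
centerWord, context = dataset.getRandomContext(5)
print(centerWord, context)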
Example #15
import codecs
import _pickle as pickle
from q3_sgd import load_saved_params
path = "utils/datasets/stanfordSentimentTreebank"
from utils.treebank import StanfordSentiment

dataset = StanfordSentiment()
sentences = dataset.sentences()
sentence = [codecs.decode(word, 'latin1') for word in sentences[0]]
" ".join(sentence)

dictionary = dict()
phrases = 0
with open(path + "/dictionary.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        splitted = line.split("|")
        dictionary[splitted[0].lower()] = int(splitted[1])
        phrases += 1

# sentences = []

# with open(path + "/datasetSentences.txt", "r", encoding='utf-8') as f:
#     for line in f:
#
#         splitted = line.strip().split()[1:]
#         # print(splitted)
#         # Deal with some peculiar encoding issues with this file
#
Example #16
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime=time.time()
wordVectors = np.concatenate(
Example #17
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
from word2vec import *
from sgd import *

#check python version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

#Reset the random seed to make sure that everyone gets the same results
random.seed(314)
datasets = StanfordSentiment()
tokens = datasets.tokens()
nWords = len(tokens)

#We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

#Context size
C = 5

#Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
Example #18
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
dataset.sentences()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
Example #19
def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    
    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)

    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    
    #frequency counting
    freq = Counter()
    Sum = 0
    for sen in trainset:
        for word in sen[0]:
            Sum += 1
            freq[word]+=1
    for word,tf in freq.items():
        freq[word] = tf/Sum
    
    #generate all sentence features
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)
    #svd in training set
    svd = TruncatedSVD(n_components=1, n_iter=5, random_state=0)
    u = svd.fit(trainFeatures).components_[0] # the first singular vector
    # remove the projections of the sentence embeddings to their first principal component
    for i in range(trainFeatures.shape[0]):
        trainFeatures[i] = trainFeatures[i] - np.dot(trainFeatures[i],u.T) * u
    
    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq) 
    for i in range(devFeatures.shape[0]):
        devFeatures[i] = devFeatures[i] - np.dot(devFeatures[i], u.T) * u
            
    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)
    for i in range(testFeatures.shape[0]):
        testFeatures[i] = testFeatures[i] - np.dot(testFeatures[i], u.T) * u
            
    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print ("")
    print ("=== Recap ===")
    print ("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print ("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print ("")

    bestResult = chooseBestModel(results)
    print ("Best regularization value: %0.2E" % bestResult["reg"])
    print ("Test accuracy (%%): %f" % bestResult["test"])
    
    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_sif_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_sif_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_sif_dev_pred.txt")
def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    for i in xrange(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in xrange(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in xrange(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print "Training for reg=%f" % reg
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print "Train accuracy (%%): %f" % trainAccuracy

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print "Dev accuracy (%%): %f" % devAccuracy

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print "Test accuracy (%%): %f" % testAccuracy

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print ""
    print "=== Recap ==="
    print "Reg\t\tTrain\tDev\tTest"
    for result in results:
        print "%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"])
    print ""

    bestResult = chooseBestModel(results)
    print "Best regularization value: %0.2E" % bestResult["reg"]
    print "Test accuracy (%%): %f" % bestResult["test"]

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
    else:
        # plotRegVsAccuracy(regValues, results, "q4_reg_v_acc_your.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf_your.png")
Example #21
#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(
Example #22
def run():
    ### Here is the main body of this file. We initialize the model and clean up the dataset
    ### Reset the random seed to make sure that everyone gets the same results
    random.seed(314)
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    ### We are going to train 10-dimensional vectors for this assignment
    dimVectors = 10

    ### The maximum half context size
    C = 5

    ### Reset the random seed to make sure that everyone gets the same results
    random.seed(31415)
    np.random.seed(9265)

    ### Start the clock when we begin to train this model
    startTime = time.time()

    ### The initial point to start SGD from
    wordVectors = np.concatenate(
        ((np.random.rand(nWords, dimVectors) - 0.5) / dimVectors,
         np.zeros((nWords, dimVectors))),
        axis=0)

    ### Call the sgd function to train our model,
    wordVectors = sgd(lambda vec: word2vec_sgd_wrapper(
        skipgram, tokens, vec, dataset, C, negSamplingCostAndGradient),
                      wordVectors,
                      0.3,
                      40000,
                      None,
                      True,
                      PRINT_EVERY=10)

    ### Note that normalization is not called here. This is not a bug,
    ### normalizing during training loses the notion of length.

    print("sanity check: cost at convergence should be around or below 10")
    print("training took %d seconds" % (time.time() - startTime))

    ### Concatenate the input and output word vectors
    wordVectors = np.concatenate(
        (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=0)
    ### wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

    ### Visualize word embeddings
    visualizeWords = [
        "the", "a", "an", ",", ".", "?", "!", "``", "''", "--", "good",
        "great", "cool", "brilliant", "wonderful", "well", "amazing", "worth",
        "sweet", "enjoyable", "boring", "bad", "waste", "dumb", "annoying"
    ]

    visualizeIdx = [tokens[word] for word in visualizeWords]
    visualizeVecs = wordVectors[visualizeIdx, :]
    temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
    covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i in range(len(visualizeWords)):
        plt.text(coord[i, 0],
                 coord[i, 1],
                 visualizeWords[i],
                 bbox=dict(facecolor='green', alpha=0.1))

    plt.xlim((np.min(coord[:, 0]), np.max(coord[:, 0])))
    plt.ylim((np.min(coord[:, 1]), np.max(coord[:, 1])))

    plt.savefig('q3_word_vectors.png')
Example #23
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time
import argparse
from sgd import *
from word2vec import *

random.seed(314)
dataset = StanfordSentiment()
word2Ind = dataset.tokens()
nWords = len(word2Ind)

def word2vec_sgd_wrapper(word2vecModel, word2Ind, wordVectors, dataset, 
                         windowSize,
                         word2vecLossAndGradient=naiveSoftmaxLossAndGradient):
    batchsize = 50
    loss = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    centerWordVectors = wordVectors[:int(N/2),:]
    outsideVectors = wordVectors[int(N/2):,:]
    for i in range(batchsize):
        windowSize1 = random.randint(1, windowSize)
        centerWord, context = dataset.getRandomContext(windowSize1)

        c, gin, gout = word2vecModel(
            centerWord, windowSize1, context, word2Ind, centerWordVectors,
Example #24
import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

dimVectors = 1024
sparseness = 0.03

# Context size
C = 5
SPARSENESS = 0.03
LAMBDA = 0.05
GAMMA = 0.05

startTime=time.time()
wordVectors = np.random.rand(nWords, dimVectors) - (1 - SPARSENESS)
Example #25
import numpy as np
from time import time

from utils.treebank import StanfordSentiment
from libNN.network import Word2Vec

# load data sets
dataset = StanfordSentiment()

# model
model = Word2Vec(word_dim=50)

# training
start_time = time()
word_vectors = model.fit(dataset=dataset)

# save
print("Saving word vectors...")
np.save("word_vectors", word_vectors)

print("Training took {0} seconds".format(time() - start_time))
print(word_vectors[0:5])
Example #26
import numpy as np

# Assumed imports, matching the other examples on this page.
from utils.treebank import StanfordSentiment
from q3_sgd import load_saved_params
from q4_softmaxreg import softmaxRegression, getSentenceFeature, accuracy, softmax_wrapper

import seaborn as sns

sns.set(style='whitegrid', context='talk')

# Try different regularizations and pick the best!
# NOTE: fill in one more "your code here" below before running!
REGULARIZATION = None  # Assign a list of floats in the block below
### YOUR CODE HERE
REGULARIZATION = np.logspace(-6, 0.1, 21)
REGULARIZATION = np.hstack([0, REGULARIZATION])
### END YOUR CODE

# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the word vectors we trained earlier
_, wordVectors0, _ = load_saved_params()
N = wordVectors0.shape[0] // 2
#assert nWords == N
wordVectors = (wordVectors0[:N, :] + wordVectors0[N:, :])
dimVectors = wordVectors.shape[1]

# Load the train set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain, ), dtype=np.int32)
Example #27
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from word2vec import *
from sgd import *

# Check Python version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment()  # load the dataset
tokens = dataset.tokens()  # get the token dictionary
nWords = len(tokens)  # number of words

#print("dataset:",dataset)
#print("tokens:",tokens)#tokens是对应的单词和对应的矩阵序列号{'the': 0, 'rock': 1, 'is': 2, 'destined': 3, 'to': 4, 'be': 5, '21st'....这种之类的
#print("nWords",nWords)#一共19539个

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context (sliding window) size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
def get_glove_data():
  embedding_dimension = 100
  x_text, y = load_data_and_labels_bow("thread_content.npy", "thread_labels.npy")
  # num_recipients_features = np.array(np.load("num_recipients_features_nodup.npy"))
  #
  # avgNumRecipients = np.array(np.load("avg_num_recipients.npy"))
  # avgNumTokensPerEmail = np.array(np.load("avg_num_tokens_per_email.npy"))

  dataset = StanfordSentiment()
  tokens = dataset.tokens()
  nWords = len(tokens)

  # Initialize word vectors with glove.
  embedded_vectors = glove.loadWordVectors(tokens)
  print("The shape of embedding matrix is:")
  print(embedded_vectors.shape) # Should be number of e-mails, number of embeddings

  nTrain = len(x_text)
  trainFeatures = np.zeros((nTrain, embedding_dimension))  # extra feature slots are commented out below, so only the embedding features are used
  toRemove = []
  for i in xrange(nTrain):
    words = x_text[i]
    num_words = len(words)

    #place number of words in buckets
    if num_words < 10:
        num_words_bucket = 0
    elif num_words >= 10 and num_words < 100:
        num_words_bucket = 1
    elif num_words >= 100 and num_words < 500:
        num_words_bucket = 2
    elif num_words >= 500 and num_words < 1000:
        num_words_bucket = 3
    elif num_words >= 1000 and num_words < 2000:
        num_words_bucket = 4
    elif num_words >= 2000:
        num_words_bucket = 5

    sentenceFeatures = getSentenceFeatures(tokens, embedded_vectors, words)
    if sentenceFeatures is None:
      toRemove.append(i)
    else:
      featureVector = sentenceFeatures
      #num_words = avgNumTokensPerEmail[i]
      #place number of words in buckets
      # if num_words < 10:
      #   num_words_bucket = 0
      # elif num_words >= 10 and num_words < 100:
      #   num_words_bucket = 1
      # elif num_words >= 100 and num_words < 500:
      #   num_words_bucket = 2
      # elif num_words >= 500 and num_words < 1000:
      #   num_words_bucket = 3
      # elif num_words >= 1000 and num_words < 2000:
      #   num_words_bucket = 4
      # elif num_words >= 2000:
      #   num_words_bucket = 5
      # featureVector = np.hstack((featureVector, num_words_bucket))
      #featureVector = np.hstack((featureVector, avgNumRecipients[i]))
      trainFeatures[i, :] = featureVector

  print(len(toRemove))
  y = np.delete(y, toRemove, axis=0)
  trainFeatures = np.delete(trainFeatures, toRemove, axis=0)

  # Randomly shuffle data
  np.random.seed(10)
  shuffle_indices = np.random.permutation(np.arange(len(y)))  # Array of random numbers from 1 to # of labels.
  x_shuffled = trainFeatures[shuffle_indices]
  y_shuffled = y[shuffle_indices]

  train = 0.6
  dev = 0.2
  test = 0.2
  # train x, dev x, test x, train y, dev y, test y
  train_cutoff = int(0.6 * len(x_shuffled))
  dev_cutoff = int(0.8 * len(x_shuffled))
  test_cutoff = int(len(x_shuffled))
  return x_shuffled[0:train_cutoff], x_shuffled[train_cutoff:dev_cutoff], x_shuffled[dev_cutoff:test_cutoff], \
         y_shuffled[0:train_cutoff], y_shuffled[train_cutoff:dev_cutoff], y_shuffled[dev_cutoff:test_cutoff],
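A typical call site for `get_glove_data` would unpack the six returned arrays (illustrative only; the 60/20/20 split comes from the cutoffs above):

# Illustrative usage of get_glove_data().
x_train, x_dev, x_test, y_train, y_dev, y_test = get_glove_data()
print(x_train.shape, x_dev.shape, x_test.shape)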
def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain, ), dtype=np.int32)
    for i in xrange(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev, ), dtype=np.int32)
    for i in xrange(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest, ), dtype=np.int32)
    for i in xrange(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        })

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" %
              (result["reg"], result["train"], result["dev"], result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
Example #30
import matplotlib

matplotlib.use('agg')
import matplotlib.pyplot as plt
import time

from word2vec import *
from sgd import *

# Check Python Version
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(314)
dataset = StanfordSentiment(path=r'C:\Users\msingleton\Documents\XCS224N-A2/utils/datasets/stanfordSentimentTreebank')
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
random.seed(31415)
np.random.seed(9265)

startTime = time.time()
wordVectors = np.concatenate(