def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    for i in xrange(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in xrange(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in xrange(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print "Training for reg=%f" % reg
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print "Train accuracy (%%): %f" % trainAccuracy

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print "Dev accuracy (%%): %f" % devAccuracy

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print "Test accuracy (%%): %f" % testAccuracy

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print ""
    print "=== Recap ==="
    print "Reg\t\tTrain\tDev\tTest"
    for result in results:
        print "%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"])
    print ""

    bestResult = chooseBestModel(results)
    print "Best regularization value: %0.2E" % bestResult["reg"]
    print "Test accuracy (%%): %f" % bestResult["test"]

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
    else:
        # plotRegVsAccuracy(regValues, results, "q4_reg_v_acc_your.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf_your.png")
def main(args):
    """Train a model to do sentiment analysis.

    Near-duplicate (reformatted) of the main() defined earlier in this
    file: builds averaged-word-vector sentence features for the Stanford
    Sentiment Treebank, sweeps logistic-regression regularization values,
    and reports train/dev/test accuracy for each.
    """

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    # NOTE(review): if neither args.yourvectors nor args.pretrained is set,
    # wordVectors is never assigned and the .shape access below raises a
    # NameError.
    if args.yourvectors:
        # Saved params stack two embedding matrices row-wise; concatenate
        # the halves column-wise so each word gets one combined vector.
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))  # sentence-level features
    trainLabels = np.zeros((nTrain, ), dtype=np.int32)
    for i in xrange(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev, ), dtype=np.int32)
    for i in xrange(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest, ), dtype=np.int32)
    for i in xrange(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print "Training for reg=%f" % reg
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print "Train accuracy (%%): %f" % trainAccuracy

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print "Dev accuracy (%%): %f" % devAccuracy

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print "Test accuracy (%%): %f" % testAccuracy

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        })

    # Print the accuracies
    print ""
    print "=== Recap ==="
    print "Reg\t\tTrain\tDev\tTest"
    for result in results:
        print "%.2E\t%.3f\t%.3f\t%.3f" % (result["reg"], result["train"],
                                          result["dev"], result["test"])
    print ""

    bestResult = chooseBestModel(results)
    print "Best regularization value: %0.2E" % bestResult["reg"]
    print "Test accuracy (%%): %f" % bestResult["test"]

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
# Beispiel #3
def trial2():
    """Smoke-test a hand-rolled RNN graph in (pre-1.0 style) TensorFlow.

    Builds a sigmoid RNN over a toy two-sentence batch, then runs and
    prints each intermediate tensor (embeddings, per-step logits,
    softmax, masked loss) to inspect shapes and masking behavior.
    NOTE(review): uses tf.pack and the positional
    softmax_cross_entropy_with_logits signature from the pre-1.0 API.
    """
    sentences = np.array([[0, 1, 2, 4], [0, 1, 3, 0]])  # overwritten below
    sentences = np.array([[0, 1, 2, 4], [0, 1, 3, 5]])
    mask = np.array([[0, 0, 0, 1], [0, 0, 1, 0]])   # positions kept in the loss
    mask2 = np.array([[0, 0, 1, 1], [0, 0, 1, 0]])  # alternate mask (3 kept)
    labels = np.array([[1, 0, 0], [0, 1, 0]])  # one-hot label per sentence

    # Hyper-parameters for the toy model.
    n_classes = 3
    embed_size = 6
    max_length = 4
    batch_size = 1  # unused below
    lr = 0.001
    n_features = 6
    hidden_size = 10
    DUMMY_PATH = "utils/glove/glove_dummy.txt"

    # Load dummy GloVe vectors and build the token -> index map.
    token_list = glove.loadWordTokens(DUMMY_PATH)
    tokens = {}
    for i in range(len(token_list)):
        tokens[token_list[i]] = i
    wordVectors = glove.loadWordVectors(tokens, DUMMY_PATH, embed_size)
    # Append a sentinel token with an all-zero embedding row.
    token_list.append("cqian23th7zhangrao")
    tokens["cqian23th7zhangrao"] = len(token_list) - 1
    print 'WV', np.shape(wordVectors)
    wordVectors = np.append(wordVectors, [np.zeros(embed_size)], axis=0)
    print 'WV', np.shape(wordVectors)

    wordVectors2 = data_util.load_embeddings(DUMMY_PATH, embed_size)

    # NOTE(review): .all() collapses each array to a single bool, so this
    # compares two booleans rather than the matrices element-wise.
    assert (wordVectors.all() == wordVectors2.all())

    # start buiding model
    #cell=RNNCell(n_features,hidden_size)

    input_placeholder = tf.placeholder(tf.int32, [None, max_length])
    labels_placeholder = tf.placeholder(tf.int32, [None, n_classes])
    mask_placeholder = tf.placeholder(tf.bool, [None, max_length])

    # Output projection: hidden state -> class logits.
    U = tf.Variable(
        np.random.rand(hidden_size, n_classes).astype(np.float32), tf.float32)

    # feed dict
    feed_dict = {
        input_placeholder: sentences,
        labels_placeholder: labels,
        mask_placeholder: mask
    }
    feed_dict2 = {
        input_placeholder: sentences,
        labels_placeholder: labels,
        mask_placeholder: mask2
    }
    emb = tf.Variable(wordVectors, dtype=tf.float32)
    x = tf.nn.embedding_lookup(emb, input_placeholder)

    # Initial hidden state: zeros, one row per batch element.
    h = tf.zeros([tf.shape(x)[0], hidden_size], tf.float32)

    preds = []
    W_h = tf.Variable(
        np.random.rand(hidden_size, hidden_size).astype(np.float32),
        tf.float32)
    W_x = tf.Variable(
        np.random.rand(n_features, hidden_size).astype(np.float32), tf.float32)
    b1 = tf.Variable(
        np.random.rand(hidden_size).astype(np.float32), tf.float32)

    # run through rnn
    for i in range(max_length):
        if i >= 1:
            tf.get_variable_scope().reuse_variables()
        # h_t = sigmoid(h_{t-1} W_h + x_t W_x + b1)
        h = tf.nn.sigmoid(tf.matmul(h, W_h) + tf.matmul(x[:, i, :], W_x) + b1)
        p = tf.matmul(h, U)  # per-step class logits
        print 'p', tf.shape(p)
        preds.append(p)

    # prediction
    preds = tf.pack(preds)  # stack the per-step logits along a new axis 0
    preds2 = tf.reshape(preds, [-1, max_length, n_classes])

    # these are for verification
    preds3 = tf.nn.softmax(preds2)
    preds4 = tf.log(preds3)

    # loss calculation: tile the labels across timesteps, then mask.
    labels_to_loss = tf.tile(labels_placeholder, [max_length, 1])
    labels_to_loss = tf.reshape(labels_to_loss, [-1, max_length, n_classes])
    loss = tf.nn.softmax_cross_entropy_with_logits(preds2, labels_to_loss)
    loss2 = tf.boolean_mask(loss, mask_placeholder)  # keep masked steps only
    loss3 = tf.reduce_mean(loss2)

    # training op
    # NOTE(review): minimizes the unmasked per-step `loss`, not the masked
    # scalar `loss3`, and is never executed via sess.run below.
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)

    # test implementation: evaluate each tensor and print it for inspection
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)
    xx = sess.run(x, feed_dict=feed_dict)
    print 'embedding', xx
    print 'embedding shape', np.shape(xx)
    pp = sess.run(preds, feed_dict=feed_dict)
    print 'preds after pack', pp
    pp2 = sess.run(preds2, feed_dict=feed_dict)
    print 'preds after reshape', pp2

    pp3 = sess.run(preds3, feed_dict=feed_dict)
    print 'preds after softmax', pp3

    # NOTE(review): rebinding mask2 here shadows the feed mask defined above.
    mask2 = np.stack([mask for i in range(n_classes)], 2)
    pred6 = np.sum(np.multiply(pp3, mask2), 1)
    print 'test batch_pred', pred6

    pp4 = sess.run(preds4, feed_dict=feed_dict)
    print 'preds after log', pp4

    lalo = sess.run(labels_to_loss, feed_dict=feed_dict)
    print 'labels to loss', lalo.shape, lalo
    ll = sess.run(loss, feed_dict=feed_dict)
    print 'after softmax loss', ll.shape, ll
    ll2 = sess.run(loss2, feed_dict=feed_dict)
    print 'after boolean_mask loss', ll2
    print np.shape(ll2)
    ll2 = sess.run(loss2, feed_dict=feed_dict2)
    print 'after boolean_mask loss', ll2
    print np.shape(ll2)
    ll3 = sess.run(loss3, feed_dict=feed_dict)
    print 'final loss', ll3
# Beispiel #4
def main(args):
    """Train toxic-comment classifiers on averaged-word-vector features.

    Loads the multi-label toxic dataset, splits it roughly 60/20/20 into
    train/dev/test, featurizes each comment by averaging GloVe word
    vectors, then fits and evaluates an SVC (a text Pipeline classifier
    is also fitted but its predictions are unused).
    """
    dataset, tokens, num_labels = getToxicDataMultilabel()
    target_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'non_toxic']

    # Shuffle data
    shuffle(dataset)

    num_data = len(dataset)

    # Create train, dev, and test
    # NOTE(review): the +1 offsets skip one example at each split boundary.
    train_cutoff = int(0.6 * num_data)
    dev_start = int(0.6 * num_data) + 1
    dev_cutoff = int(0.8 * num_data)

    trainset = dataset[:train_cutoff]
    devset = dataset[dev_start:dev_cutoff]
    testset = dataset[dev_cutoff + 1:]

    nWords = len(tokens)  # unused below
    wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    #trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = []


    for i in xrange(nTrain):
        words = trainset[i][0]
        trainLabels.append(trainset[i][1])
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = []
    for i in xrange(nDev):
        words = devset[i][0]
        devLabels.append(devset[i][1])
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    #testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = []
    for i in xrange(nTest):
        words = testset[i][0]
        testLabels.append(testset[i][1])
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)


    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()  # only used by the plot call below
    print "LR Results:"
    # NOTE(review): CountVectorizer expects raw text documents, and the
    # min_n/max_n kwargs come from a very old scikit-learn (the modern API
    # is ngram_range); feeding it the dense float matrix looks broken.
    classifier = Pipeline([
        ('vectorizer', CountVectorizer(min_n=1,max_n=2)),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC()))])

    classifier.fit(trainFeatures, trainLabels)
    predicted = classifier.predict(devFeatures)  # NOTE(review): unused
    clf = SVC()
    clf.fit(trainFeatures, trainLabels)

    # Test on train set
    pred = clf.predict(trainFeatures)
    trainAccuracy = accuracy(trainLabels, pred)
    print "Train accuracy (%%): %f" % trainAccuracy

    # Test on dev set
    pred = clf.predict(devFeatures)
    devAccuracy = accuracy(devLabels, pred)
    print "Dev accuracy (%%): %f" % devAccuracy

    # Test on test set
    # Note: always running on test is poor style. Typically, you should
    # do this only after validation.
    pred = clf.predict(testFeatures)
    testAccuracy = accuracy(testLabels, pred)
    print "Test accuracy (%%): %f" % testAccuracy

    results.append({
        "reg": 0.0,
        "clf": clf,
        "train": trainAccuracy,
        "dev": devAccuracy,
        "test": testAccuracy})

    # Print the accuracies
    print ""
    print "=== Recap ==="
    print "Reg\t\tTrain\tDev\tTest"
    for result in results:
        print "%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"])
    print ""

    bestResult = chooseBestModel(results)
    # print "Best regularization value: %0.2E" % bestResult["reg"]
    # print "Test accuracy (%%): %f" % bestResult["test"]

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_svm_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_svm_pred.txt")
# Beispiel #5
def getLSTMMultiData():
    """Load the multi-label toxic data and featurize it for an LSTM.

    Shuffles the dataset, splits it roughly 60/20/20 into
    train/dev/test, and converts every example into a
    (maxLength, dimVectors) word-vector sequence plus a 5-way integer
    label vector.

    Returns:
        trainFeatures, trainLabels, devFeatures, devLabels,
        testFeatures, testLabels, maxLength, dimVectors
    """
    dataset, tokens, maxLength = getMLToxicData()
    # Shuffle before splitting so the splits are random.
    shuffle(dataset)

    # Cap every sequence at a fixed length.
    maxLength = 100

    num_data = len(dataset)

    # Roughly 60/20/20 split (the +1 offsets skip a boundary example).
    train_cutoff = int(0.6 * num_data)
    dev_start = int(0.6 * num_data) + 1
    dev_cutoff = int(0.8 * num_data)

    trainset = dataset[:train_cutoff]
    devset = dataset[dev_start:dev_cutoff]
    testset = dataset[dev_cutoff + 1:]

    nWords = len(tokens)  # kept for parity with the original; unused below

    wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    def featurize(split):
        """Turn one split into (features, labels) arrays."""
        count = len(split)
        feats = np.zeros((count, maxLength, dimVectors))
        labs = np.zeros((count, 5), dtype=np.int32)
        for idx in xrange(count):
            entry = split[idx]
            words = entry[0]
            # Columns 1..5 of each entry hold the five binary labels.
            for col in xrange(5):
                labs[idx][col] = entry[col + 1]
            feats[idx, :] = getLSTMSentenceFeatures(tokens, wordVectors,
                                                    dimVectors, words,
                                                    maxLength)
        return feats, labs

    trainFeatures, trainLabels = featurize(trainset)
    devFeatures, devLabels = featurize(devset)
    testFeatures, testLabels = featurize(testset)

    return trainFeatures, trainLabels, devFeatures, devLabels, testFeatures, testLabels, maxLength, dimVectors  #, tokens, wordVectors
# Beispiel #6
def main(args):
    """Train an SVM toxicity classifier on averaged word vectors.

    Loads the toxic-comment dataset, splits it roughly 60/20/20 into
    train/dev/test, featurizes each comment by averaging word vectors,
    fits an SVC, and reports train/dev/test accuracy.
    """
    dataset, tokens, maxSentence = getToxicData()  # maxSentence unused below
    print len(dataset)
    # Shuffle data
    shuffle(dataset)

    num_data = len(dataset)

    # Create train, dev, and test
    # NOTE(review): the +1 offsets skip one example at each split boundary.
    train_cutoff = int(0.6 * num_data)
    dev_start = int(0.6 * num_data) + 1
    dev_cutoff = int(0.8 * num_data)

    trainset = dataset[:train_cutoff]
    devset = dataset[dev_start:dev_cutoff]
    testset = dataset[dev_cutoff + 1:]

    nWords = len(tokens)

    # NOTE(review): if neither args.yourvectors nor args.pretrained is set,
    # wordVectors is never assigned and the .shape access below raises a
    # NameError.
    if args.yourvectors:
        # Saved params stack two embedding matrices row-wise; concatenate
        # the halves column-wise so each word gets one combined vector.
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    #trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain, ), dtype=np.int32)

    for i in xrange(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    #devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev, ), dtype=np.int32)
    for i in xrange(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    #testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest, ), dtype=np.int32)
    for i in xrange(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()  # only used by the plot call below
    print "SVM Results:"

    clf = SVC()
    clf.fit(trainFeatures, trainLabels)

    # Test on train set
    pred = clf.predict(trainFeatures)
    trainAccuracy = accuracy(trainLabels, pred)
    print "Train accuracy (%%): %f" % trainAccuracy

    # Test on dev set
    pred = clf.predict(devFeatures)
    devAccuracy = accuracy(devLabels, pred)
    print "Dev accuracy (%%): %f" % devAccuracy

    # Test on test set
    # Note: always running on test is poor style. Typically, you should
    # do this only after validation.
    pred = clf.predict(testFeatures)
    testAccuracy = accuracy(testLabels, pred)
    print "Test accuracy (%%): %f" % testAccuracy

    results.append({
        "reg": 0.0,
        "clf": clf,
        "train": trainAccuracy,
        "dev": devAccuracy,
        "test": testAccuracy
    })

    # Print the accuracies
    print ""
    print "=== Recap ==="
    print "Reg\t\tTrain\tDev\tTest"
    for result in results:
        print "%.2E\t%.3f\t%.3f\t%.3f" % (result["reg"], result["train"],
                                          result["dev"], result["test"])
    print ""

    bestResult = chooseBestModel(results)
    # print "Best regularization value: %0.2E" % bestResult["reg"]
    # print "Test accuracy (%%): %f" % bestResult["test"]

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_svm_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_svm_pred.txt")
from prep import clean

# Source CSVs: one tweet archive per account.
files = [
    'data/hillaryclinton.csv', 'data/realdonaldtrump.csv',
    'data/jimmyfallon.csv', 'data/barackobama.csv', 'data/conanobrien.csv'
]

res, labels, vocab_dict, handle_dict = clean(files)

print('len(vocab_dict):', len(vocab_dict))

import json
import utils.glove as glove

# Replace each token-index sequence with its stack of GloVe vectors.
wordVectors = glove.loadWordVectors(vocab_dict)
print(wordVectors.shape)
new_res = []
for re in res:  # NOTE(review): loop var `re` shadows the stdlib module name
    new_res.append(np.array([wordVectors[i] for i in re]))
res = np.stack(new_res)

print(res.shape, labels.shape)

# NOTE(review): this assertion always fails, so execution deliberately(?)
# stops here and the split below is never reached.
assert 1 == 2

ratio = .0  #proportion of test data
# NOTE(review): ratio 0.0 makes cutoff 0, i.e. an empty test split.
cutoff = int(len(res) * ratio)

X_test, X_train = res[:cutoff], res[cutoff:]
y_test, y_train = labels[:cutoff], labels[cutoff:]
# Beispiel #8
def main(args):
    """Train a sentiment classifier on SIF-weighted sentence features.

    Builds sentence embeddings with getSentenceFeaturesSIF using
    train-set word frequencies, removes the first principal component of
    the training embeddings from all three splits, then sweeps
    logistic-regression regularization values and reports accuracies.
    """

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    # NOTE(review): if neither args.yourvectors nor args.pretrained is set,
    # wordVectors is never assigned and the .shape access below raises a
    # NameError.
    if args.yourvectors:
        # Saved params stack two embedding matrices row-wise; concatenate
        # the halves column-wise so each word gets one combined vector.
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)

    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)

    # frequency counting: relative frequency of each word in the train set
    freq = Counter()
    Sum = 0
    for sen in trainset:
        for word in sen[0]:
            Sum += 1
            freq[word]+=1
    # NOTE(review): tf/Sum is integer division under Python 2 (always 0);
    # the print() calls below suggest this block targets Python 3 — confirm.
    for word,tf in freq.items():
        freq[word] = tf/Sum

    #generate all sentence features
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)
    #svd in training set
    svd = TruncatedSVD(n_components=1, n_iter=5, random_state=0)
    u = svd.fit(trainFeatures).components_[0] # the first singular vector
    # remove the projections of the sentence embeddings to their first principal component
    for i in range(trainFeatures.shape[0]):
        trainFeatures[i] = trainFeatures[i] - np.dot(trainFeatures[i],u.T) * u

    # Prepare dev set features (remove the same train-derived component)
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq) 
    for i in range(devFeatures.shape[0]):
            devFeatures[i] = devFeatures[i] - np.dot(devFeatures[i],u.T) * u

    # Prepare test set features (remove the same train-derived component)
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)
    for i in range(testFeatures.shape[0]):
            testFeatures[i] = testFeatures[i] - np.dot(testFeatures[i],u.T) * u

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print ("")
    print ("=== Recap ===")
    print ("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print ("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print ("")

    bestResult = chooseBestModel(results)
    print ("Best regularization value: %0.2E" % bestResult["reg"])
    print ("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_sif_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_sif_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_sif_dev_pred.txt")
def get_glove_data():
  """Featurize e-mail threads with averaged GloVe vectors, split 60/20/20.

  Loads thread texts and labels, averages GloVe word vectors per thread,
  drops threads for which no features can be built, shuffles with a fixed
  seed, and returns (x_train, x_dev, x_test, y_train, y_dev, y_test).
  """
  embedding_dimension = 100
  x_text, y = load_data_and_labels_bow("thread_content.npy", "thread_labels.npy")

  dataset = StanfordSentiment()
  tokens = dataset.tokens()

  # Initialize word vectors with glove.
  embedded_vectors = glove.loadWordVectors(tokens)
  print("The shape of embedding matrix is:")
  print(embedded_vectors.shape)

  # Fix: removed the dead word-count bucketing that computed an unused
  # num_words_bucket (a commented-out duplicate also existed), plus other
  # unused locals; behavior is unchanged.
  nTrain = len(x_text)
  trainFeatures = np.zeros((nTrain, embedding_dimension))
  toRemove = []  # indices of threads with no usable features; dropped below
  for i in xrange(nTrain):
    words = x_text[i]
    sentenceFeatures = getSentenceFeatures(tokens, embedded_vectors, words)
    if sentenceFeatures is None:
      toRemove.append(i)
    else:
      trainFeatures[i, :] = sentenceFeatures

  print(len(toRemove))
  y = np.delete(y, toRemove, axis=0)
  trainFeatures = np.delete(trainFeatures, toRemove, axis=0)

  # Randomly shuffle data (fixed seed for reproducibility).
  np.random.seed(10)
  shuffle_indices = np.random.permutation(np.arange(len(y)))
  x_shuffled = trainFeatures[shuffle_indices]
  y_shuffled = y[shuffle_indices]

  # 60/20/20 train/dev/test split.
  train_cutoff = int(0.6 * len(x_shuffled))
  dev_cutoff = int(0.8 * len(x_shuffled))
  test_cutoff = int(len(x_shuffled))
  return x_shuffled[0:train_cutoff], x_shuffled[train_cutoff:dev_cutoff], x_shuffled[dev_cutoff:test_cutoff], \
         y_shuffled[0:train_cutoff], y_shuffled[train_cutoff:dev_cutoff], y_shuffled[dev_cutoff:test_cutoff],
# Beispiel #10
# Words to project and plot, plus extra anchor words for the embedding.
visualizeWords = [
    "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
    "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb",
    "annoying"]

key_words = ["the", "unique", "superb", "comedy", "surprisingly"]

# Assign each distinct word a sequential index ("the" appears in both
# lists; the membership check keeps indices dense and unique).
tokens = {}
ind = 0
for word in visualizeWords + key_words:
    if word not in tokens:
        tokens[word] = ind
        ind += 1

wordVectors = loadWordVectors(tokens)

# Project the chosen word vectors onto their top two principal components
# (SVD of the mean-centered covariance matrix).
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

# Draw each word as a text label at its 2-D coordinate.
for i in xrange(len(visualizeWords)):
    plt.text(coord[i,0], coord[i,1], visualizeWords[i],
        bbox=dict(facecolor='green', alpha=0.1))

# Fit the axes tightly around the plotted points.
plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
# the path of the file where the word vectors are stored
DUMMY_PATH = "utils/glove/glove_dummy.txt"

option = 2
# token_list is the list containing all the tokens
# tokens is a dictionary that maps a token to its index
if option == 1:
    token_list = ["is", "this", "a", "file", "dummy"]
elif option == 2:
    token_list = glove.loadWordTokens(DUMMY_PATH)
else:
    assert false, 'Not a valid option'
# create an empty dictionary
tokens = {}
for i in range(len(token_list)):
    tokens[token_list[i]] = i

# read in word vectors
# the function takes 3 arguments:
"""
tokens:     a dictionary maps the token to their index in token_list
filepath:   a string, the path of the word vector file to be read
dimension:  integer, the length of the vector
            (it must be consistent with the file)
"""
dummy_vectors = glove.loadWordVectors(tokens, DUMMY_PATH, 6)

for i in range(len(dummy_vectors)):
    # print the words (formatted to have a tab behind them) and the word vectors
    print "{0}\t".format(token_list[i]), dummy_vectors[i]