Example #1
def do_train(args):
    # Set up some parameters.
    config = Config(args)

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.vector == "yourvectors":
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.vector == "pretrained":
        wordVectors = glove.loadWordVectors(tokens)

    # Load the train set
    trainset = dataset.getTrainSentences()
    train_max_length, train, train_raw = word2index(tokens, trainset)
    print(train_raw[0])
    print(train[0])

    # Prepare dev set features
    devset = dataset.getDevSentences()
    _, dev, dev_raw = word2index(tokens, devset)

    # Prepare test set features
    testset = dataset.getTestSentences()
    _, test, test_raw = word2index(tokens, testset)

    config.max_length = train_max_length
    config.embed_size = wordVectors.shape[1]

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    report = None  #Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...", )
        start = time.time()
        model = RNNModel(config, wordVectors, tokens)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)

            # do some error analysis
            if args.vector == "pretrained":
                y_true, preds = model.output(session, dev_raw)
                outputConfusionMatrix(preds, y_true, "q5_dev_conf.png")
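# The word2index helper called above is not part of this snippet. A minimal
# sketch of what it might do (an assumption, not the original code): map each
# sentence's words to vocabulary indices, keep the raw sentences, and track
# the longest sentence so the model can pad to a fixed length.
def word2index(tokens, sentences):
    max_length = 0
    indexed, raw = [], []
    for words, label in sentences:
        ids = [tokens[w] for w in words if w in tokens]
        max_length = max(max_length, len(ids))
        indexed.append((ids, label))
        raw.append((words, label))
    return max_length, indexed, raw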
Example #2
def main(args):
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]
    print(dimVectors)

    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)
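# getSentenceFeatures is not shown in this snippet. A minimal sketch under the
# usual assumption that a sentence is represented by the average of its word
# vectors (the exact behavior is an assumption, not the original code):
import numpy as np

def getSentenceFeatures(tokens, wordVectors, words):
    # Average the vectors of the words appearing in the sentence.
    sentVector = np.zeros((wordVectors.shape[1],))
    for word in words:
        sentVector += wordVectors[tokens[word], :]
    return sentVector / len(words)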
Example #3
def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords, :], wordVectors[nWords:, :]), axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain, ), dtype=np.int32)
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev, ), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest, ), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0 / (reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        })

    # Print the accuracies
    print("")
    print("=== Recap ===")
    print("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print("%.2E\t%.3f\t%.3f\t%.3f" %
              (result["reg"], result["train"], result["dev"], result["test"]))
    print("")

    bestResult = chooseBestModel(results)
    print("Best regularization value: %0.2E" % bestResult["reg"])
    print("Test accuracy (%%): %f" % bestResult["test"])

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
Example #4
def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print "Training for reg=%f" % reg
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print "Train accuracy (%%): %f" % trainAccuracy

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print "Dev accuracy (%%): %f" % devAccuracy

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print "Test accuracy (%%): %f" % testAccuracy

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print ""
    print "=== Recap ==="
    print "Reg\t\tTrain\tDev\tTest"
    for result in results:
        print "%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"])
    print ""

    bestResult = chooseBestModel(results)
    print "Best regularization value: %0.2E" % bestResult["reg"]
    print "Test accuracy (%%): %f" % bestResult["test"]

    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_dev_pred.txt")
    else:
        # plotRegVsAccuracy(regValues, results, "q4_reg_v_acc_your.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_dev_conf_your.png")
Example #5
def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    
    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
            axis=1)
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)

    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    
    # Frequency counting over the training sentences
    freq = Counter()
    Sum = 0
    for sen in trainset:
        for word in sen[0]:
            Sum += 1
            freq[word] += 1
    for word, tf in freq.items():
        freq[word] = tf / Sum
    
    # Generate all sentence features
    for i in range(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)

    # SVD on the training set
    svd = TruncatedSVD(n_components=1, n_iter=5, random_state=0)
    u = svd.fit(trainFeatures).components_[0]  # the first singular vector
    # Remove the projection of each sentence embedding onto the first principal component
    for i in range(trainFeatures.shape[0]):
        trainFeatures[i] = trainFeatures[i] - np.dot(trainFeatures[i], u.T) * u
    
    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in range(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)
    for i in range(devFeatures.shape[0]):
        devFeatures[i] = devFeatures[i] - np.dot(devFeatures[i], u.T) * u
            
    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in range(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeaturesSIF(tokens, wordVectors, words, freq)
    for i in range(testFeatures.shape[0]):
        testFeatures[i] = testFeatures[i] - np.dot(testFeatures[i], u.T) * u
            
    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print("Training for reg=%f" % reg)
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print("Train accuracy (%%): %f" % trainAccuracy)

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print("Dev accuracy (%%): %f" % devAccuracy)

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print("Test accuracy (%%): %f" % testAccuracy)

        results.append({
            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy})

    # Print the accuracies
    print ("")
    print ("=== Recap ===")
    print ("Reg\t\tTrain\tDev\tTest")
    for result in results:
        print ("%.2E\t%.3f\t%.3f\t%.3f" % (
            result["reg"],
            result["train"],
            result["dev"],
            result["test"]))
    print ("")

    bestResult = chooseBestModel(results)
    print ("Best regularization value: %0.2E" % bestResult["reg"])
    print ("Test accuracy (%%): %f" % bestResult["test"])
    
    # do some error analysis
    if args.pretrained:
        plotRegVsAccuracy(regValues, results, "q4_sif_reg_v_acc.png")
        outputConfusionMatrix(devFeatures, devLabels, bestResult["clf"],
                              "q4_sif_dev_conf.png")
        outputPredictions(devset, devFeatures, devLabels, bestResult["clf"],
                          "q4_sif_dev_pred.txt")
Example #6
### END YOUR CODE

# Load the dataset
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# Load the word vectors we trained earlier
_, wordVectors0, _ = load_saved_params()
N = wordVectors0.shape[0] // 2
#assert nWords == N
wordVectors = (wordVectors0[:N, :] + wordVectors0[N:, :])
dimVectors = wordVectors.shape[1]

# Load the train set
trainset = dataset.getTrainSentences()
nTrain = len(trainset)
trainFeatures = np.zeros((nTrain, dimVectors))
trainLabels = np.zeros((nTrain, ), dtype=np.int32)
for i in range(nTrain):
    words, trainLabels[i] = trainset[i]
    trainFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)

# Prepare dev set features
devset = dataset.getDevSentences()
nDev = len(devset)
devFeatures = np.zeros((nDev, dimVectors))
devLabels = np.zeros((nDev, ), dtype=np.int32)
for i in range(nDev):
    words, devLabels[i] = devset[i]
    devFeatures[i, :] = getSentenceFeature(tokens, wordVectors, words)