Example #1
0
def main():
    """Train and evaluate insult classifiers over incrementally richer features.

    Builds language models from the train/test splits, then evaluates five
    feature sets with logistic regression: (1) Naive Bayes probabilities,
    (2) + PoS counts, (3) + TF-IDF, (4) + sentiment, (5) + misspellings.
    Predictions and metrics are written to LOG_REG_output_file_w_SB.txt.
    """
    print("Generating language models....")
    # The AA split trains the Naive Bayes baseline; the AB split trains the
    # downstream classifier, so NB probabilities are out-of-sample features.
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)

    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    # Label 0 = clean, 1 = insult; ordering must match the clean-then-insult
    # stacking used for every feature matrix below.
    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Feature set 1: baseline Naive Bayes probabilities
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    trainMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))
    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    clf = LogisticRegression()
    # NOTE: messages previously said "SVM" although the model is logistic
    # regression; labels fixed to match the classifier actually used.
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output1 = clf.predict(testMatrix).tolist()

    ### Feature set 2: baseline + PoS features
    print("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output2 = clf.predict(testMatrix).tolist()

    ### Feature set 3: baseline + PoS + TF-IDF features
    print("Running baseline + PoS Features + TF-IDF Features")
    tfidf_train_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM)

    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))

    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output3 = clf.predict(testMatrix).tolist()

    ### Feature set 4: + sentiment
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features")
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0)
    # Reshape the 1-D sentiment vector into a column so hstack can append it.
    sentiment_train_features = sentiment_train_features.reshape((-1, 1))
    print(sentiment_train_features.shape)

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    sentiment_test_features = sentiment_test_features.reshape((-1, 1))
    print(sentiment_test_features.shape)

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output4 = clf.predict(testMatrix).tolist()

    ### Feature set 5: + misspellings
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features")
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0)
    misspellings_train_features = misspellings_train_features.reshape((-1, 1))
    print(misspellings_train_features.shape)

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0)
    misspellings_test_features = misspellings_test_features.reshape((-1, 1))
    print(misspellings_test_features.shape)

    # BUG FIX: previously the sentiment features were stacked a second time
    # here, so the misspellings features were computed but never used.
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = LogisticRegression()
    print("\tTraining logistic regression....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting logistic regression....")
    output5 = clf.predict(testMatrix).tolist()

    # Write each run's raw predictions followed by its metrics; a blank line
    # separates runs (previously outputs 4 and 5 were missing the separator).
    with open('LOG_REG_output_file_w_SB.txt', 'w+') as f:
        outputs = [output1, output2, output3, output4, output5]
        for i, output in enumerate(outputs, 1):
            if i > 1:
                f.write("\n")
            f.write("Output {}\n".format(i))
            f.write("{}\n".format(output))
            interpret_results(output, testLabels, f)
Example #2
0
def main():
    """Build stacked feature matrices, train a linear SVM, and plot diagnostics.

    Accumulates five feature sets (NB probabilities, PoS, TF-IDF, sentiment,
    misspellings), trains svm.SVC(kernel='linear') on the full stack, then
    plots a learning curve and PCA projections of train/test matrices.
    """
    print("Generating language models....")
    # AA split trains the NB baseline; AB split trains the final classifier.
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)

    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    # Label 0 = clean, 1 = insult; matches clean-then-insult feature stacking.
    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Feature set 1: baseline Naive Bayes probabilities
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    trainMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))

    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    ### Feature set 2: + PoS features
    print("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    ### Feature set 3: + TF-IDF features
    print("Running baseline + PoS Features + TF-IDF Features")
    tfidf_train_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM)

    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))

    ### Feature set 4: + sentiment
    print(
        "Running baseline + PoS Features + TF-IDF Features + Sentiment Features"
    )
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train),
                                              axis=0)
    # Reshape the 1-D vector to a column so it can be hstacked.
    sentiment_train_features = sentiment_train_features.reshape((-1, 1))
    print(sentiment_train_features.shape)

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    sentiment_test_features = sentiment_test_features.reshape((-1, 1))
    print(sentiment_test_features.shape)

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    ### Feature set 5: + misspellings
    print(
        "Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features"
    )
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train),
                                                 axis=0)
    misspellings_train_features = misspellings_train_features.reshape((-1, 1))
    print(misspellings_train_features.shape)

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test),
                                                axis=0)
    misspellings_test_features = misspellings_test_features.reshape((-1, 1))
    print(misspellings_test_features.shape)

    # BUG FIX: previously stacked the sentiment features a second time here,
    # so the misspellings features were computed but never used.
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = svm.SVC(kernel='linear')
    print("\tTraining SVM....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting SVM....")
    output5 = clf.predict(testMatrix).tolist()

    # Shuffle rows and labels in lockstep before the learning-curve CV split.
    # list(...) is required: Python 3's range is not shuffleable in place.
    index_shuf = list(range(len(trainMatrix)))
    shuffle(index_shuf)
    trainMatrix_shuf = [trainMatrix[i] for i in index_shuf]
    trainLabel_shuf = [trainLabels[i] for i in index_shuf]

    train_sizes, train_scores, valid_scores = learning_curve(
        svm.SVC(),
        trainMatrix_shuf,
        trainLabel_shuf,
        train_sizes=[100, 300, 500, 700, 900],
        cv=2)
    # Average the per-fold scores for each training-set size.
    average_train_scores = [sum(i) / float(len(i)) for i in train_scores]
    average_valid_scores = [sum(i) / float(len(i)) for i in valid_scores]
    plt.plot(train_sizes, average_train_scores)
    plt.plot(train_sizes, average_valid_scores)
    plt.legend(['Training score', 'Cross-validation score'],
               loc='center left',
               bbox_to_anchor=(0.85, 0.5))
    plt.ylabel('Score')
    plt.xlabel('Training examples')
    plt.show()

    get_pca_graph(trainMatrix,
                  trainLabels,
                  "train_pca.png",
                  title="PCA of Training Set")
    get_pca_graph(testMatrix,
                  testLabels,
                  "test_pca.png",
                  title="PCA of Test Set")
    get_pca_graph(trainMatrix,
                  trainLabels,
                  "train_pca2.png",
                  title="PCA of Training Set (Insults Only)",
                  plot_negative=False)
    get_pca_graph(testMatrix,
                  testLabels,
                  "test_pca2.png",
                  title="PCA of Test Set (Insults Only)",
                  plot_negative=False)
Example #3
0
def main():
    """Train and evaluate random-forest classifiers over stacked feature sets.

    Evaluates five incrementally richer feature sets (NB probabilities, PoS,
    TF-IDF, sentiment, misspellings) with RandomForestClassifier and writes
    predictions/metrics to RANDOM_FOREST_output_file_without_SB.txt.
    """
    print("Generating language models....")
    # AA split trains the NB baseline; AB split trains the random forests.
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)

    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    # Label 0 = clean, 1 = insult; matches clean-then-insult feature stacking.
    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Feature set 1: baseline Naive Bayes probabilities
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    trainMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))

    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output1 = clf.predict(testMatrix).tolist()

    ### Feature set 2: + PoS features
    print("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    clf = RandomForestClassifier()
    # Labels fixed: these messages previously said "SVM" for a random forest.
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output2 = clf.predict(testMatrix).tolist()

    ### Feature set 3: + TF-IDF features
    print("Running baseline + PoS Features + TF-IDF Features")
    tfidf_train_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM)

    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output3 = clf.predict(testMatrix).tolist()

    ### Feature set 4: + sentiment
    print(
        "Running baseline + PoS Features + TF-IDF Features + Sentiment Features"
    )
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train),
                                              axis=0)
    # Reshape the 1-D vector to a column so it can be hstacked.
    sentiment_train_features = sentiment_train_features.reshape((-1, 1))
    print(sentiment_train_features.shape)

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    sentiment_test_features = sentiment_test_features.reshape((-1, 1))
    print(sentiment_test_features.shape)

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output4 = clf.predict(testMatrix).tolist()

    ### Feature set 5: + misspellings
    print(
        "Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features"
    )
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train),
                                                 axis=0)
    misspellings_train_features = misspellings_train_features.reshape((-1, 1))
    print(misspellings_train_features.shape)

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test),
                                                axis=0)
    misspellings_test_features = misspellings_test_features.reshape((-1, 1))
    print(misspellings_test_features.shape)

    # BUG FIX: previously stacked the sentiment features a second time here,
    # so the misspellings features were computed but never used.
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = RandomForestClassifier()
    print("\tTraining random forest....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting random forest....")
    output5 = clf.predict(testMatrix).tolist()

    # Write each run's raw predictions followed by its metrics; a blank line
    # separates runs (previously outputs 4 and 5 were missing the separator).
    with open('RANDOM_FOREST_output_file_without_SB.txt', 'w+') as f:
        outputs = [output1, output2, output3, output4, output5]
        for i, output in enumerate(outputs, 1):
            if i > 1:
                f.write("\n")
            f.write("Output {}\n".format(i))
            f.write("{}\n".format(output))
            interpret_results(output, testLabels, f)
Example #4
0
def main():
    """Build stacked feature matrices, train a linear SVM, and plot diagnostics.

    Accumulates five feature sets (NB probabilities, PoS, TF-IDF, sentiment,
    misspellings), trains svm.SVC(kernel='linear') on the full stack, then
    plots a learning curve and PCA projections of the train/test matrices.
    """
    print("Generating language models....")
    # AA split trains the NB baseline; AB split trains the final classifier.
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)

    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    # Label 0 = clean, 1 = insult; matches clean-then-insult feature stacking.
    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) +
                           ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) +
                          ([1] * testInsultLM.getDocCount()))

    ### Feature set 1: baseline Naive Bayes probabilities
    print("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print("\tTraining NB....")
    NB.train()
    print("\tTesting NB....")
    trainMatrix = np.array(
        NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))

    testMatrix = np.array(
        NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    ### Feature set 2: + PoS features
    print("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    ### Feature set 3: + TF-IDF features
    print("Running baseline + PoS Features + TF-IDF Features")
    tfidf_train_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, trainABCleanLM, trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(
        trainAACleanLM, trainAAInsultLM, testCleanLM, testInsultLM)

    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))

    ### Feature set 4: + sentiment
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features")
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0)
    # Reshape the 1-D vector to a column so it can be hstacked.
    sentiment_train_features = sentiment_train_features.reshape((-1, 1))
    print(sentiment_train_features.shape)

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    sentiment_test_features = sentiment_test_features.reshape((-1, 1))
    print(sentiment_test_features.shape)

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    ### Feature set 5: + misspellings
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features")
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0)
    misspellings_train_features = misspellings_train_features.reshape((-1, 1))
    print(misspellings_train_features.shape)

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0)
    misspellings_test_features = misspellings_test_features.reshape((-1, 1))
    print(misspellings_test_features.shape)

    # BUG FIX: previously stacked the sentiment features a second time here,
    # so the misspellings features were computed but never used.
    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = svm.SVC(kernel='linear')
    print("\tTraining SVM....")
    clf.fit(trainMatrix, trainLabels)
    print("\tTesting SVM....")
    output5 = clf.predict(testMatrix).tolist()

    # Shuffle rows and labels in lockstep before the learning-curve CV split.
    # list(...) is required: Python 3's range is not shuffleable in place.
    index_shuf = list(range(len(trainMatrix)))
    shuffle(index_shuf)
    trainMatrix_shuf = [trainMatrix[i] for i in index_shuf]
    trainLabel_shuf = [trainLabels[i] for i in index_shuf]

    train_sizes, train_scores, valid_scores = learning_curve(
        svm.SVC(), trainMatrix_shuf, trainLabel_shuf,
        train_sizes=[100, 300, 500, 700, 900], cv=2)
    # Average the per-fold scores for each training-set size.
    average_train_scores = [sum(i) / float(len(i)) for i in train_scores]
    average_valid_scores = [sum(i) / float(len(i)) for i in valid_scores]
    plt.plot(train_sizes, average_train_scores)
    plt.plot(train_sizes, average_valid_scores)
    plt.legend(['Training score', 'Cross-validation score'], loc='center left', bbox_to_anchor=(0.85, 0.5))
    plt.ylabel('Score')
    plt.xlabel('Training examples')
    plt.show()

    get_pca_graph(trainMatrix, trainLabels, "train_pca.png", title="PCA of Training Set")
    get_pca_graph(testMatrix, testLabels, "test_pca.png", title="PCA of Test Set")
    get_pca_graph(trainMatrix, trainLabels, "train_pca2.png", title="PCA of Training Set (Insults Only)", plot_negative=False)
    get_pca_graph(testMatrix, testLabels, "test_pca2.png", title="PCA of Test Set (Insults Only)", plot_negative=False)