import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Project-local modules defined elsewhere in this repository (CreateDataset
# is likewise assumed to come from the project's dataset-creation code).
import features
import models
import evaluation


def ReadEnvironment(remove_center=False):

    # This method was used in an early stage of the project to read and
    # build feature matrices from a previously created environment; it may
    # be outdated. `remove_center` is expected to be either False or a
    # DW-NOMINATE interval such as [-0.2, 0.2], since it is forwarded to
    # features.ExtractWordFeatures as `remove_center_interval`.

    print("Extracting word features without n-grams...", end="")
    (X_train_w, Y_train, X_test_w, Y_test, vectorizer,
     feature_names_w) = features.ExtractWordFeatures(
         "speeches_110_dwnominate_nonames.txt",
         "speeches_112_dwnominate_nonames.txt",
         remove_center_interval=remove_center)
    print("[DONE]")

    print("Extracting word features with 2-grams...", end="")
    (X_train_w2, Y_train, X_test_w2, Y_test, vectorizer2,
     feature_names_w2) = features.ExtractWordFeatures(
         "speeches_110_dwnominate_nonames.txt",
         "speeches_112_dwnominate_nonames.txt",
         ngrams=2,
         remove_center_interval=remove_center)
    print("[DONE]")

    print("Reading collocation features...", end="")
    (X_train_c, feature_names_c) = features.ReadX("X_train_1000_l_2.txt")
    (X_test_c, feature_names_c) = features.ReadX("X_test_1000_l_2.txt")
    print("[DONE]")

    if remove_center:
        print("Removing center...", end="")
        # Re-read the train dataset and keep only the speeches whose first
        # DW-NOMINATE dimension lies outside the interval; the bounds are
        # hard-coded here to match remove_center=[-0.2, 0.2].
        path_train = "../datasets/train/"
        train_dataset_df = pd.read_csv(path_train +
                                       "speeches_110_dwnominate_nonames.txt",
                                       sep="|",
                                       encoding="latin_1",
                                       header=None)
        train_dataset_df.columns = ['nominate_dim1', 'nominate_dim2', 'speech']
        dw_nominates = train_dataset_df['nominate_dim1'].values.tolist()
        indices = [
            i for i, dw in enumerate(dw_nominates)
            if float(dw) < -0.2 or float(dw) > 0.2
        ]
        X_train_c = [X_train_c[index] for index in indices]
        # Note: only the training collocation matrix is filtered here;
        # X_test_c is left unchanged.
        print("[DONE]")

    print("Joining word features without n-grams and collocations...", end="")
    feature_names_t = feature_names_w + feature_names_c
    X_train_t = hstack((X_train_w, X_train_c))
    X_test_t = hstack((X_test_w, X_test_c))
    print("[DONE]")

    return (X_train_w, Y_train, X_test_w, Y_test, vectorizer, feature_names_w,
            X_train_w2, X_test_w2, vectorizer2, feature_names_w2,
            X_train_c, X_test_c, feature_names_c,
            X_train_t, X_test_t, feature_names_t)
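
# Usage sketch (hypothetical call; the speeches_*_dwnominate_nonames.txt
# files and the precomputed X_*_1000_l_2.txt collocation matrices must
# already exist where the `features` module expects them):
#
# (X_train_w, Y_train, X_test_w, Y_test, vectorizer, feature_names_w,
#  X_train_w2, X_test_w2, vectorizer2, feature_names_w2,
#  X_train_c, X_test_c, feature_names_c,
#  X_train_t, X_test_t, feature_names_t) = ReadEnvironment(
#      remove_center=[-0.2, 0.2])
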
def RemoveCenterExperimentBoW():

    # This experiment evaluates how the model's performance relates to the
    # number of moderate speeches we remove. We compute the accuracy of the
    # model after eliminating some percentage of the DW-NOMINATE space
    # around zero: taking the most extreme positive and negative scores as
    # bounds, each iteration removes a further 10% of the DW-NOMINATE scale
    # (measured from zero towards those bounds) and computes accuracy over
    # the remaining observations. The `centers` list below enumerates the
    # removed intervals; a sketch of how it can be generated follows it.

    centers = [
        None, [-0.068, 0.091], [-0.136, 0.182], [-0.204, 0.273],
        [-0.272, 0.364], [-0.34, 0.455], [-0.408, 0.546], [-0.476, 0.637],
        [-0.544, 0.728], [-0.612, 0.819]
    ]
    #centers = [[-0.2, 0.2]]
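
    # Sketch of how the `centers` list above can be generated, assuming the
    # extreme DW-NOMINATE scores are roughly -0.68 and 0.91 (bounds inferred
    # from the values above, not read from the data here):
    #
    # centers = [None] + [
    #     [round(-0.68 * k / 10, 3), round(0.91 * k / 10, 3)]
    #     for k in range(1, 10)
    # ]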
    nb_accuracies = []
    lr_accuracies = []
    t0 = time.time()
    t1 = time.time()
    for i in range(len(centers)):
        print("Iteration", i, "- removing center interval:", centers[i])
        X_train, Y_train, X_test, Y_test, vectorizer, feature_names = features.ExtractWordFeatures(
            "speeches_110_dwnominate_nonames.txt",
            "speeches_112_dwnominate_nonames.txt",
            vectorizer_type="CountVectorizer",
            ngrams=None,
            balance_dataset=False,
            remove_center_interval=centers[i])
        print("Train size:", len(Y_train))
        print("Test size:", len(Y_test))
        print("NB")
        nb = models.NBtrain(X_train, Y_train)
        (accuracy, precision, recall) = evaluation.Test(X_test, Y_test, nb)
        nb_accuracies.append(accuracy)
        print("LR")
        lr = models.LRtrain(X_train, Y_train)
        accuracy, precision, recall = evaluation.Test(X_test, Y_test, lr)
        lr_accuracies.append(accuracy)
        t2 = time.time()
        print(t2 - t1, " seconds")
        t1 = time.time()
    t3 = time.time()
    print("Total time: ", t3 - t0)

    # Fractions of the DW-NOMINATE scale removed at each iteration.
    x = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    plt.plot(x, nb_accuracies, color="#2D00B7", label="Naive Bayes")
    plt.plot(x, lr_accuracies, color="#64EEBA", label="Logistic Regression")
    plt.title("Accuracy and center removal relation")
    plt.xlabel("Fraction of DW-NOMINATE scale removed")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

    return nb_accuracies, lr_accuracies

def SetEnvironment():

    # This method was used in an early stage of the project to create the
    # train and test datasets and to extract some feature matrices from
    # them; it may be outdated. CreateDataset is a project-local helper.

    print("Creating train dataset...")
    CreateDataset("110_SpeakerMap.txt",
                  "speeches_110.txt",
                  "HS110_members.csv",
                  True,
                  "train_dataset.csv",
                  includeNames=False)

    print("Creating test dataset...")
    CreateDataset("112_SpeakerMap.txt",
                  "speeches_112.txt",
                  "HS112_members.csv",
                  False,
                  "test_dataset.csv",
                  includeNames=False)

    print("Extracting word features...")
    (X_train_w, Y_train_w, X_test_w, Y_test_w, vectorizer,
     BoW_names) = features.ExtractWordFeatures("train_dataset.csv",
                                               "test_dataset.csv")

    print("Extracting collocation features...")
    (X_train_1000, Y_train_1000, X_test_1000, Y_test_1000,
     feature_names_1000) = features.ExtractCollocationFeatures(
         "train_dataset.csv", "test_dataset.csv", "X_train_1000.txt",
         "X_test_1000.txt", 5, 1000)

    print("Creating total matrices...")
    feature_words = vectorizer.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
    total_features = feature_words + feature_names_1000
    X_train_t = hstack((X_train_w, X_train_1000))
    X_test_t = hstack((X_test_w, X_test_1000))

    return (X_train_w, Y_train_w, X_test_w, Y_test_w,
            X_train_1000, Y_train_1000, X_test_1000, Y_test_1000,
            X_train_t, X_test_t, vectorizer, feature_words,
            feature_names_1000, total_features)
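
# Note on the total matrices built above: scipy.sparse.hstack typically
# returns a COO matrix; most scikit-learn estimators accept it directly,
# but converting to CSR is a common follow-up when row slicing is needed:
#
# X_train_t = X_train_t.tocsr()
# X_test_t = X_test_t.tocsr()
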
def crossValidationExperiment():

    # This method implements an experiment in which we apply 10-fold
    # cross-validation on the train dataset for both the Naive Bayes and
    # Logistic Regression models.

    (X_train, Y_train, X_test, Y_test, vectorizer,
     feature_names) = features.ExtractWordFeatures(
         "speeches_110_dwnominate_nonames.txt",
         "speeches_112_dwnominate_nonames.txt",
         vectorizer_type="CountVectorizer",
         ngrams=None,
         balance_dataset=False,
         remove_center_interval=None)
    nb = MultinomialNB()
    scores_nb = cross_val_score(nb, X_train, Y_train, cv=10)
    score_nb = np.average(scores_nb)
    print("Accuracy NB: ", score_nb)

    lr = LogisticRegression(solver='lbfgs', max_iter=1000)
    scores_lr = cross_val_score(lr, X_train, Y_train, cv=10)
    score_lr = np.average(scores_lr)
    print("Accuracy LR: ", score_lr)
def differentSetsCominationsExperiment():

    # In this experiment we introduce more congresses into our datasets,
    # trying different train-test combinations with data from the 109th,
    # 110th, 111th, and 112th Congresses. Every combination is evaluated
    # with the BoW model and both the Naive Bayes and Logistic Regression
    # algorithms.

    congresses = ["109", "110", "111", "112"]

    print("Only one congress for training.")
    for i in range(0, len(congresses)):
        train_congress = congresses[i]
        for j in range(i + 1, len(congresses)):
            test_congress = congresses[j]
            print(train_congress, test_congress)
            (X_train, Y_train, X_test, Y_test, vectorizer,
             feature_names) = features.ExtractWordFeatures(
                 "speeches_" + train_congress + "_dwnominate_nonames.txt",
                 "speeches_" + test_congress + "_dwnominate_nonames.txt",
                 vectorizer_type="CountVectorizer",
                 ngrams=None,
                 balance_dataset=False,
                 remove_center_interval=None)
            evaluation.Evaluate(
                X_train, Y_train, X_test, Y_test,
                "speeches_" + test_congress + "_dwnominate_nonames.txt",
                feature_names)

    print("Two congresses for training")
    for i in range(0, len(congresses) - 1):
        train_congress_1 = congresses[i]
        train_congress_2 = congresses[i + 1]
        train_dataset_df_1 = features.datasetTodf(
            "../datasets/train/speeches_" + train_congress_1 +
            "_dwnominate_nonames.txt")
        train_dataset_df_2 = features.datasetTodf(
            "../datasets/train/speeches_" + train_congress_2 +
            "_dwnominate_nonames.txt")
        train_dataset_df = pd.concat([train_dataset_df_1, train_dataset_df_2],
                                     ignore_index=True)

        for j in range(i + 2, len(congresses)):
            test_congress = congresses[j]
            test_dataset_df = features.datasetTodf(
                "../datasets/test/speeches_" + test_congress +
                "_dwnominate_nonames.txt")
            print("Train:", train_congress_1, train_congress_2, "Test:",
                  test_congress)
            (X_train, Y_train, X_test, Y_test, vectorizer,
             feature_names) = features.ExtractWordFeaturesWithDataframes(
                 train_dataset_df,
                 test_dataset_df,
                 vectorizer_type="CountVectorizer",
                 ngrams=None,
                 balance_dataset=False,
                 remove_center_interval=None)
            evaluation.Evaluate(
                X_train, Y_train, X_test, Y_test,
                "speeches_" + test_congress + "_dwnominate_nonames.txt",
                feature_names)

    print("Three congresses for training")
    train_congresses = congresses[:3]
    train_datasets = []
    for congress in train_congresses:
        train_dataset_df_i = features.datasetTodf(
            "../datasets/train/speeches_" + congress +
            "_dwnominate_nonames.txt")
        train_datasets.append(train_dataset_df_i)
    train_dataset_df = pd.concat(train_datasets, ignore_index=True)

    test_congress = congresses[-1]
    test_dataset_df = features.datasetTodf("../datasets/test/speeches_" +
                                           test_congress +
                                           "_dwnominate_nonames.txt")

    print("Train:", train_congresses, "Test:", test_congress)

    (X_train, Y_train, X_test, Y_test, vectorizer,
     feature_names) = features.ExtractWordFeaturesWithDataframes(
         train_dataset_df,
         test_dataset_df,
         vectorizer_type="CountVectorizer",
         ngrams=None,
         balance_dataset=False,
         remove_center_interval=None)
    evaluation.Evaluate(
        X_train, Y_train, X_test, Y_test,
        "speeches_" + test_congress + "_dwnominate_nonames.txt", feature_names)