Beispiel #1
0
def started():
    """Train a Naive Bayes classifier on the discussion data and report scores.

    The pipeline loads the JSON discussion files, builds a dataset with
    ``featurize_text``, splits it with ``fraction_train=0.8``, trains a
    ``NaiveBayesClassifier`` on the training part and prints f1, precision
    and recall for every class in ``nb_classifier.class_counter``.

    The same steps run whether the module is imported or executed as a
    script, so no ``__name__`` check is needed here.
    """
    print("Ok let's go!")

    # Pairs of source name and the JSON file holding its discussions
    datasource_info = [
        ('newyorktimes', 'data/nyt_discussions.json'),
        ('motherjones', 'data/motherjones_discussions.json'),
        ('breitbart', 'data/breitbart_discussions.json'),
    ]

    # Load the dataset into memory
    json_text = load_json_files(datasource_info, verbose=True)
    dataset = build_dataset(json_text, featurize_text, verbose=True)

    # Split our data into train and test
    train_dataset, test_dataset = split_dataset(dataset, fraction_train=0.8)

    # Train our classifier
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_dataset)

    # Evaluate our classifier, for each class
    performance_string = 'Class {klass} performance: f1={f1:.{digits}}, precision={precision:.{digits}}, recall={recall:.{digits}}'
    for klass in sorted(nb_classifier.class_counter):  # sort just for nicer output
        f1, precision, recall = evaluate_classifier(nb_classifier, klass,
                                                    test_dataset)
        print(performance_string.format(klass=klass,
                                        f1=f1,
                                        precision=precision,
                                        recall=recall,
                                        digits=3))
def evaluate_ensemble(new,
                      x_test,
                      y_test,
                      x_textReviews_test,
                      y_textReviews_test,
                      datasetOld="",
                      d="",
                      rep="",
                      exper=""):
    """Score the four global classifiers and their voting ensemble on a test set.

    ``classifier1`` .. ``classifier4`` are each scored with
    ``evaluate_classifier`` and their predictions are combined by ``voting``.
    The results are recorded in module-level state:

    * a summary row is added to ``dfN`` (new data) or ``dfA`` (old data),
    * a detailed row tagged ``'NEW'``/``'OLD'`` is added to ``dfALL``,
    * the five accuracies are appended to the matching
      ``acc*HistTestNew`` / ``acc*HistTestOld`` lists.

    Rows are added with ``pd.concat`` because ``DataFrame.append`` no longer
    exists in pandas 2.x.

    Args:
        new: Truthy when the test set counts as new data, falsy for old data.
        x_test: Test inputs handed to ``evaluate_classifier``.
        y_test: Test labels handed to ``evaluate_classifier`` and ``voting``.
        x_textReviews_test: Forwarded unchanged to ``voting``.
        y_textReviews_test: Forwarded unchanged to ``voting``.
        datasetOld: Value stored in the ``'dataset'`` column.
        d: Value stored in the ``'d'`` column.
        rep: Value stored in the ``'rep'`` column.
        exper: Value stored in the ``'exp'`` column.

    Returns:
        The tuple ``(acc1, acc2, acc3, acc4, acc_ensemble)``.
    """
    # Only the frames are rebound; the classifiers are read and the history
    # lists are mutated in place, so they need no ``global`` declaration.
    global dfA, dfN, dfALL

    accuracies = []
    predictions = []
    members = (classifier1, classifier2, classifier3, classifier4)
    for position, member in enumerate(members, start=1):
        accuracy, predicted = evaluate_classifier(member, x_test, y_test)
        print("acc{}".format(position), accuracy)
        accuracies.append(accuracy)
        predictions.append(predicted)
    acc1, acc2, acc3, acc4 = accuracies
    y_predict1, y_predict2, y_predict3, y_predict4 = predictions

    # The voted labels themselves are not needed here, only the scores.
    acc_ensemble, recall_ensemble, precision_ensemble, _ = voting(
        y_predict1, y_predict2, y_predict3, y_predict4, y_test,
        x_textReviews_test, y_textReviews_test)

    state = 'NEW' if new else 'OLD'

    # Columns shared by the per-state frame and the combined frame.
    summary_row = {
        'd': [d],
        'dataset': [datasetOld],
        'acc': [acc_ensemble],
        'rep': [rep],
        'exp': [exper],
        'recall_ensemble': [recall_ensemble],
        'precision_ensemble': [precision_ensemble],
    }
    # The combined frame additionally carries the state tag and the
    # individual classifier accuracies, ahead of the shared columns.
    detail_row = {
        'State': state,
        'acc1': [acc1],
        'acc2': [acc2],
        'acc3': [acc3],
        'acc4': [acc4],
    }
    detail_row.update(summary_row)

    summary_frame = pd.DataFrame(summary_row)
    detail_frame = pd.DataFrame(detail_row)

    if new:
        dfN = pd.concat([dfN, summary_frame], ignore_index=True)
    else:
        dfA = pd.concat([dfA, summary_frame], ignore_index=True)
    dfALL = pd.concat([dfALL, detail_frame], ignore_index=True)

    if new:
        histories = (acc1HistTestNew, acc2HistTestNew, acc3HistTestNew,
                     acc4HistTestNew, accEnseHistTestNew)
    else:
        histories = (acc1HistTestOld, acc2HistTestOld, acc3HistTestOld,
                     acc4HistTestOld, accEnseHistTestOld)
    for history, value in zip(histories, accuracies + [acc_ensemble]):
        history.append(value)

    print("ACCURACY FOR {} DATA".format(state), acc_ensemble)

    return acc1, acc2, acc3, acc4, acc_ensemble
            np.random.seed(randint(0, 50))
            datasetOld.append(dataset[i])
            previous_weight = False
            if i == 0:  #All classifiers are trained using the same training set
                x_train, x_test, y_train, y_test, x_textReviews_test, y_textReviews_test = get_train_test(
                    dataset=dataset[i], test_size=test_size, convert=False)
                # Store the test set so I can later check whether the classifiers have forgotten it
                xTestArray.append(x_test)
                yTestArray.append(y_test)
                xRTestArray.append(x_textReviews_test)
                yRTestArray.append(y_textReviews_test)
                #Classifier 1
                classifier1, history1, acc_train1, acc_test1, weights1 = train_classifier1(
                    embedding_matrix, x_train, x_test, y_train, y_test,
                    previous_weight)
                acc1, y_predict1 = evaluate_classifier(classifier1, x_test,
                                                       y_test)
                #print("acc1", acc1)
                acc1HistTrainNew.append(
                    acc_train1)  #later decide if it is new or old
                acc1HistTestNew.append(acc_test1)

                #Classifier 2
                classifier2, history2, acc_train2, acc_test2, weights2 = train_classifier2(
                    embedding_matrix, x_train, x_test, y_train, y_test,
                    previous_weight)
                acc2, y_predict2 = evaluate_classifier(classifier2, x_test,
                                                       y_test)
                #print("acc2", acc2)
                acc2HistTrainNew.append(acc_train2)
                acc2HistTestNew.append(acc_test2)
Beispiel #4
0
    datasource_info = [('newyorktimes', 'data/nyt_discussions.json'),
                       ('motherjones', 'data/motherjones_discussions.json'),
                       ('breitbart', 'data/breitbart_discussions.json')]

    # Load the dataset into memory
    json_text = load_json_files(datasource_info, verbose=True)
    dataset = build_dataset(json_text, featurize_text, verbose=True)

    # Split our data into train and test
    train_dataset, test_dataset = split_dataset(dataset, fraction_train=0.8)

    # Train our classifier
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_dataset)

    #pdb.set_trace()

    # Evaluate our classifier, for each class
    performance_string = 'Class {klass} performance: f1={f1:.{digits}}, precision={precision:.{digits}}, recall={recall:.{digits}}'
    for klass in sorted(
            nb_classifier.class_counter):  # sort just for nicer output
        f1, precision, recall = evaluate_classifier(nb_classifier, klass,
                                                    test_dataset)

        print(
            performance_string.format(klass=klass,
                                      f1=f1,
                                      precision=precision,
                                      recall=recall,
                                      digits=3))
Beispiel #5
0
    print("--------Bag Of Words-------------")
    # Pre-processing and processing of the text; feature extraction using a bag of words
    # When I ran the experiment I used the saved bag of words, because this process takes a long time
    bowReview = bag_of_words(X)
    save_bow(bowReview, dataset)
    bowReview = load_bow(dataset)
    #Split dataset: training and testing
    x_train, x_test, y_train, y_test, textReviews_train, textReviews_test, textReviews_train, textReviews_test = get_train_test(
        textReviews=textReviews, X=bowReview, y=y, test_size=0.10)
    #    print("x_train.shape", x_train.shape)
    #    print("x_test.shape", x_test.shape)
    #Classifier(Nnet) using the bag of words
    classifier1, history1, acc_train1, acc_test1 = train_classifier1(
        x_train, x_test, y_train, y_test)
    # Get accuracy, precision and recall
    acc1, y_predict1 = evaluate_classifier(classifier1, x_test, y_test)
    recall1 = recall_score(y_test, y_predict1) * 100
    precision1 = precision_score(y_test, y_predict1) * 100
    data = pd.DataFrame({
        'acc': [acc1],
        'recall': [recall1],
        'precision': [precision1]
    })
    dfBow = dfBow.append(data, ignore_index=True)

    ###########################################
    print("--------Embedding-------------")
    # Pre-processing and processing of the text; feature extraction using embeddings (GloVe)
    # Get the embedding matrix and convert the reviews to numbers
    # When I ran the experiment I used the saved embedding, because this process takes a long time
    embedding_matrix, x = embeddings_matrix_glove(textReviews, y)