def started():
    print("Ok let's go!")

    # Where to find data
    datasource_info = [('newyorktimes', 'data/nyt_discussions.json'),
                       ('motherjones', 'data/motherjones_discussions.json'),
                       ('breitbart', 'data/breitbart_discussions.json')]

    # Load the dataset into memory
    json_text = load_json_files(datasource_info, verbose=True)
    dataset = build_dataset(json_text, featurize_text, verbose=True)

    # Split our data into train and test
    train_dataset, test_dataset = split_dataset(dataset, fraction_train=0.8)

    # Train our classifier
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(train_dataset)

    # Evaluate our classifier, for each class
    performance_string = ('Class {klass} performance: f1={f1:.{digits}}, '
                          'precision={precision:.{digits}}, recall={recall:.{digits}}')
    for klass in sorted(nb_classifier.class_counter):  # sort just for nicer output
        f1, precision, recall = evaluate_classifier(nb_classifier, klass, test_dataset)
        print(performance_string.format(klass=klass, f1=f1, precision=precision,
                                        recall=recall, digits=3))


if __name__ == '__main__':
    started()
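# For reference, a minimal sketch of what the per-class evaluation above could
# compute. This is an assumption, not the original implementation: the
# (features, label) layout of test_dataset and the classify() method are
# hypothetical names.
def evaluate_classifier_sketch(classifier, klass, test_dataset):
    # One-vs-rest counts for the target class
    tp = fp = fn = 0
    for features, label in test_dataset:
        predicted = classifier.classify(features)  # hypothetical predict call
        if predicted == klass and label == klass:
            tp += 1
        elif predicted == klass:
            fp += 1
        elif label == klass:
            fn += 1
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return f1, precision, recall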
def evaluate_ensemble(new, x_test, y_test, x_textReviews_test, y_textReviews_test,
                      datasetOld="", d="", rep="", exper=""):
    # Only the rebound DataFrames need a global declaration; the classifiers are
    # only read and the acc*Hist* lists are module-level and mutated in place.
    global dfA, dfN, dfALL

    # Evaluate each base classifier on the test set
    acc1, y_predict1 = evaluate_classifier(classifier1, x_test, y_test)
    print("acc1", acc1)
    acc2, y_predict2 = evaluate_classifier(classifier2, x_test, y_test)
    print("acc2", acc2)
    acc3, y_predict3 = evaluate_classifier(classifier3, x_test, y_test)
    print("acc3", acc3)
    acc4, y_predict4 = evaluate_classifier(classifier4, x_test, y_test)
    print("acc4", acc4)

    # Combine the four sets of predictions by voting
    acc_ensemble, recall_ensemble, precision_ensemble, y_voting = voting(
        y_predict1, y_predict2, y_predict3, y_predict4,
        y_test, x_textReviews_test, y_textReviews_test)

    if new:  # New data
        data = pd.DataFrame({'d': [d], 'dataset': [datasetOld], 'acc': [acc_ensemble],
                             'rep': [rep], 'exp': [exper],
                             'recall_ensemble': [recall_ensemble],
                             'precision_ensemble': [precision_ensemble]})
        dfN = pd.concat([dfN, data], ignore_index=True)  # DataFrame.append is deprecated

        data = pd.DataFrame({'State': ['NEW'], 'acc1': [acc1], 'acc2': [acc2],
                             'acc3': [acc3], 'acc4': [acc4], 'd': [d],
                             'dataset': [datasetOld], 'acc': [acc_ensemble],
                             'rep': [rep], 'exp': [exper],
                             'recall_ensemble': [recall_ensemble],
                             'precision_ensemble': [precision_ensemble]})
        dfALL = pd.concat([dfALL, data], ignore_index=True)

        acc1HistTestNew.append(acc1)
        acc2HistTestNew.append(acc2)
        acc3HistTestNew.append(acc3)
        acc4HistTestNew.append(acc4)
        accEnseHistTestNew.append(acc_ensemble)
        print("ACCURACY FOR NEW DATA", acc_ensemble)
    else:  # Old data
        data = pd.DataFrame({'d': [d], 'dataset': [datasetOld], 'acc': [acc_ensemble],
                             'rep': [rep], 'exp': [exper],
                             'recall_ensemble': [recall_ensemble],
                             'precision_ensemble': [precision_ensemble]})
        dfA = pd.concat([dfA, data], ignore_index=True)

        data = pd.DataFrame({'State': ['OLD'], 'acc1': [acc1], 'acc2': [acc2],
                             'acc3': [acc3], 'acc4': [acc4], 'd': [d],
                             'dataset': [datasetOld], 'acc': [acc_ensemble],
                             'rep': [rep], 'exp': [exper],
                             'recall_ensemble': [recall_ensemble],
                             'precision_ensemble': [precision_ensemble]})
        dfALL = pd.concat([dfALL, data], ignore_index=True)

        acc1HistTestOld.append(acc1)
        acc2HistTestOld.append(acc2)
        acc3HistTestOld.append(acc3)
        acc4HistTestOld.append(acc4)
        accEnseHistTestOld.append(acc_ensemble)
        print("ACCURACY FOR OLD DATA", acc_ensemble)

    return acc1, acc2, acc3, acc4, acc_ensemble
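# The voting() helper is not shown above. Here is a minimal sketch of hard
# majority voting under the assumption of binary 0/1 labels; the tie rule and
# the name voting_sketch are assumptions, and the review arguments are kept
# only to mirror the call signature.
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

def voting_sketch(y_predict1, y_predict2, y_predict3, y_predict4,
                  y_test, x_textReviews_test, y_textReviews_test):
    votes = np.stack([np.ravel(y_predict1), np.ravel(y_predict2),
                      np.ravel(y_predict3), np.ravel(y_predict4)])
    # Predict 1 when at least two of the four classifiers vote 1
    # (so a 2-2 tie resolves to class 1 in this sketch).
    y_voting = (votes.sum(axis=0) >= 2).astype(int)
    acc = accuracy_score(y_test, y_voting)
    recall = recall_score(y_test, y_voting)
    precision = precision_score(y_test, y_voting)
    return acc, recall, precision, y_voting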
# Inside the loop over incoming data batches (index i):
np.random.seed(randint(0, 50))
datasetOld.append(dataset[i])
previous_weight = False

if i == 0:
    # All classifiers are trained using the same training set
    x_train, x_test, y_train, y_test, x_textReviews_test, y_textReviews_test = get_train_test(
        dataset=dataset[i], test_size=test_size, convert=False)

    # Store the test set, so we can later check whether the classifiers
    # have forgotten it
    xTestArray.append(x_test)
    yTestArray.append(y_test)
    xRTestArray.append(x_textReviews_test)
    yRTestArray.append(y_textReviews_test)

    # Classifier 1
    classifier1, history1, acc_train1, acc_test1, weights1 = train_classifier1(
        embedding_matrix, x_train, x_test, y_train, y_test, previous_weight)
    acc1, y_predict1 = evaluate_classifier(classifier1, x_test, y_test)
    acc1HistTrainNew.append(acc_train1)  # later decide if it is new or old
    acc1HistTestNew.append(acc_test1)

    # Classifier 2
    classifier2, history2, acc_train2, acc_test2, weights2 = train_classifier2(
        embedding_matrix, x_train, x_test, y_train, y_test, previous_weight)
    acc2, y_predict2 = evaluate_classifier(classifier2, x_test, y_test)
    acc2HistTrainNew.append(acc_train2)
    acc2HistTestNew.append(acc_test2)
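# A hypothetical sketch of how the stored test sets above could be used: after
# training on a later batch, re-evaluate a classifier on every earlier test set
# to quantify forgetting. measure_forgetting and its loop are assumptions, not
# part of the original code.
def measure_forgetting(classifier, xTestArray, yTestArray):
    old_accs = []
    for x_old, y_old in zip(xTestArray, yTestArray):
        acc, _ = evaluate_classifier(classifier, x_old, y_old)
        old_accs.append(acc)
    return old_accs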
print("--------Bag Of Words-------------") #Pre-processing, processing of the text,Feature extraction using Bag of words #When I carried out the experiment I used that bag of words that it was saved, because this process take long time bowReview = bag_of_words(X) save_bow(bowReview, dataset) bowReview = load_bow(dataset) #Split dataset: training and testing x_train, x_test, y_train, y_test, textReviews_train, textReviews_test, textReviews_train, textReviews_test = get_train_test( textReviews=textReviews, X=bowReview, y=y, test_size=0.10) # print("x_train.shape", x_train.shape) # print("x_test.shape", x_test.shape) #Classifier(Nnet) using the bag of words classifier1, history1, acc_train1, acc_test1 = train_classifier1( x_train, x_test, y_train, y_test) #Get accuracy, precsion and recall acc1, y_predict1 = evaluate_classifier(classifier1, x_test, y_test) recall1 = recall_score(y_test, y_predict1) * 100 precision1 = precision_score(y_test, y_predict1) * 100 data = pd.DataFrame({ 'acc': [acc1], 'recall': [recall1], 'precision': [precision1] }) dfBow = dfBow.append(data, ignore_index=True) ########################################### print("--------Embedding-------------") #Pre-processing, processing of the text,Feature extraction using embedding: GloVe #Get embedding matrix and convert reviews to numbers #When I carried out the experiment I used that embedding that it was saved, because this process take long time embedding_matrix, x = embeddings_matrix_glove(textReviews, y)