def plot_confusion_matrix(confusion_matrix_array):
    print('###### Start Confusion Matrix ####')
    print(confusion_matrix_array)

    save_report_to_csv(REPORT_FOLDER + get_model_name_by_file(VALIDATION_FILE) + '_confusion_report.csv', [
        'MultinomialNB',
        get_model_name_by_file(MODEL_FILE),
        confusion_matrix_array[0][0],
        confusion_matrix_array[0][1],
        confusion_matrix_array[1][0],
        confusion_matrix_array[1][1]
    ])

    print('###### End Confusion Matrix ####')

    df_cm = pd.DataFrame(confusion_matrix_array, range(2), range(2))
    #plt.figure(figsize=(10, 7))
    plot = df_cm.plot()
    fig = plot.get_figure()

    ax = plt.subplot()
    sn.heatmap(df_cm, annot=True, fmt='g', ax=ax, annot_kws={"size": 16})  # font size

    # labels, title and ticks
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Real')
    ax.yaxis.set_ticklabels(['Non Political', 'Political'])
    ax.xaxis.set_ticklabels(['Non Political', 'Political'])

    model_name = MODEL_FILE
    model_name = model_name.replace('.politics_ben.skl', '')
    model_name = model_name.replace(SKL_FOLDER, '')

    ax.set_title(model_name.replace('_', ' ').upper())
    fig.add_subplot(ax)
    fig.savefig(PLOT_FOLDER + 'confusion_matrix_publica_' + model_name + '.png', dpi=400)
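# --- Illustrative usage (a sketch, not part of the original pipeline) ------
# Assumes y_true and y_pred are 0/1 label arrays from a validation run like
# the ones further below, and that the REPORT_FOLDER / PLOT_FOLDER / MODEL_FILE
# globals used inside plot_confusion_matrix are already configured.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred)  # rows = real labels, columns = predicted
plot_confusion_matrix(cm)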
def train_CNN(X, y, inp_dim, model, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):
    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
    print(cv_object)

    p, r, f1 = [], [], []
    p1, r1, f11 = 0., 0., 0.
    p_class, r_class, f1_class = [], [], []
    macro_f1, macro_r, macro_p = [], [], []
    sentence_len = X.shape[1]

    for train_index, test_index in cv_object.split(X):
        if INITIALIZE_WEIGHTS_WITH == "word2vec":
            model.layers[0].set_weights([weights])
        elif INITIALIZE_WEIGHTS_WITH == "random":
            shuffle_weights(model)
        else:
            print("ERROR!")
            return

        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))

        for epoch in range(epochs):
            for X_batch in batch_gen(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]

                class_weights = None
                if SCALE_LOSS_FUN:
                    class_weights = {}
                    for cw in range(len(set(tx_class))):
                        class_weights[cw] = np.where(y_temp == cw)[0].shape[0] / float(len(y_temp))

                try:
                    y_temp = np_utils.to_categorical(y_temp, num_classes=len(set(tx_class)))
                except Exception as e:
                    print(e)
                    print(y_temp)

                loss, acc = model.train_on_batch(x, y_temp, class_weight=class_weights)

        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)

        p.append(precision_score(y_test, y_pred, average='weighted'))
        p1 += precision_score(y_test, y_pred, average='micro')
        p_class.append(precision_score(y_test, y_pred, average=None))

        r.append(recall_score(y_test, y_pred, average='weighted'))
        r1 += recall_score(y_test, y_pred, average='micro')
        r_class.append(recall_score(y_test, y_pred, average=None))

        f1.append(f1_score(y_test, y_pred, average='weighted'))
        f11 += f1_score(y_test, y_pred, average='micro')
        f1_class.append(f1_score(y_test, y_pred, average=None))

        macro_p.append(precision_score(y_test, y_pred, average='macro'))
        macro_r.append(recall_score(y_test, y_pred, average='macro'))
        macro_f1.append(f1_score(y_test, y_pred, average='macro'))

    print("macro results are")
    print("average precision is %f" % (np.array(p).mean()))
    print("average recall is %f" % (np.array(r).mean()))
    print("average f1 is %f" % (np.array(f1).mean()))

    save_report_to_csv(REPORT_FOLDER + 'CNN_training_report.csv', [
        'CNN',
        get_model_name_by_file(POLITICS_FILE),

        # weighted scores
        np.array(p).mean(), np.array(p).std() * 2,
        np.array(r).mean(), np.array(r).std() * 2,
        np.array(f1).mean(), np.array(f1).std() * 2,

        # macro scores
        np.array(macro_p).mean(), np.array(macro_p).std() * 2,
        np.array(macro_r).mean(), np.array(macro_r).std() * 2,
        np.array(macro_f1).mean(), np.array(macro_f1).std() * 2,

        # by-class scores
        np.array(p_class)[:, 0].mean(), np.array(p_class)[:, 1].mean(),
        np.array(r_class)[:, 0].mean(), np.array(r_class)[:, 1].mean(),
        np.array(f1_class)[:, 0].mean(), np.array(f1_class)[:, 1].mean()
    ])

    print("micro results are")
    print("average precision is %f" % (p1 / NO_OF_FOLDS))
    print("average recall is %f" % (r1 / NO_OF_FOLDS))
    print("average f1 is %f" % (f11 / NO_OF_FOLDS))
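# --- Illustrative usage (a sketch, not from the original code) --------------
# Assumes X is a padded matrix of word indices and y the integer labels.
# cnn_model(), get_embedding_weights(), vocab and EMBEDDING_DIM are
# hypothetical names standing in for however the CNN and its embedding matrix
# are built elsewhere in the project; INITIALIZE_WEIGHTS_WITH must be set to
# "word2vec" or "random" before the call.
cnn = cnn_model(sentence_len=X.shape[1], embedding_dim=EMBEDDING_DIM)
embedding_weights = get_embedding_weights(vocab)
train_CNN(X, y, inp_dim=EMBEDDING_DIM, model=cnn, weights=embedding_weights)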
path = dict(cf.items("file_path"))
dir_w2v = path['dir_w2v']
dir_in = path['dir_in']

texts, y_true = load_validation_file_csv(VALIDATION_FILE)

print('Loading ' + MODEL_FILE + ' file...')
model = joblib.load(MODEL_FILE)
vectorizer = get_vectorizer()

pol = ''
n_pol = ''
y_pred = list()

mean_auc, std_auc = generate_roc_curve(model, texts, y_true, MODEL_FILE, get_model_name_by_file(VALIDATION_FILE))

print('Predicting...')
X = vectorizer.transform(texts)
y_pred = model.predict(X)

print('Classification Report')
print(classification_report(y_true, y_pred))

p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
ff1 = f1_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
f1_macro = f1_score(y_true, y_pred, average='macro')
n_pol += text + '\n'
y_pred.append(0)

print(classification_report(y_true, y_pred))

p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)
ff1 = f1_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
f1_macro = f1_score(y_true, y_pred, average='macro')
recall_macro = recall_score(y_true, y_pred, average='macro')
precision_macro = precision_score(y_true, y_pred, average='macro')
accuracy = accuracy_score(y_true, y_pred)

generate_normal(X, y_true)

mean_auc, std_auc = generate_roc_curve(X, y_true, get_model_name_by_file(VALIDATION_FILE))

#plot_confusion_matrix(confusion_matrix(y_true, y_pred))

save_report_to_csv(REPORT_FOLDER + 'CNN_validation_report.csv', [
    'CNN',
    get_model_name_by_file(H5_FILE),
    get_model_name_by_file(VALIDATION_FILE),
    accuracy,
    p[0], p[1],
    r[0], r[1],
    f1[0], f1[1],
    s[0], s[1],
    f1_macro,
    recall_macro,
    precision_macro,
    mean_auc,
    std_auc,
    ff1,
    recall,
    precision
])
    dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")

texts, y_true = load_validation_file_csv(VALIDATION_FILE)

print('Loading ' + MODEL_FILE + ' file...')
model = joblib.load(MODEL_FILE)

pol = ''
n_pol = ''
y_pred = list()

tp = TextProcessor()
texts = tp.text_process(texts, text_only=True)
X = gen_data(texts)

mean_auc, std_auc = generate_roc_curve(model, X, y_true, MODEL_FILE, get_model_name_by_file(VALIDATION_FILE))

print('Predicting...')
y_pred = model.predict(X)

print('Classification Report')
print(classification_report(y_true, y_pred))

p, r, f1, s = precision_recall_fscore_support(y_true, y_pred)

model_name = MODEL_FILE.replace(SKL_FOLDER, '')
model_name = model_name.replace('.politics_ben.skl', '')
model_name = model_name.replace('_', ' ').upper()

generate_normal(model, X, y_true, model_name)
def train_classifier(classifier, vectorizer, data):
    train, test = train_test_split(data, test_size=0.1, random_state=SEED, shuffle=True)
    x_train, y_train = zip(*train)
    x_test, y_test = zip(*test)

    x_train = vectorizer.transform(x_train)
    x_test = vectorizer.transform(x_test)

    x_train, y_train = equalize_classes(x_train, y_train)
    print("final size of training data: %s" % x_train.shape[0])

    classifier.fit(x_train, y_train)
    print(classification_report(y_test, classifier.predict(x_test)))

    scores1 = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='precision_weighted')
    predictions = cross_val_predict(classifier.fit(x_train, y_train), x_train, y_train, cv=NO_OF_FOLDS)
    print("Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2))
    precision_score_mean = scores1.mean()
    precision_score_std = scores1.std() * 2

    scores2 = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print("Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2))
    recall_score_mean = scores2.mean()
    recall_score_std = scores2.std() * 2

    scores3 = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print("F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2))
    f1_score_mean = scores3.mean()
    f1_score_std = scores3.std() * 2

    # getting metrics by class
    f1_class = f1_score(y_train, predictions, average=None)
    r_class = recall_score(y_train, predictions, average=None)
    p_class = precision_score(y_train, predictions, average=None)

    f1_macro = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='f1_macro')
    r_macro = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='recall_macro')
    p_macro = cross_val_score(classifier, x_train, y_train, cv=NO_OF_FOLDS, scoring='precision_macro')

    save_report_to_csv(REPORT_FOLDER + 'training_report.csv', [
        'MultinomialNB',
        get_model_name_by_file(POLITICS_FILE),

        # weighted scores
        precision_score_mean, precision_score_std,
        recall_score_mean, recall_score_std,
        f1_score_mean, f1_score_std,

        # macro scores
        f1_macro.mean(), f1_macro.std() * 2,
        r_macro.mean(), r_macro.std() * 2,
        p_macro.mean(), p_macro.std() * 2,

        # class scores
        f1_class[0], f1_class[1],
        r_class[0], r_class[1],
        p_class[0], p_class[1],
    ])

    return classifier
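# --- Illustrative usage (a sketch, not from the original code) --------------
# Assumes `data` is a list of (text, label) pairs and that the vectorizer
# returned by get_vectorizer() has already been fitted on the training texts;
# the output path only mirrors the '.politics_ben.skl' pattern seen above.
from sklearn.naive_bayes import MultinomialNB

clf = train_classifier(MultinomialNB(), get_vectorizer(), data)
joblib.dump(clf, SKL_FOLDER + 'multinomialnb.politics_ben.skl')  # persist the fitted model (path assumed)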
def classification_model(X, Y, model_type=None):
    X, Y = shuffle(X, Y, random_state=SEED)
    print("Model Type:", model_type)

    params = load_hiperparameters(POLITICS_FILE)

    if not params:
        model = GridSearchCV(estimator=get_model(model_type),
                             param_grid=param_grid[model_type],
                             n_jobs=-1, verbose=3)
    else:
        model = get_model(model_type)
        model.set_params(**params)

    model.fit(X, Y)
    predictions = cross_val_predict(model, X, Y, cv=NO_OF_FOLDS)

    if params is None:
        try:
            print('\n Best estimator:')
            print(model.best_estimator_)
            save_hiperparameters(POLITICS_FILE, model.best_estimator_)
            print('\n Best hyperparameters:')
            print(model.best_params_)
        except Exception as error:
            print(error)
            print('Nothing to do!')

    scores1 = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='precision_weighted')
    print("Precision(avg): %0.3f (+/- %0.3f)" % (scores1.mean(), scores1.std() * 2))
    precision_score_mean = scores1.mean()
    precision_score_std = scores1.std() * 2

    scores2 = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='recall_weighted')
    print("Recall(avg): %0.3f (+/- %0.3f)" % (scores2.mean(), scores2.std() * 2))
    recall_score_mean = scores2.mean()
    recall_score_std = scores2.std() * 2

    scores3 = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='f1_weighted')
    print("F1-score(avg): %0.3f (+/- %0.3f)" % (scores3.mean(), scores3.std() * 2))
    f1_score_mean = scores3.mean()
    f1_score_std = scores3.std() * 2

    # getting metrics by class
    f1_class = f1_score(Y, predictions, average=None)
    r_class = recall_score(Y, predictions, average=None)
    p_class = precision_score(Y, predictions, average=None)

    f1_macro = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='f1_macro')
    r_macro = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='recall_macro')
    p_macro = cross_val_score(model, X, Y, cv=NO_OF_FOLDS, scoring='precision_macro')

    print(f1_class, r_class, p_class)

    save_report_to_csv(REPORT_FOLDER + model_type + '_training_report.csv', [
        model_type,
        get_model_name_by_file(POLITICS_FILE),

        # weighted scores
        precision_score_mean, precision_score_std,
        recall_score_mean, recall_score_std,
        f1_score_mean, f1_score_std,

        # macro scores
        f1_macro.mean(), f1_macro.std() * 2,
        r_macro.mean(), r_macro.std() * 2,
        p_macro.mean(), p_macro.std() * 2,

        # by class
        f1_class[0], f1_class[1],
        r_class[0], r_class[1],
        p_class[0], p_class[1],
    ])

    return model
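# --- Illustrative usage (a sketch, not from the original code) --------------
# Assumes X is the vectorized feature matrix and Y the label vector, and that
# `model_type` matches a key handled by get_model()/param_grid above; the
# 'svm' key and the output path are only examples, not confirmed by the source.
model = classification_model(X, Y, model_type='svm')
joblib.dump(model, SKL_FOLDER + 'svm.politics_ben.skl')  # persist the tuned model (path pattern assumed)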