from costcla.metrics import cost_loss
from costcla.models import BayesMinimumRiskClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler


def bayesian_class(train, test, val_train, val_test, auto_calibration=False,
                   calibration_func=None, clf=None, CostMatrix=None,
                   CostMatrixTrain=None):
    # `train`/`test` hold the training features/labels; `val_train`/`val_test`
    # hold the validation features/labels.
    # Fit the scaler on the training data only and reuse it for the
    # validation data to avoid leakage.
    scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    val_train = scaler.transform(val_train)

    # Optionally wrap the classifier in probability calibration.
    if calibration_func is None:
        model = clf.fit(train, test)
    else:
        cc = CalibratedClassifierCV(clf, method=calibration_func, cv=3)
        model = cc.fit(train, test)

    # Bayes minimum-risk predictions on the validation set.
    prob_test = model.predict_proba(val_train)
    bmr = BayesMinimumRiskClassifier(calibration=auto_calibration)
    pred_test = bmr.predict(prob_test, CostMatrix)

    # Bayes minimum-risk predictions on the training set.
    prob_train = model.predict_proba(train)
    bmr_train = BayesMinimumRiskClassifier(calibration=auto_calibration)
    pred_train = bmr_train.predict(prob_train, CostMatrixTrain)

    print(classification_report(val_test, pred_test))
    loss = cost_loss(val_test, pred_test, CostMatrix)
    print("%d\n" % loss)
    print(confusion_matrix(val_test, pred_test).T)
    return pred_train, pred_test
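# A minimal usage sketch for bayesian_class above (not from the original source):
# it assumes costcla's per-example cost-matrix layout, one row per sample with
# columns [C_FP, C_FN, C_TP, C_TN], and purely illustrative cost values of 4/1/0/0.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def _example_cost_matrix(n_samples, c_fp=4.0, c_fn=1.0):
    # Shape (n_samples, 4): false positive, false negative, true positive,
    # true negative costs for each example.
    return np.hstack((np.full((n_samples, 1), c_fp),
                      np.full((n_samples, 1), c_fn),
                      np.zeros((n_samples, 1)),
                      np.zeros((n_samples, 1))))


X, y = load_breast_cancer(return_X_y=True)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

pred_train, pred_val = bayesian_class(
    X_tr, y_tr, X_val, y_val,
    calibration_func="isotonic",
    clf=RandomForestClassifier(n_estimators=100, random_state=0),
    CostMatrix=_example_cost_matrix(len(y_val)),
    CostMatrixTrain=_example_cost_matrix(len(y_tr)))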
import matplotlib.pyplot as plt
import seaborn as sns
from costcla.metrics import cost_loss
from sklearn.metrics import classification_report, confusion_matrix


def _cs_report(true, predicted, label_names, cost_matrix) -> None:
    """
    Shows a full cost-sensitive classification report.

    :param true: the true labels.
    :param predicted: the predicted labels.
    :param label_names: the class names.
    :param cost_matrix: the cost matrix.
    """
    # Show a classification report.
    print(classification_report(true, predicted, target_names=label_names))
    # Create a confusion matrix with the metrics.
    matrix = confusion_matrix(true, predicted)
    # Create a heatmap of the confusion matrix.
    plt.figure(figsize=(8, 8))
    sns.heatmap(matrix, annot=True, fmt='d', linewidths=.1, cmap='YlGnBu',
                cbar=False, xticklabels=label_names, yticklabels=label_names)
    plt.title('Total Classification Cost -> {}'.format(
        cost_loss(true, predicted, cost_matrix)), fontsize='x-large')
    plt.xticks(fontsize='large')
    plt.yticks(fontsize='large')
    # sklearn's confusion matrix puts true labels on the rows (heatmap y-axis)
    # and predictions on the columns (heatmap x-axis).
    plt.xlabel('Predicted output', fontsize='x-large')
    plt.ylabel('True output', fontsize='x-large')
    plt.savefig(fname='confusion_matrix.png')
    plt.show()
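# A small usage sketch for _cs_report (illustrative, not from the original code):
# true/predicted are binary label arrays and the costs reuse the same per-example
# layout as in the sketch above.
import numpy as np

y_true = np.array([0, 1, 1, 0, 1, 0])
y_hat = np.array([0, 1, 0, 0, 1, 1])
example_costs = np.tile([4.0, 1.0, 0.0, 0.0], (len(y_true), 1))
_cs_report(y_true, y_hat, label_names=['negative', 'positive'],
           cost_matrix=example_costs)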
def print_metrics(self, y_test, y_pred):
    if self.configuration.cost_option == COST_OPTION_MODEL:
        # Build the per-example cost matrix expected by costcla.
        costs = []
        for current_y in y_test:
            costs_array = self.configuration.cost.costcla_cost_array(current_y)
            costs.append(costs_array)
        costs = np.asarray(costs)

        loss = cost_metrics.cost_loss(y_test, y_pred, costs)
        print("\tCost loss: %f" % loss)
        # Note: the hard predictions are reused as the probability argument here.
        bin_class_metrics = cost_metrics.binary_classification_metrics(
            y_test, y_pred, y_pred)
        print("\tBinary classification metrics:", bin_class_metrics)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    print("\tAccuracy: %f" % accuracy)
    print("\tRecall: %f" % recall)
    print("\tPrecision: %f" % precision)
    print("\tF1: %f" % f1)
def _create_model_summary(model, name, X_test, y_test, cost_matrix_test):
    standard_model_type = type(model)
    if standard_model_type == tuple:
        # (base classifier, post-processing model) pairs.
        standard_model, extra_model = model
        extra_model_type = type(extra_model)
        if extra_model_type == BayesMinimumRiskClassifier:
            y_hat_proba = standard_model.predict_proba(X_test)
            y_hat = extra_model.predict(y_hat_proba, cost_matrix_test)
        elif extra_model_type == ThresholdingOptimization:
            y_hat_proba = standard_model.predict_proba(X_test)
            y_hat = extra_model.predict(y_hat_proba)
        else:
            raise ValueError(f'Unknown model type: {extra_model_type}.')
    elif standard_model_type in ECSDT_MODELS:
        # Example-dependent cost-sensitive models take the cost matrix at predict time.
        y_hat = model.predict(X_test, cost_matrix_test)
    else:
        y_hat = model.predict(X_test)

    return {
        'Name': name,
        'Accuracy': accuracy_score(y_test, y_hat),
        'Precision': precision_score(y_test, y_hat),
        'Recall': recall_score(y_test, y_hat),
        'F1': f1_score(y_test, y_hat),
        'Cost': cost_loss(y_test, y_hat, cost_matrix_test),
        'Savings': savings_score(y_test, y_hat, cost_matrix_test)
    }
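# A hedged usage sketch for _create_model_summary (assumed workflow, not taken
# from the original repository): it presumes X_train, y_train, X_test, y_test and
# cost_matrix_test from the surrounding script are in scope, summarises a plain
# sklearn model and a (model, BayesMinimumRiskClassifier) tuple, and collects the
# results into a pandas DataFrame.
import pandas as pd
from costcla.models import BayesMinimumRiskClassifier
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
bmr = BayesMinimumRiskClassifier(calibration=True)
bmr.fit(y_train, rf.predict_proba(X_train))

summaries = [
    _create_model_summary(rf, 'RF', X_test, y_test, cost_matrix_test),
    _create_model_summary((rf, bmr), 'RF + BMR', X_test, y_test, cost_matrix_test),
]
print(pd.DataFrame(summaries).set_index('Name'))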
def predict(f, class_index):
    test_docs_bin = read_train("../Data/test-data.dat")
    X_test = tfIdf(test_docs_bin)
    y_test = load_labels("../Data/test-label.dat", class_index)
    cost_mat_test = calculate_cost_matrix(y_test)
    y_pred_test_cslr = f.predict(X_test)
    return cost_loss(y_test, y_pred_test_cslr, cost_mat_test)
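# calculate_cost_matrix is project-specific and not shown here. A plausible
# (purely hypothetical) implementation matching the costcla column convention
# used by cost_loss could look like this; the 1/5 cost ratio is illustrative.
import numpy as np

def calculate_cost_matrix_sketch(y, c_fp=1.0, c_fn=5.0):
    # One cost row per example; only misclassifications cost anything.
    n = len(y)
    return np.hstack((np.full((n, 1), c_fp),
                      np.full((n, 1), c_fn),
                      np.zeros((n, 1)),
                      np.zeros((n, 1))))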
def calculate_all_evaluation_metrics(test_label, test_predictions, test_costs):
    """
    Calculate several evaluation metrics using sklearn and costcla for a set of
    labels and predictions.

    :param list test_label: true labels for the test data.
    :param list test_predictions: risk scores for the test data.
    :param test_costs: per-example cost matrix for the test data.
    :return: all_metrics
    :rtype: dict
    """
    all_metrics = dict()
    # test_costs = test_costs.as_matrix()

    # FORMAT FOR DICTIONARY KEY
    # all_metrics["metric|parameter|unit|comment"] OR
    # all_metrics["metric|parameter|unit"] OR
    # all_metrics["metric||comment"] OR
    # all_metrics["metric"]

    cutoffs = [.1, .15, .2, .25, .3, .35, .4, .45, .5, .55, .6, .65, .7, .75,
               .8, .85, .9]
    for cutoff in cutoffs:
        test_predictions_binary_at_x = generate_binary_at_x(test_predictions, cutoff)

        # confusion matrix
        TP, TN, FP, FN = confusion_matrix_at_x(test_label, test_predictions_binary_at_x)
        all_metrics["true positives@|{}".format(cutoff)] = TP
        all_metrics["true negatives@|{}".format(cutoff)] = TN
        all_metrics["false positives@|{}".format(cutoff)] = FP
        all_metrics["false negatives@|{}".format(cutoff)] = FN

        # precision, recall, f1 and accuracy ('Null' when undefined)
        all_metrics["precision@|{}".format(cutoff)] = TP / (TP + FP) if (TP + FP) > 0 else 'Null'
        all_metrics["recall@|{}".format(cutoff)] = TP / (TP + FN) if (TP + FN) > 0 else 'Null'
        all_metrics["f1@|{}".format(cutoff)] = 2 * TP / (2 * TP + FP + FN) if (TP + FP + FN) > 0 else 'Null'
        all_metrics["accuracy@|{}".format(cutoff)] = (TP + TN) / (TP + TN + FP + FN)

        # cost-sensitive metrics
        all_metrics["savings@|{}".format(cutoff)] = savings_score(
            test_label, test_predictions_binary_at_x, test_costs)
        all_metrics["cost_loss@|{}".format(cutoff)] = cost_loss(
            test_label, test_predictions_binary_at_x, test_costs)

        # The same metrics computed from the cost-based confusion matrix (the 'ch' variants).
        TP_c, TN_c, FP_c, FN_c = confusion_matrix_cost_at_x(
            test_label, test_predictions_binary_at_x, test_costs)
        all_metrics["true positives ch@|{}".format(cutoff)] = TP_c
        all_metrics["true negatives ch@|{}".format(cutoff)] = TN_c
        all_metrics["false positives ch@|{}".format(cutoff)] = FP_c
        all_metrics["false negatives ch@|{}".format(cutoff)] = FN_c
        all_metrics["precision ch@|{}".format(cutoff)] = TP_c / (TP_c + FP_c) if (TP_c + FP_c) > 0 else 'Null'
        all_metrics["recall ch@|{}".format(cutoff)] = TP_c / (TP_c + FN_c) if (TP_c + FN_c) > 0 else 'Null'
        all_metrics["f1 ch@|{}".format(cutoff)] = 2 * TP_c / (2 * TP_c + FP_c + FN_c) if (TP_c + FP_c + FN_c) > 0 else 'Null'
        all_metrics["accuracy ch@|{}".format(cutoff)] = (TP_c + TN_c) / (TP_c + TN_c + FP_c + FN_c)

    return all_metrics
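# generate_binary_at_x and confusion_matrix_at_x are project helpers that are not
# shown in this snippet. One plausible (purely hypothetical) reading is a plain
# score threshold plus raw TP/TN/FP/FN counts, sketched below; the original
# helpers may instead flag the top x% of the scores.
import numpy as np

def generate_binary_at_x_sketch(scores, cutoff):
    # 1 if the predicted risk score reaches the cutoff, else 0.
    return (np.asarray(scores) >= cutoff).astype(int)

def confusion_matrix_at_x_sketch(y_true, y_binary):
    y_true = np.asarray(y_true)
    y_binary = np.asarray(y_binary)
    tp = int(np.sum((y_true == 1) & (y_binary == 1)))
    tn = int(np.sum((y_true == 0) & (y_binary == 0)))
    fp = int(np.sum((y_true == 0) & (y_binary == 1)))
    fn = int(np.sum((y_true == 1) & (y_binary == 0)))
    return tp, tn, fp, fn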
y_pred = clf[0].predict(x_test)
print("Some evaluation metrics for classifier:\n")
acc, cflrep, mcm, hamls = metrics(x_train, x_test, y_train, y_test, y_pred, 0)  # , labels=df.Class)

#########################
# Convert y from multilabel to multiclass, then binarise against the cost class.
b_y_test = multi_labelTo_multi_class_D(y_test, transformer)
b_y_pred = multi_labelTo_multi_class_D(y_pred, transformer)
b_y_test = np.where(np.array(b_y_test) == costclass, 1, 0)
b_y_pred = np.where(np.array(b_y_pred) == costclass, 1, 0)

# Per-example cost matrix in costcla's [C_FP, C_FN, C_TP, C_TN] column order.
fp = np.full((b_y_test.shape[0], 1), costval)
fn = np.full((b_y_test.shape[0], 1), 1)
tp = np.zeros((b_y_test.shape[0], 1))
tn = np.zeros((b_y_test.shape[0], 1))
cost_matrix = np.hstack((fp, fn, tp, tn))
loss = cost_loss(b_y_test, b_y_pred, cost_matrix)
###########################

all_metrics.update({
    clf[2]: {
        'Accuracy': acc,
        'Classification Report': cflrep,
        'Confusion Matrix': mcm,
        'Hamming Loss': hamls,
        'Cost Loss': loss
    }
})

print('\n\n=============================================================================\n')
print('---------------Final Results-----------------')
    RandomForestClassifier(n_estimators=100, random_state=0),
    SVC(kernel='linear', C=1)
]

for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=data.target_names))
    conf_m = confusion_matrix(y_test, y_pred).T  # transpose to align with slides
    print(conf_m)
    print(np.sum(conf_m * cost_m))
    loss = cost_loss(y_test, y_pred, cost_matrix)
    print("%d\n" % loss)

"""Minimizing the expected cost"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from costcla.models import BayesMinimumRiskClassifier

X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    test_size=0.3, random_state=0)

# 0 is malignant, 1 is benign
# Per-example costs: fp, fn, tp, tn
fp = np.full((y_test.shape[0], 1), 4)
fn = np.full((y_test.shape[0], 1), 1)
def cost_loss_score(y_test, pred_test):
    return cost_loss(y_test, pred_test, cost_matrix_cv)
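# cost_loss_score closes over a global cost_matrix_cv, which must be aligned
# row-for-row with y_test. A minimal wiring sketch, assuming y_test and pred_test
# come from an earlier split/model; the 2/5 costs are illustrative assumptions.
import numpy as np

cost_matrix_cv = np.hstack((np.full((len(y_test), 1), 2.0),  # C_FP
                            np.full((len(y_test), 1), 5.0),  # C_FN
                            np.zeros((len(y_test), 1)),      # C_TP
                            np.zeros((len(y_test), 1))))     # C_TN
print(cost_loss_score(y_test, pred_test))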
cost_mat_train[:, 3] = 0

cost_mat_test = np.zeros((len(y_test), 4))
cost_mat_test[:, 0] = 2    # C_FP
cost_mat_test[:, 1] = 250  # C_FN
cost_mat_test[:, 2] = 0    # C_TP
cost_mat_test[:, 3] = 0    # C_TN

g = CostSensitiveRandomForestClassifier()
g.fit(np.array(X_train), np.array(y_train), cost_mat_train)
y_pred_rf_cslr = g.predict(np.array(X_test))

print('--------CostSensitiveRandomForestClassifier------')
display_summary(y_test, y_pred_rf_cslr)
cm = confusion_matrix(y_test, y_pred_rf_cslr)
tn, fp, fn, tp = cm.ravel()
plot_confusion_matrix(cm, ['0', '1'])
pr, tpr, fpr = show_data(cm, print_res=1)

# Savings and cost using only the cost-sensitive random forest
print("savings_score=", savings_score(y_test, y_pred_rf_cslr, cost_mat_test))
print("cost_loss=", cost_loss(y_test, y_pred_rf_cslr, cost_mat_test))
print('F1_score = {:.8f}'.format(
    2 * (((tp / (tp + fp)) * (tp / (tp + fn))) /
         ((tp / (tp + fp)) + (tp / (tp + fn))))))
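# The hand-computed F1 above should match sklearn's implementation; a quick
# cross-check, assuming the same y_test / y_pred_rf_cslr are still in scope:
from sklearn.metrics import f1_score
print('F1_score (sklearn) = {:.8f}'.format(f1_score(y_test, y_pred_rf_cslr)))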
tp = np.zeros((y_test.shape[0], 1))
tn = np.zeros((y_test.shape[0], 1))
cost_matrix = np.hstack((fp, fn, tp, tn))

if cim == 2:
    data, target = classimbalance.random_undersampler(data, target)
elif cim == 3:
    data, target = classimbalance.smote(data, target)

if cm == 1:
    # Probability calibration using the isotonic method
    cc = CalibratedClassifierCV(clf, method="isotonic", cv=3)
    model = cc.fit(data, target)
    prob_test = model.predict_proba(X_test)
    bmr = BayesMinimumRiskClassifier(calibration=False)
    prediction = bmr.predict(prob_test, cost_matrix)
    loss = cost_loss(y_test[:, e], prediction, cost_matrix)
    pred_BR.append(prediction)
    cost_BR.append(loss)
elif cm == 2:
    # Probability calibration using CostCla's built-in calibration
    model = clf.fit(data, target)
    prob_train = model.predict_proba(data)
    bmr = BayesMinimumRiskClassifier(calibration=True)
    bmr.fit(target, prob_train)
    prob_test = model.predict_proba(X_test)
    prediction = bmr.predict(prob_test, cost_matrix)
    loss = cost_loss(y_test[:, e], prediction, cost_matrix)
    pred_BR.append(prediction)
    cost_BR.append(loss)
csdt = CostSensitiveDecisionTreeClassifier(
    criterion='direct_cost', criterion_weight=False, num_pct=20000,
    max_features=None, max_depth=None, min_samples_split=30,
    min_samples_leaf=1, min_gain=0.01, pruned=False)

cost = 0
savings = 0
size = 0
for key, fold in ds.folds.items():
    tree = csdt.fit(fold.x_train, fold.y_train, fold.cost_mat_train)
    print('Fold: ' + str(key))
    printTree(tree.tree_.tree, '', ds.feature_names)
    print('\n')
    y_pred = tree.predict(fold.x_test)
    curr_cost = cost_loss(fold.y_test, y_pred, fold.cost_mat_test)
    curr_savings = savings_score(fold.y_test, y_pred, fold.cost_mat_test)
    cost += curr_cost
    savings += curr_savings
    size += tree.tree_.n_nodes
    print(key, curr_cost, curr_savings, tree.tree_.n_nodes)

print("Summary:", cost / len(ds.folds), savings / len(ds.folds), size / len(ds.folds))
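# costcla's savings_score is documented as 1 - cost / cost_base, with cost_base
# the cheaper of the two trivial policies (predict all zeros or all ones). A
# small per-fold sanity check under that reading:
import numpy as np
from costcla.metrics import cost_loss

def savings_check(y_true, y_pred, cost_mat):
    y_true = np.asarray(y_true)
    cost = cost_loss(y_true, y_pred, cost_mat)
    cost_base = min(cost_loss(y_true, np.zeros_like(y_true), cost_mat),
                    cost_loss(y_true, np.ones_like(y_true), cost_mat))
    return 1.0 - cost / cost_base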