from sklearn.preprocessing import MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix
from costcla.models import BayesMinimumRiskClassifier
from costcla.metrics import cost_loss


def bayesian_class(train,
                   test,
                   val_train,
                   val_test,
                   auto_calibration=False,
                   calibration_func=None,
                   clf=None,
                   CostMatrix=None,
                   CostMatrixTrain=None):
    # `train`/`test` hold the training features and labels;
    # `val_train`/`val_test` hold the validation features and labels.

    # Fit the scaler on the training features only and reuse it for the
    # validation features to avoid information leakage.
    scaler = MinMaxScaler()
    train = scaler.fit_transform(train)
    val_train = scaler.transform(val_train)

    if calibration_func is None:
        model = clf.fit(train, test)
    else:
        # Calibrate the classifier's probabilities ('sigmoid' or 'isotonic').
        cc = CalibratedClassifierCV(clf, method=calibration_func, cv=3)
        model = cc.fit(train, test)

    # Bayes minimum risk predictions on the validation set.
    prob_test = model.predict_proba(val_train)
    bmr = BayesMinimumRiskClassifier(calibration=auto_calibration)
    pred_test = bmr.predict(prob_test, CostMatrix)

    # Bayes minimum risk predictions on the training set.
    prob_test_train = model.predict_proba(train)
    bmr_train = BayesMinimumRiskClassifier(calibration=auto_calibration)
    pred_train = bmr_train.predict(prob_test_train, CostMatrixTrain)

    print(classification_report(val_test, pred_test))
    loss = cost_loss(val_test, pred_test, CostMatrix)
    print("%d\n" % loss)
    print(confusion_matrix(val_test, pred_test).T)
    return pred_train, pred_test
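
A minimal call sketch for the helper above, assuming the scikit-learn breast-cancer data and a flat per-example cost vector (both choices are illustrative, not part of the original example):

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_tr, X_val, y_tr, y_val = train_test_split(data.data, data.target,
                                            test_size=0.3, random_state=0)

# costcla cost matrices have one row per example: [fp, fn, tp, tn].
cost_val = np.tile([4.0, 1.0, 0.0, 0.0], (len(y_val), 1))
cost_tr = np.tile([4.0, 1.0, 0.0, 0.0], (len(y_tr), 1))

pred_train, pred_val = bayesian_class(X_tr, y_tr, X_val, y_val,
                                      calibration_func="sigmoid",
                                      clf=RandomForestClassifier(random_state=0),
                                      CostMatrix=cost_val,
                                      CostMatrixTrain=cost_tr)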
Example #2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from costcla.metrics import cost_loss


def _cs_report(true, predicted, label_names, cost_matrix) -> None:
    """
    Shows a full cost-sensitive classification report.

    :param true: the true labels.
    :param predicted: the predicted labels.
    :param label_names: the class names.
    :param cost_matrix: the cost matrix.
    """
    # Show a classification report.
    print(classification_report(true, predicted, target_names=label_names))

    # Create a confusion matrix (rows = true labels, columns = predictions).
    matrix = confusion_matrix(true, predicted)

    # Plot the confusion matrix as a heatmap, with the total cost in the title.
    plt.figure(figsize=(8, 8))
    sns.heatmap(matrix, annot=True, fmt='d', linewidths=.1, cmap='YlGnBu',
                cbar=False, xticklabels=label_names, yticklabels=label_names)
    plt.title('Total Classification Cost -> {}'.format(cost_loss(true, predicted, cost_matrix)), fontsize='x-large')
    plt.xticks(fontsize='large')
    plt.yticks(fontsize='large')
    plt.xlabel('Predicted output', fontsize='x-large')
    plt.ylabel('True output', fontsize='x-large')
    plt.savefig(fname='confusion_matrix.png')
    plt.show()
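
A quick way to exercise the report, assuming small placeholder label arrays and a flat [fp, fn, tp, tn] cost matrix (all values below are illustrative):

import numpy as np

y_true = np.array([0, 1, 1, 0, 1, 0])
y_pred = np.array([0, 1, 0, 0, 1, 1])
# One cost row per example, columns ordered as [fp, fn, tp, tn].
cost_mat = np.tile([1.0, 5.0, 0.0, 0.0], (len(y_true), 1))

_cs_report(y_true, y_pred, ['negative', 'positive'], cost_mat)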
Example #3
    def print_metrics(self, y_test, y_pred):
        # `metrics` is assumed to be sklearn.metrics and `cost_metrics` to be
        # costcla.metrics, imported at module level.
        if self.configuration.cost_option == COST_OPTION_MODEL:
            # Build one cost row per label; cost_loss expects an array of
            # shape (n_samples, 4) with columns [fp, fn, tp, tn].
            costs = []

            for current_y in y_test:
                costs_array = self.configuration.cost.costcla_cost_array(
                    current_y)
                costs.append(costs_array)

            costs = np.asarray(costs)

            loss = cost_metrics.cost_loss(y_test, y_pred, costs)
            print("\tCost loss: %f" % loss)

            # binary_classification_metrics expects probabilities as its third
            # argument; this example passes the hard predictions instead.
            bin_class_metrics = cost_metrics.binary_classification_metrics(
                y_test, y_pred, y_pred)
            print("\tBinary classification metrics:", bin_class_metrics)

        accuracy = metrics.accuracy_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred)

        print("\tAccuracy: %f" % accuracy)
        print("\tRecall: %f" % recall)
        print("\tPrecision: %f" % precision)
        print("\tF1: %f" % f1)
Example #4
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score)
from costcla.metrics import cost_loss, savings_score
from costcla.models import BayesMinimumRiskClassifier, ThresholdingOptimization


def _create_model_summary(model, name, X_test, y_test, cost_matrix_test):
    # ECSDT_MODELS is a project-level collection of cost-sensitive classes
    # whose predict() takes the cost matrix as a second argument.
    standard_model_type = type(model)
    if standard_model_type == tuple:
        # A (probabilistic classifier, cost-sensitive wrapper) pair.
        standard_model, extra_model = model
        extra_model_type = type(extra_model)
        if extra_model_type == BayesMinimumRiskClassifier:
            y_hat_proba = standard_model.predict_proba(X_test)
            y_hat = extra_model.predict(y_hat_proba, cost_matrix_test)
        elif extra_model_type == ThresholdingOptimization:
            y_hat_proba = standard_model.predict_proba(X_test)
            y_hat = extra_model.predict(y_hat_proba)
        else:
            raise ValueError(f'Unknown model type: {extra_model_type}.')
    elif standard_model_type in ECSDT_MODELS:
        y_hat = model.predict(X_test, cost_matrix_test)
    else:
        y_hat = model.predict(X_test)
    return {
        'Name': name,
        'Accuracy': accuracy_score(y_test, y_hat),
        'Precision': precision_score(y_test, y_hat),
        'Recall': recall_score(y_test, y_hat),
        'F1': f1_score(y_test, y_hat),
        'Cost': cost_loss(y_test, y_hat, cost_matrix_test),
        'Savings': savings_score(y_test, y_hat, cost_matrix_test)
    }
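
A sketch of how the summary helper might be driven with a (classifier, BayesMinimumRiskClassifier) pair; the dataset, model choice, and cost values are placeholder assumptions:

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from costcla.models import BayesMinimumRiskClassifier

dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target,
                                                    test_size=0.3,
                                                    random_state=0)
# Per-example [fp, fn, tp, tn] cost matrix for the test split.
cost_matrix_test = np.tile([4.0, 1.0, 0.0, 0.0], (len(y_test), 1))

lr = LogisticRegression(max_iter=5000).fit(X_train, y_train)
bmr = BayesMinimumRiskClassifier(calibration=True)
bmr.fit(y_train, lr.predict_proba(X_train))

summary = _create_model_summary((lr, bmr), 'LR + BMR', X_test, y_test,
                                cost_matrix_test)
print(summary)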
Example #5
def predict(f, class_index):
    # read_train, tfIdf, load_labels and calculate_cost_matrix are project
    # helpers: they load the raw documents/labels and build the per-example
    # [fp, fn, tp, tn] cost matrix that costcla's cost_loss expects.
    test_docs_bin = read_train("../Data/test-data.dat")
    X_test = tfIdf(test_docs_bin)

    y_test = load_labels("../Data/test-label.dat", class_index)

    cost_mat_test = calculate_cost_matrix(y_test)

    y_pred_test_cslr = f.predict(X_test)

    # Total misclassification cost of the fitted model `f` on the test set.
    return cost_loss(y_test, y_pred_test_cslr, cost_mat_test)
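
calculate_cost_matrix is not shown in this example; a hypothetical sketch of such a helper, assuming fixed false-positive/false-negative costs (the default values are placeholders):

import numpy as np

def calculate_cost_matrix(y, fp_cost=1.0, fn_cost=2.0):
    """Build a costcla-style cost matrix: one [fp, fn, tp, tn] row per label."""
    y = np.asarray(y)
    cost_mat = np.zeros((len(y), 4))
    cost_mat[:, 0] = fp_cost   # cost of a false positive
    cost_mat[:, 1] = fn_cost   # cost of a false negative
    # Correct predictions (tp, tn) cost nothing.
    return cost_mat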
Example #6
def calculate_all_evaluation_metrics(test_label, test_predictions, test_costs):
    """
    Calculate several evaluation metrics for a set of labels and risk
        scores over a range of score cutoffs.
    :param list test_label: list of true labels for the test data.
    :param list test_predictions: list of risk scores for the test data.
    :param test_costs: per-example [fp, fn, tp, tn] cost matrix for the test data.
    :return: all_metrics
    :rtype: dict
    """
    # generate_binary_at_x, confusion_matrix_at_x and confusion_matrix_cost_at_x
    # are project helpers; savings_score and cost_loss come from costcla.metrics.
    all_metrics = dict()

    # FORMAT FOR DICTIONARY KEY
    # all_metrics["metric|parameter|unit|comment"] OR
    # all_metrics["metric|parameter|unit"] OR
    # all_metrics["metric||comment"] OR
    # all_metrics["metric"]

    cutoffs = [.1, .15, .2, .25, .3, .35, .4, .45, .5, .55, .6,
               .65, .7, .75, .8, .85, .9]
    for cutoff in cutoffs:
        # Binarize the risk scores at the current cutoff.
        test_predictions_binary_at_x = generate_binary_at_x(test_predictions, cutoff)

        # Confusion matrix counts.
        TP, TN, FP, FN = confusion_matrix_at_x(test_label, test_predictions_binary_at_x)
        all_metrics["true positives@|{}".format(str(cutoff))] = TP
        all_metrics["true negatives@|{}".format(str(cutoff))] = TN
        all_metrics["false positives@|{}".format(str(cutoff))] = FP
        all_metrics["false negatives@|{}".format(str(cutoff))] = FN

        # Precision, recall, F1 and accuracy at the cutoff.
        all_metrics["precision@|{}".format(str(cutoff))] = TP / (TP + FP) if (TP + FP) > 0 else 'Null'
        all_metrics["recall@|{}".format(str(cutoff))] = TP / (TP + FN) if (TP + FN) > 0 else 'Null'
        all_metrics["f1@|{}".format(str(cutoff))] = (2 * TP) / (2 * TP + FP + FN) if (TP + FP + FN) > 0 else 'Null'
        all_metrics["accuracy@|{}".format(str(cutoff))] = (TP + TN) / (TP + TN + FP + FN)

        # Cost-sensitive metrics.
        all_metrics["savings@|{}".format(str(cutoff))] = savings_score(test_label, test_predictions_binary_at_x, test_costs)
        all_metrics["cost_loss@|{}".format(str(cutoff))] = cost_loss(test_label, test_predictions_binary_at_x, test_costs)

        # The same counts and metrics computed on the cost-weighted confusion matrix.
        TP_c, TN_c, FP_c, FN_c = confusion_matrix_cost_at_x(test_label, test_predictions_binary_at_x, test_costs)
        all_metrics["true positives ch@|{}".format(str(cutoff))] = TP_c
        all_metrics["true negatives ch@|{}".format(str(cutoff))] = TN_c
        all_metrics["false positives ch@|{}".format(str(cutoff))] = FP_c
        all_metrics["false negatives ch@|{}".format(str(cutoff))] = FN_c
        all_metrics["precision ch@|{}".format(str(cutoff))] = TP_c / (TP_c + FP_c) if (TP_c + FP_c) > 0 else 'Null'
        all_metrics["recall ch@|{}".format(str(cutoff))] = TP_c / (TP_c + FN_c) if (TP_c + FN_c) > 0 else 'Null'
        all_metrics["f1 ch@|{}".format(str(cutoff))] = (2 * TP_c) / (2 * TP_c + FP_c + FN_c) if (TP_c + FP_c + FN_c) > 0 else 'Null'
        all_metrics["accuracy ch@|{}".format(str(cutoff))] = (TP_c + TN_c) / (TP_c + TN_c + FP_c + FN_c)

    return all_metrics
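
generate_binary_at_x and the confusion-matrix helpers are project-specific and not shown; a minimal sketch of the thresholding step they are assumed to perform, using numpy and costcla directly (the scores and costs below are placeholders):

import numpy as np
from costcla.metrics import cost_loss, savings_score

y_true = np.array([0, 1, 1, 0, 1, 0, 1, 0])
scores = np.array([.2, .8, .4, .1, .9, .6, .7, .3])       # risk scores in [0, 1]
costs = np.tile([1.0, 10.0, 0.0, 0.0], (len(y_true), 1))  # [fp, fn, tp, tn]

cutoff = 0.5
y_at_cutoff = (scores >= cutoff).astype(int)   # what generate_binary_at_x is assumed to do

print("cost_loss@%.2f =" % cutoff, cost_loss(y_true, y_at_cutoff, costs))
print("savings@%.2f  =" % cutoff, savings_score(y_true, y_at_cutoff, costs))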
Example #7
        y_pred = clf[0].predict(x_test)
        print("Some evaluation metrics for classifier:\n")
        acc, cflrep, mcm, hamls = metrics(x_train, x_test, y_train, y_test,
                                          y_pred, 0)  #, labels=df.Class)
        #########################
        # Convert y from multilabel to multiclass with the project helper
        # multi_labelTo_multi_class_D, then binarize it against the class
        # (costclass) that the cost applies to.
        b_y_test = multi_labelTo_multi_class_D(y_test, transformer)
        b_y_pred = multi_labelTo_multi_class_D(y_pred, transformer)
        b_y_test = np.where(np.array(b_y_test) == costclass, 1, 0)
        b_y_pred = np.where(np.array(b_y_pred) == costclass, 1, 0)
        # Per-example cost matrix with columns [fp, fn, tp, tn]:
        # a false positive costs costval, a false negative costs 1.
        fp = np.full((b_y_test.shape[0], 1), costval)
        fn = np.full((b_y_test.shape[0], 1), 1)
        tp = np.zeros((b_y_test.shape[0], 1))
        tn = np.zeros((b_y_test.shape[0], 1))
        cost_matrix = np.hstack((fp, fn, tp, tn))
        loss = cost_loss(b_y_test, b_y_pred, cost_matrix)
        ###########################
        all_metrics.update({
            clf[2]: {
                'Accuracy': acc,
                'Classification Report': cflrep,
                'Confusion Matrix': mcm,
                'Hamming Loss': hamls,
                'Cost Loss': loss
            }
        })

    print(
        '\n\n=============================================================================\n'
    )
    print('---------------Final Results-----------------')
Example #8
    RandomForestClassifier(n_estimators=100, random_state=0),
    SVC(kernel='linear', C=1)
]

for name, clf in zip(names, classifiers):
    print(name)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred,
                                target_names=data.target_names))

    # cost_m (a 2x2 class-level cost matrix) and cost_matrix (the per-example
    # [fp, fn, tp, tn] matrix) are assumed to be defined earlier in the script.
    conf_m = confusion_matrix(y_test,
                              y_pred).T  # transpose to align with slides
    print(conf_m)
    # Total cost, first from the confusion matrix and then via costcla.
    print(np.sum(conf_m * cost_m))
    loss = cost_loss(y_test, y_pred, cost_matrix)
    print("%d\n" % loss)
"""Minimizing the expected cost"""

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from costcla.models import BayesMinimumRiskClassifier
from costcla.metrics import cost_loss

X_train, X_test, y_train, y_test = train_test_split(data.data,
                                                    data.target,
                                                    test_size=0.3,
                                                    random_state=0)
# 0 is malignant, 1 is benign
# Per-example cost matrix with columns [fp, fn, tp, tn].
fp = np.full((y_test.shape[0], 1), 4)
fn = np.full((y_test.shape[0], 1), 1)
tp = np.zeros((y_test.shape[0], 1))
tn = np.zeros((y_test.shape[0], 1))
cost_matrix = np.hstack((fp, fn, tp, tn))
# cost_matrix_cv is assumed to be the cost matrix for the corresponding CV split.
def cost_loss_score(y_test, pred_test):
    return cost_loss(y_test, pred_test, cost_matrix_cv)
cost_mat_train = np.zeros((len(y_train), 4))

cost_mat_train[:, 0] = 2
cost_mat_train[:, 1] = 250
cost_mat_train[:, 2] = 0
cost_mat_train[:, 3] = 0

cost_mat_test = np.zeros((len(y_test), 4))

cost_mat_test[:, 0] = 2
cost_mat_test[:, 1] = 250
cost_mat_test[:, 2] = 0
cost_mat_test[:, 3] = 0

from costcla.models import CostSensitiveRandomForestClassifier
from costcla.metrics import savings_score

g = CostSensitiveRandomForestClassifier()
g.fit(np.array(X_train), np.array(y_train), cost_mat_train)
y_pred_rf_cslr = g.predict(np.array(X_test))

print('--------CostSensitiveRandomForestClassifier------')
# display_summary, plot_confusion_matrix and show_data are project helpers.
display_summary(y_test, y_pred_rf_cslr)

cm = confusion_matrix(y_test, y_pred_rf_cslr)
tn, fp, fn, tp = cm.ravel()

plot_confusion_matrix(cm, ['0', '1'])
pr, tpr, fpr = show_data(cm, print_res=1)

# Savings and cost of the cost-sensitive random forest
print("savings_score=", savings_score(y_test, y_pred_rf_cslr, cost_mat_test))
print("cost_loss=", cost_loss(y_test, y_pred_rf_cslr, cost_mat_test))
print('F1_score  =     {:.8f}'.format(
    2 * (((tp / (tp + fp)) * (tp / (tp + fn))) / ((tp / (tp + fp)) +
                                                  (tp / (tp + fn))))))
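
For comparison, the same savings and cost metrics can be computed for a plain, cost-insensitive random forest; a short sketch reusing the variables defined above:

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(np.array(X_train), np.array(y_train))
y_pred_rf = rf.predict(np.array(X_test))

print("savings_score (plain RF)=", savings_score(y_test, y_pred_rf, cost_mat_test))
print("cost_loss (plain RF)=", cost_loss(y_test, y_pred_rf, cost_mat_test))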
Example #11
        tp = np.zeros((y_test.shape[0], 1))
        tn = np.zeros((y_test.shape[0], 1))
        cost_matrix = np.hstack((fp, fn, tp, tn))
        if cim == 2:
            data, target = classimbalance.random_undersampler(data, target)
        elif cim == 3:
            data, target = classimbalance.smote(data, target)
            
        if cm == 1:
            # Probability calibration using Isotonic Method
            cc = CalibratedClassifierCV(clf, method="isotonic", cv=3)
            model = cc.fit(data, target)
            prob_test = model.predict_proba(X_test)
            bmr = BayesMinimumRiskClassifier(calibration=False)
            prediction = bmr.predict(prob_test, cost_matrix)
            loss = cost_loss(y_test[:, e], prediction, cost_matrix)
            pred_BR.append(prediction)
            cost_BR.append(loss)
            
        elif cm == 2:
            # Probability calibration using CostCla calibration            
            model = clf.fit(data, target)
            prob_train = model.predict_proba(data)
            bmr = BayesMinimumRiskClassifier(calibration=True)
            bmr.fit(target, prob_train)
            prob_test = model.predict_proba(X_test)
            prediction = bmr.predict(prob_test, cost_matrix)
            loss = cost_loss(y_test[:, e], prediction, cost_matrix)
            pred_BR.append(prediction)
            cost_BR.append(loss)
from costcla.models import CostSensitiveDecisionTreeClassifier

# Unpruned cost-sensitive decision tree that splits directly on cost.
csdt = CostSensitiveDecisionTreeClassifier(
    criterion='direct_cost',
    criterion_weight=False,
    num_pct=20000,
    max_features=None,
    max_depth=None,
    min_samples_split=30,
    min_samples_leaf=1,
    min_gain=0.01,
    pruned=False)


cost = 0
savings = 0
size = 0
# ds.folds maps fold ids to train/test splits with their cost matrices;
# printTree is a project helper.
for key, fold in ds.folds.items():
    tree = csdt.fit(fold.x_train, fold.y_train, fold.cost_mat_train)
    print('Fold: ' + str(key))
    printTree(tree.tree_.tree, '', ds.feature_names)
    print('\n')
    y_pred = tree.predict(fold.x_test)
    curr_cost = cost_loss(fold.y_test, y_pred, fold.cost_mat_test)
    curr_savings = savings_score(fold.y_test, y_pred, fold.cost_mat_test)
    cost += curr_cost
    savings += curr_savings
    size += tree.tree_.n_nodes

    print(key, curr_cost, curr_savings, tree.tree_.n_nodes)

# Average cost, savings and tree size across folds.
print("Summary:", cost/len(ds.folds), savings/len(ds.folds), size/len(ds.folds))