def get_reward(self, train_x, train_y, train_weights, valid_x, valid_y, test_x, test_y):
    # Fit the environment classifier only on the training examples selected by the weights.
    idx = train_weights == 1
    x = train_x[idx]
    y = train_y[idx]
    self.env.fit(x, np.argmax(y, axis=1).astype('int32'))

    # Cost-sensitive thresholding: predict the positive class when P(y=1) exceeds the
    # per-example ratio of false-positive cost to false-negative cost.
    probs = self.env.predict_proba(train_x)
    preds = (probs[:, 1] > (self.cost_mat_train[:, 0] / self.cost_mat_train[:, 1])).astype('int32')
    train_reward = savings_score(np.argmax(train_y, axis=1).astype('int32'), preds, self.cost_mat_train)

    probs = self.env.predict_proba(valid_x)
    preds = (probs[:, 1] > (self.cost_mat_valid[:, 0] / self.cost_mat_valid[:, 1])).astype('int32')
    valid_reward = savings_score(np.argmax(valid_y, axis=1).astype('int32'), preds, self.cost_mat_valid)

    probs = self.env.predict_proba(test_x)
    preds = (probs[:, 1] > (self.cost_mat_test[:, 0] / self.cost_mat_test[:, 1])).astype('int32')
    test_reward = savings_score(np.argmax(test_y, axis=1).astype('int32'), preds, self.cost_mat_test)

    return train_reward, valid_reward, test_reward
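All of these snippets score predictions with costcla's savings_score, which takes a per-example cost matrix whose four columns are the false-positive, false-negative, true-positive, and true-negative costs. Below is a minimal, self-contained sketch of a direct call; the cost values are made up for illustration and are not taken from the snippet above.

import numpy as np
from costcla.metrics import cost_loss, savings_score

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])

# costcla cost matrix: one row per example, columns [C_FP, C_FN, C_TP, C_TN]
cost_mat = np.zeros((y_true.shape[0], 4))
cost_mat[:, 0] = 3.0     # false-positive cost (e.g. a fixed administrative fee)
cost_mat[:, 1] = 100.0   # false-negative cost (e.g. the lost transaction amount)

print(savings_score(y_true, y_pred, cost_mat))  # savings relative to the cheaper all-0 / all-1 baseline
print(cost_loss(y_true, y_pred, cost_mat))      # total cost incurred by the predictions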
Example #2
    def cost_sensitive(self, y_, y, Amounts):

        # Per-example cost matrix: a fixed false-positive cost of 0.22 and the
        # transaction amount as the false-negative cost.
        costmatrix = np.zeros((y.shape[0], 2))
        costmatrix[:, 0] = np.ones(y.shape[0]) * 0.22
        costmatrix[:, 1] = Amounts

        # Threshold per example: flag as positive when the predicted probability of the
        # positive class exceeds the false-positive / false-negative cost ratio.
        thresholds = costmatrix[:, 0] / costmatrix[:, 1]

        pred = [1 if y_i > threshold else 0 for y_i, threshold in zip(y_[:, 1], thresholds)]
        pred = np.array(pred)

        tp, tn, fp, fn = 0, 0, 0, 0

        for prediction, label in zip(pred, np.argmax(y, axis=1)):

            if label == 1 and prediction == 1:
                tp += 1
            elif label == 0 and prediction == 0:
                tn += 1
            elif label == 1 and prediction == 0:
                fn += 1
            else:
                fp += 1

        r = self.recall(tp, fn)
        p = self.precision(tp, fp)
        s = self.specificity(tn, fp)

        savings = savings_score(np.argmax(y, axis=1), pred, self.cost_matrix)

        return r, p, s, savings
Example #3
    def evaluate(self, y_, y):
        y_ = self.label_transform(y_)
        y = self.label_transform(y)

        tp, tn, fp, fn = 0, 0, 0, 0

        for prediction, label in zip(np.argmax(y_, axis=1), np.argmax(y, axis=1)):

            if label == 1 and prediction == 1:
                tp += 1
            elif label == 0 and prediction == 0:
                tn += 1
            elif label == 1 and prediction == 0:
                fn += 1
            else:
                fp += 1

        r = self.recall(tp, fn)
        p = self.precision(tp, fp)
        s = self.specificity(tn, fp)

        savings = savings_score(np.argmax(y, axis=1), np.argmax(y_, axis=1), self.cost_matrix)

        C_recall, C_precision, C_specificity, C_savings = self.cost_sensitive(y_, y, self.amounts)


        # print(r, p, s, savings, C_recall, C_precision, C_specificity, C_savings)

        self.results.append([r, p, s, savings, C_recall, C_precision, C_specificity, C_savings])
Example #4
def evaluate(y_, y, costmatrix):

    precision = lambda tp, fp: float(tp) / (tp + fp) if (tp + fp) > 0 else 0
    recall = lambda tp, fn: float(tp) / (tp + fn) if (tp + fn) > 0 else 0
    specificity = lambda tn, fp: float(tn) / (tn + fp) if (tn + fp) > 0 else 0
    tp, tn, fp, fn = 0, 0, 0, 0

    for prediction, label in zip(y_, y):

        if label == 1 and prediction == 1:
            tp += 1
        elif label == 0 and prediction == 0:
            tn += 1
        elif label == 1 and prediction == 0:
            fn += 1
        else:
            fp += 1

    r = recall(tp, fn)
    p = precision(tp, fp)
    s = specificity(tn, fp)

    savings = savings_score(y, y_, costmatrix)

    return r, p, s, savings
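A hypothetical call of the evaluate helper above with toy data (the cost values are made up); note that the first argument is the predictions and the second the true labels.

import numpy as np

y_true = np.array([1, 0, 1, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0])

# costcla-style cost matrix: columns [C_FP, C_FN, C_TP, C_TN]
cost_mat = np.zeros((y_true.shape[0], 4))
cost_mat[:, 0] = 1.0    # false-positive cost
cost_mat[:, 1] = 10.0   # false-negative cost

recall, precision, specificity, savings = evaluate(y_pred, y_true, cost_mat)
print(recall, precision, specificity, savings)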
Example #5
def _create_model_summary(model, name, X_test, y_test, cost_matrix_test):
    standard_model_type = type(model)
    if standard_model_type == tuple:
        standard_model, extra_model = model
        extra_model_type = type(extra_model)
        if extra_model_type == BayesMinimumRiskClassifier:
            y_hat_proba = standard_model.predict_proba(X_test)
            y_hat = extra_model.predict(y_hat_proba, cost_matrix_test)
        elif extra_model_type == ThresholdingOptimization:
            y_hat_proba = standard_model.predict_proba(X_test)
            y_hat = extra_model.predict(y_hat_proba)
        else:
            raise ValueError(f'Unknown model type: {extra_model_type}.')
    elif standard_model_type in ECSDT_MODELS:
        y_hat = model.predict(X_test, cost_matrix_test)
    else:
        y_hat = model.predict(X_test)
    return {
        'Name': name,
        'Accuracy': accuracy_score(y_test, y_hat),
        'Precision': precision_score(y_test, y_hat),
        'Recall': recall_score(y_test, y_hat),
        'F1': f1_score(y_test, y_hat),
        'Cost': cost_loss(y_test, y_hat, cost_matrix_test),
        'Savings': savings_score(y_test, y_hat, cost_matrix_test)
    }
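A sketch of how the (base model, BayesMinimumRiskClassifier) tuple handled above might be assembled. The dataset, split, and variable names are illustrative only, and it assumes the sklearn metric imports and ECSDT_MODELS from the original module are in scope.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from costcla.datasets import load_creditscoring1
from costcla.models import BayesMinimumRiskClassifier

data = load_creditscoring1()
X_train, X_test, y_train, y_test, cost_mat_train, cost_mat_test = train_test_split(
    data.data, data.target, data.cost_mat, test_size=0.3, random_state=0)

rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
bmr = BayesMinimumRiskClassifier(calibration=True)
bmr.fit(y_test, rf.predict_proba(X_test))  # calibrate the probabilities, as in costcla's examples

summary = _create_model_summary((rf, bmr), 'RF + BMR', X_test, y_test, cost_mat_test)
print(summary)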
Example #6
def calculate_all_evaluation_metrics(test_label, test_predictions, test_costs):
    """
    Calculate several evaluation metrics using sklearn for a set of
        labels and predictions.
    :param list test_labels: list of true labels for the test data.
    :param list test_predictions: list of risk scores for the test data.
    :return: all_metrics
    :rtype: dict
    """
    all_metrics = dict()
    #test_costs = test_costs.as_matrix()
    # FORMAT FOR DICTIONARY KEY
    # all_metrics["metric|parameter|unit|comment"] OR
    # all_metrics["metric|parameter|unit"] OR
    # all_metrics["metric||comment"] OR
    # all_metrics["metric"]

    cutoffs = [.1, .15, .2, .25, .3, .35, .4, .45, .5, .55,  .6,
               .65, .7, .75, .8, .85, .9]
    for cutoff in cutoffs:
        test_predictions_binary_at_x = generate_binary_at_x(test_predictions, cutoff)
        # confusion matrix
        TP, TN, FP, FN = confusion_matrix_at_x(test_label, test_predictions_binary_at_x)

        all_metrics["true positives@|{}".format(str(cutoff))] = TP
        all_metrics["true negatives@|{}".format(str(cutoff))] = TN
        all_metrics["false positives@|{}".format(str(cutoff))] = FP
        all_metrics["false negatives@|{}".format(str(cutoff))] = FN
        # precision
        all_metrics["precision@|{}".format(str(cutoff))] = TP / ((TP + FP) * 1.0) if (TP + FP) > 0 else 'Null'
        # recall
        all_metrics["recall@|{}".format(str(cutoff))] = TP / ((TP + FN) * 1.0) if (TP + FN) > 0 else 'Null'
        # f1
        all_metrics["f1@|{}".format(str(cutoff))] = (2 * TP) / ((2 * TP + FP + FN) * 1.0) if (TP + FP + FN) > 0 else 'Null'
        # accuracy (note: stored under the "auc" key)
        all_metrics["auc@|{}".format(str(cutoff))] = (TP + TN) / ((TP + TN + FP + FN) * 1.0)
        # cost-sensitive metrics
        all_metrics["savings@|{}".format(str(cutoff))] = savings_score(test_label, test_predictions_binary_at_x, test_costs)
        all_metrics["cost_loss@|{}".format(str(cutoff))] = cost_loss(test_label, test_predictions_binary_at_x, test_costs)

        # The same metrics computed from the cost-based confusion matrix ("ch" variants)
        TP_c, TN_c, FP_c, FN_c = confusion_matrix_cost_at_x(test_label, test_predictions_binary_at_x, test_costs)
        all_metrics["true positives ch@|{}".format(str(cutoff))] = TP_c
        all_metrics["true negatives ch@|{}".format(str(cutoff))] = TN_c
        all_metrics["false positives ch@|{}".format(str(cutoff))] = FP_c
        all_metrics["false negatives ch@|{}".format(str(cutoff))] = FN_c
        all_metrics["precision ch@|{}".format(str(cutoff))] = TP_c / ((TP_c + FP_c) * 1.0) if (TP_c + FP_c) > 0 else 'Null'
        all_metrics["recall ch@|{}".format(str(cutoff))] = TP_c / ((TP_c + FN_c) * 1.0) if (TP_c + FN_c) > 0 else 'Null'
        all_metrics["f1 ch@|{}".format(str(cutoff))] = (2 * TP_c) / ((2 * TP_c + FP_c + FN_c) * 1.0) if (TP_c + FP_c + FN_c) > 0 else 'Null'
        all_metrics["auc ch@|{}".format(str(cutoff))] = (TP_c + TN_c) / ((TP_c + TN_c + FP_c + FN_c) * 1.0)

    return all_metrics
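generate_binary_at_x and confusion_matrix_cost_at_x are helpers from the original module and are not shown here. One plausible reading of generate_binary_at_x, assuming the cutoff is a plain probability threshold on the risk scores (it could equally mean flagging the top x fraction of scores):

import numpy as np

def generate_binary_at_x(test_predictions, cutoff):
    # Hypothetical helper: turn risk scores into 0/1 predictions at a threshold.
    return (np.asarray(test_predictions) >= cutoff).astype(int)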
Example #7
def eval_model(model, model_name, x_train, y_train, x_test, y_test, cost_mat_train, cost_mat_test, dataset_name,
               where_to_pickle_path, cost_flag=True):
    """
    the main function ot run a model on the data set.
    :param model: the model to be used.
    :param model_name: the model name to use for records and logging.
    :param x_train: the training set.
    :param y_train: the training set labels.
    :param x_test:  the test set.
    :param y_test: the test set labels.
    :param cost_mat_train: the cost matrix for the training set.
    :param cost_mat_test: the cost matrix for the test set.
    :param dataset_name:  the name of the data set to use for logging.
    :param where_to_pickle_path: a path to dump the trained models for later use.
    :param cost_flag: a boolean to decide if we need to use the cost matrix or not(random forest doesnt need it)
    :return: a list describing the results of the run. thi will be a single record in the final result file.
    """
    file_name = '{}_{}.sav'.format(model_name, dataset_name)
    print(dataset_name, model_name)
    start_time = time.time()
    if cost_flag:
        model.fit(x_train, y_train, cost_mat_train)
    else:
        model.fit(x_train, y_train)
    file_path = os.path.join(where_to_pickle_path, file_name)
    pickle.dump(model, open(file_path, 'wb'))
    end_time = time.time()
    fit_time = end_time - start_time
    start_time = time.time()
    pred = model.predict(x_test)
    end_time = time.time()
    pred_time = end_time - start_time
    inducer, combiner, num_of_iterations, ne, nf = model_name.split("_")
    return [dataset_name, model_name, inducer, combiner, num_of_iterations, ne, nf, fit_time, pred_time,
            max(0.0, savings_score(y_test, pred, cost_mat_test)),
            f1_score(np.append(y_test, [0, 1]), np.append(pred, [0, 1]))]
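A hypothetical call of eval_model, assuming x_train, y_train, x_test, y_test and the cost matrices are already prepared; the model name follows the inducer_combiner_iterations_ne_nf scheme expected by model_name.split("_") above, and the dataset name and pickle path are made up.

from costcla.models import CostSensitiveRandomForestClassifier

model = CostSensitiveRandomForestClassifier(n_estimators=10)
row = eval_model(model, 'CSRF_majority_1_10_5',
                 x_train, y_train, x_test, y_test,
                 cost_mat_train, cost_mat_test,
                 'creditscoring1', './models', cost_flag=True)
print(row)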
Example #8
#exit(0)
y_train = np.squeeze(y_train)
y_valid = np.squeeze(y_valid)

print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)
#exit(0)
cost_mat_train = np.zeros((X_train.shape[0], 4))
cost_mat_valid = np.zeros((X_valid.shape[0], 4))

cost_mat_train[:, 0] = 3.28
cost_mat_train[:, 1] = 4.0

cost_mat_valid[:, 0] = 3.28
cost_mat_valid[:, 1] = 4.0

clf = CostSensitiveRandomForestClassifier(n_estimators=100)
#clf = DecisionTreeClassifier(random_state=0)
#clf = XGBClassifier(max_depth=1000, eta=0.1)
#clf = KMeans(n_clusters=2, random_state=0)
#clf = svm.SVC(kernel='poly', degree=3)
#clf = KNeighborsClassifier()
#clf = RandomForestClassifier(n_estimators=100, random_state=1)
m = clf.fit(X_train, y_train, cost_mat_train)
print(savings_score(y_train, m.predict(X_train), cost_mat_train))
print(savings_score(y_valid, m.predict(X_valid), cost_mat_valid))

#print(clf.score(X_valid, y_valid))
#print(cross_val_score(clf, X_train, y_train, cv=10))
#print(m.predict(X_valid))

cost_mat_train[:, 3] = 0

cost_mat_test = np.zeros((len(y_test), 4))

cost_mat_test[:, 0] = 2
cost_mat_test[:, 1] = 250
cost_mat_test[:, 2] = 0
cost_mat_test[:, 3] = 0

g = CostSensitiveRandomForestClassifier()
g.fit(np.array(X_train), np.array(y_train), cost_mat_train)
y_pred_rf_cslr = g.predict(np.array(X_test))

print('--------CostSensitiveRandomForestClassifier------')
display_summary(y_test, y_pred_rf_cslr)

cm = confusion_matrix(y_test, y_pred_rf_cslr)
tn, fp, fn, tp = cm.ravel()

plot_confusion_matrix(
    cm,
    ['0', '1'],
)
pr, tpr, fpr = show_data(cm, print_res=1)

# Savings and cost of the cost-sensitive random forest
print("savings_score=", savings_score(y_test, y_pred_rf_cslr, cost_mat_test))
print("cost_loss=", cost_loss(y_test, y_pred_rf_cslr, cost_mat_test))
print('F1_score  =     {:.8f}'.format(
    2 * (((tp / (tp + fp)) * (tp / (tp + fn))) / ((tp / (tp + fp)) +
                                                  (tp / (tp + fn))))))
csdt = CostSensitiveDecisionTreeClassifier(
    criterion='direct_cost',
    criterion_weight=False,
    num_pct=20000,
    max_features=None,
    max_depth=None,
    min_samples_split=30,
    min_samples_leaf=1,
    min_gain=0.01,
    pruned=False)


cost = 0
savings = 0
size = 0
for key, fold in ds.folds.items():
    tree = csdt.fit(fold.x_train, fold.y_train, fold.cost_mat_train)
    print('Fold: ' + str(key))
    printTree(tree.tree_.tree, '', ds.feature_names)
    print('\n')
    y_pred = tree.predict(fold.x_test)
    curr_cost = cost_loss(fold.y_test, y_pred, fold.cost_mat_test)
    curr_savings = savings_score(fold.y_test, y_pred, fold.cost_mat_test)
    cost += curr_cost
    savings += curr_savings
    size += tree.tree_.n_nodes
    
    print(key, curr_cost, curr_savings, tree.tree_.n_nodes)

print ("Summary:", cost/len(ds.folds), savings/len(ds.folds), size/len(ds.folds))