Example #1
    def calculate_auroc(self, X, y_gt, n_steps=100, make_plot=True):
        '''
        This function calculates the AUROC and plots the ROC curve.

        Args:
            X (array-like): Input testing data
            y_gt (array-like): Ground truth labels for testing data
            n_steps (int, default=100): Number of threshold steps to use when
                calculating the curve.
            make_plot (bool, default=True): Whether to plot the ROC curve.

        Returns:
            roc_auc (float): Area under the receiver operating characteristic
                (ROC) curve, i.e. the AUROC.
        '''
        assert self._model_trained, 'Model must be trained prior to calculating AUROC'
        # forward pass for prediction
        X = X.T
        y_hat, _ = self.forward(X)
        y_hat = np.squeeze(y_hat)
        y_gt = np.squeeze(y_gt)
        fpr, tpr, thresholds = skmetrics.roc_curve(y_gt, y_hat)
        roc_auc = skmetrics.auc(fpr, tpr)
        display = skmetrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
        if make_plot:
            display.plot()
            plt.show()
        return roc_auc
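For comparison, recent scikit-learn (1.0+) can build the same plot directly from labels and scores via the RocCurveDisplay.from_predictions classmethod; a minimal self-contained sketch with toy data (not part of the original example):

import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay

# Toy labels and scores, only to keep the snippet self-contained.
y_true = [1, 1, 0, 1, 0, 0]
y_score = [0.9, 0.8, 0.7, 0.6, 0.4, 0.2]

# from_predictions computes fpr, tpr and the AUC internally and plots them.
RocCurveDisplay.from_predictions(y_true, y_score, name='toy model')
plt.show()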
Example #2
 def plot_roc_curve(self, title, outname, **kwargs):
     """ This method works only for binary classification"""
     disp = metrics.RocCurveDisplay(**kwargs)
     disp.plot()
     disp.ax_.set_title(title)
     fname = "/".join((self.outpath, outname))
     plt.savefig(fname)
def SVM_algorithm(X_train, X_test, y_train, y_test: pd.DataFrame):

    from sklearn.svm import SVC
    svm_classifier = SVC(kernel='linear', probability=True)
    svm_classifier.fit(X_train, y_train.values.ravel())
    svm_predi = svm_classifier.predict(X_test)

    print('Confusion matrix:\n{}\n'.format(
        m.confusion_matrix(y_test, svm_predi)))
    print('Classification report SVM:\n{}\n'.format(
        m.classification_report(y_test, svm_predi)))
    print("SVM accuracy: ", m.accuracy_score(y_test, svm_predi))
    print("SVM precision : ", m.precision_score(y_test, svm_predi,
                                                pos_label=1))
    print("SVM recall: ",
          m.recall_score(y_test, svm_predi, average='binary', pos_label=1))
    print(
        "SVM f1 score: ",
        m.f1_score(y_test, svm_predi, labels=np.unique(svm_predi),
                   pos_label=1))

    ######### ROC CURVE FOR SVM ################
    svm_roc = m.roc_auc_score(y_test,
                              svm_classifier.predict_proba(X_test)[:, 1])
    print("roc curve accuracy : ", svm_roc)
    fpr, tpr, thresh = m.roc_curve(y_test,
                                   svm_classifier.predict_proba(X_test)[:, 1],
                                   pos_label=1)
    figure_svm = m.RocCurveDisplay(fpr=fpr,
                                   tpr=tpr,
                                   roc_auc=svm_roc,
                                   estimator_name="SVM")
    figure_svm.plot()
    plt.show()
Example #4
def plot_auc(best_model, y_test, y_hat, output_type='save'):
  '''
  Plot and save ROC_AUC curve.
  '''
  # plt.clf()
  # fig, ax1 = plt.subplots()
  # plot_roc_curve(best_model, x_test, y_test)
  # title = ax1.set_title(textwrap.fill(plot_name, 70))
  # fig.tight_layout()
  # fig.subplots_adjust(top=0.75)  

  # Name plot 
  model_name = str(best_model).split('(')[0]
  plot_name = model_name
  # Find metrics
  fpr, tpr, thresholds = metrics.roc_curve(y_test, y_hat)
  roc_auc = metrics.auc(fpr, tpr)
  display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name=plot_name) 
  display.plot()

  #Save or show plot
  if (output_type == 'save'):
      plt.savefig('ROC_'+ str(plot_name) +'.png')
  elif (output_type == 'show'):
      plt.show()
  plt.close()
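A hypothetical end-to-end call of plot_auc; the synthetic dataset, split, and model below are illustrative assumptions (not from the original code), and metrics / plt are assumed imported as in the snippet above.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build a small synthetic problem and a fitted classifier to feed plot_auc.
X, y = make_classification(n_samples=200, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
best_model = LogisticRegression(max_iter=1000).fit(x_train, y_train)
y_hat = best_model.predict_proba(x_test)[:, 1]  # positive-class scores
plot_auc(best_model, y_test, y_hat, output_type='show')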
Example #5
def roc_curve(
    y_pred: np.ndarray, 
    y_labels: np.ndarray, 
    fp: FreePlot, 
    index: Union[Tuple[int], str] = (0, 0), 
    name: Optional[str] = None,
    estimator_name: Optional[str] = None,
    style: Union[str, Iterable[str]] = "whitegrid",
    dict_: Optional[Dict] = None,
) -> "tpr, fpr, roc_auc":
    """
    y_pred: the prediction
    y_labels: the corresponding labels of instances
    fp: ...
    index: ...
    name: label for the ROC curve; if None, estimator_name is used
    estimator_name: the name of the classifier
    style: the seaborn style to use
    dict_: the corresponding properties dict
    """
    from sklearn import metrics
    fpr, tpr, thresholds = metrics.roc_curve(y_labels, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, 
                            roc_auc=roc_auc, estimator_name=estimator_name)
    with sns.axes_style(style, dict_):
        display.plot(fp[index], name=name)
    return tpr, fpr, roc_auc
Example #6
def iris_data_metrics_k_fold(trainX, trainY, threshold, classnames):
    print(
        "======================Classification - K-Fold Iterations (K=5)=========================\n"
    )
    k = 5
    for i in range(k):
        print("=============Iteration Number " + str(i + 1) +
              "================")
        k_trainX, k_trainY, k_validationX, k_validationY = k_fold_split(
            trainX, trainY, k, i + 1)
        # train on the k-fold training split and evaluate on its held-out split
        model = LogisticRegression(k_trainX, k_trainY,
                                   np.array([0, 0, 0, 0]),
                                   0.05, 1e-5, 1e-12, 'Iris Data')
        model.train()
        prediction = np.where(
            model.predict(k_validationX).values > threshold, 1, 0)
        confusion_matrix = metrics.confusion_matrix(k_validationY.values,
                                                    prediction)
        tn, fp, fn, tp = confusion_matrix.ravel()
        print('Precision: ',
              metrics.precision_score(k_validationY.values, prediction))
        print('Recall: ',
              metrics.recall_score(k_validationY.values, prediction))
        print('FPR: ', fp / (fp + tn))
        fpr, tpr, _ = metrics.roc_curve(k_validationY.values, prediction)
        roc_auc = metrics.auc(fpr, tpr)
        display = metrics.RocCurveDisplay(fpr=fpr,
                                          tpr=tpr,
                                          roc_auc=roc_auc,
                                          estimator_name='Iris Data')
        display.plot()
        plt.show()
        plot_confusion_matrix(confusion_matrix, classnames, threshold)
Example #7
def _plot_roc_fold(fprs, tprs, i, ax, alpha=.5):
    
    display = metrics.RocCurveDisplay(fpr=fprs[i], tpr=tprs[i],
                                      roc_auc=metrics.auc(fprs[i], tprs[i]),
                                      estimator_name=f'Fold {i+1}')

    display.plot(ax=ax, alpha=alpha)
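A sketch of how _plot_roc_fold might be driven to overlay all folds on one axis; plot_all_folds is a hypothetical helper, and fprs/tprs are assumed to be lists of per-fold arrays returned by metrics.roc_curve.

import matplotlib.pyplot as plt

def plot_all_folds(fprs, tprs):
    # One ROC curve per cross-validation fold, drawn on a shared axis.
    fig, ax = plt.subplots()
    for i in range(len(fprs)):
        _plot_roc_fold(fprs, tprs, i, ax)
    ax.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance diagonal
    ax.set_title('ROC curves across folds')
    plt.show()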
def add_entropy_roc(run, plots_dir):
    # Using the undocumented summary_metrics, because summary doesn't contain
    # histograms.
    entropy_id = run.summary_metrics['valid/entropy']
    entropy_ood = run.summary_metrics['valid/entropy_ood']

    preds_id = histogram_to_preds(entropy_id)
    preds_ood = histogram_to_preds(entropy_ood)

    targets_id = np.zeros_like(preds_id)
    targets_ood = np.ones_like(preds_ood)

    preds = np.concatenate([preds_id, preds_ood])
    targets = np.concatenate([targets_id, targets_ood])

    fpr, tpr, _ = metrics.roc_curve(targets, preds)
    roc_auc = metrics.auc(fpr, tpr)

    if 'valid/entropy_auc' not in run.summary:
        tqdm.write(f"   + ROC AUC: {roc_auc}")
        run.summary['valid/entropy_auc'] = roc_auc

    if plots_dir:
        fig, ax = plt.subplots()
        metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot(ax, name=run.name)

        # FIXME: unfortunately adding plots or images in retrospect is not supported
        # run.summary['valid/entropy_roc'] = wandb.Image(fig)

        # Save the ROC plot locally
        roc_dir = os.path.join(plots_dir, 'entropy_roc')
        os.makedirs(roc_dir, exist_ok=True)
        fig.savefig(os.path.join(roc_dir, f'{run.id}_{run.name}.pdf'))
        plt.close(fig)
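histogram_to_preds is not defined here; one plausible reconstruction, assuming the wandb summary stores a histogram as a dict with 'bins' (edges) and 'values' (counts). The actual layout may vary by wandb version, so treat this as a sketch only.

import numpy as np

def histogram_to_preds(hist):
    # Expand a histogram back into approximate per-sample scores by
    # repeating each bin's midpoint according to its count.
    edges = np.asarray(hist['bins'], dtype=float)
    counts = np.asarray(hist['values'], dtype=int)
    centers = (edges[:-1] + edges[1:]) / 2
    return np.repeat(centers, counts)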
def GAUSSIAN_algorithm(X_train, X_test, y_train, y_test: pd.DataFrame):

    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    gnb.fit(X_train, y_train.values.ravel())
    gnb_predctions = gnb.predict(X_test)

    print('Confusion matrix from GaussianNB:\n{}\n'.format(
        m.confusion_matrix(y_test, gnb_predctions)))
    print('Classification report GaussianNB:\n{}\n'.format(
        m.classification_report(y_test, gnb_predctions)))
    print("GaussianNB accuracy :", m.accuracy_score(y_test, gnb_predctions))
    print("GaussianNB precision : ", m.precision_score(y_test, gnb_predctions))
    print("GaussianNB recall: ", m.recall_score(y_test, gnb_predctions))
    print("GaussianNB f1 score: ",
          m.f1_score(y_test, gnb_predctions, labels=np.unique(gnb_predctions)))

    fpr_gnb, tpr_gnb, _ = m.roc_curve(y_test, gnb.predict_proba(X_test)[:, 1])
    gnb_roc_acc = m.roc_auc_score(y_test, gnb.predict_proba(X_test)[:, 1])
    print("roc curve accuracy : ", gnb_roc_acc)
    bayes_fig = m.RocCurveDisplay(fpr=fpr_gnb,
                                  tpr=tpr_gnb,
                                  estimator_name="BAYES",
                                  roc_auc=gnb_roc_acc)
    bayes_fig.plot()
    plt.show()
def KNN_algorithm(X_train, X_test, y_train, y_test: pd.DataFrame):
    KNN = KNeighborsClassifier(n_neighbors=5, weights='distance', leaf_size=20)
    KNN.fit(X_train, y_train.values.ravel())

    # print(np.mean(cross_val_score(KNN ,X_all_train,y_all_train, cv=5)))

    predictions_from_knn = KNN.predict(X_test)
    print('Confusion matrix from kNN with k=5:\n{}\n'.format(
        m.confusion_matrix(y_test, predictions_from_knn)))
    print('Classification report from kNN with k=5:\n{}\n'.format(
        m.classification_report(y_test, predictions_from_knn)))
    print("kNN (k=5) accuracy: ", m.accuracy_score(y_test, predictions_from_knn))
    print("precision : ", m.precision_score(y_test, predictions_from_knn))
    print("recall: ", m.recall_score(y_test, predictions_from_knn))
    print(
        "f1 score: ",
        m.f1_score(y_test,
                   predictions_from_knn,
                   labels=np.unique(predictions_from_knn)))

    ########## ROC CURVE FOR KNN ################
    roc_acc_knn_re = m.roc_auc_score(y_test, KNN.predict_proba(X_test)[:, 1])
    print("roc curve accuracy : ", roc_acc_knn_re)
    fpr, tpr, thres = m.roc_curve(y_test, KNN.predict_proba(X_test)[:, 1])
    figure = m.RocCurveDisplay(fpr=fpr,
                               tpr=tpr,
                               roc_auc=roc_acc_knn_re,
                               estimator_name='KNN')
    figure.plot()
    plt.title("roc curve kNN")
    plt.show()
Example #11
def plot_ROC(fpr, tpr):
    """Plot ROC curve."""
    roc_auc = metrics.auc(fpr, tpr)
    display = metrics.RocCurveDisplay(fpr=fpr,
                                      tpr=tpr,
                                      roc_auc=roc_auc,
                                      estimator_name='TransposonFinder')
    display.plot()
    plt.show()
def eval_from_save(output_folder):
    folder_path = Path(output_folder)
    novel_true = torch.load(folder_path / "test_novel_true.pt")
    novel_score = torch.load(folder_path / "test_novel_score.pt")
    # upsample normal data so it accounts for 3/4 of the weight, roughly the split of an episode
    # should affect PRC but not ROC
    norm_count = torch.sum(novel_true == 0)
    novel_count = torch.sum(novel_true == 1)
    weight = torch.ones_like(novel_score)
    weight[novel_true == 0] = 3 * novel_count / norm_count
    # ROC with 1 as novel target
    fpr, tpr, roc_threshs = metrics.roc_curve(novel_true,
                                              novel_score,
                                              sample_weight=weight)
    auroc = metrics.roc_auc_score(novel_true,
                                  novel_score,
                                  sample_weight=weight)
    metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=auroc).plot()
    plt.savefig(folder_path / "roc.png")
    plt.close()
    print(f"AUROC: {auroc}")
    # TNR at TPR 95%
    tpr_95_ind = np.argwhere(tpr >= .95)[0]
    print(f"TNR @ TPR {tpr[tpr_95_ind][0]}%: {1 - fpr[tpr_95_ind][0]}")
    # PRC with 1 as novel target
    precision, recall, prc_threshs = metrics.precision_recall_curve(
        novel_true, novel_score, sample_weight=weight)
    prc_threshs = np.hstack([prc_threshs, prc_threshs[-1] + 1e-4
                             ])  # extra thresh to match lens
    av_p = metrics.average_precision_score(novel_true,
                                           novel_score,
                                           sample_weight=weight)
    metrics.PrecisionRecallDisplay(precision=precision,
                                   recall=recall,
                                   average_precision=av_p).plot()
    plt.savefig(folder_path / "prc.png")
    plt.close()
    print(f"Average Precision: {av_p}")
    # recall at precision 80%
    precision_80_ind = np.argwhere(precision >= .8)[0]
    print(f"Recall(TPR) @ Precision {precision[precision_80_ind][0]}%: " +
          f"{recall[precision_80_ind][0]}")
    # precision at TPR 95%
    prc_tpr_95_ind = np.argwhere(prc_threshs >= roc_threshs[tpr_95_ind])[0]
    print(
        f"Precision @ TPR {tpr[tpr_95_ind][0]}%: {precision[prc_tpr_95_ind][0]}"
    )
    # TNR at precision 80%
    roc_precision_80_ind = np.argwhere(
        roc_threshs >= prc_threshs[precision_80_ind])[-1]
    print(f"TNR @ Precision {precision[precision_80_ind][0]}%: " +
          f"{1 - fpr[roc_precision_80_ind][0]}")
    print(f"TPR @ Precision {precision[precision_80_ind][0]}%: " +
          f"{tpr[roc_precision_80_ind][0]}")
    return fpr, tpr, auroc, precision, recall, av_p
def plot_roc_curve(validations, predictions):
    for i in range(predictions.shape[1]):
        FP_rates, TP_rates, thresholds = roc_curve(validations[:, i],
                                                   predictions[:, i])
        roc_auc = metrics.auc(FP_rates, TP_rates)
        display = metrics.RocCurveDisplay(fpr=FP_rates,
                                          tpr=TP_rates,
                                          roc_auc=roc_auc,
                                          estimator_name='example estimator')
        display.plot()
        plt.show()
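plot_roc_curve above expects validations and predictions as aligned (n_samples, n_classes) arrays. An illustrative way to produce them for a multi-class problem; the iris data and logistic regression below are assumptions, and roc_curve / metrics are assumed imported as in the surrounding code.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

# One-hot ground truth alongside per-class probability scores, one column per class.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
validations = label_binarize(y_test, classes=[0, 1, 2])
predictions = clf.predict_proba(X_test)
plot_roc_curve(validations, predictions)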
Example #14
def plot_roc(t_pos, f_pos, t_neg, f_neg):
    """
    Plot ROC curve based on previously determined false and true positives.
    """
    f_pos_rate = ratio(f_pos, t_neg)
    t_pos_rate = ratio(t_pos, f_neg)
    roc_auc = metrics.auc(f_pos_rate, t_pos_rate)
    disp = metrics.RocCurveDisplay(fpr=f_pos_rate,
                                   tpr=t_pos_rate,
                                   roc_auc=roc_auc)
    disp = disp.plot()
    return disp.figure_
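The ratio helper used above is not shown; a plausible sketch, assuming t_pos, f_pos, t_neg and f_neg are per-threshold count arrays, so that FPR = FP / (FP + TN) and TPR = TP / (TP + FN).

import numpy as np

def ratio(a, b):
    # Element-wise a / (a + b) over arrays of counts at each threshold.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return a / (a + b)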
Example #15
def plot_aurocs(runs):
    displays = []
    fig, ax = plt.subplots()

    for run_setup, run_name in runs.items():
        dev_preds = pd.read_csv(
            f"../model_checkpoints/{run_name}/meme_dev_seen_preds.csv")
        fpr, tpr, thresholds = metrics.roc_curve(dev_preds['gt'],
                                                 dev_preds['proba'])
        roc_auc = metrics.auc(fpr, tpr)
        display = metrics.RocCurveDisplay(fpr=fpr,
                                          tpr=tpr,
                                          roc_auc=roc_auc,
                                          estimator_name=run_setup)
        display.plot(ax)

    plt.show()
Example #16
def AUROC_cruve(trained_NN, inputs, outputs, Fig = False):
    '''
    This function leverages the trained neural network
    to predict the probability that the target sequence is the candidate,
    and either draws the ROC plot or returns the AUROC score.
    '''
    results = []
    for i in range (len(inputs)):
        
        results.append(trained_NN.test(inputs[i])[0][0])
    #results = results.reshape((result.shape[0]))
    # draw the ROC curve
    fpr, tpr, _ = skl_metrics.roc_curve(outputs, results, pos_label=1)
    roc_display = skl_metrics.RocCurveDisplay(fpr=fpr, tpr=tpr)
    # decide whether to return the AUROC score or the ROC display
    if not Fig:
        
        return skl_metrics.roc_auc_score(outputs, results)
    else:
        
        return roc_display
Example #17
def iris_data_metrics(trainX, trainY, threshold, classnames):
    print(
        "======================Classification - Full Training Set=========================\n"
    )
    model = LogisticRegression(trainX, trainY, np.array([0, 0, 0, 0]), 0.05,
                               1e-5, 1e-12, 'Iris Data')
    model.train()
    prediction = np.where(model.predict(validationX).values > threshold, 1, 0)
    confusion_matrix = metrics.confusion_matrix(validationY.values, prediction)
    tn, fp, fn, tp = confusion_matrix.ravel()
    print('Precision: ', metrics.precision_score(validationY.values,
                                                 prediction))
    print('Recall: ', metrics.recall_score(validationY.values, prediction))
    print('FPR: ', fp / (fp + tn))
    fpr, tpr, _ = metrics.roc_curve(validationY.values, prediction)
    roc_auc = metrics.auc(fpr, tpr)
    display = metrics.RocCurveDisplay(fpr=fpr,
                                      tpr=tpr,
                                      roc_auc=roc_auc,
                                      estimator_name='Iris Data')
    display.plot()
    plt.show()
    plot_confusion_matrix(confusion_matrix, classnames, threshold)
def NN_algorithm(X_train, X_test, y_train, y_test: pd.DataFrame):

    from sklearn.neural_network import MLPClassifier

    # # #‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
    # # #‘relu’, the rectified linear unit function, returns f(x) = max(0, x)
    # # # hidden layer size : the ith element represents the number of neurons in the ith hidden layer.
    nn = MLPClassifier(hidden_layer_sizes=(30, 30, 30),
                       activation="relu",
                       solver='lbfgs',
                       alpha=1e-5,
                       random_state=1,
                       max_iter=1000)
    nn.fit(X_train, y_train.values.ravel())
    nn_predictions = nn.predict(X_test)

    print('Confusion matrix from NN:\n{}\n'.format(
        m.confusion_matrix(y_test, nn_predictions)))
    print('NN classification report:\n{}\n'.format(
        m.classification_report(y_test, nn_predictions)))
    print("NNs accuracy :", m.accuracy_score(y_test, nn_predictions))
    print("NNs precision : ", m.precision_score(y_test, nn_predictions))
    print("NNs recall: ", m.recall_score(y_test, nn_predictions))
    print("NNs f1 score: ",
          m.f1_score(y_test, nn_predictions, labels=np.unique(nn_predictions)))

    #####################  ROC CURVE ACCURACY  #############################################

    fpr_nn, tpr_nn, t_ = m.roc_curve(y_test, nn.predict_proba(X_test)[:, 1])
    nn_roc_acc = m.roc_auc_score(y_test, nn.predict_proba(X_test)[:, 1])
    print("roc curve accuracy : ", nn_roc_acc)
    nn_fig = m.RocCurveDisplay(fpr=fpr_nn,
                               tpr=tpr_nn,
                               estimator_name="NNs",
                               roc_auc=nn_roc_acc)
    nn_fig.plot()
    plt.show()
def construct_eval_model(xtrn,
                         ytrn,
                         xtest,
                         ytest,
                         max_depth,
                         option=3,
                         attribute_value_pairs=None,
                         bag_size=1,
                         type=None):
    """
    creates the requested model, trains and tests the model, and then displays the results.
    """
    print('-' * 30)
    # use our bagging or boosting function
    if option == 0 or option == 1:
        # create ensemble model
        if option == 0:
            start = time.process_time()
            model = bagging(xtrn, ytrn, max_depth, attribute_value_pairs,
                            bag_size)
            end = time.process_time() - start
        else:
            start = time.process_time()
            model = boosting(xtrn, ytrn, max_depth, bag_size,
                             attribute_value_pairs)
            end = time.process_time() - start

        # Compute the test error and display the confusion matrix
        y_pred = [predict_example(x, model, probMode=True) for x in xtest]

        modelName = 'Bagging' if option == 0 else 'AdaBoost'
        probMode = True
        if probMode:
            fpr, tpr, thresholds = metrics.roc_curve(list(ytest), y_pred)
            roc_auc = metrics.auc(fpr, tpr)
            display = metrics.RocCurveDisplay(fpr=fpr,
                                              tpr=tpr,
                                              roc_auc=roc_auc,
                                              estimator_name=type)
            plot = display
        numberOf = ': Number of bags =' if option == 0 else ": Number of learners ="
        print(modelName, numberOf, bag_size, ", Max Depth =", max_depth)
        tst_err = compute_error(list(ytest), y_pred, probMode=True)
        print('Test Error = {0:4.2f}%.'.format(tst_err * 100))
        return plot
        #print('CPU Runtime: {0}'.format(end))

    # use scikit learners
    if option == 2 or option == 3:
        # bagging classifier
        if option == 2:
            start = time.process_time()
            model = BaggingClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=max_depth),
                n_estimators=bag_size,
                random_state=0).fit(xtrn, ytrn)
            end = time.process_time() - start
        # boosting classifier
        else:
            start = time.process_time()
            model = AdaBoostClassifier(
                base_estimator=DecisionTreeClassifier(max_depth=max_depth),
                n_estimators=bag_size,
                random_state=0).fit(xtrn, ytrn)
            end = time.process_time() - start

        # Compute the test error
        y_pred = model.predict(xtest)
        modelName = 'Scikit-Learn Bagging' if option == 2 else 'Scikit-Learn AdaBoostClassifier'
        numberOf = ': Number of bags =' if option == 2 else ": Number of learners ="
        print(modelName, numberOf, bag_size, ", Max Depth =", max_depth)
        tst_err = compute_error(list(ytest), y_pred)
        print('Test Error = {0:4.2f}%.'.format(tst_err * 100))

        #print('CPU Runtime: {0}'.format(end))

    if option == 5:
        tree = id3(np.transpose(xtrn),
                   ytrn,
                   attribute_value_pairs=attribute_value_pairs,
                   max_depth=bag_size)
        model = [[1, tree]]
        y_pred = [predict_example(x, model) for x in xtest]
        modelName = 'Decision Tree Classifier, '
        numberOf = 'max depth of the tree:'
        print(modelName, numberOf, bag_size)
        tst_err = compute_error(list(ytest), y_pred)
        print('Test Error = {0:4.2f}%.'.format(tst_err * 100))
        print("-+-" * 5)
        print(tree)
        print("-+-" * 5, '\n')

    print('-' * 30)
Example #20
#from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn import metrics

pred = [.99, .98, .72, .70, .65, .51, .39, .24, .11, .01]
y = [1, 1, 0, 1, 1, 0, 0, 1, 0, 0]
"""
score = roc_auc_score(y, pred)
fpr, tpr, _ = roc_curve(y, pred, drop_intermediate=False)

print(score, fpr, tpr)

plt.plot(fpr, tpr, marker='.', label='Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()
"""

fpr, tpr, thresholds = metrics.roc_curve(y, pred)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr,
                                  tpr=tpr,
                                  roc_auc=roc_auc,
                                  estimator_name='example estimator')
display.plot()
plt.show()
Example #21
"""## 4. Metrics & Error Measures

4.1 Importing matplotlib for curve visualization
"""

import matplotlib.pyplot as plt
from sklearn import metrics

"""4.2 Performing predcition on trainset by trained model"""

y_train_pred = modelKM.predict(X_train)

"""4.3 Visualizing ROC Curve of trained model on trainset"""

fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='demo estimator')
display.plot()
plt.show()

"""4.4 Calculating the accuracy on trainset"""

print("Accuracy on trainset: ",metrics.accuracy_score(y_train, y_train_pred))

"""4.5 Viewing confusion matirx on trainset"""

print (metrics.confusion_matrix(y_train, y_train_pred))

"""4.6 Viewing homogeneity score on trainset"""

from sklearn.metrics.cluster import homogeneity_score
print('Homogeneity score: ', homogeneity_score(y_train, y_train_pred))
Example #22
def train_eval(config, exp_path):
    dataset = MarkerExpressionDataset(config)
    if dataset.data_clean is not None:
        with open(os.path.join(exp_path, 'dirty_data.txt'), 'w') as f:
            f.write('---data clean method: %s---\n' % dataset.data_clean)
            for marker, item in dataset.outlier_samples.items():
                f.write('marker %s:\n' % marker)
                for class_id in dataset.classes:
                    f.write('class %s:\n' % class_id)
                    for sample_id in item.keys():
                        if item[sample_id]['class'] == class_id:
                            f.write('\t%s\n' % sample_id)

    if dataset.feature_selection is not None or dataset.feature_transformation is not None:
        with open(
                os.path.join(exp_path,
                             'feature_selection_and_transformation.txt'),
                'w') as f:
            if dataset.feature_selection is not None:
                f.write('---feature selection method: %s---\n' %
                        dataset.feature_selection['method'])
                if 'kwargs' in dataset.feature_selection:
                    f.write('---feature selection kwargs: %s---\n' %
                            str(dataset.feature_selection['kwargs']))
            if dataset.feature_transformation is not None:
                f.write('---feature transformation method: %s---\n' %
                        dataset.feature_transformation['method'])
                if 'kwargs' in dataset.feature_transformation:
                    f.write('---feature transformation kwargs: %s---\n' %
                            str(dataset.feature_transformation['kwargs']))

            for marker in dataset.markers:
                f.write('marker %s:\n' % marker)
                if dataset.fs_metric_params is not None:
                    f.write(
                        '---feature selection and transformation kwargs: %s---\n'
                        % str(dataset.fs_metric_params[marker]))
                if dataset.feature_selection is not None:
                    features = dataset.features
                    feature_index = 0
                    f.write('---selected features---\n')
                    if dataset.feature_selection['method'] == 'custom':
                        support_flags = dataset.feature_selection['selection'][
                            marker]
                    else:
                        support_flags = dataset.feature_selector[
                            marker].get_support()
                    for flag in support_flags:
                        f.write('%s:\t%s\n' % (features[feature_index], flag))
                        feature_index = (feature_index + 1) % len(features)
                if dataset.feature_transformation is not None:
                    components = dataset.feature_transformer[
                        marker].components_
                    f.write('---feature transformation components---:\n%s' %
                            components.tolist())
                    # if 'feature_mean' in config:
                    #     feature_mean = config['feature_mean']
                    #     coefficients = np.abs(feature_mean*components.sum(axis=0)).\
                    #         reshape([len(dataset.features), -1]).sum(axis=0)
                    # else:
                    #     coefficients = np.abs(components.sum(axis=0)).reshape([len(dataset.features), -1]).sum(axis=0)
                    # coefficients = coefficients / coefficients.sum()
                    #
                    # f.write('---feature transformation coefficients---:\n%s' % coefficients.tolist())

    threshold = config.get('threshold', 'roc_optimal')
    metrics_names = ['sensitivity', 'specificity', 'roc_auc_score']
    metrics_avg_names = ['roc_auc_score_avg', 'roc_auc_score_avg_std']

    fig, ax = plt.subplots(9,
                           len(dataset.markers),
                           squeeze=False,
                           figsize=(6 * len(dataset.markers), 40))
    metrics_file = open(os.path.join(exp_path, 'metrics.txt'), 'w')
    metrics_fig_filename = os.path.join(exp_path, 'conf_mat.png')
    best_params = dict()
    all_marker_train_metrics = []
    all_marker_test_metrics = []
    for i, marker in enumerate(dataset.markers):
        model = get_model(config)
        if 'model_kwargs_search' in config:
            # parameter search
            print('parameter search for marker %s...' % marker)
            all_x, all_y, cv_index = dataset.get_all_data(marker)
            best_model = GridSearchCV(model,
                                      param_grid=config['model_kwargs_search'],
                                      cv=cv_index,
                                      scoring='roc_auc_ovr')
            best_model.fit(all_x, all_y)
            best_params[marker] = best_model.best_params_
            print('search done')
        else:
            best_model = model
            best_params[marker] = config['model_kwargs']

        # run train and test
        train_xs = []
        train_ys = []
        train_ys_score = []
        test_xs = []
        test_ys = []
        test_ys_score = []
        for fold_i, (train_x, train_y, test_x,
                     test_y) in enumerate(dataset.get_split_data(marker)):
            model = base.clone(model)
            model.set_params(**best_params[marker])
            model.fit(train_x, train_y)
            # model.classes_ = dataset.classes
            train_xs += train_x
            train_ys += train_y
            test_xs += test_x
            test_ys += test_y
            train_y_score = model.predict_proba(train_x).tolist()
            train_ys_score += train_y_score
            test_y_score = model.predict_proba(test_x).tolist()
            test_ys_score += test_y_score
            # model_filename = os.path.join(exp_path, 'model', '%s_%s_fold_%d.pkl'
            #                               % (config['model'], marker, fold_i))
            # maybe_create_path(os.path.dirname(model_filename))
            # with open(model_filename, 'wb') as f:
            #     pickle.dump(model, f)

        train_metrics = eval_results(train_ys,
                                     train_ys_score,
                                     labels=dataset.classes,
                                     average='macro',
                                     threshold=threshold,
                                     num_fold=dataset.num_fold)
        test_metrics = eval_results(test_ys,
                                    test_ys_score,
                                    labels=dataset.classes,
                                    average='macro',
                                    threshold=train_metrics['used_threshold'],
                                    num_fold=dataset.num_fold)
        all_marker_train_metrics.append(train_metrics)
        all_marker_test_metrics.append(test_metrics)

        # print metrics to console and file
        double_print('marker: %s' % marker, metrics_file)
        double_print('metrics on training set:', metrics_file)
        for j, class_j in enumerate(dataset.classes):
            log_str = '[class: %s. threshold: %1.1f] ' % (
                class_j, 100 * train_metrics['used_threshold'][j])
            for metrics_name in metrics_names:
                log_str += '%s: %1.1f. ' % (metrics_name,
                                            train_metrics[metrics_name][j])
            double_print(log_str, metrics_file)
        for metrics_name in metrics_avg_names:
            double_print(
                '%s: %1.1f' % (metrics_name, train_metrics[metrics_name]),
                metrics_file)
        double_print('metrics on test set:', metrics_file)
        for j, class_j in enumerate(dataset.classes):
            log_str = '[class: %s. threshold: %1.1f] ' % (
                class_j, 100 * test_metrics['used_threshold'][j])
            for metrics_name in metrics_names:
                log_str += '%s: %1.1f. ' % (metrics_name,
                                            test_metrics[metrics_name][j])
            double_print(log_str, metrics_file)
        for metrics_name in metrics_avg_names:
            double_print(
                '%s: %1.1f' % (metrics_name, test_metrics[metrics_name]),
                metrics_file)

        # generate figure
        current_ax = ax[0, i]
        dataset.plot_data_clean_distribution(current_ax, marker)
        current_ax.set_title('data cleaning on marker %s' % marker)

        current_ax = ax[1, i]
        contour_flag = len(train_xs[0]) == 2
        # dup_reduced = list(tuple(tuple([train_xs[j] + [train_ys[j]] for j in range(len(train_xs))])))
        # dup_reduced_train_xs = [item[:-1] for item in dup_reduced]
        # dup_reduced_train_ys = [item[-1] for item in dup_reduced]
        # dup_reduced_train_ys_str = [str(item) for item in dup_reduced_train_ys]
        dup_reduced_train_xs = train_x + test_x
        dup_reduced_train_ys = train_y + test_y
        dup_reduced_train_ys_str = [str(item) for item in dup_reduced_train_ys]
        classes_str = [str(item) for item in dataset.classes]
        plot_feature_distribution(
            dup_reduced_train_xs,
            ax=current_ax,
            t_sne=True,
            hue=dup_reduced_train_ys_str,
            hue_order=classes_str,
            style=dup_reduced_train_ys_str,
            style_order=classes_str,
            # x_lim='box', y_lim='box',
            x_lim='min_max_extend',
            y_lim='min_max_extend',
            contour=contour_flag,
            z_generator=best_model.predict)
        current_ax.set_title('%s trained on whole set' % marker)

        current_ax = ax[2, i]
        metrics.ConfusionMatrixDisplay(
            train_metrics['conf_mat'],
            display_labels=dataset.classes).plot(ax=current_ax)
        current_ax.set_title('%s on train set of all folds' % marker)

        current_ax = ax[3, i]
        for j in range(len(dataset.classes)):
            roc_curve = train_metrics['roc_curve'][j]
            roc_auc_score = train_metrics['roc_auc_score'][j]
            class_id = dataset.classes[j]
            sen = train_metrics['sensitivity'][j] / 100
            spe = train_metrics['specificity'][j] / 100
            metrics.RocCurveDisplay(fpr=roc_curve[0],
                                    tpr=roc_curve[1],
                                    roc_auc=roc_auc_score,
                                    estimator_name='class %s' %
                                    class_id).plot(ax=current_ax)
            current_ax.scatter(1 - spe, sen)

        current_ax = ax[4, i]
        table_val_list = [
            dataset.classes,
            [100 * item for item in train_metrics['used_threshold']]
        ]
        row_labels = ['cls', 'thr']
        for metrics_name in metrics_names:
            table_val_list.append(train_metrics[metrics_name])
            row_labels.append(metrics_name[:min(3, len(metrics_name))])
        additional_text = []
        for metrics_name in metrics_avg_names:
            additional_text.append('%s: %1.1f' %
                                   (metrics_name, train_metrics[metrics_name]))
        additional_text.append(best_params[marker])
        plot_table(table_val_list,
                   row_labels,
                   ax=current_ax,
                   additional_text=additional_text)

        current_ax = ax[5, i]
        contour_flag = len(train_xs[0]) == 2
        test_y_str = [str(item) for item in test_y]
        classes_str = [str(item) for item in dataset.classes]
        plot_feature_distribution(
            test_x,
            ax=current_ax,
            t_sne=True,
            hue=test_y_str,
            hue_order=classes_str,
            style=test_y_str,
            style_order=classes_str,
            # x_lim='box', y_lim='box',
            x_lim='min_max_extend',
            y_lim='min_max_extend',
            contour=contour_flag,
            z_generator=model.predict)
        current_ax.set_title('%s on test set of the last fold' % marker)

        current_ax = ax[6, i]
        metrics.ConfusionMatrixDisplay(
            test_metrics['conf_mat'],
            display_labels=dataset.classes).plot(ax=current_ax)
        current_ax.set_title('%s on test set of all folds' % marker)

        current_ax = ax[7, i]
        for j in range(len(dataset.classes)):
            roc_curve = test_metrics['roc_curve'][j]
            roc_auc_score = test_metrics['roc_auc_score'][j]
            class_id = dataset.classes[j]
            sen = test_metrics['sensitivity'][j] / 100
            spe = test_metrics['specificity'][j] / 100
            metrics.RocCurveDisplay(fpr=roc_curve[0],
                                    tpr=roc_curve[1],
                                    roc_auc=roc_auc_score,
                                    estimator_name='class %s' %
                                    class_id).plot(ax=current_ax)
            current_ax.scatter(1 - spe, sen)

        current_ax = ax[8, i]
        table_val_list = [
            dataset.classes,
            [100 * item for item in test_metrics['used_threshold']]
        ]
        row_labels = ['cls', 'thr']
        for metrics_name in metrics_names:
            table_val_list.append(test_metrics[metrics_name])
            row_labels.append(metrics_name[:min(3, len(metrics_name))])
        additional_text = []
        for metrics_name in metrics_avg_names:
            additional_text.append('%s: %1.1f' %
                                   (metrics_name, test_metrics[metrics_name]))
        plot_table(table_val_list,
                   row_labels,
                   ax=current_ax,
                   additional_text=additional_text)

    for metrics_name in metrics_avg_names:
        all_marker_values = [
            item[metrics_name] for item in all_marker_train_metrics
        ]
        double_print(
            'overall train %s: %1.1f' %
            (metrics_name, sum(all_marker_values) / len(all_marker_values)),
            metrics_file)
    for metrics_name in metrics_avg_names:
        all_marker_values = [
            item[metrics_name] for item in all_marker_test_metrics
        ]
        double_print(
            'overall test %s: %1.1f' %
            (metrics_name, sum(all_marker_values) / len(all_marker_values)),
            metrics_file)
    metrics_file.close()
    save_yaml(os.path.join(exp_path, 'best_params.yaml'), best_params)
    fig.savefig(metrics_fig_filename, bbox_inches='tight', pad_inches=1)
Example #23
            roc_auc = metrics.auc(fpr, tpr)
            treeDict[str(depth) + 'Entropy'] = {
                'fpr': fpr,
                'tpr': tpr,
                'auc': roc_auc
            }
            print("")

        ax = plt.gca()
        for key in treeDict:
            info = treeDict[key]
            fpr = info['fpr']
            tpr = info['tpr']
            roc_auc = info['auc']
            display = metrics.RocCurveDisplay(fpr=fpr,
                                              tpr=tpr,
                                              roc_auc=roc_auc,
                                              estimator_name=key + fileKey)
            display.plot(ax=ax)

        plt.savefig('../6_output/Scikit/rocCurves/' + fileKey + '_SK_DTs')
        plt.close()

        # Create Neural Networks
        max_epoch = 500
        step_sizes = [0.001, 0.01, 0.1, 1]
        print("************************************")
        print("Scikit's Neural Networks:")
        NNDict = {}

        for step in step_sizes:
Example #24
conv_preds = []
lr_preds = []
targets = []
with torch.no_grad():
    for image, label in test_dataset:
        image.unsqueeze_(0)
        conv_pred = conv_net(image)
        lr_pred = lr_model(image)
        conv_pred = torch.max(torch.softmax(conv_pred, dim=1),
                              dim=1)[0].squeeze()
        lr_pred = torch.sigmoid(lr_pred).squeeze()
        conv_preds.append(conv_pred.item())
        lr_preds.append(lr_pred.item())
        targets.append(label)

fpr, tpr, thresholds = metrics.roc_curve(targets, conv_preds)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr,
                                  tpr=tpr,
                                  roc_auc=roc_auc,
                                  estimator_name='ConvNet')
display.plot()
plt.show()

fpr, tpr, thresholds = metrics.roc_curve(targets, lr_preds)
roc_auc = metrics.auc(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr,
                                  tpr=tpr,
                                  roc_auc=roc_auc,
                                  estimator_name='LR')
display.plot()
plt.show()
    print('Binary-accuracy:\n{}'.format(binary_accuracy))
    print('Balanced Binary-accuracy:\n{}'.format(binary_balanced_accuracy))
    print('Binary-confusion matrix:\n{}'.format(binary_confusion_matrix))
    print('Precision:\n{}'.format(precision))
    print('Recall:\n{}'.format(recall))
    print('F1 score:\n{}'.format(f1_score))
    # calculate the accuracy, balanced accuracy score and confusion matrix of the 5-class classification
    multi_accuracy = metrics.accuracy_score(multi_true, multi_pred)
    multi_balanced_accuracy = metrics.balanced_accuracy_score(multi_true, multi_pred)
    multi_confusion_matrix = metrics.confusion_matrix(multi_true, multi_pred)
    # output the result of the 5-class classification
    print('Multi-accuracy:\n{}'.format(multi_accuracy))
    print('Balanced Multi-accuracy:\n{}'.format(multi_balanced_accuracy))
    print('Multi-confusion matrix:\n{}'.format(multi_confusion_matrix))
    print('-------------------------------------')
    # plot the ROC and PRC of the ensemble learning model
    if regression_predictions_list != []:
        pred = np.clip((np.mean(regression_predictions_list, axis=0) + 0.5) / 4.0, a_min=0.0, a_max=1.0).astype(np.float64)
    elif multi_predictions_list != []:
        pred = np.clip(multi_pred.astype(np.float64) / 4.0, a_min=0.0, a_max=1.0).astype(np.float64)
    pred = np.squeeze(pred)
    true = binary_true

    fpr, tpr, _ = metrics.roc_curve(true, pred, pos_label=1.0)
    roc_display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

    precision_list, recall_list, _ = metrics.precision_recall_curve(true, pred, pos_label=1.0)
    pr_display = metrics.PrecisionRecallDisplay(precision=precision_list, recall=recall_list).plot()

    plt.show()