Example #1
def main():
    '''Sorted data'''
    inputsorted='german-sorted.xlsx'
    datasorted=readxlsx(inputsorted)
    score_sorted=datasorted[0,:]
    act_class_sorted=datasorted[1,:]
        
    '''calculating ROC AUC'''
    fpr_sorted,tpr_sorted,thresholds_sorted=metrics.roc_curve(act_class_sorted,score_sorted)
    aucvalue_sorted=metrics.auc(fpr_sorted,tpr_sorted)
    print 'AUC value of sorted data'
    print aucvalue_sorted
    #print 'Threshold'
    #print thresholds_sorted
    print ''
    
    '''Unsorted data'''
    inputunsorted='german-unsorted.xlsx'
    dataunsorted=readxlsx(inputunsorted)
    score_unsorted=dataunsorted[0,:]
    act_class_unsorted=dataunsorted[1,:]
        
    '''calculating ROC AUC'''
    fpr_unsorted,tpr_unsorted,thresholds_unsorted=metrics.roc_curve(act_class_unsorted,score_unsorted)
    aucvalue_unsorted=metrics.auc(fpr_unsorted,tpr_unsorted)
    print 'AUC value of unsorted data'
    print aucvalue_unsorted
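Example #1 above depends on a readxlsx helper that is not shown. A hypothetical sketch of what it might look like, assuming pandas is available and the sheet stores the prediction scores in the first column and the actual class labels in the second:

import pandas as pd

def readxlsx(path):
    # hypothetical helper: read the sheet and return a 2 x N array
    # (row 0: prediction scores, row 1: actual class labels)
    df = pd.read_excel(path, header=None)
    return df.values.T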
Example #2
def test_roc_curve_hard():
    # roc_curve for hard decisions
    y_true, pred, probas_pred = make_prediction(binary=True)

    # always predict one
    trivial_pred = np.ones(y_true.shape)
    fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.50, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)

    # always predict zero
    trivial_pred = np.zeros(y_true.shape)
    fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.50, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)

    # hard decisions
    fpr, tpr, thresholds = roc_curve(y_true, pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.78, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)
Example #3
def roc_calculation(y_pred, y_test, model, type = sys.argv[2]):
    plt.figure()
    if type == 'gender':
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=0)
        roc_auc = metrics.auc(fpr, tpr)
        plt.plot(fpr,tpr,label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(GENDER_CLASSES[0], roc_auc))
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        plt.plot(fpr,tpr,label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(GENDER_CLASSES[1], roc_auc))
    else:
        for i in [0,1,2,3,4]:
            fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=i)
            roc_auc = metrics.auc(fpr, tpr)
            plt.plot(fpr,tpr,label='ROC curve of class {0} (area = {1:0.2f})'
                                   ''.format(AGE_CLASSES[i], roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('experiments/fensemble-roc-'+model+'.png')  # save before show(), otherwise the written file may be blank
    plt.show()
Example #4
def plot_ROC(classifier, X, y):
    cv = StratifiedKFold(y, n_folds=2)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
Example #5
def classification_metrics (targets, preds, probs=None):

    if probs is not None:
        fpr, tpr, thresholds = roc_curve(targets, probs[:, 1], 1)
        roc_auc = auc(fpr, tpr)
    else:
        fpr, tpr, thresholds = roc_curve(targets, preds, 1)
        roc_auc = auc(fpr, tpr)

    cm = confusion_matrix(targets, preds)

    #accuracy
    acc = accuracy_score(targets, preds)

    # True Positive Rate / Sensitivity / Recall
    sens = recall_score(targets, preds)

    #precision
    prec = precision_score(targets, preds)

    #f1-score
    f1 = f1_score(targets, preds, np.unique(targets), 1)

    spec = 0.0  # default so the return below never hits an undefined name
    #True Negative Rate or Specificity (tn / (tn+fp))
    if len(cm) == 2:
        spec = float(cm[0,0])/(cm[0,0] + cm[0,1])

    return acc, sens, spec, prec, f1, fpr, tpr, roc_auc
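A minimal usage sketch for classification_metrics above (hypothetical data; it assumes the metric functions used inside it are imported at module level and targets the older positional scikit-learn API the snippet was written for):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

clf = LogisticRegression().fit(X_tr, y_tr)
preds = clf.predict(X_te)
probs = clf.predict_proba(X_te)  # column 1 holds the positive-class probability

acc, sens, spec, prec, f1, fpr, tpr, roc_auc = classification_metrics(y_te, preds, probs)
print(acc, roc_auc)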
Example #6
def plot_roc_cv(classifier, X, y, cv):
    '''
    cv = KFold(len(y),n_folds=5)
    '''
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
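The docstring above targets the old sklearn.cross_validation KFold; with the newer sklearn.model_selection API, a rough equivalent call might be (a sketch, assuming classifier, X and y are already defined):

from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
# materialise the splits in a list so plot_roc_cv can call len() on it and iterate it
plot_roc_cv(classifier, X, y, list(kf.split(X, y)))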
Example #7
 def evaluation(self, test_data, test_label):
     dinx = np.array(list(self.train_drugs))
     DS = self.dsMat[:, dinx]
     tinx = np.array(list(self.train_targets))
     TS = self.tsMat[:, tinx]
     scores = []
     if self.K2 > 0:
         for d, t in test_data:
             if d in self.train_drugs:
                 if t in self.train_targets:
                     val = np.sum(self.U[d, :]*self.V[t, :])
                 else:
                     jj = np.argsort(TS[t, :])[::-1][:self.K2]
                     val = np.sum(self.U[d, :]*np.dot(TS[t, jj], self.V[tinx[jj], :]))/np.sum(TS[t, jj])
             else:
                 if t in self.train_targets:
                     ii = np.argsort(DS[d, :])[::-1][:self.K2]
                     val = np.sum(np.dot(DS[d, ii], self.U[dinx[ii], :])*self.V[t, :])/np.sum(DS[d, ii])
                 else:
                     ii = np.argsort(DS[d, :])[::-1][:self.K2]
                     jj = np.argsort(TS[t, :])[::-1][:self.K2]
                     v1 = DS[d, ii].dot(self.U[dinx[ii], :])/np.sum(DS[d, ii])
                     v2 = TS[t, jj].dot(self.V[tinx[jj], :])/np.sum(TS[t, jj])
                     val = np.sum(v1*v2)
             scores.append(np.exp(val)/(1+np.exp(val)))
     elif self.K2 == 0:
         for d, t in test_data:
             val = np.sum(self.U[d, :]*self.V[t, :])
             scores.append(np.exp(val)/(1+np.exp(val)))
     prec, rec, thr = precision_recall_curve(test_label, np.array(scores))
     aupr_val = auc(rec, prec)
     fpr, tpr, thr = roc_curve(test_label, np.array(scores))
     auc_val = auc(fpr, tpr)
     return aupr_val, auc_val
Example #8
def plot_roc_class(x, y, fit_class, **kwargs):
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for i, (train_index, test_index) in enumerate(kf):
        x_train, x_test = x[train_index], x[test_index]
        y_train = y[train_index]
        clf = fit_class(**kwargs)
        clf.fit(x_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(x_test)
        fpr, tpr, thresholds = roc_curve(y[test_index], y_prob[test_index, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    mean_tpr /= len(kf)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
Example #9
def learning(X, y, depth, eta, rounds, subs=1.0):

    rng = np.random.RandomState()
    skf = cross_validation.StratifiedKFold(y, n_folds=3, shuffle=True, random_state=rng)
    trscores, cvscores = [], []

    num_round = rounds
    param = {"max_depth": depth, "eta": eta, "subsample": subs, "silent": 1, "objective": "binary:logistic"}

    for train_index, test_index in skf:
        print("TRAIN:", train_index, "TEST:", test_index)

        #### cross validations #####
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        watchlist = [(dtest, "eval"), (dtrain, "train")]

        bst = xgb.train(param, dtrain, num_round, watchlist)
        ptrain = bst.predict(dtrain)
        ptest = bst.predict(dtest)

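        # NOTE: `auc` here is assumed to be a scorer taking (y_true, y_score),
        # e.g. sklearn.metrics.roc_auc_score; sklearn.metrics.auc itself expects
        # curve coordinates such as (fpr, tpr).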
        trscore = auc(y_train, ptrain)
        cvscore = auc(y_test, ptest)
        trscores.append(trscore)
        cvscores.append(cvscore)

    return np.mean(trscores), np.mean(cvscores), bst
Example #10
def display_roc():
    thresholds = np.linspace(0, 1, 21)
    for hash_name in hash_names:
        tpr = []
        fpr = []
        with open(hash_name + ".same", 'r+b') as f:
            same_family_dm = np.array(cPickle.load(f))
        same_family_uniqw, same_family_inverse = np.unique(same_family_dm, return_inverse=True)
        same_family_dmlist = dict(zip(same_family_uniqw, np.bincount(same_family_inverse)))
        with open(hash_name + ".diff", 'r+b') as f:
            diff_family_dm = np.array(cPickle.load(f))
        diff_family_uniqw, diff_family_inverse = np.unique(diff_family_dm, return_inverse=True)
        diff_family_dmlist = dict(zip(diff_family_uniqw, np.bincount(diff_family_inverse)))
        for threshold in thresholds:
            tp = fp = 0
            for dm in same_family_dmlist:
                if dm <= threshold:
                    tp += same_family_dmlist[dm]
            for dm in diff_family_dmlist:
                if dm <= threshold:
                    fp += diff_family_dmlist[dm]
            tpr.append(tp*1.0/same_family_dm.size)
            fpr.append(fp*1.0/diff_family_dm.size)
        print sm.auc(fpr, tpr)
        print "Fuzzy hashing algorithm: %s, AUC: %f" %(hash_name, sm.auc(fpr, tpr))
        plt.figure(0)
        plt.plot(fpr, tpr, label=hash_name)
        plt.ylim(0.75, 1)
        plt.legend(loc='best')
        plt.title("ROC curve for different algorithms")
        plt.xlabel("False positive rate")
        plt.ylabel("True positive rate")
    plt.show()
Example #11
def plot_roc_estimator(estimator, x, y):
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for i, (train_index, test_index) in enumerate(kf):
        x_train, x_test = x[train_index], x[test_index]
        y_train = y[train_index]

        estimator.fit(x_train, y_train)
        y_prob[test_index] = estimator.predict_proba(x_test)
        fpr, tpr, thresholds = roc_curve(y[test_index], y_prob[test_index, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    mean_tpr /= len(kf)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
Example #12
def eva(fff1, fff2, fff3, fff4, rocfile):
	truth = open(fff1)
	pred = open(fff2)

	y = [float(line.split(' ',1)[0]) for line in truth]
	p = [float(line) for line in pred]

	fpr, tpr, thresholds = roc_curve(y, p, pos_label=1)  
	print auc(fpr, tpr)

	plt.figure(figsize=(4, 4), dpi=80)
	x = [0.0, 1.0]
	plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='random')

	plt.xlim(0.0, 1.0)
	plt.ylim(0.0, 1.0)
	plt.xlabel("FPR", fontsize=14)
	plt.ylabel("TPR", fontsize=14)
	plt.title("ROC Curve", fontsize=14)
	plt.plot(fpr, tpr, linewidth=2, label = "adaboost_fea1")

	truth = open(fff3)
	pred = open(fff4)

	y = [float(line.split(' ',1)[0]) for line in truth]
	p = [float(line) for line in pred]

	fpr, tpr, thresholds = roc_curve(y, p, pos_label=1)  
	print auc(fpr, tpr)
	plt.plot(fpr, tpr, linewidth=2, label = "adaboost_fea2")
	plt.legend(fontsize=10, loc='best')
	plt.tight_layout()

	plt.savefig(rocfile)
Example #13
def main():
    (X,y) = skd.make_classification()
    N = X.shape[0]
    X = np.append(X,np.ones((N,1)),axis=1)
    y = 2*y-1
        
    skf = StratifiedKFold(y,5)
    for train,test in skf:
        X_train = X[train,:]
        y_train = y[train]
        
        X_test = X[test,:]
        y_test = y[test]
        
        C = 0.01
        
        # dual co-ordinate descent SVM
        clf = SVMCD(C)
        clf.fit(X_train,y_train,w_prior=np.ones(21))
        pred = clf.decision_function(X_test)
        score = clf.score(X_test,y_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
        print score, metrics.auc(fpr, tpr), "//",
        w1 = clf.w
        
        # standard svm
        clf = SVC(C=C,kernel='linear')
        clf.fit(X_train, y_train) 
        pred = clf.decision_function(X_test)
        score = clf.score(X_test,y_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
        print score, metrics.auc(fpr, tpr)
        w2 = clf.coef_
        w2.shape = (21,)
Example #14
    def makeROCPlot(self, filename, title, labels, roc_data):
        y = np.array(self.create_binary_label_matrix(labels))
        n_classes = y.shape[1]
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y[:, i], roc_data[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), roc_data.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        # Plot ROC curve
        plt.figure()
        plt.plot(fpr["micro"], tpr["micro"],label='Average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]))
        for i in range(n_classes):
            plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'.format(i+1, roc_auc[i]))

        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(title)
        plt.legend(loc="lower right")
        plt.savefig("figs/"+filename+'.png',bbox_inches='tight')
        #plt.show()
        plt.clf()
        return roc_auc
Example #15
def draw(X, y, classifier):
    cv = StratifiedKFold(y, n_folds=6)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label="ROC fold %d (area = %0.2f)" % (i, roc_auc))

    plt.plot([0, 1], [0, 1], "--", color=(0.6, 0.6, 0.6), label="Luck")

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic example")
    plt.legend(loc="lower right")
    plt.show()
Example #16
def calculate_roc(truth, predictions):
    lb_truth = label_binarize(truth.iloc[:, -1].astype(int), np.arange(n_classes))
    lb_prediction = label_binarize(predictions.iloc[:, -1].astype(int), np.arange(n_classes))

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(letter_set)):
        fpr[i], tpr[i], _ = roc_curve(lb_truth[:, i], lb_prediction[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(lb_truth.ravel(), lb_prediction.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    
    return fpr, tpr, roc_auc
Example #17
def plotROC(y_score, labels, outpdf):
    n_classes = labels.shape[1]
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(labels[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(labels.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
    # Plot the ROC curve of each classifier
    plt.figure(figsize=(6, 6))

    for i in range(4):
        plt.plot(fpr[i], tpr[i],
                 label='{0} AUC={1:0.2f}'.format(classifiers[i], roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False positive rate (1 - Specificity)')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    savefig(outpdf)  
    plt.show()
Example #18
 def evaluation(self, test_data, test_label):
     scores = self.predictR[test_data[:, 0], test_data[:, 1]]
     prec, rec, thr = precision_recall_curve(test_label, scores)
     aupr_val = auc(rec, prec)
     fpr, tpr, thr = roc_curve(test_label, scores)
     auc_val = auc(fpr, tpr)
     return aupr_val, auc_val
Example #19
def roc_auc_truncated(labels, predictions, tpr_thresholds=(0.2, 0.4, 0.6, 0.8),
                      roc_weights=(4, 3, 2, 1, 0)):
    """
    Compute weighted area under ROC curve.
    :param labels: array-like, true labels
    :param predictions: array-like, predictions
    :param tpr_thresholds: array-like, true positive rate thresholds delimiting the ROC segments
    :param roc_weights: array-like, weights for true positive rate segments
    :return: weighted AUC
    """
    assert np.all(predictions >= 0.) and np.all(
        predictions <= 1.), 'Data predictions are out of range [0, 1]'
    assert len(tpr_thresholds) + \
        1 == len(roc_weights), 'Incompatible lengths of thresholds and weights'
    fpr, tpr, _ = roc_curve(labels, predictions)
    area = 0.
    tpr_thresholds = [0.] + list(tpr_thresholds) + [1.]
    for index in range(1, len(tpr_thresholds)):
        tpr_cut = np.minimum(tpr, tpr_thresholds[index])
        tpr_previous = np.minimum(tpr, tpr_thresholds[index - 1])
        area += roc_weights[index - 1] * \
            (auc(fpr, tpr_cut, reorder=True) -
             auc(fpr, tpr_previous, reorder=True))
    tpr_thresholds = np.array(tpr_thresholds)
    # roc auc normalization to be 1 for an ideal classifier
    area /= np.sum((tpr_thresholds[1:] -
                    tpr_thresholds[:-1]) * np.array(roc_weights))
    return area
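A toy usage sketch for roc_auc_truncated above (made-up values; it assumes roc_curve and auc from sklearn.metrics are imported where the function lives, and note that the reorder argument it passes to auc only exists in older scikit-learn releases):

import numpy as np

labels = np.array([0, 1, 0, 1, 1, 0, 1, 0])
predictions = np.array([0.2, 0.8, 0.1, 0.6, 0.9, 0.4, 0.7, 0.3])

# the default thresholds/weights put more weight on the low-TPR part of the curve
print(roc_auc_truncated(labels, predictions))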
Example #20
def draw_roc_curve(classifier, cv, X, y):
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)

    colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
    lw = 2

    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
             label='Luck')

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
Example #21
def prc_curve(targets_ts, scores_ts, targets_tr, scores_tr, model_no):
    plt.clf()
    colors = ['r', 'g', 'b', 'y', 'k', 'm']
    classes = ['lunge', 'wing_threat', 'charge', 'hold', 'tussle', 'other']
    for i in range(NUM_CLASSES):
        i = 5  # override: only the 'other' class (index 5) is plotted; the loop breaks after one iteration
        precision_ts, recall_ts, thresholds_ts = precision_recall_curve(targets_ts[:,i], scores_ts[:,i], pos_label=1)
        precision_tr, recall_tr, thresholds = precision_recall_curve(targets_tr[:,i], scores_tr[:,i], pos_label=1)
        area_ts = auc(recall_ts, precision_ts)
        area_tr = auc(recall_tr, precision_tr)
        test_i, f1_ts = compute_f1(precision_ts, recall_ts)
        train_i, f1_tr = compute_f1(precision_tr, recall_tr)
        print thresholds_ts[train_i]
        plt.plot(recall_ts, precision_ts, '--',label="%s test AUC: %0.3f f1: %0.3f" %(classes[i], area_ts, f1_ts), 
            color=colors[i])
        plt.plot(recall_tr, precision_tr, label="%s train AUC: %0.3f f1: %0.3f" %(classes[i],area_tr, f1_tr),
            color=colors[i])
        break
    plt.title('Precision Recall of MC Model ' + model_no)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.legend(loc="lower left", prop={'size':8})
    plt.grid(b=True, which='major')
    figure = plt.gcf()
    figure.set_size_inches(8, 6)
    plt.savefig('PRC_mc_model' + model_no +'.png')
Example #22
def linreg_ccv_plot_roc(num_folds):

    global data
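    # NOTE: `pd` here appears to be a project-local helper module providing
    # create_folds/split_into_sets, not pandas.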
    folds = pd.create_folds(data, num_folds)
    classifier = LinearRegression()
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    for i in range(num_folds):
        test_x, test_y, train_x, train_y = pd.split_into_sets(data, folds, i)
        probs = classifier.fit(train_x, train_y).predict(test_x)
        fpr, tpr, thresholds = roc_curve(test_y, probs) #takes, y_true and y_score
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(folds) 
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('%d-fold Clustered Cross-Validation' % num_folds)
    plt.legend(loc="lower right")
    plt.show()   
Example #23
def plot_roc_curves(results):
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    # Plot for each cross validation results
    for i in range(len(results)):
        fpr = results[i][0]
        tpr = results[i][1]
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(results[i][0], results[i][1])
        pl.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    # Plot default for 'luck'
    pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(results)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    # Plot the mean ROC curve
    pl.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    # Set the axis limits and labels
    pl.xlim([-0.05, 1.05])
    pl.ylim([-0.05, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.legend(loc="lower right", prop=font_prop)
Example #24
    def compute_rocauc(self):
        """

        :return:
        """
        # Binarize the output
        y_test = label_binarize(self.y_test, classes=list(range(self.n_classes)))

        # Compute ROC curve and ROC area for each class
        y_score = self.clf.predict_proba(self.X_test)
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(self.n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        self.report["roc_auc"] = dict(
            fpr={str(k): v.tolist() for k, v in fpr.items()},
            tpr={str(k): v.tolist() for k, v in tpr.items()},
            roc_auc={str(k): v.tolist() for k, v in roc_auc.items()}
        )
Example #25
def roc_plot(X,y, classifier,filename):
    from sklearn.metrics import roc_curve, auc
    from sklearn.cross_validation import StratifiedKFold
    plt.figure(figsize=(10,9))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y, n_folds=5)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += scipy.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',  label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig(filename+'.pdf')
Example #26
def CV(clf, X, y, n_folds=10):
    """
    returns gini values and classifier
    """
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.metrics import roc_curve, auc
    from sklearn.cross_validation import train_test_split
    import pandas as pd
    
    cv = StratifiedKFold(y, n_folds=n_folds)
    auccka = []
    try:
        for train_ix, test_ix in cv:
            clf.fit(X.ix[train_ix,:], y[train_ix])
            y_pred = clf.predict_proba(X.ix[test_ix,:])[:,1]
            y_true = y[test_ix]
            fpr, tpr, tresholds = roc_curve(y_true, y_pred)
            auccka.append(auc(fpr,tpr))
    except Exception:
        # needed when log(0) comes up in one of the folds
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
        # adjustment: train_test_split returns ndarrays, but DataFrames are needed here:
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        X_train.columns = X.columns
        X_test.columns = X.columns
        # end of adjustment
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:,1]
        y_true = y_test
        fpr, tpr, tresholds = roc_curve(y_true, y_pred)
        auccka.append(auc(fpr, tpr))
    gini = [2 * a - 1 for a in auccka]
    return gini, clf
Example #27
def AUC(test_labels, predicted_labels, n_classes):
    y_test = testProbVector(n_classes, test_labels)
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(0,n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:,i], predicted_labels[:,i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), predicted_labels.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Compute macro-average ROC curve and ROC area

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    return np.asarray(roc_auc)
Example #28
def bootstrap(n_percent, m_times):

    global data
    classifier = LogisticRegression()
    
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []

    for i in range(m_times):
        test_x, test_y, train_x, train_y = pd.bootstrap_sampling(data, n_percent, i) #use i as seed
        probs = classifier.fit(train_x, train_y).predict_proba(test_x)
        fpr, tpr, thresholds = roc_curve(test_y, probs[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        #plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
        plt.plot(fpr, tpr, lw=1) #lets not do labels

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= m_times
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Bootstrap %d percent of data %d times (SCOREDATA.vina.balanced)' % (n_percent, m_times))
    plt.legend(loc="lower right")
    plt.show()
Example #29
def eva_complex(fff1, y1, fff3, y2, rocfile):
	truth = open(fff1)

	y = [float(line.split(' ',1)[0]) for line in truth]
	p = y1

	fpr, tpr, thresholds = roc_curve(y, p, pos_label=1)  
	print auc(fpr, tpr)

	plt.figure(figsize=(4, 4), dpi=80)
	x = [0.0, 1.0]
	plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='random')

	plt.xlim(0.0, 1.0)
	plt.ylim(0.0, 1.0)
	plt.xlabel("FPR", fontsize=14)
	plt.ylabel("TPR", fontsize=14)
	plt.title("ROC Curve", fontsize=14)
	plt.plot(fpr, tpr, linewidth=2, label = "complex_allfea")
	'''
	truth = open(fff3)

	y = [float(line.split(' ',1)[0]) for line in truth]
	p = y2

	fpr, tpr, thresholds = roc_curve(y, p, pos_label=1)  
	print auc(fpr, tpr)
	plt.plot(fpr, tpr, linewidth=2, label = "complex_fea2")
	'''
	plt.legend(fontsize=10, loc='best')
	plt.tight_layout()

	plt.savefig(rocfile)
Example #30
def classify_only(X, Y, model):
    cv = cross_validation.StratifiedKFold(Y, n_folds=K_FOLDS)
    # print len(Y)
    mean_tpr = 0.0
    mean_fpr = numpy.linspace(0, 1, 100)
    all_tpr = []

    for i, (train, test) in enumerate(cv):
        probas_ = model.fit(X.values[train], Y.values[train]).predict_proba(X.values[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(Y.values[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example: '+model.__class__.__name__)
    plt.legend(loc="lower right")
    plt.show()
    print "Plot done"
Example #31
def plot_roc_curve(fper, tper, roc_auc, n_classes):
    '''
    This function plots the ROC (Receiver Operating Characteristic) curves and reports the area under each curve (AUC).
    Parameters:
        fper : dict mapping class index (and "micro") to arrays of false positive rates
        tper : dict mapping class index (and "micro") to arrays of true positive rates
        roc_auc : dict mapping class index (and "micro") to AUC values
        n_classes : int

    Returns:
        None
    '''

    # Aggregating all false positive rates
    all_fper = np.unique(np.concatenate([fper[i] for i in range(n_classes)]))
    lw = 2

    # Then interpolate all ROC curves at these points
    mean_tper = np.zeros_like(all_fper)
    for i in range(n_classes):
        mean_tper += np.interp(all_fper, fper[i], tper[i])

    # Average it and compute AUC
    mean_tper /= n_classes

    fper["macro"] = all_fper
    tper["macro"] = mean_tper
    roc_auc["macro"] = auc(fper["macro"], tper["macro"])

    # Plotting all ROC curves
    plt.figure()
    # micro-avg
    plt.plot(fper["micro"],
             tper["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["micro"]),
             color='deeppink',
             linestyle=':',
             linewidth=4)
    # macro-avg
    plt.plot(fper["macro"],
             tper["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'
             ''.format(roc_auc["macro"]),
             color='navy',
             linestyle=':',
             linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fper[i],
                 tper[i],
                 color=color,
                 lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
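plot_roc_curve above expects dictionaries that already contain the per-class curves and a "micro" entry; a minimal preparation sketch, assuming binarized labels y_test and a score matrix y_score, both of shape (n_samples, n_classes), plus an integer n_classes:

from sklearn.metrics import roc_curve, auc

fper, tper, roc_auc = dict(), dict(), dict()
for i in range(n_classes):
    fper[i], tper[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fper[i], tper[i])

# micro-average entry read by plot_roc_curve
fper["micro"], tper["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fper["micro"], tper["micro"])

plot_roc_curve(fper, tper, roc_auc, n_classes)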
Example #32
def evaluate(model: net, true: torch.tensor, cond: torch.tensor,
             out: torch.tensor, loss_fn: Callable[[torch.tensor, torch.tensor],
                                                  torch.tensor],
             acc: Callable[[torch.tensor, torch.tensor],
                           float], test_dir: str) -> None:
    """Use trained model to generate ROC curve and magnitude
    distribution plots.
    Args:
        model (net): the feedforward network
        true (torch.tensor): the true galaxy magnitudes used as inputs
        cond (torch.tensor): the observing conditions used as inputs
        out (torch.tensor): ground truth observed galaxy magnitudes
        loss_fn (Callable[[torch.tensor, torch.tensor], torch.tensor]):
            loss function
        acc (Callable[[torch.tensor, torch.tensor], float]): accuracy
            function
        test_dir (str): the directory to save the plots to.
    """
    model.eval()

    noise = Variable(torch.randn(cond.shape[0], 1)).cuda(non_blocking=True)

    predout = model(cond, true, noise).squeeze().data.cpu()
    loss = loss_fn(predout, out).item()

    out = out.cpu().numpy()
    true = true.cpu().numpy()

    pred = (predout >= 0.5).int().numpy()
    accuracy = acc(pred, out)

    fpr, tpr, _ = roc_curve(out, predout, pos_label=1)
    roc_auc = auc(fpr, tpr)

    r = -2.5 * np.log10(true[pred == 1][:, 1]) + 30.
    i = -2.5 * np.log10(true[pred == 1][:, 2]) + 30.
    z = -2.5 * np.log10(true[pred == 1][:, 3]) + 30.

    plt.figure()
    plt.plot(fpr, tpr, lw=2, label="AUC = {:.2f}".format(roc_auc))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Detection ROC Curve")
    plt.legend(loc="best")
    plt.grid(True)
    plt.savefig(os.path.join(test_dir, "detroc.png"))

    plt.figure()
    plt.hist2d(i, r - i, bins=100, range=[[20, 25], [-2, 2]])
    plt.xlabel(r"$i_\mathrm{true}$")
    plt.ylabel(r"$(r-i)_\mathrm{true}$")
    plt.colorbar()
    plt.savefig(os.path.join(test_dir, "ri_i_t.png"))

    plt.figure()
    plt.hist2d(i - z, r - i, bins=100, range=[[-2, 2], [-2, 2]])
    plt.xlabel(r"$(i-z)_\mathrm{true}$")
    plt.ylabel(r"$(r-i)_\mathrm{true}$")
    plt.colorbar()
    plt.savefig(os.path.join(test_dir, "ri_iz_t.png"))

    logging.info(f"- Test metrics : loss = {loss}; accuracy = {accuracy}; "
                 f"roc_auc = {roc_auc}")

    return None
Example #33
print(accuracy_score(train_labels, train_pred), file=f)

print('\n********AdaBoosting_Performance on the Test Set********', file=f)
print(confusion_matrix(test_labels, test_pred), file=f)
print(classification_report(test_labels, test_pred), file=f)
print(accuracy_score(test_labels, test_pred), file=f)

test_pred = ada.fit(train_features, train_labels).predict(test_features)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(test_labels[:], test_pred[:])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(test_labels.ravel(),
                                          test_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure()
lw = 2
plt.plot(fpr[0],
         tpr[0],
         color='darkorange',
         label='ROC curve (area = %0.2f)' % (roc_auc[0]))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
Example #34
def model(protein,i):
    if i == 0:
        with open('.\\result\\'+protein+'-CNN_test.csv','a+') as f:
            f.write('cycle--epoch'+','+'ACC'+','+'SPE'+','+'SEN'+','+'AUC'+'\n')
        with open('.\\result\\'+protein+'-CNN_val.csv','a+') as e:
            e.write('cycle--epoch'+','+'ACC'+','+'SPE'+','+'SEN'+','+'AUC'+'\n')
    
    data = pd.read_table(path+protein+'-maccs.csv',delimiter=',')   
    X = data.iloc[:,0]
    y = data.iloc[:,2]
    X_,X_test,y_,y_test = train_test_split(X,y,test_size=0.2)
    X_train,X_validation,y_train,y_validation = train_test_split(X_,y_,test_size=0.25)
    X_train,X_validation,X_test = resplit(X_train),resplit(X_validation),resplit(X_test)
    X_train,y_train = del_oversize(X_train,y_train,MAXLEN)
    X_test,y_test = del_oversize(X_test,y_test,MAXLEN)
    X_validation,y_validation = del_oversize(X_validation,y_validation,MAXLEN)
    X_train,X_test,X_validation = onehot_encode(MAXLEN,X_train),onehot_encode(MAXLEN,X_test),onehot_encode(MAXLEN,X_validation)
    X_train = X_train[:, np.newaxis, :, :]
    X_test = X_test[:, np.newaxis, :, :]    
    X_validation = X_validation[:, np.newaxis, :, :]
    y_train,y_test,y_validation = y_train.values,y_test.values,y_validation.values
    X_train,X_test,X_validation,y_train,y_test,y_validation = t.from_numpy(X_train).type(t.FloatTensor),t.from_numpy(X_test).type(t.FloatTensor),t.from_numpy(X_validation).type(t.FloatTensor),t.from_numpy(y_train),t.from_numpy(y_test),t.from_numpy(y_validation)
    net=CNN(N_HIDDEN,DROPOUT)
    if i == 0:
        print(net)
    optimizer = t.optim.Adam(net.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    loss_func = nn.CrossEntropyLoss()
    test_x,test_y,validation_x,validation_y = Variable(X_test), Variable(y_test), Variable(X_validation), Variable(y_validation)
    train_loader = create_loader(X_train,y_train)
    for epoch in range(EPOCH):
        for step, (x, y) in enumerate(train_loader):
            b_x = Variable(x)  # batch x
            b_y = Variable(y)  # batch y
            output = net(b_x)  # cnn output
            pred_t_y = t.max(F.softmax(output,dim=1), 1)[1]
            accuracy_t_y = (pred_t_y == b_y).data.numpy().sum() / b_y.size(0)
            loss = loss_func(output, b_y)  # cross entropy loss
            optimizer.zero_grad()  # clear gradients for this training step
            loss.backward()  # backpropagation, compute gradients
            optimizer.step()  # apply gradients
            
            if step == math.floor(X_train.size(0)/BATCH_SIZE):
                net.eval()  # when dropout is used, switch to eval mode for prediction so dropout is turned off
                validation_output = net(validation_x)
                score_v = F.softmax(validation_output,dim=1)[:,1].data.numpy()
                fpr_v,tpr_v,thresholds_v = metrics.roc_curve(y_validation.numpy(),score_v,pos_label=1)
                auc_v = metrics.auc(fpr_v,tpr_v)
                pred_v_y = t.max(F.softmax(validation_output,dim=1), 1)[1]
                accuracy_v = (pred_v_y == validation_y).data.numpy().sum() / validation_y.size(0)
                if (epoch+1) % 10 == 0:
                    confusion_v = metrics.confusion_matrix(y_validation.numpy(),pred_v_y.data.numpy())
                    TP_v = confusion_v[1, 1]
                    TN_v = confusion_v[0, 0]
                    FP_v = confusion_v[0, 1]
                    FN_v = confusion_v[1, 0]
                    sen_v = TP_v / (TP_v+FN_v)
                    spe_v = TN_v / (TN_v+FP_v)
                    with open('.\\result\\'+protein+'-CNN_val.csv','a+') as e:
                        e.write(str(i)+'--'+str(epoch)+','+'{:.3f}'.format(float(accuracy_v))+','+'{:.3f}'.format(float(spe_v))+','+'{:.3f}'.format(float(sen_v))+','+'{:.3f}'.format(float(auc_v))+'\n')       
                text = 'Epoch:&nbsp;' + str(epoch) + '&nbsp;&nbsp;|&nbsp;acc: %.4f' % accuracy_v + '&nbsp;&nbsp;|&nbsp;auc: %.4f' % auc_v
                x_1 = t.Tensor([epoch])
                y_3 = t.Tensor([loss.data[0]])  # cross-entropy loss
                y_1 = t.Tensor([accuracy_t_y])
                y_2 = t.Tensor([accuracy_v])
                y_4 = t.Tensor([auc_v])
                vis.line(X=x_1,Y=y_3,win='pic1',update='append' if epoch >0 else None,opts=dict(title='acc & loss'))
                vis.updateTrace(X=x_1, Y=y_1,win='pic1',name='train')
                vis.updateTrace(X=x_1, Y=y_2,win='pic1',name='validation')
                vis.updateTrace(X=x_1, Y=y_4,win='pic1',name='auc')
                vis.text(text,win='log',opts={'title':'nn accuracy'},append=True)
                net.train()
    
        if (epoch+1) % 10 == 0:
            net.eval()
            test_output = net(test_x)
            score = F.softmax(test_output,dim=1)[:,1].data.numpy()
            fpr,tpr,thresholds = metrics.roc_curve(y_test.numpy(),score,pos_label=1)
            auc_t = metrics.auc(fpr,tpr)
            pred_y = t.max(F.softmax(test_output,dim=1), 1)[1]
            acc = (pred_y == test_y).data.numpy().sum() / test_y.size(0)
            confusion = metrics.confusion_matrix(y_test.numpy(),pred_y.data.numpy())
            TP = confusion[1, 1]
            TN = confusion[0, 0]
            FP = confusion[0, 1]
            FN = confusion[1, 0]
            sen = TP / (TP+FN)
            spe = TN / (TN+FP)
            print('=========='+protein+'-CNN=========')
            print('The accuracy is: %.3f' %acc)
            print('The specificity is: %.3f' %spe)
            print('The sensitivity is: %.3f' %sen)
            print('The auc is: %.3f' %auc_t)
            print('============================')        

            with open('.\\result\\'+protein+'-CNN_test.csv','a+') as f:
                f.write(str(i)+'--'+str(epoch)+','+'{:.3f}'.format(float(acc))+','+'{:.3f}'.format(float(spe))+','+'{:.3f}'.format(float(sen))+','+'{:.3f}'.format(float(auc_t))+'\n')
            net.train()
Example #35
def plot_roc_curve(y_test, y_pred, title=None, micro=False, macro=True, per_class=False):

    if y_test.ndim == 2:
        num_instances, num_classes = y_test.shape
    else:
        num_instances = y_test.shape[0]
        num_classes = 1
    if (num_classes != 2) and (y_test.ndim == 1):
        bi_y_test = label_binarize(y_test, classes=range(num_classes))
    else:
        bi_y_test = y_test
    
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(bi_y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    fpr['micro'], tpr['micro'], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])

    # Compute macro-average ROC curve and AUC
    # Aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))
    # Interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(num_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    # Average and compute AUC
    mean_tpr /= num_classes

    fpr['macro'] = all_fpr
    tpr['macro'] = mean_tpr
    roc_auc['macro'] = auc(fpr['macro'], tpr['macro'])

    # Plot all ROC curves
    plt.figure(figsize=(10, 10))
    
    if per_class == True:
        for i in range(num_classes):
            plt.plot(fpr[i], tpr[i], alpha=0.2,
                     label='ROC curve of class {0} (area = {1:0.4f})'
                     ''.format(i+1, roc_auc[i]))
    if micro == True:
        plt.plot(fpr['micro'], tpr['micro'],
                 label='micro-average ROC curve (area = {0:0.4f})'
                       ''.format(roc_auc['micro']),
                 color='orangered', linestyle=':', linewidth=3)

    if macro == True:
        plt.plot(fpr['macro'], tpr['macro'],
                 label='macro-average ROC curve (area = {0:0.4f})'
                       ''.format(roc_auc['macro']),
                 color='navy', linestyle=':', linewidth=3)

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    if isinstance(title, str):
        plt.title(title, fontsize=16)
    elif title is not None:
        print('Title must be a string.')
        plt.title('ROC Curves', fontsize=16)
    else:
        plt.title('ROC Curves', fontsize=16)
    plt.legend(loc=4)
    plt.show()
Example #36
    def sim_same_and_diff_category_samples(self,
                                           df,
                                           cat_index=1,
                                           dist_type='cosine',
                                           equal_var=False,
                                           plot_roc=True,
                                           precalc_dist=False,
                                           calc_roc=True):
        '''
        Calculate the similarity of samples from the same and different categories. The
        cat_index gives the index of the category, where 1 is the first category.
        '''

        cols = df.columns.tolist()

        if type(precalc_dist) == bool:
            # compute distance between rows (transpose to get cols as rows)
            dist_arr = 1 - pdist(df.transpose(), metric=dist_type)
        else:
            dist_arr = precalc_dist

        # generate sample names with categories
        sample_combos = list(combinations(range(df.shape[1]), 2))

        sample_names = [
            str(ind) + '_same' if cols[x[0]][cat_index]
            == cols[x[1]][cat_index] else str(ind) + '_different'
            for ind, x in enumerate(sample_combos)
        ]

        ser_dist = pd.Series(data=dist_arr, index=sample_names)

        # find same-cat sample comparisons
        same_cat = [x for x in sample_names if x.split('_')[1] == 'same']

        # find diff-cat sample comparisons
        diff_cat = [x for x in sample_names if x.split('_')[1] == 'different']

        # make series of same and diff category sample comparisons
        ser_same = ser_dist[same_cat]
        ser_same.name = 'Same Category'
        ser_diff = ser_dist[diff_cat]
        ser_diff.name = 'Different Category'

        sim_dict = {}
        roc_data = {}
        sim_data = {}

        sim_dict['same'] = ser_same
        sim_dict['diff'] = ser_diff

        pval_dict = {}
        ttest_stat, pval_dict['ttest'] = ttest_ind(ser_diff,
                                                   ser_same,
                                                   equal_var=equal_var)

        ttest_stat, pval_dict['mannwhitney'] = mannwhitneyu(ser_diff, ser_same)

        if calc_roc:
            # calc AUC
            true_index = list(np.ones(sim_dict['same'].shape[0]))
            false_index = list(np.zeros(sim_dict['diff'].shape[0]))
            y_true = true_index + false_index

            true_val = list(sim_dict['same'].get_values())
            false_val = list(sim_dict['diff'].get_values())
            y_score = true_val + false_val

            fpr, tpr, thresholds = roc_curve(y_true, y_score)

            inst_auc = auc(fpr, tpr)

            if plot_roc:
                plt.figure()
                plt.plot(fpr, tpr)
                plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
                plt.figure(figsize=(10, 10))

                print('AUC', inst_auc)

            roc_data['true'] = y_true
            roc_data['score'] = y_score
            roc_data['fpr'] = fpr
            roc_data['tpr'] = tpr
            roc_data['thresholds'] = thresholds
            roc_data['auc'] = inst_auc

        sim_data['sim_dict'] = sim_dict
        sim_data['pval_dict'] = pval_dict
        sim_data['roc_data'] = roc_data

        return sim_data
Example #37
	use_multiprocessing = True,
	validation_data = testingGenerator,
	validation_steps = 1,
	verbose = 1,
	workers = settings['cores']
)
kerasDFFNN.save(os.path.join(settings['outputDirectory'], 'final.hdf5'))

### AREA UNDER THE PRECISION/RECALL CURVE
x = kerasDFFNN.predict_generator(
	validationGenerator,
	steps = settings['epochSteps'],
	use_multiprocessing = True,
	verbose = 1,
	workers = settings['cores']
)
y = []
for k in range(0, settings['epochSteps']): ### inefficient (reads files a second time), but works
	with open(validationFiles[k]) as reader:
		for line in reader:
			d = list(map(float, line.rstrip().split(',')))
			if len(d) == settings['width']+1:
				y.append(int(d[0]))
p, r, t = precision_recall_curve(y, x[:, 1])
a = auc(r, p)
stats = open(os.path.join(settings['outputDirectory'], 'stats.csv'), 'w')
stats.write('threshold,precision,recall,F1,AUC\n')
for k in range(0, len(t)):
	stats.write(str(t[k]) + ',' + str(p[k]) + ',' + str(r[k]) + ',' + str(2*((p[k]*r[k])/(p[k]+r[k]))) + ',' + str(a) + '\n')
stats.close()
Example #38
    ranked_frequencies = y_true[ranking]
    ranked_exposure = exposure[ranking]
    cumulated_claims = np.cumsum(ranked_frequencies * ranked_exposure)
    cumulated_claims /= cumulated_claims[-1]
    cumulated_exposure = np.cumsum(ranked_exposure)
    cumulated_exposure /= cumulated_exposure[-1]
    return cumulated_exposure, cumulated_claims


fig, ax = plt.subplots(figsize=(8, 8))

for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
    y_pred = model.predict(df_test)
    cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred,
                                            df_test["Exposure"])
    gini = 1 - 2 * auc(cum_exposure, cum_claims)
    label = "{} (Gini: {:.2f})".format(model[-1], gini)
    ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)

# Oracle model: y_pred == y_test
cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"],
                                        df_test["Frequency"],
                                        df_test["Exposure"])
gini = 1 - 2 * auc(cum_exposure, cum_claims)
label = "Oracle (Gini: {:.2f})".format(gini)
ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)

# Random Baseline
ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
ax.set(
    title="Lorenz curves by model",
Example #39
         label=u'Predicted values, $R^2$=%.3f' % lr2.score(X1_train, Y1_train))
plt.legend(loc='upper left')
plt.xlabel(u'Sample index', fontsize=18)
plt.ylabel(u'Wine quality', fontsize=18)
plt.title(u'Wine quality prediction (with dimensionality reduction)', fontsize=20)
plt.show()

### Evaluate the results from the AUC point of view ===> they look good
from sklearn.preprocessing import label_binarize
from sklearn import metrics

y_test_hot = label_binarize(Y_test, classes=(3, 4, 5, 6, 7, 8, 9)).ravel()

### Model on the original data
## get the predicted decision scores
lr_y_score = lr.decision_function(X_test).ravel()
## compute the ROC values
lr_fpr, lr_tpr, lr_threasholds = metrics.roc_curve(y_test_hot, lr_y_score)
## compute the AUC value
lr_auc = metrics.auc(lr_fpr, lr_tpr)

## Model on the dimensionality-reduced data
lr2_y_score = lr2.decision_function(X1_test).ravel()
## compute the ROC values
lr2_fpr, lr2_tpr, lr2_threasholds = metrics.roc_curve(y_test_hot, lr2_y_score)
## compute the AUC value
lr2_auc = metrics.auc(lr2_fpr, lr2_tpr)

print("AUC on the original data:", lr_auc)
print("AUC on the dimensionality-reduced data:", lr2_auc)
Example #40
def test(model, training_data, validation_data, test_data, loss_fn, device,
         opt):
    ''' Epoch operation in evaluation phase '''

    best_train_scores = eval_epoch(model, training_data, loss_fn, device,
                                   opt)[1]
    best_valid_scores = eval_epoch(model, validation_data, loss_fn, device,
                                   opt)[1]

    model.eval()
    count = 0
    total_loss = 0
    true_all = []
    pred_all = []
    with torch.no_grad():
        for batch in tqdm(test_data,
                          mininterval=2,
                          desc='  - (Validation) ',
                          leave=False):
            # prepare data
            if opt.feature:
                note, length, mortality, feature = map(lambda x: x.to(device),
                                                       batch)
                pred = model(note, length, feature)
            else:
                note, length, mortality = map(lambda x: x.to(device), batch)
                pred = model(note, length)
            # backward
            loss = loss_fn(pred, mortality.view(-1))
            # note keeping
            total_loss += loss.item()
            count += 1
            # probability
            true_all.append(mortality.view(-1))
            pred_all.append(F.softmax(pred)[:, 1].view(-1))
    true_all = torch.cat(true_all, axis=0)
    pred_all = torch.cat(pred_all, axis=0)
    roc_auc = roc_auc_score(true_all.cpu(), pred_all.cpu())
    precision, recall, thresholds = precision_recall_curve(
        true_all.cpu(), pred_all.cpu())
    pr_auc = auc(recall, precision)
    ap = average_precision_score(true_all.cpu(), pred_all.cpu())
    p_at_1 = precision_at_k(true_all.cpu(), pred_all.cpu(), 1)
    p_at_5 = precision_at_k(true_all.cpu(), pred_all.cpu(), 5)
    p_at_10 = precision_at_k(true_all.cpu(), pred_all.cpu(), 10)

    loss_per_word = total_loss / count
    print("----- Test Result -----")
    print("ROC AUC:", roc_auc)
    print("PR AUC:", pr_auc)
    print("Loss:", loss_per_word)
    if not os.path.exists("./results/"):
        os.mkdir("results")
    if not os.path.exists(f"./results/{opt.task}"):
        os.mkdir(f"./results/{opt.task}")
    if not os.path.exists(f"./results/{opt.task}/{opt.name}"):
        os.mkdir(f"./results/{opt.task}/{opt.name}")

    outname = f'{opt.period}.csv'
    if opt.text:
        outname = "text_" + outname
    if opt.feature:
        outname = "feature_" + outname

    print("Write Result to ", outname)
    with open(os.path.join('./results/', opt.task, opt.name, outname),
              'w') as f:
        f.write("TYPE,ROCAUC,PRAUC,AP,P@1,P@5,P@10\n")
        f.write(
            f"train,{best_train_scores[0]},{best_train_scores[1]},{best_train_scores[2]},{best_train_scores[3]},{best_train_scores[4]},{best_train_scores[5]}\n"
        )
        f.write(
            f"valid,{best_valid_scores[0]},{best_valid_scores[1]},{best_valid_scores[2]},{best_valid_scores[3]},{best_valid_scores[4]},{best_valid_scores[5]}\n"
        )
        f.write(f"test,{roc_auc},{pr_auc},{ap},{p_at_1},{p_at_5},{p_at_10}")
    def drawROCCurveFromClassifiers(classifilers,
                                    class_labels,
                                    X_train,
                                    y_train,
                                    X_test,
                                    y_test,
                                    positiveLabel=1):
        """
        トレーニングデータとテストデータを分割するイテレータから、ROC曲線を描写する.

        [Input]
            classifilers : 推定器クラスのオブジェクト
                fit() 関数と predict() 関数が実装されたクラスのオブジェクト
            
            class_labels : list <str>

        """
        # 分類器 classifers に対応したMAPの作成(最大5クラス対応)
        #tuple_makers = ( "s","x","+","^","v" )                          # タプル(定数リスト)
        #tuple_colors = ( "red","blue","lightgreen", "gray", "cyan" )    # 塗りつぶす色を表すタプル(定数リスト)
        #tuple_linestyle = ( 'k--', '-', '-.', '--', "---" )

        # Plot the ROC curve for each classifier clf in classifilers
        for (clf, label) in zip(classifilers, class_labels):
            # Fit the estimator on the training data with fit()
            predict = clf.fit(X_train, y_train)
            #print("predict : \n", predict )

            # Compute predicted class-membership probabilities for the test data with predict_proba()
            proba = predict.predict_proba(X_test)
            #print("predict_proba : \n", proba )

            # Compute the ROC performance values (FPR, TPR) from the true labels and predicted probabilities with roc_curve()
            fpr, tpr, thresholds = roc_curve(
                y_true=y_test,             # true binary labels in {0, 1} or {-1, 1}
                y_score=proba[:, 1],       # scores for the positive class (probability estimates, confidence values, or non-thresholded decision values)
                pos_label=positiveLabel    # label value treated as positive
            )

            # Compute the AUC value
            roc_auc = auc(fpr, tpr)
            #print("roc_auc : \n", roc_auc )

            # Plot the computed ROC curve
            plt.plot(
                fpr,
                tpr,  # false positive rate [FPR] vs. true positive rate [TPR]
                lw=2,
                label='%s (AUC = %0.2f)' % (label, roc_auc))

        # ROC curve for perfect performance
        plt.plot([0, 0, 1], [0, 1, 1],
                 lw=1,
                 linestyle=':',
                 color='black',
                 label='perfect performance (AUC = 1.00)')

        # ROC curve & AUC for random guessing
        plt.plot([0, 1], [0, 1],
                 lw=1,
                 linestyle='--',
                 color=(0.6, 0.6, 0.6),
                 label='random guessing (AUC = 0.50)')

        #
        plt.title("ROC Curve [Receiver Operator Characteristic Curve]")
        plt.xlabel("FPR : false positive rate")
        plt.ylabel("TPR : true positive rate")

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.legend(loc='best')

        return
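# A minimal usage sketch, assuming drawROCCurveFromClassifiers is reachable as a
# plain function; the dataset and classifier names below are illustrative, not
# from the original project.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

drawROCCurveFromClassifiers(
    classifilers=[LogisticRegression(max_iter=5000), RandomForestClassifier(n_estimators=100)],
    class_labels=["Logistic Regression", "Random Forest"],
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    positiveLabel=1,
)
plt.show()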
Example #42
0
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    print("k: ", bestK) 
    print("auc =", roc_auc)
       
    cat = pd.concat([dfpredicted, dfphenotypes], axis=1)
    cat.columns = ["PredictedPhenoProba", "realPheno"]
    catsort = cat.sort_values(by="PredictedPhenoProba")
    precision, recall, thresholds = metrics.precision_recall_curve(
        catsort["realPheno"], catsort["PredictedPhenoProba"], pos_label=1)

    cat = pd.concat([pd.DataFrame(precision), pd.DataFrame(recall)], axis=1)
    cat.columns = ["precision", "recall"]
    catsort = cat.sort_values(by="recall")

    plt.plot(catsort["recall"], catsort["precision"],
             label='Precision Recall curve (area = %0.3f)'
                   % metrics.auc(catsort["recall"], catsort["precision"]))
    #plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall curve')
    plt.legend(loc="upper right")
    a = metrics.auc(catsort["recall"], catsort["precision"])

    print("aupr =", a)
    plt.savefig('/Users/Iryna/Desktop/myimage.pdf', format="pdf", dpi=1200)
    out = pd.DataFrame([roc_auc, a], index=["auc", "aupr"], columns=[bestK])
    out.to_csv(dirShared + "PerfMeasures_k=" + str(bestK) + ".txt", sep="\t")
Example #43
0
def cal_auc_ks_iv(df, targets=[0, 1, 3, 7, 14, 30], text='', max_depth=2, plot=True, precision=3):
    '''
    Compute the AUC, KS and IV values for each feature column,
    and plot the corresponding ROC curves.
    '''
    ks = pd.DataFrame()
    ac = pd.DataFrame()
    iv = pd.DataFrame()

    dn = [f'{n}d' for n in targets]
    cols = set(df.columns) - set(dn)

    for n in targets:
        auc_value = []
        ks_value = []
        iv_value = []

        plt.figure(figsize=(6,4), dpi=100)
        for var in cols:
            y_true = df[df[var].notnull()][f'{n}d']
            y_pred = df[df[var].notnull()][var]

            # Compute fpr, tpr and thresholds for this feature
            fpr, tpr, thr = roc_curve(y_true, y_pred, pos_label=1)

            # Compute the AUC value
            ac_single = auc(fpr, tpr)
            if ac_single < 0.5:
                fpr, tpr, thr = roc_curve(y_true, -y_pred, pos_label=1)
                ac_single = auc(fpr, tpr)
            auc_value.append(ac_single)

            # Compute the K-S statistic
            ks_single = (tpr - fpr).max()
            ks_value.append(ks_single)

            # Compute the IV value
            iv_single = cal_woe_iv(y_pred, y_true, max_depth=max_depth)[1]
            iv_value.append(iv_single)

            if plot:
                # ROC curve
                plt.plot(fpr, tpr, lw=1, label=f'{var}(auc=' + str(round(ac_single, precision)) + ')')

                # Labels
                plt.grid()
                plt.plot([0,1], [0,1], linestyle='--', color=(0.6, 0.6, 0.6))
                plt.plot([0, 0, 1], [0, 1, 1], lw=1, linestyle=':', color='black')
                plt.xlabel('false positive rate')
                plt.ylabel('true positive rate')
                plt.title(f'{text}ROC for {n}d')
                plt.legend(loc='best')

        auc_part = pd.DataFrame(auc_value, columns=[f'{n}d'], index=cols)
        ac = pd.concat([ac, auc_part], axis=1)

        ks_part  = pd.DataFrame(ks_value, columns=[f'{n}d'], index=cols)
        ks = pd.concat([ks, ks_part], axis=1)

        iv_part  = pd.DataFrame(iv_value, columns=[f'{n}d'], index=cols)
        iv = pd.concat([iv, iv_part], axis=1)

    iv = np.round(iv, precision)
    ac = np.round(ac, precision)
    ks = np.round(ks, precision)
    return ac, ks, iv
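# cal_woe_iv is not included in this snippet. A minimal hypothetical sketch of one
# common implementation is shown below: bin the score with a depth-limited decision
# tree, compute WOE per bin, and return (woe_per_bin, iv); the original project's
# version may differ.
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

def cal_woe_iv(y_pred, y_true, max_depth=2):
    x = np.asarray(y_pred).reshape(-1, 1)
    y = np.asarray(y_true)
    tree = DecisionTreeClassifier(max_depth=max_depth).fit(x, y)
    bins = tree.apply(x)                          # leaf index serves as the bin id
    grouped = pd.DataFrame({'bin': bins, 'y': y}).groupby('bin')['y'].agg(['sum', 'count'])
    bad = grouped['sum']                          # positives (events) per bin
    good = grouped['count'] - grouped['sum']      # negatives (non-events) per bin
    bad_dist = (bad + 0.5) / (bad.sum() + 0.5)    # smoothed to avoid log(0)
    good_dist = (good + 0.5) / (good.sum() + 0.5)
    woe = np.log(good_dist / bad_dist)            # weight of evidence per bin
    iv = ((good_dist - bad_dist) * woe).sum()     # information value
    return woe, iv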
Example #44
0
def metrics(X, Y, A, B, N):
    incorrect = 0
    true_pos = 0
    false_pos = 0
    true_neg = 0
    false_neg = 0

    y_true = []
    y_pred = []

    i = 0
    for x in X:
        prediction = np.argmax(stable_softmax(x, A, B))
        true_label = np.argmax(Y[i])

        y_true.append(true_label)
        y_pred.append(prediction)

        if prediction != true_label:
            incorrect += 1

        if prediction == 1 and true_label == 1:
            true_pos += 1

        if prediction == 1 and true_label == 0:
            false_pos += 1

        if prediction == 0 and true_label == 0:
            true_neg += 1

        if prediction == 0 and true_label == 1:
            false_neg += 1

        i += 1

    print("confusion matrix: ")
    print("[ ", true_neg, false_pos, " ]")
    print("[ ", false_neg, true_pos, " ]")

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Compute fpr, tpr, thresholds and ROC AUC
    # (with hard 0/1 predictions this yields a three-point ROC curve)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)

    print("AUC score: ", roc_auc)

    if true_pos == 0 and false_pos == 0:
        print("WARNING::True pos and False pos both zero")
        precision = 0.0
        recall = 0.0
        F1 = 0.0
        classification_error = incorrect / N
    else:
        precision = true_pos / (true_pos + false_pos)  # positive predictive value
        recall = true_pos / (true_pos + false_neg)  # true positive rate (sensitivity)
        F1 = 2 * ((precision * recall) / (precision + recall)) if (precision + recall) > 0 else 0.0
        classification_error = incorrect / N

    print()

    return classification_error, precision, recall, F1, roc_auc, fpr, tpr
def showResults(pred_labels, test_labels):
    binder, nonBinder = showBinder(pred_labels)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, pred_labels)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("SVM predicted %s binder and %s nonBinder. AUC = %s" % (binder, nonBinder, roc_auc))
    def drawROCCurveFromTrainTestIterator(classifiler,
                                          iterator,
                                          X_train,
                                          y_train,
                                          X_test,
                                          y_test,
                                          positiveLabel=1):
        """
        トレーニングデータとテストデータを分割するイテレータから、ROC曲線を描写する.

        [Input]
            classifiler : 推定器クラスのオブジェクト
                fit() 関数と predict() 関数が実装されたクラスのオブジェクト
            iterator : list
                イテレータ
        [Output]
            figure : matplotlib.figure クラスのオブジェクト
                描画される部品を納めるコンテナクラス ( Artist の派生クラス )

        """
        # Figure クラスのオブジェクト作成&グラフサイズを設定
        figure = plt.figure(figsize=(7, 5))

        # ROC 曲線を構成する偽陽性率 [FPR] と真陽性率 [TPR] の初期化
        means_tpr = 0.0  #
        means_fpr = numpy.linspace(0, 1, 100)  # [0,1] の範囲(確率)を 100 個で分割
        #all_tpr   = []                          # 空のリストで初期化

        #---------------------------------------------------------------------------------------
        # iterator 内の分割された ( train, test ) のペアでループ処理 (enumerate で並列ループ)
        # イテレータ毎に ROC曲線 & AUC の描写処理
        #---------------------------------------------------------------------------------------
        for it, (train, test) in enumerate(iterator):
            #print("X_train[train] : \n", X_train[train] )
            #print("y_train[train] : \n", y_train[train] )

            # Fit the estimator classifiler on this fold's training data
            predict = classifiler.fit(X_train[train], y_train[train])
            #print("predict : \n", predict )

            # Compute predicted class-membership probabilities for this fold's test data with predict_proba()
            proba = predict.predict_proba(X_train[test])
            #print("predict_proba : \n", proba )

            # Compute the ROC performance values (FPR, TPR) from the true labels and predicted probabilities with roc_curve()
            fpr, tpr, thresholds = roc_curve(
                y_true=y_train[test],      # true binary labels in {0, 1} or {-1, 1}
                y_score=proba[:, 1],       # scores for the positive class (probability estimates, confidence values, or non-thresholded decision values)
                pos_label=positiveLabel    # label value treated as positive
            )
            #print("roc_curve() return FPR : \n", fpr )
            #print("roc_curve() return TPR : \n", tpr )
            #print("roc_curve() return thresholds : \n", thresholds )

            # Linearly interpolate this fold's tpr (y values) onto the common fpr grid (x values)
            means_tpr += interp(means_fpr, fpr, tpr)

            #print("means_tpr : \n", means_tpr )
            means_tpr[0] = 0.0  # force the mean curve to start at the origin
            #print("means_tpr : \n", means_tpr )

            # Compute the AUC value
            roc_auc = auc(fpr, tpr)
            #print("roc_auc : \n", roc_auc )

            # Plot the ROC curve for this fold
            plt.plot(
                fpr,
                tpr,  # false positive rate [FPR] vs. true positive rate [TPR]
                lw=1,
                label='ROC k=%d fold CV (AUC = %0.2f)' % (it + 1, roc_auc))

        # Plot the mean ROC curve
        means_tpr /= len(iterator)
        means_tpr[-1] = 1.0
        mean_auc = auc(means_fpr, means_tpr)
        #print("means_tpr : \n", means_tpr )

        plt.plot(means_fpr,
                 means_tpr,
                 'k--',
                 label='mean ROC (AUC = %0.2f)' % mean_auc,
                 lw=2)

        # ROC curve for perfect performance
        plt.plot([0, 0, 1], [0, 1, 1],
                 lw=2,
                 linestyle=':',
                 color='black',
                 label='perfect performance (AUC = 1.00)')

        # ROC curve & AUC for random guessing
        plt.plot([0, 1], [0, 1],
                 linestyle='--',
                 color=(0.6, 0.6, 0.6),
                 label='random guessing (AUC = 0.50)')

        #
        plt.title("ROC Curve [Receiver Operator Characteristic Curve]")
        plt.xlabel("FPR : false positive rate")
        plt.ylabel("TPR : true positive rate")

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.legend(loc='best')

        #plt.grid()
        #plt.tight_layout()

        return figure
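# A minimal usage sketch, assuming the function is reachable as a plain function
# and that the iterator is a list of (train, test) index pairs over X_train
# (for example from StratifiedKFold); all names below are illustrative.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
cv = list(StratifiedKFold(n_splits=3, shuffle=True, random_state=0).split(X_train, y_train))

fig = drawROCCurveFromTrainTestIterator(
    classifiler=LogisticRegression(max_iter=5000),
    iterator=cv,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
)
plt.show()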
Example #47
0
                corrctLTPC += 1
        elif true_label == 1:
            allTPC += 1
            if true_label == pred_label:
                corrctTPC += 1
    acc_lst.append(acc)
    class2_acc_lst.append(
        [corrctLTPC / float(allLTPC), corrctTPC / float(allTPC)])

    # auc roc
    true_class = np.array(test_label_set)  # true_class holds the ground-truth labels
    pred_scores = np.array([a[0] for a in result1])  # pred_scores holds the classifier's predicted scores
    fpr, tpr, thresholds = metrics.roc_curve(true_class,
                                             pred_scores,
                                             pos_label=0)  # bcc
    AUC = auc(fpr, tpr)
    # Find the best threshold via the Youden index (tpr - fpr)
    yuedeng = []
    for i in range(len(fpr)):
        yuedeng.append(tpr[i] - fpr[i])
    yuedeng_index = yuedeng.index(max(yuedeng))
    # print 'the best TPR FPR in subset-%d'%testIndex, tpr[yuedeng_index], fpr[yuedeng_index]

    auc_lst.append(AUC)
    trueAllLst += test_label_set
    scoreAllLst += [a[0] for a in result1]

    true_class = np.array(test_label_set)  # true_class holds the ground-truth labels
    pred_scores = np.array([a[1] for a in result1])  # pred_scores holds the classifier's predicted scores
    fpr0, tpr0, thresholds0 = metrics.roc_curve(true_class,
                                                pred_scores,
def getAUC(pred_labels, test_labels):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, pred_labels)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    return roc_auc
print(
    classification_report(y_true=y_test,
                          y_pred=y_pred_rnd,
                          target_names=['normal', 'covid']))

fig1 = plt.figure()
sns.heatmap(data=cm,
            cmap='Blues',
            annot=True,
            annot_kws={'size': 14},
            fmt='d',
            vmin=0,
            vmax=len(y_test) / 2.)
plt.title('annotated heatmap for confusion matrix')
plt.show()
# fig1.savefig('./checkpoints/densenet121/cm_heatmap.png')

fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_pred, pos_label=None)
roc_auc = auc(x=fpr, y=tpr)
fig2 = plt.figure()
plt.plot(fpr, tpr, 'b', label='AUC = %0.4f' % roc_auc)
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# fig2.savefig('./checkpoints/densenet121/roc.png')
Example #50
0
    r'D:\Users\zcguo\PycharmProjects\credit_score\data\test.csv')

test_X = test_data.iloc[:, 2:]
test_y = test_data.iloc[:, 1]

test_X = trans_woe(test_X, x1_name, x1_woe, x1_cut)
test_X = trans_woe(test_X, x2_name, x2_woe, x2_cut)
test_X = trans_woe(test_X, x3_name, x3_woe, x3_cut)
test_X = trans_woe(test_X, x7_name, x7_woe, x7_cut)
test_X = trans_woe(test_X, x9_name, x9_woe, x9_cut)

test_X = test_X.iloc[:, -5:]

# gbdt model roc
X3 = sm.add_constant(test_X)
resuG = gbm.predict(X3)
recall1 = metrics.recall_score(test_y, resuG.round())
acc1 = metrics.accuracy_score(test_y, resuG.round())
print(recall1)
print(acc1)
fpr1, tpr1, threshold1 = metrics.roc_curve(test_y, resuG)
rocauc1 = metrics.auc(fpr1, tpr1)
plt.plot(fpr1, tpr1, 'b', label='AUC = %0.2f' % rocauc1)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.show()
#ml_algorithms(data,y_train,data_test,y_test)
#X_train2: concatenate = original + embeddings
prediction_tp, prediction_tpol_prob = ml_algorithms(X_train2, y_train, X_test2,
                                                    y_test)
#%%
from sklearn.metrics import roc_curve, auc

fpr_pol, tpr_pol, _ = roc_curve((y_test == True).apply(int),
                                prediction_tpol_prob[:, 1])

fpr, tpr, _ = roc_curve((y_test == True).apply(int), prediction_tp[:, 1])
fprn, tprn, _ = roc_curve((y_test == True).apply(int), prediction_tn[:, 1])

print('AUC for Node2Vec Logistic + Poly features + Normal Features : ',
      auc(fpr_pol, tpr_pol))
print('AUC for Node2Vec Logistic + Linear Features + Normal Features : ',
      auc(fpr, tpr))
print('AUC for Normal Features LogisticNormal Features Logistic : ',
      auc(fprn, tprn))

plt.plot(fpr_pol,
         tpr_pol,
         'g',
         label='Node2Vec Logistic + Poly features + Normal Features')
plt.plot(fpr,
         tpr,
         'r',
         label='Node2Vec Logistic + Linear Features + Normal Features')
plt.plot(fprn, tprn, 'b', label='Normal Features Logistic')
plt.legend()
Example #52
0
graphviz.Source(dot_graph).view()
##########################################################################
##########################################################################

# Finally, let’s evaluate the tree’s performance on the test data. The predict() function can be used for
# this purpose. We can then build a confusion matrix

# (86 + 59) / 200 = 0.725
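# A brief sketch of the confusion-matrix step described above, assuming clf,
# X_test and y_test as used later in this example; the quoted accuracy of
# (86 + 59) / 200 = 0.725 is the diagonal sum divided by the test-set size.
from sklearn.metrics import confusion_matrix, accuracy_score

y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("test accuracy:", accuracy_score(y_test, y_pred))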

##########################################################################
############ Here we construct the ROC curve for the tree ################
##########################################################################
y_score = clf.predict_proba(X_test)

fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr,
         tpr,
         color='orange',
         label='ROC curve (area = {:0.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for our Decision Tree')
plt.legend(loc="lower right")
##########################################################################
##########################################################################
Example #53
0
def kfold_cv(model, xFeat, y, k):
    """
    Split xFeat into k different groups, and then use each of the
    k-folds as a validation set, with the model fitting on the remaining
    k-1 folds. Return the model performance on the training and
    validation (test) set. 


    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model
    xFeat : nd-array with shape n x d
        Features of the dataset 
    y : 1-array with shape n x 1
        Labels of the dataset
    k : int
        Number of folds or groups (approximately equal size)

    Returns
    -------
    trainAuc : float
        Average AUC of the model on the training dataset
    testAuc : float
        Average AUC of the model on the validation dataset
    timeElapsed: float
        Time it took to run this function
    """
    trainAuc = 0
    testAuc = 0
    timeElapsed = 0
    # TODO FILL IN
    timeElapsed = time.time()
    xFeat = np.asarray(xFeat)
    y = np.asarray(y)

    kf = KFold(n_splits=k)
    kf.get_n_splits(xFeat)

    # Loop through all splits, repeating the train/evaluate process for each fold
    for train_index, test_index in kf.split(xFeat):
        xTrain, xTest = xFeat[train_index], xFeat[test_index]
        yTrain, yTest = y[train_index], y[test_index]

        trainModel = model.fit(xTrain, yTrain)

        predictTrain = trainModel.predict_proba(xTrain)
        predictTrain = predictTrain[:, 1]

        fpr1, tpr1, thresholds = metrics.roc_curve(yTrain, predictTrain)
        trainAuc += metrics.auc(fpr1, tpr1)

        predictTest = trainModel.predict_proba(xTest)
        predictTest = predictTest[:, 1]

        fpr1, tpr1, thresholds = metrics.roc_curve(yTest, predictTest)
        testAuc += metrics.auc(fpr1, tpr1)

    trainAuc /= kf.get_n_splits(xFeat)
    testAuc /= kf.get_n_splits(xFeat)

    timeElapsed = time.time() - timeElapsed

    return trainAuc, testAuc, timeElapsed
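# A minimal usage sketch for kfold_cv; the dataset below is illustrative, not
# from the original assignment.
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

xFeat, y = load_breast_cancer(return_X_y=True)
dt = DecisionTreeClassifier(max_depth=5)
trainAuc, testAuc, elapsed = kfold_cv(dt, xFeat, y, k=5)
print("train AUC:", trainAuc, "test AUC:", testAuc, "time (s):", elapsed)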
Example #54
0
    def explain(self, param, label='', auc_plot=False):
        print('------------ Explanation -------------')
        self._file.write('------------ Explanation -------------\n')
        phi = param[0]
        theta = param[1]
        psi = param[2]
        k = param[3]

        start1 = time()
        ex = Extractor(self._clf, phi, theta, psi)
        ex.extract_forest_paths()

        ex.rule_filter()

        print('max_rule', ex.max_rule, 'max_node', ex.max_node)
        print('min_rule', ex.min_rule, 'min_node', ex.min_node)
        end1 = time()
        print("EX Running time: %s seconds" % (end1 - start1))

        print("original path number: ", ex.n_original_leaves_num)
        print("original scale: ", ex.scale)
        print("path number after rule filter: ", len(ex._forest_values))
        self._file.write('original path number: {}\n'.format(
            ex.n_original_leaves_num))
        self._file.write('original scale: {}\n'.format(ex.scale))
        self._file.write('path number after rule filter: {}\n'.format(
            len(ex._forest_values)))

        start2 = time()
        sat = Z3Process(ex, k)
        sat.leaves_partition()
        if self._maxsat_on is True:
            sat.maxsat()
            print("path number after maxsat: ", sat.n_rules_after_max,
                  " after filter: ", sat.n_rules_after_filter, '\n')
            self._file.write(
                'path number after maxsat: {}\tafter filter: {}\n\nclasses:\t{}\n\n'
                .format(sat.n_rules_after_max, sat.n_rules_after_filter,
                        self._clf.classes_))
        else:
            print('no maxsat')
            self._file.write('/no MAX-SAT\n')
        sat.run_filter()
        end2 = time()

        print("SAT Running time: %s seconds" % (end2 - start2))

        print('classes:', self._clf.classes_)

        start3 = time()
        f = FormulaeEstimator(sat,
                              conjunction=self._conjunction,
                              classes=self._clf.classes_)
        f.get_formulae_text(self._file)
        print('\n------------ Performance -------------')
        self._file.write('\n------------ Performance -------------\n')
        c_ans = self._clf.predict(self._X_test)
        ans = f.classify_samples(self._X_test)
        end3 = time()
        print("ET Running time: %s seconds" % (end3 - start3))

        RF_accuracy = accuracy_score(self._y_test, c_ans)
        EX_accuracy = accuracy_score(self._y_test, ans)
        performance = accuracy_score(c_ans, ans)

        no_ans = 0
        overlap = 0
        for each in f.sat_group:
            if len(each) > 1:
                overlap += 1
            elif len(each) == 0:
                no_ans += 1

        if label == '':  # default positive label for the AUC computation
            label = self._clf.classes_[0]

        fpr, tpr, thresholds = roc_curve(self._y_test,
                                         self._clf.predict_proba(
                                             self._X_test)[:, 1],
                                         pos_label=label)
        ori_auc = auc(fpr, tpr)

        ex_test = f.classify_samples_values(self._X_test)
        efpr, etpr, ethresholds = roc_curve(self._y_test,
                                            ex_test[:, 1],
                                            pos_label=label)
        ex_auc = auc(efpr, etpr)

        print('sample size:\t', len(self._y_test))
        self._file.write('sample size:\t{}\n'.format(len(self._y_test)))

        print('RF accuracy:\t', RF_accuracy)
        self._file.write('RF accuracy:\t{}\n'.format(RF_accuracy))

        print('RF AUC:\t\t\t', ori_auc)
        self._file.write('RF AUC:\t\t\t{:.2f}\n'.format(ori_auc))

        # print('coverage of wrong predictions:', f_count)
        print('EX accuracy:\t', EX_accuracy)
        self._file.write('EX accuracy:\t{}\n'.format(EX_accuracy))

        print('EX AUC:\t\t\t', ex_auc)
        self._file.write('EX AUC:\t\t\t{:.2f}\n'.format(ex_auc))

        print('Coverage:\t\t',
              (len(self._y_test) - no_ans) / len(self._y_test))
        self._file.write('Coverage:\t\t{}\n'.format(
            (len(self._y_test) - no_ans) / len(self._y_test)))

        print('Overlap:\t\t', overlap / len(self._y_test))
        self._file.write('Overlap:\t\t{}\n'.format(overlap /
                                                   len(self._y_test)))

        print('*Performance:\t', performance)
        self._file.write('*Performance:\t{}\n'.format(performance))

        if auc_plot is True:
            plt.plot(fpr,
                     tpr,
                     linewidth=2,
                     label="RF ROC curve (area = {:.2f})".format(ori_auc))

            plt.plot(efpr,
                     etpr,
                     linewidth=2,
                     label="Explain ROC curve (area = {:.2f})".format(ex_auc))

            plt.xlabel("false positive rate")

            plt.ylabel("true positive rate")

            plt.ylim(0, 1.05)

            plt.xlim(0, 1.05)

            plt.legend(loc=4)  # legend position (lower right)

            plt.show()
def avaliacao_PerformanceC(df_train_class, predicted_train,
                           predicted_prob_train, df_test_class, predicted_test,
                           predicted_prob_test, roc_y_n):
    ### Confusion Matrix
    confusion_matrix_train = confusion_matrix(df_train_class, predicted_train)
    confusion_matrix_test = confusion_matrix(df_test_class, predicted_test)
    print("\nTraining Confusion Matrix:\n ", confusion_matrix_train)
    print("\nTesting Confusion Matrix:\n ", confusion_matrix_test)

    ### Accuracy score
    score_train = accuracy_score(df_train_class, predicted_train)
    score_test = accuracy_score(df_test_class, predicted_test)
    print("\nTraining Accuracy Score: ", score_train)
    print("\nTesting Accuracy Score: ", score_test)

    ### Precision, Recall
    precision_train = precision_score(df_train_class, predicted_train)
    precision_test = precision_score(df_test_class, predicted_test)
    print("\nTraining Precision: ", precision_train)
    print("\nTesting Precision: ", precision_test)

    recall_train = recall_score(df_train_class, predicted_train)
    recall_test = recall_score(df_test_class, predicted_test)
    print("\nTraining Recall: ", recall_train)
    print("\nTesting Recall: ", recall_test)

    ### Classification Report
    print("\nTrain Classification Report: \n",
          classification_report(df_train_class, predicted_train))
    print("\nTest Classification Report: \n",
          classification_report(df_test_class, predicted_test))

    ### F1 Score
    f1score_train = f1_score(df_train_class,
                             predicted_train)  #, average='weighted')
    f1score_test = f1_score(df_test_class,
                            predicted_test)  #, average='weighted')
    print("\nTraining F1score: ", f1score_train)
    print("\nTesting F1score: ", f1score_test)

    f1score_train = f1_score(df_train_class,
                             predicted_train,
                             average='weighted')
    f1score_test = f1_score(df_test_class, predicted_test, average='weighted')
    print("\nTraining Weigted F1score: ", f1score_train)
    print("\nTesting Weighted F1score: ", f1score_test)

    ### ROC-AUC
    if roc_y_n == 'y':
        fpr, tpr, threshold = roc_curve(df_train_class,
                                        predicted_prob_train[:, 1])
        roc_auc_train = auc(fpr, tpr)
        print("\nTraining AUC for ROC: ", roc_auc_train)
        plt.figure()
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc_train)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc='lower right')
        plt.title('Training - Receiver Operating Characteristic')

        fpr, tpr, threshold = roc_curve(df_test_class, predicted_prob_test[:,
                                                                           1])
        roc_auc_test = auc(fpr, tpr)
        print("\nTesting AUC for ROC: ", roc_auc_test)
        plt.figure()
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc_test)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc='lower right')
        plt.title('Testing - Receiver Operating Characteristic')
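# A minimal usage sketch with an illustrative logistic regression; the original
# pipeline producing these inputs is not shown here.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
clf = LogisticRegression(max_iter=5000).fit(X_tr, y_tr)

avaliacao_PerformanceC(y_tr, clf.predict(X_tr), clf.predict_proba(X_tr),
                       y_te, clf.predict(X_te), clf.predict_proba(X_te),
                       roc_y_n='y')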
Example #56
0
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(x_train,
                    y_train,
                    batch_size=40,
                    epochs=epochs,
                    validation_split=0.25,
                    verbose=1,
                    callbacks=[tensorboard])

# Prediction and ROC/ AUC curve plotting
y_pred = model.predict(x_test)
fpr_keras, tpr_keras, thresholds_keras = roc_curve(np.ravel(y_test),
                                                   np.ravel(y_pred))
auc_keras = auc(fpr_keras, tpr_keras)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=batch_size)

model.save("CNN.h5")

print('Test accuracy :', test_acc, 'Test Loss :', test_loss)
Example #57
0
File: SVM.py Project: saizhou1/ML
print("测试集:", accuracy_score(test_label, tes_label))

matrix = confusion_matrix(train_label, tra_label, labels=[0, 1])
TP = matrix[1, 1]
TN = matrix[0, 0]
FP = matrix[0, 1]
FN = matrix[1, 0]
sn = TP / (TP + FN)
sp = TN / (TN + FP)

decision_score = classifier.predict_proba(test_data)
fprs, tprs, thresholds = roc_curve(test_label, decision_score[:, 1])

# plt.plot(fprs, tprs)
# plt.show()
roc_auc = auc(fprs, tprs)
plt.figure()
lw = 2
plt.plot(fprs,
         tprs,
         color='darkorange',
         lw=lw,
         label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
Example #58
0
def mc_cv(model, xFeat, y, testSize, s):
    """
    Evaluate the model using s samples from the
    Monte Carlo cross validation approach where
    for each sample you split xFeat into
    random train and test based on the testSize.
    Returns the model performance on the training and
    test datasets.

    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model
    xFeat : nd-array with shape n x d
        Features of the dataset 
    y : 1-array with shape n x 1
        Labels of the dataset
    testSize : float
        Portion of the dataset to serve as a holdout.
    s : int
        Number of random train/test splits (Monte Carlo samples).

    Returns
    -------
    trainAuc : float
        Average AUC of the model on the training dataset
    testAuc : float
        Average AUC of the model on the validation dataset
    timeElapsed: float
        Time it took to run this function
    """
    trainAuc = 0
    testAuc = 0
    timeElapsed = 0
    # TODO FILL IN

    timeElapsed = time.time()
    xFeat = np.asarray(xFeat)
    y = np.asarray(y)

    # Repeats the same process but uses the random shuffle
    ss = ShuffleSplit(n_splits=s, test_size=testSize, random_state=0)

    for train_index, test_index in ss.split(xFeat):
        xTrain, xTest = xFeat[train_index], xFeat[test_index]
        yTrain, yTest = y[train_index], y[test_index]

        trainModel = model.fit(xTrain, yTrain)

        predictTrain = trainModel.predict_proba(xTrain)
        predictTrain = predictTrain[:, 1]

        fpr1, tpr1, thresholds = metrics.roc_curve(yTrain, predictTrain)
        trainAuc += metrics.auc(fpr1, tpr1)

        predictTest = trainModel.predict_proba(xTest)

        fpr1, tpr1, thresholds = metrics.roc_curve(yTest, predictTest[:, 1])
        testAuc += metrics.auc(fpr1, tpr1)

    trainAuc /= ss.get_n_splits(xFeat)
    testAuc /= ss.get_n_splits(xFeat)

    timeElapsed = time.time() - timeElapsed

    return trainAuc, testAuc, timeElapsed
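# A minimal usage sketch for mc_cv, mirroring the kfold_cv example above; the
# dataset is again illustrative.
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier

xFeat, y = load_breast_cancer(return_X_y=True)
dt = DecisionTreeClassifier(max_depth=5)
trainAuc, testAuc, elapsed = mc_cv(dt, xFeat, y, testSize=0.3, s=5)
print("train AUC:", trainAuc, "test AUC:", testAuc, "time (s):", elapsed)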
def train_5_cross(df_pre, X,y, X_test_v1,y_test_v1, thresholds=0.45, id_1='id', csv_name=0):
    """
    功能: 五折训练并输出名单
    why: 5折一般是效果比较稳定的,用于线下做的。
    X: 训练数据X(无标签/df型)
    y: 训练数据y(标签/df型)
    X_test_v1: 预测数据X(无标签/df型)
    y_test_v1: 预测数据y(无标签/df型)
    thresholds: 阈值选择,默认0.45高精确率
    csv_name: 保存csv的名称,默认不保存
    returen:
        客户名单及情况
    """
    vali_auc_num=0  # 验证集AUC
    vali_recall_num=0  # 验证集召回率
    vali_precision_num=0  # 验证集精确率
    test_auc_num=0  # 预测集AUC
    test_recall_num=0  # 预测集召回率
    test_precision_num=0  # 预测集精确率
    y_pred_input = np.zeros(len(X_test_v1))  # 相应大小的零矩阵
    print("=============开始训练================")
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)  # 分层采样, n_splits为几折
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print("第 {} 次训练...".format(fold_+1))
        train_x, trai_y = X.loc[trn_idx], y.loc[trn_idx]
        vali_x, vali_y = X.loc[val_idx], y.loc[val_idx]
        
        # LightGBM model with pre-tuned hyperparameters
        clf = lgb.LGBMClassifier(max_depth=20, min_data_in_bin=5, max_bin=200,
                                min_child_samples=90, num_leaves=20, n_estimators=20000,
                                objective='binary', boosting_type='gbdt', learning_rate=0.02,
                                lambda_l2=5)
        clf.fit(train_x, trai_y, eval_set=[(train_x, trai_y), (vali_x, vali_y)], verbose=0,
               early_stopping_rounds=100, eval_metric='f1')
        
        # see the LightGBM documentation for explanations of these parameters
        
        # =============== validation-set AUC ===================
        y_prb = clf.predict_proba(vali_x)[:,1]  # predicted probabilities
        # tpr: fraction of actual positives correctly classified as positive; fpr: fraction of actual negatives wrongly classified as positive; thres: thresholds
        fpr, tpr, thres = roc_curve(vali_y, y_prb)
        vali_roc_auc = auc(fpr, tpr)  # validation AUC for this fold
        vali_auc_num += vali_roc_auc  # add this fold's AUC to the running total
        print("vali auc = {0:.4}".format(vali_roc_auc))  # AUC for this fold
        # =============== prediction-set AUC ===================
        y_prb_test = clf.predict_proba(X_test_v1)[:,1]  # predicted probabilities
        fpr, tpr, thres = roc_curve(y_test_v1, y_prb_test)
        test_roc_auc = auc(fpr, tpr)
        test_auc_num += test_roc_auc
        print("test auc = {0:.4}".format(test_roc_auc))
        
        # =============== validation metrics ===================
        y_pre_proba = clf.predict_proba(vali_x.values)
        y_predictions = y_pre_proba[:, 1]>thresholds  # mark predictions above the threshold as True
        cnf_matrix = confusion_matrix(vali_y, y_predictions)  # build the confusion matrix
        np.set_printoptions(precision=2)  # print with two decimal places
        vali_recall = '{0:.3f}'.format(cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))  # recall
        vali_precision = '{0:.3f}'.format(cnf_matrix[1,1]/(cnf_matrix[0,1]+cnf_matrix[1,1]))  # precision
        print("vali_metric: ", vali_recall, vali_precision)
        vali_recall_num += float(vali_recall)  # add this fold's recall to the running total
        vali_precision_num += float(vali_precision)  # add this fold's precision to the running total
        # =============== prediction-set metrics ===================
        y_pre_proba_test = clf.predict_proba(X_test_v1.values)
        y_predictions_test = y_pre_proba_test[:, 1]>thresholds  # mark predictions above the threshold as True
        cnf_matrix_test = confusion_matrix(y_test_v1, y_predictions_test)  # build the confusion matrix
        np.set_printoptions(precision=2)  # print with two decimal places
        test_recall = '{0:.3f}'.format(cnf_matrix_test[1,1]/(cnf_matrix_test[1,0]+cnf_matrix_test[1,1]))  # recall
        test_precision = '{0:.3f}'.format(cnf_matrix_test[1,1]/(cnf_matrix_test[0,1]+cnf_matrix_test[1,1]))  # precision
        print("test_metric: ", test_recall, test_precision)
        test_recall_num += float(test_recall)  # add this fold's recall to the running total
        test_precision_num += float(test_precision)  # add this fold's precision to the running total
        y_pred_input += y_pre_proba_test[:, 1]  # accumulate this fold's test predictions
        
    print("5折泛化,验证集AUC:{0:.3f}".format(vali_auc_num/5))  # 前面是做了5次相加,所以这次要除以5
    print("5折泛化,预测集AUC:{0:.3f}".format(test_auc_num/5))
    
    print("5折泛化,验证集recall:{0:.3f}".format(vali_recall_num/5))
    print("5折泛化,验证集precision:{0:.3f}".format(vali_recall_num/5))
    
    print("5折泛化,预测集recall:{0:.3f}".format(test_recall_num/5))
    print("5折泛化,预测集precision:{0:.3f}".format(test_recall_num/5))
    
    print("================开始输出名单==================")
    y_pred_input_end = y_pred_input / 5  # 前面是做了5次相加,所以这次要除以5
    y_pred_input_precision = y_pred_input_end > thresholds  # 获取高精确率的标签
    submission = pd.DataFrame({"id": df_pre[id_1],
                              "概率": y_pred_input_end,
                              "高精确": y_pred_input_precision})
    if csv_name != 0:
        submission.to_csv("%s预测名单.csv" % csv_name, index=False)  # 保存
    print("================输出名单名单==================")
    print(submission.head(5))
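# A minimal usage sketch for train_5_cross; the frames below are illustrative.
# The function indexes X and y with .loc on the fold indices, so the training
# frames are assumed to carry a plain 0..n-1 index.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer(as_frame=True)
df = data.frame.reset_index().rename(columns={'index': 'id'})
train_df, test_df = train_test_split(df, test_size=0.3, random_state=0)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

feature_cols = list(data.feature_names)
train_5_cross(df_pre=test_df,
              X=train_df[feature_cols], y=train_df['target'],
              X_test_v1=test_df[feature_cols], y_test_v1=test_df['target'],
              thresholds=0.45, id_1='id', csv_name=0)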
dt_t = dt.iloc[:, [0]]
dt_p = dt.iloc[:, [1]]

knn_t = knn.iloc[:, [0]]
knn_p = knn.iloc[:, [1]]

lr_t = lr.iloc[:, [0]]
lr_p = lr.iloc[:, [1]]

rf_t = rf.iloc[:, [0]]
rf_p = rf.iloc[:, [1]]

import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
fpr1, tpr1, threshold1 = metrics.roc_curve(ann_t, ann_p)
roc_auc1 = metrics.auc(fpr1, tpr1)
fpr2, tpr2, threshold2 = metrics.roc_curve(dt_t, dt_p)
roc_auc2 = metrics.auc(fpr2, tpr2)
fpr3, tpr3, threshold3 = metrics.roc_curve(knn_t, knn_p)
roc_auc3 = metrics.auc(fpr3, tpr3)
fpr4, tpr4, threshold4 = metrics.roc_curve(lr_t, lr_p)
roc_auc4 = metrics.auc(fpr4, tpr4)
fpr5, tpr5, threshold5 = metrics.roc_curve(rf_t, rf_p)
roc_auc5 = metrics.auc(fpr5, tpr5)

# method I: plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, 'r', label = 'ANN(AUC = %0.2f)' % roc_auc1)
plt.plot(fpr2, tpr2, 'g', label = 'DT(AUC = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, 'y', label = 'KNN(AUC = %0.2f)' % roc_auc3)
plt.plot(fpr4, tpr4, 'b', label = 'LR(AUC = %0.2f)' % roc_auc4)