def main():
    # Sorted data
    inputsorted = 'german-sorted.xlsx'
    datasorted = readxlsx(inputsorted)
    score_sorted = datasorted[0, :]
    act_class_sorted = datasorted[1, :]
    # Calculate ROC AUC
    fpr_sorted, tpr_sorted, thresholds_sorted = metrics.roc_curve(act_class_sorted, score_sorted)
    aucvalue_sorted = metrics.auc(fpr_sorted, tpr_sorted)
    print('AUC value of sorted data')
    print(aucvalue_sorted)
    #print('Threshold')
    #print(thresholds_sorted)
    print('')

    # Unsorted data
    inputunsorted = 'german-unsorted.xlsx'
    dataunsorted = readxlsx(inputunsorted)
    score_unsorted = dataunsorted[0, :]
    act_class_unsorted = dataunsorted[1, :]
    # Calculate ROC AUC
    fpr_unsorted, tpr_unsorted, thresholds_unsorted = metrics.roc_curve(act_class_unsorted, score_unsorted)
    aucvalue_unsorted = metrics.auc(fpr_unsorted, tpr_unsorted)
    print('AUC value of unsorted data')  # fixed: previously said 'sorted' here too
    print(aucvalue_unsorted)
def test_roc_curve_hard():
    # roc_curve for hard decisions
    y_true, pred, probas_pred = make_prediction(binary=True)

    # always predict one
    trivial_pred = np.ones(y_true.shape)
    fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.50, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)

    # always predict zero
    trivial_pred = np.zeros(y_true.shape)
    fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.50, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)

    # hard decisions
    fpr, tpr, thresholds = roc_curve(y_true, pred)
    roc_auc = auc(fpr, tpr)
    assert_array_almost_equal(roc_auc, 0.78, decimal=2)
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)
def roc_calculation(y_pred, y_test, model, label_type=sys.argv[2]):
    # 'type' renamed to 'label_type' so the builtin is not shadowed
    plt.figure()
    if label_type == 'gender':
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=0)
        roc_auc = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(GENDER_CLASSES[0], roc_auc))
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)
        roc_auc = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, label='ROC curve of class {0} (area = {1:0.2f})'
                 ''.format(GENDER_CLASSES[1], roc_auc))
    else:
        for i in [0, 1, 2, 3, 4]:
            fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=i)
            roc_auc = metrics.auc(fpr, tpr)
            plt.plot(fpr, tpr, label='ROC curve of class {0} (area = {1:0.2f})'
                     ''.format(AGE_CLASSES[i], roc_auc))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('experiments/fensemble-roc-' + model + '.png')  # fixed: save before show(), which clears the figure
    plt.show()
def plot_ROC(classifier, X, y):
    cv = StratifiedKFold(y, n_folds=2)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()
def classification_metrics(targets, preds, probs=None):
    if probs is not None:  # fixed: compare to None with 'is not', not '!='
        fpr, tpr, thresholds = roc_curve(targets, probs[:, 1], 1)
    else:
        fpr, tpr, thresholds = roc_curve(targets, preds, 1)
    roc_auc = auc(fpr, tpr)

    cm = confusion_matrix(targets, preds)
    # accuracy
    acc = accuracy_score(targets, preds)
    # True Positive Rate / Sensitivity / Recall
    sens = recall_score(targets, preds)
    # precision
    prec = precision_score(targets, preds)
    # f1-score
    f1 = f1_score(targets, preds, np.unique(targets), 1)
    # True Negative Rate / Specificity (tn / (tn + fp))
    spec = 0.0  # fixed: default so spec is always defined for degenerate confusion matrices
    if len(cm) == 2:
        spec = float(cm[0, 0]) / (cm[0, 0] + cm[0, 1])
    return acc, sens, spec, prec, f1, fpr, tpr, roc_auc
def plot_roc_cv(classifier, X, y, cv):
    '''
    cv = KFold(len(y), n_folds=5)
    '''
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
def evaluation(self, test_data, test_label):
    dinx = np.array(list(self.train_drugs))
    DS = self.dsMat[:, dinx]
    tinx = np.array(list(self.train_targets))
    TS = self.tsMat[:, tinx]
    scores = []
    if self.K2 > 0:
        for d, t in test_data:
            if d in self.train_drugs:
                if t in self.train_targets:
                    val = np.sum(self.U[d, :] * self.V[t, :])
                else:
                    jj = np.argsort(TS[t, :])[::-1][:self.K2]
                    val = np.sum(self.U[d, :] * np.dot(TS[t, jj], self.V[tinx[jj], :])) / np.sum(TS[t, jj])
            else:
                if t in self.train_targets:
                    ii = np.argsort(DS[d, :])[::-1][:self.K2]
                    val = np.sum(np.dot(DS[d, ii], self.U[dinx[ii], :]) * self.V[t, :]) / np.sum(DS[d, ii])
                else:
                    ii = np.argsort(DS[d, :])[::-1][:self.K2]
                    jj = np.argsort(TS[t, :])[::-1][:self.K2]
                    v1 = DS[d, ii].dot(self.U[dinx[ii], :]) / np.sum(DS[d, ii])
                    v2 = TS[t, jj].dot(self.V[tinx[jj], :]) / np.sum(TS[t, jj])
                    val = np.sum(v1 * v2)
            scores.append(np.exp(val) / (1 + np.exp(val)))  # logistic link
    elif self.K2 == 0:
        for d, t in test_data:
            val = np.sum(self.U[d, :] * self.V[t, :])
            scores.append(np.exp(val) / (1 + np.exp(val)))
    prec, rec, thr = precision_recall_curve(test_label, np.array(scores))
    aupr_val = auc(rec, prec)
    fpr, tpr, thr = roc_curve(test_label, np.array(scores))
    auc_val = auc(fpr, tpr)
    return aupr_val, auc_val
def plot_roc_class(x, y, fit_class, **kwargs):
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for i, (train_index, test_index) in enumerate(kf):
        x_train, x_test = x[train_index], x[test_index]
        y_train = y[train_index]
        clf = fit_class(**kwargs)
        clf.fit(x_train, y_train)
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(x_test)
        fpr, tpr, thresholds = roc_curve(y[test_index], y_prob[test_index, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    mean_tpr /= len(kf)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
def learning(X, y, depth, eta, rounds, subs=1.0):
    rng = np.random.RandomState()
    skf = cross_validation.StratifiedKFold(y, n_folds=3, shuffle=True, random_state=rng)
    trscores, cvscores = [], []
    num_round = rounds
    param = {"max_depth": depth, "eta": eta, "subsample": subs,  # fixed typo: xgboost expects "subsample", not "sub_sample"
             "silent": 1, "objective": "binary:logistic"}
    for train_index, test_index in skf:
        print("TRAIN:", train_index, "TEST:", test_index)
        #### cross validations ####
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        watchlist = [(dtest, "eval"), (dtrain, "train")]
        bst = xgb.train(param, dtrain, num_round, watchlist)
        ptrain = bst.predict(dtrain)
        ptest = bst.predict(dtest)
        # assuming the original `auc` meant roc_auc_score: sklearn's auc() expects (fpr, tpr), not (labels, scores)
        trscore = roc_auc_score(y_train, ptrain)
        cvscore = roc_auc_score(y_test, ptest)
        trscores.append(trscore)
        cvscores.append(cvscore)
    return np.mean(trscores), np.mean(cvscores), bst
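# A minimal sketch (not part of the snippet above, synthetic data assumed) illustrating
# why the roc_auc_score substitution is safe: roc_auc_score(y, scores) equals
# auc(fpr, tpr) computed from roc_curve on the same inputs.
import numpy as np
from sklearn.metrics import auc, roc_auc_score, roc_curve

y = np.array([0, 0, 1, 1])
scores = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, _ = roc_curve(y, scores)
assert np.isclose(roc_auc_score(y, scores), auc(fpr, tpr))  # both give 0.75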
def display_roc():
    thresholds = np.linspace(0, 1, 21)
    for hash_name in hash_names:
        tpr = []
        fpr = []
        with open(hash_name + ".same", 'r+b') as f:
            same_family_dm = np.array(cPickle.load(f))
            same_family_uniqw, same_family_inverse = np.unique(same_family_dm, return_inverse=True)
            same_family_dmlist = dict(zip(same_family_uniqw, np.bincount(same_family_inverse)))
        with open(hash_name + ".diff", 'r+b') as f:
            diff_family_dm = np.array(cPickle.load(f))
            diff_family_uniqw, diff_family_inverse = np.unique(diff_family_dm, return_inverse=True)
            diff_family_dmlist = dict(zip(diff_family_uniqw, np.bincount(diff_family_inverse)))
        for threshold in thresholds:
            tp = fp = 0
            for dm in same_family_dmlist:
                if dm <= threshold:
                    tp += same_family_dmlist[dm]
            for dm in diff_family_dmlist:
                if dm <= threshold:
                    fp += diff_family_dmlist[dm]
            tpr.append(tp * 1.0 / same_family_dm.size)
            fpr.append(fp * 1.0 / diff_family_dm.size)
        print("Fuzzy hashing algorithm: %s, AUC: %f" % (hash_name, sm.auc(fpr, tpr)))  # merged the redundant bare print of the same AUC
        plt.figure(0)
        plt.plot(fpr, tpr, label=hash_name)
    plt.ylim(0.75, 1)
    plt.legend(loc='best')
    plt.title("ROC curve for different algorithms")
    plt.xlabel("False positive rate")  # fixed typo: "posive"
    plt.ylabel("True positive rate")
    plt.show()
def plot_roc_estimator(estimator, x, y):
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    for i, (train_index, test_index) in enumerate(kf):
        x_train, x_test = x[train_index], x[test_index]
        y_train = y[train_index]
        estimator.fit(x_train, y_train)
        y_prob[test_index] = estimator.predict_proba(x_test)
        fpr, tpr, thresholds = roc_curve(y[test_index], y_prob[test_index, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    mean_tpr /= len(kf)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
def eva(fff1, fff2, fff3, fff4, rocfile):
    truth = open(fff1)
    pred = open(fff2)
    y = [float(line.split(' ', 1)[0]) for line in truth]
    p = [float(line) for line in pred]
    fpr, tpr, thresholds = roc_curve(y, p, pos_label=1)
    print(auc(fpr, tpr))
    plt.figure(figsize=(4, 4), dpi=80)
    x = [0.0, 1.0]
    plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='random')
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 1.0)
    plt.xlabel("FPR", fontsize=14)
    plt.ylabel("TPR", fontsize=14)
    plt.title("ROC Curve", fontsize=14)
    plt.plot(fpr, tpr, linewidth=2, label="adaboost_fea1")

    truth = open(fff3)
    pred = open(fff4)
    y = [float(line.split(' ', 1)[0]) for line in truth]
    p = [float(line) for line in pred]
    fpr, tpr, thresholds = roc_curve(y, p, pos_label=1)
    print(auc(fpr, tpr))
    plt.plot(fpr, tpr, linewidth=2, label="adaboost_fea2")
    plt.legend(fontsize=10, loc='best')
    plt.tight_layout()
    plt.savefig(rocfile)
def main():
    (X, y) = skd.make_classification()
    N = X.shape[0]
    X = np.append(X, np.ones((N, 1)), axis=1)
    y = 2 * y - 1
    skf = StratifiedKFold(y, 5)
    for train, test in skf:
        X_train = X[train, :]
        y_train = y[train]
        X_test = X[test, :]
        y_test = y[test]
        C = 0.01
        # dual coordinate descent SVM
        clf = SVMCD(C)
        clf.fit(X_train, y_train, w_prior=np.ones(21))
        pred = clf.decision_function(X_test)
        score = clf.score(X_test, y_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
        print(score, metrics.auc(fpr, tpr), "//", end=" ")
        w1 = clf.w
        # standard svm
        clf = SVC(C=C, kernel='linear')
        clf.fit(X_train, y_train)
        pred = clf.decision_function(X_test)
        score = clf.score(X_test, y_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
        print(score, metrics.auc(fpr, tpr))
        w2 = clf.coef_
        w2.shape = (21,)
def makeROCPlot(self, filename, title, labels, roc_data):
    y = np.array(self.create_binary_label_matrix(labels))
    n_classes = y.shape[1]
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y[:, i], roc_data[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), roc_data.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot ROC curve
    plt.figure()
    plt.plot(fpr["micro"], tpr["micro"],
             label='Average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]))
    for i in range(n_classes):
        plt.plot(fpr[i], tpr[i],
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(i + 1, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    plt.savefig("figs/" + filename + '.png', bbox_inches='tight')
    #plt.show()
    plt.clf()
    return roc_auc
def draw(X, y, classifier):
    cv = StratifiedKFold(y, n_folds=6)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label="ROC fold %d (area = %0.2f)" % (i, roc_auc))

    plt.plot([0, 1], [0, 1], "--", color=(0.6, 0.6, 0.6), label="Luck")
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, "k--", label="Mean ROC (area = %0.2f)" % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic example")
    plt.legend(loc="lower right")
    plt.show()
def calculate_roc(truth, predictions):
    lb_truth = label_binarize(truth.iloc[:, -1].astype(int), np.arange(n_classes))
    lb_prediction = label_binarize(predictions.iloc[:, -1].astype(int), np.arange(n_classes))

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(len(letter_set)):
        fpr[i], tpr[i], _ = roc_curve(lb_truth[:, i], lb_prediction[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(lb_truth.ravel(), lb_prediction.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    return fpr, tpr, roc_auc
def plotROC(y_score, labels, outpdf):
    n_classes = labels.shape[1]
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(labels[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(labels.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Plot ROC curves
    plt.figure(figsize=(6, 6))  # removed a redundant bare plt.figure() call
    for i in range(4):
        plt.plot(fpr[i], tpr[i],
                 label='{0} AUC={1:0.2f}'.format(classifiers[i], roc_auc[i]))  # fixed broken label format string
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False positive rate (1-Specificity)')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.savefig(outpdf)  # fixed: save before show(), which clears the figure
    plt.show()
def evaluation(self, test_data, test_label):
    scores = self.predictR[test_data[:, 0], test_data[:, 1]]
    prec, rec, thr = precision_recall_curve(test_label, scores)
    aupr_val = auc(rec, prec)
    fpr, tpr, thr = roc_curve(test_label, scores)
    auc_val = auc(fpr, tpr)
    return aupr_val, auc_val
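# A minimal self-contained sketch (made-up labels and scores, not from the snippet above)
# of the AUPR pattern used here: recall goes on the x-axis and precision on the y-axis of auc().
import numpy as np
from sklearn.metrics import auc, precision_recall_curve

y_true = np.array([0, 1, 1, 0, 1])
scores = np.array([0.1, 0.8, 0.6, 0.4, 0.9])
prec, rec, _ = precision_recall_curve(y_true, scores)
aupr = auc(rec, prec)  # valid because precision_recall_curve returns recall monotonically decreasing
print(aupr)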
def roc_auc_truncated(labels, predictions, tpr_thresholds=(0.2, 0.4, 0.6, 0.8),
                      roc_weights=(4, 3, 2, 1, 0)):
    """
    Compute weighted area under ROC curve.

    :param labels: array-like, true labels
    :param predictions: array-like, predictions
    :param tpr_thresholds: array-like, true positive rate thresholds delimiting the ROC segments
    :param roc_weights: array-like, weights for true positive rate segments
    :return: weighted AUC
    """
    assert np.all(predictions >= 0.) and np.all(predictions <= 1.), \
        'Data predictions are out of range [0, 1]'
    assert len(tpr_thresholds) + 1 == len(roc_weights), \
        'Incompatible lengths of thresholds and weights'
    fpr, tpr, _ = roc_curve(labels, predictions)
    area = 0.
    tpr_thresholds = [0.] + list(tpr_thresholds) + [1.]
    for index in range(1, len(tpr_thresholds)):
        tpr_cut = np.minimum(tpr, tpr_thresholds[index])
        tpr_previous = np.minimum(tpr, tpr_thresholds[index - 1])
        area += roc_weights[index - 1] * (auc(fpr, tpr_cut, reorder=True) -
                                          auc(fpr, tpr_previous, reorder=True))
    tpr_thresholds = np.array(tpr_thresholds)
    # roc auc normalization to be 1 for an ideal classifier
    area /= np.sum((tpr_thresholds[1:] - tpr_thresholds[:-1]) * np.array(roc_weights))
    return area
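# A hedged usage sketch for roc_auc_truncated above, with made-up labels and scores.
# Note: auc(..., reorder=True) was removed in recent scikit-learn, so this assumes an
# older version (or dropping the keyword, since fpr from roc_curve is already sorted).
import numpy as np

labels = np.array([0, 1, 0, 1, 1, 0, 1, 0])
scores = np.array([0.2, 0.9, 0.4, 0.7, 0.8, 0.1, 0.6, 0.3])
# every positive outscores every negative, so the normalized weighted AUC is 1.0
print(roc_auc_truncated(labels, scores))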
def draw_roc_curve(classifier, cv, X, y):
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    colors = cycle(['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
    lw = 2
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
def prc_curve(targets_ts, scores_ts, targets_tr, scores_tr, model_no):
    plt.clf()
    colors = ['r', 'g', 'b', 'y', 'k', 'm']
    classes = ['lunge', 'wing_threat', 'charge', 'hold', 'tussle', 'other']
    i = 5  # only the 'other' class is plotted (the original loop set i = 5 and broke immediately)
    precision_ts, recall_ts, thresholds_ts = precision_recall_curve(targets_ts[:, i], scores_ts[:, i], pos_label=1)
    precision_tr, recall_tr, thresholds = precision_recall_curve(targets_tr[:, i], scores_tr[:, i], pos_label=1)
    area_ts = auc(recall_ts, precision_ts)
    area_tr = auc(recall_tr, precision_tr)
    test_i, f1_ts = compute_f1(precision_ts, recall_ts)
    train_i, f1_tr = compute_f1(precision_tr, recall_tr)
    print(thresholds_ts[train_i])
    plt.plot(recall_ts, precision_ts, '--',
             label="%s test AUC: %0.3f f1: %0.3f" % (classes[i], area_ts, f1_ts), color=colors[i])
    plt.plot(recall_tr, precision_tr,
             label="%s train AUC: %0.3f f1: %0.3f" % (classes[i], area_tr, f1_tr), color=colors[i])
    plt.title('Precision Recall of MC Model ' + model_no)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.legend(loc="lower left", prop={'size': 8})
    plt.grid(b=True, which='major')
    figure = plt.gcf()
    figure.set_size_inches(8, 6)
    plt.savefig('PRC_mc_model' + model_no + '.png')
def linreg_ccv_plot_roc(num_folds):
    global data
    folds = pd.create_folds(data, num_folds)
    classifier = LinearRegression()
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i in range(num_folds):
        test_x, test_y, train_x, train_y = pd.split_into_sets(data, folds, i)
        probs = classifier.fit(train_x, train_y).predict(test_x)
        fpr, tpr, thresholds = roc_curve(test_y, probs)  # takes y_true and y_score
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(folds)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('%d-fold Clustered Cross-Validation' % num_folds)
    plt.legend(loc="lower right")
    plt.show()
def plot_roc_curves(results):
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    # Plot each cross-validation result
    for i in range(len(results)):
        fpr = results[i][0]
        tpr = results[i][1]
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(results[i][0], results[i][1])
        pl.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    # Plot default for 'luck'
    pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(results)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    # Plot the mean ROC curve
    pl.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    # Set up the axes and labels
    pl.xlim([-0.05, 1.05])
    pl.ylim([-0.05, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.legend(loc="lower right", prop=font_prop)
def compute_rocauc(self):
    """
    :return:
    """
    # Binarize the output
    y_test = label_binarize(self.y_test, classes=list(range(self.n_classes)))

    # Compute ROC curve and ROC area for each class
    y_score = self.clf.predict_proba(self.X_test)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(self.n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    self.report["roc_auc"] = dict(
        fpr={str(k): v.tolist() for k, v in fpr.items()},
        tpr={str(k): v.tolist() for k, v in tpr.items()},
        roc_auc={str(k): v.tolist() for k, v in roc_auc.items()}
    )
def roc_plot(X, y, classifier, filename):
    from sklearn.metrics import roc_curve, auc
    from sklearn.cross_validation import StratifiedKFold

    plt.figure(figsize=(10, 9))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    cv = StratifiedKFold(y, n_folds=5)
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += scipy.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig(filename + '.pdf')
def CV(clf, X, y, n_folds=10):
    """ returns gini values and classifier """
    from sklearn.cross_validation import StratifiedKFold, train_test_split
    from sklearn.metrics import roc_curve, auc
    import pandas as pd

    cv = StratifiedKFold(y, n_folds=n_folds)
    auccka = []
    try:
        for train_ix, test_ix in cv:
            clf.fit(X.ix[train_ix, :], y[train_ix])
            y_pred = clf.predict_proba(X.ix[test_ix, :])[:, 1]
            y_true = y[test_ix]
            fpr, tpr, tresholds = roc_curve(y_true, y_pred)
            auccka.append(auc(fpr, tpr))
    except Exception:
        # needed when log(0) comes up in some fold
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
        # fix: train_test_split returns ndarrays, but DataFrames are needed here
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        X_train.columns = X.columns
        X_test.columns = X.columns
        # end of fix
        clf.fit(X_train, y_train)
        y_pred = clf.predict_proba(X_test)[:, 1]
        y_true = y_test
        fpr, tpr, tresholds = roc_curve(y_true, y_pred)
        auccka.append(auc(fpr, tpr))
    gini = [2 * a - 1 for a in auccka]  # renamed loop variable so it does not shadow auc()
    return gini, clf
def AUC(test_labels, predicted_labels, n_classes):
    y_test = testProbVector(n_classes, test_labels)
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], predicted_labels[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), predicted_labels.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    # Compute macro-average ROC curve and ROC area
    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    # Then interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    # Finally average it and compute AUC
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    return roc_auc  # fixed: np.asarray() on a dict produced a useless 0-d object array
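# A compact, self-contained sketch (made-up data, no external testProbVector helper)
# of the micro/macro-average ROC AUC pattern used above.
import numpy as np
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import label_binarize

n_classes = 3
y = label_binarize([0, 1, 2, 2, 1, 0], classes=[0, 1, 2])
scores = np.array([[0.7, 0.2, 0.1],
                   [0.2, 0.5, 0.3],
                   [0.1, 0.3, 0.6],
                   [0.2, 0.2, 0.6],
                   [0.3, 0.4, 0.3],
                   [0.8, 0.1, 0.1]])
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y[:, i], scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# micro-average: pool all (label, score) pairs
fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), scores.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# macro-average: interpolate per-class TPR on a shared FPR grid, then average
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = sum(np.interp(all_fpr, fpr[i], tpr[i]) for i in range(n_classes)) / n_classes
roc_auc["macro"] = auc(all_fpr, mean_tpr)
print(roc_auc["micro"], roc_auc["macro"])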
def bootstrap(n_percent, m_times):
    global data
    classifier = LogisticRegression()
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    for i in range(m_times):
        test_x, test_y, train_x, train_y = pd.bootstrap_sampling(data, n_percent, i)  # use i as seed
        probs = classifier.fit(train_x, train_y).predict_proba(test_x)
        fpr, tpr, thresholds = roc_curve(test_y, probs[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        #plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
        plt.plot(fpr, tpr, lw=1)  # let's not do labels

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= m_times
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Bootstrap %d percent of data %d times (SCOREDATA.vina.balanced)' % (n_percent, m_times))
    plt.legend(loc="lower right")
    plt.show()
def eva_complex(fff1, y1, fff3, y2, rocfile):
    truth = open(fff1)
    y = [float(line.split(' ', 1)[0]) for line in truth]
    p = y1
    fpr, tpr, thresholds = roc_curve(y, p, pos_label=1)
    print(auc(fpr, tpr))
    plt.figure(figsize=(4, 4), dpi=80)
    x = [0.0, 1.0]
    plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='random')
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 1.0)
    plt.xlabel("FPR", fontsize=14)
    plt.ylabel("TPR", fontsize=14)
    plt.title("ROC Curve", fontsize=14)
    plt.plot(fpr, tpr, linewidth=2, label="complex_allfea")
    '''
    truth = open(fff3)
    y = [float(line.split(' ', 1)[0]) for line in truth]
    p = y2
    fpr, tpr, thresholds = roc_curve(y, p, pos_label=1)
    print(auc(fpr, tpr))
    plt.plot(fpr, tpr, linewidth=2, label="complex_fea2")
    '''
    plt.legend(fontsize=10, loc='best')
    plt.tight_layout()
    plt.savefig(rocfile)
def classify_only(X, Y, model):
    cv = cross_validation.StratifiedKFold(Y, n_folds=K_FOLDS)
    # print(len(Y))
    mean_tpr = 0.0
    mean_fpr = numpy.linspace(0, 1, 100)
    all_tpr = []
    for i, (train, test) in enumerate(cv):
        probas_ = model.fit(X.values[train], Y.values[train]).predict_proba(X.values[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(Y.values[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example: ' + model.__class__.__name__)
    plt.legend(loc="lower right")
    plt.show()
    print("Plot done")
def plot_roc_curve(fper, tper, roc_auc, n_classes):
    '''
    This function plots the ROC (Receiver Operating Characteristic) curve
    and calculates the area under the curve (AUC).

    Parameters:
        fper : array-like
        tper : array-like
        roc_auc : array-like
        n_classes : int

    Returns:
        null
    '''
    # Aggregate all false positive rates
    all_fper = np.unique(np.concatenate([fper[i] for i in range(n_classes)]))
    lw = 2

    # Then interpolate all ROC curves at these points
    mean_tper = np.zeros_like(all_fper)
    for i in range(n_classes):
        mean_tper += np.interp(all_fper, fper[i], tper[i])

    # Average it and compute AUC
    mean_tper /= n_classes
    fper["macro"] = all_fper
    tper["macro"] = mean_tper
    roc_auc["macro"] = auc(fper["macro"], tper["macro"])

    # Plot all ROC curves
    plt.figure()
    # micro-average
    plt.plot(fper["micro"], tper["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)
    # macro-average
    plt.plot(fper["macro"], tper["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fper[i], tper[i], color=color, lw=lw,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()
def evaluate(model: net, true: torch.tensor, cond: torch.tensor, out: torch.tensor,
             loss_fn: Callable[[torch.tensor, torch.tensor], torch.tensor],
             acc: Callable[[torch.tensor, torch.tensor], float], test_dir: str) -> None:
    """Use trained model to generate ROC curve and magnitude distribution plots.

    Args:
        model (net): the feedforward network
        true (torch.tensor): the true galaxy magnitudes used as inputs
        cond (torch.tensor): the observing conditions used as inputs
        out (torch.tensor): ground truth observed galaxy magnitudes
        loss_fn (Callable[[torch.tensor, torch.tensor], torch.tensor]): loss function
        acc (Callable[[torch.tensor, torch.tensor], float]): accuracy function
        test_dir (str): the directory to save the plots to.
    """
    model.eval()
    noise = Variable(torch.randn(cond.shape[0], 1)).cuda(non_blocking=True)
    predout = model(cond, true, noise).squeeze().data.cpu()
    loss = loss_fn(predout, out).item()
    out = out.cpu().numpy()
    true = true.cpu().numpy()
    pred = (predout >= 0.5).int().numpy()
    accuracy = acc(pred, out)
    fpr, tpr, _ = roc_curve(out, predout, pos_label=1)
    roc_auc = auc(fpr, tpr)
    r = -2.5 * np.log10(true[pred == 1][:, 1]) + 30.
    i = -2.5 * np.log10(true[pred == 1][:, 2]) + 30.
    z = -2.5 * np.log10(true[pred == 1][:, 3]) + 30.

    plt.figure()
    plt.plot(fpr, tpr, lw=2, label="AUC = {:.2f}".format(roc_auc))
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Detection ROC Curve")
    plt.legend(loc="best")
    plt.grid(True)
    plt.savefig(os.path.join(test_dir, "detroc.png"))

    plt.figure()
    plt.hist2d(i, r - i, bins=100, range=[[20, 25], [-2, 2]])
    plt.xlabel(r"$i_\mathrm{true}$")  # raw strings so the LaTeX escapes survive
    plt.ylabel(r"$(r-i)_\mathrm{true}$")
    plt.colorbar()
    plt.savefig(os.path.join(test_dir, "ri_i_t.png"))

    plt.figure()
    plt.hist2d(i - z, r - i, bins=100, range=[[-2, 2], [-2, 2]])
    plt.xlabel(r"$(i-z)_\mathrm{true}$")
    plt.ylabel(r"$(r-i)_\mathrm{true}$")
    plt.colorbar()
    plt.savefig(os.path.join(test_dir, "ri_iz_t.png"))

    logging.info(f"- Test metrics : loss = {loss}; accuracy = {accuracy}; "
                 f"roc_auc = {roc_auc}")
    return None
print(accuracy_score(train_labels, train_pred), file=f)

print('\n********AdaBoosting_Performance on the Test Set********', file=f)
print(confusion_matrix(test_labels, test_pred), file=f)
print(classification_report(test_labels, test_pred), file=f)
print(accuracy_score(test_labels, test_pred), file=f)

test_pred = ada.fit(train_features, train_labels).predict(test_features)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(test_labels[:], test_pred[:])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(test_labels.ravel(), test_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

plt.figure()
lw = 2
plt.plot(fpr[0], tpr[0], color='darkorange', label='ROC curve (area = %0.2f)' % (roc_auc[0]))
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
def model(protein, i):
    if i == 0:
        with open('.\\result\\' + protein + '-CNN_test.csv', 'a+') as f:
            f.write('cycle--epoch' + ',' + 'ACC' + ',' + 'SPE' + ',' + 'SEN' + ',' + 'AUC' + '\n')
        with open('.\\result\\' + protein + '-CNN_val.csv', 'a+') as e:
            e.write('cycle--epoch' + ',' + 'ACC' + ',' + 'SPE' + ',' + 'SEN' + ',' + 'AUC' + '\n')
    data = pd.read_table(path + protein + '-maccs.csv', delimiter=',')
    X = data.iloc[:, 0]
    y = data.iloc[:, 2]
    X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2)
    X_train, X_validation, y_train, y_validation = train_test_split(X_, y_, test_size=0.25)
    X_train, X_validation, X_test = resplit(X_train), resplit(X_validation), resplit(X_test)
    X_train, y_train = del_oversize(X_train, y_train, MAXLEN)
    X_test, y_test = del_oversize(X_test, y_test, MAXLEN)
    X_validation, y_validation = del_oversize(X_validation, y_validation, MAXLEN)
    X_train, X_test, X_validation = onehot_encode(MAXLEN, X_train), onehot_encode(MAXLEN, X_test), onehot_encode(MAXLEN, X_validation)
    X_train = X_train[:, np.newaxis, :, :]
    X_test = X_test[:, np.newaxis, :, :]
    X_validation = X_validation[:, np.newaxis, :, :]
    y_train, y_test, y_validation = y_train.values, y_test.values, y_validation.values
    X_train, X_test, X_validation = t.from_numpy(X_train).type(t.FloatTensor), t.from_numpy(X_test).type(t.FloatTensor), t.from_numpy(X_validation).type(t.FloatTensor)
    y_train, y_test, y_validation = t.from_numpy(y_train), t.from_numpy(y_test), t.from_numpy(y_validation)
    net = CNN(N_HIDDEN, DROPOUT)
    if i == 0:
        print(net)
    optimizer = t.optim.Adam(net.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    loss_func = nn.CrossEntropyLoss()
    test_x, test_y, validation_x, validation_y = Variable(X_test), Variable(y_test), Variable(X_validation), Variable(y_validation)
    train_loader = create_loader(X_train, y_train)
    for epoch in range(EPOCH):
        for step, (x, y) in enumerate(train_loader):
            b_x = Variable(x)  # batch x
            b_y = Variable(y)  # batch y
            output = net(b_x)  # cnn output
            pred_t_y = t.max(F.softmax(output, dim=1), 1)[1]
            accuracy_t_y = (pred_t_y == b_y).data.numpy().sum() / b_y.size(0)
            loss = loss_func(output, b_y)  # cross entropy loss
            optimizer.zero_grad()  # clear gradients for this training step
            loss.backward()  # backpropagation, compute gradients
            optimizer.step()  # apply gradients
            if step == math.floor(X_train.size(0) / BATCH_SIZE):
                net.eval()  # with dropout, switch to eval mode for prediction so dropout is disabled
                validation_output = net(validation_x)
                score_v = F.softmax(validation_output, dim=1)[:, 1].data.numpy()
                fpr_v, tpr_v, thresholds_v = metrics.roc_curve(y_validation.numpy(), score_v, pos_label=1)
                auc_v = metrics.auc(fpr_v, tpr_v)
                pred_v_y = t.max(F.softmax(validation_output, dim=1), 1)[1]
                accuracy_v = (pred_v_y == validation_y).data.numpy().sum() / validation_y.size(0)
                if (epoch + 1) % 10 == 0:
                    confusion_v = metrics.confusion_matrix(y_validation.numpy(), pred_v_y.data.numpy())
                    TP_v = confusion_v[1, 1]
                    TN_v = confusion_v[0, 0]
                    FP_v = confusion_v[0, 1]
                    FN_v = confusion_v[1, 0]
                    sen_v = TP_v / (TP_v + FN_v)
                    spe_v = TN_v / (TN_v + FP_v)
                    with open('.\\result\\' + protein + '-CNN_val.csv', 'a+') as e:
                        e.write(str(i) + '--' + str(epoch) + ',' + '{:.3f}'.format(float(accuracy_v)) + ',' + '{:.3f}'.format(float(spe_v)) + ',' + '{:.3f}'.format(float(sen_v)) + ',' + '{:.3f}'.format(float(auc_v)) + '\n')
                text = 'Epoch: ' + str(epoch) + ' | acc: %.4f' % accuracy_v + ' | auc: %.4f' % auc_v
                x_1 = t.Tensor([epoch])
                y_3 = t.Tensor([loss.data[0]])  # cross-entropy loss
                y_1 = t.Tensor([accuracy_t_y])
                y_2 = t.Tensor([accuracy_v])
                y_4 = t.Tensor([auc_v])
                vis.line(X=x_1, Y=y_3, win='pic1', update='append' if epoch > 0 else None, opts=dict(title='acc & loss'))
                vis.updateTrace(X=x_1, Y=y_1, win='pic1', name='train')
                vis.updateTrace(X=x_1, Y=y_2, win='pic1', name='validation')
                vis.updateTrace(X=x_1, Y=y_4, win='pic1', name='auc')
                vis.text(text, win='log', opts={'title': 'nn accuracy'}, append=True)
                net.train()
        if (epoch + 1) % 10 == 0:
            net.eval()
            test_output = net(test_x)
            score = F.softmax(test_output, dim=1)[:, 1].data.numpy()
            fpr, tpr, thresholds = metrics.roc_curve(y_test.numpy(), score, pos_label=1)
            auc_t = metrics.auc(fpr, tpr)
            pred_y = t.max(F.softmax(test_output, dim=1), 1)[1]
            acc = (pred_y == test_y).data.numpy().sum() / test_y.size(0)
            confusion = metrics.confusion_matrix(y_test.numpy(), pred_y.data.numpy())
            TP = confusion[1, 1]
            TN = confusion[0, 0]
            FP = confusion[0, 1]
            FN = confusion[1, 0]
            sen = TP / (TP + FN)
            spe = TN / (TN + FP)
            print('==========' + protein + '-CNN=========')
            print('The accuracy is: %.3f' % acc)
            print('The specificity is: %.3f' % spe)
            print('The sensitivity is: %.3f' % sen)
            print('The auc is: %.3f' % auc_t)
            print('============================')
            with open('.\\result\\' + protein + '-CNN_test.csv', 'a+') as f:
                f.write(str(i) + '--' + str(epoch) + ',' + '{:.3f}'.format(float(acc)) + ',' + '{:.3f}'.format(float(spe)) + ',' + '{:.3f}'.format(float(sen)) + ',' + '{:.3f}'.format(float(auc_t)) + '\n')
            net.train()
def plot_roc_curve(y_test, y_pred, title=None, micro=False, macro=True, per_class=False):
    if y_test.ndim == 2:
        num_instances, num_classes = y_test.shape
    else:
        num_instances = y_test.shape[0]
        num_classes = 1
    if (num_classes != 2) and (y_test.ndim == 1):
        bi_y_test = label_binarize(y_test, classes=range(num_classes))
    else:
        bi_y_test = y_test

    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in range(num_classes):
        fpr[i], tpr[i], _ = roc_curve(bi_y_test[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    fpr['micro'], tpr['micro'], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])

    # Compute macro-average ROC curve and AUC
    # Aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(num_classes)]))
    # Interpolate all ROC curves at these points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(num_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])
    # Average and compute AUC
    mean_tpr /= num_classes
    fpr['macro'] = all_fpr
    tpr['macro'] = mean_tpr
    roc_auc['macro'] = auc(fpr['macro'], tpr['macro'])

    # Plot all ROC curves
    plt.figure(figsize=(10, 10))
    if per_class:
        for i in range(num_classes):
            plt.plot(fpr[i], tpr[i], alpha=0.2,
                     label='ROC curve of class {0} (area = {1:0.4f})'.format(i + 1, roc_auc[i]))
    if micro:
        plt.plot(fpr['micro'], tpr['micro'],
                 label='micro-average ROC curve (area = {0:0.4f})'.format(roc_auc['micro']),
                 color='orangered', linestyle=':', linewidth=3)
    if macro:
        plt.plot(fpr['macro'], tpr['macro'],
                 label='macro-average ROC curve (area = {0:0.4f})'.format(roc_auc['macro']),
                 color='navy', linestyle=':', linewidth=3)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)  # fixed: was a duplicate xticks call
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    if isinstance(title, str):
        plt.title(title, fontsize=16)
    elif title is not None:
        print('Title must be a string.')
        plt.title('ROC Curves', fontsize=16)
    else:
        plt.title('ROC Curves', fontsize=16)
    plt.legend(loc=4)
    plt.show()
def sim_same_and_diff_category_samples(self, df, cat_index=1, dist_type='cosine',
                                       equal_var=False, plot_roc=True,
                                       precalc_dist=False, calc_roc=True):
    '''
    Calculate the similarity of samples from the same and different categories.
    The cat_index gives the index of the category, where 1 is the first category.
    '''
    cols = df.columns.tolist()

    if type(precalc_dist) == bool:
        # compute distance between rows (transpose to get cols as rows)
        dist_arr = 1 - pdist(df.transpose(), metric=dist_type)
    else:
        dist_arr = precalc_dist

    # generate sample names with categories
    sample_combos = list(combinations(range(df.shape[1]), 2))
    sample_names = [
        str(ind) + '_same'
        if cols[x[0]][cat_index] == cols[x[1]][cat_index]
        else str(ind) + '_different'
        for ind, x in enumerate(sample_combos)
    ]
    ser_dist = pd.Series(data=dist_arr, index=sample_names)

    # find same-cat sample comparisons
    same_cat = [x for x in sample_names if x.split('_')[1] == 'same']
    # find diff-cat sample comparisons
    diff_cat = [x for x in sample_names if x.split('_')[1] == 'different']

    # make series of same and diff category sample comparisons
    ser_same = ser_dist[same_cat]
    ser_same.name = 'Same Category'
    ser_diff = ser_dist[diff_cat]
    ser_diff.name = 'Different Category'

    sim_dict = {}
    roc_data = {}
    sim_data = {}
    sim_dict['same'] = ser_same
    sim_dict['diff'] = ser_diff

    pval_dict = {}
    ttest_stat, pval_dict['ttest'] = ttest_ind(ser_diff, ser_same, equal_var=equal_var)
    ttest_stat, pval_dict['mannwhitney'] = mannwhitneyu(ser_diff, ser_same)

    if calc_roc:
        # calc AUC
        true_index = list(np.ones(sim_dict['same'].shape[0]))
        false_index = list(np.zeros(sim_dict['diff'].shape[0]))
        y_true = true_index + false_index

        true_val = list(sim_dict['same'].get_values())
        false_val = list(sim_dict['diff'].get_values())
        y_score = true_val + false_val

        fpr, tpr, thresholds = roc_curve(y_true, y_score)
        inst_auc = auc(fpr, tpr)

        if plot_roc:
            plt.figure()
            plt.plot(fpr, tpr)
            plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
            plt.figure(figsize=(10, 10))
            print('AUC', inst_auc)

        roc_data['true'] = y_true
        roc_data['score'] = y_score
        roc_data['fpr'] = fpr
        roc_data['tpr'] = tpr
        roc_data['thresholds'] = thresholds
        roc_data['auc'] = inst_auc

    sim_data['sim_dict'] = sim_dict
    sim_data['pval_dict'] = pval_dict
    sim_data['roc_data'] = roc_data

    return sim_data
    use_multiprocessing=True,
    validation_data=testingGenerator,
    validation_steps=1,
    verbose=1,
    workers=settings['cores']
)
kerasDFFNN.save(os.path.join(settings['outputDirectory'], 'final.hdf5'))

### AREA UNDER THE PRECISION/RECALL CURVE
x = kerasDFFNN.predict_generator(
    validationGenerator,
    steps=settings['epochSteps'],
    use_multiprocessing=True,
    verbose=1,
    workers=settings['cores']
)
y = []
for k in range(0, settings['epochSteps']):
    ### inefficient (reads files a second time), but works
    with open(validationFiles[k]) as reader:
        for line in reader:
            d = list(map(float, line.rstrip().split(',')))
            if len(d) == settings['width'] + 1:
                y.append(int(d[0]))
p, r, t = precision_recall_curve(y, x[:, 1])
a = auc(r, p)
stats = open(os.path.join(settings['outputDirectory'], 'stats.csv'), 'w')
stats.write('threshold,precision,recall,F1,AUC\n')
for k in range(0, len(t)):
    stats.write(str(t[k]) + ',' + str(p[k]) + ',' + str(r[k]) + ',' +
                str(2 * ((p[k] * r[k]) / (p[k] + r[k]))) + ',' + str(a) + '\n')
stats.close()
    ranked_frequencies = y_true[ranking]
    ranked_exposure = exposure[ranking]
    cumulated_claims = np.cumsum(ranked_frequencies * ranked_exposure)
    cumulated_claims /= cumulated_claims[-1]
    cumulated_exposure = np.cumsum(ranked_exposure)
    cumulated_exposure /= cumulated_exposure[-1]
    return cumulated_exposure, cumulated_claims


fig, ax = plt.subplots(figsize=(8, 8))

for model in [dummy, ridge_glm, poisson_glm, poisson_gbrt]:
    y_pred = model.predict(df_test)
    cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], y_pred, df_test["Exposure"])
    gini = 1 - 2 * auc(cum_exposure, cum_claims)
    label = "{} (Gini: {:.2f})".format(model[-1], gini)
    ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)

# Oracle model: y_pred == y_test
cum_exposure, cum_claims = lorenz_curve(df_test["Frequency"], df_test["Frequency"], df_test["Exposure"])
gini = 1 - 2 * auc(cum_exposure, cum_claims)
label = "Oracle (Gini: {:.2f})".format(gini)
ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)

# Random baseline
ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline")
ax.set(
    title="Lorenz curves by model",
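# A self-contained sketch (synthetic data, assumed names) of the Gini-from-Lorenz pattern
# above: rank by prediction, accumulate exposure-weighted claims, then Gini = 1 - 2 * AUC
# of the Lorenz curve.
import numpy as np
from sklearn.metrics import auc

def lorenz_curve(y_true, y_pred, exposure):
    # rank observations from lowest to highest predicted risk
    ranking = np.argsort(y_pred)
    cum_claims = np.cumsum(y_true[ranking] * exposure[ranking])
    cum_claims = cum_claims / cum_claims[-1]
    cum_exposure = np.cumsum(exposure[ranking])
    cum_exposure = cum_exposure / cum_exposure[-1]
    return cum_exposure, cum_claims

rng = np.random.default_rng(0)
y_true = rng.poisson(1.0, size=1000).astype(float)
exposure = np.ones(1000)
cum_exp, cum_claims = lorenz_curve(y_true, y_true, exposure)  # oracle: predict the truth
print(1 - 2 * auc(cum_exp, cum_claims))  # positive Gini; ~0 for a random ranking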
         label=u'predicted, $R^2$=%.3f' % lr2.score(X1_train, Y1_train))
plt.legend(loc='upper left')
plt.xlabel(u'sample index', fontsize=18)
plt.ylabel(u'wine quality', fontsize=18)
plt.title(u'wine quality prediction (with dimensionality reduction)', fontsize=20)
plt.show()

### Check the effect from the AUC angle ===> works well
from sklearn.preprocessing import label_binarize
from sklearn import metrics

y_test_hot = label_binarize(Y_test, classes=(3, 4, 5, 6, 7, 8, 9)).ravel()

### Model on the original data
## decision scores for the predictions
lr_y_score = lr.decision_function(X_test).ravel()
## compute the ROC values
lr_fpr, lr_tpr, lr_threasholds = metrics.roc_curve(y_test_hot, lr_y_score)
## compute the AUC value
lr_auc = metrics.auc(lr_fpr, lr_tpr)

## Model on the reduced data
lr2_y_score = lr2.decision_function(X1_test).ravel()
## compute the ROC values
lr2_fpr, lr2_tpr, lr2_threasholds = metrics.roc_curve(y_test_hot, lr2_y_score)
## compute the AUC value
lr2_auc = metrics.auc(lr2_fpr, lr2_tpr)

print("AUC on original data:", lr_auc)
print("AUC on reduced data:", lr2_auc)
def test(model, training_data, validation_data, test_data, loss_fn, device, opt):
    ''' Epoch operation in evaluation phase '''
    best_train_scores = eval_epoch(model, training_data, loss_fn, device, opt)[1]
    best_valid_scores = eval_epoch(model, validation_data, loss_fn, device, opt)[1]

    model.eval()
    count = 0
    total_loss = 0
    true_all = []
    pred_all = []
    with torch.no_grad():
        for batch in tqdm(test_data, mininterval=2, desc='  - (Validation) ', leave=False):
            # prepare data
            if opt.feature:
                note, length, mortality, feature = map(lambda x: x.to(device), batch)
                pred = model(note, length, feature)
            else:
                note, length, mortality = map(lambda x: x.to(device), batch)
                pred = model(note, length)
            # backward
            loss = loss_fn(pred, mortality.view(-1))
            # note keeping
            total_loss += loss.item()
            count += 1
            # probability
            true_all.append(mortality.view(-1))
            pred_all.append(F.softmax(pred, dim=1)[:, 1].view(-1))  # dim made explicit

    true_all = torch.cat(true_all, axis=0)
    pred_all = torch.cat(pred_all, axis=0)
    roc_auc = roc_auc_score(true_all.cpu(), pred_all.cpu())
    precision, recall, thresholds = precision_recall_curve(true_all.cpu(), pred_all.cpu())
    pr_auc = auc(recall, precision)
    ap = average_precision_score(true_all.cpu(), pred_all.cpu())
    p_at_1 = precision_at_k(true_all.cpu(), pred_all.cpu(), 1)
    p_at_5 = precision_at_k(true_all.cpu(), pred_all.cpu(), 5)
    p_at_10 = precision_at_k(true_all.cpu(), pred_all.cpu(), 10)
    loss_per_word = total_loss / count

    print("----- Test Result -----")
    print("ROC AUC:", roc_auc)
    print("PR AUC:", pr_auc)
    print("Loss:", loss_per_word)

    if not os.path.exists("./results/"):
        os.mkdir("results")
    if not os.path.exists(f"./results/{opt.task}"):
        os.mkdir(f"./results/{opt.task}")
    if not os.path.exists(f"./results/{opt.task}/{opt.name}"):
        os.mkdir(f"./results/{opt.task}/{opt.name}")

    outname = f'{opt.period}.csv'
    if opt.text:
        outname = "text_" + outname
    if opt.feature:
        outname = "feature_" + outname
    print("Write Result to ", outname)
    with open(os.path.join('./results/', opt.task, opt.name, outname), 'w') as f:
        f.write("TYPE,ROCAUC,PRAUC,AP,P@1,P@5,P@10\n")
        f.write(
            f"train,{best_train_scores[0]},{best_train_scores[1]},{best_train_scores[2]},{best_train_scores[3]},{best_train_scores[4]},{best_train_scores[5]}\n"
        )
        f.write(
            f"valid,{best_valid_scores[0]},{best_valid_scores[1]},{best_valid_scores[2]},{best_valid_scores[3]},{best_valid_scores[4]},{best_valid_scores[5]}\n"
        )
        f.write(f"test,{roc_auc},{pr_auc},{ap},{p_at_1},{p_at_5},{p_at_10}")
def drawROCCurveFromClassifiers(classifilers, class_labels, X_train, y_train, X_test, y_test, positiveLabel=1):
    """
    Draws ROC curves for a list of classifiers.
    [Input]
        classifilers : list of estimator objects
            objects implementing fit() and predict()
        class_labels : list <str>
    """
    # marker/color maps matching the classifiers (up to 5 classes), kept for reference
    #tuple_makers = ("s", "x", "+", "^", "v")                          # tuple (constant list)
    #tuple_colors = ("red", "blue", "lightgreen", "gray", "cyan")      # fill colors
    #tuple_linestyle = ('k--', '-', '-.', '--', "---")

    # plot the ROC curve of each weak classifier clf in classifilers
    for (clf, label) in zip(classifilers, class_labels):
        # fit the classifier on the training data
        predict = clf.fit(X_train, y_train)
        #print("predict : \n", predict)

        # predicted class-membership probabilities for the test data via predict_proba()
        proba = predict.predict_proba(X_test)
        #print("predict_proba : \n", proba)

        # compute the ROC curve values (FPR, TPR) from the true labels and predicted probabilities
        fpr, tpr, thresholds = roc_curve(
            y_true=y_test,            # true binary labels in range {0, 1} or {-1, 1}
            y_score=proba[:, 1],      # target scores: probability estimates of the positive class, confidence values, or non-thresholded decision measures
            pos_label=positiveLabel   # label treated as positive
        )

        # compute the AUC value
        roc_auc = auc(fpr, tpr)
        #print("roc_auc : \n", roc_auc)

        # plot the computed ROC curve
        plt.plot(
            fpr, tpr,  # false positive rate [FPR] vs. true positive rate [TPR]
            lw=2,
            label='%s (AUC = %0.2f)' % (label, roc_auc))

    # ROC curve of perfect performance
    plt.plot([0, 0, 1], [0, 1, 1], lw=1, linestyle=':', color='black',
             label='perfect performance (AUC = 1.00)')
    # ROC curve & AUC of random guessing
    plt.plot([0, 1], [0, 1], lw=1, linestyle='--', color=(0.6, 0.6, 0.6),
             label='random guessing (AUC = 0.50)')

    #
    plt.title("ROC Curve [Receiver Operator Characteristic Curve]")
    plt.xlabel("FPR : false positive rate")
    plt.ylabel("TPR : true positive rate")
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.legend(loc='best')
    return
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
print("k: ", bestK)
print("auc =", roc_auc)

cat = pd.concat([dfpredicted, dfphenotypes], axis=1)
cat.columns = ["PredictedPhenoProba", "realPheno"]
catsort = cat.sort_values(by="PredictedPhenoProba")
precision, recall, thresholds = metrics.precision_recall_curve(
    catsort["realPheno"], catsort["PredictedPhenoProba"], pos_label=1)
cat = pd.concat([pd.DataFrame(precision), pd.DataFrame(recall)], axis=1)
cat.columns = ["precision", "recall"]
catsort = cat.sort_values(by="recall")
plt.plot(catsort["recall"], catsort["precision"],
         label='Precision Recall curve (area = %0.3f)' % metrics.auc(catsort["recall"], catsort["precision"]))
#plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall curve')
plt.legend(loc="upper right")
a = metrics.auc(catsort["recall"], catsort["precision"])
print("aupr =", a)
plt.savefig('/Users/Iryna/Desktop/myimage.pdf', format="pdf", dpi=1200)
out = pd.DataFrame([roc_auc, a], index=["auc", "aupr"], columns=[bestK])
out.to_csv(dirShared + "PerfMeasures_k=" + str(bestK) + ".txt", sep="\t")
def cal_auc_ks_iv(df, targets=[0, 1, 3, 7, 14, 30], text='', max_depth=2, plot=True, precision=3):
    '''
    Compute AUC, KS, and IV values, and plot the corresponding ROC curves.
    '''
    ks = pd.DataFrame()
    ac = pd.DataFrame()
    iv = pd.DataFrame()
    dn = [f'{n}d' for n in targets]
    cols = set(df.columns) - set(dn)
    for n in targets:
        auc_value = []
        ks_value = []
        iv_value = []
        plt.figure(figsize=(6, 4), dpi=100)
        for var in cols:
            y_true = df[df[var].notnull()][f'{n}d']
            y_pred = df[df[var].notnull()][var]
            # compute fpr, tpr and thresholds for each variable
            fpr, tpr, thr = roc_curve(y_true, y_pred, pos_label=1)
            # compute the AUC value (flip the score if it ranks the wrong way)
            ac_single = auc(fpr, tpr)
            if ac_single < 0.5:
                fpr, tpr, thr = roc_curve(y_true, -y_pred, pos_label=1)
                ac_single = auc(fpr, tpr)
            auc_value.append(ac_single)
            # compute the K-S value
            ks_single = (tpr - fpr).max()
            ks_value.append(ks_single)
            # compute the IV value
            iv_single = cal_woe_iv(y_pred, y_true, max_depth=max_depth)[1]
            iv_value.append(iv_single)
            if plot:
                # ROC curve (removed a stray duplicate unlabeled plot of the same curve)
                plt.plot(fpr, tpr, lw=1,
                         label=f'{var}(auc=' + str(round(ac_single, precision)) + ')')
        # labels
        plt.grid()
        plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6))
        plt.plot([0, 0, 1], [0, 1, 1], lw=1, linestyle=':', color='black')
        plt.xlabel('false positive rate')
        plt.ylabel('true positive rate')
        plt.title(f'{text}ROC for {n}d')
        plt.legend(loc='best')
        auc_part = pd.DataFrame(auc_value, columns=[f'{n}d'], index=cols)
        ac = pd.concat([ac, auc_part], axis=1)
        ks_part = pd.DataFrame(ks_value, columns=[f'{n}d'], index=cols)
        ks = pd.concat([ks, ks_part], axis=1)
        iv_part = pd.DataFrame(iv_value, columns=[f'{n}d'], index=cols)
        iv = pd.concat([iv, iv_part], axis=1)
    iv = np.round(iv, precision)
    ac = np.round(ac, precision)
    ks = np.round(ks, precision)
    return ac, ks, iv
def metrics(X, Y, A, B, N):
    incorrect = 0
    true_pos = 0
    false_pos = 0
    true_neg = 0
    false_neg = 0
    y_true = []
    y_pred = []
    i = 0
    for x in X:
        prediction = np.argmax(stable_softmax(x, A, B))
        true_label = np.argmax(Y[i])
        y_true.append(true_label)
        y_pred.append(prediction)
        if prediction != true_label:
            incorrect += 1
        if prediction == 1 and true_label == 1:
            true_pos += 1
        if prediction == 1 and true_label == 0:
            false_pos += 1
        if prediction == 0 and true_label == 0:
            true_neg += 1
        if prediction == 0 and true_label == 1:
            false_neg += 1
        i += 1

    print("confusion matrix: ")
    print("[ ", true_neg, false_pos, " ]")
    print("[ ", false_neg, true_pos, " ]")

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Compute fpr, tpr, thresholds and roc auc
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    print("AUC score: ", roc_auc)

    if true_pos == 0 and false_pos == 0:
        print("WARNING::True pos and False pos both zero")
        # degenerate case: precision and recall are 0, so define F1 as 0 to avoid a 0/0 division
        precision = 0.0
        recall = 0.0
        F1 = 0.0
        classification_error = incorrect / N
    else:
        precision = true_pos / (true_pos + false_pos)  # positive predictive value
        recall = true_pos / (true_pos + false_neg)     # true positive rate (TPR)
        F1 = 2 * ((precision * recall) / (precision + recall))
        classification_error = incorrect / N
    print()
    return classification_error, precision, recall, F1, roc_auc, fpr, tpr
def showResults(pred_labels, test_labels):
    binder, nonBinder = showBinder(pred_labels)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, pred_labels)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("SVM predicted %s binder and %s nonBinder. AUC = %s" % (binder, nonBinder, roc_auc))
def drawROCCurveFromTrainTestIterator(classifiler, iterator, X_train, y_train, X_test, y_test, positiveLabel=1):
    """
    Draws ROC curves from an iterator that splits the data into training and test folds.
    [Input]
        classifiler : estimator object
            an object implementing fit() and predict()
        iterator : list
            the CV iterator
    [Output]
        figure : matplotlib.figure object
            container class holding the drawn artists (derived from Artist)
    """
    # create a Figure object and set the plot size
    figure = plt.figure(figsize=(7, 5))

    # initialize the false positive rate [FPR] and true positive rate [TPR] of the mean ROC curve
    means_tpr = 0.0
    means_fpr = numpy.linspace(0, 1, 100)  # split the [0, 1] probability range into 100 points
    #all_tpr = []  # initialize as an empty list

    #---------------------------------------------------------------------------------------
    # loop over the (train, test) pairs produced by the iterator (parallel loop via enumerate)
    # draw the ROC curve & AUC for each fold
    #---------------------------------------------------------------------------------------
    for it, (train, test) in enumerate(iterator):
        #print("X_train[train] : \n", X_train[train])
        #print("y_train[train] : \n", y_train[train])

        # fit the estimator classifiler on the training fold
        predict = classifiler.fit(X_train[train], y_train[train])
        #print("predict : \n", predict)

        # predicted class-membership probabilities for the test fold via predict_proba()
        proba = predict.predict_proba(X_train[test])
        #print("predict_proba : \n", proba)

        # compute the ROC curve values (FPR, TPR) from the true labels and predicted probabilities
        fpr, tpr, thresholds = roc_curve(
            y_true=y_train[test],     # true binary labels in range {0, 1} or {-1, 1}
            y_score=proba[:, 1],      # target scores: probability estimates of the positive class, confidence values, or non-thresholded decision measures
            pos_label=positiveLabel   # label treated as positive
        )
        #print("roc_curve() return FPR : \n", fpr)
        #print("roc_curve() return TPR : \n", tpr)
        #print("roc_curve() return thresholds : \n", thresholds)

        # linear interpolation of the obtained fpr (x values) and tpr (y values)
        means_tpr += interp(means_fpr, fpr, tpr)
        #print("means_tpr : \n", means_tpr)
        means_tpr[0] = 0.0
        #print("means_tpr : \n", means_tpr)

        # compute the AUC value
        roc_auc = auc(fpr, tpr)
        #print("roc_auc : \n", roc_auc)

        # plot the computed ROC curve
        plt.plot(
            fpr, tpr,  # false positive rate [FPR] vs. true positive rate [TPR]
            lw=1,
            label='ROC k=%d fold CV (AUC = %0.2f)' % (it + 1, roc_auc))

    # plot the mean ROC curve
    means_tpr /= len(iterator)
    means_tpr[-1] = 1.0
    mean_auc = auc(means_fpr, means_tpr)
    #print("means_tpr : \n", means_tpr)
    plt.plot(means_fpr, means_tpr, 'k--',
             label='mean ROC (AUC = %0.2f)' % mean_auc, lw=2)

    # ROC curve of perfect performance
    plt.plot([0, 0, 1], [0, 1, 1], lw=2, linestyle=':', color='black',
             label='perfect performance (AUC = 1.00)')
    # ROC curve & AUC of random guessing
    plt.plot([0, 1], [0, 1], linestyle='--', color=(0.6, 0.6, 0.6),
             label='random guessing (AUC = 0.50)')

    #
    plt.title("ROC Curve [Receiver Operator Characteristic Curve]")
    plt.xlabel("FPR : false positive rate")
    plt.ylabel("TPR : true positive rate")
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.legend(loc='best')
    #plt.grid()
    #plt.tight_layout()

    return figure
                corrctLTPC += 1
        elif true_label == 1:
            allTPC += 1
            if true_label == pred_label:
                corrctTPC += 1
    acc_lst.append(acc)
    class2_acc_lst.append(
        [corrctLTPC / float(allLTPC), corrctTPC / float(allTPC)])
    # ROC AUC
    true_class = np.array(test_label_set)            # true_class: ground-truth labels
    pred_scores = np.array([a[0] for a in result1])  # scores predicted by the classifier
    fpr, tpr, thresholds = metrics.roc_curve(true_class, pred_scores, pos_label=0)
    AUC = auc(fpr, tpr)
    # Youden's J: scan (tpr - fpr) for the best operating point
    yuedeng = []
    for i in range(len(fpr)):
        yuedeng.append(tpr[i] - fpr[i])
    yuedeng_index = yuedeng.index(max(yuedeng))
    # print('the best TPR FPR in subset-%d' % testIndex, tpr[yuedeng_index], fpr[yuedeng_index])
    auc_lst.append(AUC)
    trueAllLst += test_label_set
    scoreAllLst += [a[0] for a in result1]

    true_class = np.array(test_label_set)            # true_class: ground-truth labels
    pred_scores = np.array([a[1] for a in result1])  # scores predicted by the classifier
    fpr0, tpr0, thresholds0 = metrics.roc_curve(true_class, pred_scores,
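# --- Added sketch (not from the original): the "yuedeng" loop above computes
# --- Youden's J statistic (tpr - fpr); numpy does the same scan in one line.
import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([0, 0, 1, 1, 1, 0])
scores = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])

fpr, tpr, thresholds = roc_curve(y_true, scores)
j = tpr - fpr                 # Youden's J at every threshold
best = int(np.argmax(j))      # index of the best operating point
print(thresholds[best], tpr[best], fpr[best])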
def getAUC(pred_labels, test_labels):
    false_positive_rate, true_positive_rate, thresholds = roc_curve(test_labels, pred_labels)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    return roc_auc
print(
    classification_report(y_true=y_test,
                          y_pred=y_pred_rnd,
                          target_names=['normal', 'covid']))

fig1 = plt.figure()
sns.heatmap(data=cm,
            cmap='Blues',
            annot=True,
            annot_kws={'size': 14},
            fmt='d',
            vmin=0,
            vmax=len(y_test) / 2.)
plt.title('annotated heatmap for confusion matrix')
plt.show()
# fig1.savefig('./checkpoints/densenet121/cm_heatmap.png')

fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_pred, pos_label=None)
roc_auc = auc(x=fpr, y=tpr)

fig2 = plt.figure()
plt.plot(fpr, tpr, 'b', label='AUC = %0.4f' % roc_auc)
plt.title('Receiver Operating Characteristic')
plt.legend()
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# fig2.savefig('./checkpoints/densenet121/roc.png')
    r'D:\Users\zcguo\PycharmProjects\credit_score\data\test.csv')
test_X = test_data.iloc[:, 2:]
test_y = test_data.iloc[:, 1]
test_X = trans_woe(test_X, x1_name, x1_woe, x1_cut)
test_X = trans_woe(test_X, x2_name, x2_woe, x2_cut)
test_X = trans_woe(test_X, x3_name, x3_woe, x3_cut)
test_X = trans_woe(test_X, x7_name, x7_woe, x7_cut)
test_X = trans_woe(test_X, x9_name, x9_woe, x9_cut)
test_X = test_X.iloc[:, -5:]

# GBDT model ROC
X3 = sm.add_constant(test_X)
resuG = gbm.predict(X3)
recall1 = metrics.recall_score(test_y, resuG.round())
acc1 = metrics.accuracy_score(test_y, resuG.round())
print(recall1)
print(acc1)
fpr1, tpr1, threshold1 = metrics.roc_curve(test_y, resuG)
rocauc1 = metrics.auc(fpr1, tpr1)
plt.plot(fpr1, tpr1, 'b', label='AUC = %0.2f' % rocauc1)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.show()
# ml_algorithms(data, y_train, data_test, y_test)
# X_train2: concatenation = original + embeddings
prediction_tp, prediction_tpol_prob = ml_algorithms(X_train2, y_train, X_test2, y_test)

#%%
from sklearn.metrics import roc_curve, auc

fpr_pol, tpr_pol, _ = roc_curve((y_test == True).apply(int), prediction_tpol_prob[:, 1])
fpr, tpr, _ = roc_curve((y_test == True).apply(int), prediction_tp[:, 1])
fprn, tprn, _ = roc_curve((y_test == True).apply(int), prediction_tn[:, 1])
print('AUC for Node2Vec Logistic + Poly features + Normal Features : ', auc(fpr_pol, tpr_pol))
print('AUC for Node2Vec Logistic + Linear Features + Normal Features : ', auc(fpr, tpr))
print('AUC for Normal Features Logistic : ', auc(fprn, tprn))

plt.plot(fpr_pol, tpr_pol, 'g', label='Node2Vec Logistic + Poly features + Normal Features')
plt.plot(fpr, tpr, 'r', label='Node2Vec Logistic + Linear Features + Normal Features')
plt.plot(fprn, tprn, 'b', label='Normal Features Logistic')
plt.legend()
graphviz.Source(dot_graph).view()

##########################################################################
##########################################################################
# Finally, let's evaluate the tree's performance on the test data. The
# predict() function can be used for this purpose. We can then build a
# confusion matrix: (86 + 59) / 200 = 0.725
##########################################################################
############ Here we construct the ROC curve for the tree ################
##########################################################################
y_score = clf.predict_proba(X_test)
fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='orange',
         label='ROC curve (area = {:0.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve for our Decision Tree')
plt.legend(loc="lower right")
##########################################################################
##########################################################################
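# --- Added sketch (not from the original): roc_curve needs continuous scores,
# --- which is why column 1 of predict_proba is passed above; clf.classes_
# --- gives the column order.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
clf_demo = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_demo, y_demo)
print(clf_demo.classes_)                   # column order, e.g. [0 1]
print(clf_demo.predict_proba(X_demo)[:3])  # one probability column per class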
def kfold_cv(model, xFeat, y, k):
    """
    Split xFeat into k different groups, and then use each of the k folds as a
    validation set, with the model fitting on the remaining k-1 folds. Return
    the model performance on the training and validation (test) sets.

    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model
    xFeat : nd-array with shape n x d
        Features of the dataset
    y : 1-array with shape n x 1
        Labels of the dataset
    k : int
        Number of folds or groups (approximately equal size)

    Returns
    -------
    trainAuc : float
        Average AUC of the model on the training dataset
    testAuc : float
        Average AUC of the model on the validation dataset
    timeElapsed : float
        Time it took to run this function
    """
    trainAuc = 0
    testAuc = 0
    timeElapsed = time.time()
    xFeat = np.asarray(xFeat)
    y = np.asarray(y)
    kf = KFold(n_splits=k)
    # Loop through all splits, fitting on the train fold and scoring both folds
    for train_index, test_index in kf.split(xFeat):
        xTrain, xTest = xFeat[train_index], xFeat[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        trainModel = model.fit(xTrain, yTrain)
        predictTrain = trainModel.predict_proba(xTrain)[:, 1]
        fpr1, tpr1, thresholds = metrics.roc_curve(yTrain, predictTrain)
        trainAuc += metrics.auc(fpr1, tpr1)
        predictTest = trainModel.predict_proba(xTest)[:, 1]
        fpr1, tpr1, thresholds = metrics.roc_curve(yTest, predictTest)
        testAuc += metrics.auc(fpr1, tpr1)
    trainAuc /= kf.get_n_splits(xFeat)
    testAuc /= kf.get_n_splits(xFeat)
    timeElapsed = time.time() - timeElapsed
    return trainAuc, testAuc, timeElapsed
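# --- Added sketch (not from the original): the average validation AUC computed
# --- by kfold_cv can be obtained more concisely with sklearn's cross_val_score.
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X_cv, y_cv = make_classification(n_samples=200, random_state=0)
scores = cross_val_score(DecisionTreeClassifier(random_state=0), X_cv, y_cv,
                         cv=5, scoring='roc_auc')
print(scores.mean())  # average validation AUC over the 5 folds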
def explain(self, param, label='', auc_plot=False):
    print('------------ Explanation -------------')
    self._file.write('------------ Explanation -------------\n')
    phi = param[0]
    theta = param[1]
    psi = param[2]
    k = param[3]

    start1 = time()
    ex = Extractor(self._clf, phi, theta, psi)
    ex.extract_forest_paths()
    ex.rule_filter()
    print('max_rule', ex.max_rule, 'max_node', ex.max_node)
    print('min_rule', ex.min_rule, 'min_node', ex.min_node)
    end1 = time()
    print("EX Running time: %s seconds" % (end1 - start1))
    print("original path number: ", ex.n_original_leaves_num)
    print("original scale: ", ex.scale)
    print("path number after rule filter: ", len(ex._forest_values))
    self._file.write('original path number: {}\n'.format(
        ex.n_original_leaves_num))
    self._file.write('original scale: {}\n'.format(ex.scale))
    self._file.write('path number after rule filter: {}\n'.format(
        len(ex._forest_values)))

    start2 = time()
    sat = Z3Process(ex, k)
    sat.leaves_partition()
    if self._maxsat_on is True:
        sat.maxsat()
        print("path number after maxsat: ", sat.n_rules_after_max,
              " after filter: ", sat.n_rules_after_filter, '\n')
        self._file.write(
            'path number after maxsat: {}\tafter filter: {}\n\nclasses:\t{}\n\n'
            .format(sat.n_rules_after_max, sat.n_rules_after_filter,
                    self._clf.classes_))
    else:
        print('no maxsat')
        self._file.write('/no MAX-SAT\n')
    sat.run_filter()
    end2 = time()
    print("SAT Running time: %s seconds" % (end2 - start2))
    print('classes:', self._clf.classes_)

    start3 = time()
    f = FormulaeEstimator(sat, conjunction=self._conjunction,
                          classes=self._clf.classes_)
    f.get_formulae_text(self._file)

    print('\n------------ Performance -------------')
    self._file.write('\n------------ Performance -------------\n')
    c_ans = self._clf.predict(self._X_test)
    ans = f.classify_samples(self._X_test)
    end3 = time()
    print("ET Running time: %s seconds" % (end3 - start3))

    RF_accuracy = accuracy_score(self._y_test, c_ans)
    EX_accuracy = accuracy_score(self._y_test, ans)
    performance = accuracy_score(c_ans, ans)

    no_ans = 0
    overlap = 0
    for each in f.sat_group:
        if len(each) > 1:
            overlap += 1
        elif len(each) == 0:
            no_ans += 1

    if label == '':  # compute AUC against the first class by default
        label = self._clf.classes_[0]
    fpr, tpr, thresholds = roc_curve(self._y_test,
                                     self._clf.predict_proba(
                                         self._X_test)[:, 1],
                                     pos_label=label)
    ori_auc = auc(fpr, tpr)
    ex_test = f.classify_samples_values(self._X_test)
    efpr, etpr, ethresholds = roc_curve(self._y_test, ex_test[:, 1],
                                        pos_label=label)
    ex_auc = auc(efpr, etpr)

    print('sample size:\t', len(self._y_test))
    self._file.write('sample size:\t{}\n'.format(len(self._y_test)))
    print('RF accuracy:\t', RF_accuracy)
    self._file.write('RF accuracy:\t{}\n'.format(RF_accuracy))
    print('RF AUC:\t\t\t', ori_auc)
    self._file.write('RF AUC:\t\t\t{:.2f}\n'.format(ori_auc))
    # print('coverage of wrong answers:', f_count)
    print('EX accuracy:\t', EX_accuracy)
    self._file.write('EX accuracy:\t{}\n'.format(EX_accuracy))
    print('EX AUC:\t\t\t', ex_auc)
    self._file.write('EX AUC:\t\t\t{:.2f}\n'.format(ex_auc))
    print('Coverage:\t\t', (len(self._y_test) - no_ans) / len(self._y_test))
    self._file.write('Coverage:\t\t{}\n'.format(
        (len(self._y_test) - no_ans) / len(self._y_test)))
    print('Overlap:\t\t', overlap / len(self._y_test))
    self._file.write('Overlap:\t\t{}\n'.format(overlap / len(self._y_test)))
    print('*Performance:\t', performance)
    self._file.write('*Performance:\t{}\n'.format(performance))

    if auc_plot is True:
        plt.plot(fpr, tpr, linewidth=2,
                 label="RF ROC curve (area = {:.2f})".format(ori_auc))
        plt.plot(efpr, etpr, linewidth=2,
                 label="Explain ROC curve (area = {:.2f})".format(ex_auc))
        plt.xlabel("false positive rate")
        plt.ylabel("true positive rate")
        plt.ylim(0, 1.05)
        plt.xlim(0, 1.05)
        plt.legend(loc=4)  # legend position: lower right
        plt.show()
def avaliacao_PerformanceC(df_train_class, predicted_train, predicted_prob_train,
                           df_test_class, predicted_test, predicted_prob_test,
                           roc_y_n):
    ### Confusion Matrix
    confusion_matrix_train = confusion_matrix(df_train_class, predicted_train)
    confusion_matrix_test = confusion_matrix(df_test_class, predicted_test)
    print("\nTraining Confusion Matrix:\n ", confusion_matrix_train)
    print("\nTesting Confusion Matrix:\n ", confusion_matrix_test)

    ### Accuracy score
    score_train = accuracy_score(df_train_class, predicted_train)
    score_test = accuracy_score(df_test_class, predicted_test)
    print("\nTraining Accuracy Score: ", score_train)
    print("\nTesting Accuracy Score: ", score_test)

    ### Precision, Recall
    precision_train = precision_score(df_train_class, predicted_train)
    precision_test = precision_score(df_test_class, predicted_test)
    print("\nTraining Precision: ", precision_train)
    print("\nTesting Precision: ", precision_test)
    recall_train = recall_score(df_train_class, predicted_train)
    recall_test = recall_score(df_test_class, predicted_test)
    print("\nTraining Recall: ", recall_train)
    print("\nTesting Recall: ", recall_test)

    ### Classification Report
    print("\nTrain Classification Report: \n",
          classification_report(df_train_class, predicted_train))
    print("\nTest Classification Report: \n",
          classification_report(df_test_class, predicted_test))

    ### F1 Score (binary, then weighted)
    f1score_train = f1_score(df_train_class, predicted_train)
    f1score_test = f1_score(df_test_class, predicted_test)
    print("\nTraining F1score: ", f1score_train)
    print("\nTesting F1score: ", f1score_test)
    f1score_train = f1_score(df_train_class, predicted_train, average='weighted')
    f1score_test = f1_score(df_test_class, predicted_test, average='weighted')
    print("\nTraining Weighted F1score: ", f1score_train)
    print("\nTesting Weighted F1score: ", f1score_test)

    ### ROC-AUC
    if roc_y_n == 'y':
        fpr, tpr, threshold = roc_curve(df_train_class, predicted_prob_train[:, 1])
        roc_auc_train = auc(fpr, tpr)
        print("\nTraining AUC for ROC: ", roc_auc_train)
        plt.figure()
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc_train)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc='lower right')
        plt.title('Training - Receiver Operating Characteristic')

        fpr, tpr, threshold = roc_curve(df_test_class, predicted_prob_test[:, 1])
        roc_auc_test = auc(fpr, tpr)
        print("\nTesting AUC for ROC: ", roc_auc_test)
        plt.figure()
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc_test)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend(loc='lower right')
        plt.title('Testing - Receiver Operating Characteristic')
              loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=40,
                    epochs=epochs,
                    validation_split=0.25,
                    verbose=1,
                    callbacks=[tensorboard])

# Prediction and ROC / AUC curve plotting
y_pred = model.predict(x_test)
fpr_keras, tpr_keras, thresholds_keras = roc_curve(np.ravel(y_test), np.ravel(y_pred))
auc_keras = auc(fpr_keras, tpr_keras)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

test_loss, test_acc = model.evaluate(x_test, y_test, batch_size=batch_size)
model.save("CNN.h5")
print('Test accuracy :', test_acc, 'Test Loss :', test_loss)
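# --- Added sketch (not from the original): when only the scalar AUC is needed,
# --- sklearn's roc_auc_score collapses the roc_curve + auc pair above into one call.
import numpy as np
from sklearn.metrics import roc_auc_score

y_test_demo = np.array([0, 1, 1, 0, 1])
y_pred_demo = np.array([0.2, 0.9, 0.6, 0.4, 0.7])  # predicted probabilities
print(roc_auc_score(np.ravel(y_test_demo), np.ravel(y_pred_demo)))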
print("测试集:", accuracy_score(test_label, tes_label)) matrix = confusion_matrix(train_label, tra_label, labels=[0, 1]) TP = matrix[1, 1] TN = matrix[0, 0] FP = matrix[0, 1] FN = matrix[1, 0] sn = TP / (TP + FN) sp = TN / (TN + FP) decision_score = classifier.predict_proba(test_data) fprs, tprs, thresholds = roc_curve(test_label, decision_score[:, 1]) # plt.plot(fprs, tprs) # plt.show() roc_auc = auc(fprs, tprs) plt.figure() lw = 2 plt.plot(fprs, tprs, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic example') plt.legend(loc="lower right") plt.show()
def mc_cv(model, xFeat, y, testSize, s):
    """
    Evaluate the model using s samples from the Monte Carlo cross validation
    approach, where each sample splits xFeat into random train and test sets
    based on testSize. Returns the model performance on the training and
    test datasets.

    Parameters
    ----------
    model : sktree.DecisionTreeClassifier
        Decision tree model
    xFeat : nd-array with shape n x d
        Features of the dataset
    y : 1-array with shape n x 1
        Labels of the dataset
    testSize : float
        Portion of the dataset to serve as a holdout.
    s : int
        Number of random train/test splits to draw.

    Returns
    -------
    trainAuc : float
        Average AUC of the model on the training dataset
    testAuc : float
        Average AUC of the model on the validation dataset
    timeElapsed : float
        Time it took to run this function
    """
    trainAuc = 0
    testAuc = 0
    timeElapsed = time.time()
    xFeat = np.asarray(xFeat)
    y = np.asarray(y)
    # Same process as k-fold CV, but with random shuffled splits
    ss = ShuffleSplit(n_splits=s, test_size=testSize, random_state=0)
    for train_index, test_index in ss.split(xFeat):
        xTrain, xTest = xFeat[train_index], xFeat[test_index]
        yTrain, yTest = y[train_index], y[test_index]
        trainModel = model.fit(xTrain, yTrain)
        predictTrain = trainModel.predict_proba(xTrain)[:, 1]
        fpr1, tpr1, thresholds = metrics.roc_curve(yTrain, predictTrain)
        trainAuc += metrics.auc(fpr1, tpr1)
        predictTest = trainModel.predict_proba(xTest)
        fpr1, tpr1, thresholds = metrics.roc_curve(yTest, predictTest[:, 1])
        testAuc += metrics.auc(fpr1, tpr1)
    trainAuc /= ss.get_n_splits(xFeat)
    testAuc /= ss.get_n_splits(xFeat)
    timeElapsed = time.time() - timeElapsed
    return trainAuc, testAuc, timeElapsed
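# --- Added usage sketch (not from the original): running mc_cv on a toy
# --- dataset; assumes the imports mc_cv relies on (time, np, metrics,
# --- ShuffleSplit) are already in scope.
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_mc, y_mc = make_classification(n_samples=200, random_state=0)
trainAuc, testAuc, elapsed = mc_cv(DecisionTreeClassifier(random_state=0),
                                   X_mc, y_mc, testSize=0.3, s=5)
print(trainAuc, testAuc, elapsed)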
def train_5_cross(df_pre, X, y, X_test_v1, y_test_v1, thresholds=0.45,
                  id_1='id', csv_name=0):
    """
    Purpose: 5-fold training, then output the customer list.
    Why: 5 folds usually give fairly stable results; intended for offline use.

    X: training features (no label / DataFrame)
    y: training labels (DataFrame)
    X_test_v1: prediction features (no label / DataFrame)
    y_test_v1: prediction labels (DataFrame)
    thresholds: decision threshold; the 0.45 default favours high precision
    csv_name: name for the saved csv; 0 (default) means do not save

    return: customer list and its details
    """
    vali_auc_num = 0        # validation AUC
    vali_recall_num = 0     # validation recall
    vali_precision_num = 0  # validation precision
    test_auc_num = 0        # test AUC
    test_recall_num = 0     # test recall
    test_precision_num = 0  # test precision
    y_pred_input = np.zeros(len(X_test_v1))  # zero array of matching size

    print("============= training start ================")
    # Stratified sampling; n_splits is the number of folds
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print("training round {} ...".format(fold_ + 1))
        train_x, trai_y = X.loc[trn_idx], y.loc[trn_idx]
        vali_x, vali_y = X.loc[val_idx], y.loc[val_idx]
        # LightGBM model with pre-tuned parameters; see LightGBM's
        # documentation on GitHub for the parameter meanings
        clf = lgb.LGBMClassifier(max_depth=20, min_data_in_bin=5, max_bin=200,
                                 min_child_samples=90, num_leaves=20,
                                 n_estimators=20000, objective='binary',
                                 boosting_type='gbdt', learning_rate=0.02,
                                 lambda_l2=5)
        clf.fit(train_x, trai_y,
                eval_set=[(train_x, trai_y), (vali_x, vali_y)],
                verbose=0, early_stopping_rounds=100, eval_metric='f1')

        # =============== validation AUC ===================
        y_prb = clf.predict_proba(vali_x)[:, 1]  # predicted probabilities
        # fpr: fraction of actual negatives wrongly flagged positive;
        # tpr: fraction of actual positives correctly flagged;
        # thres: the corresponding thresholds
        fpr, tpr, thres = roc_curve(vali_y, y_prb)
        vali_roc_auc = auc(fpr, tpr)  # validation AUC for this fold
        vali_auc_num += vali_roc_auc  # add this fold's AUC to the total
        print("vali auc = {0:.4}".format(vali_roc_auc))

        # =============== test AUC ===================
        y_prb_test = clf.predict_proba(X_test_v1)[:, 1]
        fpr, tpr, thres = roc_curve(y_test_v1, y_prb_test)
        test_roc_auc = auc(fpr, tpr)
        test_auc_num += test_roc_auc
        print("test auc = {0:.4}".format(test_roc_auc))

        # =============== validation metrics ===================
        y_pre_proba = clf.predict_proba(vali_x.values)
        y_predictions = y_pre_proba[:, 1] > thresholds  # True above the threshold
        cnf_matrix = confusion_matrix(vali_y, y_predictions)
        np.set_printoptions(precision=2)
        vali_recall = '{0:.3f}'.format(
            cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))     # recall
        vali_precision = '{0:.3f}'.format(
            cnf_matrix[1, 1] / (cnf_matrix[0, 1] + cnf_matrix[1, 1]))     # precision
        print("vali_metric: ", vali_recall, vali_precision)
        vali_recall_num += float(vali_recall)
        vali_precision_num += float(vali_precision)

        # =============== test metrics ===================
        y_pre_proba_test = clf.predict_proba(X_test_v1.values)
        y_predictions_test = y_pre_proba_test[:, 1] > thresholds
        cnf_matrix_test = confusion_matrix(y_test_v1, y_predictions_test)
        np.set_printoptions(precision=2)
        test_recall = '{0:.3f}'.format(
            cnf_matrix_test[1, 1] / (cnf_matrix_test[1, 0] + cnf_matrix_test[1, 1]))
        test_precision = '{0:.3f}'.format(
            cnf_matrix_test[1, 1] / (cnf_matrix_test[0, 1] + cnf_matrix_test[1, 1]))
        print("test_metric: ", test_recall, test_precision)
        test_recall_num += float(test_recall)
        test_precision_num += float(test_precision)
        y_pred_input += y_pre_proba_test[:, 1]  # accumulate this fold's test predictions

    # Each total is a sum over 5 folds, so divide by 5
    print("5-fold validation AUC: {0:.3f}".format(vali_auc_num / 5))
    print("5-fold test AUC: {0:.3f}".format(test_auc_num / 5))
    print("5-fold validation recall: {0:.3f}".format(vali_recall_num / 5))
    print("5-fold validation precision: {0:.3f}".format(vali_precision_num / 5))
    print("5-fold test recall: {0:.3f}".format(test_recall_num / 5))
    print("5-fold test precision: {0:.3f}".format(test_precision_num / 5))

    print("================ outputting the list ==================")
    y_pred_input_end = y_pred_input / 5  # average of the 5 folds' predictions
    y_pred_input_precision = y_pred_input_end > thresholds  # high-precision flag
    submission = pd.DataFrame({"id": df_pre[id_1],
                               "probability": y_pred_input_end,
                               "high_precision": y_pred_input_precision})
    if csv_name != 0:
        submission.to_csv("%s_prediction_list.csv" % csv_name, index=False)
    print("================ list written ==================")
    print(submission.head(5))
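# --- Added sketch (not from the original) of the pattern train_5_cross uses:
# --- one model per StratifiedKFold fold, test predictions averaged, then
# --- thresholded. LogisticRegression stands in for LightGBM to keep the
# --- sketch dependency-free.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X_all, y_all = make_classification(n_samples=300, random_state=0)
X_tr, y_tr, X_te = X_all[:200], y_all[:200], X_all[200:]

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)
pred = np.zeros(len(X_te))
for trn_idx, val_idx in folds.split(X_tr, y_tr):
    clf = LogisticRegression(max_iter=1000).fit(X_tr[trn_idx], y_tr[trn_idx])
    pred += clf.predict_proba(X_te)[:, 1]
pred /= folds.get_n_splits()  # average the 5 fold models' predictions
flag = pred > 0.45            # high-precision flag at the 0.45 threshold
print(pred[:5], flag[:5])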
dt_t = dt.iloc[:, [0]]
dt_p = dt.iloc[:, [1]]
knn_t = knn.iloc[:, [0]]
knn_p = knn.iloc[:, [1]]
lr_t = lr.iloc[:, [0]]
lr_p = lr.iloc[:, [1]]
rf_t = rf.iloc[:, [0]]
rf_p = rf.iloc[:, [1]]

import sklearn.metrics as metrics

# Calculate the fpr and tpr for all thresholds of each classifier
fpr1, tpr1, threshold1 = metrics.roc_curve(ann_t, ann_p)
roc_auc1 = metrics.auc(fpr1, tpr1)
fpr2, tpr2, threshold2 = metrics.roc_curve(dt_t, dt_p)
roc_auc2 = metrics.auc(fpr2, tpr2)
fpr3, tpr3, threshold3 = metrics.roc_curve(knn_t, knn_p)
roc_auc3 = metrics.auc(fpr3, tpr3)
fpr4, tpr4, threshold4 = metrics.roc_curve(lr_t, lr_p)
roc_auc4 = metrics.auc(fpr4, tpr4)
fpr5, tpr5, threshold5 = metrics.roc_curve(rf_t, rf_p)
roc_auc5 = metrics.auc(fpr5, tpr5)

# Method I: plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, 'r', label='ANN (AUC = %0.2f)' % roc_auc1)
plt.plot(fpr2, tpr2, 'g', label='DT (AUC = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, 'y', label='KNN (AUC = %0.2f)' % roc_auc3)
plt.plot(fpr4, tpr4, 'b', label='LR (AUC = %0.2f)' % roc_auc4)