def test_fine_tune_all_nodes(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    # Chain composition
    chain = get_class_chain()

    # Prediction before tuning
    chain.fit(train_data, use_cache=False)
    before_tuning_predicted = chain.predict(test_data)

    # Tune all nodes
    chain.fine_tune_all_nodes(train_data, max_lead_time=timedelta(minutes=1), iterations=30)
    chain.fit_from_scratch(train_data)
    after_tun_root_node_predicted = chain.predict(test_data)

    bfr_tun_roc_auc = round(roc(y_true=test_data.target,
                                y_score=before_tuning_predicted.predict), 2)
    aft_tun_roc_auc = round(roc(y_true=test_data.target,
                                y_score=after_tun_root_node_predicted.predict), 2)

    print(f'Before tune test {bfr_tun_roc_auc}')
    print(f'After tune test {aft_tun_roc_auc}', '\n')

    assert aft_tun_roc_auc >= bfr_tun_roc_auc
def test_tune_certain_node_with_tune_class_correctly(data_fixture, request):
    data = request.getfixturevalue(data_fixture)
    train_data, test_data = train_test_data_setup(data=data)

    chain = create_four_depth_chain()
    chain.fit(train_data, use_cache=False)
    before_tuning_predicted = chain.predict(test_data)

    model_id_to_tune = 4
    tuned_chain = Tune(chain).fine_tune_certain_node(model_id=model_id_to_tune,
                                                     input_data=train_data,
                                                     max_lead_time=timedelta(minutes=1),
                                                     iterations=30)
    tuned_chain.fit_from_scratch(train_data)
    after_tun_root_node_predicted = tuned_chain.predict(test_data)

    bfr_tun_roc_auc = round(roc(y_true=test_data.target,
                                y_score=before_tuning_predicted.predict), 1)
    aft_tun_roc_auc = round(roc(y_true=test_data.target,
                                y_score=after_tun_root_node_predicted.predict), 1)

    print(f'Before tune test {bfr_tun_roc_auc}')
    print(f'After tune test {aft_tun_roc_auc}', '\n')

    assert aft_tun_roc_auc >= bfr_tun_roc_auc
def roc_auc(y_true, y_score, pos_label=None, ascending_score=True):
    """Computes the ROC AUC score.

    Parameters
    ----------
    y_true : array, shape=[n_samples]
        True binary labels, in range {0,1} or {-1,1}. If the positive label
        is different than 1, it must be explicitly defined.

    y_score : array, shape=[n_samples]
        Scores for the tested series of samples.

    pos_label : int
        Positive label of samples (if other than 1).

    ascending_score : bool (default=True)
        Indicates whether your score is ascending. An ascending score
        increases with decreasing activity, i.e. it ascends on the ranking
        list (where actives are on top).

    Returns
    -------
    roc_auc : float
        ROC AUC in range 0:1.
    """
    if ascending_score:
        y_score = -y_score
    fpr, tpr, thresholds = roc(y_true, y_score, pos_label=pos_label)
    return auc(fpr, tpr)
def roc_log_auc(y_true, y_score, pos_label=None, log_min=0.001, log_max=1.):
    """Computes the area under a semi-log ROC curve.

    Parameters
    ----------
    y_true : array, shape=[n_samples]
        True binary labels, in range {0,1} or {-1,1}. If the positive label
        is different than 1, it must be explicitly defined.

    y_score : array, shape=[n_samples]
        Scores for the tested series of samples.

    pos_label : int
        Positive label of samples (if other than 1).

    log_min : float (default=0.001)
        Minimum FPR value (log scale) used for estimating the AUC.

    log_max : float (default=1.)
        Maximum FPR value (log scale) used for estimating the AUC.

    Returns
    -------
    auc : float
        semi-log ROC AUC
    """
    fpr, tpr, t = roc(y_true, y_score, pos_label=pos_label)
    idx = (fpr >= log_min) & (fpr <= log_max)
    log_fpr = 1 - np.log10(fpr[idx]) / np.log10(log_min)
    return auc(log_fpr, tpr[idx])
def roc_log_auc(y_true, y_score, pos_label=None, log_min=0.001, log_max=1.0):
    """Computes the area under a semi-log ROC curve.

    Parameters
    ----------
    y_true : array, shape=[n_samples]
        True binary labels, in range {0,1} or {-1,1}. If the positive label
        is different than 1, it must be explicitly defined.

    y_score : array, shape=[n_samples]
        Scores for the tested series of samples.

    pos_label : int
        Positive label of samples (if other than 1).

    log_min : float (default=0.001)
        Minimum FPR value (log scale) used for estimating the AUC.

    log_max : float (default=1.)
        Maximum FPR value (log scale) used for estimating the AUC.

    Returns
    -------
    auc : float
        semi-log ROC AUC
    """
    fpr, tpr, t = roc(y_true, y_score, pos_label=pos_label)
    idx = (fpr >= log_min) & (fpr <= log_max)
    log_fpr = 1 - np.log10(fpr[idx]) / np.log10(log_min)
    return auc(log_fpr, tpr[idx])
def roc_log_auc(y_true, y_score, pos_label=None, ascending_score=True,
                log_min=0.001, log_max=1.):
    """Computes the area under a semi-log ROC curve.

    Parameters
    ----------
    y_true : array, shape=[n_samples]
        True binary labels, in range {0,1} or {-1,1}. If the positive label
        is different than 1, it must be explicitly defined.

    y_score : array, shape=[n_samples]
        Scores for the tested series of samples.

    pos_label : int
        Positive label of samples (if other than 1).

    ascending_score : bool (default=True)
        Indicates whether your score is ascending. An ascending score
        increases with decreasing activity, i.e. it ascends on the ranking
        list (where actives are on top).

    log_min : float (default=0.001)
        Minimum FPR value (log scale) used for estimating the AUC.

    log_max : float (default=1.)
        Maximum FPR value (log scale) used for estimating the AUC.

    Returns
    -------
    auc : float
        semi-log ROC AUC
    """
    if ascending_score:
        y_score = -y_score
    fpr, tpr, t = roc(y_true, y_score, pos_label=pos_label)
    idx = (fpr >= log_min) & (fpr <= log_max)
    log_fpr = 1 - np.log10(fpr[idx]) / np.log10(log_min)
    return auc(log_fpr, tpr[idx])
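A minimal usage sketch for the two metric helpers above. It assumes the aliases these snippets rely on (roc = sklearn.metrics.roc_curve, auc = sklearn.metrics.auc, np = numpy) and that the roc_auc and roc_log_auc definitions above are in scope; the toy labels and scores are purely illustrative.

# Usage sketch (assumptions: roc = roc_curve, auc = auc from sklearn.metrics,
# np = numpy; roc_auc and roc_log_auc defined as above; data is made up).
import numpy as np
from sklearn.metrics import roc_curve as roc, auc

y_true = np.array([1, 1, 0, 1, 0, 0, 1, 0])
# Ascending score: lower values mean more active, so actives rank on top.
y_score = np.array([0.1, 0.3, 0.6, 0.2, 0.8, 0.7, 0.4, 0.9])

print(roc_auc(y_true, y_score, ascending_score=True))
print(roc_log_auc(y_true, y_score, ascending_score=True))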
def auc_plot_multi(XY, flag="", file="Noname", cutoff=None, show=None,
                   linestyle='rx-.', include_baseline=True, equal_aspect=True):
    """Generates a plot of the ROC curve.

    Parameters:
        flag: Title prefix for the chart
        include_baseline: Add the baseline plot line if True
        equal_aspect: Use an equal aspect ratio for the plot
    """
    import pylab
    from matplotlib import pyplot

    pyplot.figure()
    pylab.clf()
    color_list = ["b", "g", "r", "c", "m", "y", "k", "w"]
    colorid = 0
    for xy in XY:
        if colorid >= len(color_list):
            colorid = 0
        x, y = xy[:, 0], xy[:, 1]
        fpr, tpr, thresholds_roc = roc(x, y)
        aucvalue = round(auc(fpr, tpr), 3)
        pylab.plot([x1 for x1 in np.hstack((0, fpr))],
                   [y1 for y1 in np.hstack((0, tpr))],
                   color=color_list[colorid], linewidth=1.0)
        # pylab.plot([x1 for x1 in precision], [y1 for y1 in recall], color='blue', linewidth=2.0)
        if include_baseline:
            pylab.plot([0.0, 1.0], [0.0, 1.0], 'k--')
        pylab.ylim((0, 1))
        pylab.xlim((0, 1))
        pylab.xticks(pylab.arange(0, 1.1, .1), fontsize=10)
        pylab.yticks(pylab.arange(0, 1.1, .1), fontsize=10)
        # pylab.grid(False, alpha=0.5)
        if equal_aspect:
            cax = pylab.gca()
            cax.set_aspect('equal')
        # pylab.xlabel('1 - Specificity(red)/Precision(blue)')
        pylab.xlabel('1 - Specificity', fontsize=10)
        pylab.ylabel('Sensitivity', fontsize=10)
        pylab.title(flag + ' AUC=' + "%4.3f" % aucvalue + ' N=' + '%1.0f' % len(x))
        colorid += 1
    ax = pylab.gca()
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.get_yaxis().set_tick_params(direction='out')
    ax.get_xaxis().set_tick_params(direction='out')
    pyplot.savefig(file + '_' + flag + '_aucplot.pdf', format='pdf')
    pylab.show()
def all_stats(labels, scores, cutoff=None):
    if np.unique(labels).shape[0] > 1:
        # print(np.unique(labels))
        if np.unique(labels).shape[0] == 2:
            # print(len(np.unique(labels)))
            fpr, tpr, thresholds_roc = roc(labels, scores)
            precision, recall, thresholds = precision_recall_curve(labels, scores)
            precision[np.where(precision == 0)] = 0.000000001
            recall[np.where(recall == 0)] = 0.000000001
            if len(thresholds) > 1:
                F_score = 2 * (precision * recall) / (precision + recall)
                try:
                    if cutoff is None:
                        # cutoff = round(thresholds_roc[np.where(abs(tpr - 0.95) == min(abs(tpr - 0.95)))][0], 5)
                        # Calculate the cutoff at the maximum F-score
                        cutoff = round(thresholds[np.where(F_score == max(F_score))][0], 5)
                    else:
                        print("Using cutoff from previous calculations", cutoff)
                    aucvalue = round(auc(fpr, tpr), 3)
                    cutoff_id = np.where(abs(thresholds_roc - cutoff) == min(abs(thresholds_roc - cutoff)))
                    cutoff_pre_id = np.where(abs(thresholds - cutoff) == min(abs(thresholds - cutoff)))
                    TPR = round(tpr[cutoff_id][0], 5)
                    FPR = round(fpr[cutoff_id][0], 5)
                    PRE = round(precision[cutoff_pre_id][0], 5)
                    stats = aucvalue, TPR, 1 - FPR, len(labels), PRE, cutoff, max(F_score)
                except Exception:
                    stats = float('NaN'), float('NaN'), float('NaN'), len(labels), float('NaN'), float('NaN')
            else:
                stats = float('NaN'), float('NaN'), float('NaN'), len(labels), float('NaN'), float('NaN')
        else:
            gradient, intercept, r_value, p_value, std_err = linregress(labels, scores)
            std_err = np.std((labels - scores))
            stats = r_value ** 2, std_err, gradient, len(labels), p_value, float('NaN')
    else:
        stats = [float('NaN'), float('NaN'), float('NaN'), len(labels), float('NaN'), float('NaN')]
    return np.array(stats)
def self_roc(X, y):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    import matplotlib.pyplot as plt
    from sklearn.metrics import roc_curve as roc

    # gs3 is expected to be a fitted estimator defined in the surrounding scope
    fpr, tpr, thresholds = roc(y_test, gs3.predict_proba(X_test)[:, 1])
    plt.plot(fpr, tpr)
    plt.plot([0, 1])
def calc_RH04_auc_train_test(rh03_only=False):
    current_dir = os.getcwd()
    os.chdir(os.path.join('..', 'init_models'))
    models = dict()
    for mod_type in ("ord", "enrich", "bin", "rf"):
        with open("%s_model" % mod_type, 'rb') as model_file:
            models[mod_type] = pickle.load(model_file)
    os.chdir(current_dir)

    model_names = {"ord": "ordinal regression",
                   "bin": "classification neural network",
                   "enrich": "enrichment neural network",
                   "rf": "random forest classifier"}
    colors = ("red", "blue", "green", "orange")
    for i, mod_type in enumerate(("ord", "enrich", "bin", "rf")):
        current_data = load_data(model_type=mod_type, return_validation_set=False,
                                 return_rh04=True, rh03_only=rh03_only)
        xtrain, rh04train, xtest, rh04test = current_data[0], current_data[1], \
            current_data[2], current_data[3]
        trainpreds, testpreds = gen_model_spec_preds(models[mod_type], mod_type, xtrain, xtest)
        print('Training set %s AUC: %s' % (mod_type, auc(rh04train, trainpreds)))
        print('Test set %s AUC: %s' % (mod_type, auc(rh04test, testpreds)))
        fpr_ord, tpr_ord, _ = roc(rh04test, testpreds)
        equal_spec = np.arange(1, -0.1, -0.1)
        plt.plot(fpr_ord, tpr_ord, color=colors[i], label=model_names[mod_type])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    if rh03_only:
        plt.title('AUC-ROC for RH04 prediction on the held-out test set'
                  '\nusing models trained on RH01 - RH03 data only;'
                  '\nAUC-ROC of ranking for RH03 sequences')
    else:
        plt.title('AUC-ROC for RH04 prediction on the held-out test set'
                  '\nusing models trained on RH01 - RH03 data only;'
                  '\nAUC-ROC of ranking for RH01 - RH03 sequences')
    plt.legend()
    plt.plot(equal_spec, equal_spec, color='black', linestyle='--')
    plt.show()
def get_results(ins, oos):  # in/out of sample
    """Returns AUROC, AUPR (success), AUPR (failure)."""
    rval = []
    y_true = np.hstack((np.ones(len(ins)), np.zeros(len(oos))))
    y_score = np.hstack((ins, oos))
    rval += [round(roc(y_true, y_score) * 100, 2),
             round(pr(y_true, y_score) * 100, 2)]
    y_true = np.hstack((np.zeros(len(ins)), np.ones(len(oos))))
    y_score = -y_score
    rval += [  # round(roc(y_true, y_score) * 100, 2),
             round(pr(y_true, y_score) * 100, 2)]
    return rval
def evaluate(y_test, y_pred):
    # Confusion matrix
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # F1 score
    from sklearn.metrics import f1_score
    fscore = f1_score(y_test, y_pred)
    print("F1-Score : ", fscore)

    # Area under the ROC curve
    from sklearn.metrics import roc_auc_score as roc
    a = roc(y_test, y_pred, average='micro')
    print("ROC-AUC Score : ", a)

    return (cm, fscore, a)
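A hypothetical usage sketch for evaluate(), not from the original source: the classifier and data below are illustrative, and the last line shows the score-based alternative, since roc_auc_score is usually fed probability scores rather than hard 0/1 predictions.

# Hypothetical usage sketch (illustrative data and model).
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

cm, fscore, auc_score = evaluate(y_test, clf.predict(X_test))  # label-based AUC, as above
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))  # score-based AUC, usually preferred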
def analyze(classifier, X_val, y_val, prc_ax, roc_ax, **params):
    # y_predict = classifier.predict(X_val)
    if params['model'] == 'svm' or params['model'] == 'logistic':
        y_predict = classifier.decision_function(X_val)
    else:
        y_predict = classifier.predict_proba(X_val)[:, 1]

    # Accuracy
    accuracy = classifier.score(X_val, y_val)

    # Precision-Recall
    auprc = prc_score(y_val, y_predict)
    precision, recall, thresholds = prc(y_val, y_predict)
    prc_ax.plot(recall, precision, label='AUC={}'.format(auprc))

    # Receiver Operating Characteristic
    fpr, tpr, thr = roc(y_val, y_predict, pos_label=1)
    auroc = roc_score(fpr, tpr)
    roc_ax.plot(fpr, tpr, label='AUC={}'.format(auroc))

    return accuracy, auprc, auroc
def get_AOC(ins, oos):  # in/out of sample
    # TODO: order of the last 2???
    """Returns AUROC, AUPR (success), AUPR (failure)."""
    rval = []
    y_true = np.hstack((np.ones(len(ins)), np.zeros(len(oos))))
    y_score = np.vstack((ins, oos))
    # TODO: use different scores (e.g. entropy, acq fns)
    y_score = y_score.max(axis=1)
    # print(y_score)
    # import ipdb; ipdb.set_trace()
    rval += [round(roc(y_true, y_score) * 100, 2),
             round(pr(y_true, y_score) * 100, 2)]
    y_true = np.hstack((np.zeros(len(ins)), np.ones(len(oos))))
    y_score = -y_score
    rval += [  # round(roc(y_true, y_score) * 100, 2),
             round(pr(y_true, y_score) * 100, 2)]
    return rval
def compute_eval_stats(classifier, y_data, rankings, threshold):
    '''
    Takes: classifier object, true target data, predicted score rankings,
           ranking threshold cutoff
    Returns: accuracy, precision, recall of predictions of classifier on x for y
    '''
    predicted_test = np.where(rankings < threshold, 1, 0)

    # print(threshold)
    # print(predicted_test.sum())
    # print(predicted_test[0:10])
    # print("num unique ranks: ", pd.DataFrame(pred_scores)[0].unique().shape)
    # print("eval stats rankings are: ", rankings[0:10])

    stats = [accuracy(y_data, predicted_test),
             precision(y_data, predicted_test),
             recall(y_data, predicted_test),
             f1(y_data, predicted_test),
             roc(y_data, predicted_test)]

    return stats
def train():
    data_x, data_y = load_tensors()
    sizedata = len(data_x)
    print("Data of size:", sizedata)
    # Split dataset into 5 sub-datasets
    splitted_x = list(split(data_x, 5))
    splitted_y = list(split(data_y, 5))
    print("Available GPU :", torch.cuda.is_available())
    torch.cuda.set_device(0)
    k = ARGS.kFold
    # Prepare arrays of scores
    precision_list = []
    recall_list = []
    # valloss_list = []
    AUC_list = []
    for ind_i in range(0, k):
        # Prepare X_train, Y_train, X_test, Y_test
        X_test = splitted_x[ind_i]
        Y_test = splitted_y[ind_i]
        # Deep copy, otherwise iteration problem
        copysplitX = list(splitted_x)
        copysplitY = list(splitted_y)
        del copysplitX[ind_i]
        del copysplitY[ind_i]
        X_train = copysplitX
        Y_train = copysplitY
        model = Network().cuda()
        # Xavier init
        model.apply(init_weights)
        with torch.cuda.device(0):
            # Hyperparameters
            epochs = ARGS.nEpochs
            batchsize = ARGS.batchSize
            learning_rate = ARGS.lr
            log_interval = 2
            criterion = nn.BCEWithLogitsLoss()
            # criterion = nn.BCELoss()
            # criterion = nn.CrossEntropyLoss()
            optimizer = optim.SGD(model.parameters(), lr=learning_rate)
            # optimizer = optim.Adam(model.parameters(), lr=learning_rate)

            # Train loader
            numplist = np.array(X_train)
            arrX = np.concatenate(numplist).tolist()
            tensor_x = torch.Tensor(arrX).cuda()
            numplist = np.array(Y_train)
            arrY = np.concatenate(numplist).tolist()
            tensor_y = torch.Tensor(arrY).cuda()
            print("Shape X:", np.shape(arrX), "Shape Y:", np.shape(arrY))
            # tensor_x = torch.Tensor(np.array(X_train).tolist()).cuda()  # transform to torch tensor
            # tensor_y = torch.Tensor(np.array(Y_train).tolist()).cuda()
            dataset = dt.TensorDataset(tensor_x, tensor_y)  # create your dataset
            # train_size = int(len(dataset))
            # print("train_size =", train_size)
            train_loader = dt.DataLoader(dataset, batch_size=batchsize, shuffle=True)

            # Test loader
            tensor_x = torch.Tensor(np.array(X_test).tolist()).cuda()  # transform to torch tensor
            tensor_y = torch.Tensor(np.array(Y_test).tolist()).cuda()
            dataset = dt.TensorDataset(tensor_x, tensor_y)  # create your dataset
            test_loader = dt.DataLoader(dataset, batch_size=batchsize, shuffle=True)

            # Training
            for epoch in range(epochs):
                for batch_idx, (data, target) in enumerate(train_loader):
                    data, target = Variable(data), Variable(target)
                    optimizer.zero_grad()
                    net_out = model(data)
                    loss = criterion(net_out, target)
                    loss.backward()
                    optimizer.step()
                    if batch_idx % log_interval == 0:
                        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: '.format(
                            epoch, batch_idx * len(data), len(train_loader.dataset),
                            100. * batch_idx / len(train_loader)))
                        print(loss.data)

            # Saving model
            # torch.save(model.state_dict(), ARGS.outFile + str(ind_i))

            # Testing and save score
            total = 0
            correct = 0
            model.eval()

            # Validation loss
            # loss_values = []
            itr_ctr = 0
            for batch_idx, (data, target) in enumerate(test_loader):
                # with torch.no_grad():
                itr_ctr += 1
                data, target = Variable(data, volatile=True), Variable(target, volatile=True)
                net_out = model(data)
                loss = criterion(net_out, target)
                # loss_values.append(loss)

            # Validation loss in the list
            # valloss_list.append(np.mean(loss_values))

            P = list()
            R = list()

            # Precisions
            for i in range(1, 4):
                for data in test_loader:
                    x, labels = data
                    # output is a tensor of size [BATCHSIZE][ARGS.numberOfOutputCodes]
                    outputs = model(Variable(x)).detach()
                    _, predicted = torch.topk(outputs.data, i)
                    for y_predlist, y in zip(predicted, labels):
                        for y_pred in y_predlist:
                            total += 1
                            if y[y_pred] == 1:
                                correct += 1
                precision = correct / total
                P.append(precision)
                correct = 0
                total = 0

            # Number of diagnostics for each sample (mean of 12 codes, max of 30 codes,
            # R@10 - R@20 - R@30 seems appropriate)
            total_true_list = list()
            for data in test_loader:
                x, labels = data
                for y in labels:
                    total_true = 0
                    for val in y:
                        if val == 1:
                            total_true += 1
                    total_true_list.append(total_true)

            # Recalls
            for i in range(10, 40, 10):
                total_true_list_cpy = list(total_true_list)
                for data in test_loader:
                    x, labels = data
                    outputs = model(Variable(x)).detach()
                    _, predicted = torch.topk(outputs.data, i)
                    for y_predlist, y in zip(predicted, labels):
                        total += total_true_list_cpy.pop(0)
                        for y_pred in y_predlist:
                            if y[y_pred] == 1:
                                correct += 1
                recall = correct / total
                R.append(recall)
                correct = 0
                total = 0

            precision_list.append(P)
            recall_list.append(R)

            # AUROC
            YTRUE = None
            YPROBA = None
            for data in test_loader:
                x, labels = data
                x, labels = Variable(x), Variable(labels)
                outputs = model(x).detach().cpu().numpy()
                labels = labels.detach().cpu().numpy()
                for batch_true, batch_prob in zip(labels, outputs):
                    YTRUE = np.concatenate((YTRUE, [batch_true]), axis=0) if YTRUE is not None else [batch_true]
                    YPROBA = np.concatenate((YPROBA, [batch_prob]), axis=0) if YPROBA is not None else [batch_prob]
            ROC_avg_score = roc(YTRUE, YPROBA, average='micro', multi_class='ovr')
            AUC_list.append(ROC_avg_score)

    # Output score of each fold + average
    print("Scores for each fold:")
    print("Precision:", precision_list)
    print("Recall:", recall_list)
    print("AUROC:", AUC_list)
    # print("Loss:", valloss_list)
    print("Average scores:")
    P1 = sum([precision_list[k][0] for k in range(0, k)]) / k
    P2 = sum([precision_list[k][1] for k in range(0, k)]) / k
    P3 = sum([precision_list[k][2] for k in range(0, k)]) / k
    R10 = sum([recall_list[k][0] for k in range(0, k)]) / k
    R20 = sum([recall_list[k][1] for k in range(0, k)]) / k
    R30 = sum([recall_list[k][2] for k in range(0, k)]) / k
    AUROC = sum([AUC_list[k] for k in range(0, k)]) / k
    # loss_avg = sum(valloss_list) / len(valloss_list)
    print("Precision@1:", P1)
    print("Precision@2:", P2)
    print("Precision@3:", P3)
    print("Recall@10:", R10)
    print("Recall@20:", R20)
    print("Recall@30:", R30)
    print("AUROC:", AUROC)
cm = confusion_matrix(y_test, y_pred)

# Finding area under the ROC curve
# from sklearn.metrics import roc_auc_score as roc
# a = roc(y_test, y_pred, average='micro')
from sklearn.metrics import roc_auc_score as roc
from sklearn.preprocessing import label_binarize

print(y_test)
y_test = label_binarize(y_test, classes=[0, 1])
y_pred = label_binarize(y_pred, classes=[0, 1])
print(y_test)
# y_test = y_test.reshape(-1, 1)
# y_pred = y_test.reshape(1, -1)
# print(y_test)
a = roc(y_test, y_pred, average='micro')

####################################################################
# Processing data for plotting the graph
from sklearn.preprocessing import label_binarize

y = label_binarize(y, classes=[0, 1])
n_classes = y.shape[1]

# NOTE: the categorical_features argument exists only in scikit-learn < 0.22;
# this snippet assumes an older release.
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])
y = onehotencoder.fit_transform(y).toarray()

y_test = label_binarize(y_test, classes=[0, 1])
onehotencoder = OneHotEncoder(categorical_features=[0])
def main():
    X, y = rr('kddcup.data_10_percent')
    X, X_test, y, y_test = train_test_split(X, y, random_state=75)
    y = np.array([0 if i != 11 else 1 for i in y])
    yy = np.array([0 if i != 11 else 1 for i in y_test])
    random_state = np.random.RandomState(0)

    kdd3 = SVC()
    kdd2 = C4_5(random_state=0)
    kdd1 = NB()
    kdd = RF(n_estimators=100, random_state=0, max_features=2)

    p3 = kdd3.fit(X, y)
    p2 = kdd2.fit(X, y)
    p1 = kdd1.fit(X, y)
    p = kdd.fit(X, y)

    t = p.predict(X_test)
    t1 = p1.predict(X_test)
    t2 = p2.predict(X_test)
    t3 = p3.predict(X_test)

    print("Random Forest")
    rn(t, yy)
    print("Naive Bayes")
    rn(t1, yy)
    print("C4.5")
    rn(t2, yy)
    print("SVC")
    rn(t3, yy)

    # Plot the confusion matrices
    cm3 = confusion_matrix(yy, t3)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm3)
    disp.plot()
    plt.show()
    cm2 = confusion_matrix(yy, t2)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm2)
    disp.plot()
    plt.show()
    cm1 = confusion_matrix(yy, t1)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm1)
    disp.plot()
    plt.show()
    cm = confusion_matrix(yy, t)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.show()

    # Plot the AUC-ROC curves
    pr3, rc3, _ = precision_recall_curve(yy, t3)
    fpr, tpr, _ = roc(yy, p3.decision_function(X_test))
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('SVM', roc_auc))

    pr2, rc2, _ = precision_recall_curve(yy, t2)
    fpr, tpr, _ = roc(yy, p2.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('C4.5', roc_auc))

    pr1, rc1, _ = precision_recall_curve(yy, t1)
    fpr, tpr, _ = roc(yy, p1.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % ('NB', roc_auc))

    pr, rc, _ = precision_recall_curve(yy, t)
    fpr, tpr, _ = roc(yy, p.predict_proba(X_test)[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, linestyle='--', label='%s ROC (area = %0.2f)' % ('RF', roc_auc))

    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.legend(loc=0, fontsize='small')
    plt.show()

    plt.plot(rc3, pr3, label='SVM')
    plt.plot(rc2, pr2, label='C4.5')
    plt.plot(rc1, pr1, label='NB')
    plt.plot(rc, pr, label='RF')
    plt.legend(loc=0, fontsize='small')
    plt.show()
def main():
    # Prepare data
    df_even = pd.read_csv('DNN/adversary_even_log.csv', index_col=0)
    df_even = df_even.loc[df_even['Class'] == 0]
    df_even = df_even.loc[df_even['generator'] != 2]
    df_even = df_even.loc[df_even['nTags'] == 2]

    df_odd = pd.read_csv('DNN/adversary_odd_log.csv', index_col=0)
    df_odd = df_odd.loc[df_odd['Class'] == 0]
    df_odd = df_odd.loc[df_odd['generator'] != 2]
    df_odd = df_odd.loc[df_odd['nTags'] == 2]

    df_odd[variables] = scale(df_odd[variables])
    df_even[variables] = scale(df_even[variables])

    # Construction of the BDTs
    bdt_even = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=0.01),
                                  learning_rate=learning_rate,
                                  algorithm="SAMME",
                                  n_estimators=n_estimators)
    bdt_even.n_classes_ = 2
    bdt_odd = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=0.01),
                                 learning_rate=0.15,
                                 algorithm="SAMME",
                                 n_estimators=n_estimators)
    bdt_odd.n_classes_ = 2

    # Convert generator class to categorical
    z_even = to_categorical(df_even['generator'], num_classes=2)
    z_odd = to_categorical(df_odd['generator'], num_classes=2)

    # Fitting to generators
    bdt_even.fit(df_even[variables], df_even['generator'], sample_weight=df_even['EventWeight'])
    bdt_odd.fit(df_odd[variables], df_odd['generator'], sample_weight=df_odd['EventWeight'])

    # Scoring
    df_odd['bdt_outcome'] = bdt_odd.decision_function(df_odd[variables]).tolist()
    df_even['bdt_outcome'] = bdt_even.decision_function(df_even[variables]).tolist()
    # score() needs features and targets; use the odd training set here
    print(bdt_odd.score(df_odd[variables], df_odd['generator']))

    df = pd.concat([df_odd, df_even])

    # Plotting BDT outcome for the different generators
    gen1 = df.loc[df['generator'] == 0]
    gen2 = df.loc[df['generator'] == 1]
    # plt.hist(gen1['bdt_outcome'], bins=70, color='red', alpha=0.5, density=True)
    # plt.hist(gen2['bdt_outcome'], bins=70, color='blue', alpha=0.5, density=True)
    # plt.show()

    # Calculating fpr, tpr and auc for the ROC curve
    fpr = dict()
    tpr = dict()
    area = dict()
    z_all = to_categorical(df['generator'], num_classes=2)
    fpr[0], tpr[0], _ = roc(z_all[:, 0], df['bdt_outcome'], sample_weight=df['EventWeight'])
    # area[0] = auc(fpr[0], tpr[0])

    # Plotting the ROC curve
    # plt.plot(fpr[0], tpr[0], color='darkorange', lw=1, label='PYTHIA, ROC curve (area = %0.2f)' % area[0])
    # plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
    # plt.legend()
    # plt.savefig('roc.png', bbox_inches='tight')
    # plt.show(block=True)

    print('It runs.')
def test():
    # Load the test data
    X_test = pickle.load(open(ARGS.Xinputdata, 'rb'))
    Y_test = pickle.load(open(ARGS.Yinputdata, 'rb'))
    ARGS.inputdim = len(X_test[0][0])
    ARGS.numberOfOutputCodes = len(Y_test[0][0])
    print("Dataset with :", len(X_test), "patients. Y:", len(Y_test))
    print("Each patient has :", len(X_test[0]), "admissions. Y:", len(Y_test[0]))
    X_test = np.array([np.array([np.array(unelist, dtype=np.uint8) for unelist in xi]) for xi in X_test])
    Y_test = np.array([np.array([np.array(unelist, dtype=np.uint8) for unelist in xi]) for xi in Y_test])
    tensor_x = torch.from_numpy(X_test)
    tensor_y = torch.from_numpy(Y_test)
    print("X_dataset_shape=", tensor_x.shape)
    print("Y_dataset_shape=", tensor_y.shape)
    dataset = dt.TensorDataset(tensor_x, tensor_y)  # create your dataset
    batchsize = ARGS.batchSize
    test_loader = dt.DataLoader(dataset, batch_size=batchsize, shuffle=False)

    # Load the model
    model = Network()
    model.load_state_dict(torch.load(ARGS.inputModel))
    # for param_tensor in model.state_dict():
    #     print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    total = 0
    correct = 0
    model.eval()
    h = model.init_hidden()
    P = list()
    R = list()

    # Precisions
    for i in range(1, 4):
        for (data, targets) in test_loader:
            x, labels = Variable(data.float()), Variable(targets.float())
            # output is a tensor of size [BATCHSIZE][#ADMISSIONS][ARGS.numberOfOutputCodes]
            if (x.size(0) != ARGS.batchSize):
                continue
            outputs, h = model(x, h)
            _, predicted = torch.topk(outputs.data, i)
            for y_predlist_adm, y_adm in zip(predicted, targets):
                for y_predlist, y in zip(y_predlist_adm, y_adm):
                    # If y is a tensor with only zeros (padding), break this loop
                    if torch.max(y) != 1:
                        break
                    for y_pred in y_predlist:
                        total += 1
                        if y[y_pred] == 1:
                            correct += 1
        precision = correct / total
        print("P@", i, "=", precision)
        P.append(precision)
        correct = 0
        total = 0

    # Number of diagnostics for each sample (mean of 12 codes, max of 30 codes,
    # R@10 - R@20 - R@30 seems appropriate)
    total_true_list = list()
    h = model.init_hidden()
    for (data, targets) in test_loader:
        x, labels = Variable(data.float()), Variable(targets.float())
        if (x.size(0) != ARGS.batchSize):
            continue
        outputs, h = model(x, h)
        for y_adm in targets:
            for y in y_adm:
                if torch.max(y) != 1:
                    break
                total_true = 0
                for val in y:
                    if val == 1:
                        total_true += 1
                total_true_list.append(total_true)

    # Recalls
    h = model.init_hidden()
    for i in range(10, 40, 10):
        total_true_list_cpy = list(total_true_list)
        for (data, targets) in test_loader:
            x, labels = Variable(data.float()), Variable(targets.float())
            if (x.size(0) != ARGS.batchSize):
                continue
            outputs, h = model(x, h)
            _, predicted = torch.topk(outputs.data, i)
            for y_predlist_adm, y_adm in zip(predicted, targets):
                for y_predlist, y in zip(y_predlist_adm, y_adm):
                    if torch.max(y) != 1:
                        break
                    total += total_true_list_cpy.pop(0)
                    for y_pred in y_predlist:
                        if y[y_pred] == 1:
                            correct += 1
        recall = correct / total
        print("R@", i, "=", recall)
        R.append(recall)
        correct = 0
        total = 0

    # AUROC
    YTRUE = None
    YPROBA = None
    h = model.init_hidden()
    for (data, targets) in test_loader:
        x, labels = Variable(data.float()), Variable(targets.float())
        if x.size(0) != ARGS.batchSize:
            continue
        outputs, h = model(x, h)
        outputs = outputs.detach().cpu().numpy()
        x = x.detach().cpu().numpy()
        labels = labels.detach().cpu().numpy()
        for batch_true, batch_prob in zip(labels, outputs):
            for adm_true, adm_prob in zip(batch_true, batch_prob):
                if torch.max(torch.from_numpy(adm_true)) != 1:
                    break
                YTRUE = np.concatenate((YTRUE, [adm_true]), axis=0) if YTRUE is not None else [adm_true]
                YPROBA = np.concatenate((YPROBA, [adm_prob]), axis=0) if YPROBA is not None else [adm_prob]
    ROC_avg_score = roc(YTRUE, YPROBA, average='micro', multi_class='ovr')
    print("ROC Average Score:", ROC_avg_score)
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 21 15:43:55 2017

@author: ViniciusPantoja
"""
#%%
# After running the data_adjustments file, this code will deliver the final
# predictions.

from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import roc_auc_score as roc

model = lr(penalty='l2', fit_intercept=False, n_jobs=-1)
resultado = model.fit(X_train, Y_train)
prediction = resultado.predict(X_test)

roc(Y_test, prediction)

final_prediction = resultado.predict(Y)
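A small follow-on sketch, assuming the same X_train, Y_train, X_test, Y_test produced by data_adjustments: scoring with class-1 probabilities instead of hard predictions preserves the ranking information that ROC AUC measures.

# Follow-on sketch (same assumed data and fitted model as above):
# use predicted probabilities rather than hard labels for ROC AUC.
probas = resultado.predict_proba(X_test)[:, 1]
roc(Y_test, probas)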
xgtrain = xgb.DMatrix(Xdata, label=Ydata)
xgmodel = XGBClassifier_new(n_estimators=550, seed=1, learning_rate=0.1,
                            objective='binary:logistic', nthread=-1)
xgb_param = xgmodel.get_xgb_params()
cvresult = xgb.cv(xgb_param, xgtrain,
                  num_boost_round=xgmodel.get_params()['n_estimators'],
                  nfold=4, metrics=['auc'], early_stopping_rounds=100,
                  show_progress=True)

# cross_validation = StratifiedKFold(Ydata, n_folds=4, shuffle=True, random_state=0)
# predicted = cross_val_predict(xgmodel, Xdata, Ydata, cv=cross_validation, verbose=1, n_jobs=1)

X_train, X_test, y_train, y_test = train_test_split(Xdata, Ydata, test_size=0.25, random_state=1)
xgmodel.fit(X_train, y_train)
ypred = xgmodel.predict_proba(X_test)
print(roc(y_test, ypred[:, 1]))
# print(ypred[:, 1])

xgmodel.fit(Xdata, Ydata)
test_df_old['predicted'] = xgmodel.predict_proba(Xtest)[:, 1]
# print(test_df_old)
test_df_old.to_csv('submission_xgb.csv', index=False)

merged_df['predicted'] = xgmodel.predict_proba(Xdata)[:, 1]
merged_df['Ytrue'] = ytrue
# print(roc(ytrue, merged_df['predicted']))
# View the results
# Usage: python results.py resultfile
import pandas as pd
import sys
from sklearn.metrics import (accuracy_score, roc_auc_score as roc, auc,
                             classification_report, balanced_accuracy_score,
                             matthews_corrcoef as mcc)

df = pd.read_csv(sys.argv[1], header=None, sep=',')

y_true = df[2]
y_pred = df[3]
y_predprob = df[4]

# print(y_test)
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_true, y_pred))
# print(auc(y_true, y_predprob))
print(classification_report(y_true, y_pred))
print("MCC:", mcc(y_true, y_pred))
print("ROC_AUC:", roc(y_true, y_predprob))
def test():
    # Load the test data
    X_test = pickle.load(open(ARGS.Xinputdata, 'rb'))
    Y_test = pickle.load(open(ARGS.Yinputdata, 'rb'))
    criterion = nn.BCEWithLogitsLoss()
    ARGS.inputdim = len(X_test[0])
    ARGS.numberOfOutputCodes = len(Y_test[0])
    print("X_test of len:", len(X_test), "and Y_test of len:", len(Y_test))
    print("Samples of X's len:", len(X_test[0]), "and samples of Y's len:", len(Y_test[0]))
    tensor_x = torch.Tensor(np.array(X_test))  # transform to torch tensor
    print("X_dataset_shape=", tensor_x.shape)
    tensor_y = torch.Tensor(np.array(Y_test))
    print("Y_dataset_shape=", tensor_y.shape)
    dataset = dt.TensorDataset(tensor_x, tensor_y)  # create your dataset
    test_size = int(len(dataset))
    print("test_size =", test_size)
    batchsize = ARGS.batchSize
    test_loader = dt.DataLoader(dataset, batch_size=batchsize, shuffle=False)

    # Load the model
    model = Network()
    model.load_state_dict(torch.load(ARGS.inputModel, map_location=torch.device('cpu')))
    # for param_tensor in model.state_dict():
    #     print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    total = 0
    correct = 0
    model.eval()

    # Validation loss
    loss_values = []
    itr_ctr = 0
    for batch_idx, (data, target) in enumerate(test_loader):
        with torch.no_grad():
            itr_ctr += 1
            data, target = Variable(data), Variable(target)
            net_out = model(data)
            loss = criterion(net_out, target)
            loss_values.append(loss)
    print("Validation loss :", np.mean(loss_values))

    P = list()
    R = list()

    # Precisions
    for i in range(1, 4):
        for data in test_loader:
            x, labels = data
            # output is a tensor of size [BATCHSIZE][ARGS.numberOfOutputCodes]
            outputs = model(x)
            # _, predicted = torch.max(outputs.data, 1)
            _, predicted = torch.topk(outputs.data, i)
            for y_predlist, y in zip(predicted, labels):
                for y_pred in y_predlist:
                    total += 1
                    if y[y_pred] == 1:
                        correct += 1
        precision = correct / total
        print("P@", i, "=", precision)
        P.append(precision)
        correct = 0
        total = 0

    # Number of diagnostics for each sample (mean of 12 codes, max of 30 codes,
    # R@10 - R@20 - R@30 seems appropriate)
    total_true_list = list()
    for data in test_loader:
        x, labels = data
        outputs = model(x)
        for y in labels:
            total_true = 0
            for val in y:
                if val == 1:
                    total_true += 1
            total_true_list.append(total_true)

    # Recalls
    for i in range(10, 40, 10):
        total_true_list_cpy = list(total_true_list)
        for data in test_loader:
            x, labels = data
            outputs = model(x)
            _, predicted = torch.topk(outputs.data, i)
            for y_predlist, y in zip(predicted, labels):
                total += total_true_list_cpy.pop(0)
                for y_pred in y_predlist:
                    if y[y_pred] == 1:
                        correct += 1
        recall = correct / total
        print("R@", i, "=", recall)
        R.append(recall)
        correct = 0
        total = 0

    # AUC score
    AUC_list = list()
    YTRUE = None
    YPROBA = None
    for data in test_loader:
        x, labels = data
        x, labels = Variable(x), Variable(labels)
        outputs = model(x).detach().numpy()
        labels = labels.detach().numpy()
        # roc_score = roc(labels, outputs, average='micro', multi_class='ovr')
        # AUC_list.append(roc_score)
        for batch_true, batch_prob in zip(labels, outputs):
            YTRUE = np.concatenate((YTRUE, [batch_true]), axis=0) if YTRUE is not None else [batch_true]
            YPROBA = np.concatenate((YPROBA, [batch_prob]), axis=0) if YPROBA is not None else [batch_prob]
    # ROC_avg_score = sum(AUC_list) / len(AUC_list)
    ROC_avg_score = roc(YTRUE, YPROBA, average='micro', multi_class='ovr')
    print("ROC Average Score:", ROC_avg_score)