def print_results(unique_test_name, grid, y_pred, y_test):
    """Print grid-search results and a classification report, then plot the ROC curve.

    Args:
        unique_test_name: Label identifying this experiment in the printout.
        grid: Fitted grid-search object exposing ``best_score_`` / ``best_params_``.
        y_pred: Predicted labels/scores for the test set.
        y_test: Ground-truth labels for the test set.
    """
    # ROC/auc/classification_report/confusion_matrix/plt are expected to be
    # module-level imports (sklearn.metrics.roc_curve etc.) — not visible here.
    fpr, tpr, _ = ROC(y_test, y_pred)
    roc_auc = auc(fpr, tpr)
    print("-----------------%s--------------------" % unique_test_name)
    print("-----------------Best Param Overview--------------------")
    print("Best score: %0.4f" % grid.best_score_)
    print("Using the following parameters:")
    print(grid.best_params_)
    print("-----------------Scoring Model--------------------")
    # BUG FIX: sklearn's signature is (y_true, y_pred); the original passed
    # (y_pred, y_test), which transposes the confusion matrix and swaps
    # precision with recall in the report.
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred), "\n")
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
    return
def evaluateModel(self, test_x, test_y, model):
    """Score a fitted binary classifier on held-out data.

    Returns a tuple ``(classification_report_text, accuracy, roc_auc)``
    computed from the model's label predictions and positive-class
    probabilities on ``test_x``.
    """
    from sklearn.metrics import classification_report as CR, roc_auc_score as ROC

    predicted_labels = model.predict(test_x)
    positive_scores = model.predict_proba(test_x)[:, 1]
    report = CR(test_y, predicted_labels)
    accuracy = model.score(test_x, test_y)
    auc_score = ROC(test_y, positive_scores)
    return report, accuracy, auc_score
def ensembleStacking(self, train_x, train_y, test_x, test_y):
    """Train a family of base classifiers and stack them with logistic regression.

    Each base algorithm/variant is fit on the training split; its positive-class
    probabilities on the test split become one feature column of a meta dataset,
    on which a class-balanced LogisticRegression is then fit as the stacker.

    Returns:
        models: list of single-entry dicts ``{name: (fitted_model, roc_auc)}``,
            ending with the ``{"StackingModel": ...}`` entry.
        df_val_pred: ndarray of base-model probabilities (the meta features).
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score as ROC  # CR was imported but unused

    algorithms = {
        "rfc": {"algo": ["gini", "entropy"], "method": self.RandomForestClassifier},
        "lgbm": {"algo": ["gbdt", "dart", "goss"], "method": self.lightgbm},
        "logit": {"algo": [""], "method": self.LogisticRegression},
        "ada": {"algo": [""], "method": self.AdaBoostClassifier},
        "dt": {"algo": ["gini", "entropy"], "method": self.DecisionTreeClassifier},
        "knn": {"algo": ["auto"], "method": self.KNN},
        "xgboost": {"algo": ["gbtree", "gblinear", "dart"], "method": self.XGBoostClassifier},
    }
    models = []
    df_val_pred = pd.DataFrame()
    for key in algorithms:
        for algo in algorithms[key]["algo"]:
            method = algorithms[key]["method"]
            model = method(train_x, train_y, test_x, test_y, algorithm=algo)
            predict_prob = model.predict_proba(test_x)[:, 1]
            modelName = key + "_" + algo
            # One meta-feature column per (classifier, algorithm) pair.
            df_val_pred = pd.concat(
                [df_val_pred, pd.DataFrame(predict_prob, columns=[modelName])],
                axis=1)
            models.append({modelName: (model, ROC(test_y, predict_prob))})
    df_val_pred = np.array(df_val_pred)
    # BUG FIX: removed leftover pdb.set_trace() debugger breakpoint that
    # halted execution here.
    stackingModel = LogisticRegression(class_weight='balanced')
    stackingModel.fit(df_val_pred, test_y)
    predict_prob = stackingModel.predict_proba(df_val_pred)[:, 1]
    roc = ROC(test_y, predict_prob)
    print("ROC %f" % roc)
    models.append({"StackingModel": (stackingModel, roc)})
    return models, df_val_pred
def metaLearner(data_x, data_y, test, models):
    """Cross-validate a model factory and score ``test`` with the best fold model.

    Args:
        data_x, data_y: Training features/labels as indexable numpy arrays.
        test: Feature matrix to score with the best fold's model.
        models: Callable ``(train_x, train_y, test_x, test_y) -> (model, scores)``
            where ``scores`` are probabilities for the fold's validation rows.

    Returns:
        ``predict_proba`` output of the best-ROC fold model on ``test``.
    """
    # svm and CR were imported but never used; dropped.
    from sklearn.metrics import roc_auc_score as ROC
    from sklearn.model_selection import StratifiedKFold

    best_roc = 0.0
    model_final = None
    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(data_x, data_y):
        train_x, test_x = data_x[train_index], data_x[test_index]
        train_y, test_y = data_y[train_index], data_y[test_index]
        model, result = models(train_x, train_y, test_x, test_y)
        roc = ROC(test_y, result)
        print("ROC:", roc)
        # Keep the fold with the highest validation ROC-AUC.  BUG FIX: the
        # original initialized model_final = 0, so if no fold ever beat 0.0
        # the final predict_proba call would fail on an int; always keep at
        # least the first fold's model.
        if model_final is None or best_roc < roc:
            best_roc = roc
            model_final = model
    # BUG FIX: removed leftover pdb.set_trace() debugger breakpoint.
    return model_final.predict_proba(test)
def lightgbm_tmp(self, train, test):
    """Cross-validate ``self.lightgbm`` and predict ``test`` with the best fold model.

    Args:
        train: DataFrame containing a "TARGET" column plus feature columns.
        test: DataFrame of features to score.

    Returns:
        ``(best_model, positive_class_probabilities_for_test)``.
    """
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import roc_auc_score as ROC  # CR was imported but unused

    data_x = np.array(train.loc[:, train.columns != "TARGET"])
    data_y = np.array(train[["TARGET"]])
    data_test_x = np.array(test)
    skf = StratifiedKFold(n_splits=2)
    best_roc = 0.0
    model = None
    for train_index, test_index in skf.split(data_x, data_y):
        train_x, test_x = data_x[train_index], data_x[test_index]
        train_y, test_y = data_y[train_index], data_y[test_index]
        model_tmp, result = self.lightgbm(train_x, train_y, test_x, test_y)
        roc_tmp = ROC(test_y, result)
        print("ROC", roc_tmp)
        # BUG FIX: `model` was only assigned when a fold beat the initial 0.0
        # score, raising NameError below if every fold scored exactly 0;
        # always keep at least the first fold's model.
        if model is None or roc_tmp > best_roc:
            best_roc = roc_tmp
            model = model_tmp
    results = model.predict_proba(data_test_x)[:, 1]
    return model, results
# Report SVM test accuracy (fraction_correct / svc_pred / te defined earlier
# in the notebook; presumably te is a (features, labels) pair — TODO confirm).
st = '(svm) fraction of testing instances correctly predicted: '
print("{0}{1}".format(st, fraction_correct(svc_pred, te[1])))
#### Evaluate Classifier Performance ##### ROC
# In[148]:
# because we have a binary classification problem,
# we can use ROC to evaluate the quality of these models
# logistic regression: ROC from the positive-class probability column.
# ROC/AUC are presumably sklearn's roc_curve/auc — imported earlier, verify.
pred_prob_lr = lr.predict_proba(te[0])
false_pos_rate_lr, true_pos_rate_lr, thresholds_lr = ROC(
    te[1], pred_prob_lr[:, 1])
roc_auc_lr = AUC(false_pos_rate_lr, true_pos_rate_lr)
print(
    "Logisitc Regression, area under the curve: {0:>9.3f}".format(roc_auc_lr))
# svm: same evaluation for the (probability-enabled) SVC.
pred_prob_svm = svc.predict_proba(te[0])
false_pos_rate_svm, true_pos_rate_svm, thresholds_svm = ROC(
    te[1], pred_prob_svm[:, 1])
roc_auc_svm = AUC(false_pos_rate_svm, true_pos_rate_svm)
print("SVM, area under the curve: {0:>25.3f}".format(roc_auc_svm))
# In[170]:
# plot the ROC curves for each classifier
# (the *_lr / *_svm rate arrays above are consumed by the plotting cell that
# follows this chunk — do not rename them)
def _error(self, text, y_test, predictions):
    """Print and return the ROC-AUC of ``predictions`` against ``y_test``.

    ``text`` is a caller-supplied prefix for the printed line.
    """
    # BUG FIX: the score was computed twice (once for print, once for the
    # return); compute it once and reuse.
    score = ROC(y_test, predictions)
    print(text + ' roc_auc_score: ', score)
    return score
times = time()
# Train a linear SVM; C was pre-tuned, class_weight="balanced" compensates
# for skewed label frequencies.
clf = SVC(kernel="linear", C=3.1663157894736838, cache_size=5000,
          class_weight="balanced").fit(Xtrain, Ytrain)
result = clf.predict(Xtest)
score = clf.score(Xtest, Ytest)
recall = recall_score(Ytest, result)
# Hoisted: the original recomputed clf.decision_function(Xtest) three times.
decision_scores = clf.decision_function(Xtest)
auc = roc_auc_score(Ytest, decision_scores)
# BUG FIX: the original format string contained a stray quote
# ("...recall is %f', auc is %f") — cleaned up the message.
print("testing accuracy %f, recall is %f, auc is %f" % (score, recall, auc))
print(datetime.datetime.fromtimestamp(time() - times).strftime("%M:%S:%f"))
from sklearn.metrics import roc_curve as ROC
import matplotlib.pyplot as plt
FPR, Recall, thresholds = ROC(Ytest, decision_scores, pos_label=1)
# Same ROC-AUC already computed above; no need to recompute.
area = auc
plt.figure()
plt.plot(FPR, Recall, color='red', label='ROC curve (area = %0.2f)' % area)
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('Recall')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
# Youden's J statistic: index of the threshold maximizing Recall - FPR.
maxindex = (Recall - FPR).tolist().index(max(Recall - FPR))
def ensembleStacking(self, train, test, iteration=5):
    """Out-of-fold stacking ensemble over several base classifiers.

    For each StratifiedKFold split, every base algorithm/variant is trained on
    the fold's training part and emits positive-class probabilities for the
    fold's validation part; stacking those rows across folds yields an
    out-of-fold meta-feature matrix.  On the first fold only, each base model
    is additionally refit on the full training data to produce meta features
    for ``test``.  A class-balanced LogisticRegression is then fit on the
    out-of-fold meta features as the stacking model.

    Args:
        train: DataFrame with a "TARGET" column plus feature columns.
        test: DataFrame of features to score.
        iteration: Number of stratified folds (default 5).

    Returns:
        ``(test_probabilities, meta_features, fold_targets, test_meta_features)``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score as ROC  # CR was imported but unused
    from sklearn.model_selection import StratifiedKFold

    # Dropped unused locals from the original: model = 0, roc = 0, models = [].
    algorithms = {
        "xgboost": {"algo": ["gbtree", "gblinear", "dart"], "method": self.XGBoostClassifier},
        "tf": {"algo": [""], "method": self.TensorFlowModel},
        "rfc": {"algo": ["gini", "entropy"], "method": self.RandomForestClassifier},
        "lgbm": {"algo": ["gbdt", "dart", "goss"], "method": self.lightgbm},
        "logit": {"algo": [""], "method": self.LogisticRegression},
        "ada": {"algo": [""], "method": self.AdaBoostClassifier},
        "dt": {"algo": ["gini", "entropy"], "method": self.DecisionTreeClassifier},
    }
    skf = StratifiedKFold(n_splits=iteration)
    data_x = np.array(train.loc[:, train.columns != "TARGET"])
    data_y = np.array(train[["TARGET"]])
    data_test_x = np.array(test)
    df_val_pred = pd.DataFrame()
    df_test_pred = pd.DataFrame()
    index = 0
    target = np.array([], dtype=np.int32)
    for train_index, test_index in skf.split(data_x, data_y):
        train_x, test_x = data_x[train_index], data_x[test_index]
        train_y, test_y = data_y[train_index], data_y[test_index]
        df_val_pred_algo = pd.DataFrame()
        for key in algorithms:
            for algo in algorithms[key]["algo"]:
                method = algorithms[key]["method"]
                model, predict_prob = method(train_x, train_y, test_x, test_y,
                                             algorithm=algo)
                modelName = key + "_" + algo
                # Merge this variant's fold predictions column-wise.
                df_val_pred_algo = pd.concat(
                    [df_val_pred_algo, pd.DataFrame({modelName: predict_prob})],
                    axis=1)
                if index == 0:
                    # Refit on the full training data to build the meta
                    # features for the real test set.  BUG FIX: forward
                    # algorithm=algo — the original omitted it, so every
                    # variant column was silently produced with the default
                    # algorithm.  Also removed a leftover pdb.set_trace().
                    model, result = method(data_x, data_y, data_test_x, "",
                                           algorithm=algo)
                    df_test_pred = pd.concat(
                        [df_test_pred, pd.DataFrame({modelName: result})],
                        axis=1)
        # Accumulate fold targets in the same row order as df_val_pred.
        target = np.concatenate((target, test_y[:, 0]), axis=0)
        # Stack the fold's prediction rows under the previous folds'.
        df_val_pred = pd.concat([df_val_pred, df_val_pred_algo], axis=0)
        index = index + 1
    df_val_pred = np.array(df_val_pred)
    df_test_pred = np.array(df_test_pred)
    # BUG FIX: removed leftover pdb.set_trace() debugger breakpoint.
    stackingModel = LogisticRegression(class_weight='balanced')
    stackingModel.fit(df_val_pred, target)
    # Training-set metric for the stacker (diagnostic only).
    predict_prob = stackingModel.predict_proba(df_val_pred)[:, 1]
    roc = ROC(target, predict_prob)
    print("ROC %f" % roc)
    predict_prob = stackingModel.predict_proba(df_test_pred)[:, 1]
    return predict_prob, df_val_pred, target, df_test_pred