"""Logistic-regression training and evaluation with two-phase grid search."""
from time import time

import joblib
import numpy as np
from pandas import DataFrame
from sklearn import metrics, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

from util import bin_classif_eval  # project-local helper; import path assumed


def train_test(X_train, X_test, y_train, y_test):
    print("Performing grid search...")
    RANDOM_SEED = 99
    k = 5
    scoring_val = 'roc_auc'

    # ---------------- Find the best parameter combination for the model ----------------
    # Phase 1: penalty type and inverse regularization strength C. liblinear
    # is set explicitly because it supports both the l1 and l2 penalties.
    param_test1 = {'penalty': ['l1', 'l2'], 'C': [1.0, 10.0, 100.0]}
    gsearch1 = GridSearchCV(estimator=LogisticRegression(
                                class_weight='balanced',
                                solver='liblinear',
                                random_state=RANDOM_SEED),
                            param_grid=param_test1,
                            scoring=scoring_val,
                            cv=k)
    t1 = time()
    gsearch1.fit(X_train, y_train)
    print("Grid search phase 1 done in %0.3fs" % (time() - t1))
    print("best score: %0.3f" % gsearch1.best_score_)
    print("best parameter set:", gsearch1.best_params_)
    penalty_val = gsearch1.best_params_.get('penalty')
    c_val = gsearch1.best_params_.get('C')

    print('-' * 100)

    # Phase 2: stopping tolerance, with the phase-1 winners held fixed.
    param_test2 = {'tol': [0.0001, 0.001, 0.01, 0.1]}
    gsearch2 = GridSearchCV(estimator=LogisticRegression(
                                penalty=penalty_val,
                                C=c_val,
                                class_weight='balanced',
                                solver='liblinear',
                                max_iter=100,
                                random_state=RANDOM_SEED),
                            param_grid=param_test2,
                            scoring=scoring_val,
                            cv=k)
    t2 = time()
    gsearch2.fit(X_train, y_train)
    print("Grid search phase 2 done in %0.3fs" % (time() - t2))
    print("best score: %0.3f" % gsearch2.best_score_)
    print("best parameter set:", gsearch2.best_params_)
    print('best_estimator_:', gsearch2.best_estimator_)
    print()
    tol_val = gsearch2.best_params_.get('tol')

    # ---------------- Build the final model from the tuned hyper-parameters ----------------
    model = LogisticRegression(penalty=penalty_val,
                               dual=False,
                               tol=tol_val,
                               C=c_val,
                               fit_intercept=True,
                               intercept_scaling=1,
                               class_weight='balanced',
                               solver='liblinear',
                               max_iter=100,
                               verbose=0,
                               warm_start=False,
                               random_state=RANDOM_SEED,
                               n_jobs=1)

    print("Performing k-fold cross-validation...")
    # shuffle=True is required when KFold is given a random_state.
    kfold = model_selection.KFold(n_splits=5, shuffle=True,
                                  random_state=RANDOM_SEED)
    eval_standard = ['accuracy', 'recall_macro', 'precision_macro', 'f1_macro']
    results = []
    t = time()
    for scoring in eval_standard:
        cv_results = model_selection.cross_val_score(model, X_train, y_train,
                                                     scoring=scoring, cv=kfold)
        results.append(cv_results)
        print("%s: %f (%f)" % (scoring, cv_results.mean(), cv_results.std()))
    model.fit(X_train, y_train)
    print("K-fold cross-validation done in %0.3fs" % (time() - t))
    print()

    joblib.dump(model, '/tmp/model/lr_train_model.pkl', compress=3)

    # Make predictions on the held-out test set.
    # Default evaluation: hard label predictions at the default 0.5 threshold.
    print('-------------------default evaluation----------------------')
    pred_labels = model.predict(X=X_test)
    print("confusion_matrix:\n", metrics.confusion_matrix(y_test, pred_labels))
    print("accuracy_score:", metrics.accuracy_score(y_test, pred_labels))
    print("recall_score:", metrics.recall_score(y_test, pred_labels))
    print("precision_score:", metrics.precision_score(y_test, pred_labels))
    print("f1_score:", metrics.f1_score(y_test, pred_labels))
    print("roc_auc_score:", metrics.roc_auc_score(y_test, pred_labels))
print("classification_report:\n", metrics.classification_report(y_test, rf_pred_probs)) rf_pred_probs = model.predict_proba(X=X_test) result_probs = np.column_stack((rf_pred_probs, y_test.as_matrix())) #for item in result_probs: # print(item) result_df = DataFrame(result_probs, columns=['pred_neg', 'pred_pos', 'real']) #fpr,tpr,thresholds = metrics.roc_curve(result_df['real'],result_df['pred_pos'],pos_label=2) fpr, tpr, _ = metrics.roc_curve(result_df['real'], result_df['pred_pos']) # good model's auc should > 0.5 print("auc:", metrics.auc(fpr, tpr)) # good model's ks should > 0.2 print("ks:", max(tpr - fpr)) # good model's gini should > 0.6 print("gini:", 2 * metrics.auc(fpr, tpr) - 1) #self-defined evaluation way print('-------------------self-defined evaluation----------------------') low_prob = 1e-6 high_prob = 1 - low_prob log_low_prob = np.log(low_prob) g_low_prob = np.log(low_prob) log_high_prob = np.log(high_prob) log_prob_thresholds = np.linspace(start=log_low_prob, stop=log_high_prob, num=100) prob_thresholds = np.exp(log_prob_thresholds) rf_pred_probs = model.predict_proba(X=X_test) #result_probs = np.column_stack((rf_pred_probs,y_test)) #for item in result_probs: # print(item) #for item in rf_pred_probs[:,1]: # print(item) ## histogram of predicted probabilities ##n,bins,patches = plt.hist(rf_pred_probs[:1],10,normed=1,facecolor='g',alpha=0.75) ##plt.xlabel('Predicted probability of diabetes') ##plt.ylabel('Frequency') ##plt.title('Histogram of predicted probabilities') ###plt.text(60, .025, r'$\mu=100,\ \sigma=15$') ##plt.axis([0,1,0,1]) ##plt.grid(True) #print(type(rf_pred_probs)) #print(type(rf_pred_probs[:,1])) #print(rf_pred_probs[:,1]) #fig = plt.figure() #ax = fig.add_subplot(111) #ax.hist(rf_pred_probs[:,1], bins=20) #plt.xlim(0,1) #plt.title('Histogram of predicted probabilities') #plt.xlabel('Predicted probability of diabetes') #plt.ylabel('Frequency') #plt.show() model_oos_performance = bin_classif_eval(rf_pred_probs[:, 1], y_test, pos_cat=1, thresholds=prob_thresholds) #print(type(model_oos_performance)) #for item in model_oos_performance.recall: # print(item) recall_threshold = .74 idx = next(i for i in range(100) if model_oos_performance.recall[i] <= recall_threshold) - 1 print("idx = %d" % idx) selected_prob_threshold = prob_thresholds[idx] print("selected_prob_threshold:", selected_prob_threshold) print(model_oos_performance.iloc[idx, :])
"""Unit test: bin_classif_eval metrics against scikit-learn reference values."""
from numpy import allclose, array
from numpy.random import choice, uniform
from pandas import Categorical, DataFrame
from sklearn.metrics import (accuracy_score, f1_score, log_loss,
                             precision_score, recall_score)

from util import bin_classif_eval  # project-local helper; import path assumed


def test_bin_classif_eval_metrics(nb_samples=1000, threshold=0.5):
    """test: Binary Classification Metrics"""
    neg_class = "Cat0"
    pos_class = "Cat1"
    classes = neg_class, pos_class
    d = DataFrame(dict(actuals_cat=Categorical(
        choice(classes, size=nb_samples), categories=classes)))
    d["actuals_bool"] = d.actuals_cat == pos_class
    d["probs"] = uniform(size=nb_samples)
    d["hard_preds_bool"] = d.probs >= threshold
    # A list comprehension materializes the labels (a bare map object is not
    # list-like enough for the Categorical constructor in Python 3).
    d["hard_preds_cat"] = Categorical(
        [pos_class if b else neg_class for b in d.hard_preds_bool],
        categories=classes)

    scikit_learn_accuracy = accuracy_score(d.actuals_cat, d.hard_preds_cat)
    scikit_learn_recall = recall_score(d.actuals_cat, d.hard_preds_cat,
                                       pos_label=pos_class)
    scikit_learn_precision = precision_score(d.actuals_cat, d.hard_preds_cat,
                                             pos_label=pos_class)
    scikit_learn_f1_score = f1_score(d.actuals_cat, d.hard_preds_cat,
                                     pos_label=pos_class)
    scikit_learn_log_loss = log_loss(d.actuals_cat, array(d.probs))

    test = True

    # TEST: hard predictions (boolean format) vs. actuals (boolean format)
    metrics = bin_classif_eval(predictions=d.hard_preds_bool,
                               actuals=d.actuals_bool)
    test &= (
        allclose(metrics["accuracy"], scikit_learn_accuracy)
        & allclose(metrics["recall"], scikit_learn_recall)
        & allclose(metrics["precision"], scikit_learn_precision)
        & allclose(metrics["f1_score"], scikit_learn_f1_score)
    )

    # TEST: hard predictions (boolean format) vs. actuals (categorical format)
    metrics = bin_classif_eval(predictions=d.hard_preds_bool,
                               actuals=d.actuals_cat, pos_cat=pos_class)
    test &= (
        allclose(metrics["accuracy"], scikit_learn_accuracy)
        & allclose(metrics["recall"], scikit_learn_recall)
        & allclose(metrics["precision"], scikit_learn_precision)
        & allclose(metrics["f1_score"], scikit_learn_f1_score)
    )

    # TEST: hard predictions (categorical format) vs. actuals (boolean format)
    metrics = bin_classif_eval(predictions=d.hard_preds_cat,
                               actuals=d.actuals_bool, pos_cat=pos_class)
    test &= (
        allclose(metrics["accuracy"], scikit_learn_accuracy)
        & allclose(metrics["recall"], scikit_learn_recall)
        & allclose(metrics["precision"], scikit_learn_precision)
        & allclose(metrics["f1_score"], scikit_learn_f1_score)
    )

    # TEST: hard predictions (categorical format) vs. actuals (categorical format)
    metrics = bin_classif_eval(predictions=d.hard_preds_cat,
                               actuals=d.actuals_cat, pos_cat=pos_class)
    test &= (
        allclose(metrics["accuracy"], scikit_learn_accuracy)
        & allclose(metrics["recall"], scikit_learn_recall)
        & allclose(metrics["precision"], scikit_learn_precision)
        & allclose(metrics["f1_score"], scikit_learn_f1_score)
    )

    # TEST: probs vs. actuals (boolean format)
    metrics = bin_classif_eval(predictions=d.probs, actuals=d.actuals_bool)
    test &= (
        allclose(metrics["accuracy"], scikit_learn_accuracy)
        & allclose(metrics["recall"], scikit_learn_recall)
        & allclose(metrics["precision"], scikit_learn_precision)
        & allclose(metrics["f1_score"], scikit_learn_f1_score)
        & allclose(metrics["deviance"], 2 * scikit_learn_log_loss)
    )

    # TEST: probs vs. actuals (categorical format)
    metrics = bin_classif_eval(predictions=d.probs, actuals=d.actuals_cat,
                               pos_cat=pos_class)
    test &= (
        allclose(metrics["accuracy"], scikit_learn_accuracy)
        & allclose(metrics["recall"], scikit_learn_recall)
        & allclose(metrics["precision"], scikit_learn_precision)
        & allclose(metrics["f1_score"], scikit_learn_f1_score)
        & allclose(metrics["deviance"], 2 * scikit_learn_log_loss)
    )

    assert test
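
# --- Standalone runner sketch (not in the original file). Seeding numpy's
# global RNG makes choice()/uniform() inside the test deterministic. ---
if __name__ == '__main__':
    import numpy as np
    np.random.seed(99)  # any fixed seed; matches the trainers' RANDOM_SEED
    test_bin_classif_eval_metrics()
    print('test_bin_classif_eval_metrics: OK')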
"""Random-forest training and evaluation with two-phase grid search."""
import multiprocessing
from time import time

import joblib
import numpy as np
from pandas import DataFrame
from sklearn import metrics, model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from util import bin_classif_eval  # project-local helper; import path assumed


def train_test(X_train, X_test, y_train, y_test):
    print("Performing grid search...")
    RANDOM_SEED = 99
    k = 5
    scoring_val = 'roc_auc'
    max_features_val = 'sqrt'

    # ---------------- Find the best parameter combination for the model ----------------
    # Phase 1: number of trees, with the tree-shape parameters held fixed.
    param_test1 = {'n_estimators': range(20, 600, 20)}
    gsearch1 = GridSearchCV(estimator=RandomForestClassifier(
                                min_samples_split=200,
                                min_samples_leaf=2,
                                max_depth=5,
                                max_features=max_features_val,
                                random_state=RANDOM_SEED),
                            param_grid=param_test1,
                            scoring=scoring_val,
                            cv=k)
    t1 = time()
    gsearch1.fit(X_train, y_train)
    print("Grid search phase 1 done in %0.3fs" % (time() - t1))
    print("best score: %0.3f" % gsearch1.best_score_)
    print("best parameter set:", gsearch1.best_params_)
    print()
    n_estimators_val = gsearch1.best_params_.get('n_estimators')

    print('-' * 100)

    # Phase 2: tree depth and split size, with the tree count held fixed.
    param_test2 = {
        'max_depth': range(2, 16, 2),
        'min_samples_split': range(20, 200, 20)
    }
    gsearch2 = GridSearchCV(estimator=RandomForestClassifier(
                                n_estimators=n_estimators_val,
                                oob_score=True,
                                min_samples_leaf=2,
                                max_features=max_features_val,
                                random_state=RANDOM_SEED),
                            param_grid=param_test2,
                            scoring=scoring_val,
                            cv=k)
    t2 = time()
    gsearch2.fit(X_train, y_train)
    print("Grid search phase 2 done in %0.3fs" % (time() - t2))
    print("best score: %0.3f" % gsearch2.best_score_)
    print("best parameter set:", gsearch2.best_params_)
    print('best_estimator_:', gsearch2.best_estimator_)
    print()
    max_depth_val = gsearch2.best_params_.get('max_depth')
    min_samples_split_val = gsearch2.best_params_.get('min_samples_split')

    # ---------------- Build the final model from the tuned hyper-parameters ----------------
    B = n_estimators_val
    model = RandomForestClassifier(
        n_estimators=B,
        criterion='gini',
        max_depth=max_depth_val,
        min_samples_split=min_samples_split_val,
        min_samples_leaf=2,
        min_weight_fraction_leaf=0.0,
        max_features=max_features_val,  # features considered per split
        max_leaf_nodes=None,            # None: unlimited number of leaf nodes
        bootstrap=True,
        oob_score=True,                 # estimate out-of-bag generalization score
        # parallelize over all CPU cores minus 4, but at least one
        n_jobs=max(1, multiprocessing.cpu_count() - 4),
        class_weight=None,              # our classes are skewed, but not too skewed
        random_state=RANDOM_SEED,
        verbose=0,
        warm_start=False)

    print("Performing k-fold cross-validation...")
    # shuffle=True is required when KFold is given a random_state.
    kfold = model_selection.KFold(n_splits=5, shuffle=True,
                                  random_state=RANDOM_SEED)
    eval_standard = ['accuracy', 'recall_macro', 'precision_macro', 'f1_macro']
    results = []
    t = time()
    for scoring in eval_standard:
        cv_results = model_selection.cross_val_score(model, X_train, y_train,
                                                     scoring=scoring, cv=kfold)
        results.append(cv_results)
        print("%s: %f (%f)" % (scoring, cv_results.mean(), cv_results.std()))
    model.fit(X_train, y_train)
    print("K-fold cross-validation done in %0.3fs" % (time() - t))
    print()
    print('oob_score: %f' % model.oob_score_)

    joblib.dump(model, '/tmp/model/rf_train_model.pkl', compress=3)
    # Make predictions on the held-out test set.
    # Default evaluation: hard label predictions at the default 0.5 threshold.
    print('-------------------default evaluation----------------------')
    pred_labels = model.predict(X=X_test)
    print("confusion_matrix:\n", metrics.confusion_matrix(y_test, pred_labels))
    print("accuracy_score:", metrics.accuracy_score(y_test, pred_labels))
    print("recall_score:", metrics.recall_score(y_test, pred_labels))
    print("precision_score:", metrics.precision_score(y_test, pred_labels))
    print("f1_score:", metrics.f1_score(y_test, pred_labels))
    print("roc_auc_score:", metrics.roc_auc_score(y_test, pred_labels))
    print("classification_report:\n",
          metrics.classification_report(y_test, pred_labels))

    # Probability-based evaluation.
    pred_probs = model.predict_proba(X=X_test)
    result_probs = np.column_stack((pred_probs, y_test.to_numpy()))
    result_df = DataFrame(result_probs,
                          columns=['pred_neg', 'pred_pos', 'real'])
    fpr, tpr, _ = metrics.roc_curve(result_df['real'], result_df['pred_pos'])
    # A useful model's AUC should exceed 0.5 (random guessing).
    print("auc:", metrics.auc(fpr, tpr))
    # A useful model's KS statistic should exceed 0.2.
    print("ks:", max(tpr - fpr))
    # A useful model's Gini coefficient (2*AUC - 1) should exceed 0.6.
    print("gini:", 2 * metrics.auc(fpr, tpr) - 1)

    # Self-defined evaluation: sweep log-spaced probability thresholds and
    # pick the largest threshold whose recall still meets the target.
    print('-------------------self-defined evaluation----------------------')
    low_prob = 1e-6
    high_prob = 1 - low_prob
    log_prob_thresholds = np.linspace(start=np.log(low_prob),
                                      stop=np.log(high_prob), num=100)
    prob_thresholds = np.exp(log_prob_thresholds)

    pred_probs = model.predict_proba(X=X_test)
    # Optionally, plot a histogram of pred_probs[:, 1] here to inspect the
    # distribution of predicted positive-class probabilities.
    model_oos_performance = bin_classif_eval(pred_probs[:, 1], y_test,
                                             pos_cat=1,
                                             thresholds=prob_thresholds)
    recall_threshold = .74
    # Recall is non-increasing in the threshold: step back one index from the
    # first threshold whose recall drops to or below the target.
    idx = next(i for i in range(len(prob_thresholds))
               if model_oos_performance.recall[i] <= recall_threshold) - 1
    print("idx = %d" % idx)
    selected_prob_threshold = prob_thresholds[idx]
    print("selected_prob_threshold:", selected_prob_threshold)
    print(model_oos_performance.iloc[idx, :])
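
    # Follow-up sketch (not in the original): apply the selected probability
    # threshold to produce hard 0/1 predictions and confirm the achieved recall.
    hard_preds = (pred_probs[:, 1] >= selected_prob_threshold).astype(int)
    print("recall at selected threshold:",
          metrics.recall_score(y_test, hard_preds))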