def test_array_like(self):
    plot_roc([0, 'a'], [[0.8, 0.2], [0.2, 0.8]])
    plot_roc([0, 1], [[0.8, 0.2], [0.2, 0.8]])
    plot_roc(['b', 'a'], [[0.8, 0.2], [0.2, 0.8]])
from sklearn.metrics import f1_score

f1 = f1_score(y_test, y_test_pred_class, average='micro')
neptune.log_metric('test_f1', f1)

import matplotlib.pyplot as plt
from scikitplot.metrics import plot_confusion_matrix, plot_roc

fig, ax = plt.subplots(figsize=(16, 12))
plot_confusion_matrix(y_test, y_test_pred_class, ax=ax)
neptune.log_image('diagnostic_charts', fig)

fig, ax = plt.subplots(figsize=(16, 12))
plot_roc(y_test, y_test_pred, ax=ax)
neptune.log_image('diagnostic_charts', fig)

model.save('my_model.h5')
neptune.log_artifact('my_model.h5')

# tests
current_exp = neptune.get_experiment()

correct_logs = [
    'batch_loss', 'batch_accuracy', 'epoch_loss', 'epoch_accuracy',
    'epoch_val_loss', 'epoch_val_accuracy', 'test_f1', 'diagnostic_charts'
]

if set(current_exp.get_logs().keys()) != set(correct_logs):
    raise ValueError('Experiment logs do not match the expected set')
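# How y_test_pred and y_test_pred_class above would typically be derived --
# a sketch only; assumes `model` is the trained softmax classifier and
# `x_test` holds the test inputs:
import numpy as np

y_test_pred = model.predict(x_test)                 # (n_samples, n_classes) probabilities
y_test_pred_class = np.argmax(y_test_pred, axis=1)  # hard class labels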
df = pd.DataFrame(
    data={
        'y_test': y_test,
        'y_pred': y_pred,
        'y_pred_probability': y_pred_proba.max(axis=1)
    })
log_table('predictions', df)

# Log model performance visualizations
import matplotlib.pyplot as plt
from scikitplot.metrics import plot_roc, plot_precision_recall

fig, ax = plt.subplots()
plot_roc(y_test, y_pred_proba, ax=ax)
neptune.log_image('model-performance-visualizations', fig, image_name='ROC')

fig, ax = plt.subplots()
plot_precision_recall(y_test, y_pred_proba, ax=ax)
neptune.log_image('model-performance-visualizations', fig,
                  image_name='precision recall')
plt.close('all')

# Log train data sample (images per class)
for j, class_name in enumerate(class_names):
    plt.figure(figsize=(10, 10))
    label_ = np.where(y_train == j)
    for i in range(9):
        # loop body reconstructed: show a 3x3 grid of this class's images
        # (assumes x_train holds the training images)
        plt.subplot(3, 3, i + 1)
        plt.xticks([])
        plt.yticks([])
        plt.imshow(x_train[label_[0][i]], cmap=plt.cm.binary)
        plt.xlabel(class_name)
    neptune.log_image('train data sample', plt.gcf())
    plt.close('all')
def test_string_classes(self):
    np.random.seed(0)
    clf = LogisticRegression()
    clf.fit(self.X, convert_labels_into_string(self.y))
    probas = clf.predict_proba(self.X)
    plot_roc(convert_labels_into_string(self.y), probas)
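# convert_labels_into_string is a helper from the surrounding test suite;
# a plausible sketch (assumed, not necessarily the suite's exact code):
def convert_labels_into_string(y_true):
    # Replace the 0 label with a string so the label vector mixes types,
    # exercising plot_roc's handling of non-numeric class labels.
    return ['A' if x == 0 else x for x in y_true]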
def plot_roc_curve(labels, output):
    skplt.plot_roc(labels, output, plot_micro=False)
    plt.show()
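# Minimal usage sketch for plot_roc_curve above; assumes `skplt` resolves to
# scikitplot (or scikitplot.metrics) in this module. Data and model are
# illustrative only.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# plot_roc expects true labels and an (n_samples, n_classes) probability array
plot_roc_curve(y_test, clf.predict_proba(X_test))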
def LogisticRegression_self_test(X_train, X_test, y_train, y_test,
                                 learning_rates, epochs, iteration):
    """
    Logistic regression with stochastic gradient descent and gradient descent.
    """
    # scoping number of training samples
    n_inputs = X_train.shape[0]
    n_features = X_train.shape[1]

    eta_ = 1e-12
    beta_opt = np.random.randn(X_train.shape[1], 2)

    calc_beta_GD, norm = GradientDescent(X_train, beta_opt, y_train,
                                         iteration, eta_)
    # map scores to probabilities in [0, 1]
    prob_GD, predict_GD = Probability_GD(X_test, calc_beta_GD)
    #yPred_GD = (predict_GD >= 0.5).astype(int)  # converting to just 0 or 1

    # Define logistic regression (max_iter must be an integer)
    clf = LogisticRegression(solver='lbfgs', max_iter=int(1e5))
    clf = clf.fit(X_train, np.ravel(y_train))
    pred_sklearn = clf.predict(X_test)
    prob_sklearn = clf.predict_proba(X_test)
    #print(prob_sklearn)

    #for eta in np.logspace(np.log10(1e-6), np.log10(1e0), 7):
    accuracy = np.zeros(len(learning_rates))
    auc_score = np.zeros(len(learning_rates))
    # track the best SGD run so best_pred_SGD/best_prob_SGD are always defined
    best_auc = -np.inf
    best_pred_SGD, best_prob_SGD = None, None
    for i, eta in enumerate(learning_rates):
        beta_SGD = stochastic_gradient_descent(X_train, beta_opt, y_train,
                                               eta, epochs, iteration)
        # map scores to probabilities in [0, 1]
        prob_SGD, predict_SGD = Probability(X_test, beta_SGD)
        accuracy[i] = metrics.accuracy_score(y_test, predict_SGD)
        auc_score[i] = metrics.roc_auc_score(y_test, predict_SGD)
        if auc_score[i] > best_auc:
            best_auc = auc_score[i]
            best_pred_SGD = predict_SGD
            best_prob_SGD = prob_SGD
        print('Accuracy {}, learning rate = {}, iterations = {}'.format(
            accuracy[i], eta, iteration))
        print('AUC score: {}'.format(auc_score[i]))
    """
    plt.plot(yPred, label='predict')
    plt.plot(optimal_beta, label='optimal beta')
    plt.plot(y_test, label='test')
    plt.show()
    """

    sns.set()
    sns.heatmap(pd.DataFrame(accuracy), annot=True, fmt='.4g')
    plt.title('Grid search for logistic regression')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization term: $\\lambda$')
    #plt.xticks(ticks=np.arange(len(learning_rates)) + 0.5, labels=learning_rates)
    #plt.yticks(ticks=np.arange(len(lambda_values)) + 0.5, labels=lambda_values)
    b, t = plt.ylim()  # discover the values for bottom and top
    b += 0.5           # add 0.5 to the bottom
    t -= 0.5           # subtract 0.5 from the top
    plt.ylim(b, t)     # update the ylim(bottom, top) values
    #plt.savefig('accuracy_logreg.png')
    plt.show()

    sns.heatmap(pd.DataFrame(auc_score), annot=True, fmt='.4g')
    plt.title('Grid search for logistic regression')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization term: $\\lambda$')
    #plt.xticks(ticks=np.arange(len(learning_rates)) + 0.5, labels=learning_rates)
    #plt.yticks(ticks=np.arange(len(lambda_values)) + 0.5, labels=lambda_values)
    b, t = plt.ylim()
    b += 0.5
    t -= 0.5
    plt.ylim(b, t)
    #plt.savefig('auc_score_logreg.png')
    plt.show()

    # plot confusion matrix
    Confusion_Matrix(y_test, predict_GD)
    #Confusion_Matrix(y_test, best_pred_SGD)
    #Confusion_Matrix(y_test, pred_sklearn)

    # stack (1 - p, p) so each two-column array holds per-class probabilities
    #diff = np.concatenate((1 - predict, predict), axis=1)
    diff_sklearn = np.concatenate((1 - prob_sklearn, prob_sklearn), axis=1)
    diff_GD = np.concatenate((1 - prob_GD, prob_GD), axis=1)
    diff_SGD = np.concatenate((1 - best_prob_SGD, best_prob_SGD), axis=1)

    # plot ROC curves
    plot_roc(y_test, prob_sklearn)
    plot_roc(y_test, diff_SGD)
    plot_roc(y_test, prob_GD)
    plt.show()

    # plot cumulative gain curves
    plot_cumulative_gain(y_test, prob_sklearn)
    ax = plot_cumulative_gain(y_test, diff_SGD)
    plot_cumulative_gain(y_test, prob_GD)
    #plt.show()
    """
    # plot ROC curves
    plot_roc(y_test, diff_sklearn, plot_micro=False, plot_macro=False)
    plot_roc(y_test, diff_GD, plot_micro=False, plot_macro=False)
    plot_roc(y_test, diff_SGD, plot_micro=False, plot_macro=False)
    plt.show()

    # plot cumulative gain curves
    plot_cumulative_gain(y_test, diff_sklearn)
    plot_cumulative_gain(y_test, diff_GD)
    plot_cumulative_gain(y_test, diff_SGD)
    plt.show()
    """

    model_curve = auc_score
    area_baseline = 0.5
    area_ratio = (model_curve - area_baseline) / area_baseline
    print('Area ratio:', area_ratio)

    return accuracy, learning_rates
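# Hypothetical driver for LogisticRegression_self_test; assumes the helper
# functions it calls (GradientDescent, Probability, etc.) are defined in
# this module. Synthetic data for illustration only.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y.reshape(-1, 1), test_size=0.3, random_state=1)

learning_rates = np.logspace(-5, -1, 5)
accuracy, rates = LogisticRegression_self_test(
    X_train, X_test, y_train, y_test, learning_rates,
    epochs=100, iteration=1000)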
test_questions_df = X_test[import_quest_lst]
# join_axes was removed from pd.concat in pandas 1.0; reindex instead
import_quest_demos_test = pd.concat([test_questions_df, test_demos],
                                    axis=1).reindex(test_questions_df.index)

# Train best model, test best model
rf_final = RandomForestClassifier(n_estimators=100,
                                  n_jobs=-1,
                                  class_weight='balanced',
                                  random_state=1)
rf_final.fit(import_quest_demos, y_train)
y_predict = rf_final.predict(import_quest_demos_test)
y_predict_prob = rf_final.predict_proba(import_quest_demos_test)
acc_final = accuracy_score(y_test, y_predict)

# ROC plot
plot_roc(y_test, y_predict_prob,
         title='Test Data ROC Curve',
         plot_micro=False,
         plot_macro=True,
         classes_to_plot=[])
plt.savefig("images/roc.png")
plt.close()

# Partial dependency plots
part_dep_plot(import_quest_lst)
part_dep_plot(['gender_num', 'club_num', 'age_bin'])
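# part_dep_plot is a project helper not shown here; a plausible sketch using
# scikit-learn's partial-dependence API (the body and save path are
# assumptions, not the project's actual code):
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay

def part_dep_plot(feature_names):
    # One partial-dependence panel per feature of the fitted final model
    fig, ax = plt.subplots(figsize=(12, 4))
    PartialDependenceDisplay.from_estimator(rf_final, import_quest_demos_test,
                                            features=feature_names, ax=ax)
    fig.savefig('images/partial_dependence.png')
    plt.close(fig)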
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    train_idx = pd.read_csv(TRAIN_IDX_PATH, nrows=NROWS)
    valid_idx = pd.read_csv(VALID_IDX_PATH, nrows=NROWS)
    features = pd.read_csv(FEATURES_PATH, nrows=NROWS)

    train = pd.merge(train_idx, features, on='SK_ID_CURR')
    valid = pd.merge(valid_idx, features, on='SK_ID_CURR')

    all_params = {
        'num_boost_round': NUM_BOOST_ROUND,
        'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
        **LGBM_PARAMS
    }

    with neptune.create_experiment(name='model training',
                                   params=all_params,
                                   tags=['lgbm'],
                                   upload_source_files=get_filepaths(),
                                   properties={
                                       'features_path': FEATURES_PATH,
                                       'features_version': md5_hash(FEATURES_PATH),
                                       'train_split_version': md5_hash(TRAIN_IDX_PATH),
                                       'valid_split_version': md5_hash(VALID_IDX_PATH),
                                   }):
        results = train_evaluate(train, valid, LGBM_PARAMS,
                                 callbacks=[neptune_monitor()])
        train_score, valid_score = results['train_score'], results['valid_score']
        train_preds, valid_preds = results['train_preds'], results['valid_preds']

        neptune.send_metric('train_auc', train_score)
        neptune.send_metric('valid_auc', valid_score)

        train_pred_path = os.path.join(PREDICTION_DIRPATH, 'train_preds.csv')
        train_preds.to_csv(train_pred_path, index=None)
        neptune.send_artifact(train_pred_path)

        valid_pred_path = os.path.join(PREDICTION_DIRPATH, 'valid_preds.csv')
        valid_preds.to_csv(valid_pred_path, index=None)
        neptune.send_artifact(valid_pred_path)

        model_path = os.path.join(MODEL_DIRPATH, 'model.pkl')
        joblib.dump(results['model'], model_path)
        neptune.set_property('model_path', model_path)
        neptune.set_property('model_version', md5_hash(model_path))
        neptune.send_artifact(model_path)

        if PACKAGE_TO_PROD:
            saved_path = CreditDefaultClassifier.pack(
                model=results['model']).save(PRODUCTION_DIRPATH)
            neptune.set_property('production_model_path', saved_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_confusion_matrix(valid_preds['TARGET'],
                                         valid_preds['preds_pos'] > 0.5,
                                         ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'conf_matrix.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_roc(valid_preds['TARGET'],
                            valid_preds[['preds_neg', 'preds_pos']],
                            ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'roc_auc.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_precision_recall(valid_preds['TARGET'],
                                         valid_preds[['preds_neg', 'preds_pos']],
                                         ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'prec_recall.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        plot_prediction_distribution(valid_preds['TARGET'],
                                     valid_preds['preds_pos'],
                                     ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'preds_dist.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)
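# md5_hash and get_filepaths are project helpers not shown above; a minimal
# sketch of what md5_hash presumably does (hash a file's contents so data
# and model versions can be tracked as experiment properties):
import hashlib

def md5_hash(path, chunk_size=2 ** 20):
    """Return the hex MD5 digest of a file, read in chunks."""
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()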
def train_models(X_train, X_test, y_train, y_test):
    '''
    Train models, store result images and scores, and store models.

    input:
        X_train: X training data
        X_test: X testing data
        y_train: y training data
        y_test: y testing data
    output:
        None
    '''
    print("Training models")
    # Train models
    rfc = RandomForestClassifier(random_state=42)
    lrc = LogisticRegression(solver='lbfgs', max_iter=400)

    param_grid = {
        'n_estimators': [200, 500],
        'max_features': ['auto', 'sqrt'],
        'max_depth': [4, 5, 100],
        'criterion': ['gini', 'entropy']
    }

    cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    cv_rfc.fit(X_train, y_train)
    lrc.fit(X_train, y_train)
    print("Successfully trained models")

    print("Making predictions")
    # Make predictions
    y_train_preds_rf = cv_rfc.best_estimator_.predict(X_train)
    y_test_preds_rf = cv_rfc.best_estimator_.predict(X_test)
    y_train_preds_lr = lrc.predict(X_train)
    y_test_preds_lr = lrc.predict(X_test)
    print("Successfully made predictions")

    print("Saving results as images")
    # Save ROC curves; scikit-plot's plot_roc expects true labels and
    # predicted probabilities, not a fitted estimator
    plt.figure(figsize=(15, 8))
    ax = plt.gca()
    plot_roc(y_test, lrc.predict_proba(X_test), ax=ax)
    plot_roc(y_test, cv_rfc.best_estimator_.predict_proba(X_test), ax=ax)
    plt.savefig('./images/results/roc_curve_result.png')
    plt.close()

    # Save results
    classification_report_image(y_train, y_test, y_train_preds_lr,
                                y_train_preds_rf, y_test_preds_lr,
                                y_test_preds_rf)

    # Save feature importance
    feature_importance_plot(cv_rfc.best_estimator_,
                            X_train,
                            './images/results/feature_importances.png')
    print("Successfully saved results as images")

    print("Saving models as pickle files")
    # Save pickle files
    joblib.dump(cv_rfc.best_estimator_, './models/rfc_model.pkl')
    joblib.dump(lrc, './models/logistic_model.pkl')
    print("Successfully saved models as pickle files")
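# Minimal driver for train_models; synthetic data for illustration only
# (assumes the module-level imports used by train_models are in scope):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# Writes ROC curves, reports, and pickled models under ./images and ./models
train_models(X_train, X_test, y_train, y_test)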