def run_model_regression(run_test_only, x_train, y_train, x_test, y_test, num_model_iterations=1,
                         plot_learning_curve=False, clf_class=RandomForestReg, **kwargs):
    # # @brief:  For cross-validation, runs the model with k-fold CV on the train data and reports
    # #          RMSE / MSE for both the CV predictions and the test predictions.
    # #          If run_test_only is set, trains the model on the train data and reports
    # #          RMSE / MSE for the test data only
    # # @param:  x_train, x_test - input features (numpy array)
    # #          y_train, y_test - expected output (numpy array)
    # #          plot_learning_curve (only for cv) - bool
    # #          num_model_iterations - times to run the model (to average the results)
    # #          clf_class - model to run (if the specified model isn't already imported,
    # #                      it'll have to be imported from sklearn)
    # #          **kwargs - model inputs; refer to the sklearn documentation for your model
    # #                     to see the available parameters
    # # @return: y_pred_test if run_test_only is set, else y_pred_cv

    # Plot learning curve only for cv
    if not run_test_only and plot_learning_curve:
        title = "Learning Curves for regression"

        # Train data is further split into train and CV.
        # Cross-validation with 100 iterations to get smoother mean test and train score
        # curves, each time with 20% of the data randomly selected as a validation set.
        cv = cross_validation.ShuffleSplit(x_train.shape[0], n_iter=100, test_size=0.2, random_state=0)

        modeling_tools.plot_learning_curve(clf_class(**kwargs), title, x_train, y_train, cv=cv, n_jobs=-1)

        if not os.path.isdir("temp_pyplot_regr_dont_commit"):
            # Create the dir if it doesn't exist. Do not commit this directory or its contents;
            # it's a temp store for pyplot images
            os.mkdir("temp_pyplot_regr_dont_commit")

        # plt.show()
        plt.savefig("temp_pyplot_regr_dont_commit/learning_curve.png")

    # Error metrics - mean squared error and root mean squared error
    rmse_cv = rmse_test = 0.0
    mse_cv = mse_test = 0.0

    for _ in range(num_model_iterations):
        if run_test_only:  # test
            y_pred_test = run_test(x_train, y_train, x_test, clf_class, **kwargs)

            # Calculate mean squared error and its root
            mse_test += np.mean((y_pred_test - y_test) ** 2)
            rmse_test += np.mean((y_pred_test - y_test) ** 2) ** 0.5

            # Print the first 10 actual and predicted values for test
            logger.debug(y_test[0:10])
            logger.debug(sf.format_float_0_2f(y_pred_test[0:10]))
            logger.debug(np.mean(y_test))
            logger.debug(np.mean(y_pred_test))
        else:  # cv
            y_pred_cv, y_pred_test = run_kfold_cv(x_train, y_train, x_test, clf_class, **kwargs)

            mse_cv += np.mean((y_pred_cv - y_train) ** 2)
            rmse_cv += np.mean((y_pred_cv - y_train) ** 2) ** 0.5

            mse_test += np.mean((y_pred_test - y_test) ** 2)
            rmse_test += np.mean((y_pred_test - y_test) ** 2) ** 0.5

            # Print the first 10 actual and predicted values for cv
            logger.debug(y_train[0:10])
            logger.debug(sf.format_float_0_2f(y_pred_cv[0:10]))
            logger.debug(np.mean(y_train))
            logger.debug(np.mean(y_pred_cv))

            # Print the first 10 actual and predicted values for test
            logger.debug(y_test[0:10])
            logger.debug(sf.format_float_0_2f(y_pred_test[0:10]))
            logger.debug(np.mean(y_test))
            logger.debug(np.mean(y_pred_test))

    if not run_test_only:
        rmse_cv /= num_model_iterations
        mse_cv /= num_model_iterations

        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                    "\nCV Root Mean Squared Error {:.2f} Mean Squared Error {:.2f}".format(rmse_cv, mse_cv) +
                    sf.Color.END)

    rmse_test /= num_model_iterations
    mse_test /= num_model_iterations

    logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                "\nTest Root Mean Squared Error {:.2f} Mean Squared Error {:.2f}".format(rmse_test, mse_test) +
                sf.Color.END)

    if run_test_only:
        return y_pred_test
    else:
        return y_pred_cv
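# --- Usage sketch (illustrative only, not part of the original module) -----------------
# A minimal example of driving run_model_regression in both modes, assuming RandomForestReg
# is this module's alias for sklearn.ensemble.RandomForestRegressor. The toy data below is
# hypothetical; real callers pass their own feature matrices and targets.
def _example_run_model_regression():
    rng = np.random.RandomState(0)
    x = rng.rand(200, 5)                                  # 200 samples, 5 features
    y = x.sum(axis=1) + rng.normal(0, 0.1, 200)           # noisy linear target
    x_train, x_test = x[:160], x[160:]                    # simple 80/20 split
    y_train, y_test = y[:160], y[160:]

    # CV mode: k-fold CV on the train data, plus metrics on the test predictions
    run_model_regression(run_test_only=False, x_train=x_train, y_train=y_train,
                         x_test=x_test, y_test=y_test, num_model_iterations=3,
                         clf_class=RandomForestReg, n_estimators=100)

    # Test-only mode: train on the train data, report RMSE / MSE on the test data
    return run_model_regression(run_test_only=True, x_train=x_train, y_train=y_train,
                                x_test=x_test, y_test=y_test,
                                clf_class=RandomForestReg, n_estimators=100)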
def run_model(cv_0_test_1, x, y, num_model_iterations=1, test_size=0.2, plot_learning_curve=False,
              run_prob_predictions=False, return_yprob=False, classification_threshold=0.5,
              clf_class=RandomForest, **kwargs):
    # # @brief:  For cross-validation, runs the model and reports accuracy and precision / recall.
    # #          For test, runs the model and reports accuracy and precision / recall by treating
    # #          a random sample of the input data as test data
    # # @param:  cv_0_test_1 - 0 for cross-validation, 1 for a train / test split
    # #          x - input features (numpy array)
    # #          y - expected output (numpy array)
    # #          num_model_iterations - times to run the model (to average the results)
    # #          test_size (only for test) - fraction of the data to treat as test (decimal, e.g. 0.2)
    # #          plot_learning_curve (only for cv) - bool
    # #          run_prob_predictions - bool, also compare the model's probability predictions
    # #          return_yprob - bool, return predicted probabilities instead of class labels
    # #          classification_threshold - probability threshold for the positive class
    # #          clf_class - model to run (if the specified model isn't already imported,
    # #                      it'll have to be imported from sklearn)
    # #          **kwargs - model inputs; refer to the sklearn documentation for your model
    # #                     to see the available parameters
    # # @return: [y_actual, y_predicted]

    # Create a train / test split only for test
    if cv_0_test_1:
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=42)

        y_actual = y_predicted = y_test.copy()

        x_train = np.array(x_train)
        x_test = np.array(x_test)
        y_train = np.array(y_train)
    else:
        x_train, x_test, y_train, y_test = 0, 0, 0, 0
        y_actual = y_predicted = y.copy()

    # Plot learning curve only for cv
    if not cv_0_test_1 and plot_learning_curve:
        title = "Learning Curves"

        # Cross-validation with 25 iterations to get smoother mean test and train score
        # curves, each time with 20% of the data randomly selected as a validation set.
        cv = cross_validation.ShuffleSplit(x.shape[0], n_iter=25, test_size=0.2, random_state=0)

        modeling_tools.plot_learning_curve(clf_class(**kwargs), title, x, y, cv=cv, n_jobs=-1)

        if not os.path.isdir("temp_pyplot_images_dont_commit"):
            # Create the dir if it doesn't exist. Do not commit this directory or its contents;
            # it's a temp store for pyplot images
            os.mkdir("temp_pyplot_images_dont_commit")

        # plt.show()
        plt.savefig("temp_pyplot_images_dont_commit/learning_curve.png")

    # Predict accuracy (mean of num_model_iterations runs)
    logger.info("k-fold CV:")

    # Accuracy; the correct_* / incorrect_* counters read below are module-level globals
    mean_correct_positive_prediction = 0
    mean_correct_negative_prediction = 0
    mean_incorrect_positive_prediction = 0
    mean_incorrect_negative_prediction = 0
    mean_accuracy = 0

    # Precision / Recall
    beta = 2.0  # a higher beta prioritizes recall over precision, default is 1
    mean_precision = 0
    mean_recall = 0
    mean_fbeta_score = 0

    if return_yprob:
        num_model_iterations = 1  # when probabilities are returned, just run 1 iteration

    for _ in range(num_model_iterations):
        if cv_0_test_1:  # test
            y_predicted = run_test(x_train=x_train, y_train=y_train, x_test=x_test,
                                   run_prob_predictions=run_prob_predictions, return_yprob=return_yprob,
                                   classification_threshold=classification_threshold,
                                   clf_class=clf_class, **kwargs)
        else:  # cv
            y_predicted = run_cv(x=x, y=y, run_prob_predictions=run_prob_predictions,
                                 return_yprob=return_yprob,
                                 classification_threshold=classification_threshold,
                                 clf_class=clf_class, **kwargs)

        # Only compute accuracy / precision / recall if actual class labels are returned,
        # not probabilities
        if not return_yprob:
            # Accuracy
            mean_accuracy += accuracy(y_actual, y_predicted)
            mean_correct_positive_prediction += correct_positive_prediction
            mean_correct_negative_prediction += correct_negative_prediction
            mean_incorrect_positive_prediction += incorrect_positive_prediction
            mean_incorrect_negative_prediction += incorrect_negative_prediction

            # Precision / recall
            prec_recall = precision_recall_fscore_support(y_true=y_actual, y_pred=y_predicted, beta=beta,
                                                          average='binary')

            mean_precision += prec_recall[0]
            mean_recall += prec_recall[1]
            mean_fbeta_score += prec_recall[2]

    # Only report accuracy / precision / recall if actual class labels are returned, not probabilities
    if not return_yprob:
        # Accuracy
        mean_accuracy /= num_model_iterations
        mean_correct_positive_prediction /= num_model_iterations
        mean_correct_negative_prediction /= num_model_iterations
        mean_incorrect_positive_prediction /= num_model_iterations
        mean_incorrect_negative_prediction /= num_model_iterations

        # Precision / recall
        mean_precision /= num_model_iterations
        mean_recall /= num_model_iterations
        mean_fbeta_score /= num_model_iterations

        # Accuracy
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nAccuracy {:.2f}".format(mean_accuracy * 100) +
                    sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nCorrect positive prediction {:.2f}".format(
            mean_correct_positive_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nCorrect negative prediction {:.2f}".format(
            mean_correct_negative_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nIncorrect positive prediction {:.2f}".format(
            mean_incorrect_positive_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nIncorrect negative prediction {:.2f}".format(
            mean_incorrect_negative_prediction) + sf.Color.END)

        # Precision / recall
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nPrecision {:.2f} Recall {:.2f} Fbeta-score {:.2f}".format(
            mean_precision * 100, mean_recall * 100, mean_fbeta_score * 100) + sf.Color.END)

    # Compare probability predictions of the model
    if run_prob_predictions:
        if not cv_0_test_1:
            logger.info("\nPrediction probabilities for CV\n")
            # compare_prob_predictions(cv_0_test_1=cv_0_test_1, x=x, y=y, x_test=0, clf_class=clf_class, **kwargs)
        else:
            logger.info("\nPrediction probabilities for Test\n")
            # compare_prob_predictions(cv_0_test_1=cv_0_test_1, x=x_train, y=y_train, x_test=x_test,
            #                          clf_class=clf_class, **kwargs)

    return [y_actual, y_predicted]