Code example #1
def run_model_regression(run_test_only, x_train, y_train, x_test, y_test, num_model_iterations=1,
                         plot_learning_curve=False, clf_class=RandomForestReg, **kwargs):
    # # @brief: For cross-validation, runs the model and reports RMSE / MSE on both the CV and test data.
    # #         If run_test_only is set, trains the model on the train data and reports RMSE / MSE
    # #         on the test data only.
    # # @param: run_test_only - bool; skip cross-validation and evaluate on the test data only
    # #         x_train, x_test - input features (numpy arrays)
    # #         y_train, y_test - expected output (numpy arrays)
    # #         plot_learning_curve (only for cv) - bool
    # #         num_model_iterations - number of times to run the model (results are averaged)
    # #         clf_class - model class to run (if the specified model isn't available here,
    # #                     it will have to be imported from sklearn)
    # #         **kwargs  - model parameters; refer to the sklearn documentation for your model
    # # @return: y_pred_cv (cross-validation) or y_pred_test (test) - predicted values (numpy array)

    # Plot learning curve only for cv
    if not run_test_only and plot_learning_curve:
        title = "Learning Curves for regression"
        # Train data further split into train and CV
        # Cross validation with 100 iterations to get smoother mean test and train
        # score curves, each time with 20% data randomly selected as a validation set.
        cv = cross_validation.ShuffleSplit(x_train.shape[0], n_iter=100, test_size=0.2, random_state=0)

        modeling_tools.plot_learning_curve(clf_class(**kwargs), title, x_train, y_train, cv=cv, n_jobs=-1)

        if not os.path.isdir("temp_pyplot_regr_dont_commit"):
            # Create dir if it doesn't exist. Do not commit this directory or contents.
            # It's a temp store for pyplot images
            os.mkdir("temp_pyplot_regr_dont_commit")

        # plt.show()
        plt.savefig("temp_pyplot_regr_dont_commit/learning_curve.png")

    # Error metrics - mean-squared error and root mse
    rmse_cv = rmse_test = 0.0
    mse_cv = mse_test = 0.0

    for _ in range(num_model_iterations):
        if run_test_only:  # test
            y_pred_test = run_test(x_train, y_train, x_test, clf_class, **kwargs)
            # calculate root mean squared error
            # Pep8 warning not valid
            rmse_test += ((np.mean((y_pred_test - y_test) ** 2)) ** 0.5)
            mse_test += np.mean((y_pred_test - y_test) ** 2)

            # Print first 10 actual and predicted values for test
            logger.debug(y_test[0:10])
            logger.debug(sf.format_float_0_2f(y_pred_test[0:10]))

            logger.debug(np.mean(y_test))
            logger.debug(np.mean(y_pred_test))

        else:  # cv
            y_pred_cv, y_pred_test = run_kfold_cv(x_train, y_train, x_test, clf_class, **kwargs)
            # Pep8 warning not valid
            rmse_cv += ((np.mean((y_pred_cv - y_train) ** 2)) ** 0.5)
            mse_cv += np.mean((y_pred_cv - y_train) ** 2)

            # Pep8 warning not valid
            rmse_test += ((np.mean((y_pred_test - y_test) ** 2)) ** 0.5)
            mse_test += np.mean((y_pred_test - y_test) ** 2)

            # Print first 10 actual and predicted values for cv
            logger.debug(y_train[0:10])
            logger.debug(sf.format_float_0_2f(y_pred_cv[0:10]))

            logger.debug(np.mean(y_train))
            logger.debug(np.mean(y_pred_cv))

            # Print first 10 actual and predicted values for test
            logger.debug(y_test[0:10])
            logger.debug(sf.format_float_0_2f(y_pred_test[0:10]))

            logger.debug(np.mean(y_test))
            logger.debug(np.mean(y_pred_test))

    if not run_test_only:
        rmse_cv /= num_model_iterations
        mse_cv /= num_model_iterations

        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                    "\nCV Root Mean Squared Error {:.2f} Mean Squared Error {:.2f}".format(rmse_cv,
                                                                                           mse_cv) + sf.Color.END)

    rmse_test /= num_model_iterations
    mse_test /= num_model_iterations

    logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                "\nTest Root Mean Squared Error {:.2f} Mean Squared Error {:.2f}".format(rmse_test,
                                                                                         mse_test) + sf.Color.END)
    if run_test_only:
        return y_pred_test
    else:
        return y_pred_cv
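
The learning-curve block in run_model_regression uses the old sklearn.cross_validation module (with n_iter), which has since been removed from scikit-learn in favor of sklearn.model_selection (with n_splits). Below is a minimal, self-contained sketch of the same learning-curve and RMSE / MSE setup against the current API; the synthetic x_train / y_train data and the RandomForestRegressor settings are placeholders, not part of the original code.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit, learning_curve

# Synthetic stand-ins for x_train / y_train
rng = np.random.RandomState(0)
x_train = rng.rand(200, 5)
y_train = x_train @ rng.rand(5) + 0.1 * rng.randn(200)

# Modern equivalent of cross_validation.ShuffleSplit(n, n_iter=..., test_size=0.2, random_state=0);
# fewer splits than the original 100 to keep the sketch quick
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

reg = RandomForestRegressor(n_estimators=50, random_state=0)
train_sizes, train_scores, valid_scores = learning_curve(reg, x_train, y_train, cv=cv, n_jobs=-1)

# MSE / RMSE, equivalent to the np.mean((y_pred - y) ** 2) computation above
# (evaluated in-sample here purely to demonstrate the metric calls)
reg.fit(x_train, y_train)
y_pred = reg.predict(x_train)
mse = mean_squared_error(y_train, y_pred)
rmse = mse ** 0.5
print("MSE {:.4f}  RMSE {:.4f}".format(mse, rmse))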
Code example #2
def run_model(cv_0_test_1, x, y, num_model_iterations=1, test_size=0.2, plot_learning_curve=False,
              run_prob_predictions=False, return_yprob=False, classification_threshold=0.5, clf_class=RandomForest,
              **kwargs):
    # # @brief: For cross-validation, runs the model and reports accuracy and precision / recall.
    # #         For test, runs the model and reports accuracy and precision / recall by treating
    # #         a random sample of the input data as test data.
    # # @param: cv_0_test_1 - 0 (falsy) for cross-validation, 1 (truthy) for a train / test split
    # #         x - input features (numpy array)
    # #         y - expected output (numpy array)
    # #         num_model_iterations - number of times to run the model (results are averaged)
    # #         test_size (only for test) - fraction of the data to treat as test (decimal)
    # #         plot_learning_curve (only for cv) - bool
    # #         run_prob_predictions - bool; additionally report the model's probability predictions
    # #         return_yprob - bool; return predicted probabilities instead of class labels
    # #         classification_threshold - probability threshold for assigning the positive class
    # #         clf_class - model class to run (if the specified model isn't available here,
    # #                     it will have to be imported from sklearn)
    # #         **kwargs  - model parameters; refer to the sklearn documentation for your model
    # # @return: [y_actual, y_predicted]

    # Create train / test split only for test
    if cv_0_test_1:
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=42)
        y_actual = y_predicted = y_test.copy()

        x_train = np.array(x_train)
        x_test = np.array(x_test)
        y_train = np.array(y_train)
    else:
        x_train, x_test, y_train, y_test = 0, 0, 0, 0

        y_actual = y_predicted = y.copy()

    # Plot learning curve only for cv
    if not cv_0_test_1 and plot_learning_curve:
        title = "Learning Curves"
        # Cross validation with 25 iterations to get smoother mean test and train
        # score curves, each time with 20% data randomly selected as a validation set.
        cv = cross_validation.ShuffleSplit(x.shape[0], n_iter=25, test_size=0.2, random_state=0)

        modeling_tools.plot_learning_curve(clf_class(**kwargs), title, x, y, cv=cv, n_jobs=-1)

        if not os.path.isdir("temp_pyplot_images_dont_commit"):
            # Create dir if it doesn't exist. Do not commit this directory or contents.
            # It's a temp store for pyplot images
            os.mkdir("temp_pyplot_images_dont_commit")

        # plt.show()
        plt.savefig("temp_pyplot_images_dont_commit/learning_curve.png")

    # Predict accuracy (mean of num_iterations)
    logger.info("k-fold CV:")

    # Accuracy
    mean_correct_positive_prediction = 0
    mean_correct_negative_prediction = 0
    mean_incorrect_positive_prediction = 0
    mean_incorrect_negative_prediction = 0
    mean_accuracy = 0

    # Precision / Recall
    beta = 2.0  # higher beta prioritizes recall more than precision, default is 1
    mean_precision = 0
    mean_recall = 0
    mean_fbeta_score = 0

    if return_yprob:
        num_model_iterations = 1  # for probabilities returned, just run 1 iteration

    for _ in range(num_model_iterations):
        if cv_0_test_1:  # test
            y_predicted = run_test(x_train=x_train, y_train=y_train, x_test=x_test,
                                   run_prob_predictions=run_prob_predictions, return_yprob=return_yprob,
                                   classification_threshold=classification_threshold,
                                   clf_class=clf_class, **kwargs)
        else:  # cv
            y_predicted = run_cv(x=x, y=y, run_prob_predictions=run_prob_predictions, return_yprob=return_yprob,
                                 classification_threshold=classification_threshold, clf_class=clf_class, **kwargs)

        # Only do accuracy / precision and recall if actual classified values are returned and not probabilities
        if not return_yprob:
            # Accuracy
            mean_accuracy += accuracy(y_actual, y_predicted)

            mean_correct_positive_prediction += correct_positive_prediction
            mean_correct_negative_prediction += correct_negative_prediction
            mean_incorrect_positive_prediction += incorrect_positive_prediction
            mean_incorrect_negative_prediction += incorrect_negative_prediction

            # Precision recall
            prec_recall = precision_recall_fscore_support(y_true=y_actual, y_pred=y_predicted, beta=beta,
                                                          average='binary')

            mean_precision += prec_recall[0]
            mean_recall += prec_recall[1]
            mean_fbeta_score += prec_recall[2]

    # Only do accuracy / precision and recall if actual classified values are returned and not probabilities
    if not return_yprob:
        # Accuracy
        mean_accuracy /= num_model_iterations
        mean_correct_positive_prediction /= num_model_iterations
        mean_correct_negative_prediction /= num_model_iterations
        mean_incorrect_positive_prediction /= num_model_iterations
        mean_incorrect_negative_prediction /= num_model_iterations

        # Precision recall
        mean_precision /= num_model_iterations
        mean_recall /= num_model_iterations
        mean_fbeta_score /= num_model_iterations

        # Accuracy
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nAccuracy {:.2f}".format(mean_accuracy * 100) + sf.Color.END)

        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nCorrect positive prediction {:.2f}".format(
            mean_correct_positive_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nCorrect negative prediction {:.2f}".format(
            mean_correct_negative_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nIncorrect positive prediction {:.2f}".format(
            mean_incorrect_positive_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nIncorrect negative prediction {:.2f}".format(
            mean_incorrect_negative_prediction) + sf.Color.END)

        # Precision recall
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN + "\nPrecision {:.2f} Recall {:.2f} Fbeta-score {:.2f}".format(
            mean_precision * 100, mean_recall * 100, mean_fbeta_score * 100) + sf.Color.END)

    # compare probability predictions of the model
    if run_prob_predictions:
        if not cv_0_test_1:
            logger.info("\nPrediction probabilities for CV\n")

            # compare_prob_predictions(cv_0_test_1=cv_0_test_1, x=x, y=y, x_test=0, clf_class=clf_class, **kwargs)
        else:
            logger.info("\nPrediction probabilities for Test\n")

            # compare_prob_predictions(cv_0_test_1=cv_0_test_1, x=x_train, y=y_train, x_test=x_test,
            #                          clf_class=clf_class, **kwargs)

    return [y_actual, y_predicted]
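
For reference, here is a minimal standalone sketch of the metrics run_model accumulates per iteration: accuracy, the four correct / incorrect prediction counters, and precision / recall with beta=2. It is written against the current sklearn API with synthetic data and a placeholder RandomForestClassifier; run_model itself delegates the actual fitting to run_cv / run_test, which are not shown in these examples, so treat this only as an illustration of the metric calls.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# Synthetic binary-classification data standing in for x / y
rng = np.random.RandomState(42)
x = rng.rand(300, 4)
y = (x[:, 0] + 0.2 * rng.randn(300) > 0.5).astype(int)

# Same split as the cv_0_test_1 branch above (test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy plus confusion-matrix counts (presumably what the correct_* / incorrect_* counters track)
acc = accuracy_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Precision / recall with beta=2 (recall weighted higher), as in run_model
precision, recall, fbeta, _ = precision_recall_fscore_support(
    y_true=y_test, y_pred=y_pred, beta=2.0, average='binary')

print("Accuracy {:.2f}".format(acc * 100))
print("TP {}  TN {}  FP {}  FN {}".format(tp, tn, fp, fn))
print("Precision {:.2f} Recall {:.2f} Fbeta-score {:.2f}".format(precision * 100, recall * 100, fbeta * 100))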
Code example #3
def run_model(cv_0_test_1,
              x,
              y,
              num_model_iterations=1,
              test_size=0.2,
              plot_learning_curve=False,
              run_prob_predictions=False,
              return_yprob=False,
              classification_threshold=0.5,
              clf_class=RandomForest,
              **kwargs):
    # # @brief: For cross-validation, runs the model and reports accuracy and precision / recall.
    # #         For test, runs the model and reports accuracy and precision / recall by treating
    # #         a random sample of the input data as test data.
    # # @param: cv_0_test_1 - 0 (falsy) for cross-validation, 1 (truthy) for a train / test split
    # #         x - input features (numpy array)
    # #         y - expected output (numpy array)
    # #         num_model_iterations - number of times to run the model (results are averaged)
    # #         test_size (only for test) - fraction of the data to treat as test (decimal)
    # #         plot_learning_curve (only for cv) - bool
    # #         run_prob_predictions - bool; additionally report the model's probability predictions
    # #         return_yprob - bool; return predicted probabilities instead of class labels
    # #         classification_threshold - probability threshold for assigning the positive class
    # #         clf_class - model class to run (if the specified model isn't available here,
    # #                     it will have to be imported from sklearn)
    # #         **kwargs  - model parameters; refer to the sklearn documentation for your model
    # # @return: [y_actual, y_predicted]

    # Create train / test split only for test
    if cv_0_test_1:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=test_size, random_state=42)
        y_actual = y_predicted = y_test.copy()

        x_train = np.array(x_train)
        x_test = np.array(x_test)
        y_train = np.array(y_train)
    else:
        x_train, x_test, y_train, y_test = 0, 0, 0, 0

        y_actual = y_predicted = y.copy()

    # Plot learning curve only for cv
    if not cv_0_test_1 and plot_learning_curve:
        title = "Learning Curves"
        # Cross validation with 25 iterations to get smoother mean test and train
        # score curves, each time with 20% data randomly selected as a validation set.
        cv = cross_validation.ShuffleSplit(x.shape[0],
                                           n_iter=25,
                                           test_size=0.2,
                                           random_state=0)

        modeling_tools.plot_learning_curve(clf_class(**kwargs),
                                           title,
                                           x,
                                           y,
                                           cv=cv,
                                           n_jobs=-1)

        if not os.path.isdir("temp_pyplot_images_dont_commit"):
            # Create dir if it doesn't exist. Do not commit this directory or contents.
            # It's a temp store for pyplot images
            os.mkdir("temp_pyplot_images_dont_commit")

        # plt.show()
        plt.savefig("temp_pyplot_images_dont_commit/learning_curve.png")

    # Predict accuracy (mean of num_iterations)
    logger.info("k-fold CV:")

    # Accuracy
    mean_correct_positive_prediction = 0
    mean_correct_negative_prediction = 0
    mean_incorrect_positive_prediction = 0
    mean_incorrect_negative_prediction = 0
    mean_accuracy = 0

    # Precision / Recall
    beta = 2.0  # higher beta prioritizes recall more than precision, default is 1
    mean_precision = 0
    mean_recall = 0
    mean_fbeta_score = 0

    if return_yprob:
        num_model_iterations = 1  # for probabilities returned, just run 1 iteration

    for _ in range(num_model_iterations):
        if cv_0_test_1:  # test
            y_predicted = run_test(
                x_train=x_train,
                y_train=y_train,
                x_test=x_test,
                run_prob_predictions=run_prob_predictions,
                return_yprob=return_yprob,
                classification_threshold=classification_threshold,
                clf_class=clf_class,
                **kwargs)
        else:  # cv
            y_predicted = run_cv(
                x=x,
                y=y,
                run_prob_predictions=run_prob_predictions,
                return_yprob=return_yprob,
                classification_threshold=classification_threshold,
                clf_class=clf_class,
                **kwargs)

        # Only do accuracy / precision and recall if actual classified values are returned and not probabilities
        if not return_yprob:
            # Accuracy
            mean_accuracy += accuracy(y_actual, y_predicted)

            mean_correct_positive_prediction += correct_positive_prediction
            mean_correct_negative_prediction += correct_negative_prediction
            mean_incorrect_positive_prediction += incorrect_positive_prediction
            mean_incorrect_negative_prediction += incorrect_negative_prediction

            # Precision recall
            prec_recall = precision_recall_fscore_support(y_true=y_actual,
                                                          y_pred=y_predicted,
                                                          beta=beta,
                                                          average='binary')

            mean_precision += prec_recall[0]
            mean_recall += prec_recall[1]
            mean_fbeta_score += prec_recall[2]

    # Only do accuracy / precision and recall if actual classified values are returned and not probabilities
    if not return_yprob:
        # Accuracy
        mean_accuracy /= num_model_iterations
        mean_correct_positive_prediction /= num_model_iterations
        mean_correct_negative_prediction /= num_model_iterations
        mean_incorrect_positive_prediction /= num_model_iterations
        mean_incorrect_negative_prediction /= num_model_iterations

        # Precision recall
        mean_precision /= num_model_iterations
        mean_recall /= num_model_iterations
        mean_fbeta_score /= num_model_iterations

        # Accuracy
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                    "\nAccuracy {:.2f}".format(mean_accuracy * 100) +
                    sf.Color.END)

        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                    "\nCorrect positive prediction {:.2f}".format(
                        mean_correct_positive_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                    "\nCorrect negative prediction {:.2f}".format(
                        mean_correct_negative_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                    "\nIncorrect positive prediction {:.2f}".format(
                        mean_incorrect_positive_prediction) + sf.Color.END)
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                    "\nIncorrect negative prediction {:.2f}".format(
                        mean_incorrect_negative_prediction) + sf.Color.END)

        # Precision recall
        logger.info(sf.Color.BOLD + sf.Color.DARKCYAN +
                    "\nPrecision {:.2f} Recall {:.2f} Fbeta-score {:.2f}".
                    format(mean_precision * 100, mean_recall *
                           100, mean_fbeta_score * 100) + sf.Color.END)

    # compare probability predictions of the model
    if run_prob_predictions:
        if not cv_0_test_1:
            logger.info("\nPrediction probabilities for CV\n")

            # compare_prob_predictions(cv_0_test_1=cv_0_test_1, x=x, y=y, x_test=0, clf_class=clf_class, **kwargs)
        else:
            logger.info("\nPrediction probabilities for Test\n")

            # compare_prob_predictions(cv_0_test_1=cv_0_test_1, x=x_train, y=y_train, x_test=x_test,
            #                          clf_class=clf_class, **kwargs)

    return [y_actual, y_predicted]
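
Code example #3 is the same run_model function as code example #2, only wrapped to a narrower line width. As a complement, the sketch below shows one plausible reading of the return_yprob / classification_threshold parameters: take the positive-class probabilities from predict_proba and apply a custom threshold instead of the default 0.5. The internals of run_cv / run_test are not visible in these examples, so this is an assumption for illustration, not the original implementation.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Synthetic binary-classification data
rng = np.random.RandomState(0)
x = rng.rand(300, 4)
y = (x[:, 1] > 0.5).astype(int)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(x_train, y_train)

# Positive-class probabilities (what return_yprob=True presumably hands back)
y_prob = clf.predict_proba(x_test)[:, 1]

# Apply a custom classification_threshold instead of the default 0.5
classification_threshold = 0.3
y_pred = (y_prob >= classification_threshold).astype(int)

print(y_prob[:10])
print(y_pred[:10])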