Example #1
from sklearn.model_selection import GridSearchCV

# LinearModel is a project-local wrapper that exposes the underlying
# scikit-learn estimator via get_model().


def grid_search(params_grid, X, Y, linear_model: LinearModel):
    # Exhaustive 5-fold cross-validated search over the parameter grid.
    linear_model_cv = GridSearchCV(linear_model.get_model(),
                                   params_grid,
                                   cv=5)
    linear_model_cv.fit(X, Y)
    print("Best parameters set found on development set:")
    print()
    print(linear_model_cv.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # Report each candidate's mean test score +/- two standard deviations.
    means = linear_model_cv.cv_results_['mean_test_score']
    stds = linear_model_cv.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds,
                                 linear_model_cv.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
    return linear_model_cv.best_params_
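A minimal usage sketch; the Ridge-style wrapper, the alpha grid, and the training arrays below are illustrative assumptions, not part of the original example.

# Hypothetical usage: tune the regularization strength of a wrapped
# ridge model (ridge_model, X_train and Y_train are assumed to exist).
params_grid = {'alpha': [0.01, 0.1, 1.0, 10.0]}
best_params = grid_search(params_grid, X_train, Y_train, ridge_model)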
Example #2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score

# cfg, LinearModel, split_data, build_result_line and
# over_write_or_exit_from_usr are project-local helpers.


def evaluate_model_on_train(title,
                            model_name,
                            preprocessing,
                            data,
                            results_df: pd.DataFrame,
                            model: LinearModel,
                            verbose=cfg.verbose,
                            visualize=True):
    """
    Cross-validation: split the train data frame into equally sized train and
    validation sets and evaluate adjusted R2 and RMSE.
    :param title: name of the experiment to be saved in the results data frame
    :param model_name: name of the linear model used in the experiment
    :param preprocessing: type of preprocessing; if a significant change is
                          performed, please note briefly what the
                          preprocessing includes (e.g. "normal")
    :param data: train data frame with SalePrice given
    :param results_df: data frame in which to store the experiment result
    :param model: model that implements the LinearModel interface
    :param verbose: whether to print the results to the console
    :param visualize: whether to save a residual plot on the validation split
    :return:
    """
    print(
        "------------------  Training on train and CV on Train ------------------"
    )
    X, Y = data
    if visualize:
        X_train, X_val, Y_train, Y_val = split_data(X, Y)
        model.model_fit(X_train, Y_train)
        y_val_pred = model.predict(X_val)
        # Residuals vs. fitted values on the held-out validation split.
        plt.figure()
        sns.residplot(x=y_val_pred, y=Y_val, lowess=True, color="b")
        plt.xlabel("Fitted values")
        plt.ylabel("Residuals")
        plt.title(model_name)
        plt.savefig(cfg.visualization_dir + "/Residuals_{}".format(model_name))

    print("------------------  Evaluating on Train ------------------")
    model.model_fit(X, Y)
    train_rmse, train_R2_adjusted = model.model_eval(X, Y, False)
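    # 'neg_mean_squared_error' yields a negated MSE per fold; abs + sqrt
    # recover a per-fold RMSE.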
    rmse_cv_score = np.sqrt(
        np.abs(
            cross_val_score(model.get_model(),
                            X,
                            Y,
                            cv=5,
                            scoring='neg_mean_squared_error')))
    R2_cv_score = cross_val_score(model.get_model(), X, Y, cv=5, scoring='r2')
    if verbose:
        print("train: \n rmse: {}, R2_adj: {}".format(train_rmse,
                                                      train_R2_adjusted))
        print("cross validation rmse score: {} (+/- {}".format(
            rmse_cv_score.mean(),
            rmse_cv_score.std() * 2))
        print("cross validation r2 score: {} (+/- {}".format(
            R2_cv_score.mean(),
            R2_cv_score.std() * 2))
    results = [
        build_result_line(title, 'train', model_name, preprocessing,
                          train_rmse, train_R2_adjusted),
        build_result_line(title, 'val', model_name, preprocessing,
                          rmse_cv_score.mean(), R2_cv_score.mean())
    ]
    if title in results_df['title'].values:
        print(
            "Warning: Overwriting previous experiment due to colliding title")
        user_input = over_write_or_exit_from_usr()
        if user_input == 'e':
            print("not saving results, exiting....")
            exit()
        if user_input == 'd':
            print("deleting previous results, exiting....")
            criteria = results_df['title'].values != title
            results_df = results_df[criteria]
            results_df.to_pickle(path=cfg.results_df)
            results_df.to_csv(path_or_buf=cfg.results_path, index=False)
            exit()
        # Overwrite: drop the colliding rows before appending the new results.
        criteria = results_df['title'].values != title
        results_df = results_df[criteria]
    # DataFrame.append was removed in pandas 2.0, so use pd.concat instead.
    results_df = pd.concat([
        results_df,
        pd.DataFrame(results,
                     columns=[
                         'title', 'dataset', 'model_name', 'preprocessing',
                         'rmse', 'R2_adjusted'
                     ])
    ])
    results_df.to_pickle(path=cfg.results_df)
    results_df.to_csv(path_or_buf=cfg.results_path, index=False)
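A minimal usage sketch; the input file, the target column split, the wrapped Lasso model, and the empty results frame below are illustrative assumptions (split_data, build_result_line, over_write_or_exit_from_usr and cfg are project-local helpers the example relies on).

# Hypothetical usage: evaluate a wrapped Lasso model on the training frame.
train_df = pd.read_csv("train.csv")  # assumed input file
X = train_df.drop(columns=["SalePrice"])
Y = train_df["SalePrice"]
results_df = pd.DataFrame(columns=[
    'title', 'dataset', 'model_name', 'preprocessing', 'rmse', 'R2_adjusted'
])
evaluate_model_on_train("lasso_baseline", "lasso", "normal", (X, Y),
                        results_df, lasso_model)  # lasso_model is assumed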