def grid_search(params_grid, X, Y, linear_model: LinearModel):
    """
    Run a 5-fold grid search over ``params_grid`` and report the outcome on the console.

    :param params_grid: parameter grid handed to ``GridSearchCV``
    :param X: features the search is fitted on
    :param Y: target values the search is fitted on
    :param linear_model: wrapper whose ``get_model()`` result is used as the estimator
    :return: ``best_params_`` of the fitted search
    """
    import inspect

    # ``iid`` was removed from GridSearchCV in scikit-learn 0.24; passing it
    # there raises TypeError.  Only forward it when the installed version
    # still accepts it, so behaviour on older releases is unchanged.
    search_kwargs = {'cv': 5}
    if 'iid' in inspect.signature(GridSearchCV).parameters:
        search_kwargs['iid'] = False

    linear_model_cv = GridSearchCV(linear_model.get_model(), params_grid,
                                   **search_kwargs)
    linear_model_cv.fit(X, Y)

    print("Best parameters set found on development set:")
    print()
    print(linear_model_cv.best_params_)
    print()
    print("Grid scores on development set:")
    print()

    cv_results = linear_model_cv.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for mean, std, params in zip(means, stds, cv_results['params']):
        # The spread is reported as two standard deviations of the fold scores.
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    return linear_model_cv.best_params_
def _save_residual_plot(model, model_name, X, Y):
    """Fit ``model`` on the first part of ``split_data(X, Y)`` and save a residual plot for the second part."""
    X_train, X_val, Y_train, Y_val = split_data(X, Y)
    model.model_fit(X_train, Y_train)
    y_val_pred = model.predict(X_val)

    plt.figure()
    # x/y are passed by keyword: recent seaborn releases no longer accept them
    # positionally, while older releases accept both forms.
    sns.residplot(x=y_val_pred, y=Y_val, lowess=True, color="b")
    plt.xlabel("Fitted values")
    plt.ylabel("Residuals")
    plt.title(model_name)
    plt.savefig(cfg.visualization_dir + "/Residuals_{}".format(model_name))


def _persist_results(results_df):
    """Write the results frame to the pickle and CSV locations configured in ``cfg``."""
    results_df.to_pickle(path=cfg.results_df)
    results_df.to_csv(path_or_buf=cfg.results_path, index=False)


def evaluate_model_on_train(title,
                            model_name,
                            preprocessing,
                            data,
                            results_df: pd.DataFrame,
                            model: LinearModel,
                            verbose=cfg.verbose,
                            visualize=True):
    """
    Fit the model on the training data, score it on that data and with 5-fold
    cross-validation, and store both scores in the results data frame on disk.

    If ``title`` already exists in ``results_df`` the user is asked what to do:
    'e' exits without saving, 'd' deletes the previous rows, saves and exits,
    any other answer replaces the previous rows with the new results.

    :param title: name of the experiment, stored in the results data frame
    :param model_name: name of the linear model used in the experiment (also
        used as plot title and in the residual plot file name)
    :param preprocessing: short description of the preprocessing that was applied
    :param data: pair ``(X, Y)`` of training features and targets
    :param results_df: previous results; must contain a 'title' column
    :param model: model that implements the LinearModel interface
    :param verbose: print the scores on the console
    :param visualize: additionally fit on a split of the data and save a residual plot
    :return: None -- the updated results are written to disk, not returned
    """
    import sys

    print(
        "------------------ Training on train and CV on Train ------------------"
    )
    X, Y = data
    if visualize:
        _save_residual_plot(model, model_name, X, Y)

    print("------------------ Evaluating on Train ------------------")
    # Refit on the full data: the visualization step may have left the model
    # fitted on a subset only.
    model.model_fit(X, Y)
    # NOTE(review): meaning of the third model_eval argument is not visible here.
    train_rmse, train_R2_adjusted = model.model_eval(X, Y, False)

    # The scorer returns negated MSE per fold; take the magnitude before the root.
    neg_mse_per_fold = cross_val_score(model.get_model(), X, Y, cv=5,
                                       scoring='neg_mean_squared_error')
    rmse_cv_score = np.sqrt(np.abs(neg_mse_per_fold))
    R2_cv_score = cross_val_score(model.get_model(), X, Y, cv=5, scoring='r2')

    if verbose:
        print("train: \n rmse: {}, R2_adj: {}".format(train_rmse,
                                                      train_R2_adjusted))
        print("cross validation rmse score: {} (+/- {})".format(
            rmse_cv_score.mean(), rmse_cv_score.std() * 2))
        print("cross validation r2 score: {} (+/- {})".format(
            R2_cv_score.mean(), R2_cv_score.std() * 2))

    results = [
        build_result_line(title, 'train', model_name, preprocessing,
                          train_rmse, train_R2_adjusted),
        build_result_line(title, 'val', model_name, preprocessing,
                          rmse_cv_score.mean(), R2_cv_score.mean())
    ]

    if title in results_df['title'].values:
        print(
            "Warning: Overwriting previous experiement due to colliding title")
        user_input = over_write_or_exit_from_usr()
        if user_input == 'e':
            print("not saving results, existing....")
            # sys.exit instead of the site-provided exit() helper, which is
            # meant for interactive sessions and is not always available.
            sys.exit()
        if user_input == 'd':
            print("deleting previous results, existing....")
            results_df = results_df[results_df['title'].values != title]
            _persist_results(results_df)
            sys.exit()
        # Any other answer: drop the old rows so the new ones replace them.
        results_df = results_df[results_df['title'].values != title]

    new_rows = pd.DataFrame(results,
                            columns=[
                                'title', 'dataset', 'model_name',
                                'preprocessing', 'rmse', 'R2_adjusted'
                            ])
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    results_df = pd.concat([results_df, new_rows])
    _persist_results(results_df)