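# The fixture and helpers used throughout these tests (`tmpdir_name`,
# `_check_file_exists`, `_make_1st_stage_preds`) are assumed to be defined
# earlier in this module. A minimal sketch of what the two helpers might look
# like (assumptions for illustration, not the actual implementations):
#
#     def _check_file_exists(directory, files=('oof_prediction.npy',
#                                              'test_prediction.npy',
#                                              'metrics.txt')):
#         for f in files:
#             assert os.path.exists(os.path.join(directory, f)), f
#
#     def _make_1st_stage_preds(X, y, X_test):
#         # fit a few diverse 1st-stage models and return lists of their
#         # out-of-fold and test predictions
#         ...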
def test_experiment_sklearn_regressor(tmpdir_name):
    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=0,
                              random_state=0, id_column='user_id')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {'fit_intercept': True}

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name,
                            with_auto_prep=False, algorithm_type=LinearRegression)

    assert len(np.unique(result.oof_prediction)) > 5  # making sure prediction is not binarized
    assert len(np.unique(result.test_prediction)) > 5
    assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

    _check_file_exists(tmpdir_name)
def test_experiment_cat_custom_eval(tmpdir_name):
    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                              random_state=0, id_column='user_id')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'max_depth': 8,
        'num_boost_round': 100,
        'eval_metric': 'MAE'
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name,
                            algorithm_type='cat', eval_func=mean_absolute_error)

    assert mean_absolute_error(y_train, result.oof_prediction) == result.metrics[-1]

    _check_file_exists(tmpdir_name)
def test_experiment_xgb_regressor(tmpdir_name):
    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                              random_state=0, id_column='user_id')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'max_depth': 8,
        'num_boost_round': 100
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name,
                            algorithm_type='xgb', with_auto_prep=True)

    assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

    _check_file_exists(tmpdir_name)
def test_experiment_cat_regressor(tmpdir_name):
    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                              random_state=0, id_column='user_id')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'max_depth': 8,
        'num_boost_round': 100
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name, algorithm_type='cat')

    assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

    _check_file_exists(tmpdir_name)
def test_averaging_regression():
    X, y = make_regression_df()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    result = averaging(test, oof, y_train)

    assert_array_almost_equal((test[0] + test[1] + test[2]) / 3, result.test_prediction)
    assert_array_almost_equal((oof[0] + oof[1] + oof[2]) / 3, result.oof_prediction)

    # no eval_func is given, so no score is computed
    assert result.score is None
def test_averaging_opt_minimize():
    X, y = make_regression_df(n_samples=1024)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    best_single_model = min(mean_squared_error(y_train, oof[0]),
                            mean_squared_error(y_train, oof[1]),
                            mean_squared_error(y_train, oof[2]))

    result = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False)

    # the optimized blend should be no worse than the best single model...
    assert result.score <= best_single_model

    # ...and no worse than a simple equal-weight average
    result_simple_avg = averaging(test, oof, y_train, eval_func=mean_squared_error)
    assert result.score <= result_simple_avg.score
def test_experiment_lgb_regressor(tmpdir_name):
    X, y = make_regression_df(n_samples=1024, n_num_features=10, n_cat_features=2,
                              random_state=0, id_column='user_id')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    params = {
        'objective': 'regression',
        'max_depth': 8
    }

    result = run_experiment(params, X_train, y_train, X_test, tmpdir_name)

    assert len(np.unique(result.oof_prediction)) > 5  # making sure prediction is not binarized
    assert len(np.unique(result.test_prediction)) > 5
    assert mean_squared_error(y_train, result.oof_prediction) == result.metrics[-1]

    _check_file_exists(tmpdir_name)
def test_averaging_opt_minimize_with_method():
    X, y = make_regression_df(n_samples=1024)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    oof, test = _make_1st_stage_preds(X_train, y_train, X_test)

    best_single_model = min(mean_squared_error(y_train, oof[0]),
                            mean_squared_error(y_train, oof[1]),
                            mean_squared_error(y_train, oof[2]))

    result1 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False)
    result2 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False,
                            method='Nelder-Mead')
    result3 = averaging_opt(test, oof, y_train, mean_squared_error, higher_is_better=False,
                            method='SLSQP')

    # the default method behaves like SLSQP, while Nelder-Mead lands on different weights
    assert result1.score != result2.score
    assert result1.score == result3.score

    # either way, the optimized blend should be no worse than the best single model
    assert result1.score <= best_single_model
    assert result2.score <= best_single_model
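# The assertions above imply that averaging_opt's default optimizer matches
# method='SLSQP', while Nelder-Mead finds (slightly) different weights, likely
# because it cannot enforce the same constraints. A minimal sketch of such
# weight optimization built on scipy.optimize.minimize (an assumption for
# illustration, not nyaggle's actual implementation):
def _averaging_opt_sketch(oof, y, eval_func, method='SLSQP'):
    from scipy.optimize import minimize

    def objective(weights):
        # blend the 1st-stage out-of-fold predictions with the candidate weights
        blended = np.sum([w * p for w, p in zip(weights, oof)], axis=0)
        return eval_func(y, blended)

    n = len(oof)
    # start from the simple average and constrain the weights to sum to 1
    opt = minimize(objective,
                   x0=np.full(n, 1.0 / n),
                   method=method,
                   constraints={'type': 'eq', 'fun': lambda w: 1.0 - np.sum(w)},
                   bounds=[(0.0, 1.0)] * n)
    return opt.x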