def _test_elbs(model):
    """Score *model* on the held-out ELB set and record the abs_99 metric.

    Uses enclosing-scope names: ``elb_data``/``elb_y`` (features/labels),
    ``elb_cv_results`` (accumulator), ``score_util`` and ``_print_stats``.
    """
    # FIX: guard against an empty hold-out set — predicting on a zero-row
    # DMatrix is pointless and scoring it would be meaningless.
    if elb_data.num_row() == 0:
        print("no test elbs found, skipping scoring")
        return
    elb_pred = model.predict(elb_data)
    score = score_util.ScoreReport(elb_y, elb_pred)
    elb_cv_results.append(score.abs_99)
    print(score)
    print(f"elb yield stats: {_print_stats(elb_y)}")
    print(f"elb prediction stats: {_print_stats(elb_pred)}")
def _test_elbs(model):
    """Predict on the ELB hold-out set, record abs_99, and print stats.

    Reads ``elb_data``/``elb_y`` from the enclosing scope and appends the
    abs_99 score to ``elb_cv_results``. Skips entirely when the hold-out
    set is empty.
    """
    if elb_data.num_row() == 0:
        print("no test elbs found, skipping scoring")
        return

    predictions = model.predict(elb_data)
    report = score_util.ScoreReport(elb_y, predictions)
    elb_cv_results.append(report.abs_99)
    print(report)

    # Same two stat lines as before, driven by a small table.
    for label, values in (("elb yield stats", elb_y),
                          ("elb prediction stats", predictions)):
        print(f"{label}: {_print_stats(values)}")
def _eval(pred, d: xgb.DMatrix):
    """Custom xgboost eval metric: abs_99 scaled to an integer (x100).

    Returns the ``(name, value)`` pair xgboost expects from a feval.
    """
    report = score_util.ScoreReport(d.get_label(), pred)
    scaled = round(report.abs_99 * 100, 0)
    return '3std_100x', int(scaled)
# NOTE(review): this leading `return` is the tail of a nested `_eval` feval
# helper whose `def` line is outside this fragment; '3std_100x' is the metric
# name xgboost displays (abs_99 scaled x100 as an int).
return '3std_100x', int(round(_scr.abs_99 * 100, 0))
print(f"training xgb model")
# Train with early stopping; per-round metric values collect in `eval_result`
# and `_eval` supplies the custom '3std_100x' metric.
model: xgb.Booster = xgb.train(
    default_xgb_params, train,
    num_boost_round=xgb_n_rounds,
    early_stopping_rounds=xgb_early_stopping,
    evals=eval_list,
    evals_result=eval_result,
    feval=_eval,
    verbose_eval=verbose_eval)
# Score the held-out set using only the trees up to the best iteration.
predictions = model.predict(test, ntree_limit=model.best_ntree_limit)
score = score_util.ScoreReport(test.get_label(), predictions)
# Accumulate results for grid-search parameter set `i` (outer-scope state).
_results = grid_cv_results[i]
_results.add(score.abs_99,
             iteration=model.best_iteration,
             mean=score.abs_mean,
             std_dev=score.abs_std,
             y_min=np.min(train.get_label()),
             y_max=np.max(train.get_label()),
             pred_min=np.min(predictions),
             pred_max=np.max(predictions))
# NOTE(review): `{i + 1} of {len(cv_params) - 1}` prints a 1-based counter
# against a count reduced by one — looks off by one; confirm intent.
print(f""" GRID SEARCH RESULT: ({curr_model_training} of total: {total_model_trainings}) {test_run_idx + 1}, {idx + 1}, {i + 1} of {len(cv_params) - 1}, {p} training yield stats: {_print_stats(train.get_label())} pred yield stats: {_print_stats(predictions)}""")
# Incremental training loop: grow the model by `n_threads` estimators per
# epoch until the score improvement drops below `train_tol` (after at least
# `epochs_min` epochs) or `epochs_max` is reached.
epoch = 0
epoch_scores: List[score_util.ScoreReport] = []
test_diff = float('inf')  # "no previous epoch" sentinel; never ends epoch 0 early
while epoch < epochs_max:
    print(f"model current estimators: {model.n_estimators}")
    # Grow by one batch of trees; presumably relies on warm-start/refit
    # semantics of the estimator — TODO confirm.
    n_estimators = model.n_estimators + n_threads
    print(f"setting estimators: {n_estimators}")
    model.n_estimators = n_estimators
    print("fitting model")
    model.fit(train, train_label)
    print("scoring")
    predictions = model.predict(test)
    score = score_util.ScoreReport(test_label, predictions, store_predictions=False)
    print(f""" EPOCH RESULT: (training {curr_model_training} of total: {total_model_trainings}) epoch: {epoch} n_estimators: {n_estimators}""")
    print(f"abs_99: {score.abs_99:.2f}")
    mem_util.print_mem_usage()
    if epoch > 0:
        # Convergence signal: drop in abs_std_3 relative to the prior epoch.
        # NOTE(review): `epoch_scores` is never appended in the visible code,
        # so this index would fail — the append likely lives in truncated
        # lines; verify.
        test_diff = epoch_scores[epoch - 1].abs_std_3 - score.abs_std_3
        print(f"score diff: {test_diff:.2f}")
    epoch += 1
    if test_diff < train_tol and epoch >= epochs_min:
        # NOTE(review): no `break` is visible after this message — the loop
        # body appears truncated here; confirm the early-exit actually stops.
        print(f"ending early. test diff: {test_diff:.2f}, epochs: {epoch}")
# def _run_dummies(): column_categories = categorical_util.get_categories_lookup(dtrain) categorical_util.set_categories(dtrain, column_categories) categorical_util.set_categories(dtest, column_categories) dtrain_y = dtrain.pop('Dry_Yield') dtrain = categorical_util.encode_dummies(df) dummy_cols = dtrain.columns print(dummy_cols) dtrain = dtrain.to_coo() dtest_y = dtest.pop('Dry_Yield') dtest = pandas.get_dummies(dtest, sparse=True, drop_first=True, dummy_na=False, prefix_sep='__DUMMY__').to_sparse().to_coo() print(f"dmatrix sizes: {dtrain.shape[1]}, {dtest.shape[1]}") dtrain: xgb.DMatrix = xgb.DMatrix(dtrain, dtrain_y) dtest: xgb.DMatrix = xgb.DMatrix(dtest, label=dtest_y) model: xgb.Booster = xgb.train({'max_depth': 2}, dtrain, num_boost_round=2) pred = model.predict(dtest) scr = score_util.ScoreReport(dtest_y, pred) print(scr) # _run_dummies()
# Model bake-off: dummy-encode the frame, take the first GroupKFold split
# (grouped by year id to avoid leakage across seasons — TODO confirm
# `df_year_id` is the year grouping), then fit and score four baselines.
column_categories = categorical_util.encode_categories(data)
dummy_enc = categorical_util.DummyEncoder(data.columns, column_categories)
print("Fitting dummy enc")
# FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
# 1.0; to_numpy() is the documented drop-in replacement.
data: np.ndarray = dummy_enc.fit_transform(data.to_numpy())
kf_outer = GroupKFold()
split = next(kf_outer.split(data, groups=df_year_id))
train_idx, test_idx = split
train, train_y = data[train_idx], data_label[train_idx]
test, test_y = data[test_idx], data_label[test_idx]
model = XGBRegressor(max_depth=5, n_estimators=100, silent=False, n_jobs=2)
model.fit(train, train_y)
scr = score_util.ScoreReport(test_y, model.predict(test))
print(scr)
model = RandomForestRegressor(verbose=99, n_estimators=50, n_jobs=2)
model.fit(train, train_y)
scr = score_util.ScoreReport(test_y, model.predict(test))
print(scr)
model = ExtraTreesRegressor(verbose=99, n_estimators=50, n_jobs=2)
model.fit(train, train_y)
scr = score_util.ScoreReport(test_y, model.predict(test))
print(scr)
model = SVR(degree=5, verbose=99)
model.fit(train, train_y)
scr = score_util.ScoreReport(test_y, model.predict(test))
# NOTE(review): unlike the three models above, the SVR score is never
# printed — possibly a missing print(scr), or the file is truncated here.
def _score_cv(est: GridSearchCV, X, y):
    """Scorer for a fitted grid search: stash each distinct best estimator
    into the enclosing ``cv_results`` list, then return the abs_99 score of
    its predictions on ``(X, y)``."""
    best = est.best_estimator_
    if best not in cv_results:
        cv_results.append(best)
    report = score_util.ScoreReport(y, est.predict(X))
    return report.abs_99
def _score_grid_search(est: Pipeline, X, y):
    """Grid-search scorer: predict with the fitted pipeline, log current
    memory usage, and return the abs_99 score on ``(X, y)``."""
    preds = est.predict(X)
    _print_mem_usage()
    report = score_util.ScoreReport(y, preds)
    return report.abs_99