def run_experiment(
        model_name: str, get_data: callable, compute_permutation: bool,
        save_results: bool, model, exp_results_path):
    """Fit ``model`` on a single train/test split and collect its metrics.

    The data returned by ``get_data`` is split with ``train_test_split``
    (test fraction ``VAL_RATIO``), passed through the module-level
    ``preprocessing_pipeline`` (fitted on the training part only), and the
    columns selected by ``get_num_cols`` are binned with a ``BinMapper``
    fitted on the training part.

    Args:
        model_name: Label stored under the ``model`` key of the results.
        get_data: Zero-argument callable returning ``(X, y)``.
        compute_permutation: When true, ``model.compute_fi_permutation`` is
            evaluated on both the train and the test split; otherwise the
            permutation importances are filled with ``nan`` per column.
        save_results: Currently unused (persisting results is disabled).
        model: Estimator exposing ``fit``, ``predict``,
            ``compute_fi_permutation``, ``compute_feature_importance`` and a
            ``trees`` collection whose items have ``n_leaves``.
        exp_results_path: Currently unused (persisting results is disabled).

    Returns:
        dict with the keys ``model``, ``ntrees``, ``leaves``, ``nleaves``,
        ``logloss``, ``gain``, ``permutation_train`` and ``permutation_test``.
        ``logloss`` is ``nan`` unless ``y`` has exactly two distinct values.
    """
    X, y = get_data()
    # NOTE(review): presumably numpy's global seed, which makes the split
    # below reproducible -- confirm against the file's imports.
    seed(7)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=VAL_RATIO)

    # Fit the preprocessing on the training part only, then apply to both.
    preprocessing_pipeline.fit(X_train)
    X_train = preprocessing_pipeline.transform(X_train)
    X_test = preprocessing_pipeline.transform(X_test)

    print("binning data")
    num_cols = get_num_cols(X_train.dtypes)
    bin_mapper = BinMapper(max_bins=256, random_state=42)
    X_train.loc[:, num_cols] = bin_mapper.fit_transform(
        X_train.loc[:, num_cols].values)
    X_test.loc[:, num_cols] = bin_mapper.transform(
        X_test.loc[:, num_cols].values)
    original_dtypes = X_train.dtypes

    model.fit(X_train, y_train)
    test_prediction = model.predict(X_test)

    if compute_permutation:
        permutation_train = model.compute_fi_permutation(
            X_train, y_train).to_dict()
        permutation_test = model.compute_fi_permutation(
            X_test, y_test).to_dict()
    else:
        # Key the placeholders by column name (the index of ``dtypes``);
        # iterating the Series itself would yield the dtype values instead.
        permutation_train = {col: nan for col in original_dtypes.index}
        permutation_test = dict(permutation_train)

    is_classification = len(unique(y)) == 2
    if is_classification:
        df = DataFrame()
        df['p'] = test_prediction
        # Pair labels with predictions by position. Assigning ``y_test``
        # directly would align on its (shuffled) index labels and silently
        # mismatch or NaN-out rows.
        df['y'] = Series(y_test).values
        df = df[df.p.notna()]
        logloss = log_loss(df['y'], df['p'])
    else:
        logloss = nan

    fi = Series(model.compute_feature_importance(method='gain'))
    fi = normalize_series(fi).to_dict()

    leaves = [tree.n_leaves for tree in model.trees]
    results = dict(
        model=f"{model_name}",
        ntrees=len(model.trees),
        leaves=leaves,
        nleaves=sum(leaves),
        logloss=logloss,
        gain=fi,
        permutation_train=permutation_train,
        permutation_test=permutation_test,
    )
    # NOTE: persisting results is intentionally disabled; to re-enable:
    # if save_results:
    #     DataFrame(Series(results)).T.to_csv(exp_results_path)
    print(logloss)
    return results
def bin_numeric_features(x_train, x_test, contains_num_features):
    """Bin the selected columns of both frames with one shared ``BinMapper``.

    When ``contains_num_features`` is falsy the inputs are returned untouched.
    Otherwise the columns picked by ``get_num_cols(x_train.dtypes)`` are
    overwritten in place: the mapper (``max_bins=256``) is fitted on the
    training values and then applied to the test values.

    Returns:
        The ``(x_train, x_test)`` pair (the same objects that were passed in).
    """
    # Guard clause: nothing to bin.
    if not contains_num_features:
        return x_train, x_test

    print("binning data")
    numeric_columns = get_num_cols(x_train.dtypes)
    mapper = BinMapper(max_bins=256, random_state=42)

    # Fit on the training split only, then reuse the same bins for the test split.
    train_values = x_train.loc[:, numeric_columns].values
    x_train.loc[:, numeric_columns] = mapper.fit_transform(train_values)

    test_values = x_test.loc[:, numeric_columns].values
    x_test.loc[:, numeric_columns] = mapper.transform(test_values)

    return x_train, x_test
y = a * (X['X1'] > 0.5) + (10 - a) * X['X2'].isin(list(range(k1 // 2))) + random.random( nrows) * sigma return X, y if __name__ == '__main__': MULTIPLE_EXPERIMENTS = True KFOLD = False ONE_HOT = False COMPUTE_PERMUTATION = True RESULTS_DIR = Path("k_50_sigma_5_x1_num_for_paper_k_200_sigma_20_nrows100/") REGRESSION = True x, y = get_x_y(2) contains_num_features = len(get_num_cols(x.dtypes)) > 0 pp = get_preprocessing_pipeline if contains_num_features else get_preprocessing_pipeline_only_cat predictors = GBM_REGRESSORS if REGRESSION else GBM_CLASSIFIERS config = Config( multiple_experimens=MULTIPLE_EXPERIMENTS, n_experiments=10,#100 kfold_flag=KFOLD, compute_permutation=COMPUTE_PERMUTATION, save_results=True, one_hot=ONE_HOT, contains_num_features=contains_num_features, seed=SEED, kfolds=KFOLDS, predictors=predictors, columns_to_remove=[],
# Benchmark: fit one gradient-boosting regressor, report fit time, test MSE
# and prediction time, then plot the first tree.
np.random.seed(3)

# Pick the estimator class from the FAST / KFOLD switches.
if FAST:
    model = (FastCartGradientBoostingRegressorKfold if KFOLD
             else FastCartGradientBoostingRegressor)
else:
    model = (CartGradientBoostingRegressorKfold if KFOLD
             else CartGradientBoostingRegressor)
reg = model(max_depth=3)

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() differences can.
# The first interval covers data loading, splitting, optional binning and fit.
start = time.perf_counter()
X, y = get_x_y_boston()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)
if FAST:
    # The FAST estimators are fed binned features: fit the bins on the
    # training split and reuse them for the test split.
    num_cols = get_num_cols(X.dtypes)
    bin_mapper = BinMapper(max_bins=256, random_state=42)
    X_train.loc[:, num_cols] = bin_mapper.fit_transform(
        X_train.loc[:, num_cols].values)
    X_test.loc[:, num_cols] = bin_mapper.transform(
        X_test.loc[:, num_cols].values)
reg.fit(X_train, y_train)
end = time.perf_counter()
print(end - start)

# The second interval covers prediction on the test split plus the MSE.
start = time.perf_counter()
print(f"mse is {mean_squared_error(y_test, reg.predict(X_test))}")
end = time.perf_counter()
print(end - start)

tree_vis = TreeVisualizer()
tree_vis.plot(reg.trees[0])