def smooth_wrapper(random_forest, X_train, y_train, params):
    """Run one smoothing configuration against a fitted random forest.

    Unpacks a single hyper-parameter combination, fits a smoothed forest via
    ``smooth_rf.smooth`` (OOB-tuned, single-tree SGD with Adam), and scores it
    on the held-out test set.

    Parameters
    ----------
    random_forest : fitted sklearn random forest to smooth
    X_train, y_train : training data the forest was fit on
    params : tuple of
        (inner_distance, parent_all, no_constraint, initial_lamb,
         class_loss, adam_values) where adam_values is
        (alpha, beta_1, beta_2, eps)

    Returns
    -------
    tuple (name, scoring, info, best_oob):
        name : str label encoding the configuration
        scoring : result of ``assess_rf`` on the test set
        info : learned lambda weights of the smoothed forest
        best_oob : minimum of the per-step cost trace ``c``

    NOTE(review): relies on module-level globals ``d_style``, ``add_levels``,
    ``levels``, ``X_test``, ``y_test``, ``assess_rf`` — confirm they are
    defined before this is called.
    """
    inner_distance, parent_all, no_constraint, initial_lamb, class_loss, \
        adam_values = params

    a, b1, b2, e = adam_values
    adam_dict = {"alpha": a, "beta_1": b1, "beta_2": b2, "eps": e}

    adam_rf, _, _, c = smooth_rf.smooth(
        random_forest,
        X_trained=X_train,
        y_trained=y_train,
        no_constraint=no_constraint,
        sgd_max_num=10000,
        all_trees=False,
        initial_lamb_seed=initial_lamb,
        # BUG FIX: was hard-coded to False, which contradicted the
        # ",parents:<parent_all>" tag written into the run name below —
        # labels for parent_all=True runs were silently mislabeled.
        parents_all=parent_all,
        dist_mat_style=inner_distance,
        distance_style=d_style,
        class_eps=0.0001,
        class_loss=class_loss,
        adam=adam_dict,
        verbose=False,
        levels=levels)

    # best out-of-bag value observed along the optimization trace
    best_oob = np.min(c)

    if initial_lamb is not None:
        il = str(initial_lamb)
    else:
        il = "rf"

    name = "element_opt" + "_dist_style:" + d_style + add_levels +\
        "_dist:" + inner_distance +\
        ",parents:" + str(parent_all) +\
        ",constraints:" + str(not no_constraint) +\
        ",initial_lamb:" + il +\
        ",adam_options:" + str(adam_values).replace(", ", "_")

    scoring = assess_rf(adam_rf, X_test, y_test)
    info = adam_rf.lamb

    return name, scoring, info, best_oob
def check_rf_grow(n_data, n_large, n_draws, reg_or_class="reg",
                  depth_range=np.arange(1, 50), verbose=True, ntree=1,
                  data_set=["microsoft", "knn", "online_news", "splice"],
                  tuning=["resample", "oob", "oracle"],
                  constrained=True,
                  style=["level-base", "element-based"],
                  parents_all=False,
                  batch=["single-tree", "all-trees"],
                  initial_lamb=["rf-init", "random-init"],
                  max_iter=10000, t=1,
                  data_all=None, y_all=None):
    """Sweep random-forest max_depth and compare raw vs. smoothed forests.

    For each depth in ``depth_range`` and each of ``n_draws`` repetitions,
    fits a forest, smooths it (level-based closed form or element-based SGD),
    and records test-set scores.

    Parameters
    ----------
    n_data, n_large : sample sizes for the training and the tune/test draws
    n_draws : repetitions per depth
    reg_or_class : "reg" (MSE scoring) or "class" (accuracy scoring)
    depth_range : array of max_depth values to sweep
    verbose : show a progress bar over depths
    ntree : number of trees per forest
    data_set, tuning, style, batch, initial_lamb : option strings; a list
        default selects its first element (the lists are only read, never
        mutated, so the mutable defaults are safe here)
    constrained : constrain the smoothing weights
    parents_all : passed through to the smoothing routines
    max_iter, t : SGD iteration cap and fixed step size (element-based only)
    data_all, y_all : full dataset, required for the real-data options
        ("online_news", "splice", "prgeng")

    Returns
    -------
    (score_mat, c_mat):
        score_mat : (2 or 3, n_depth, n_draws) array — row 0 base forest,
            row 1 smoothed (optimal), row 2 smoothed (last iterate,
            element-based only)
        c_mat : cost traces (element-based) or None (level-based)

    Raises
    ------
    ValueError : on any invalid option combination.
    """
    model_type = None
    if reg_or_class == "reg":
        model_type = sklearn.ensemble.RandomForestRegressor
        scoring = sklearn.metrics.mean_squared_error
    if reg_or_class == "class":
        model_type = sklearn.ensemble.RandomForestClassifier
        scoring = sklearn.metrics.accuracy_score
    if model_type is None:
        raise ValueError("reg_or_class input string is not 'reg' " +
                         "nor 'class'.")

    depth_iter = list(enumerate(depth_range))
    n_depth = depth_range.shape[0]

    if verbose:
        bar = progressbar.ProgressBar()
        depth_iter = bar(depth_iter)

    # selection parameters: a list default means "take the first option"
    if isinstance(data_set, list):
        data_set = data_set[0]
    if isinstance(tuning, list):
        tuning = tuning[0]
    if isinstance(style, list):
        style = style[0]
    if isinstance(batch, list):
        batch = batch[0]
    if isinstance(initial_lamb, list):
        initial_lamb = initial_lamb[0]

    real_data_sets = ("online_news", "splice", "prgeng")

    if data_set == "microsoft":
        data_generator = smooth_rf.generate_data
    elif data_set == "knn":
        data_generator = lambda large_n: smooth_rf.generate_data_knn(
            n=large_n, p=np.array([.3, .7]))
    elif data_set in real_data_sets:
        # BUG FIX: these NameError(...) expressions were constructed but
        # never raised, so invalid configurations fell through silently
        # (later surfacing as confusing UnboundLocalErrors). Now raised
        # as ValueError, consistent with the reg_or_class check above.
        if tuning == "oracle":
            raise ValueError(
                "tuning cannot be oracle for the online_news, splice, " +
                "prgeng datasets")
        if data_all is None or y_all is None:
            raise ValueError(
                "data_all and y_all must be inserted when using the " +
                "online_news, splice, " +
                "prgeng datasets")
    else:
        raise ValueError("data_set option needs to be 1 of the 3 options")

    resample_input = (tuning == "resample")

    if initial_lamb == "rf-init":
        initial_lamb_seed_f = lambda: None
    elif initial_lamb == "random-init":
        initial_lamb_seed_f = get_random_seed
    else:
        # guard added: otherwise initial_lamb_seed_f is undefined below
        raise ValueError("initial_lamb option needs to be 1 of the 2 options")

    if batch == "single-tree":
        all_trees = False
    elif batch == "all-trees":
        all_trees = True
    else:
        raise ValueError("batch option needs to be 1 of the 2 options")

    # storage devices
    if style == "level-base":
        score_mat = np.zeros((2, n_depth, n_draws))
        c_mat = None
    elif style == "element-based":
        score_mat = np.zeros((3, n_depth, n_draws))
        c_mat = np.zeros((depth_range.shape[0], n_draws, max_iter + 1))
    else:
        raise ValueError("style needs to be 1 of the 2 options")

    for i, max_depth in depth_iter:
        for j in np.arange(n_draws):
            if data_set in real_data_sets:
                data, data_test, y, y_test = \
                    sklearn.model_selection.train_test_split(data_all,
                                                             y_all,
                                                             test_size=.5)
                data_tune = None
                y_tune = None
            else:
                # data generation (y shifted by +100, as elsewhere in file)
                all_dat = data_generator(large_n=n_data)
                data, y = all_dat[0], all_dat[1]
                y = y + 100

                # tune set only needed for oracle tuning
                if tuning == "oracle":
                    all_dat_tune = data_generator(large_n=n_large)
                    data_tune, y_tune = all_dat_tune[0], all_dat_tune[1]
                    y_tune = y_tune + 100
                    y_tune = y_tune.ravel()
                else:
                    data_tune = None
                    y_tune = None

                # test
                all_dat_test = data_generator(large_n=n_large)
                data_test, y_test = all_dat_test[0], all_dat_test[1]
                y_test = y_test + 100

            model = model_type(max_depth=max_depth, n_estimators=ntree)
            model_fit = model.fit(data, y)

            if style == "level-base":
                smooth_rf_level = smooth_rf.smooth_all(
                    model_fit,
                    X_trained=data,
                    y_trained=y.ravel(),
                    X_tune=data_tune,
                    y_tune=y_tune,
                    resample_tune=resample_input,
                    no_constraint=not constrained,
                    parents_all=parents_all,
                    verbose=False)

                yhat_test_base = model_fit.predict(data_test)
                score_mat[0, i, j] = scoring(y_test, yhat_test_base)

                yhat_test = smooth_rf_level.predict(data_test)
                score_mat[1, i, j] = scoring(y_test, yhat_test)

            elif style == "element-based":
                smooth_rf_opt, smooth_rf_last, _, c = smooth_rf.smooth(
                    model_fit,
                    X_trained=data,
                    y_trained=y.ravel(),
                    X_tune=data_tune,
                    y_tune=y_tune,
                    resample_tune=resample_input,
                    no_constraint=not constrained,
                    sgd_max_num=max_iter,
                    sgd_t_fix=t,
                    parents_all=parents_all,
                    verbose=False,
                    all_trees=all_trees,
                    initial_lamb_seed=initial_lamb_seed_f(),
                    adam={"alpha": .001, "beta_1": .9,
                          "beta_2": .999, "eps": 1e-8})

                c_mat[i, j, :] = c

                yhat_test_base = model_fit.predict(data_test)
                score_mat[0, i, j] = scoring(y_test, yhat_test_base)

                yhat_test_opt = smooth_rf_opt.predict(data_test)
                score_mat[1, i, j] = scoring(y_test, yhat_test_opt)

                yhat_test_last = smooth_rf_last.predict(data_test)
                score_mat[2, i, j] = scoring(y_test, yhat_test_last)

    return score_mat, c_mat
ggout = ggplot(data_vis) +\ geom_point(aes(x = "x1",y ="x2", color = "factor(y)")) +\ theme_minimal() +\ labs(x= "X1", y = "X2", color = "value (minus 100)") rf = sklearn.ensemble.RandomForestClassifier(n_estimators=300) rf_fit = rf.fit(data, y) smooth_rf_opt_ce, smooth_rf_last_ce, last_ce, c_ce = smooth_rf.smooth( rf_fit, X_trained=data, y_trained=y.ravel(), X_tune=None, y_tune=None, resample_tune=False, # oob no_constraint=False, subgrad_max_num=10000, subgrad_t_fix=1, parents_all=True, verbose=True, all_trees=False, initial_lamb_seed=None, class_eps=1e-4, class_loss="ce") smooth_rf_opt_l2, smooth_rf_last_l2, last_l2, c_l2 = smooth_rf.smooth( rf_fit, X_trained=data, y_trained=y.ravel(), X_tune=None, y_tune=None, resample_tune=False, # oob
model_type = sklearn.ensemble.RandomForestRegressor model = model_type(n_estimators=10) model_fit = model.fit(data, y) random_forest = model_fit max_iter = 10000 smooth_rf_standard, _ , _, loss_all_standard = \ smooth_rf.smooth(random_forest=random_forest, X_trained=data, y_trained=y, X_tune=None, y_tune=None, resample_tune=False, sgd_max_num=max_iter, all_trees=False, parents_all=True, distance_style="standard", verbose=True, adam={"alpha": .001, "beta_1": .9, "beta_2": .999,"eps": 1e-8}) smooth_rf_pytorch, loss_all, loss_min, params_min, best_model, \ (torch_model, forest_dataset, dataloader) = \ smooth_rf.smooth_pytorch(random_forest=random_forest, X_trained=data, y_trained=y, X_tune=None, y_tune=None, resample_tune=False, sgd_max_num=max_iter, all_trees=False, parents_all=True,
n_rf_fit = n_rf.fit(data,y) yhat_test_node = n_rf_fit.predict(data_test) # smoothed rf_base = model(n_estimators = n_estimators) rf_fit = rf_base.fit(data,y) yhat_test_base = rf_fit.predict(data_test) rf_smooth, rf_smooth_b, _, _ = smooth_rf.smooth( random_forest = rf_fit, X_trained = data, y_trained = y, sgd_max_num = n_steps, verbose = False, parents_all = True, distance_style = "standard", no_constraint = False, adam = {"alpha": .001, "beta_1": .9, "beta_2": .999,"eps": 1e-8}) yhat_test_smooth = rf_smooth.predict(data_test) yhat_test_smooth_b = rf_smooth_b.predict(data_test) # depth_smoothed d_rf_smooth, d_rf_smooth_b, _, _ = smooth_rf.smooth(d_rf_fit, X_trained = data, y_trained = y, sgd_max_num = n_steps, verbose = False, parents_all = True,