#d2 = average_depth(smooth_rf_opt, data) #np.all(d1 == d2) n_trees = len(random_forest.estimators_) n_obs = data.shape[0] depth = np.zeros(n_obs) for t in random_forest.estimators_: d_path = t.decision_path(data) depth = depth + np.array(d_path.sum(axis=1)).ravel() return depth / n_trees # start of analysis data, y = smooth_rf.generate_data(large_n=650) data_vis = pd.DataFrame(data={ "x1": data[:, 0], "x2": data[:, 1], "y": y }, columns=["x1", "x2", "y"]) ggout = ggplot(data_vis) +\ geom_point(aes(x = "x1",y ="x2", color = "factor(y)")) +\ theme_minimal() +\ labs(x= "X1", y = "X2", color = "value (minus 100)") rf = sklearn.ensemble.RandomForestClassifier(n_estimators=300) rf_fit = rf.fit(data, y)
n_sim = 100 depth_range = np.arange(2, 50, 2) np.random.seed(100) verbose = True # n_tree = 10 n_tree = 10 score_mat = np.zeros((n_sim, depth_range.shape[0])) if verbose: bar = progressbar.ProgressBar() sim_iter = bar(np.arange(n_sim)) for s_idx in sim_iter: all_dat_tune = smooth_rf.generate_data(large_n=1000) data_all, y_all = all_dat_tune[0], all_dat_tune[1] #data_all = data_all + 100 y_all = y_all.ravel() data_train, data_test, y_train, y_test = \ sklearn.model_selection.train_test_split(data_all, y_all, test_size = .5) score_vec = regression_prep(data_train, data_test, y_train, y_test, depth_range=np.arange(2, 50, 2),
def test_update_rf():
    """
    test update_rf

    Fits a 2-tree random forest (first a classifier, then a regressor) on
    generated data, pushes it through smooth_rf.update_rf together with a
    freshly constructed SoftmaxTreeFit model, and compares the predictions
    of the updated forest with those of the base forest on a large
    generated test set.
    """
    data, y = smooth_rf.generate_data(650)
    data_test, y_test = smooth_rf.generate_data(10000)

    def _update_with_fresh_model(forest):
        # Same pipeline for both forest flavours: numpy prep -> dataset ->
        # loader -> torch model -> update_rf.
        (y_all, Gamma, eta, weights_all, t_idx_vec,
         one_d_dict, two_d_dict, lamb_dim, num_classes) = \
            smooth_rf.pytorch_numpy_prep(forest,
                                         X_trained=data,
                                         y_trained=y,
                                         distance_style="standard",
                                         parents_all=True,
                                         verbose=False,
                                         train_only=True)

        forest_dataset = smooth_rf.ForestDataset(y_all, Gamma, eta,
                                                 weights_all, t_idx_vec,
                                                 one_d_dict, two_d_dict,
                                                 lamb_dim)

        # doesn't have to go through all trees in 1 iteration
        # (the loader is constructed but not iterated in this test)
        dataloader = smooth_rf.DataLoader(
            dataset=forest_dataset,
            sampler=RandomSampler(forest_dataset, replacement=True))

        # init=100: presumably yields the "really weak weights" mentioned in
        # the assertion messages below -- TODO confirm against SoftmaxTreeFit
        torch_model = smooth_rf.SoftmaxTreeFit(
            num_vars=len(one_d_dict) + len(two_d_dict),
            lamb_dim=lamb_dim,
            init=100)

        # built for parity with the training setup; neither is used here
        criterion = smooth_rf.weighted_l2
        optimizer = torch.optim.Adam(torch_model.parameters())

        return smooth_rf.update_rf(forest,
                                   pytorch_model=torch_model,
                                   X_trained=data,
                                   y_trained=y,
                                   parents_all=True,
                                   distance_style="standard",
                                   verbose=False)

    # classification
    base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=2)
    base_clf = base_clf.fit(data, y)
    smooth_clf = _update_with_fresh_model(base_clf)

    base_labels = base_clf.predict(data_test)
    base_probs = base_clf.predict_proba(data_test)
    smooth_labels = smooth_clf.predict(data_test)
    smooth_probs = smooth_clf.predict_proba(data_test)

    assert np.all(base_labels == smooth_labels), (
        "update of random forest with really weak weights should produce "
        "the same predictions are the base rf")

    assert np.any(base_probs != smooth_probs), (
        "update of random forest with really weak weights should produce "
        "slightly different probabilities")

    # regression
    base_reg = sklearn.ensemble.RandomForestRegressor(n_estimators=2)
    base_reg = base_reg.fit(data, y)
    smooth_reg = _update_with_fresh_model(base_reg)

    base_values = base_reg.predict(data_test)
    smooth_values = smooth_reg.predict(data_test)

    assert np.allclose(base_values, smooth_values), (
        "update of random forest with really weak weights should produce "
        "the really close predicted values are the base rf")

    assert np.any(base_values != smooth_values), (
        "update of random forest with really weak weights should produce "
        "slightly different predicted values")
def pull_data(data_set, path, reg_or_class="reg"):
    """
    create / pull data depending upon data requested (and data type)

    Arguments:
    ----------
    data_set : string
        name of dataset: "microsoft", "moon", "prgeng" or "titanic"
        (the historical spelling "titantic" is still accepted)
    path : string
        prefix prepended to "data/..." when a file has to be read (only used
        by "prgeng" and "titanic")
    reg_or_class : string
        either "reg" or "class" - "prgeng" only supports "reg" and "titanic"
        only supports "class"; for the generated data sets "reg" shifts y
        by 100

    Returns:
    --------
    X : numpy array
        data's X features
    y : numpy array
        data's y values

    Raises:
    -------
    ValueError
        if reg_or_class does not match the requested file-based data set, or
        if data_set is not one of the known names
    """
    if data_set == "microsoft":
        X, y = smooth_rf.generate_data(large_n=650 * 2)
        if reg_or_class == "reg":
            y = y + 100
        return X, y

    if data_set == "moon":
        X, y = sklearn.datasets.make_moons(n_samples=350 * 2, noise=.3)
        if reg_or_class == "reg":
            y = y + 100
        return X, y

    if data_set == "prgeng":
        # validate before touching the disk; the exception used to be built
        # but never raised
        if reg_or_class != "reg":
            raise ValueError("must use 'reg' with 'prgeng' dataset")

        data_all = pd.read_csv(path + "data/prgeng/prgeng.txt", sep=" ")
        y_all = data_all.pop("wageinc")
        # np.asarray(...).ravel() instead of the deprecated Series.ravel()
        return np.array(data_all), np.asarray(y_all).ravel()

    if data_set in ("titanic", "titantic"):
        if reg_or_class != "class":
            raise ValueError("must use 'class' with 'titanic' dataset")

        data_train = pd.read_csv(path + "data/titanic/titanic3.csv")
        for unused_col in ("cabin", "name", "ticket",
                           "body", "boat", "home.dest"):
            data_train.pop(unused_col)

        # passenger class is treated as a category, not a number
        data_train["pclass"] = data_train["pclass"].apply(str)

        # Filling missing Age values with mean
        data_train["age"] = data_train["age"].fillna(data_train["age"].mean())
        # Filling missing Embarked values with most common value
        data_train["embarked"] = data_train["embarked"].fillna(
            data_train["embarked"].mode()[0])

        # one-hot encode every text column; the dtype test covers both the
        # classic object dtype and pandas' dedicated string dtypes
        text_cols = [col for col in data_train.columns
                     if pd.api.types.is_object_dtype(data_train[col])
                     or pd.api.types.is_string_dtype(data_train[col])]
        for col in text_cols:
            for_dummy = data_train.pop(col)
            # explicit integer dtype: a bool default would turn X into an
            # object array once mixed with the numeric columns
            dummies = pd.get_dummies(for_dummy, prefix=col, dtype=np.uint8)
            data_train = pd.concat([data_train, dummies], axis=1)

        data_train = data_train.dropna()
        y_all = data_train.pop("survived")
        return np.array(data_train), np.asarray(y_all).ravel()

    raise ValueError("unknown data_set %r" % (data_set,))
import sklearn.datasets import sklearn.metrics base_error = [] smooth_error = [] smooth_error2 = [] for sim in np.arange(2): #np.arange(20): print("sim", sim) # data, y = sklearn.datasets.make_moons(n_samples=350, noise=.3) # data_test, y_test = sklearn.datasets.make_moons(10000, noise=.3) # model_type = sklearn.ensemble.RandomForestClassifier data, y = smooth_rf.generate_data(650) y = y + 100 data_test, y_test = smooth_rf.generate_data(10000) y_test = y_test + 100 model_type = sklearn.ensemble.RandomForestRegressor model = model_type(n_estimators=10) model_fit = model.fit(data, y) random_forest = model_fit max_iter = 10000 smooth_rf_standard, _ , _, loss_all_standard = \
def pull_data(data_set, path="", reg_or_class="reg"):
    """
    create / pull data depending upon data requested (and data type)

    Arguments:
    ----------
    data_set : string
        name of dataset: "microsoft", "moon", "prgeng" or "titanic"
        (the historical spelling "titantic" is still accepted)
    path : string
        location to data folder (only if needed - reads in data); it is
        prepended as-is to "data/...", so include a trailing separator
    reg_or_class : string
        either "reg" or "class" - determines which random forest model is
        being built ("prgeng" only supports "reg", "titanic" only supports
        "class")

    Returns:
    --------
    X : numpy array
        data's X features
    y : numpy array
        data's y values

    Raises:
    -------
    ValueError
        if reg_or_class does not match the requested file-based data set, or
        if data_set is not one of the known names
    """
    if data_set == "microsoft":
        X, y = smooth_rf.generate_data(large_n=650 * 2)
        if reg_or_class == "reg":
            y = y + 100
        return X, y

    if data_set == "moon":
        X, y = sklearn.datasets.make_moons(n_samples=350 * 2, noise=.3)
        if reg_or_class == "reg":
            y = y + 100
        return X, y

    if data_set == "prgeng":
        # validate before touching the disk; the exception used to be built
        # but never raised
        if reg_or_class != "reg":
            raise ValueError("must use 'reg' with 'prgeng' dataset")

        data_all = pd.read_csv(path + "data/prgeng/prgeng.txt", sep=" ")
        y_all = data_all.pop("wageinc")
        # np.asarray(...).ravel() instead of the deprecated Series.ravel()
        return np.array(data_all), np.asarray(y_all).ravel()

    if data_set in ("titanic", "titantic"):
        if reg_or_class != "class":
            raise ValueError("must use 'class' with 'titanic' dataset")

        data_train = pd.read_csv(path + "data/titanic/titanic3.csv")
        for unused_col in ("cabin", "name", "ticket",
                           "body", "boat", "home.dest"):
            data_train.pop(unused_col)

        # passenger class is treated as a category, not a number
        data_train["pclass"] = data_train["pclass"].apply(str)

        # Filling missing Age values with mean
        data_train["age"] = data_train["age"].fillna(data_train["age"].mean())
        # Filling missing Embarked values with most common value
        data_train["embarked"] = data_train["embarked"].fillna(
            data_train["embarked"].mode()[0])

        # one-hot encode every text column; the dtype test covers both the
        # classic object dtype and pandas' dedicated string dtypes
        text_cols = [col for col in data_train.columns
                     if pd.api.types.is_object_dtype(data_train[col])
                     or pd.api.types.is_string_dtype(data_train[col])]
        for col in text_cols:
            for_dummy = data_train.pop(col)
            # explicit integer dtype: a bool default would turn X into an
            # object array once mixed with the numeric columns
            dummies = pd.get_dummies(for_dummy, prefix=col, dtype=np.uint8)
            data_train = pd.concat([data_train, dummies], axis=1)

        data_train = data_train.dropna()
        y_all = data_train.pop("survived")
        return np.array(data_train), np.asarray(y_all).ravel()

    raise ValueError("unknown data_set %r" % (data_set,))