Example #1
0
def smooth_wrapper(random_forest, X_train, y_train, params):
    """Fit a smoothed random forest for one hyper-parameter configuration.

    Parameters
    ----------
    random_forest : fitted forest, passed straight to ``smooth_rf.smooth``
    X_train, y_train : training data forwarded as ``X_trained``/``y_trained``
    params : tuple
        ``(inner_distance, parent_all, no_constraint, initial_lamb,
        class_loss, adam_values)`` where ``adam_values`` is
        ``(alpha, beta_1, beta_2, eps)`` for the Adam optimizer.

    Returns
    -------
    (name, scoring, info, best_oob)
        ``name``: string encoding the configuration; ``scoring``: result of
        ``assess_rf`` on the test set; ``info``: the learned ``lamb`` of the
        smoothed forest; ``best_oob``: minimum of the per-step cost vector.

    NOTE(review): relies on module-level names ``d_style``, ``add_levels``,
    ``levels``, ``X_test``, ``y_test``, ``assess_rf`` — confirm they are
    defined before this is called.
    """
    inner_distance, parent_all, no_constraint, initial_lamb, class_loss, \
        adam_values = params

    alpha, beta_1, beta_2, eps = adam_values
    adam_dict = {"alpha": alpha, "beta_1": beta_1,
                 "beta_2": beta_2, "eps": eps}

    # Bug fix: parents_all was hard-coded to False even though the config
    # tuple carries parent_all and the result name below reports it.
    adam_rf, _, _, c = smooth_rf.smooth(random_forest,
                                        X_trained=X_train,
                                        y_trained=y_train,
                                        no_constraint=no_constraint,
                                        sgd_max_num=10000,
                                        all_trees=False,
                                        initial_lamb_seed=initial_lamb,
                                        parents_all=parent_all,
                                        dist_mat_style=inner_distance,
                                        distance_style=d_style,
                                        class_eps=0.0001,
                                        class_loss=class_loss,
                                        adam=adam_dict,
                                        verbose=False,
                                        levels=levels)

    # best value of the cost trajectory across all SGD steps
    best_oob = np.min(c)

    # "rf" marks the default (random-forest-weighted) lambda initialization
    il = str(initial_lamb) if initial_lamb is not None else "rf"

    name = "element_opt" + "_dist_style:" + d_style + add_levels +\
        "_dist:" + inner_distance +\
        ",parents:" + str(parent_all) +\
        ",constraints:" + str(not no_constraint) +\
        ",initial_lamb:" + il +\
        ",adam_options:" + str(adam_values).replace(", ", "_")

    scoring = assess_rf(adam_rf, X_test, y_test)
    info = adam_rf.lamb

    return name, scoring, info, best_oob
Example #2
0
def check_rf_grow(n_data,
                  n_large,
                  n_draws,
                  reg_or_class="reg",
                  depth_range=np.arange(1, 50),
                  verbose=True,
                  ntree=1,
                  data_set=["microsoft", "knn", "online_news", "splice"],
                  tuning=["resample", "oob", "oracle"],
                  constrained=True,
                  style=["level-base", "element-based"],
                  parents_all=False,
                  batch=["single-tree", "all-trees"],
                  initial_lamb=["rf-init", "random-init"],
                  max_iter=10000,
                  t=1,
                  data_all=None,
                  y_all=None):
    """Compare a base random forest against its smoothed variants over depths.

    For each depth in ``depth_range`` and each of ``n_draws`` draws, fits a
    forest, smooths it with ``smooth_rf`` and scores the base and smoothed
    models on a held-out test set.

    Parameters
    ----------
    n_data, n_large : sizes for the simulated train and tune/test samples
    n_draws : number of repeated draws per depth
    reg_or_class : "reg" or "class" — selects the sklearn model and metric
    data_set, tuning, style, batch, initial_lamb : option strings; when a
        list is passed (the defaults), the first element is used
    data_all, y_all : full data for the real data sets ("online_news",
        "splice", "prgeng"); split 50/50 into train/test on each draw

    Returns
    -------
    score_mat : np.ndarray, shape (2 or 3, n_depth, n_draws)
        row 0 = base forest, row 1 = smoothed; element-based style adds
        row 2 = last iterate of the smoother.
    c_mat : np.ndarray or None
        per-step cost trajectories (element-based style only).

    Raises
    ------
    ValueError : if ``reg_or_class`` is neither "reg" nor "class".
    NameError : if an option string is not one of its allowed values.
        (Bug fix: these errors were previously constructed but never
        raised, so bad options fell through and failed later with
        unrelated UnboundLocalErrors.)
    """
    model_type = None
    if reg_or_class == "reg":
        model_type = sklearn.ensemble.RandomForestRegressor
        scoring = sklearn.metrics.mean_squared_error

    if reg_or_class == "class":
        model_type = sklearn.ensemble.RandomForestClassifier
        scoring = sklearn.metrics.accuracy_score

    if model_type is None:
        raise ValueError("reg_or_class input string is not 'reg' "+\
                         "nor 'class'.")

    depth_iter = list(enumerate(depth_range))
    n_depth = depth_range.shape[0]

    if verbose:
        bar = progressbar.ProgressBar()
        depth_iter = bar(depth_iter)

    # selection parameters: when the default list was passed, use its first
    # entry (isinstance replaces the non-idiomatic ``type(x) == list``)
    if isinstance(data_set, list):
        data_set = data_set[0]
    if isinstance(tuning, list):
        tuning = tuning[0]
    if isinstance(style, list):
        style = style[0]
    if isinstance(batch, list):
        batch = batch[0]
    if isinstance(initial_lamb, list):
        initial_lamb = initial_lamb[0]

    if data_set == "microsoft":
        data_generator = smooth_rf.generate_data
    elif data_set == "knn":
        data_generator = lambda large_n: smooth_rf.generate_data_knn(
            n=large_n, p=np.array([.3, .7]))
    elif data_set in ("online_news", "splice", "prgeng"):
        # bug fix: the NameErrors below were created but never raised
        if tuning == "oracle":
            raise NameError("tuning cannot be oracle for the online_news, splice, " +\
                            "prgeng datasets")
        if data_all is None or y_all is None:
            raise NameError("data_all and y_all must be inserted when using the "+\
                            "online_news, splice, " +\
                            "prgeng datasets")
    else:
        raise NameError("data_set option needs to be 1 of the 3 options")

    # only "resample" tunes on a bootstrap resample; "oob"/"oracle" do not
    resample_input = (tuning == "resample")

    if initial_lamb == "rf-init":
        initial_lamb_seed_f = lambda: None
    elif initial_lamb == "random-init":
        initial_lamb_seed_f = get_random_seed
    else:
        # bug fix: previously fell through and failed later with an
        # UnboundLocalError on initial_lamb_seed_f
        raise NameError("initial_lamb option needs to be 1 of the 2 options")

    if batch == "single-tree":
        all_trees = False
    elif batch == "all-trees":
        all_trees = True
    else:
        raise NameError("batch option needs to be 1 of the 2 options")

    # storage devices
    if style == "level-base":
        score_mat = np.zeros((2, n_depth, n_draws))
        c_mat = None
    elif style == "element-based":
        score_mat = np.zeros((3, n_depth, n_draws))
        c_mat = np.zeros((depth_range.shape[0], n_draws, max_iter + 1))
    else:
        raise NameError("style needs to be 1 of the 2 options")

    for i, max_depth in depth_iter:
        for j in np.arange(n_draws):

            if data_set in ("online_news", "splice", "prgeng"):
                # real data: fresh 50/50 train/test split on every draw
                data, data_test, y, y_test = \
                    sklearn.model_selection.train_test_split(data_all,
                                                             y_all,
                                                             test_size=.5)
                data_tune = None
                y_tune = None
            else:
                # simulated data generation
                all_dat = data_generator(large_n=n_data)
                data, y = all_dat[0], all_dat[1]
                y = y + 100  # shift the response, matching the test/tune sets

                # tune set (oracle tuning only: a fresh large sample)
                if tuning == "oracle":
                    all_dat_tune = data_generator(large_n=n_large)
                    data_tune, y_tune = all_dat_tune[0], all_dat_tune[1]
                    y_tune = y_tune + 100
                    y_tune = y_tune.ravel()
                else:
                    data_tune = None
                    y_tune = None

                # test set
                all_dat_test = data_generator(large_n=n_large)
                data_test, y_test = all_dat_test[0], all_dat_test[1]
                y_test = y_test + 100

            model = model_type(max_depth=max_depth, n_estimators=ntree)
            model_fit = model.fit(data, y)

            if style == "level-base":

                smooth_rf_level = smooth_rf.smooth_all(
                    model_fit,
                    X_trained=data,
                    y_trained=y.ravel(),
                    X_tune=data_tune,
                    y_tune=y_tune,
                    resample_tune=resample_input,
                    no_constraint=not constrained,
                    parents_all=parents_all,
                    verbose=False)
                yhat_test_base = model_fit.predict(data_test)
                score_mat[0, i, j] = scoring(y_test, yhat_test_base)
                yhat_test = smooth_rf_level.predict(data_test)
                score_mat[1, i, j] = scoring(y_test, yhat_test)

            elif style == "element-based":
                smooth_rf_opt, smooth_rf_last, _, c = smooth_rf.smooth(
                    model_fit,
                    X_trained=data,
                    y_trained=y.ravel(),
                    X_tune=data_tune,
                    y_tune=y_tune,
                    resample_tune=resample_input,
                    no_constraint=not constrained,
                    sgd_max_num=max_iter,
                    sgd_t_fix=t,
                    parents_all=parents_all,
                    verbose=False,
                    all_trees=all_trees,
                    initial_lamb_seed=initial_lamb_seed_f(),
                    adam={
                        "alpha": .001,
                        "beta_1": .9,
                        "beta_2": .999,
                        "eps": 1e-8
                    })
                c_mat[i, j, :] = c

                yhat_test_base = model_fit.predict(data_test)
                score_mat[0, i, j] = scoring(y_test, yhat_test_base)
                yhat_test_opt = smooth_rf_opt.predict(data_test)
                score_mat[1, i, j] = scoring(y_test, yhat_test_opt)
                yhat_test_last = smooth_rf_last.predict(data_test)
                score_mat[2, i, j] = scoring(y_test, yhat_test_last)

    return score_mat, c_mat
# Scatter plot of the two features, colored by the (shifted) response class.
ggout = (
    ggplot(data_vis)
    + geom_point(aes(x="x1", y="x2", color="factor(y)"))
    + theme_minimal()
    + labs(x="X1", y="X2", color="value (minus 100)")
)

# Fit a 300-tree random forest classifier on the (data, y) defined above.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=300)
rf_fit = rf.fit(data, y)

# Smooth the fitted forest with cross-entropy ("ce") class loss, tuning on
# out-of-bag observations (no explicit tune set, no resampling).
# Presumably *_opt_ce is the best iterate along the optimization path and
# *_last_ce the final one, with c_ce the cost trajectory — verify against
# smooth_rf.smooth's return contract.
# NOTE(review): this call uses subgrad_max_num / subgrad_t_fix while other
# calls in this file use sgd_max_num / sgd_t_fix — confirm which keywords
# this version of smooth_rf.smooth expects.
smooth_rf_opt_ce, smooth_rf_last_ce, last_ce, c_ce = smooth_rf.smooth(
    rf_fit,
    X_trained=data,
    y_trained=y.ravel(),
    X_tune=None,
    y_tune=None,
    resample_tune=False,  # oob
    no_constraint=False,
    subgrad_max_num=10000,
    subgrad_t_fix=1,
    parents_all=True,
    verbose=True,
    all_trees=False,
    initial_lamb_seed=None,
    class_eps=1e-4,
    class_loss="ce")

smooth_rf_opt_l2, smooth_rf_last_l2, last_l2, c_l2 = smooth_rf.smooth(
    rf_fit,
    X_trained=data,
    y_trained=y.ravel(),
    X_tune=None,
    y_tune=None,
    resample_tune=False,  # oob
    model_type = sklearn.ensemble.RandomForestRegressor

    model = model_type(n_estimators=10)
    model_fit = model.fit(data, y)
    random_forest = model_fit

    max_iter = 10000


    smooth_rf_standard, _ , _, loss_all_standard = \
      smooth_rf.smooth(random_forest=random_forest,
               X_trained=data, y_trained=y,
               X_tune=None, y_tune=None,
               resample_tune=False,
               sgd_max_num=max_iter,
               all_trees=False,
               parents_all=True,
               distance_style="standard",
               verbose=True,
               adam={"alpha": .001, "beta_1": .9,
                     "beta_2": .999,"eps": 1e-8})

    smooth_rf_pytorch, loss_all, loss_min, params_min, best_model, \
        (torch_model, forest_dataset, dataloader) = \
        smooth_rf.smooth_pytorch(random_forest=random_forest,
               X_trained=data, y_trained=y,
               X_tune=None, y_tune=None,
               resample_tune=False,
               sgd_max_num=max_iter,
               all_trees=False,
               parents_all=True,
    n_rf_fit = n_rf.fit(data,y)
    yhat_test_node = n_rf_fit.predict(data_test)


    # smoothed
    rf_base = model(n_estimators = n_estimators)
    rf_fit = rf_base.fit(data,y)
    yhat_test_base = rf_fit.predict(data_test)


    rf_smooth, rf_smooth_b, _, _ = smooth_rf.smooth(
                                          random_forest = rf_fit,
                                          X_trained = data,
                                          y_trained = y,
                                          sgd_max_num = n_steps,
                                          verbose = False,
                                          parents_all = True,
                                          distance_style = "standard",
                                          no_constraint = False,
                                          adam = {"alpha": .001, "beta_1": .9,
                                                  "beta_2": .999,"eps": 1e-8})
    yhat_test_smooth = rf_smooth.predict(data_test)
    yhat_test_smooth_b = rf_smooth_b.predict(data_test)

    # depth_smoothed
    d_rf_smooth, d_rf_smooth_b, _, _ = smooth_rf.smooth(d_rf_fit,
                                          X_trained = data,
                                          y_trained = y,
                                          sgd_max_num = n_steps,
                                          verbose = False,
                                          parents_all = True,