Ejemplo n.º 1
0
def _ada_boost_regression_train(table,
                                feature_cols,
                                label_col,
                                max_depth=3,
                                n_estimators=50,
                                learning_rate=1.0,
                                loss='linear',
                                random_state=None):
    """Fit an AdaBoost regressor on ``table`` and package it as a model dict.

    The base learner is a ``DecisionTreeRegressor`` limited to ``max_depth``.

    Args:
        table: Input data; indexed by ``label_col`` for the target and handed
            to ``check_col_type`` together with ``feature_cols``.
        feature_cols: Feature column names.
        label_col: Name of the target column.
        max_depth: Maximum depth of each base decision tree.
        n_estimators: Forwarded to ``AdaBoostRegressor``.
        learning_rate: Forwarded to ``AdaBoostRegressor``.
        loss: Forwarded to ``AdaBoostRegressor``.
        random_state: Forwarded to ``AdaBoostRegressor``.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted regressor, its
        scikit-learn parameters, the call parameters, a rendered report under
        ``'_repr_brtc_'`` and a ``feature_importance_table`` DataFrame.
    """
    feature_names, x_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    base_estimator = DecisionTreeRegressor(max_depth=max_depth)
    # Everything after the base estimator is keyword-only in current
    # scikit-learn releases, so these hyperparameters are passed by name.
    regressor = AdaBoostRegressor(base_estimator,
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  loss=loss,
                                  random_state=random_state)

    regressor.fit(x_train, y_train)

    # Read once: the same values feed both the params dict and the table.
    feature_importance = regressor.feature_importances_

    model = _model_dict('ada_boost_regression_model')
    sklearn_params = regressor.get_params()
    model['parameters'] = sklearn_params
    model['regressor'] = regressor
    model['params'] = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss,
        'random_state': random_state
    }

    fig_feature_importance = _plot_feature_importance(feature_names, regressor)
    params_md = dict2MD(sklearn_params)

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## AdaBoost Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=params_md)))

    model['_repr_brtc_'] = rb.get()
    model['feature_importance_table'] = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_names, feature_importance)],
        columns=['feature_name', 'importance'])
    return {'model': model}
Ejemplo n.º 2
0
    def test_parameters(self):
        """Check that ``Model`` exposes scikit-learn's hyperparameter names.

        For every supported algorithm a ``Model`` is built from a parameter
        dict and a reference scikit-learn estimator is built from the *same*
        dict, so the two can never drift apart. Only the parameter names
        returned by ``get_params()`` are compared; values are not checked.
        """
        # (Model algorithm name, constructor parameters, reference estimator)
        cases = [
            ("PlsRegression",
             {"n_components": 20, "scale": False, "max_iter": 200},
             PLSRegression),
            ("RandomForest",
             {"n_estimators": 200, "max_depth": 50, "min_samples_split": 10},
             RandomForestRegressor),
            ("KNN",
             {"n_neighbors": 10, "weights": "distance",
              "algorithm": "ball_tree"},
             KNeighborsRegressor),
            ("SVR",
             {"kernel": "poly", "degree": 5, "coef0": 1},
             SVR),
            ("AdaBoost",
             {"n_estimators": 150, "learning_rate": 1.2, "loss": "square"},
             AdaBoostRegressor),
            ("Bagging",
             {"n_estimators": 50, "max_samples": 1.5, "max_features": 2},
             BaggingRegressor),
            ("lasso",
             {"alpha": 1.5, "max_iter": 500, "tol": 0.004},
             Lasso),
        ]

        for algorithm, parameters, reference_cls in cases:
            model = Model(algorithm=algorithm, parameters=parameters)
            expected_names = reference_cls(**parameters).get_params()

            for name in model.model.get_params():
                self.assertIn(
                    name, expected_names,
                    "{}: unexpected parameter {!r}".format(algorithm, name))
Ejemplo n.º 3
0
class _AdaBoostRegressorImpl:
    """Wrapper that delegates to ``SKLModel`` while remembering the caller's
    original ``base_estimator``.

    ``self._hyperparams`` always holds the ``base_estimator`` exactly as the
    caller supplied it; the ``_FitSpecProxy``-wrapped form is only ever handed
    to ``SKLModel``.
    """

    def __init__(
        self,
        base_estimator=None,
        *,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        """Build the wrapped model.

        A non-``None`` ``base_estimator`` is wrapped in ``_FitSpecProxy``
        before being passed to ``SKLModel``; the remaining hyperparameters
        are forwarded unchanged.
        """
        if base_estimator is None:
            estimator_impl = None
        else:
            estimator_impl = _FitSpecProxy(base_estimator)

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        # Keep the caller's operator (not the proxy) for get_params().
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        """Return the wrapped model's parameters, with ``base_estimator``
        replaced by the operator originally supplied by the caller."""
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        When ``X`` is a DataFrame and a base estimator was supplied, the
        wrapped model is rebuilt so that the base estimator is preceded by a
        transformer that turns its input back into a DataFrame carrying
        ``X``'s column names.
        """
        base_estimator = self._hyperparams["base_estimator"]
        # Skip the wrapping when there is no base estimator to chain onto.
        if isinstance(X, pd.DataFrame) and base_estimator is not None:
            # Capture only the column labels so the fitted model does not
            # keep the whole training frame alive through the closure.
            columns = X.columns
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=columns),
                inverse_func=None,
                check_inverse=False,
            )
            # Work on a copy: mutating self._hyperparams here would make
            # get_params() return the proxy and would nest one more
            # transformer on every subsequent fit() call.
            fit_hyperparams = dict(self._hyperparams)
            fit_hyperparams["base_estimator"] = _FitSpecProxy(
                feature_transformer >> base_estimator)
            self._wrapped_model = SKLModel(**fit_hyperparams)
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        """Delegate scoring to the wrapped model."""
        return self._wrapped_model.score(X, y, sample_weight)
Ejemplo n.º 4
0
class _AdaBoostRegressorImpl:
    """Wrapper around ``SKLModel`` that accepts a Lale individual operator
    (or any other object) as ``base_estimator``."""

    def __init__(
        self,
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        """Resolve ``base_estimator`` to its underlying implementation and
        build the wrapped model.

        Raises:
            ValueError: if ``base_estimator`` is a Lale operator that is not
                an individual operator.
        """
        resolved = base_estimator
        if isinstance(base_estimator, lale.operators.Operator):
            if not isinstance(base_estimator, lale.operators.IndividualOp):
                raise ValueError(
                    "If base_estimator is a Lale operator, it needs to be an individual operator. "
                )
            impl = base_estimator._impl_instance()
            # Prefer the implementation's own wrapped model when it has one.
            resolved = getattr(impl, "_wrapped_model", None)
            if resolved is None:
                resolved = impl

        # The stored hyperparameters keep the operator as the caller gave it;
        # only SKLModel receives the resolved implementation.
        self._hyperparams = {
            "base_estimator": base_estimator,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(
            **{**self._hyperparams, "base_estimator": resolved})

    def get_params(self, deep=True):
        """Return the wrapped model's parameters with ``base_estimator`` set
        to the operator originally passed in."""
        params = self._wrapped_model.get_params(deep=deep)
        # Report the lale operator rather than the underlying implementation.
        params.update(base_estimator=self._hyperparams["base_estimator"])
        return params

    def fit(self, X, y=None):
        """Fit the wrapped model (omitting ``y`` when it is ``None``) and
        return ``self``."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        model = self._wrapped_model
        return model.predict(X)

    def score(self, X, y, sample_weight=None):
        """Delegate scoring to the wrapped model."""
        model = self._wrapped_model
        return model.score(X, y, sample_weight)
Ejemplo n.º 5
0
def ada_boost(df, significant_cols, target, cat_cols, num_cols):
    """Tune, cross-validate and evaluate an AdaBoost regressor on ``df``.

    The data is split 90/10 into train and hold-out sets. Categorical columns
    are one-hot encoded (first level dropped) and numeric columns are
    standardised, both fitted on the training part only. A grid search picks
    the hyperparameters, after which the best estimator is scored by 3-fold
    cross-validation on the training set and once on the hold-out set.

    Args:
        df: Source DataFrame.
        significant_cols: Columns of ``df`` used as features.
        target: Name of the target column.
        cat_cols: Subset of ``significant_cols`` to one-hot encode.
        num_cols: Subset of ``significant_cols`` to standardise.

    Returns:
        Tuple ``(r2, rmse, r2_variance, rmse_variance, r2_validation,
        rmse_validation, params)``: mean CV R2, mean CV RMSE, their sample
        variances across folds, hold-out R2, hold-out RMSE, and the
        ``get_params()`` dict of the best estimator.
    """
    ss = StandardScaler()
    # Keep the encoder's default sparse output and densify explicitly below:
    # the keyword that switches to dense output differs between scikit-learn
    # releases (``sparse`` vs ``sparse_output``).
    ohe = OneHotEncoder(drop='first')
    X = df[significant_cols]
    y = df[target]

    # The base learner is passed positionally because its keyword name
    # differs between scikit-learn releases (``base_estimator``/``estimator``).
    base = DecisionTreeRegressor(max_depth=3, random_state=0)
    estimator = AdaBoostRegressor(base, random_state=0)

    # Search up to 10% of the row count, but keep at least one candidate so
    # small inputs (fewer than 60 rows) do not produce an empty grid.
    n_estimators_stop = max(int(X.shape[0] * 0.1), 6)
    param_grid = {
        'n_estimators': np.arange(5, n_estimators_stop),
        # linspace avoids the endpoint ambiguity of a float-step arange.
        'learning_rate': np.linspace(0.1, 1.0, 10),
        'loss': ['linear', 'square', 'exponential'],
    }

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)
    X_train_cat = ohe.fit_transform(X_train[cat_cols]).toarray()
    X_train_num = ss.fit_transform(X_train[num_cols])
    X_test_cat = ohe.transform(X_test[cat_cols]).toarray()
    X_test_num = ss.transform(X_test[num_cols])
    train_data = np.c_[X_train_cat, X_train_num]
    test_data = np.c_[X_test_cat, X_test_num]

    gs = GridSearchCV(estimator, param_grid, scoring='r2', cv=3)
    gs.fit(train_data, y_train)
    # With the default refit=True this estimator is already fitted on the
    # whole training set, so no extra fit() is needed before predicting.
    estimator = gs.best_estimator_

    r2_cv_scores = cross_val_score(estimator,
                                   train_data,
                                   y_train,
                                   scoring='r2',
                                   cv=3,
                                   n_jobs=-1)
    rmse_cv_scores = cross_val_score(estimator,
                                     train_data,
                                     y_train,
                                     scoring='neg_root_mean_squared_error',
                                     cv=3,
                                     n_jobs=-1)
    best_params = estimator.get_params()
    r2 = np.mean(r2_cv_scores)
    # The scorer returns negated RMSE; abs() restores the positive value.
    rmse = np.abs(np.mean(rmse_cv_scores))
    r2_variance = np.var(r2_cv_scores, ddof=1)
    # Variance is sign-invariant and non-negative, so no abs() is needed.
    rmse_variance = np.var(rmse_cv_scores, ddof=1)

    y_pred = estimator.predict(test_data)
    r2_validation = r2_score(y_test, y_pred)
    rmse_validation = np.sqrt(mean_squared_error(y_test, y_pred))
    return (r2, rmse, r2_variance, rmse_variance, r2_validation,
            rmse_validation, best_params)
Ejemplo n.º 6
0
def _save_ada_plots(results, grid_search, feature_labels, predict_y_grid,
                    test_y, kfold):
    """Write the three diagnostic PNGs produced by ``ada`` and close them."""
    # Error comparison between the three models.
    x_axis = range(3)
    fig = plt.figure(figsize=(15, 8))
    plt.bar(x_axis, results)
    plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
    plt.savefig('ada_compare_error.png')
    plt.close(fig)  # figures are only saved, so release them right away

    # Feature importances of the grid-search winner (bars spaced 4 apart).
    importances = grid_search.best_estimator_.feature_importances_
    positions = range(0, len(importances) * 4, 4)
    fig = plt.figure(figsize=(24, 6))
    plt.bar(positions, importances)
    plt.xticks(positions, feature_labels)
    plt.title("Feature Importances" + ",kfold=" + str(kfold))
    plt.savefig('ada_feature_importance.png')
    plt.close(fig)

    # Grid-search predictions against the held-out targets.
    fig = plt.figure(figsize=(20, 8))
    ax = fig.gca()
    x_label = range(0, len(predict_y_grid))
    plt.title("kfold=" + str(kfold))
    ax.plot(x_label, predict_y_grid, 'r--', label="predict")
    ax.plot(x_label, test_y, label="ground_truth")
    ax.set_ylim(0, 200)
    ax.legend()
    plt.savefig('ada_prediction.png')
    plt.close(fig)


def ada(X, Y, kfold=3, feature_set=None):
    """Tune an AdaBoost regressor with a randomized search followed by a
    small grid search, compare both against an untuned baseline, and save
    diagnostic plots.

    Args:
        X: Feature DataFrame (``.values`` and ``.keys()`` are used).
        Y: Target; ``.values`` is used.
        kfold: Passed to ``index_splitter`` as ``fold`` for both the outer
            train/test split and the inner cross-validation split.
        feature_set: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(predict, best_estimator)``: the bound ``predict`` method of
        the fitted grid search and its best estimator.

    Side effects:
        Prints search results and writes ``ada_compare_error.png``,
        ``ada_feature_importance.png`` and ``ada_prediction.png`` to the
        current working directory.
    """
    # NOTE(review): index_splitter is defined elsewhere; presumably it
    # returns a fold-assignment array as PredefinedSplit expects - confirm.
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    # The last split produced is the one used as the train/test partition.
    train_index, test_index = list(ps.split())[-1]

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    learning_rate = list(np.linspace(0.1, 1, num=10))
    n_estimators = [int(x) for x in np.linspace(start=20, stop=1000, num=100)]
    loss = ['square']

    random_grid = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss
    }

    # Untuned baseline; it is also the estimator both searches clone.
    baseline = AdaBoostRegressor(random_state=42, loss='square')

    # Show the parameters used by the baseline model.
    print('Parameters for baseline:\n')
    pprint(baseline.get_params())

    # Randomized search: 200 sampled combinations, cross-validated on the
    # predefined inner split, using all available cores.
    ada_random = RandomizedSearchCV(estimator=baseline,
                                    n_iter=200,
                                    param_distributions=random_grid,
                                    scoring='neg_mean_squared_error',
                                    cv=ps2.split(),
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)

    ada_random.fit(train_X, train_y)
    pprint(ada_random.best_params_)

    best_random = ada_random.best_params_

    # Grid search refining n_estimators around the randomized-search optimum.
    # The candidates are best-10, best and best+10; the previous
    # range(best-10, best+10, 20) produced only best-10 and therefore never
    # re-evaluated the optimum itself.
    best_n = best_random["n_estimators"]
    grid_grid = {
        'n_estimators': [n for n in (best_n - 10, best_n, best_n + 10)
                         if n > 0],
        'learning_rate': [best_random['learning_rate']],
        'loss': loss
    }

    ada_grid = GridSearchCV(estimator=baseline,
                            param_grid=grid_grid,
                            scoring='neg_mean_squared_error',
                            cv=ps2.split(),
                            verbose=2,
                            n_jobs=-1)

    ada_grid.fit(train_X, train_y)
    pprint(ada_grid.best_params_)

    # Fit the baseline model itself (the searches only fitted clones).
    baseline.fit(train_X, train_y)

    # Predictions on the held-out fold.
    predict_y = ada_random.predict(test_X)
    predict_y_grid = ada_grid.predict(test_X)
    predict_y_base = baseline.predict(test_X)

    # Mean squared error of each model on the held-out fold.
    errors_Grid_CV = mean_squared_error(test_y, predict_y_grid)
    errors_Random_CV = mean_squared_error(test_y, predict_y)
    errors_baseline = mean_squared_error(test_y, predict_y_base)

    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]

    print('Adaboot Results:', results)

    _save_ada_plots(results, ada_grid, X.keys(), predict_y_grid, test_y,
                    kfold)

    # Return the grid search's predict method and its best estimator.
    return ada_grid.predict, ada_grid.best_estimator_
Ejemplo n.º 7
0
# Script fragment: train an AdaBoost regressor (random-forest base learner)
# on the yacht data and tune it with a grid search.
# NOTE(review): `yacht` and `X` are defined above this excerpt.
y = yacht["resid_resist"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# MinMaxScaler gives better results than StandardScaler.
# The scaler is fitted on the training split only and reused for the test split.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)


model = AdaBoostRegressor(base_estimator=RandomForestRegressor())
model.fit(train_scaled, y_train)

# NOTE(review): for scikit-learn regressors `score` presumably returns R^2,
# so the "Accuracy" label below is likely a misnomer - confirm.
print("Accuracy on train data: ", round(model.score(train_scaled, y_train)*100, 2), "%")
print("Accuracy on test data: ", round(model.score(test_scaled, y_test)*100, 2), "%")
print("Parameters: ", model.get_params())
print("MAE: ", mean_absolute_error(y_test, model.predict(test_scaled)))
# TODO: the grid can be improved.
# The grid fixes the number of boosting rounds at 200 and varies the number
# of trees in the random-forest base learner from 1 to 19.
gridParams = {
     "n_estimators": [200], 'base_estimator__n_estimators': np.arange(1, 20)}

# 5-fold cross-validated search over the grid above.
grid = GridSearchCV(model, gridParams,
                    verbose=1,
                    cv=5)
grid.fit(train_scaled, y_train)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

# Keep the winning parameter combination for later use.
params = grid.best_params_
Ejemplo n.º 8
0
# Script fragment: compare a single decision tree with an AdaBoost ensemble
# of such trees on a noisy sum-of-sines target.
# NOTE(review): `X`, `X1` and `rng` are defined above this excerpt; `rng` is
# presumably a NumPy random generator - confirm.
# Target: sin(x) + sin(6x) plus one rng.normal(0, 0.1) draw per sample.
y = np.sin(X1).ravel() + np.sin(6 * X1).ravel() + rng.normal(
    0, 0.1, X1.shape[0])

print(X.shape, y.shape)

# Fit regression model
# regr_1 is a single depth-4 tree; regr_2 boosts 300 depth-4 trees.
regr_1 = DecisionTreeRegressor(max_depth=4)

regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300,
                           random_state=rng)

regr_1.fit(X, y)
regr_2.fit(X, y)

# Predict
# Both models predict on the same X they were fitted on.
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)

print(regr_2.get_params())

# Plot the results
# plt.figure()
# plt.scatter(X1, y, c="k", label="training samples")
# plt.plot(X1, y_1, c="g", label="n_estimators=1", linewidth=2)
# plt.plot(X1, y_2, c="r", label="n_estimators=300", linewidth=2)
# plt.xlabel("data")
# plt.ylabel("target")
# plt.title("Boosted Decision Tree Regression")
# plt.legend()
# plt.show()