def test_transform_target_regressor_2d_transformer(X, y):
    # Check consistency with a transformer that accepts only 2D arrays,
    # using a 1D/2D y array.
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    if y.ndim == 1:  # create a 2D array and squeeze results
        y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
    else:
        y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(
        y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    if y.ndim == 1:  # create a 2D array and squeeze results
        lr.fit(X, transformer2.fit_transform(y.reshape(-1, 1)).squeeze())
    else:
        lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
def test_transform_target_regressor_1d_transformer(X, y):
    # All transformers in scikit-learn expect 2D data. FunctionTransformer
    # with validate=False lifts this constraint without checking that the
    # input is a 2D array. We check the consistency of the data shape using
    # 1D and 2D y arrays.
    transformer = FunctionTransformer(func=lambda x: x + 1,
                                      inverse_func=lambda x: x - 1,
                                      validate=False)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_shifted_by_one(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(
        y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
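The `friedman` fixture and the two check helpers used by these tests are not shown on this page; below are minimal sketches consistent with how they are used here (reconstructed, not copied from the library):

import numpy as np
from numpy.testing import assert_allclose
from sklearn import datasets

friedman = datasets.make_friedman1(random_state=0)


def _check_standard_scaled(y, y_pred):
    # y_pred should be y centered and scaled to unit variance, column-wise
    y_mean = np.mean(y, axis=0)
    y_std = np.std(y, axis=0)
    assert_allclose((y - y_mean) / y_std, y_pred)


def _check_shifted_by_one(y, y_pred):
    # y_pred should be y shifted by +1 element-wise
    assert_allclose(y + 1, y_pred)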
Example #3
def test_transform_target_regressor_count_fit(check_inverse):
    # regression test for gh-issue #11618
    # check that we only call a single time fit for the transformer
    X, y = friedman
    ttr = TransformedTargetRegressor(
        transformer=DummyTransformer(), check_inverse=check_inverse
    )
    ttr.fit(X, y)
    assert ttr.transformer_.fit_counter == 1
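The `DummyTransformer` referenced here (and by `test_transform_target_regressor_route_pipeline` further down) is also assumed; a plausible minimal sketch is an identity transformer that counts `fit` calls:

from sklearn.base import BaseEstimator, TransformerMixin


class DummyTransformer(TransformerMixin, BaseEstimator):
    """Identity transformer that counts how many times fit is called."""

    def __init__(self, fit_counter=0):
        self.fit_counter = fit_counter

    def fit(self, X, y=None):
        self.fit_counter += 1
        return self

    def transform(self, X):
        return X

    def inverse_transform(self, X):
        return X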
def test_transform_target_regressor_ensure_y_array():
    # check that the target ``y`` passed to the transformer will always be a
    # numpy array. Similarly, if ``X`` is passed as a list, we check that the
    # regressor receives it as is.
    X, y = friedman
    tt = TransformedTargetRegressor(transformer=DummyCheckerArrayTransformer(),
                                    regressor=DummyCheckerListRegressor(),
                                    check_inverse=False)
    tt.fit(X.tolist(), y.tolist())
    tt.predict(X.tolist())
    assert_raises(AssertionError, tt.fit, X, y.tolist())
    assert_raises(AssertionError, tt.predict, X)
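`DummyCheckerArrayTransformer` and `DummyCheckerListRegressor` are likewise assumed; sketches consistent with the assertions in this test (the transformer insists on ndarrays, the regressor on lists):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyRegressor


class DummyCheckerArrayTransformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        assert isinstance(X, np.ndarray)
        return self

    def transform(self, X):
        assert isinstance(X, np.ndarray)
        return X

    def inverse_transform(self, X):
        assert isinstance(X, np.ndarray)
        return X


class DummyCheckerListRegressor(DummyRegressor):
    def fit(self, X, y, sample_weight=None):
        assert isinstance(X, list)
        return super().fit(X, y, sample_weight)

    def predict(self, X):
        assert isinstance(X, list)
        return super().predict(X)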
def test_transform_target_regressor_invertible():
    X, y = friedman
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.sqrt, inverse_func=np.log,
                                      check_inverse=True)
    assert_warns_message(UserWarning, "The provided functions or transformer"
                         " are not strictly inverse of each other.",
                         regr.fit, X, y)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.sqrt, inverse_func=np.log)
    regr.set_params(check_inverse=False)
    assert_no_warnings(regr.fit, X, y)
def test_transform_target_regressor_multi_to_single():
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def func(y):
        out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
        return out[:, np.newaxis]

    def inverse_func(y):
        return y

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_2d_func = tt.predict(X)
    assert y_pred_2d_func.shape == (100, 1)

    # force the function to return only a 1D array
    def func(y):
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_1d_func = tt.predict(X)
    assert y_pred_1d_func.shape == (100, 1)

    assert_allclose(y_pred_1d_func, y_pred_2d_func)
def test_transform_target_regressor_functions():
    X, y = friedman
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.log, inverse_func=np.exp)
    y_pred = regr.fit(X, y).predict(X)
    # check the transformer output
    y_tran = regr.transformer_.transform(y.reshape(-1, 1)).squeeze()
    assert_allclose(np.log(y), y_tran)
    assert_allclose(y, regr.transformer_.inverse_transform(
        y_tran.reshape(-1, 1)).squeeze())
    assert y.shape == y_pred.shape
    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))
    # check the regressor output
    lr = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel())
def test_transform_target_regressor_functions_multioutput():
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1] ** 2 + 1)).T
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.log, inverse_func=np.exp)
    y_pred = regr.fit(X, y).predict(X)
    # check the transformer output
    y_tran = regr.transformer_.transform(y)
    assert_allclose(np.log(y), y_tran)
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran))
    assert y.shape == y_pred.shape
    assert_allclose(y_pred, regr.inverse_func(regr.regressor_.predict(X)))
    # check the regressor output
    lr = LinearRegression().fit(X, regr.func(y))
    assert_allclose(regr.regressor_.coef_.ravel(), lr.coef_.ravel())
Example #9
def test_rfe_importance_getter_validation(importance_getter, err_type,
                                          Selector):
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=42)
    estimator = LinearSVR()
    log_estimator = TransformedTargetRegressor(
        regressor=estimator, func=np.log, inverse_func=np.exp
    )

    with pytest.raises(err_type):
        model = Selector(log_estimator, importance_getter=importance_getter)
        model.fit(X, y)
Example #10
def test_transform_target_regressor_error():
    X, y = friedman
    # provide a transformer and functions at the same time
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=StandardScaler(),
                                      func=np.exp, inverse_func=np.log)
    assert_raises_regex(ValueError, "'transformer' and functions"
                        " 'func'/'inverse_func' cannot both be set.",
                        regr.fit, X, y)
    # fit with sample_weight with a regressor which does not support it
    sample_weight = np.ones((y.shape[0],))
    regr = TransformedTargetRegressor(regressor=Lasso(),
                                      transformer=StandardScaler())
    assert_raises_regex(TypeError, r"fit\(\) got an unexpected keyword "
                        "argument 'sample_weight'", regr.fit, X, y,
                        sample_weight=sample_weight)
    # func is given but inverse_func is not
    regr = TransformedTargetRegressor(func=np.exp)
    assert_raises_regex(ValueError, "When 'func' is provided, 'inverse_func'"
                        " must also be provided", regr.fit, X, y)
Example #11
def build_model(CAT_COL, CON_COL, func_name=None):
    '''
    build model by:
    1. encoding the categorical columns with OneHotEncoder
    2. scaling of the numerical columns with RobustScaler (limit outliers effects)
    3. create pipeline testing Ridge regressor
        3.1 normalization of the target variable (None, sqrt, or log10)
    4. create parameters to test through GridSearch
    5. create Grid search with cross-validation
    ---
    INPUTS
    CAT_COL - list of categorical columns
    CON_COL - list of continuous columns
    func_name - normalization function (str): 'sqrt' or 'log10'
    OUTPUT
    model
    '''

    func_option = ['np.sqrt', 'np.log10']
    func_inv = ['power_2', 'sp.special.exp10']  # 'power_2' is a squaring helper assumed to be defined elsewhere

    if func_name:
        try:
            func_str = 'np.' + func_name
            idx = [x.split('.')[-1] for x in func_option].index(func_name)
            inv_str = func_inv[idx]
        except ValueError:
            print(f'{func_name} is not an option')
            func_str, inv_str = 'None', 'None'  # fall back to no target transform
    else:
        func_str = 'None'
        inv_str = 'None'

    preprocessor = make_column_transformer(
        (OneHotEncoder(drop='if_binary'), CAT_COL), (RobustScaler(), CON_COL),
        remainder='passthrough')

    pipe = make_pipeline(
        preprocessor,
        TransformedTargetRegressor(regressor=Ridge(alpha=1e-10,
                                                   fit_intercept=True,
                                                   solver='auto'),
                                   func=eval(func_str),
                                   inverse_func=eval(inv_str)))

    params = {
        'transformedtargetregressor__regressor__alpha': [1e-10, 1e-5, 0.1],
        'transformedtargetregressor__regressor__max_iter': [None, 100, 200],
        'transformedtargetregressor__regressor__tol': [0.001, 0.01, 0.1]
    }

    model = GridSearchCV(pipe, params, cv=5)

    return model
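A hypothetical call, assuming a pandas DataFrame `df` (column and target names are illustrative only):

CAT_COL = ['neighborhood', 'house_type']  # illustrative column names
CON_COL = ['area', 'n_rooms']

model = build_model(CAT_COL, CON_COL, func_name='log10')
model.fit(df[CAT_COL + CON_COL], df['price'])
print(model.best_params_)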
def test_transform_target_regressor_route_pipeline():
    X, y = friedman

    regr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraFitParams(),
        transformer=DummyTransformer())
    estimators = [('normalize', StandardScaler()), ('est', regr)]

    pip = Pipeline(estimators)
    pip.fit(X, y, **{'est__check_input': False})

    assert regr.transformer_.fit_counter == 1
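`DummyRegressorWithExtraFitParams` is assumed here; a sketch consistent with its use (an extra `check_input` fit kwarg that the pipeline routes through):

from sklearn.dummy import DummyRegressor


class DummyRegressorWithExtraFitParams(DummyRegressor):
    def fit(self, X, y, sample_weight=None, check_input=True):
        # the test forces check_input=False, so assert the kwarg arrived
        assert not check_input
        return super().fit(X, y, sample_weight)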
Example #13
def pipeline_trans_reg():
    '''
            Application of Transformed Linear Regression

    #n_quantiles needs to be smaller than the number of samples (standard is 1000)

    PRIMARY_MERCHANT_NAME
    #accuracy negative; model totally off
    ---
    AMOUNT_MEAN_LAG7
    q-t R2-score: 0.896
    unprocessed R2-score: 0.926
    '''
    transformer = QuantileTransformer(n_quantiles=750,
                                      output_distribution='normal')
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)

    regr.fit(X_train, y_train)

    print('q-t R2-score: {0:.3f}'.format(regr.score(X_test, y_test)))

    raw_target_regr = LinearRegression().fit(X_train, y_train)
    print('unprocessed R2-score: {0:.3f}'.format(
        raw_target_regr.score(X_test, y_test)))
    return regr, raw_target_regr
Example #14
def build_model_of(core_estimator, no_target_transform: bool = False):
    # tf = HomeTransformer()
    tf0 = load_categorical_mapping()
    tf = CategoricalTransformer(feature_names=None, mapping=tf0)
    si = SimpleImputer(strategy='most_frequent')
    regressor = core_estimator
    if not no_target_transform:
        regressor = TransformedTargetRegressor(regressor=core_estimator,
                                               func=np.log,
                                               inverse_func=np.exp)
    pipe = Pipeline(steps=[('prep', tf), ('remove_zeros',
                                          si), ('estimator', regressor)])
    return pipe
Example #15
def test_rfe_wrapped_estimator(importance_getter, selector, expected_n_features):
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/15312
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = LinearSVR(random_state=0)

    log_estimator = TransformedTargetRegressor(
        regressor=estimator, func=np.log, inverse_func=np.exp
    )

    selector = selector(log_estimator, importance_getter=importance_getter)
    sel = selector.fit(X, y)
    assert sel.support_.sum() == expected_n_features
Example #16
def base_pls_cv(x, y, n_comps, return_model=False):
    pls_base = PLSRegression(n_components=n_comps)
    ttr_pls = TransformedTargetRegressor(regressor=pls_base,
                                         func=neg_log,
                                         inverse_func=neg_exp)
    y_cv = cross_val_predict(ttr_pls, x, y, cv=cv, groups=x.index)
    # y_cv = cross_val_predict(ttr_pls, x, y, cv=cv)
    score = r2_score(y, y_cv)
    mae_cv = mean_absolute_error(y, y_cv)  # mean absolute error of the CV predictions
    if not return_model:
        return (y_cv, score, mae_cv)
    else:
        return (y_cv, score, mae_cv, ttr_pls)
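`neg_log`, `neg_exp`, and the module-level `cv` splitter are defined elsewhere; one plausible inverse pair consistent with the names:

import numpy as np


def neg_log(y):
    return -np.log(y)


def neg_exp(y):
    return np.exp(-y)  # neg_exp(neg_log(y)) == y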
Example #17
def main():
    # Generate random data : create random points, and, keep only a subset of them.
    x = np.linspace(0, 10, 500)
    rng = np.random.RandomState(0)
    rng.shuffle(x)
    x = np.sort(x[:])
    y = f(x)

    # Plot random data.
    plt.plot(x, y, 'o', color='black', markersize=2, label='random data')

    # Create augmented data : add dimensions to initial data in order to fit y as a polynomial of degree 5.
    x_augmented = np.array([x, x**2, x**3, x**4, x**5]).T

    # Polynomial regression : regression on augmented data.
    regrs = []
    regrs.append((linear_model.LinearRegression(), 'polynomial reg'))
    regrs.append((neighbors.KNeighborsRegressor(15), '15-NN reg'))
    for regr in regrs:
        model, lbl = regr[0], regr[1]

        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
        pipe = Pipeline(
            [('scale', preprocessing.StandardScaler()), ('model', model)]
        )  # Data scaling applied before / after any operator applied to the model.
        treg = TransformedTargetRegressor(
            regressor=pipe, transformer=preprocessing.MinMaxScaler()
        )  # Target scaling applied before / after any operator applied to the model.

        # Train model.
        treg.fit(x_augmented, y)

        # Plot regression.
        plt.plot(x_augmented[:, 0], treg.predict(x_augmented), '-', label=lbl)
    plt.axis('off')
    plt.legend()
    plt.show()
def test_transform_target_regressor_invertible():
    X, y = friedman
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.sqrt,
                                      inverse_func=np.log,
                                      check_inverse=True)
    with pytest.warns(UserWarning,
                      match="The provided functions or"
                      " transformer are not strictly inverse of each other."):
        regr.fit(X, y)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      func=np.sqrt,
                                      inverse_func=np.log)
    regr.set_params(check_inverse=False)
    assert_no_warnings(regr.fit, X, y)
def test_transform_target_regressor_2d_transformer_multioutput():
    # Check consistency with a transformer that accepts only 2D arrays and a
    # 2D y array.
    X = friedman[0]
    y = np.vstack((friedman[1], friedman[1]**2 + 1)).T
    transformer = StandardScaler()
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_standard_scaled(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
Example #20
def cv(args):
    gmu.must_not_exist(args.o)
    gmu.must_not_exist(args.r)

    if 'time' in str.lower(args.r):
        print("Performing cross-validation for time...")
        estimator = TransformedTargetRegressor(
            regressor=ExtraTreesRegressor(n_jobs=24),
            func=np.log,
            inverse_func=np.exp)
    elif 'power' in str.lower(args.r):
        print("Performing cross-validation for power...")
        estimator = ExtraTreesRegressor(n_jobs=24)

    scorer = gmu.neg_mape
    # TODO load yml param grid file

    if 'time' in str.lower(args.r):
        param_grid = {
            'regressor__bootstrap': [False],
            'regressor__max_features': [None, 'log2', 'sqrt'],
            'regressor__criterion': ['mse', 'mae'],
            'regressor__n_estimators': [128, 256, 512, 1024]
        }
    elif 'power' in str.lower(args.r):
        param_grid = {
            'bootstrap': [False],
            'max_features': [None, 'log2', 'sqrt'],
            'criterion': ['mse', 'mae'],
            'n_estimators': [128, 256, 512, 1024]
        }

    dataset = pickle.load(open(args.i, "rb"))
    X, y = gmu.get_xy(dataset)
    model, cv_scores = gmu.nested_cv(X,
                                     y,
                                     estimator,
                                     scorer,
                                     param_grid,
                                     num_trials=int(args.t),
                                     n_splits=int(args.s),
                                     n_high=int(args.k))

    # for item in cv_scores:
    #    print(pd.DataFrame(item["gs_scores"]))

    if args.o is not None:
        pickle.dump(model, open(args.o, "wb"))
    if args.r is not None:
        pickle.dump(cv_scores, open(args.r, "wb"))
def test_model_finder_predict_X_test_regression(model_finder_regression_fitted,
                                                split_dataset_numerical, limit,
                                                seed):
    """Testing if predictions of X_test split from found models are correct (in regression)."""
    models = [
        SVR(**{
            "C": 0.1,
            "tol": 1.0
        }),
        Ridge(**{
            "alpha": 0.0001,
            "random_state": seed
        }),
        DecisionTreeRegressor(**{
            "max_depth": 10,
            "criterion": "mae",
            "random_state": seed
        }),
    ]
    results = []
    X_train, X_test, y_train, y_test = split_dataset_numerical
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=seed)
    for model in models:
        new_model = TransformedTargetRegressor(regressor=model,
                                               transformer=transformer)
        new_model.fit(X_train, y_train)
        results.append((model, new_model.predict(X_test)))

    expected_results = results[:limit]

    actual_results = model_finder_regression_fitted.predictions_X_test(limit)

    for actual_result, expected_result in zip(actual_results,
                                              expected_results):
        assert str(actual_result[0]) == str(expected_result[0])
        assert np.array_equal(actual_result[1], expected_result[1])
def calculate_effort(X, Y, project, task, model_type, transformer, regressor,
                     i_records, t_records):

    dummy_df = X.copy()
    dummy_df["Y"] = Y
    p_na = utils.percentage_nan(X)

    X.fillna(0, inplace=True)
    Y.fillna(0, inplace=True)

    # Let's create multiple regression
    print("\n{0} - {1} - {2} model performance: \n".format(
        project, task, model_type))

    splits = 10
    num_records = len(X)

    if num_records <= splits:
        splits = num_records

    pipeline = Pipeline(steps=[('scaler', transformer), ('predictor',
                                                         regressor)])
    model = TransformedTargetRegressor(regressor=pipeline,
                                       transformer=transformer)
    model.fit(X, Y)

    kfold = model_selection.KFold(n_splits=splits)
    predictions = cross_val_predict(model, X, Y, cv=kfold)
    results = utils.create_percent_error_df(Y, predictions)

    r_squared, r_squared_adj, mae, mse, rmse, pred25, pred50 = extractPerfMeasures(
        model, Y, predictions, results, X)
    row = createDF(project, model_type, task, r_squared, r_squared_adj, mae,
                   mse, rmse, pred25, pred50, t_records, i_records - t_records,
                   p_na)

    return row
Example #23
def tlr_reg(X_train, X_test, y_train, y_test):
    '''
    Transformed Linear Regression
    #n_quantiles needs to be smaller than the number of samples (standard is 1000)
    '''
    transformer = QuantileTransformer(n_quantiles=750,
                                      output_distribution='normal')
    regressor = LinearRegression(n_jobs=-1)

    #Initialize the transformed target regressor
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    regr.fit(X_train, y_train)

    # raw LinearRegressor for comparison
    raw_target_regr = LinearRegression(n_jobs=-1).fit(X_train, y_train)

    #Print the best value combination
    print('q-t R2-score: {0:.3f}'.format(regr.score(X_test, y_test)))
    print('unprocessed R2-score: {0:.3f}'.format(
        raw_target_regr.score(X_test, y_test)))

    return regr, raw_target_regr
def test_transform_target_regressor_1d_transformer(X, y):
    # All transformers in scikit-learn expect 2D data. FunctionTransformer
    # with validate=False lifts this constraint without checking that the
    # input is a 2D array. We check the consistency of the data shape using
    # 1D and 2D y arrays.
    transformer = FunctionTransformer(func=lambda x: x + 1,
                                      inverse_func=lambda x: x - 1)
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=transformer)
    y_pred = regr.fit(X, y).predict(X)
    assert y.shape == y_pred.shape
    # consistency forward transform
    y_tran = regr.transformer_.transform(y)
    _check_shifted_by_one(y, y_tran)
    assert y.shape == y_pred.shape
    # consistency inverse transform
    assert_allclose(y, regr.transformer_.inverse_transform(y_tran).squeeze())
    # consistency of the regressor
    lr = LinearRegression()
    transformer2 = clone(transformer)
    lr.fit(X, transformer2.fit_transform(y))
    y_lr_pred = lr.predict(X)
    assert_allclose(y_pred, transformer2.inverse_transform(y_lr_pred))
    assert_allclose(regr.regressor_.coef_, lr.coef_)
Example #26
def transformed_xgb():
    xgb_regr = XGBRegressor(n_estimators=500,
                            learning_rate=0.1,
                            max_depth=5,
                            subsample=0.8,
                            booster='gbtree',
                            objective='reg:squarederror',  # current name for the old 'reg:linear'
                            min_samples_leaf=5,  # note: not a native XGBoost parameter
                            n_jobs=4,
                            random_state=42)

    return TransformedTargetRegressor(regressor=xgb_regr,
                                      transformer=PowerTransformer(
                                          method='yeo-johnson',
                                          standardize=True))
Example #27
    def __init__(self, confidence_intervals=True):
        # Note to self, to get parameters out: model.diagnosis_model.named_steps['scaler'].mean_
        self.diagnosis_model = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier',
             svm.SVC(kernel='rbf',
                     C=0.5,
                     gamma='auto',
                     class_weight='balanced',
                     probability=True)),
        ])
        adas_pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('classifier',
                                   svm.SVR(kernel='rbf', C=0.5,
                                           gamma='auto'))])
        self.adas_model = TransformedTargetRegressor(
            regressor=adas_pipeline, transformer=StandardScaler())

        ventricles_pipeline = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('classifier', svm.SVR(kernel='rbf', C=0.5, gamma='auto'))])
        self.ventricles_model = TransformedTargetRegressor(
            regressor=ventricles_pipeline, transformer=StandardScaler())

        self.y_diagnosis = None
        self.y_adas = None
        self.y_ventricles = None

        self.train_df_diagnosis = None
        self.train_df_adas = None
        self.train_df_ventricles = None

        self.confidence_intervals = confidence_intervals

        self.train_df_processed = None
        self.test_df_processed = None
Example #28
def test_transform_target_regressor_pass_extra_predict_parameters():
    # Checks that predict kwargs are passed to regressor.
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraPredictParams(), transformer=DummyTransformer()
    )

    regr.fit(X, y)
    regr.predict(X, check_input=False)
    assert regr.regressor_.predict_called
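`DummyRegressorWithExtraPredictParams` is assumed; a sketch consistent with the assertion above:

from sklearn.dummy import DummyRegressor


class DummyRegressorWithExtraPredictParams(DummyRegressor):
    def predict(self, X, check_input=True):
        # record that the predict kwarg reached the underlying regressor
        self.predict_called = True
        assert not check_input
        return super().predict(X)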
Example #29
    def get_pipe(self):
        if self.inner_cv is None:
            inner_cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=0)
        else:
            inner_cv = self.inner_cv

        # gridpoints=self.gridpoints
        transformer_list = [
            none_T(), log_T(), logp1_T()
        ]  # Using 3 of many options here: none_T,logp1_T(),log_T()
        steps = [
            ('shrink_k1',
             shrinkBigKTransformer(
                 selector=LassoLarsCV(cv=inner_cv, max_iter=32))
             ),  # retain a subset of the best original variables
            ('polyfeat',
             PolynomialFeatures(interaction_only=0,
                                degree=2)),  # create interactions among them
            ('drop_constant', dropConst()),
            ('shrink_k2',
             shrinkBigKTransformer(
                 selector=LassoLarsCV(cv=inner_cv, max_iter=64))
             ),  # pick from all of those options
            ('reg', LinearRegression())
        ]
        if self.bestT:
            steps.insert(0,
                         ('xtransform',
                          columnBestTransformer(float_k=len(self.float_idx))))

        X_T_pipe = Pipeline(steps=steps)
        # build a pipeline that can transform y in addition to X, which plain scikit-learn pipelines do not support
        Y_T_X_T_pipe = Pipeline(
            steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
        Y_T__param_grid = {
            'ttr__transformer': transformer_list,
            'ttr__regressor__polyfeat__degree':
            [2],  #could use other degrees here if desired
        }
        outerpipe = GridSearchCV(Y_T_X_T_pipe,
                                 param_grid=Y_T__param_grid,
                                 cv=inner_cv)
        if self.do_prep:
            steps = [('prep', missingValHandler(prep_dict=self.prep_dict)),
                     ('post', outerpipe)]
            outerpipe = Pipeline(steps=steps)

        return outerpipe
Example #30
    def wrap_target_scaler(self, target_scaler, model_obj, model_params):

        if self.spec['problem_type'] != 'regression':
            return model_obj, model_params
        if target_scaler is None:
            return model_obj, model_params

        # Process and get the base scaler_obj + params
        base_scaler_obj = TargetScalerConstructor(self.user_passed_objs,
                                                  self.dataset,
                                                  self.spec)
        scaler_objs, scaler_params =\
            base_scaler_obj.process(target_scaler)
        scaler_obj = scaler_objs[0]

        # Unwrap into name + base
        model_name, base_model = model_obj[0], model_obj[1]
        scaler_name, base_scaler = scaler_obj[0], scaler_obj[1]

        # Now, wrap the model + scaler in the transformed target regressor
        base_wrapper_model =\
            TransformedTargetRegressor(regressor=base_model,
                                       transformer=base_scaler)
        wrapped_name = 'scale_target_' + model_name
        wrapper_model_obj = (wrapped_name, base_wrapper_model)

        # Need to update model params with new nested model name
        model_param_names = list(model_params)
        for param_name in model_param_names:
            if param_name.startswith(model_name + '__'):

                new_base = wrapped_name + '__regressor__'
                new_param_name =\
                    param_name.replace(model_name + '__', new_base, 1)

                model_params[new_param_name] = model_params.pop(param_name)

        # Need to also update / add any scaler params
        for param_name in scaler_params:
            if param_name.startswith(scaler_name + '__'):

                new_base = wrapped_name + '__transformer__'
                new_param_name =\
                    param_name.replace(scaler_name + '__', new_base, 1)

                model_params[new_param_name] = scaler_params[param_name]

        return wrapper_model_obj, model_params
Example #31
def get_log_spiral_pipeline():
    names = ('polynomialfeatures', 'bayesianridge')
    steps = [
        PolynomialFeatures(
            degree=1,
            include_bias=False,
        ),
        TransformedTargetRegressor(regressor=BayesianRidge(compute_score=True,
                                                           fit_intercept=True,
                                                           copy_X=True,
                                                           normalize=True,
                                                           **clf_kwargs),
                                   func=np.log,
                                   inverse_func=np.exp)
    ]
    return Pipeline(memory=None, steps=list(zip(names, steps)))
Example #33
    def get_pipe(self):
        if self.inner_cv is None:
            inner_cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=0)
        else:
            inner_cv = self.inner_cv
        gridpoints = self.gridpoints
        transformer_list = [None_T(), Log_T(), LogP1_T()]
        steps = [
            ('shrink_k1',
             ShrinkBigKTransformer(
                 selector=LassoLarsCV(cv=inner_cv, max_iter=32))),
            # retain a subset of the best original variables
            ('polyfeat', PolynomialFeatures(interaction_only=0, degree=2)
             ),  # create interactions among them
            ('drop_constant', DropConst()),
            ('shrink_k2',
             ShrinkBigKTransformer(
                 selector=LassoLarsCV(cv=inner_cv, max_iter=64))),
            # pick from all of those options
            ('reg', LinearRegression())
        ]
        if self.bestT:
            steps.insert(0,
                         ('xtransform',
                          ColumnBestTransformer(float_k=len(self.float_idx))))

        X_T_pipe = Pipeline(steps=steps)
        Y_T_X_T_pipe = Pipeline(
            steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
        Y_T__param_grid = {
            'ttr__transformer': transformer_list,
            'ttr__regressor__polyfeat__degree': [2],
        }
        outerpipe = GridSearchCV(Y_T_X_T_pipe,
                                 param_grid=Y_T__param_grid,
                                 cv=inner_cv)
        if self.do_prep:
            steps = [('prep', MissingValHandler(prep_dict=self.prep_dict)),
                     ('post', outerpipe)]
            outerpipe = Pipeline(steps=steps)

        return outerpipe
Example #34
def test():
    import numpy as np
    import scipy as sp
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.datasets import fetch_openml
    from sklearn.model_selection import train_test_split
    from sklearn.compose import make_column_transformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import Ridge
    from sklearn.compose import TransformedTargetRegressor

    # data
    survey = fetch_openml(data_id=534, as_frame=True)
    X = survey.data[survey.feature_names]
    y = survey.target.values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
    train_dataset = X_train.copy()
    train_dataset.insert(0, "WAGE", y_train)
    # _ = sns.pairplot(train_dataset, kind = "reg", diag_kind = "kde")

    # machine learning pipeline
    categorical_columns = [
        "RACE", "OCCUPATION", "SECTOR", "MARR", "UNION", "SEX", "SOUTH"
    ]
    numerical_columns = ["EDUCATION", "EXPERIENCE", "AGE"]

    preprocessor = make_column_transformer(
        (OneHotEncoder(drop="if_binary"), categorical_columns),
        remainder="passthrough",
        verbose_feature_names_out=False)

    model = make_pipeline(
        preprocessor,
        TransformedTargetRegressor(regressor=Ridge(alpha=1e-10),
                                   func=np.log10,
                                   inverse_func=sp.special.exp10),
    )

    # processing the data
    _ = model.fit(X_train, y_train)
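    # Illustrative follow-up (an assumption, not part of the original
    # example): evaluate the fitted pipeline on the held-out split.
    from sklearn.metrics import median_absolute_error
    y_pred = model.predict(X_test)
    print("Test median absolute error:", median_absolute_error(y_test, y_pred))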
    def ols_prediction(self):
        """
        uses linear regression after standardising to normal dist
        prints out accuracy metrics and then saves the design matrix with y and predicted y as a csv file
        also creates another column to calculate relative percentage difference between y and predicted y
        :return:
        """
        logger.info("running Linear Regression model")
        crab_df_woo = self.pre_process_data()
        transformer = QuantileTransformer(output_distribution='normal')
        # since I observed that the data was skewed, I decided to transform the continuous variables to normal dist
        reg = linear_model.LinearRegression()
        t_reg = TransformedTargetRegressor(regressor=reg,
                                           transformer=transformer)
        ohe = ce.OneHotEncoder(handle_unknown='ignore',
                               use_cat_names=True,
                               drop_invariant=True)
        crab_df_woo_enc = ohe.fit_transform(crab_df_woo)
        X = crab_df_woo_enc.drop("age", axis=1)
        y = crab_df_woo_enc[["age"]]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=100)
        t_reg.fit(X_train, y_train)
        s = t_reg.score(X_test, y_test)
        logger.info("R-squared from Linear Regression is: {0}".format(s))
        y_pred = t_reg.predict(X)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        mae = mean_absolute_error(y, y_pred)
        logger.debug("Linear Regression MAE: {0}".format(mae))
        logger.debug("Linear Regression RMSE: {0}".format(rmse))
        logger.debug("Linear Regression R-squared: {0}".format(s))

        crab_df = X.copy()
        crab_df["age"] = pd.Series(y.values.ravel())
        crab_df["age_ols"] = pd.Series(y_pred.ravel())
        crab_df['sex'] = crab_df.apply(lambda row: self.reverse_ohe(row),
                                       axis=1)
        crab_df.drop(["sex_I", "sex_M", "sex_F"], axis=1, inplace=True)
        crab_df["percentage_difference"] = np.abs(
            np.divide(
                (crab_df["age"] - crab_df["age_ols"]), crab_df["age"]) * 100)
        crab_df.to_csv("crab_predit_ols.csv", index=False)
        logger.info("Crab data with predicted variables saved: {0}".format(
            "crab_predit_ols.csv"))
        logger.info("Linear Regression execution finished")
    def rf_prediction(self):
        """
        uses ensemble (Random Forest) method to predict crab age
        :return:
        """
        logger.info("running Random Forest model")
        X = self.crab_data.drop("age", axis=1)
        y = self.crab_data[["age"]]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=100)
        numerical_features = X_train.dtypes == 'float'
        categorical_features = ~numerical_features
        # I used pipelining so that the predicted values were automatically transformed/scaled back
        preprocess = make_column_transformer(
            (RobustScaler(), numerical_features),
            (OneHotEncoder(sparse=False), categorical_features))
        forest = RandomForestRegressor(n_estimators=5000,
                                       max_depth=20,
                                       min_samples_leaf=2,
                                       min_samples_split=4,
                                       random_state=100)
        f_reg = Pipeline(steps=[('preprocess', preprocess), ('model', forest)])
        f_reg_ttr = TransformedTargetRegressor(regressor=f_reg)
        f_reg_ttr.fit(X_train, y_train)
        s = f_reg_ttr.score(X_test, y_test)
        logger.info("R-squared from Random Forest is: {0}".format(s))
        y_pred = f_reg_ttr.predict(X)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        mae = mean_absolute_error(y, y_pred)
        logger.debug("RandomForest MAE: {0}".format(mae))
        logger.debug("RandomForest RMSE: {0}".format(rmse))
        logger.debug("RandomForest R-squared: {0}".format(s))
        # recreate the original dataset
        crab_df = X.copy()
        crab_df["age"] = pd.Series(y.values.ravel())
        crab_df["age_forest"] = pd.Series(y_pred.ravel())
        crab_df["percentage_difference"] = np.abs(
            np.divide(
                (crab_df["age"] - crab_df["age_forest"]), crab_df["age"]) *
            100)
        crab_df.to_csv("crab_predict_forest.csv", index=False)
        logger.info("Crab data with predicted variables saved: {0}".format(
            "crab_predict_forest.csv"))
        logger.info("Random Forest execution finished")
Example #37
def train_gpr(l=None):
    # basic no tuning. sklearn gp is not great for this.
    if l is None:
        l = get_data()
    model = GaussianProcessRegressor(
        alpha=1.8,
        copy_X_train=True,
        # kernel=kernels.RBF(4.85 * np.array([4, 3000])),
        # kernel=kernels.RBF([1, 1]),
        n_restarts_optimizer=10,
        normalize_y=True,
        optimizer='fmin_l_bfgs_b',
        random_state=None)
    model = TransformedTargetRegressor(
        regressor=model,
        transformer=QuantileTransformer(output_distribution='normal'))
    steps = [('copulize_x',
              QuantileTransformer(output_distribution='uniform')),
             ('gpr', model)]
    model = Pipeline(steps)
    model.fit(l.X_train.values, l.y_train.values.squeeze())
    return attributedict_from_locals('model')
Example #38
def train_svm(l=None):
    # basic no tuning
    if l is None:
        l = get_data()
    model = SVR(C=1.0,
                cache_size=200,
                coef0=0.0,
                degree=3,
                epsilon=0.1,
                gamma='auto',
                kernel='rbf',
                max_iter=-1,
                shrinking=True,
                tol=0.001,
                verbose=False)
    model = TransformedTargetRegressor(
        regressor=model,
        transformer=QuantileTransformer(output_distribution='normal'))
    # model = LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
    #     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
    #     random_state=None, tol=0.0001, verbose=0)
    model.fit(l.X_train.values, l.y_train.values.squeeze())
    return attributedict_from_locals('model')
Example #39
    def set_pipeline(self):
        warnings.simplefilter('ignore')

        transformer_list = [None_T(), LogP1_T()]
        steps = [('scaler', StandardScaler()),
                 ('shrink_k1', ShrinkBigKTransformer()),
                 ('polyfeat', PolynomialFeatures(interaction_only=1)),
                 ('shrink_k2', ShrinkBigKTransformer(selector='elastic-net')),
                 ('reg',
                  make_pipeline(StandardScaler(),
                                LinearRegression(fit_intercept=1)))]

        inner_params = {'polyfeat__degree': [2]}
        if self.k > 4:
            inner_params['shrink_k1__max_k'] = np.arange(4, self.k, 4)
        inner_cv = RepeatedKFold(n_splits=5,
                                 n_repeats=1,
                                 random_state=self.seed)
        X_T_pipe = GridSearchCV(Pipeline(steps=steps),
                                param_grid=inner_params,
                                cv=inner_cv)

        Y_T_X_T_pipe = Pipeline(
            steps=[('ttr', TransformedTargetRegressor(regressor=X_T_pipe))])
        Y_T__param_grid = {'ttr__transformer': transformer_list}
        lin_reg_Xy_transform = GridSearchCV(Y_T_X_T_pipe,
                                            param_grid=Y_T__param_grid,
                                            cv=inner_cv)

        self.lr_estimator = lin_reg_Xy_transform
        self.lr_estimator.fit(self.x_train, self.y_train)
        self.attr = pd.DataFrame(self.lr_estimator.cv_results_)
        # generates the model that is saved
        logger.info("Total execution time: {} sec".format(
            round(time.time() - self.start_time, 3)))
# This fragment assumes earlier imports and a prepared train/test split; the
# figure setup it references (f, ax0, ax1) is sketched here as an assumption:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import median_absolute_error, r2_score

f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

regr = RidgeCV()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

ax0.scatter(y_test, y_pred)
ax0.plot([0, 2000], [0, 2000], '--k')
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('Ridge regression \n without target transformation')
ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 2000])
ax0.set_ylim([0, 2000])

regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        func=np.log1p,
                                        inverse_func=np.expm1)
regr_trans.fit(X_train, y_train)
y_pred = regr_trans.predict(X_test)

ax1.scatter(y_test, y_pred)
ax1.plot([0, 2000], [0, 2000], '--k')
ax1.set_ylabel('Target predicted')
ax1.set_xlabel('True Target')
ax1.set_title('Ridge regression \n with target transformation')
ax1.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax1.set_xlim([0, 2000])
ax1.set_ylim([0, 2000])

f.suptitle("Synthetic data", y=0.035)