def test_transform_target_regressor_multi_to_single():
    X = friedman[0]
    y = np.transpose([friedman[1], (friedman[1] ** 2 + 1)])

    def func(y):
        out = np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)
        return out[:, np.newaxis]

    def inverse_func(y):
        return y

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_2d_func = tt.predict(X)
    assert y_pred_2d_func.shape == (100, 1)

    # force the function to return only a 1D array
    def func(y):
        return np.sqrt(y[:, 0] ** 2 + y[:, 1] ** 2)

    tt = TransformedTargetRegressor(func=func, inverse_func=inverse_func,
                                    check_inverse=False)
    tt.fit(X, y)
    y_pred_1d_func = tt.predict(X)
    assert y_pred_1d_func.shape == (100, 1)

    assert_allclose(y_pred_1d_func, y_pred_2d_func)
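# These tests rely on shared fixtures from scikit-learn's test module for
# TransformedTargetRegressor. A minimal sketch of the assumed setup, shown
# here only so the snippets read self-contained (a hedged reconstruction,
# not the verbatim source file):
import numpy as np
from numpy.testing import assert_allclose
from sklearn import datasets
from sklearn.compose import TransformedTargetRegressor

# make_friedman1 generates 100 samples by default, hence the (100, 1)
# shape assertions above
friedman = datasets.make_friedman1(random_state=0)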
Example #3
    def predict_deaths(self, s):
        s_train = s.loc[:self.last_training_year]
        num_predict = self.pred_year - self.last_training_year
        n_train = len(s_train)
        n = n_train + num_predict  # predict into the future
        y = s_train.values
        x = np.arange(n)
        X = x.reshape(-1, 1)
        X2 = np.column_stack((x, x**2, x**3))

        X_train = X[:n_train]
        X2_train = X2[:n_train]

        # train model using ridge regression on box-cox transformed values
        ttr = TransformedTargetRegressor(
            regressor=Ridge(alpha=10), transformer=PowerTransformer('box-cox'))
        ttr.fit(X_train, y)
        yp1 = ttr.predict(X)

        if s.name.left < 90:
            ttr.fit(X2_train, y)
            yp2 = ttr.predict(X2)

            # average predictions
            yp = yp1 * .9 + yp2 * .1
        else:
            yp = yp1

        index = range(s.index[0], self.pred_year + 1)
        sp = pd.Series(yp, index=index)
        return sp
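# The core pattern above, Ridge on a polynomial time basis with a Box-Cox
# target transform and prediction past the training range, can be exercised
# in isolation. A minimal, self-contained sketch on synthetic data (all
# names below are illustrative):
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PowerTransformer

rng = np.random.RandomState(0)
x = np.arange(40)
X = np.column_stack((x, x**2, x**3))               # cubic time basis
y = 500 * np.exp(-0.03 * x[:30]) + rng.randn(30)   # positive, decaying series

# Box-Cox requires strictly positive targets
ttr = TransformedTargetRegressor(regressor=Ridge(alpha=10),
                                 transformer=PowerTransformer(method='box-cox'))
ttr.fit(X[:30], y)         # train on the first 30 steps
forecast = ttr.predict(X)  # extrapolate the remaining 10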
Example #4
def test_transform_target_regressor_pass_extra_predict_parameters():
    # Checks that predict kwargs are passed to regressor.
    X, y = friedman
    regr = TransformedTargetRegressor(
        regressor=DummyRegressorWithExtraPredictParams(), transformer=DummyTransformer()
    )

    regr.fit(X, y)
    regr.predict(X, check_input=False)
    assert regr.regressor_.predict_called
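# DummyRegressorWithExtraPredictParams and DummyTransformer are helpers
# defined in the same test module. A plausible sketch of them (a hedged
# reconstruction, not the verbatim source):
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyRegressor


class DummyTransformer(TransformerMixin, BaseEstimator):
    """Identity transformer on the target."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    def inverse_transform(self, X):
        return X


class DummyRegressorWithExtraPredictParams(DummyRegressor):
    predict_called = False

    def predict(self, X, check_input=True):
        # record that predict was reached and that the kwarg came through
        self.predict_called = True
        assert not check_input
        return super().predict(X)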
def test_transform_target_regressor_ensure_y_array():
    # check that the target ``y`` passed to the transformer will always be a
    # numpy array. Similarly, if ``X`` is passed as a list, we check that the
    # predictor receives it as is.
    X, y = friedman
    tt = TransformedTargetRegressor(transformer=DummyCheckerArrayTransformer(),
                                    regressor=DummyCheckerListRegressor(),
                                    check_inverse=False)
    tt.fit(X.tolist(), y.tolist())
    tt.predict(X.tolist())
    assert_raises(AssertionError, tt.fit, X, y.tolist())
    assert_raises(AssertionError, tt.predict, X)
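# Likewise, DummyCheckerArrayTransformer and DummyCheckerListRegressor simply
# assert the types they are handed. A plausible sketch (hedged reconstruction):
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyRegressor


class DummyCheckerArrayTransformer(TransformerMixin, BaseEstimator):
    """Asserts that the target reaching the transformer is a numpy array."""

    def fit(self, X, y=None):
        assert isinstance(X, np.ndarray)
        return self

    def transform(self, X):
        assert isinstance(X, np.ndarray)
        return X

    def inverse_transform(self, X):
        assert isinstance(X, np.ndarray)
        return X


class DummyCheckerListRegressor(DummyRegressor):
    """Asserts that X reaches the underlying regressor untouched, as a list."""

    def fit(self, X, y, sample_weight=None):
        assert isinstance(X, list)
        return super().fit(X, y, sample_weight)

    def predict(self, X):
        assert isinstance(X, list)
        return super().predict(X)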
Example #6
def test_model_finder_search_and_fit_regression(model_finder_regression, mode,
                                                expected_model,
                                                expected_scores, seed):
    """Testing if search_and_fit() function correctly searches for and sets and fits chosen model (regression).
    Additionally checks if the model is correctly wrapped in TransformedTargetRegressor."""
    prediction_array = np.array([
        1.34, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 1
    ]).reshape(1, -1)

    model_finder_regression._quicksearch_limit = 1
    model_finder_regression.scoring_functions = [mean_squared_error, r2_score]
    actual_model = model_finder_regression.search_and_fit(
        models=None, scoring=mean_squared_error, mode=mode)
    expected_model.random_state = seed
    t_X = model_finder_regression.X
    t_y = model_finder_regression.y
    m = TransformedTargetRegressor(regressor=expected_model,
                                   transformer=QuantileTransformer(
                                       output_distribution="normal",
                                       random_state=seed))
    m.fit(t_X, t_y)
    expected_array = m.predict(prediction_array)

    assert str(actual_model) == str(expected_model)
    assert str(model_finder_regression._chosen_model) == str(expected_model)
    assert model_finder_regression._chosen_model_params == expected_model.get_params()
    assert model_finder_regression._chosen_model_scores == expected_scores
    assert type(actual_model) == WrappedModelRegression
    assert str(actual_model.clf.regressor) == str(expected_model)

    assert np.array_equal(model_finder_regression.predict(prediction_array),
                          expected_array)
Example #8
def run_experiment(
        exp_name,
        models,
        folds,
        train_seasons,
        test_seasons,
        X,
        y,
        preprocessor=None,
        #print_exp_progress=None,
        calculate_metrics_func=calculate_clf_metrics,
        algorithm_type='clf'):
    results = []
    names = []
    print("Running experiment", exp_name)
    for name, current_model in models:
        cv_results = defaultdict(list)

        for train_idx, test_idx in folds:
            X_train, X_test = X.loc[train_idx], X.loc[test_idx]
            #X_train, X_test = utils.scale_X(X, y, train_idx, test_idx)
            y_train, y_test = y.loc[train_idx], y.loc[test_idx]
            y_true = y_test

            pipeline = Pipeline(steps=[('preprocessor',
                                        preprocessor), ('model',
                                                        current_model)])
            if algorithm_type == 'reg':
                model = TransformedTargetRegressor(
                    regressor=pipeline, transformer=StandardScaler())
            else:
                model = pipeline
            fit_info = model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            fold_metric_results = calculate_metrics_func(y_true, y_pred)
            for key, value in fold_metric_results.items():
                cv_results[key].append(value)

        exp_result = {
            "exp_name": exp_name,
            "model": name,
            **agg_metrics(cv_results.keys(), cv_results)
        }

        if algorithm_type == 'reg':
            reg_exp_results.append(exp_result)
        else:
            exp_results.append(exp_result)

        cv_results["model"] = [name] * len(folds)
        cv_results["season_train"] = train_seasons
        cv_results["season_test"] = test_seasons

        results.append(cv_results)
        names.append(name)
    print("Done")
    return names, results
def predict_by_pos(pos, year):
    features_list = ['g', 'gs', 'mp_per_g', 'fg_per_g', 'fga_per_g', 'fg_pct', 'fg2_per_g', 'fg2a_per_g', 'fg2_pct', 'fg3_per_g', 'fg3a_per_g', 'fg3_pct', 'ft_per_g', 'fta_per_g', 'ft_pct', 'orb_per_g', 'drb_per_g', 'trb_per_g', 'ast_per_g', 'stl_per_g', 'blk_per_g', 'tov_per_g', 'pf_per_g', 'pts_per_g', 'tenure', 'height', 'weight', 'sos', 'srs', 'ows', 'dws', 'ws', 'ts_pct', 'usg_pct', 'bpm', 'pprod']
    X = df[(df['pos'] == pos) & (df['is_final_year'])]
    X = X[features_list]

    X_imp = IterativeImputer(max_iter=10).fit_transform(X)
    X = pd.DataFrame(X_imp, index=X.index, columns=X.columns)

    df.loc[X.index, X.columns] = X
    X = df[(df['is_final_year']) & (df['pos'] == pos) & (df['mp_per_g'] > 15) & (df['g'] > 25)][features_list]
    #X['per'] = (1/X['mp_per_g']) * ((X['fg_per_g'] * 85.91) + (X['stl_per_g'] * 53.897) + (X['fg3_per_g'] * 51.757) + (X['ft_per_g'] * 46.845) + (X['blk_per_g'] * 39.19) + (X['orb_per_g'] * 39.19) + (X['ast_per_g'] * 34.677) + (X['drb_per_g'] * 14.707) - (X['pf_per_g'] * 17.174) - (X['fta_per_g'] - (X['ft_per_g'])*20.091) - ((X['fga_per_g'] - X['fg_per_g'])*39.19) - (X['tov_per_g']*53.897))

    X = (X - X.min()) / (X.max() - X.min())

    predicted_to_nba = pd.DataFrame()
    for yr in range(1996, 2020):
        a = predict_make_nba(yr, X)
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        predicted_to_nba = pd.concat([predicted_to_nba, a])

    ##################################################
    ##PER Regression##
    #train algorithm on players not in given year
    clf1 = SGDRegressor(alpha=.01, penalty='elasticnet')

    features_list = X.columns.tolist()

    #create dataframe of NCAA players that made NBA
    df2 = predicted_to_nba

    X2 = transform_train_data(df2[features_list])
    y2 = df2[['mean_per']].loc[X2.index]

    to_drop = list(X2.columns[X2.var() < .1])
    to_drop += ['gs']
    X2.drop(to_drop, axis=1, inplace=True)
    X2 = (X2 - X2.mean())/X2.std()

    X_new_pred = X2[df2.loc[X2.index]['year'] == year]
    X2 = X2[(df2.loc[X2.index]['year'] != year) & (df2.loc[X2.index]['year'] < 2018) & (df2.loc[X2.index]['year'] > 1995)]
    y2 = y2.loc[X2.index]

    y_new_pred = df2[['mean_per']].loc[X_new_pred.index]
    y_new_pred = (y_new_pred - y2.mean())/y2.std()
    y2 = (y2 - y2.mean())/y2.std()

    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.25, stratify=df2.loc[y2.index]['tier'])

    clf2 = TransformedTargetRegressor(clf1)
    clf2.fit(X2_train, y2_train)

    #predict per for players in given year
    X_new_pred = X_new_pred[X2.columns.tolist()]
    new_pred = clf2.predict(X_new_pred)

    new_pred_curr_year = pd.DataFrame(new_pred, index=X_new_pred.index).merge(df.iloc[:, :-8], left_index=True, right_index=True)
    return new_pred_curr_year
Example #10
def run_transform(X, y, transform, cv_outer=LeaveOneOut(), n_alphas=1000):

    import shutil
    from tempfile import mkdtemp

    from joblib import Memory
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import PowerTransformer
    from tqdm import tqdm

    # `find_alpha_range` and `StratifiedKFoldReg` are assumed to be helpers
    # defined elsewhere in the enclosing module.

    # Find alpha range
    alphas = find_alpha_range(X, y, n_alphas=n_alphas)

    list_y_pred = []
    list_y_true = []
    list_models = []

    for train_index, test_index in tqdm(cv_outer.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        list_y_true.append(y_test)

        cv_inner = StratifiedKFoldReg(n_splits=5, shuffle=True, random_state=0)

        tmpfolder = mkdtemp()
        memory = Memory(location=tmpfolder)

        pip = make_pipeline(VarianceThreshold(),
                            PCA(),
                            Ridge(max_iter=1_000_000),
                            memory=memory)

        grid = GridSearchCV(pip,
                            param_grid={'ridge__alpha': alphas},
                            cv=cv_inner,
                            n_jobs=-1,
                            scoring="neg_mean_squared_error")

        regr_trans = TransformedTargetRegressor(
            regressor=grid, transformer=PowerTransformer(method=transform))

        regr_trans.fit(X_train, y_train)
        list_models.append(regr_trans)

        y_pred = regr_trans.predict(X_test)
        list_y_pred.append(y_pred)

        memory.clear(warn=False)
        shutil.rmtree(tmpfolder)

    y_pred = np.concatenate(list_y_pred)
    y_true = np.concatenate(list_y_true)

    return y_pred, y_true, list_models
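# Hypothetical usage, assuming the module-level helpers find_alpha_range and
# StratifiedKFoldReg referenced above exist, with X a (n_samples, n_features)
# array and y a strictly positive 1-D array (box-cox requires y > 0):
y_pred, y_true, models = run_transform(X, y, transform="box-cox")
print(np.corrcoef(y_true, y_pred)[0, 1])  # out-of-sample correlation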
Example #11
def main():
    # Generate random data : create random points, and, keep only a subset of them.
    x = np.linspace(0, 10, 500)
    rng = np.random.RandomState(0)
    rng.shuffle(x)
    x = np.sort(x[:])
    y = f(x)

    # Create bagging and random forest models.
    fig, axes = plt.subplots(2, 2, figsize=(20, 10))
    # `base_estimator` was renamed to `estimator` in scikit-learn 1.2
    models = [AdaBoostRegressor(n_estimators=5, estimator=KNeighborsRegressor()),
              AdaBoostRegressor(n_estimators=5, estimator=SVR()),
              AdaBoostRegressor(n_estimators=5, estimator=KernelRidge(kernel='rbf')),
              GradientBoostingRegressor()]
    for axis, model in zip(axes.ravel(), models):
        # Set title.
        title = model.__class__.__name__
        reg_params = model.get_params()
        if 'estimator' in reg_params: # GradientBoostingRegressor has no 'estimator'.
            title += ', estimator: '+reg_params['estimator'].__class__.__name__
        axis.set_title(title)

        # Plot random data.
        axis.plot(x, y, 'o', color='black', markersize=2, label='random data')

        # Create augmented data : add dimensions to initial data in order to fit y as a polynomial of degree 5.
        x_augmented = np.array([x, x**2, x**3, x**4, x**5]).T

        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
        pipe = Pipeline([('scale', preprocessing.StandardScaler()), ('model', model)]) # Data scaling applied before / after any operator applied to the model.
        y_transformer = preprocessing.MinMaxScaler().fit(y.reshape(-1, 1))
        treg = TransformedTargetRegressor(regressor=pipe, transformer=y_transformer) # Target scaling applied before / after any operator applied to the model.

        # Train model.
        treg.fit(x_augmented, y)

        # Plot intermediate regression estimations.
        if isinstance(model, AdaBoostRegressor):
            for i, tree in enumerate(treg.regressor_['model'].estimators_):
                x_augmented_scaled = treg.regressor_['scale'].transform(x_augmented) # x input after scaling (as tree does not use Pipeline).
                y_hat = tree.predict(x_augmented_scaled) # y outcome before scaling (as tree does not use TransformedTargetRegressor).
                y_pred = y_transformer.inverse_transform(y_hat.reshape(-1, 1))

                axis.plot(x, y_pred, '--', label='tree '+str(i))
                axis.axis('off')
                axis.legend()

        # Plot final regression.
        axis.plot(x, treg.predict(x_augmented), '-', color='black', label=model.__class__.__name__)
        axis.axis('off')
        axis.legend()
    plt.show()
class SemiSup_RandomizedSearchCV(BaseEstimator):
    def __init__(self, estimator, param_distributions, n_iter=100, cv=5, scoring=metrics.accuracy_score, pseudo=True):
        # We initialize our class similar to sklearn randomized search
        self.estimator = estimator
        self.scoring = scoring
        self.pseudo = pseudo
        
        self.transformedtargetestimator = TransformedTargetRegressor(regressor=estimator,
                                                    func=lambda x: x if np.random.rand() > 1/cv else -1,
                                                    inverse_func=lambda x: x, check_inverse=False)
        self.sampler = ParameterSampler(param_distributions, n_iter)
        self.cv_results_ = pd.DataFrame({'mean_test_score': np.empty(shape=[0]),
                                         'std_test_score': np.empty(shape=[0]),
                                         'mean_score_time': np.empty(shape=[0]),
                                         'std_score_time': np.empty(shape=[0]),
                                         'params': None})
        self.folds = KFold(n_splits=cv)
        
    def fit(self, X, y, sample_weight=None):
        for params in self.sampler:
            # Update Parameters
            self.estimator.set_params(**params)
            # Reset Scores
            scores = []
            times = []
            
            for train_index, test_index in self.folds.split(X):
                #Create Semisupervised Sampler
                self.transformedtargetestimator = TransformedTargetRegressor(regressor=self.estimator,
                                                                             func=lambda x: np.where(np.in1d(x.index,train_index),x,-1), 
                                                                             inverse_func=lambda x: x, check_inverse=False)
                #Fit
                if self.pseudo:
                    self.transformedtargetestimator.regressor.pseudo_fit = pseudo_fit.__get__(self.transformedtargetestimator.regressor)
                    self.transformedtargetestimator = self.transformedtargetestimator.regressor.pseudo_fit(X, self.transformedtargetestimator.func(y))
                else:
                    self.transformedtargetestimator.fit(X, y, sample_weight=sample_weight)
                    
                #Score
                score_index = np.in1d(y.index,test_index)
                start = time()
                scores.append(self.scoring(y[score_index], self.transformedtargetestimator.predict(X=X[score_index])))
                times.append(time()-start)
            # DataFrame.append was removed in pandas 2.0; use pd.concat instead
            self.cv_results_ = pd.concat([self.cv_results_,
                                          pd.DataFrame({'mean_test_score': np.mean(scores),
                                                        'std_test_score': np.std(scores),
                                                        'mean_score_time': np.mean(times),
                                                        'std_score_time': np.std(times),
                                                        'params': [params]})],
                                         ignore_index=True)
        self.cv_results_ = self.cv_results_.sort_values('mean_test_score', ascending=False).reset_index(drop=True)
        return self
Example #13
class Regressor(BaseEstimator):
    def __init__(self):
        self.MYReg = TransformedTargetRegressor(
            regressor=RandomForestRegressor(n_estimators=30, max_depth=12),
            func=lambda u: np.log10(np.clip(u, a_min=1, a_max=None)),
            inverse_func=lambda u: np.power(10, u),
            check_inverse=False,
        )

    def fit(self, X, y):
        return self.MYReg.fit(X, y)

    def predict(self, X):
        return self.MYReg.predict(X)
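# A minimal sketch of exercising this wrapper on a skewed, strictly positive
# target (synthetic data; the clip at a_min=1 keeps log10 finite for targets
# below 1):
import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 3)
y = np.exp(3 * X[:, 0] + 0.1 * rng.randn(200))  # heavy-tailed, positive

reg = Regressor()
reg.fit(X, y)
print(reg.predict(X[:5]))  # predictions come back on the original scale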
Example #14
class TransformedTargetRegressorImpl:
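    # `Op` is assumed to be bound elsewhere in the enclosing module,
    # presumably to sklearn.compose.TransformedTargetRegressor.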
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
    def rf_prediction(self):
        """
        uses ensemble (Random Forest) method to predict crab age
        :return:
        """
        logger.info("running Random Forest model")
        X = self.crab_data.drop("age", axis=1)
        y = self.crab_data[["age"]]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=100)
        #
        numerical_features = X_train.dtypes == 'float'
        categorical_features = ~numerical_features
        # I used pipelining so that the predicted values were automatically transformed/scaled back
        preprocess = make_column_transformer(
            (RobustScaler(), numerical_features),
            # `sparse` was renamed to `sparse_output` in scikit-learn 1.2
            (OneHotEncoder(sparse_output=False), categorical_features))
        forest = RandomForestRegressor(n_estimators=5000,
                                       max_depth=20,
                                       min_samples_leaf=2,
                                       min_samples_split=4,
                                       random_state=100)
        f_reg = Pipeline(steps=[('preprocess', preprocess), ('model', forest)])
        f_reg_ttr = TransformedTargetRegressor(regressor=f_reg)
        f_reg_ttr.fit(X_train, y_train)
        s = f_reg_ttr.score(X_test, y_test)
        logger.info("R-squared from Random Forest is: {0}".format(s))
        y_pred = f_reg_ttr.predict(X)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        mae = mean_absolute_error(y, y_pred)
        logger.debug("RandomForest MAE: {0}".format(mae))
        logger.debug("RandomForest RMSE: {0}".format(rmse))
        logger.debug("RandomForest R-squared: {0}".format(s))
        # recreate the original dataset
        crab_df = X.copy()
        crab_df["age"] = pd.Series(y.values.ravel())
        crab_df["age_forest"] = pd.Series(y_pred.ravel())
        crab_df["percentage_difference"] = np.abs(
            np.divide(
                (crab_df["age"] - crab_df["age_forest"]), crab_df["age"]) *
            100)
        crab_df.to_csv("crab_predit_forest.csv", index=False)
        logger.info("Crab data with predicted variables saved: {0}".format(
            "crab_predit_forest.csv"))
        logger.info("Random Forest execution finished")
    def ols_prediction(self):
        """
        uses linear regression after standardising to normal dist
        prints out accuracy metrics and then saves the design matrix with y and predicted y as a csv file
        also creates another column to calculate relative percentage difference between y and predicted y
        :return:
        """
        logger.info("running Linear Regression model")
        crab_df_woo = self.pre_process_data()
        transformer = QuantileTransformer(output_distribution='normal')
        # since I observed that the data was skewed, I decided to transform the continuous variables to normal dist
        reg = linear_model.LinearRegression()
        t_reg = TransformedTargetRegressor(regressor=reg,
                                           transformer=transformer)
        ohe = ce.OneHotEncoder(handle_unknown='ignore',
                               use_cat_names=True,
                               drop_invariant=True)
        crab_df_woo_enc = ohe.fit_transform(crab_df_woo)
        X = crab_df_woo_enc.drop("age", axis=1)
        y = crab_df_woo_enc[["age"]]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=100)
        t_reg.fit(X_train, y_train)
        s = t_reg.score(X_test, y_test)
        logger.info("R-squared from Linear Regression is: {0}".format(s))
        y_pred = t_reg.predict(X)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        mae = mean_absolute_error(y, y_pred)
        logger.debug("Linear Regression MAE: {0}".format(mae))
        logger.debug("Linear Regression RMSE: {0}".format(rmse))
        logger.debug("Linear Regression R-squared: {0}".format(s))

        crab_df = X.copy()
        crab_df["age"] = pd.Series(y.values.ravel())
        crab_df["age_ols"] = pd.Series(y_pred.ravel())
        crab_df['sex'] = crab_df.apply(lambda row: self.reverse_ohe(row),
                                       axis=1)
        crab_df.drop(["sex_I", "sex_M", "sex_F"], axis=1, inplace=True)
        crab_df["percentage_difference"] = np.abs(
            np.divide(
                (crab_df["age"] - crab_df["age_ols"]), crab_df["age"]) * 100)
        crab_df.to_csv("crab_predit_ols.csv", index=False)
        logger.info("Crab data with predicted variables saved: {0}".format(
            "crab_predit_ols.csv"))
        logger.info("Linear Regression execution finished")
Example #17
def create_scatter_df(profile, threshold):
    """
    INPUT:
    profile: profile dataframe
    threshold: Threshold to remove data that can be identified as outliers

    DESCRIPTION: Function to remove outliers from profile dataset and predict the spendings of the customers.
    
    OUTPUT:
    result: DataFrame with predictions and actual values
    mse: The mean squared error of the predictions
    """

    scaler = MinMaxScaler()
    # copy the column selection so the in-place scaling below does not
    # trigger pandas' SettingWithCopyWarning
    prediction_df = profile[[
        "age", "income", "memberdays", "gender_F", "gender_M",
        "overall_spendings"
    ]].copy()
    prediction_df[["age", "income", "memberdays"]] = scaler.fit_transform(
        prediction_df[["age", "income", "memberdays"]])
    prediction_df = prediction_df[
        prediction_df["overall_spendings"] < threshold]

    X = prediction_df.drop("overall_spendings", axis=1)
    y = prediction_df["overall_spendings"]

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                            func=np.log1p,
                                            inverse_func=np.expm1)
    regr_trans.fit(X_train, y_train)
    y_pred = regr_trans.predict(X_test)
    y_test.reset_index(drop=True, inplace=True)

    result = pd.concat([y_test, pd.Series(y_pred)], axis=1)
    result.rename(columns={
        0: "prediction",
        "overall_spendings": "actual_value"
    },
                  inplace=True)
    mse = mean_squared_error(y_test, y_pred)
    return result, mse
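# A hypothetical invocation with a synthetic `profile` frame carrying the
# columns the function expects (values are illustrative only; the module-level
# imports used inside the function, such as MinMaxScaler, train_test_split,
# RidgeCV and mean_squared_error, are assumed to be in place):
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
n = 300
profile = pd.DataFrame({
    "age": rng.randint(18, 80, n),
    "income": rng.randint(30_000, 120_000, n),
    "memberdays": rng.randint(1, 2_000, n),
    "gender_F": rng.randint(0, 2, n),
})
profile["gender_M"] = 1 - profile["gender_F"]
profile["overall_spendings"] = rng.exponential(50, n)

result, mse = create_scatter_df(profile, threshold=200)
print(result.head())
print("MSE:", mse)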
Example #18
def run_transform(X, y, transform, cv_outer=LeaveOneOut(), n_alphas=1000):

    from sklearn.feature_selection import VarianceThreshold
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import Lasso
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import PowerTransformer
    from tqdm import tqdm

    # `find_alpha_range`, `StratifiedKFoldReg` and `LassoPCR` are assumed to
    # be helpers defined elsewhere in the enclosing module.

    y_trans = PowerTransformer(method=transform).fit_transform(
        y[:, None]).flatten()
    alphas = find_alpha_range(X, y_trans, n_alphas=n_alphas)

    list_y_pred = []
    list_y_true = []
    list_models = []

    for train_index, test_index in tqdm(cv_outer.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        list_y_true.append(y_test)

        cv_inner = StratifiedKFoldReg(n_splits=5, shuffle=True, random_state=0)

        lasso_pcr = LassoPCR(scale=False,
                             cv=cv_inner,
                             n_jobs=-1,
                             alphas=alphas,
                             lasso_kws={'max_iter': 1_000_000},
                             scoring="neg_mean_squared_error")

        regr_trans = TransformedTargetRegressor(
            regressor=lasso_pcr,
            transformer=PowerTransformer(method=transform))

        regr_trans.fit(X_train, y_train)
        list_models.append(regr_trans)

        y_pred = regr_trans.predict(X_test)
        list_y_pred.append(y_pred)

    y_pred = np.concatenate(list_y_pred)
    y_true = np.concatenate(list_y_true)

    return y_pred, y_true, list_models
Example #19
def run():
    # -- LOAD FILE -- #
    filename = "Data/JabRef_train.csv"
    df = pd.read_csv(filename, dtype=object)

    df.dropna(inplace=True)

    # # - Set target - #
    y = df["set_clicked"]

    # # --- Label Encoder --- #
    encoder = preprocessing.LabelEncoder()
    df = df.apply(encoder.fit_transform)

    # # -- Get relevant columns -- #
    cor = df.corr()
    cor_target = abs(cor["set_clicked"])
    relevant_features = cor_target[cor_target > 0.01]
    print(relevant_features.index)

    # # -- Normal Distribution, Reduce impact of outliers -- #
    transformer = QuantileTransformer(output_distribution="normal")

    X = df[relevant_features.index]
    X = X.drop(["set_clicked"], 1)

    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=0)
    regr.fit(X_train, y_train)

    y_pred = regr.predict(X_test)
    df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
    print(df)

    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
    y = y_test.to_numpy(dtype=int)
    # f1_score needs discrete labels, so threshold the continuous predictions
    print(f"F1 Score: {f1_score(y, (y_pred > 0.5).astype(int))}")

    df[["Actual", "Predicted"]].to_csv("david_results.csv")
Example #20
def main():
    # Generate random data : create random points, and, keep only a subset of them.
    x = np.linspace(0, 10, 500)
    rng = np.random.RandomState(0)
    rng.shuffle(x)
    x = np.sort(x[:])
    y = f(x)

    # Plot random data.
    plt.plot(x, y, 'o', color='black', markersize=2, label='random data')

    # Create augmented data : add dimensions to initial data in order to fit y as a polynomial of degree 5.
    x_augmented = np.array([x, x**2, x**3, x**4, x**5]).T

    # Polynomial regression : regression on augmented data.
    regrs = []
    regrs.append((linear_model.LinearRegression(), 'polynomial reg'))
    regrs.append((neighbors.KNeighborsRegressor(15), '15-NN reg'))
    for regr in regrs:
        model, lbl = regr[0], regr[1]

        # Scale data to reduce weights.
        # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
        # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
        pipe = Pipeline(
            [('scale', preprocessing.StandardScaler()), ('model', model)]
        )  # Data scaling applied before / after any operator applied to the model.
        treg = TransformedTargetRegressor(
            regressor=pipe, transformer=preprocessing.MinMaxScaler()
        )  # Target scaling applied before / after any operator applied to the model.

        # Train model.
        treg.fit(x_augmented, y)

        # Plot regression.
        plt.plot(x_augmented[:, 0], treg.predict(x_augmented), '-', label=lbl)
    plt.axis('off')
    plt.legend()
    plt.show()
def test_model_finder_predict_X_test_regression(model_finder_regression_fitted,
                                                split_dataset_numerical, limit,
                                                seed):
    """Testing if predictions of X_test split from found models are correct (in regression)."""
    models = [
        SVR(**{
            "C": 0.1,
            "tol": 1.0
        }),
        Ridge(**{
            "alpha": 0.0001,
            "random_state": seed
        }),
        DecisionTreeRegressor(**{
            "max_depth": 10,
            # "mae" was renamed to "absolute_error" in scikit-learn 1.0
            "criterion": "absolute_error",
            "random_state": seed
        }),
    ]
    results = []
    X_train, X_test, y_train, y_test = split_dataset_numerical
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=seed)
    for model in models:
        new_model = TransformedTargetRegressor(regressor=model,
                                               transformer=transformer)
        new_model.fit(X_train, y_train)
        results.append((model, new_model.predict(X_test)))

    expected_results = results[:limit]

    actual_results = model_finder_regression_fitted.predictions_X_test(limit)

    for actual_result, expected_result in zip(actual_results,
                                              expected_results):
        assert str(actual_result[0]) == str(expected_result[0])
        assert np.array_equal(actual_result[1], expected_result[1])
Example #22
# On this plot, we see that for the largest true price values, our model tends
# to under-estimate the price of the house. Typically, this issue arises when
# the target to predict does not follow a normal distribution. In these cases
# the model would benefit from a target transformation.

# %%
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor

model_transformed_target = TransformedTargetRegressor(
    regressor=model,
    transformer=QuantileTransformer(
        n_quantiles=900, output_distribution="normal"
    ),
)
model_transformed_target.fit(X_train, y_train)
y_pred = model_transformed_target.predict(X_test)

plot_predicted_vs_actual(y_test, y_pred, title="House prices in Ames")

# %% [markdown]
# Thus, once we transform the target, we see that the model corrects some of
# the predictions for high values.
#
# ## Summary
# In this notebook, we presented the metrics and plots that are useful for
# evaluating and getting insights about models. We covered both regression
# and classification problems.

# %%
        plt.show()

    elif len(keys) == 2:
        g = sns.FacetGrid(cv_data, col=keys[0])
        g.map(plt.plot, keys[1], "mean_test_score", marker="o")
        [plt.setp(ax.texts, text="") for ax in g.axes.flat]
        g.set_titles(row_template='{row_name}', col_template='{col_name}')
        plt.subplots_adjust(top=0.8)
        g.fig.suptitle('Validation Scores')
        plt.show()
    plt.close()

    ###Final evaluation parameters on train, test and val sets

    #Predicted
    Y_hat_test = full_pipeline.predict(X_test).flatten()
    Y_hat_train = full_pipeline.predict(X_train).flatten()
    #Actual
    Y_vals_test = np.array(Y_test).flatten()
    Y_vals_train = np.array(Y_train).flatten()

    #RMSE
    rmse_test = mean_squared_error(Y_vals_test, Y_hat_test, squared=False)
    rmse_train = mean_squared_error(Y_vals_train, Y_hat_train, squared=False)
    #MAE
    mae_test = mean_absolute_error(Y_vals_test, Y_hat_test)
    mae_train = mean_absolute_error(Y_vals_train, Y_hat_train)
    #R^2
    rsquared_test = r2_score(Y_vals_test, Y_hat_test)
    rsquared_train = r2_score(Y_vals_train, Y_hat_train)
class EMCEB(TadpoleModel):
    """EMC-EB method, Esther Bron - [email protected]

    The `train_df*` attributes contain training data optimized for each variable.

    The `y_train_df*` attributes contain the labels to be used for training by each model,
    thus corresponding to the matching `train_df` DataFrame.

    Attributes:
        diagnosis_model (Pipeline): Model for predicting 'diagnosis' variable
        adas_model (Pipeline): Model for predicting 'ADAS13' variable
        ventricles_model (Pipeline): Model for predicting 'ventricles' variable

        y_diagnosis (pandas.DataFrame): 'Diagnosis' labels
        train_df_diagnosis (pandas.DataFrame): Training data used for 'diagnosis' model.
    """
    def __init__(self, confidence_intervals=True):
        # Note to self, to get parameters out: model.diagnosis_model.named_steps['scaler'].mean_
        self.diagnosis_model = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier',
             svm.SVC(kernel='rbf',
                     C=0.5,
                     gamma='auto',
                     class_weight='balanced',
                     probability=True)),
        ])
        adas_pipeline = Pipeline([('scaler', StandardScaler()),
                                  ('classifier',
                                   svm.SVR(kernel='rbf', C=0.5,
                                           gamma='auto'))])
        self.adas_model = TransformedTargetRegressor(
            regressor=adas_pipeline, transformer=StandardScaler())

        ventricles_pipeline = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('classifier', svm.SVR(kernel='rbf', C=0.5, gamma='auto'))])
        self.ventricles_model = TransformedTargetRegressor(
            regressor=ventricles_pipeline, transformer=StandardScaler())

        self.y_diagnosis = None
        self.y_adas = None
        self.y_ventricles = None

        self.train_df_diagnosis = None
        self.train_df_adas = None
        self.train_df_ventricles = None

        self.confidence_intervals = confidence_intervals

        self.train_df_processed = None
        self.test_df_processed = None

    @staticmethod
    def preprocess(df: pd.DataFrame, is_train_df: bool):
        if not is_train_df:
            # select last row per RID
            df = df.sort_values(by=['EXAMDATE'])
            df = df.groupby('RID').tail(1)
            exam_dates = df['EXAMDATE']

        logger.info("Pre-processing")

        df = df.copy()

        if 'Diagnosis' not in df.columns:
            """We want to transform 'DXCHANGE' (a change in diagnosis, in contrast
            to the previous visits diagnosis) to an actual diagnosis."""
            df = df.replace({'DXCHANGE': {4: 2, 5: 3, 6: 3, 7: 1, 8: 2, 9: 1}})
            df = df.rename(columns={"DXCHANGE": "Diagnosis"})

        # Adds months to age
        if 'Month_bl' in df.columns:
            df['AGE'] += df['Month_bl'] / 12.

        # Remove feature categories based on prior knowledge
        # If month_bl in dataframe, then it is data set D1D2, not D3
        h = list(df)
        if 'Month_bl' in df.columns:
            remove_columns = h[1:8] + [h[9]] + h[14:17] + h[45:47] + h[53:73] + h[74:486] + h[832:838] + h[1172:1174] + \
                h[1657:1667] + h[1895:1902] + h[1905:]
            df: pd.DataFrame = df.drop(remove_columns, axis=1)
        else:
            remove_columns = [h[1]] + h[7:11] + h[20:37]
            df: pd.DataFrame = df.drop(remove_columns, axis=1)
        h = list(df)

        logger.info('Forcing Numeric Values')
        for i in range(5, len(h)):
            if df[h[i]].dtype != 'float64':
                df[h[i]] = pd.to_numeric(df[h[i]], errors='coerce')
        """Sort the DataFrame per patient on age (at time of visit). This allows using observations from
        the next row/visit to be used as a label for the previous row. (See `set_futures` method.)"""
        df = df.sort_values(by=['RID', 'AGE'])

        if 'APOE4' in df.columns:
            df = df.drop(['EXAMDATE', 'PTGENDER', 'PTEDUCAT', 'APOE4'], axis=1)
        else:
            df = df.drop(['EXAMDATE', 'PTGENDER', 'PTEDUCAT'], axis=1)

        # Ventricles_ICV = Ventricles/ICV_bl. So make sure ICV_bl is not zero to avoid division by zero
        icv_bl_median = df['ICV_bl'].median()
        df.loc[df['ICV_bl'] == 0, 'ICV_bl'] = icv_bl_median

        if 'Ventricles_ICV' not in df.columns:
            df["Ventricles_ICV"] = df["Ventricles"].values / df["ICV_bl"].values

        if not is_train_df:
            return df, exam_dates
        else:
            return df

    def set_data(self, train_df, test_df, train, test):
        train_df = self.preprocess(train_df, True)
        test_df, exam_dates = self.preprocess(test_df, False)

        if test == 'd1d2':
            """Select features based on EMCEB_features.csv file"""
            # Drop columns found unimportant by feature importance ranking measure.
            selected_features = pd.read_csv(
                Path(__file__).parent /
                'EMCEB_features.csv')['feature'].values.tolist()
            selected_features = selected_features[0:200]
            selected_features += ['RID', 'Diagnosis', 'Ventricles_ICV', 'AGE']
            # pandas does not accept a set as an indexer, so de-duplicate into a list
            selected_features = list(set(selected_features))
            train_df = train_df.copy()[selected_features]

            test_df = test_df.copy()[selected_features]

        if test == 'd3':

            test_df_copy = test_df.copy()
            percentage = .50
            idx_fewmissing = pd.isnull(test_df).select_dtypes(
                include=['bool']).sum(axis=0) < percentage * test_df.shape[0]
            test_df = test_df.loc[:, idx_fewmissing].copy()

            test_df['RID'] = test_df_copy['RID']
            test_df['Diagnosis'] = test_df_copy['Diagnosis']
            test_df['Ventricles_ICV'] = test_df_copy['Ventricles_ICV']

            train_df = train_df[test_df.columns]

        # Fill nans by older values
        train_df = EMCEB.fill_nans_by_older_values(train_df)
        if (train == 'd1d2') & (test == 'd1d2'):
            test_df_copy = test_df.copy()
            # get test set again from filled train set
            test_df = train_df.groupby('RID').tail(1).copy()
            # select all records where RID is in d4.
            test_df = test_df[test_df['RID'].isin(
                test_df_copy['RID'].unique())]
        else:
            test_df = EMCEB.fill_nans_by_older_values(test_df)

        self.train_df_processed = train_df
        self.test_df_processed = test_df
        self.exam_dates = exam_dates

    @staticmethod
    def set_futures(
            train_df,
            features=['RID', 'Diagnosis', 'ADAS13', 'Ventricles_ICV', 'AGE']):
        """For each feature in `features` argument, generate a `Future_{feature}` column, that is filled
        using the next row for each patient"""

        futures_df = train_df[features].copy()

        # Set future value based on each row's next row, e.g. shift the column one up
        for predictor in ["Diagnosis", "ADAS13"]:
            futures_df["Future_" + predictor] = futures_df[predictor].shift(-1)

        # For Ventricles we predict the change per month rather than the future value
        for predictor in ['Ventricles_ICV']:
            futures_df["Future_" + predictor] = futures_df[predictor].shift(-1)

            Change_Ventricles_ICV = futures_df[predictor].shift(
                -1) - futures_df[predictor]
            Change_Age = futures_df['AGE'].shift(-1) - futures_df['AGE']
            Change_Age[Change_Age == 0] = np.nan
            futures_df["ChangePerMonth_" +
                       predictor] = Change_Ventricles_ICV / Change_Age / 12

        # Drop each last row per patient
        futures_df = futures_df.drop(
            futures_df.groupby('RID').tail(1).index.values)
        return futures_df

    @staticmethod
    def fill_nans_by_older_values(train_df):
        """Fill nans in feature matrix by older values (ffill), then by newer (bfill)"""

        # GroupBy.fillna(method=...) is deprecated; use ffill()/bfill() directly
        df_filled_nans = train_df.groupby('RID').ffill()
        train_df[df_filled_nans.columns] = df_filled_nans
        df_filled_nans = train_df.groupby('RID').bfill()
        train_df[df_filled_nans.columns] = df_filled_nans

        return train_df

    def train(self):

        assert self.train_df_processed is not None, "Data is not yet set. Use set_data to set data first"

        train_df = self.train_df_processed
        futures = self.set_futures(train_df)

        # Not part of `preprocess` because it's needed for the futures.
        train_df = train_df.drop(['RID', 'AGE'], axis=1)

        # Fill nans by mean of training set
        self.train_df_mean = train_df.mean()
        train_df = train_df.fillna(self.train_df_mean)

        # Fill left over nans with 0
        train_df = train_df.fillna(0)

        def non_nan_y(_train_df, _y_df):
            """Drops all rows with a `y` value that is NaN

            Returns:
                Tuple containing (`train_df`, `y_df`), without NaNs for `y_df`.
            """

            # indices where the y value is not nan
            not_nan_idx = _y_df[_y_df.notna()].index

            # return from both the train dataframe and y the records with these indices
            return _train_df.loc[not_nan_idx], _y_df[not_nan_idx]

        self.train_df_diagnosis, self.y_diagnosis = non_nan_y(
            train_df, futures['Future_Diagnosis'])
        self.train_df_adas, self.y_adas = non_nan_y(train_df,
                                                    futures['Future_ADAS13'])
        self.train_df_ventricles, self.y_ventricles = non_nan_y(
            train_df, futures['ChangePerMonth_Ventricles_ICV'])

        logger.info("Training models")
        self.diagnosis_model.fit(self.train_df_diagnosis, self.y_diagnosis)
        self.adas_model.fit(self.train_df_adas, self.y_adas)
        self.ventricles_model.fit(self.train_df_ventricles, self.y_ventricles)

    def predict(self):

        assert self.test_df_processed is not None, "Data is not yet set. Use set_data to set data first"

        logger.info("Predicting")
        # test_df = self.preprocess(test_series.to_frame().T)

        test_df = self.test_df_processed
        rids = test_df['RID']
        test_df = test_df.drop(['RID', 'AGE'], axis=1)

        # Fill nans by mean of training set
        test_df = test_df.fillna(self.train_df_mean)
        test_df = test_df.fillna(0)

        diag_probas = self.diagnosis_model.predict_proba(test_df)
        adas_prediction = self.adas_model.predict(test_df)
        ventricles_change_prediction = self.ventricles_model.predict(test_df)

        if self.confidence_intervals:
            logger.info("Bootstrap adas")
            adas_ci = bootstrap(self.adas_model, self.train_df_adas,
                                self.y_adas, test_df)

            logger.info("Bootstrap ventricles")
            ventricles_ci = bootstrap(self.ventricles_model,
                                      self.train_df_ventricles,
                                      self.y_ventricles, test_df)
        else:
            adas_ci = ventricles_ci = 0

        def add_months_to_str_date(strdate, months=1):
            return (datetime.strptime(strdate, '%Y-%m-%d') +
                    relativedelta(months=months)).strftime('%Y-%m-%d')

        df = pd.DataFrame.from_dict({
            'RID':
            rids,
            'month':
            1,
            'Forecast Date':
            list(
                map(lambda x: add_months_to_str_date(x, 1),
                    self.exam_dates.tolist())),
            'CN relative probability':
            diag_probas.T[0],
            'MCI relative probability':
            diag_probas.T[1],
            'AD relative probability':
            diag_probas.T[2],
            'ADAS13':
            adas_prediction,
            'ADAS13 50% CI lower':
            adas_prediction - adas_ci,
            'ADAS13 50% CI upper':
            adas_prediction + adas_ci,
            'Ventricles_ICV':
            test_df['Ventricles_ICV'] + ventricles_change_prediction,
            'Ventricles_ICV 50% CI lower':
            test_df['Ventricles_ICV'] + ventricles_change_prediction -
            ventricles_ci,
            'Ventricles_ICV 50% CI upper':
            test_df['Ventricles_ICV'] + ventricles_change_prediction +
            ventricles_ci,
        })

        # copy each row for each month (DataFrame.append was removed in
        # pandas 2.0, so collect the per-month frames and concatenate once)
        monthly_dfs = [df]
        for i in range(2, 12 * 10):
            df_copy = df.copy()
            df_copy['month'] = i
            df_copy['Forecast Date'] = df_copy['Forecast Date'].map(
                lambda x: add_months_to_str_date(x, i - 1))

            df_copy['Ventricles_ICV'] = test_df[
                'Ventricles_ICV'] + ventricles_change_prediction * df_copy[
                    'month']
            df_copy['Ventricles_ICV 50% CI lower'] = test_df[
                'Ventricles_ICV'] + (ventricles_change_prediction -
                                     ventricles_ci) * df_copy['month']
            df_copy['Ventricles_ICV 50% CI upper'] = test_df[
                'Ventricles_ICV'] + (ventricles_change_prediction +
                                     ventricles_ci) * df_copy['month']
            monthly_dfs.append(df_copy)

        return pd.concat(monthly_dfs)
Example #25
      RFE(estimator=svm.SVR(kernel='linear', C=1, cache_size=500), 
        step=0.33, n_features_to_select=100),
      svm.SVR(kernel='linear', C=1, cache_size=500)
    )
    regression = TransformedTargetRegressor(regressor=regression, transformer=StandardScaler())

    print('Computing 5-fold cross validation')
    cross_val_scores = cross_val_score(regression, X_train, y_train, 
        cv=5, verbose=1, scoring='neg_mean_absolute_error')
    print(cross_val_scores)

    print('Fitting model to training data')
    regression.fit(X_train, y_train)

    print('Running model on test data')
    predicted = regression.predict(X_test)
    print('SVM model predictions: {}'.format(predicted))
    print('SVM model MAE on test data: {}'.format(mean_absolute_error(y_test, predicted)))

    avg_test_prediction = sum(predicted)/len(predicted)
    print('SVM model average predicted accuracy: {}'.format(avg_test_prediction))

    plot_test_data(predicted, y_test)

    print('\nRunning model on challenge data sets\n')
    challenge_predictions = []

    aave_file = os.path.join(THIS_FOLDER, AAVE_DATA)
    twitter_aave_data = read_data(aave_file, 5, tab_separated=True)
    print('Read {} AAVE tweets'.format(len(twitter_aave_data)))
#
# On this plot, we see that for the largest true price values, our model tends
# to under-estimate the price of the house. Typically, this issue arises when
# the target to predict does not follow a normal distribution. In this case
# the model would benefit from a target transformation.

# %%
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor

transformer = QuantileTransformer(n_quantiles=900,
                                  output_distribution="normal")
model_transformed_target = TransformedTargetRegressor(regressor=regressor,
                                                      transformer=transformer)
model_transformed_target.fit(data_train, target_train)
target_predicted = model_transformed_target.predict(data_test)

# %%
predicted_actual = {
    "True values (k$)": target_test,
    "Predicted values (k$)": target_predicted
}
predicted_actual = pd.DataFrame(predicted_actual)

# %%
sns.scatterplot(data=predicted_actual,
                x="True values (k$)",
                y="Predicted values (k$)")
plt.axline((0, 0), slope=1, color="tab:orange", label="Perfect fit")
plt.axis('square')
plt.legend()
def main():
    # Read raw data.
    # https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality
    raw_data = pd.read_csv('winequality-white.csv', sep=';')
    print('raw_data :\n', raw_data.head())

    # Extract data from dataset.
    x = raw_data[raw_data.columns[:-1]].values  # Dataset: variables.
    y = raw_data['quality'].values  # Dataset: labels.
    print('x :\n', x[:5])
    print('y :\n', y[:5])

    # Split data set into training set and testing set.
    # https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4020631-exploitez-votre-jeu-de-donnees
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # Change the hyperparameters of the model to find the best one, compare different models (with/without regularization).
    models = []
    models.append((kernel_ridge.KernelRidge(kernel='rbf'), 'reg ridge rbf'
                   ))  # We use a gaussian kernel: 'rbf' radial basis function.
    for idx_model, model_lbl in enumerate(models):
        model, lbl = model_lbl[0], model_lbl[1]

        # Train a model.
        best_rmse, best_g, best_a = float('inf'), 0, 0
        worst_rmse, worst_g, worst_a = 0, 0, 0
        all_g, all_a, all_rmse = [], [], []
        for g in np.logspace(-2, 2,
                             6):  # g coefficient between 10^-2 and 10^2.
            # Set parameter model.
            model.set_params(gamma=g)
            for a in np.logspace(-2, 2,
                                 6):  # a coefficient between 10^-2 and 10^2.
                # Set parameter model.
                model.set_params(alpha=a)

                # Scale data to reduce weights.
                # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
                # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
                pipe = Pipeline(
                    [('scale', preprocessing.StandardScaler()),
                     ('model', model)]
                )  # Data scaling applied before / after any operator applied to the model.
                treg = TransformedTargetRegressor(
                    regressor=pipe, transformer=preprocessing.MinMaxScaler()
                )  # Target scaling applied before / after any operator applied to the model.

                # Feed the model.
                treg.fit(x_train, y_train)
                # Get prediction for positive value
                y_prob = treg.predict(x_test)

                # Compute root mean square error.
                rmse = np.sqrt(mean_squared_error(y_test, y_prob))

                # Save best and worst models.
                all_g.append(g)
                all_a.append(a)
                all_rmse.append(rmse)
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_g = g
                    best_a = a
                if rmse > worst_rmse:
                    worst_rmse = rmse
                    worst_g = g
                    worst_a = a

        # Plot random binary classifier.
        axis = plt.subplot(1, 2, idx_model + 1, projection='3d')
        axis.set_xlabel('gamma')
        axis.set_ylabel('alpha')
        axis.set_zlabel('rms error')
        axis.scatter3D(all_g, all_a, all_rmse)

        # Get the best and worst model.
        axis = plt.subplot(1, 2, idx_model + 2)
        for g, a in zip([best_g, worst_g], [best_a, worst_a]):
            model.set_params(gamma=g)
            model.set_params(alpha=a)

            # Scale data to reduce weights.
            # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
            # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
            pipe = Pipeline(
                [('scale', preprocessing.StandardScaler()), ('model', model)]
            )  # Data scaling applied before / after any operator applied to the model.
            treg = TransformedTargetRegressor(
                regressor=pipe, transformer=preprocessing.MinMaxScaler()
            )  # Target scaling applied before / after any operator applied to the model.

            # Feed the model.
            treg.fit(x_train, y_train)
            # Get prediction for positive value
            y_prob = treg.predict(x_test)

            # Compute root mean square error.
            rmse = np.sqrt(mean_squared_error(y_test, y_prob))

            # Plot true versus predicted score (marker size = number of pairs true/predicted = the bigger, the better).
            sizes = {}
            for (yt, yp) in zip(list(y_test), list(y_prob)):
                if (yt, yp) in sizes.keys():
                    sizes[(yt, yp)] += 1
                else:
                    sizes[(yt, yp)] = 1
            keys = sizes.keys()
            axis.scatter(
                [k[0] for k in keys],
                [k[1] for k in keys],
                s=[
                    sizes[k] for k in keys
                ],  # marker size = number of pairs (true, predicted) = the bigger, the better.
                label='alpha %08.3f - gamma %08.3f - RMSE = %0.5f' %
                (a, g, rmse))
            axis.set_xlabel('True score')
            axis.set_ylabel('Predicted score')
            axis.set_title('best kernel Ridge Regression')
            axis.legend()
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.3,
                        hspace=0.3)
    plt.show()
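
    # A sketch of an alternative to the manual grid loop above: because
    # TransformedTargetRegressor exposes nested parameters as
    # regressor__model__<param>, GridSearchCV can tune alpha and gamma
    # directly. This assumes the pipe/treg construction above and a kernel
    # ridge model exposing 'alpha' and 'gamma'.
    from sklearn.model_selection import GridSearchCV

    param_grid = {
        'regressor__model__alpha': np.logspace(-3, 3, 7),
        'regressor__model__gamma': np.logspace(-3, 3, 7),
    }
    search = GridSearchCV(treg, param_grid,
                          scoring='neg_mean_squared_error', cv=5)
    search.fit(x_train, y_train)
    print('best params:', search.best_params_)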
Beispiel #28
0
def main():
    # Read raw data.
    # https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/Parcours_data_scientist/entrainez-un-modele-predictif-lineaire/TP_1_prostate_dataset.txt
    # https://rafalab.github.io/pages/649/prostate.html
    raw_data = pd.read_csv('prostate_dataset.txt', delimiter='\t')
    print('raw_data :\n', raw_data.head())

    # Extract data from dataset.
    x = raw_data[raw_data.columns[1:-3]].values  # Dataset: variables.
    y = raw_data['lpsa'].values  # Dataset: labels.
    print('x :\n', x[:5])
    print('y :\n', y[:5])

    # Split data set into training set and testing set.
    # https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4020631-exploitez-votre-jeu-de-donnees
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Vary the alpha hyperparameter to find the best value and compare models (with/without regularization).
    # https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4022441-entrainez-votre-premier-k-nn
    # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507806-reduisez-le-nombre-de-variables-utilisees-par-votre-modele
    n_alphas = 100
    alphas = np.logspace(-5, 5, n_alphas)  # alphas between 10^-5 and 10^5.
    models = []
    # Baseline to compare against.
    models.append((linear_model.LinearRegression(), 'linear reg'))
    # Compared to LinearRegression: Ridge shrinks the weights.
    models.append((linear_model.Ridge(), 'ridge'))
    # Compared to LinearRegression: Lasso can zero out some weights.
    models.append((linear_model.Lasso(fit_intercept=False), 'lasso'))
    # Mixes the Ridge and Lasso penalties (weighted by l1_ratio).
    models.append((linear_model.ElasticNet(), 'elastic net'))
    error_min, best_alpha, best_model = float('inf'), 0, ''
    _, all_axis = plt.subplots(2, 4)
    for model_lbl in models:
        model, lbl = model_lbl[0], model_lbl[1]

        # Change the alpha hyperparameter.
        coefs, errors = [], []
        for a in alphas:
            if 'alpha' in model.get_params():  # LinearRegression has no alpha.
                model.set_params(alpha=a)

            # Scale data to reduce weights.
            # https://openclassrooms.com/fr/courses/4444646-entrainez-un-modele-predictif-lineaire/4507801-reduisez-l-amplitude-des-poids-affectes-a-vos-variables
            # https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308246-tp-selectionnez-le-nombre-de-voisins-dans-un-knn
            pipe = Pipeline(
                [('scale', preprocessing.StandardScaler()), ('model', model)]
            )  # Scale the features before fitting the model.
            treg = TransformedTargetRegressor(
                regressor=pipe, transformer=preprocessing.MinMaxScaler()
            )  # Scale the target before fitting; inverse-transform predictions.

            # Train the model.
            treg.fit(x_train, y_train)
            # LinearRegression will always have the same coefs and error.
            coefs.append(treg.regressor_['model'].coef_)
            errors.append(np.mean((treg.predict(x_test) - y_test) ** 2))
        # Plot errors.
        axis = all_axis.ravel()[0]
        axis.plot(alphas, errors, label=lbl)
        axis.set_xscale('log')
        axis.set_xlabel('alpha')
        axis.set_ylabel('errors')
        axis.legend()
        # Save best model / alpha.
        if np.min(errors) < error_min:
            error_min = np.min(errors)
            best_alpha = alphas[np.argmin(errors)]
            best_model = lbl
        # Plot weights.
        nb_coefs = np.shape(coefs)[1]
        for c in range(nb_coefs):
            axis = all_axis.ravel()[c + 1]
            coef = np.array(coefs)[:, c]
            axis.plot(alphas, coef, label=lbl)
            axis.set_xscale('log')
            axis.set_xlabel('alpha')
            axis.set_ylabel('weights_' + str(c) + ': ' +
                            raw_data.columns[1 + c])
            axis.legend()
    for i in range(8):
        axis = all_axis.ravel()[i]
        axis.axvline(best_alpha,
                     label='best: ' + best_model,
                     color='k',
                     ls='--')
        axis.legend()
    plt.subplots_adjust(left=0.1,
                        bottom=0.1,
                        right=0.9,
                        top=0.9,
                        wspace=0.3,
                        hspace=0.3)
    plt.show()
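
    # A minimal sketch (reusing the variables above) of what
    # TransformedTargetRegressor does internally with a MinMaxScaler target
    # transformer: fit the scaler on the training targets, train on the
    # scaled targets, then inverse-transform the predictions.
    scaler = preprocessing.MinMaxScaler()
    y_scaled = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
    ridge_pipe = Pipeline([('scale', preprocessing.StandardScaler()),
                           ('model', linear_model.Ridge(alpha=best_alpha))])
    ridge_pipe.fit(x_train, y_scaled)
    y_back = scaler.inverse_transform(
        ridge_pipe.predict(x_test).reshape(-1, 1)).ravel()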
Beispiel #29
0
class WrappedModelRegression:
    """Wrapper for Models in Regression problems.

    Models get wrapped with TransformedTargetRegressor to transform y target before predictions on X features take
    place. Wrapper additionally customizes __name__, __class__ and __str__ methods/attributes to return those values
    from main Model (not TransformedTargetRegressor).

    Attributes:
        clf (sklearn.compose.TransformedTargetRegressor): Wrapped model for regression problems
    """
    def __init__(self, regressor, transformer):
        """Create WrappedModelRegression object.

        Override __name__ and __class__ attributes with appropriate attributes from regressor.

        Args:
            regressor (sklearn.Model): Model used to predict regression target
            transformer (sklearn.Transformer): Transformer used to transform y (target)
        """
        self.clf = TransformedTargetRegressor(regressor=regressor,
                                              transformer=transformer)
        self.__name__ = self.clf.regressor.__class__.__name__
        self.__class__ = self.clf.regressor.__class__

    def fit(self, *args, **kwargs):
        """Fit Model in clf attribute with provided arguments.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            self
        """
        self.clf.fit(*args, **kwargs)
        return self

    def predict(self, *args, **kwargs):
        """Predict provided arguments with Model in clf attribute.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            numpy.ndarray: predictions
        """
        return self.clf.predict(*args, **kwargs)

    def get_params(self, *args, **kwargs):
        """Return params of regressor inside wrapped clf Model.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            dict: params of regressor
        """
        return self.clf.regressor.get_params(*args, **kwargs)

    def __str__(self):
        """Return __str__method of regressor inside wrapped clf Model.

        Returns:
            str: __str__ method of regressor
        """
        return self.clf.regressor.__str__()

    def __class__(self, *args, **kwargs):
        """Return new object of regressor class instantiated with *args and **kwargs arguments.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            regressor: new regressor object

        """
        return self.clf.regressor.__class__(*args, **kwargs)
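
# A minimal usage sketch of WrappedModelRegression (hypothetical demo data from
# make_regression; Ridge and QuantileTransformer are illustrative choices, not
# mandated by the class). The wrapper reports the underlying model's identity
# while transforming the target around fit/predict.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import QuantileTransformer

X_demo, y_demo = make_regression(n_samples=200, n_features=5, random_state=0)
wrapped = WrappedModelRegression(
    regressor=Ridge(alpha=1.0),
    transformer=QuantileTransformer(n_quantiles=50,
                                    output_distribution='normal'))
wrapped.fit(X_demo, y_demo)
preds = wrapped.predict(X_demo)
print(wrapped.__name__)  # prints 'Ridge', taken from the underlying regressor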
Beispiel #31
0
def plot_transformed_target():
    """
    ======================================================
    Effect of transforming the targets in regression model
    ======================================================

    In this example, we give an overview of the
    :class:`sklearn.compose.TransformedTargetRegressor`. Two examples
    illustrate the benefit of transforming the targets before learning a linear
    regression model. The first example uses synthetic data while the second
    example is based on the Boston housing data set.

    """

    # Author: Guillaume Lemaitre <*****@*****.**>
    # License: BSD 3 clause

    import numpy as np
    import matplotlib
    import matplotlib.pyplot as plt
    from distutils.version import LooseVersion

    print(__doc__)

    ###############################################################################
    # Synthetic example
    ###############################################################################

    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import RidgeCV
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.metrics import median_absolute_error, r2_score

    # `normed` is being deprecated in favor of `density` in histograms
    if LooseVersion(matplotlib.__version__) >= '2.1':
        density_param = {'density': True}
    else:
        density_param = {'normed': True}

    ###############################################################################
    # A synthetic random regression problem is generated. The targets ``y`` are
    # modified by: (i) translating all targets such that all entries are
    # non-negative and (ii) applying an exponential function to obtain non-linear
    # targets which cannot be fitted using a simple linear model.
    #
    # Therefore, a logarithmic (`np.log1p`) and an exponential function
    # (`np.expm1`) will be used to transform the targets before training a linear
    # regression model and using it for prediction.

    X, y = make_regression(n_samples=10000, noise=100, random_state=0)
    y = np.exp((y + abs(y.min())) / 200)
    y_trans = np.log1p(y)

    ###############################################################################
    # The following illustrates the probability density function of the target
    # before and after applying the logarithmic function.

    f, (ax0, ax1) = plt.subplots(1, 2)

    ax0.hist(y, bins=100, **density_param)
    ax0.set_xlim([0, 2000])
    ax0.set_ylabel('Probability')
    ax0.set_xlabel('Target')
    ax0.set_title('Target distribution')

    ax1.hist(y_trans, bins=100, **density_param)
    ax1.set_ylabel('Probability')
    ax1.set_xlabel('Target')
    ax1.set_title('Transformed target distribution')

    f.suptitle("Synthetic data", y=0.035)
    f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ###############################################################################
    # First, a linear model is fitted on the original targets. Due to the
    # non-linearity, the trained model will not be accurate at prediction
    # time. Subsequently, a logarithmic function is used to linearize the
    # targets, enabling better prediction even with a similar linear model,
    # as reported by the median absolute error (MAE).

    f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

    regr = RidgeCV()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    ax0.scatter(y_test, y_pred)
    ax0.plot([0, 2000], [0, 2000], '--k')
    ax0.set_ylabel('Target predicted')
    ax0.set_xlabel('True Target')
    ax0.set_title('Ridge regression \n without target transformation')
    ax0.text(
        100, 1750, r'$R^2$=%.2f, MAE=%.2f' %
        (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
    ax0.set_xlim([0, 2000])
    ax0.set_ylim([0, 2000])

    regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                            func=np.log1p,
                                            inverse_func=np.expm1)
    regr_trans.fit(X_train, y_train)
    y_pred = regr_trans.predict(X_test)

    ax1.scatter(y_test, y_pred)
    ax1.plot([0, 2000], [0, 2000], '--k')
    ax1.set_ylabel('Target predicted')
    ax1.set_xlabel('True Target')
    ax1.set_title('Ridge regression \n with target transformation')
    ax1.text(
        100, 1750, r'$R^2$=%.2f, MAE=%.2f' %
        (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
    ax1.set_xlim([0, 2000])
    ax1.set_ylim([0, 2000])

    f.suptitle("Synthetic data", y=0.035)
    f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

    ###############################################################################
    # Real-world data set
    ###############################################################################

    ###############################################################################
    # In a similar manner, the Boston housing data set is used to show the
    # impact of transforming the targets before learning a model. In this
    # example, the target to be predicted corresponds to the weighted
    # distances to five Boston employment centers.

    from sklearn.datasets import load_boston
    from sklearn.preprocessing import QuantileTransformer, quantile_transform

    dataset = load_boston()
    target = np.array(dataset.feature_names) == "DIS"
    X = dataset.data[:, np.logical_not(target)]
    y = dataset.data[:, target].squeeze()
    y_trans = quantile_transform(dataset.data[:, target],
                                 output_distribution='normal').squeeze()

    ###############################################################################
    # A :class:`sklearn.preprocessing.QuantileTransformer` is used such that
    # the target follows a normal distribution before applying a
    # :class:`sklearn.linear_model.RidgeCV` model.

    f, (ax0, ax1) = plt.subplots(1, 2)

    ax0.hist(y, bins=100, **density_param)
    ax0.set_ylabel('Probability')
    ax0.set_xlabel('Target')
    ax0.set_title('Target distribution')

    ax1.hist(y_trans, bins=100, **density_param)
    ax1.set_ylabel('Probability')
    ax1.set_xlabel('Target')
    ax1.set_title('Transformed target distribution')

    f.suptitle("Boston housing data: distance to employment centers", y=0.035)
    f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    ###############################################################################
    # The effect of the transformer is weaker than on the synthetic data.
    # However, the transformation still yields a decrease in the MAE.

    f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

    regr = RidgeCV()
    regr.fit(X_train, y_train)
    y_pred = regr.predict(X_test)

    ax0.scatter(y_test, y_pred)
    ax0.plot([0, 10], [0, 10], '--k')
    ax0.set_ylabel('Target predicted')
    ax0.set_xlabel('True Target')
    ax0.set_title('Ridge regression \n without target transformation')
    ax0.text(
        1, 9, r'$R^2$=%.2f, MAE=%.2f' %
        (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
    ax0.set_xlim([0, 10])
    ax0.set_ylim([0, 10])

    regr_trans = TransformedTargetRegressor(
        regressor=RidgeCV(),
        transformer=QuantileTransformer(output_distribution='normal'))

    regr_trans.fit(X_train, y_train)
    y_pred = regr_trans.predict(X_test)

    ax1.scatter(y_test, y_pred)
    ax1.plot([0, 10], [0, 10], '--k')
    ax1.set_ylabel('Target predicted')
    ax1.set_xlabel('True Target')
    ax1.set_title('Ridge regression \n with target transformation')
    ax1.text(
        1, 9, r'$R^2$=%.2f, MAE=%.2f' %
        (r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
    ax1.set_xlim([0, 10])
    ax1.set_ylim([0, 10])

    f.suptitle("Boston housing data: distance to employment centers", y=0.035)
    f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

    plt.show()
Beispiel #32
0
def runmodels():
    # load the data
    dfTrain = pd.read_csv('train.csv', low_memory=False)
    dfTest = pd.read_csv('test.csv', low_memory=False)
    dfStore = pd.read_csv("store.csv", low_memory=False)

    # dropping the zero sales and closed stores
    dfTrain = dfTrain[(dfTrain.Open != 0) & (dfTrain.Sales != 0)]

    sales, holidays = prophetData(dfTrain)

    # filling the NaN values in CompetitionDistance col
    dfStore.CompetitionDistance.fillna(dfStore.CompetitionDistance.median(),
                                       inplace=True)

    # replace all the other NaN values with zeros
    dfStore.fillna(0, inplace=True)

    # fill the missing values
    dfTest.fillna(1, inplace=True)

    # merge train and test dataset with store data
    dfTrainStore = merge(dfTrain, dfStore)
    dfTestStore = merge(dfTest, dfStore)

    # Set the target column
    Y = dfTrainStore['Sales']
    Id = dfTestStore['Id']

    # remove dataset specific columns
    dfTrainStore = dfTrainStore.drop(['Customers', 'Sales'], axis=1)
    dfTestStore = dfTestStore.drop(['Id'], axis=1)

    # split the data into a training set and a validation set
    xTrain, xTrainTest, yTrain, yTrainTest = train_test_split(dfTrainStore,
                                                              Y,
                                                              test_size=0.20,
                                                              random_state=42)

    pipe = Pipeline(steps=[('multipleTrans', multipleTransformer()),
                           ('randomForest',
                            RandomForestRegressor(n_estimators=128,
                                                  criterion='mse',
                                                  max_depth=20,
                                                  min_samples_split=10,
                                                  min_samples_leaf=1,
                                                  min_weight_fraction_leaf=0.0,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  bootstrap=True,
                                                  oob_score=False,
                                                  n_jobs=4,
                                                  random_state=35,
                                                  verbose=0,
                                                  warm_start=False))])
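
    # Note: targetTransform / reverseTargetTransform below (like
    # multipleTransformer above) are project-specific helpers defined
    # elsewhere in this project; their bodies are not shown in this snippet.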

    regModel = TransformedTargetRegressor(regressor=pipe,
                                          func=targetTransform,
                                          inverse_func=reverseTargetTransform)

    # training the Regression Model
    regModel.fit(xTrain, yTrain)

    # Regression Model prediction
    yPred = regModel.predict(xTrainTest)

    # predict on the testStore set
    predictions = regModel.predict(dfTestStore)

    # turn the predictions into a dataframe
    dfPreds = pd.DataFrame({'Id': Id, 'Sales': predictions})

    # training the prophet Model
    pModel = Prophet(interval_width=0.5, holidays=holidays)
    pModel.fit(sales)

    # dataframe that extends into future 6 weeks
    future_dates = pModel.make_future_dataframe(periods=6 * 7)

    # prophet model predictions
    forecast = pModel.predict(future_dates)

    # isolate the prediction columns ('ds' = date, 'yhat' = forecast)
    fc = forecast[['ds', 'yhat']]

    # get the current time and turn it into a string
    now = datetime.datetime.now().strftime('%d-%m-%Y-%H-%M-%S-%f')[:-3]

    # Save the model
    # filenameReg = 'regModel-' + now + '.pkl'
    # filenamePro = 'pModel-' + now + '.pkl'
    # pickle.dump(regModel, open(filenameReg, 'wb'))
    # pickle.dump(pModel, open(filenamePro, 'wb'))

    return render_template('model.html',
                           labels=dfPreds['Id'],
                           values=dfPreds['Sales'],
                           linelabels=fc['ds'],
                           linevalues=fc['yhat'])