Example #1
def lgbm_insight_er():
    """Return 5-fold cross validation scores r2, mae, rmse"""
    steps = [('scaler', t.MyScaler(dont_scale='for_profit')),
             ('knn', t.KNNKeepDf())]
    pipe = Pipeline(steps)
    pipe.fit(X_raw_er)
    X = pipe.transform(X_raw_er)

    # Run once to find good hyperparameters; left commented out after tuning
    # params = {
    #     'max_bin': [10, 20, 50, 100, 255],
    #     'num_leaves': [5, 10, 31, 50],
    #     'min_data_in_leaf': [10, 20, 30],
    #     'bagging_fraction': [.1, .3, .5, .7, 1]
    # }

    # lgb_q = LGBMRegressor(objective='quantile')

    # gs = RandomizedSearchCV(lgb_q, params,
    #                         scoring=['r2', 'neg_mean_squared_error',
    #                                  'neg_mean_absolute_error'],
    #                         refit='neg_mean_squared_error'
    #                         )
    # gs.fit(X, y_er)

    lgbm = LGBMRegressor(num_leaves=50,
                         max_bin=100,
                         bagging_fraction=0.1,
                         objective='quantile')

    cv_results = cross_validate(
        lgbm,
        X.to_numpy(),
        y_er,
        scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'],
        return_train_score=True)

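    # Aggregate per-fold scores: sklearn negates error metrics, so take
    # abs(); RMSE is the square root of each fold's MSE, then the mean.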
    output = pd.DataFrame(
        {
            'train_r2': [cv_results['train_r2'].mean()],
            'train_rmse': [
                np.mean([
                    np.sqrt(abs(i))
                    for i in cv_results['train_neg_mean_squared_error']
                ])
            ],
            'train_mae':
            [abs(cv_results['train_neg_mean_absolute_error'].mean())],
            'test_r2': [cv_results['test_r2'].mean()],
            'test_rmse': [
                np.mean([
                    np.sqrt(abs(i))
                    for i in cv_results['test_neg_mean_squared_error']
                ])
            ],
            'test_mae':
            [abs(cv_results['test_neg_mean_absolute_error'].mean())]
        },
        index=['LGBM'])
    return output
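For context, the snippets on this page assume a shared preamble along these lines. This is a sketch: the module names behind the t and defs aliases, plus the helpers get_train_test and sim_miss, are project-local assumptions.

import copy

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (RandomizedSearchCV, ShuffleSplit,
                                     cross_validate)
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

import definitions as defs  # assumed home of dummy_vars, sparse_vars
import transformers as t    # assumed home of MyScaler and KNNKeepDf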
Example #2
def r_out(df, x_vars, y_var, csv_label):
    def to_r(x, y, csv_label):
        """Merge y and X and export a CSV to data/processed for use in R.

        Use after scaling/imputing."""
        # to_csv writes in place and returns None, so nothing to return
        pd.concat([y, x], axis=1).to_csv(
            'data/processed/' + csv_label + '.csv', index=False)

    X_raw, y = get_train_test(df, x_vars, y_var)

    steps = [('scaler', t.MyScaler(dont_scale=['for_profit'])),
             ('knn', t.KNNKeepDf())]
    pipe = Pipeline(steps)

    pipe.fit(X_raw)
    X = pipe.transform(X_raw)

    to_r(X, y, csv_label)
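A hypothetical call, with the CSV label chosen purely for illustration:

r_out(complete_df, defs.sparse_vars, 'would_recommend', 'wr_scaled')
# -> writes data/processed/wr_scaled.csv with the target as the first column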
Example #3
def lr_insight_wr():
    """Return 5-fold cross validation scores r2, mae, rmse"""
    steps = [('scaler', t.MyScaler(dont_scale='for_profit')),
             ('knn', t.KNNKeepDf())]
    pipe = Pipeline(steps)
    pipe.fit(X_raw)
    X = pipe.transform(X_raw)

    # cross_validate clones and fits the estimator per fold, so no
    # separate fit is needed beforehand
    lr = LinearRegression()
    cv_results = cross_validate(
        lr,
        X,
        y,
        scoring=['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'],
        return_train_score=True)
    output = pd.DataFrame(
        {
            'train_r2': [cv_results['train_r2'].mean()],
            'train_rmse': [
                np.mean([
                    np.sqrt(abs(i))
                    for i in cv_results['train_neg_mean_squared_error']
                ])
            ],
            'train_mae':
            [abs(cv_results['train_neg_mean_absolute_error'].mean())],
            'test_r2': [cv_results['test_r2'].mean()],
            'test_rmse': [
                np.mean([
                    np.sqrt(abs(i))
                    for i in cv_results['test_neg_mean_squared_error']
                ])
            ],
            'test_mae':
            [abs(cv_results['test_neg_mean_absolute_error'].mean())]
        },
        index=['LR'])
    return output
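Since each insight function returns a one-row DataFrame indexed by model name, the results stack cleanly for side-by-side reading; a sketch (note the two rows score different targets, would_recommend vs RATING_EST):

scores = pd.concat([lr_insight_wr(), lgbm_insight_er()])
print(scores.round(3))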
Example #4
def resample_kfold(X, y, sparse_df, model, folds=5, prefix='name_'):
    # Monte Carlo CV: ShuffleSplit draws `folds` independent random
    # train/test splits (unlike KFold, test sets may overlap)
    splitter = ShuffleSplit(n_splits=folds, test_size=(1 / folds))

    # Set up history/scoring lists
    model_history = []
    train_r2 = []
    train_mse = []
    train_mae = []

    test_r2 = []
    test_mse = []
    test_mae = []
    # Execute the resampling loop
    for train_index, test_index in splitter.split(X, y):
        xtrain, xtest = X.iloc[train_index, :], X.iloc[test_index, :]
        ytrain, ytest = y[train_index], y[test_index]

        # Simulate missingness on test fold
        xtest = pd.DataFrame(xtest, columns=X.columns)
        xtest = sim_miss(xtest, sparse_df)

        # Scale/transform xtrain
        steps = [('scaler', t.MyScaler(defs.dummy_vars)),
                 ('knn', t.KNNKeepDf())]
        pipe = Pipeline(steps)
        pipe.fit(xtrain)
        xtrain = pipe.transform(xtrain)

        # scale/impute test (test has simulated missing, imputed on
        # data from training folds)
        xtest = pipe.transform(xtest)

        # Fit a fresh copy of the estimator on this fold
        # (sklearn.base.clone(model) is the idiomatic alternative)
        loop_model = copy.copy(model)
        loop_model.fit(xtrain, ytrain)

        # Save models
        model_history.append(loop_model)

        # Save Performance
        train_r2.append(r2_score(ytrain, loop_model.predict(xtrain)))
        train_mse.append(mean_squared_error(ytrain,
                                            loop_model.predict(xtrain)))
        train_mae.append(
            mean_absolute_error(ytrain, loop_model.predict(xtrain)))

        test_r2.append(r2_score(ytest, loop_model.predict(xtest)))
        test_mse.append(mean_squared_error(ytest, loop_model.predict(xtest)))
        test_mae.append(mean_absolute_error(ytest, loop_model.predict(xtest)))

    performance = {
        prefix + 'train_r2': train_r2,
        prefix + 'train_mse': train_mse,
        prefix + 'train_mae': train_mae,
        prefix + 'test_r2': test_r2,
        prefix + 'test_mse': test_mse,
        prefix + 'test_mae': test_mae
    }
    return performance
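A hypothetical invocation; sparse_df is assumed to be a frame whose missingness pattern sim_miss copies onto each test fold:

perf = resample_kfold(X_raw, y, sparse_df, LinearRegression(),
                      folds=5, prefix='lr_')
print(pd.DataFrame(perf).mean())  # average each metric over the 5 splits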
Example #5
complete_df = pd.read_pickle('data/interim/complete_df.pickle')
X_raw, y = get_train_test(complete_df,
                          defs.sparse_vars,
                          'would_recommend',
                          return_full=True)

X_raw_er, y_er = get_train_test(complete_df,
                                defs.sparse_vars,
                                'RATING_EST',
                                return_full=True)

X_raw.rename(columns=renames, inplace=True)
X_raw_er.rename(columns=renames, inplace=True)

steps = [('scaler', t.MyScaler(dont_scale='for_profit')),
         ('knn', t.KNNKeepDf())]
pipe = Pipeline(steps)
pipe.fit(X_raw)  # Pipeline.fit's second argument is y; dummy_vars doesn't belong there
X = pipe.transform(X_raw)

#%%
# Randomized search for good XGBRegressor hyperparameters
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.5, 1],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}

rs = RandomizedSearchCV(XGBRegressor(), params)
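The snippet stops before the search runs; presumably it is fitted on the prepared matrix and the winning configuration read back, e.g.:

rs.fit(X, y)  # assumption: tuned against the would_recommend target
print(rs.best_params_)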