Example #1
0
def model_function(
        dataset,
        location_array_W,
        pred_ahead,
        target_col,
        extended_data=True,
        impute_missing=True,
        do_extract=True,
        shift_features=True,
        use_early_stopping=False,
        lgb_boosting_type="gbdt",
        lgb_num_leaves=31,
        lgb_learning_rate=0.1,
        lgb_max_depth=-1,

        split_train_pct=.7,
        split_test_pct=.85
):
    """
    Prepare a dataset based on some hyperparams and train a lightgbm regressor on it.

    The target is the percentage change of ``target_col`` over ``pred_ahead`` rows, shifted back so that
    each row holds its own future change. The model is fitted on ``log(1 + clipped target)`` and its
    predictions are mapped back with ``exp(p) - 1`` before being converted to absolute values.

    :param dataset: name of the dataset, passed to ``gather_df``
    :param location_array_W: location array W, passed to ``prepare_df``
    :param pred_ahead: how far ahead the target variable should be shifted
    :param target_col: the name of the target column in the dataframe
    :param extended_data: (bool) whether to extend the features with the extended dataset
    :param impute_missing: (bool) impute missing features or leave them empty
    :param do_extract: (bool) use the original features or use the dimension reduction
    :param shift_features: (bool) enhance the data with shifted features
    :param use_early_stopping: (bool) when True the validation split is passed to ``fit`` as ``eval_set``
    :param lgb_boosting_type: lgb hyperparam
    :param lgb_num_leaves: lgb hyperparam
    :param lgb_learning_rate: lgb hyperparam
    :param lgb_max_depth: lgb hyperparam
    :param split_train_pct: fraction of the rows with a known target where the training data ends
    :param split_test_pct: fraction of the rows with a known target where the test data starts
    :return: (predicted_df_validation, predicted_df_test, fitted_model, X_test, y_test); both predicted
        dataframes have the columns ``p`` (prediction), ``y`` (truth) and ``original`` (value at
        prediction time), with ``p`` and ``y`` expressed as absolute values rather than percentages
    """
    df = gather_df(dataset, extended_data, )
    df = prepare_df(df, impute_missing, do_extract, shift_features, location_array_W)

    y = df[target_col].pct_change(pred_ahead).shift(-pred_ahead)

    # splitting here is some hassle since we only want our training data to consist of days
    # where the target is non null, some of the datasets have years of missing target variables
    available_indexes = df.index[~pd.isna(y)]
    train_split = available_indexes[int(len(available_indexes) * split_train_pct)]
    val_split = available_indexes[int(len(available_indexes) * split_test_pct)]

    # add pred_ahead here because at time t, we do not know what the target variable will do in the future
    test_split_start = available_indexes[
        int(len(available_indexes) * split_test_pct) + pred_ahead
    ]

    # build every mask once so features, targets and "original" values always cover the same rows
    train_mask = df.index <= train_split
    val_mask = (df.index > train_split) & (df.index < val_split)
    test_mask = df.index >= test_split_start

    X_train = df[train_mask]
    X_val = df[val_mask]
    X_test = df[test_mask]

    # log transform y_train to reduce long tail effects and np.clip just to be safe
    y_train = np.log(np.clip(y[train_mask], -1 + 1e-6, 100) + 1)

    # keep the untransformed validation target: it is the ground truth reported in dfp_val,
    # while the log-transformed version is only used as the evaluation target during fitting
    y_val_raw = y[val_mask]
    y_val = np.log(np.clip(y_val_raw, -1 + 1e-6, 100) + 1)

    y_test = y[test_mask]

    # for training, filter all days with missing targets
    has_target = ~pd.isna(y_train)
    X_train = X_train[has_target]
    y_train = y_train[has_target]

    rmodel = lgb.LGBMRegressor(
        boosting_type=lgb_boosting_type,
        num_leaves=lgb_num_leaves,
        learning_rate=lgb_learning_rate,
        max_depth=lgb_max_depth,
    )
    # NOTE(review): only eval_set is supplied here (no early_stopping_rounds / callback);
    # confirm that the installed lightgbm version really stops early with this call.
    eval_set = None
    if use_early_stopping:
        eval_set = (X_val, y_val)
    rmodel.fit(X_train, y_train, eval_set=eval_set, verbose=0)

    dfp_test = pd.DataFrame(
        {
            "p": np.exp(np.clip(rmodel.predict(X_test), -1000, 1000)) - 1,
            "y": y_test,
            # same mask as X_test / y_test, otherwise the first test row has no "original"
            "original": df[target_col][test_mask],
        }
    )
    # since we used percentages as targets, we need to reverse this to be able to compute objective scores
    dfp_test["y"] = (dfp_test.y + 1) * dfp_test.original
    dfp_test["p"] = (dfp_test.p + 1) * dfp_test.original

    dfp_test = dfp_test[~pd.isna(dfp_test.y)]

    dfp_val = pd.DataFrame(
        {
            "p": np.exp(np.clip(rmodel.predict(X_val), -1000, 1000)) - 1,
            # percentage change, NOT the log-transformed target, so the conversion below is valid
            "y": y_val_raw,
            "original": df[target_col][val_mask],
        }
    )
    dfp_val["y"] = (dfp_val.y + 1) * dfp_val.original
    dfp_val["p"] = (dfp_val.p + 1) * dfp_val.original

    dfp_val = dfp_val[~pd.isna(dfp_val.y)]

    return dfp_val, dfp_test, rmodel, X_test, y_test
Example #2
0
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output





# Feature importance
# Fits a LightGBM regressor constructed with no arguments on `train` / `target`
# and tabulates the importance it assigns to each column of `train`.

#lightGBM model fit
gbm = lgb.LGBMRegressor()
gbm.fit(train, target)
# bare expression: the value is only visible when run interactively (e.g. as notebook cell output)
gbm.booster_.feature_importance()

# importance of each attribute
fea_imp_ = pd.DataFrame({'cols':train.columns, 'fea_imp':gbm.feature_importances_})
# keep only columns with a positive importance, most important first (result is not stored)
fea_imp_.loc[fea_imp_.fea_imp > 0].sort_values(by=['fea_imp'], ascending = False)







TRAINING_SIZE = 300000
Example #3
0
                        # get this months indices
                        trn_idx = np.where(np.isin(train.month, validation_months, invert=True))[0]
                        val_idx = np.where(np.isin(train.month, validation_months, invert=False))[0]
                        #print(f"split primary_use: train size {len(trn_idx)} val size {len(val_idx)}")

                        # remove indices not in this primary_use
                        trn_idx = np.intersect1d(trn_idx, np.where(np.isin(train.primary_use, primary_use_group))[0])
                        val_idx = np.intersect1d(val_idx, np.where(np.isin(train.primary_use, primary_use_group))[0])
                        #print(f"split primary_use: train size {len(trn_idx)} val size {len(val_idx)}")

                        # initialize model
                        model = lgb.LGBMRegressor(random_state=seed+9999*args.normalize_target,
                                                  n_estimators=9999,
                                                  learning_rate=args.lr,
                                                  feature_fraction=args.feature_fraction,
                                                  subsample=args.subsample,
                                                  num_leaves=args.n_leaves,
                                                  metric="rmse", 
                                                  silent=False)

                        # fit model
                        msg = f'Training {full_sub_model_name} - train# {len(trn_idx)} val# {len(val_idx)}'
                        #print(f'{datetime.now()} - Training {full_sub_model_name} - train# {len(trn_idx)} val# {len(val_idx)}')
                        with timer(msg):
                            model.fit(train.loc[trn_idx, FEATURES], train.loc[trn_idx, "target"],
                                      eval_set=[(train.loc[val_idx, FEATURES], train.loc[val_idx, "target"])],
                                      early_stopping_rounds=50,
                                      verbose=50)

                        model.booster_.save_model(full_sub_model_name)
Example #4
0
def train_model_regression(X,
                           X_test,
                           y,
                           params,
                           folds,
                           model_type='lgb',
                           eval_metric='mae',
                           columns=None,
                           plot_feature_importance=False,
                           model=None,
                           verbose=10000,
                           early_stopping_rounds=200,
                           n_estimators=50000,
                           mol_type=-1,
                           fold_group=None):
    """
    Train one regression model per fold and aggregate the results.

    Returns a dictionary with the fitted models, out-of-fold predictions, fold-averaged test
    predictions, per-fold scores and, if requested, LGB feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - model hyperparameters; this dict is never modified
    :params: folds - splitter providing ``split(X, groups=...)`` and ``n_splits``
    :params: model_type - one of 'lgb', 'xgb', 'sklearn', 'cat'
    :params: eval_metric - 'mae', 'group_mae' or 'mse'
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - logging period passed to the lgb / xgb training call
    :params: early_stopping_rounds - early stopping patience for the lgb model
    :params: n_estimators - maximum number of trees for the lgb model
    :params: mol_type - suffix of the importance csv file name written under ``log_path``
    :params: fold_group - groups passed to ``folds.split``

    :returns: dict with keys 'models', 'oof', 'prediction', 'scores' and, for 'lgb' with
        ``plot_feature_importance``, 'feature_importance'
    :raises ValueError: if ``model_type`` is not one of the supported types
    """
    supported_model_types = ('lgb', 'xgb', 'sklearn', 'cat')
    if model_type not in supported_model_types:
        # fail early instead of hitting an unbound prediction variable inside the first fold
        raise ValueError(
            f'Unknown model_type {model_type!r}; expected one of {supported_model_types}')

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index,
                 valid_index) in enumerate(folds.split(X, groups=fold_group)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if isinstance(X, np.ndarray):
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params,
                                      n_estimators=n_estimators,
                                      n_jobs=-1)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # work on a copy so the caller's params dict is left untouched
            xgb_params = dict(params)
            xgb_params["objective"] = "reg:linear"
            xgb_params["eval_metric"] = metrics_dict[eval_metric][
                'lgb_metric_name']
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=verbose,
                              params=xgb_params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        elif model_type == 'sklearn':
            # the estimator passed in by the caller is re-fitted on every fold
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        else:  # model_type == 'cat'
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]
                ['catboost_metric_name'])
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
        else:
            # the grouped metric additionally needs the 'type' column of the validation rows
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
        model_list += [model]
    prediction /= folds.n_splits

    # reporting is best effort: a failure to build or send the message must not lose the results
    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + 'CV mean score: {0:.4f}, std: {1:.4f}.'.format(
            np.mean(scores), np.std(scores))
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            # the 50 features with the highest mean importance
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            feature_importance.to_csv(log_path / f"importance_{mol_type}.csv")
            result_dict['feature_importance'] = feature_importance

    return result_dict
Example #5
0
                             learning_rate=0.05,
                             max_depth=3,
                             min_child_weight=1.7817,
                             n_estimators=1000,
                             reg_alpha=0.4640,
                             reg_lambda=0.8571,
                             subsample=0.5213,
                             silent=1,
                             random_state=7,
                             nthread=-1)
model_lgb = lgb.LGBMRegressor(objective='regression',
                              num_leaves=5,
                              learning_rate=0.05,
                              n_estimators=720,
                              max_bin=55,
                              bagging_fraction=0.8,
                              bagging_freq=5,
                              feature_fraction=0.2319,
                              feature_fraction_seed=9,
                              bagging_seed=9,
                              min_data_in_leaf=6,
                              min_sum_hessian_in_leaf=11)
n_folds = 5
score = rmsle_cv(lasso, train, y_train, n_folds=n_folds)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(ENet, train, y_train, n_folds=n_folds)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
score = rmsle_cv(KRR, train, y_train, n_folds=n_folds)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(),
                                                     score.std()))
score = rmsle_cv(GBoost, train, y_train, n_folds=n_folds)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(
Example #6
0
def merf(normalise = False):
    """
    Fit a mixed-effects model (MERF) with a LightGBM regressor as fixed-effects learner and score it.

    The module-level frame ``ap`` is forward-filled and split by ``preprocessing``; the ``hours``
    column is used as cluster id and a constant column of ones as random-effect design matrix.

    :param normalise: (bool) standardise the predictors (never the response) before fitting
    :return: (mae, rmse, rrmse, r2) computed on the test split, where rrmse is rmse divided by the
        median of the test response
    """
    # NOTE(review): both "num_iterations" and "n_estimators" are set below; confirm which of the
    # two the installed lightgbm version honours.
    hyper_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['l1', 'rmse'],
        'learning_rate': 0.001,
        'feature_fraction': 0.8,

        "max_depth": 6,
        "max_bin": 512,
        "num_leaves": 40,
        "num_iterations": 100000,
        "n_estimators": 300,
        "verbose": -1
        }
      #   'bagging_fraction': 0.7,
      #  'bagging_freq': 10, "num_leaves": 12,

    gbm = lgb.LGBMRegressor(**hyper_params)

    ap2 = ap.fillna(method = "pad")
    X_train, Y_train, X_test, Y_test = preprocessing(ap2, hour2int = True, onehotencode = False)

    # random-effect design matrix: a single column of ones
    Z_train = np.ones((len(X_train), 1))

    clusters_train = X_train['hours']
    clusters_test = X_test['hours']

    # the cluster column is not used as a predictor
    X_train1 = X_train.drop(["hours"], axis = 1)
    X_test1 = X_test.drop(["hours"], axis = 1)

    # normalising for boosting is commonly not necessary, but for the mixed effect models
    # we actually may want to normalise. But we only normalise X (predictors), never the response!
    if normalise:
        X_train1 = (X_train1 - X_train1.mean()) / X_train1.std()
        X_test1 = (X_test1 - X_test1.mean()) / X_test1.std()

    # check if missing
    print(Y_train.isnull().any().any(), X_train1.isnull().any().any(), X_test.isnull().any().any())

    merf_model = MERF(gbm, max_iterations = 4)
    merf_model.fit(X_train1, Z_train, clusters_train, Y_train)

    Z_test = np.ones((len(X_test1), 1))
    # the response was not normalised, so the prediction is already on the original scale
    y_pred = merf_model.predict(X_test1, Z_test, clusters_test)

    mae = abs(y_pred - Y_test).mean()
    rmse = math.sqrt(((y_pred - Y_test) * (y_pred - Y_test)).mean())
    rrmse = rmse / Y_test.median()
    r2 = get_r2_numpy_corrcoef(Y_test, y_pred)
    return (mae, rmse, rrmse, r2)
Example #7
0
                                bond_type]['molecule_name']
 oof = np.zeros(len(X_type))
 prediction_type = np.zeros(len(X_test_type))
 bond_scores = []
 for fold_n, (train_idx, valid_idx) in enumerate(
         folds.split(X_type, groups=mol_group_type)):
     if fold_n == 1:
         # ONLY TRAIN FOR FOLD 0
         continue
     fold_start = timer()
     logger.info('Running Type {} - Fold {} of {}'.format(
         bond_type, fold_count, folds.n_splits))
     X_train, X_valid = X_type.iloc[train_idx], X_type.iloc[valid_idx]
     y_train, y_valid = y_type.iloc[train_idx], y_type.iloc[valid_idx]
     model = lgb.LGBMRegressor(**lgb_params,
                               n_estimators=N_ESTIMATORS,
                               n_jobs=N_THREADS)
     model.fit(
         X_train.drop('type', axis=1),
         y_train,
         eval_set=[  #(X_train.drop('type', axis=1), y_train),
             (X_valid.drop('type', axis=1), y_valid)
         ],
         eval_metric=EVAL_METRIC,
         verbose=VERBOSE,
         early_stopping_rounds=EARLY_STOPPING_ROUNDS)
     now = timer()
     update_tracking(run_id,
                     '{}_tr_sec_f{}'.format(bond_type, fold_n + 1),
                     (now - fold_start),
                     integer=True)
Example #8
0
def fit_lgb(i, X_train_cv, y_train_cv):
    """
    Fit a LightGBM regressor, configured from the module-level ``lgb_param``, on one training split.

    :param i: not used inside this function
    :param X_train_cv: training features
    :param y_train_cv: training target
    :return: the fitted regressor
    """
    regressor = lgb.LGBMRegressor(**lgb_param)
    regressor.fit(X_train_cv, y_train_cv)
    return regressor
Example #9
0
'''





# LGBR = GridSearchCV(model_lgb,lgb_params,cv=3,scoring='roc_auc',verbose=5,return_train_score = True)
# LGBR.fit(train_data,y_train)
# LGB_best = LGBR.best_estimator_
# print(LGB_best)
# print(LGBR.best_score_)

model_lgb = lgb.LGBMRegressor(boosting_type='gbdt',
                              objective='regression',
                              learning_rate=0.01, n_estimators=3000,
                              reg_alpha = 0.1,reg_lambda = 0.01,
                              max_bin = 200, num_leaves=150,max_depth=-1,
                              subsample_freq=5, colsample_bytree = 0.6,subsample = 0.8, 
                              min_child_samples=50,min_split_gain=0,random_state=1024,n_jobs=-1)

#lgb_1

auc = rmsle_cv(model_lgb,train_data,y_train)
print('model_lgb AUC : ',auc)

model_lgb.fit(train_data,y_train)
test_y_prob = model_lgb.predict(test_data)
test_y_prob = scale(test_y_prob)
test_y_cat=[int(item>0.25) for item in list(test_y_prob)]
test_result = pd.DataFrame(EID.values,columns=["EID"])
test_result['FORTARGET']=test_y_cat
Example #10
0
    def gbm_model():
        """Return a new, unfitted LGBMRegressor constructed with no arguments."""
        return gbm.LGBMRegressor()
Example #11
0

# In[8]:


from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, mutual_info_regression


pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('regressor', lgb.LGBMRegressor(
        objective='regression',
        num_leaves=31,
        learning_rate=0.01,
        silent=False
    ))
])

N_FEATURES_OPTIONS = [50, 100, 300]

param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'regressor__boosting_type': ['gbdt', 'dart'], #'goss', 'rf'],
        'regressor__n_estimators': [50, 100, 500]
    },
    {
        'reduce_dim': [SelectKBest(mutual_info_regression)],
Example #12
0
 def light_gbm_regression(self):
     """Create an LGBMRegressor with no arguments and return what ``self.fiting_model`` yields for it."""
     regressor = lgb.LGBMRegressor()
     outcome = self.fiting_model(regressor)
     return outcome
Example #13
0
    "bagging_fraction": 0.75,
    "bagging_seed": 11,
    "metric": 'mae',
    "verbosity": -1,
    'reg_alpha': 0.1302650970728192,
    'reg_lambda': 0.3603427518866501
}

# Train one LightGBM regressor per fold, collecting out-of-fold predictions, the validation
# MAE of every fold and the running sum of the test predictions.
for fold_n, (train_index,
             valid_index) in enumerate(folds.split(X_train_scaled)):

    print(fold_n)
    X_train, X_valid = X_train_scaled.iloc[train_index], X_train_scaled.iloc[
        valid_index]
    y_train, y_valid = y_tr.iloc[train_index], y_tr.iloc[valid_index]
    model = lgb.LGBMRegressor(**params, n_estimators=50000, n_jobs=-1)
    model.fit(X_train,
              y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='mae',
              verbose=10000,
              early_stopping_rounds=200)
    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

    oof[valid_index] = y_pred_valid
    # the fold score is a single number, so it has to be appended (extend needs an iterable)
    scores.append(mean_absolute_error(y_valid.values.reshape(-1),
                                      y_pred_valid))
    prediction += y_pred

# turn the summed test predictions into the average over the folds
prediction /= n_fold
Example #14
0
File: all.py Project: k-jun/sotuken
    df, mean, std = lib.normalize(df)

    (x_train_lstm,
     y_train_lstm), (x_test_lstm,
                     y_test_lstm), columns = lib.train_test_split_lstm(
                         df["price"].values, df.index, PAST_HISTORY,
                         TRAIN_RATIO)
    (x_train, y_train), (x_test, y_test), columns = lib.train_test_split(
        df["price"].values, df.index, PAST_HISTORY, TRAIN_RATIO)
    # モデルを定義
    lstm = create_model(x_train_lstm.shape[-2:])
    rfr = RandomForestRegressor(max_depth=5,
                                random_state=RANDOM_STATE,
                                n_estimators=100)
    xgb = xgboost.XGBRegressor(n_estimators=100, random_state=RANDOM_STATE)
    lgbm = lightgbm.LGBMRegressor(n_estimators=100, random_state=RANDOM_STATE)
    svm = SVR()
    # モデルを学習
    # lstm.fit(x_train_lstm, y_train_lstm, batch_size=BATCH_SIZE, epochs=EPOCHS,
    #           verbose=1, validation_data=(x_test_lstm, y_test_lstm))
    rfr.fit(x_train, y_train)
    xgb.fit(x_train, y_train)
    lgbm.fit(x_train, y_train)
    svm.fit(x_train, y_train)

    # モデルで予測
    # y_pred_lstm = lstm.predict(x_test_lstm)
    y_pred_rfr = rfr.predict(x_test)
    y_pred_xgb = xgb.predict(x_test)
    y_pred_lgbm = lgbm.predict(x_test)
    y_pred_svm = svm.predict(x_test)
Example #15
0
def hold_out_lgb_validation(X, y, params, eval_metric='mae', columns=None,
                            plot_feature_importance=False,
                            verbose=10000, early_stopping_rounds=200, n_estimators=50000, ):
    """
    Train a LightGBM regressor on a random 90% of the data and score it on the remaining 10%.

    :param X: feature dataframe
    :param y: target
    :param params: keyword arguments forwarded to ``lgb.LGBMRegressor``
    :param eval_metric: 'mae', 'group_mae' or 'mse'
    :param columns: columns to train on; all columns of ``X`` when None
    :param plot_feature_importance: when True, a feature / importance table is returned
    :param verbose: forwarded to ``fit``
    :param early_stopping_rounds: forwarded to ``fit``
    :param n_estimators: maximum number of trees
    :return: dict with the keys 'model', 'y_pred_valid', 'score', 'importance', 'eval_result'
        and 'best_iteration'
    """
    if columns is None:
        columns = X.columns

    # to set up scoring parameters
    mae_spec = {'lgb_metric_name': 'mae',
                'catboost_metric_name': 'MAE',
                'sklearn_scoring_function': metrics.mean_absolute_error}
    group_mae_spec = {'lgb_metric_name': 'mae',
                      'catboost_metric_name': 'MAE',
                      'scoring_function': group_mean_log_mae}
    mse_spec = {'lgb_metric_name': 'mse',
                'catboost_metric_name': 'MSE',
                'sklearn_scoring_function': metrics.mean_squared_error}
    metrics_dict = {'mae': mae_spec, 'group_mae': group_mae_spec, 'mse': mse_spec}

    # fixed seed: the same hold-out rows are chosen on every call
    X_train, X_valid, y_train, y_valid = train_test_split(
        X[columns], y, test_size=0.1, random_state=42)

    # the callback fills eval_result with the metric history while fitting
    eval_result = {}
    recorder = lgb.record_evaluation(eval_result)

    model = lgb.LGBMRegressor(**params,
                              n_estimators=n_estimators,
                              n_jobs=-1,
                              importance_type='gain')
    print(model)

    metric_spec = metrics_dict[eval_metric]
    model.fit(X_train,
              y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=metric_spec['lgb_metric_name'],
              verbose=verbose,
              early_stopping_rounds=early_stopping_rounds,
              callbacks=[recorder])

    y_pred_valid = model.predict(X_valid)

    if eval_metric == 'group_mae':
        # the grouped metric additionally needs the 'type' column of the validation rows
        score = metric_spec['scoring_function'](y_valid, y_pred_valid, X_valid['type'])
    else:
        score = metric_spec['sklearn_scoring_function'](y_valid, y_pred_valid)

    feature_importance = None
    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_

    # reporting is best effort: any failure here is printed and otherwise ignored
    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO} HOLD_OUT score: {score:.4f} .'
        print(cv_score_msg)
        if not DEBUG and LINE_MSG:
            send_message(cv_score_msg)
    except Exception as e:
        print(e)

    return {
        "model": model,
        'y_pred_valid': pd.DataFrame(y_pred_valid,
                                     index=X_valid.index,
                                     columns=["scalar_coupling_constant"]),
        'score': score,
        "importance": feature_importance,
        "eval_result": eval_result,
        "best_iteration": model.best_iteration_,
    }
Example #16
0
    "rougher.input.floatbank11_copper_sulfate",
    "rougher.input.feed_sol",
    "rougher.input.feed_size"
])
#
# COLS_TO_DIFF_TOP20 = [
#
# ]
level0_models_rougher = {}
obj = 'mae'
level0_models_rougher['LGBM_rougher_base_a'] = lgb.LGBMRegressor(
    objective=obj,
    learning_rate=0.05,
    n_estimators=500,
    random_state=91,
    **{
        'max_depth': 5,
        'num_leaves': 100,
        'feature_fraction': '0.363',
        'bagging_fraction': '0.262'
    })
level0_models_rougher['LGBM_rougher_base_b'] = lgb.LGBMRegressor(
    objective=obj,
    learning_rate=0.05,
    n_estimators=500,
    random_state=92,
    **{
        'max_depth': 4,
        'num_leaves': 110,
        'feature_fraction': '0.448',
        'bagging_fraction': '0.445'
Example #17
0
from sklearn.cross_validation import StratifiedKFold

seed_ls = []
# Five-fold cross-validated training: one model is built per fold
# NOTE(review): StratifiedKFold(y, n_folds=...) is the call form of the sklearn.cross_validation
# module imported above - confirm the installed scikit-learn version still provides it.
skf = list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    # a fresh regressor with identical hyperparameters is created for every fold
    model = lgb.LGBMRegressor(objective='regression',
                              num_leaves=125,
                              learning_rate=0.05,
                              n_estimators=2500,
                              boosting_type="gbdt",
                              max_depth=-1,
                              seed=2018,
                              num_thread=-1,
                              max_bin=425,
                              bagging_fraction=0.8,
                              colsample_bytree=0.9,
                              subsample=0.8,
                              lambda_l2=0.20)
    #
    #------------------------------------#
    # fit on the fold's training rows; both the training rows ('train') and the held-out rows
    # ('valid') are evaluated with rmse, and early stopping uses a patience of 100 rounds
    lgb_model = model.fit(X[train_index],
                          y[train_index],
                          eval_names=['train', 'valid'],
                          eval_metric='rmse',
                          eval_set=[(X[train_index], y[train_index]),
                                    (X[test_index], y[test_index])],
                          early_stopping_rounds=100)
Example #18
0
def model():
    """
    Train an XGBoost and a LightGBM regressor and write their rank-blended predictions.

    Both regressors are fitted on the module-level ``X_train``/``y_train``.
    Their predictions for ``X_val`` are converted to ranks and blended as
    ``0.6 * rank(xgboost) + 0.4 * rank(lightgbm)``. The blend is written as
    column ``Pred`` next to ``Id`` (the index of ``X_val_df``) to the file
    ``path_test_out + "test.csv"``. Finally the XGBoost model is evaluated on
    ``X_test``/``y_test`` and the metrics are printed.

    Earlier experiments that only existed as commented-out code (native
    ``xgb.train``, a grid search over ``max_depth``/``min_child_weight``, a
    random-forest baseline, importance plots and alternative scalings of the
    prediction) have been dropped from the body as dead code.

    :return: None
    """
    from xgboost import XGBRegressor
    xgbr = XGBRegressor(max_depth=4)
    print(xgbr)
    xgbr.fit(X_train, y_train)

    import lightgbm as lgb
    lgbr = lgb.LGBMRegressor(max_depth=6)
    print(lgbr)
    lgbr.fit(X_train, y_train)

    # The two raw predictions are only needed to compute their ranks; they are
    # kept as temporary columns so the ranks line up with the Id column.
    Id_pred = pd.DataFrame()
    Id_pred['Id'] = X_val_df.index
    Id_pred['pred_1'] = xgbr.predict(X_val)
    Id_pred['pred_2'] = lgbr.predict(X_val)
    Id_pred['Pred'] = (0.6 * Id_pred['pred_1'].rank()
                       + 0.4 * Id_pred['pred_2'].rank())

    # Only Id and the blended score go into the output file; index=False states
    # explicitly that the row index must not be written.
    Id_pred = Id_pred[['Id', 'Pred']]
    Id_pred.to_csv(path_test_out + "test.csv", index=False)

    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    # Predict the test set once and reuse the result for every metric.
    test_pred = xgbr.predict(X_test)
    print('The score is:', xgbr.score(X_test, y_test))
    print('The r2_score is:', r2_score(y_test, test_pred))
    print('The mean_squared_error is:',
          mean_squared_error(y_test, test_pred))
    print('The mean_absolute_error is:',
          mean_absolute_error(y_test, test_pred))
Example #19
0
 def to_local(self):
     """
     Build a plain ``lightgbm.LGBMRegressor`` configured like this estimator.

     The regressor is created from ``self.get_params()`` and is then handed,
     together with ``self``, to ``self._copy_extra_params``.

     :return: the newly created ``lightgbm.LGBMRegressor``
     """
     local_regressor = lightgbm.LGBMRegressor(**self.get_params())
     self._copy_extra_params(self, local_regressor)
     return local_regressor
    def identify_zero_importance(self,
                                 task,
                                 eval_metric=None,
                                 n_iterations=10,
                                 early_stopping=True):
        """

        Identify the features with zero importance according to a gradient boosting machine.
        The gbm can be trained with early stopping using a validation set to prevent overfitting.
        The feature importances are averaged over `n_iterations` to reduce variance.

        Uses the LightGBM implementation (http://lightgbm.readthedocs.io/en/latest/index.html)

        Besides printing progress, the method stores its results on the instance:
        `one_hot_features`, `data_all`, `feature_importances`,
        `record_zero_importance` and `ops['zero_importance']`.

        Parameters
        --------

        task : string
            The machine learning task, either 'classification' or 'regression'

        eval_metric : string
            Evaluation metric to use for the gradient boosting machine for early stopping. Must be
            provided if `early_stopping` is True

        n_iterations : int, default = 10
            Number of iterations to train the gradient boosting machine

        early_stopping : boolean, default = True
            Whether or not to use early stopping with a validation set when training

        Raises
        --------

        ValueError
            If `early_stopping` is True but no `eval_metric` is given, if no training labels are
            available, or if `task` is neither 'classification' nor 'regression'.

        Notes
        --------

        - Features are one-hot encoded to handle the categorical variables before training.
        - The gbm is not optimized for any particular task and might need some hyperparameter tuning
        - Feature importances, including zero importance features, can change across runs
        - The validation split is stratified on the labels for classification only; regression
          labels are split without stratification.

        """

        if early_stopping and eval_metric is None:
            raise ValueError(
                """eval metric must be provided with early stopping. Examples include "auc" for classification or
                             "l2" for regression.""")

        if self.labels is None:
            raise ValueError("No training labels provided.")

        # Validate the task once, before any encoding work or state changes,
        # instead of discovering a bad value inside the training loop.
        if task == 'classification':
            model_class = lgb.LGBMClassifier
        elif task == 'regression':
            model_class = lgb.LGBMRegressor
        else:
            raise ValueError(
                'Task must be either "classification" or "regression"')

        # One hot encoding
        features = pd.get_dummies(self.data)
        self.one_hot_features = [
            column for column in features.columns
            if column not in self.base_features
        ]

        # Add one hot encoded data to original data
        self.data_all = pd.concat([features[self.one_hot_features], self.data],
                                  axis=1)

        # Extract feature names
        feature_names = list(features.columns)

        # Convert to np array
        features = np.array(features)
        labels = np.array(self.labels).reshape((-1, ))

        # Stratifying on continuous regression targets makes train_test_split
        # fail (classes with a single member), so only classification labels
        # are used for stratification.
        stratify_labels = labels if task == 'classification' else None

        # Empty array for feature importances
        feature_importance_values = np.zeros(len(feature_names))

        print('Training Gradient Boosting Model\n')

        # Train a fresh model per iteration and accumulate the averaged importances
        for _ in range(n_iterations):

            model = model_class(n_estimators=1000,
                                learning_rate=0.05,
                                verbose=-1)

            # If training using early stopping need a validation set
            if early_stopping:

                train_features, valid_features, train_labels, valid_labels = train_test_split(
                    features, labels, test_size=0.15, stratify=stratify_labels)

                # Train the model with early stopping
                model.fit(train_features,
                          train_labels,
                          eval_metric=eval_metric,
                          eval_set=[(valid_features, valid_labels)],
                          early_stopping_rounds=100,
                          verbose=-1)

                # Clean up memory
                gc.enable()
                del train_features, train_labels, valid_features, valid_labels
                gc.collect()

            else:
                model.fit(features, labels)

            # Record the feature importances
            feature_importance_values += model.feature_importances_ / n_iterations

        feature_importances = pd.DataFrame({
            'feature': feature_names,
            'importance': feature_importance_values
        })

        # Sort features according to importance
        feature_importances = feature_importances.sort_values(
            'importance', ascending=False).reset_index(drop=True)

        # Normalize the feature importances to add up to one
        feature_importances['normalized_importance'] = feature_importances[
            'importance'] / feature_importances['importance'].sum()
        feature_importances['cumulative_importance'] = np.cumsum(
            feature_importances['normalized_importance'])

        # Extract the features with zero importance
        record_zero_importance = feature_importances[
            feature_importances['importance'] == 0.0]

        to_drop = list(record_zero_importance['feature'])

        self.feature_importances = feature_importances
        self.record_zero_importance = record_zero_importance
        self.ops['zero_importance'] = to_drop

        print('\n%d features with zero importance after one-hot encoding.\n' %
              len(self.ops['zero_importance']))
Example #21
0
            np.where(rmse_oob_all == np.min(rmse_oob_all))[0][0]]
        regression_model = RandomForestRegressor(
            n_estimators=random_forest_number_of_trees,
            max_features=int(
                max(
                    math.ceil(autoscaled_x_train.shape[1] *
                              optimal_random_forest_x_variables_rate), 1)),
            oob_score=True)
    elif method == 'gp':  # Gaussian process
        regression_model = GaussianProcessRegressor(ConstantKernel() * RBF() +
                                                    WhiteKernel(),
                                                    alpha=0)
    elif method == 'lgb':  # LightGBM
        import lightgbm as lgb

        regression_model = lgb.LGBMRegressor()
    elif method == 'xgb':  # XGBoost
        import xgboost as xgb

        regression_model = xgb.XGBRegressor()
    elif method == 'gbdt':  # scikit-learn
        from sklearn.ensemble import GradientBoostingRegressor

        regression_model = GradientBoostingRegressor()
    regression_model.fit(autoscaled_x_train, autoscaled_y_train)

    # calculate y
    calculated_ytrain = np.ndarray.flatten(
        regression_model.predict(autoscaled_x_train))
    if do_autoscaling:
        calculated_ytrain = calculated_ytrain * y_train.std(
Example #22
0
def train_lgb_regression_alldata(X,
                                 X_test,
                                 y,
                                 params,
                                 eval_metric='mae',
                                 columns=None,
                                 plot_feature_importance=False,
                                 model=None,
                                 verbose=10000,
                                 n_estimators=50000,
                                 mol_type=-1):
    """
    Fit a single LightGBM regressor on all of the training data and predict the test data.

    No cross-validation is performed: the model is trained on every row of ``X``
    and the training data itself is used as the evaluation set.

    :params: X - training data; must provide ``X[columns]`` (and ``X.columns`` when ``columns`` is None)
    :params: X_test - test data, indexed with the same ``columns``
    :params: y - target
    :params: params - keyword arguments for ``lgb.LGBMRegressor``; must not contain
             ``n_estimators`` or ``n_jobs`` because those are passed explicitly
    :params: eval_metric - one of 'mae', 'group_mae', 'mse'; mapped to the LightGBM metric name
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to add a feature-importance table to the result
    :params: model - unused (overwritten by the fitted regressor); kept for interface compatibility
    :params: verbose - forwarded to ``fit``
    :params: n_estimators - number of boosting rounds for the regressor
    :params: mol_type - unused; kept for interface compatibility

    :returns: dict with key 'prediction' (predictions for ``X_test``) and, when
              ``plot_feature_importance`` is truthy, 'feature_importance'
              (DataFrame with columns "feature" and "importance")
    :raises KeyError: if ``eval_metric`` is not a supported key (raised before any training)
    """
    # LightGBM metric name for every supported ``eval_metric`` key. Looking it up
    # first makes an unsupported key fail before any work is done.
    lgb_metric_names = {'mae': 'mae', 'group_mae': 'mae', 'mse': 'mse'}
    lgb_metric_name = lgb_metric_names[eval_metric]

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    X_train, y_train = X[columns], y

    result_dict = {}

    model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train,
              y_train,
              eval_set=[(X_train, y_train)],
              eval_metric=lgb_metric_name,
              verbose=verbose)

    result_dict['prediction'] = model.predict(X_test)

    if plot_feature_importance:
        # One row per used column with the fitted model's importance value
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
        result_dict['feature_importance'] = feature_importance

    return result_dict
Example #23
0
def make_regressor(iterations=6000, clf=None):
    """
    Build an unfitted regressor for the short model name passed in ``clf``.

    Recognised names are 'cat', 'xgb', 'lgb', 'ngb' and 'ext'. ``iterations``
    is only used by the 'cat' model. Any other value of ``clf`` (including the
    default None) is returned unchanged.
    """
    if clf == 'cat':
        return CatBoostRegressor(
            loss_function='RMSE',
            task_type="CPU",
            iterations=iterations,
            od_type="Iter",
            early_stopping_rounds=500,
            random_seed=SEED,
            # options tried but currently disabled:
            # eval_metric="WKappa", learning_rate=0.01, depth=4,
            # l2_leaf_reg=10, border_count=96, use_best_model=True
        )

    if clf == 'xgb':
        return xgb.XGBRegressor(
            n_estimators=5000,
            max_depth=10,
            min_child_weight=3,
            gamma=0.25,
            n_jobs=-1,
            random_state=SEED,
            # disabled: verbosity=3
        )

    if clf == 'lgb':
        return lgb.LGBMRegressor(
            learning_rate=0.01,
            n_estimators=2000,
            max_depth=15,
            random_state=SEED,
            # disabled: reg_alpha=1, reg_lambda=1
        )

    if clf == 'ngb':
        return NGBRegressor(
            Dist=Normal,
            Score=MLE,
            Base=default_tree_learner,
            natural_gradient=True,
            n_estimators=2000,
            learning_rate=0.01,
            minibatch_frac=0.6,
            verbose=True,
            verbose_eval=50,
        )

    if clf == 'ext':
        return ExtraTreesRegressor(
            n_estimators=2000,
            max_depth=15,
            random_state=SEED,
            verbose=3,
            # disabled: learning_rate=0.01, reg_alpha=1, reg_lambda=1, n_jobs=-1
        )

    # Unknown name (or None): hand the argument back untouched.
    return clf
Example #24
0
    def optimize(self):
        """
        Grid-search ``subsample`` and ``colsample_bytree`` for a LightGBM MAE regressor.

        The data comes from ``self._get_data()``; every column except ``id`` and
        ``score`` is used as a feature and ``score`` is the target. A 5-fold
        ``GridSearchCV`` scored with negative mean absolute error is run and the
        best parameters and best score are printed. Nothing is returned.
        """
        frame = self._get_data()

        excluded = ['id', 'score']
        feature_columns = [name for name in frame.columns if name not in excluded]

        features = frame[feature_columns]
        target = frame['score']

        # Tuning history recorded by earlier, now removed, search stages
        # (all with objective 'mae' and learning_rate 0.1):
        #   1. lgb.cv, up to 1000 rounds, early stopping 50:
        #        n_estimators: best:239 best_score: 14.847823004441079
        #   2. grid max_depth [6] x num_leaves [28..35]:
        #        max_depth:6 num_leaves:31 best_score:14.803535507027162
        #   3. grid min_child_samples [43] x min_child_weight [0, 0.001]:
        #        min_child_samples:43 min_child_weight:0 best_score:14.783911433202508
        #   4. grid subsample / colsample_bytree [0.32, 0.34, ..., 0.5]:
        #        subsample:0.32 colsample_bytree:0.36 best_score:14.771928920921576
        #   5. grid reg_alpha / reg_lambda [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 3]:
        #        reg_alpha:2 reg_lambda:0.1 best_score:14.75515862949816
        #   6. grid learning_rate [0.001, 0.005, 0.01, 0.05, 0.1]:
        #        learning_rate:0.1 best_score:14.778696016248404

        # Fixed settings of the estimator for the current search stage.
        base_params = dict(
            boosting_type='gbdt',
            objective='mae',
            n_estimators=10000,
            metric='mae',
            learning_rate=0.01,
            min_child_samples=46,
            min_child_weight=0.01,
            subsample_freq=1,
            num_leaves=40,
            max_depth=7,
            subsample=0.42,
            colsample_bytree=0.48,
            reg_alpha=2,
            reg_lambda=0.1,
            verbose=-1,
            seed=4590,
        )
        # Values explored in this stage; they override the fixed settings above.
        search_space = dict(
            subsample=[0.45, 0.5, 0.55],
            colsample_bytree=[0.85, 0.9, 0.95],
        )

        searcher = GridSearchCV(
            lgb.LGBMRegressor(**base_params),
            param_grid=search_space,
            scoring='neg_mean_absolute_error',
            cv=5,
            verbose=1,
            n_jobs=5,
        )
        searcher.fit(features, target)
        print(f'best params: {searcher.best_params_}')
        print(f'best score: {searcher.best_score_}')
Example #25
0
        d_train = pd.concat([y_train, X_train], ignore_index=True, axis=1)
        print("X_train={}, y_train={} d_train={}".format(
            X_train.shape, y_train.shape, d_train.shape))
        np.savetxt("D:/LightGBM-master/examples/regression/geo_test.csv",
                   d_train,
                   delimiter='\t')

    if model_type == 'mort':
        model = LiteMORT(params).fit(X_train,
                                     y_train,
                                     eval_set=[(X_valid, y_valid)])
        #y_pred_valid = model.predict(X_valid)
        #y_pred = model.predict(X_test)

    if model_type == 'lgb':
        model = lgb.LGBMRegressor(**params, n_jobs=-1)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  eval_metric='auc',
                  verbose=5)
        model.booster_.save_model('geo_test_.model')
        #y_pred_valid = model.predict(X_valid)
        #y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
    break

# Debugging stop: wait on a console prompt that shows a placeholder loss of 0,
# the time elapsed since `t0` and the current `model_type`, then terminate the
# process with exit status -1.
input("loss is {} time={:.3g} model={}...".format(0,
                                                  time.time() - t0,
                                                  model_type))
sys.exit(-1)
Example #26
0
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

print('Loading data...')
# Read the tab-separated train/test files (no header row). Column 0 holds the
# target; every other column is a feature.
df_train, df_test = (
    pd.read_csv(data_path, header=None, sep='\t')
    for data_path in ('../regression/regression.train',
                      '../regression/regression.test')
)

y_train, X_train = df_train[0], df_train.drop(0, axis=1)
y_test, X_test = df_test[0], df_test.drop(0, axis=1)

print('Starting training...')
# Fit a small regressor; the test split doubles as the early-stopping
# evaluation set, scored with the 'l1' metric.
gbm = lgb.LGBMRegressor(num_leaves=31, learning_rate=0.05, n_estimators=20)
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l1',
    early_stopping_rounds=5,
)

print('Starting predicting...')
# Predict with the iteration count selected during fitting.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# Report the root of the mean squared error.
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# Per-feature importances of the fitted model.
print('Feature importances:', list(gbm.feature_importances_))
Example #27
0
def _plot_prediction_distributions(oof_preds, sub_preds):
    """Save log-scaled density histograms of both prediction sets to 'distributions.png'."""
    # Both histograms share the bin edges computed over the combined predictions.
    _, shared_edges = np.histogram(np.hstack((oof_preds, sub_preds)), bins=25)
    plt.figure(figsize=(12, 7))
    plt.title('Distributions of OOF and TEST predictions',
              fontsize=15,
              fontweight='bold')
    for values, label in ((oof_preds, 'OOF predictions'),
                          (sub_preds, 'TEST predictions')):
        plt.hist(values,
                 label=label,
                 alpha=.6,
                 bins=shared_edges,
                 density=True,
                 log=True)
    plt.legend()
    plt.savefig('distributions.png')


def main(sum_of_logs=False, nrows=None):
    """
    Train a 5-fold LightGBM regressor and write the averaged test predictions.

    Training data is loaded from ``TRN_PATH`` and submission data from
    ``SUB_PATH`` via ``get_user_data``. For every fold the out-of-fold and
    submission predictions are computed; negative predictions are clipped to
    zero. The averaged submission predictions are written to "simple_lgb.csv"
    and a histogram of the predictions is saved to "distributions.png".
    Any exception is logged through ``logger.exception`` and not re-raised.

    :param sum_of_logs: forwarded to ``get_user_data`` for the training data
    :param nrows: forwarded to ``get_user_data`` for both data sets
    :return: None
    """
    try:
        train_x, train_y, indexers = get_user_data(file_name=TRN_PATH,
                                                   cat_indexers=None,
                                                   nrows=nrows,
                                                   sum_of_logs=sum_of_logs)
        test_x, _, _ = get_user_data(file_name=SUB_PATH,
                                     cat_indexers=indexers,
                                     nrows=nrows)

        splitter = KFold(n_splits=5, shuffle=True, random_state=7956112)

        lgb_params = dict(
            learning_rate=0.03,
            n_estimators=2000,
            num_leaves=128,
            subsample=0.2217,
            colsample_bytree=0.6810,
            min_split_gain=np.power(10.0, -4.9380),
            reg_alpha=np.power(10.0, -3.2454),
            reg_lambda=np.power(10.0, -4.8571),
            min_child_weight=np.power(10.0, 2),
            silent=True,
        )

        test_pred = np.zeros(test_x.shape[0])
        oof_pred = np.zeros(train_x.shape[0])
        fold_scores = []

        for fold_no, (fit_idx, hold_idx) in enumerate(splitter.split(train_x),
                                                      start=1):
            regressor = lgb.LGBMRegressor(**lgb_params)

            # The training slice is taken again for eval_set, exactly as the
            # data was passed before (fresh slices rather than shared objects).
            regressor.fit(train_x.iloc[fit_idx],
                          train_y.iloc[fit_idx],
                          eval_set=[(train_x.iloc[fit_idx],
                                     train_y.iloc[fit_idx]),
                                    (train_x.iloc[hold_idx],
                                     train_y.iloc[hold_idx])],
                          eval_metric='rmse',
                          early_stopping_rounds=100,
                          verbose=0)

            oof_pred[hold_idx] = regressor.predict(train_x.iloc[hold_idx])

            # Clip this fold's submission predictions, then add its share.
            fold_test_pred = regressor.predict(test_x)
            fold_test_pred[fold_test_pred < 0] = 0
            test_pred += fold_test_pred / splitter.n_splits

            # The raw score is logged before the out-of-fold vector is clipped.
            raw_score = rmse(train_y.iloc[hold_idx], oof_pred[hold_idx])
            logger.info('Fold %d RMSE (raw output) : %.5f' %
                        (fold_no, raw_score))
            oof_pred[oof_pred < 0] = 0
            clipped_score = rmse(train_y.iloc[hold_idx], oof_pred[hold_idx])
            fold_scores.append(clipped_score)
            logger.info('Fold %d RMSE : %.5f' % (fold_no, fold_scores[-1]))

        logger.info('Full OOF RMSE (zero clipped): %.5f +/- %.5f' %
                    (rmse(train_y, oof_pred), float(np.std(fold_scores))))

        # Predictions stay on the log scale for the submission file
        test_x['PredictedLogRevenue'] = test_pred
        submission = test_x[['PredictedLogRevenue']]
        submission.to_csv("simple_lgb.csv", index=True)

        logger.info('Submission data shape : {}'.format(submission.shape))

        _plot_prediction_distributions(oof_pred, test_pred)

    except Exception:
        logger.exception("Unexpected error")
 def run_hyperopt(self, param_space, X_vars, model_params, fmin_max_evals,
                  algo = 'tpe', metric = 'balanced_accuracy_score',
                  trials_obj = None, model_type = 'indicator'):
     '''
     Run Bayesian (TPE) or random-search hyperparameter optimization with hyperopt.

     A LightGBM model is fitted on ``self.df_train`` for every sampled parameter
     set and scored on ``self.df_tune``; hyperopt minimizes the negated score.

     :param param_space: hyperopt search space passed to ``fmin``
     :param X_vars: feature column names selected from ``self.df_train`` / ``self.df_tune``
     :param model_params: fixed keyword arguments for the LightGBM model
     :param fmin_max_evals: maximum number of evaluations for ``fmin``
     :param algo: 'tpe' or 'random'; any other value is passed to ``fmin`` unchanged
     :param metric: name of a function in ``metrics``, or a callable accepting
                    ``y_true`` and ``y_pred`` keyword arguments
     :param trials_obj: existing ``Trials`` object to continue; a new one is created when None
     :param model_type: 'indicator' (``lightgbm.LGBMModel``) or 'regressor'
                        (``lightgbm.LGBMRegressor``)
     :return: tuple ``(best_params, self.trials)``
     :raises ValueError: for an unknown ``model_type`` or an unknown metric name
     :raises TypeError: if ``metric`` is neither a string nor callable
     '''
     # Fail fast on an unknown model type; previously the model variable was
     # simply left undefined and the first use crashed.
     if model_type not in ('indicator', 'regressor'):
         raise ValueError(
             f"model_type must be 'indicator' or 'regressor', got {model_type!r}")

     # Resolve the scoring function once, before any training. A missing metric
     # used to be printed and then called as None on every evaluation.
     if isinstance(metric, str):
         sk_scorer = getattr(metrics, metric, None)
         if sk_scorer is None:
             raise ValueError(f"Specified metric {metric} does not exist in sklearn")
     elif callable(metric):
         sk_scorer = metric
     else:
         raise TypeError("metric must be a metric name or a callable")

     # Builds the model object to conduct hyperparameter tuning on
     if model_type == 'indicator':
         hyperopt_model = lightgbm.LGBMModel(**model_params, importance_type = 'gain')
     else:
         hyperopt_model = lightgbm.LGBMRegressor(**model_params, importance_type = 'gain')

     eval_set = [(self.df_tune[X_vars], self.df_tune[self.target])]
     # Initial fit with the fixed parameters, kept from the original flow.
     hyperopt_model.fit(X = self.df_train[X_vars],
                        y = self.df_train[self.target],
                        eval_set = eval_set,
                        verbose = False)
     data = self.df_tune

     def evaluate_metric(params):
         # Refit with the sampled parameters (bagging_freq is always forced to 1)
         hyperopt_model.set_params(**params, bagging_freq = 1).fit(
             X = self.df_train[X_vars],
             y = self.df_train[self.target],
             eval_set = eval_set,
             verbose = False)

         eval_x = data[X_vars]
         y_true = data[self.target]

         y_score = hyperopt_model.predict(eval_x)

         # NOTE(review): assumes predict returns one row of per-class scores per
         # sample; for a scalar element np.argmax is always 0 — confirm that
         # this is intended for model_type='regressor'.
         y_pred = [np.argmax(i) for i in y_score]

         score = sk_scorer(y_true = y_true, y_pred = y_pred)

         # hyperopt minimizes the loss, so the score is negated
         return {'loss': -score, 'params': params, 'status': STATUS_OK}

     self.trials = Trials() if trials_obj is None else trials_obj

     if algo == 'tpe':
         algo = tpe.suggest
     elif algo == 'random':
         algo = rand.suggest

     best_params = fmin(
         evaluate_metric,
         space = param_space,
         algo = algo,
         max_evals = fmin_max_evals,
         rstate = np.random.RandomState(self.seed),
         trials = self.trials
     )

     return best_params, self.trials
Example #29
0

model_random_state = 5
# Candidate values sampled by the randomized search below.
params = {
    'max_depth': range(6,15),
    'num_leaves': range(20, 100, 10),
#     'min_child_samples': [18, 19, 20, 21, 22],
#     'min_child_weight':[0.001, 0.002],
    'feature_fraction': [0.5, 0.6, 0.7,0.8],
    'bagging_fraction': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    'reg_alpha': [ 0.6, 0.7, 0.8, 0.9, 1],
    'reg_lambda': [ 0.6, 0.7, 0.8, 0.9, 1]
    }

model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=40, max_depth=6,
                              metric='rmse', bagging_fraction = 0.5,feature_fraction = 0.4)
# return_train_score=True is needed for the 'mean_train_score' and
# 'std_train_score' entries of cv_results_ that are printed below; without it
# those keys are missing on scikit-learn versions where the default is False.
rc = RandomizedSearchCV(model_lgb, params, cv=5, random_state=model_random_state,
                        return_train_score=True)
rc.fit(train_X_PM25,train_y_PM25)

#%%
# Report the best candidate together with its train/test scores across the folds.
print('best params: ', rc.best_params_)
print('best score: ', rc.best_score_)
best_position = rc.best_index_
print('best train score:', rc.cv_results_['mean_train_score'][best_position])
print('best train std:', rc.cv_results_['std_train_score'][best_position])
print('best test score:', rc.cv_results_['mean_test_score'][best_position])
print('best test std:', rc.cv_results_['std_test_score'][best_position])
# Output recorded from an earlier run:
# best params:  {'reg_lambda': 0.9, 'reg_alpha': 0.6, 'num_leaves': 60, 'max_depth': 14, 'feature_fraction': 0.5, 'bagging_fraction': 0.3}
# best score:  0.8011678911600456
# best train score: 0.9083334290707444
Example #30
0
def _take_rows(data, index):
    """Return the rows of *data* selected by the positional *index* array."""
    # pandas objects need .iloc for positional selection; plain [] on a Series
    # is label-based and could pick the wrong rows for a non-default index.
    if hasattr(data, "iloc"):
        return data.iloc[index]
    return data[index]


def train_model(X,
                X_test,
                y,
                folds,
                params=None,
                model_type='lgb',
                plot_feature_importance=False,
                model=None):
    """
    Train one model per cross-validation fold and average the test predictions.

    For every (train, valid) split produced by ``folds.split(X)`` a model of
    the requested type is fitted, the validation rows are predicted into the
    out-of-fold array, the fold MAE is recorded, and the prediction for
    ``X_test`` is accumulated. The accumulated test prediction is divided by
    the number of folds that were actually run.

    :param X: training features; a pandas object (selected with ``.iloc``) or
        a numpy array (selected with ``[]``)
    :param X_test: features to predict with every fold model
    :param y: training target, row-aligned with ``X``
    :param folds: splitter object exposing ``split(X)``
    :param params: keyword arguments / parameter dict for the lgb, xgb and cat
        models; ``None`` is treated as an empty dict
    :param model_type: one of ``'lgb'``, ``'xgb'``, ``'sklearn'``, ``'cat'``
    :param plot_feature_importance: (bool) for ``'lgb'`` only, plot the top 50
        features by mean importance
    :param model: estimator with ``fit`` / ``predict``; used (and required)
        only when ``model_type == 'sklearn'``
    :return: ``(oof, prediction, feature_importance)`` when ``model_type`` is
        ``'lgb'`` and ``plot_feature_importance`` is true, otherwise
        ``(oof, prediction, scores)``
    :raises ValueError: for an unknown ``model_type``, for ``'sklearn'``
        without a ``model``, or when ``folds`` yields no split
    """
    supported_types = ('lgb', 'xgb', 'sklearn', 'cat')
    if model_type not in supported_types:
        raise ValueError(
            f"model_type must be one of {supported_types}, got {model_type!r}")
    if model_type == 'sklearn' and model is None:
        raise ValueError("model_type='sklearn' requires a model instance")

    # The lgb / cat branches unpack params with **, which fails on None.
    params = {} if params is None else params
    # None when X is a plain array without column labels.
    columns = getattr(X, "columns", None)

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    n_folds = 0
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = _take_rows(X, train_index), _take_rows(X, valid_index)
        y_train, y_valid = _take_rows(y, train_index), _take_rows(y, valid_index)

        if model_type == 'lgb':
            # NOTE(review): `verbose` and `early_stopping_rounds` as fit()
            # keywords were removed in LightGBM 4.0 in favour of callbacks --
            # confirm the LightGBM version this file targets.
            model = lgb.LGBMRegressor(**params, n_estimators=1000, n_jobs=-1)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric='mae',
                      verbose=100,
                      early_stopping_rounds=200)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        elif model_type == 'xgb':
            # NOTE(review): `ntree_limit` / `best_ntree_limit` are deprecated
            # in recent XGBoost releases -- confirm the targeted version.
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=500,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=columns),
                                   ntree_limit=model.best_ntree_limit)

        elif model_type == 'sklearn':
            # The caller-supplied estimator is refitted on every fold.
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = mean_absolute_error(y_valid, y_pred_valid)
            print(f'Fold {fold_n}. MAE: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        else:  # model_type == 'cat'
            model = CatBoostRegressor(iterations=20000,
                                      eval_metric='MAE',
                                      **params)
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        scores.append(mean_absolute_error(y_valid, y_pred_valid))

        prediction += y_pred

        if model_type == 'lgb':
            # Per-fold feature importance, stacked across folds.
            importances = model.feature_importances_
            fold_importance = pd.DataFrame()
            if columns is not None:
                fold_importance["feature"] = columns
            else:
                # Unlabelled array input: use positional placeholder names.
                fold_importance["feature"] = [
                    f"Column_{i}" for i in range(len(importances))
                ]
            fold_importance["importance"] = importances
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

        n_folds = fold_n + 1

    if n_folds == 0:
        raise ValueError("folds.split(X) produced no folds")

    # Average over the folds that actually ran instead of relying on an
    # external fold-count variable.
    prediction /= n_folds

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance["importance"] /= n_folds
        if plot_feature_importance:
            # Top 50 features ranked by their mean importance.
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            plt.show()

            # Only this path returns the importance table in place of scores.
            return oof, prediction, feature_importance

    return oof, prediction, scores