Example #1
features = [
    'Market', 'Day', 'Stock', 'x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D',
    'x3E', 'x4', 'x5', 'x6'
]

df = pd.read_csv('train.csv', index_col=0)
df = df.fillna(0)  # replace NaN entries
df_test = pd.read_csv('test.csv', index_col=0)
df_test = df_test.fillna(0)  # replace NaN entries

weights = []
for index, row in df.iterrows():
    weights.append(float(row['Weight']))

X = df[features]
Y = df['y']

model = LGBMRegressor()  # n_estimators=1000, learning_rate=0.01
param_grid = {
    'learning_rate': [0.07, 0.1],
    'n_estimators': [10000],
    'boosting_type': ['gbdt'],
    'min_data_in_leaf': [40],
    'num_leaves': [80],
    'max_depth': [-1],
    'num_iterations': [110]  # note: num_iterations is an alias of n_estimators
}
gbm = GridSearchCV(model, param_grid)
gbm.fit(X, Y, sample_weight=weights)

print('Best parameters found are:', gbm.best_params_)
print('Best score:', gbm.best_score_)
print('Feature importances:', list(gbm.best_estimator_.feature_importances_))
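# A possible follow-up (sketch): predict on the held-out test file with the best
# estimator found by the grid search; the submission filename below is illustrative.
yp = pd.Series(gbm.best_estimator_.predict(df_test[features])).rename('y')
yp.index.name = 'Index'
yp.to_csv('GridSearchPredictions.csv', header=True)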
Example #2
lgb_params['sub_feature'] = 0.35    
lgb_params['bagging_fraction'] = 0.85 # sub_row
lgb_params['bagging_freq'] = 40
lgb_params['num_leaves'] = 512        # num_leaf
lgb_params['min_data'] = 500         # min_data_in_leaf
lgb_params['min_hessian'] = 0.05     # min_sum_hessian_in_leaf
lgb_params['verbose'] = 0
lgb_params['feature_fraction_seed'] = 2
lgb_params['bagging_seed'] = 3


# XGB model
xgb_model = XGBRegressor(**xgb_params)

# lgb model
lgb_model = LGBMRegressor(**lgb_params)

# RF model
rf_model = RandomForestRegressor(**rf_params)

# ET model
et_model = ExtraTreesRegressor()

# SVR model
# SVM is too slow on datasets with more than 10,000 samples
#svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.05)

# DecisionTree model
dt_model = DecisionTreeRegressor()

# AdaBoost model
Example #3
print(test.shape)  #(10000,71)

x = train[0:, :71]
y = train[0:, 71:]

print(x.shape)  #(10000,71)
print(y.shape)  #(10000,4)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=66)

model = MultiOutputRegressor(
    LGBMRegressor(n_estimators=1000,
                  learning_rate=0.05,
                  max_depth=-1,
                  colsample_bytree=0.8))

model.fit(x_train, y_train)

score = model.score(x_test, y_test)

print("R2:", score)

# thresholds = np.sort(model.feature_importances_)  # sort feature_importances_ in ascending order
# print(thresholds)

# models=[]
# res = np.array([])
# for thresh in thresholds:
#     selection = SelectFromModel(model, threshold=thresh, prefit=True)
Example #4
def process(msg):
    att_dict = dict()
    att_dict['config'] = dict()

    att_dict['operator'] = 'lightgbm'
    if api.config.debug_mode == True:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='DEBUG')
    else:
        logger, log_stream = slog.set_logging(att_dict['operator'],
                                              loglevel='INFO')
    logger.info("Process started")
    time_monitor = tp.progress()

    df = msg.body
    if not isinstance(df, pd.DataFrame):
        raise TypeError('Message body does not contain a pandas DataFrame')

    ###### start of doing calculation

    model = LGBMRegressor(n_estimators=200,
                          learning_rate=0.03,
                          num_leaves=32,
                          colsample_bytree=0.9497036,
                          subsample=0.8715623,
                          max_depth=8,
                          reg_alpha=0.04,
                          reg_lambda=0.073,
                          min_split_gain=0.0222415,
                          min_child_weight=40)

    att_dict['config']['train columns'] = api.config.train_cols
    train_cols = tfp.read_list(api.config.train_cols, df.columns)

    att_dict['config']['label'] = api.config.label_col
    label = tfp.read_value(api.config.label_col)
    if not label:
        raise ValueError('Label is mandatory')

    # convert categorical columns to integer codes
    for c in df[train_cols].select_dtypes(include='category').columns:
        unique_num = len(df[c].unique())
        nan_num = df[c].isna().sum()
        logger.debug('Cast to category - {}: unique {}, nan: {} of {}'.format(
            c, unique_num, nan_num, df.shape[0]))
        df[c] = df[c].cat.codes
        df[c] = df[c].astype('int32')

    if pd.api.types.is_categorical(df[label]):
        df[label] = df[label].astype('category')
        logger.debug('Cast label to <category>')
        df[label] = df[label].cat.codes
        df[label] = df[label].astype('int32')

    print(df.select_dtypes(include='category').head(10))
    logger.debug('Train with {} features'.format(len(train_cols)))
    print(train_cols)
    model.fit(df[train_cols], df[label], eval_metric='auc')

    ###### end of doing calculation

    ##############################################
    #  final infos to attributes and info message
    ##############################################

    if df.empty:
        raise ValueError('DataFrame is empty')

    logger.info('End of Process: {}'.format(time_monitor.elapsed_time()))

    att_dict['memory'] = df.memory_usage(deep=True).sum() / 1024**2
    att_dict['columns'] = str(list(df.columns))
    att_dict['shape'] = df.shape
    att_dict['id'] = str(id(df))

    logger.debug('Columns: {}'.format(str(df.columns)))
    logger.debug('Shape (#rows - #columns): {} - {}'.format(
        df.shape[0], df.shape[1]))
    logger.debug('Memory: {} kB'.format(att_dict['memory']))

    return log_stream.getvalue(), api.Message(attributes=att_dict, body=model)
Example #5
def crossValidation(XTraining, kfold, modelName, model='default', verbose=False):
    maeList = []
    mapeList = []
    rmseList = []

    for k in reversed(range(1, kfold+1)):
        if verbose:
            print(f'\nKFold Number: {k}')
        # Start and End Date for Validation
        startDateValid = XTraining['Date'].max() - datetime.timedelta(days=k*6*7)
        endDateValid = XTraining['Date'].max() - datetime.timedelta(days=(k-1)*6*7)

        # Filtering Dataset
        training = XTraining[XTraining['Date'] < startDateValid]
        validation = XTraining[(XTraining['Date'] >= startDateValid) & (XTraining['Date'] <= endDateValid)]

        # Training and Validation Dataset
        # Training
        XKFoldTraining = training.drop(['Date', 'Sales'], axis=1)
        yKFoldTraining = training['Sales']

        # Validation
        XKFoldValidation = validation.drop(['Date', 'Sales'], axis=1)
        yKFoldValidation = validation['Sales']

        # Model
        ## Model Map
        modelMap = {
            'Linear Regression': LinearRegression(),
            'Lasso': Lasso(alpha=0.01),
            'Random Forest Regressor': RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42),
            'XGBoost Regressor': xgb.XGBRegressor( objective='reg:squarederror', n_estimators=500, eta=0.01, max_depth=10, 
                                                      subsample=0.7, colsample_bytree=0.9),
            'Lightgbm Regressor': LGBMRegressor(num_leaves=10, min_data_in_leaf=50, n_jobs=-1, random_state=42, n_estimators=500)   
        }
        
        ## Mapped Model
        if model == 'default':
            model = modelMap[modelName]
        
        model.fit(XKFoldTraining, yKFoldTraining)

        # Prediction
        yhat = model.predict(XKFoldValidation)

        # Performance
        modelResult = mlError(modelName, np.expm1(yKFoldValidation), np.expm1(yhat))
        
        #Store Performance of each KFold iteration
        maeList.append(modelResult['MAE'].tolist())
        mapeList.append(modelResult['MAPE'].tolist())
        rmseList.append(modelResult['RMSE'].tolist())


    dictResult = {
                    'Model Name': [modelName],
                    'MAE CV': [np.round(np.mean(maeList),2).astype(str) + ' +/- ' + np.round(np.std(maeList),2).astype(str)],
                    'MAPE CV': [np.round(np.mean(mapeList),2).astype(str) + ' +/- ' + np.round(np.std(mapeList),2).astype(str)],
                    'RMSE CV': [np.round(np.mean(rmseList),2).astype(str) + ' +/- ' + np.round(np.std(rmseList),2).astype(str)]
                }

    return pd.DataFrame(dictResult)
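# Usage sketch, assuming `XTraining` is a prepared DataFrame with the 'Date' and
# 'Sales' columns that crossValidation() expects (model names come from modelMap):
cvResult = crossValidation(XTraining, kfold=5, modelName='Lightgbm Regressor', verbose=True)
print(cvResult)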
Example #6
# print(new_examDf[new_examDf.isnull() == True].count())  # check for missing values; if the output is 0, the column has none

# Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

# X_train,X_test,y_train,y_test = train_test_split(features,targets,test_size=0.25)

# Xiaomeng
X_train, X_test, y_train, y_test = train_test_split(new_examDf.iloc[:, :34],
                                                    new_examDf.iloc[:, 34],
                                                    train_size=0.75,
                                                    random_state=0)

# Train the model
gbm = LGBMRegressor(objective='regression',
                    num_leaves=31,
                    learning_rate=0.1,
                    n_estimators=40)
gbm.fit(X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=5)

# Save the model
joblib.dump(gbm, 'loan_model.pkl')
# Load the model
gbm = joblib.load('loan_model.pkl')

# Predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
Example #7
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    global keras_imported

    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {'n_jobs': -2, 'n_estimators': 30},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2, 'n_estimators': 30},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.1, 'warm_start': True},
        'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.1, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {},
        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8, 'lambda_l2': 0.001, 'histogram_pool_size': 16384},
        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.15, 'num_leaves': 8, 'lambda_l2': 0.001, 'histogram_pool_size': 16384},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search == True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500


    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the final params that will be used to initialize the model:')
        print(model_params)


    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),


        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),

        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor()
        model_map['CatBoostClassifier'] = CatBoostClassifier()

    if model_name[:12] == 'DeepLearning':
        if keras_imported == False:
            # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:
                pass

            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize')
        raise(e)

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
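# Usage sketch: look up an estimator by name and override a couple of the stock
# parameters (the values below are illustrative, and LightGBM is assumed to be installed):
lgbm_reg = get_model_from_name('LGBMRegressor',
                               training_params={'n_estimators': 500, 'learning_rate': 0.05})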
Example #8
# train_ds = lgb.Dataset(x_train, label = y_train)
# test_ds = lgb.Dataset(x_test, label = y_test)

# params = {'learning_rate' : 0.01, 'max_depth': 13 , 'boosting' : 'gbdt',
#         'objective':'regression', 'metric':'mse', 'is_training_metric':True,
#         'num_leaves':144, 'feature_fraction':0.9, 'bagging_fraction':0.7,
#         'bagging_freq':5, 'seed':2020}

# model = lgb.train(params, train_ds, 1000, test_ds,verbose_eval=100, early_stopping_rounds=100)

parameter = {
    'num_iterations': 1000,
    'learning_rate': 0.05,
    'early_stopping_round': 20,
}
model = LGBMRegressor()  # note: the 'parameter' dict above is defined but never passed to the model

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric="error",
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=100)

# rmse, mae, logloss, error, auc

y_pred = model.predict(x_test)

r2 = r2_score(y_test, y_pred)  # y_true first, then y_pred
print(f"r2: {r2}")
Example #9
    def cv(self, nfolds=5, submission=True):
        self.regressors.clear()
        self.feature_importance_df = pd.DataFrame()

        if not submission:
            folds = data_prepare.get_folds(df=self.x_train, n_splits=nfolds)
        else:
            folds = data_prepare.get_folds(
                df=self.x_train[['totals.pageviews']].reset_index(),
                n_splits=nfolds)

        if 'fullVisitorId' in self.x_train.columns:
            self.x_train.drop('fullVisitorId', axis=1, inplace=True)
        if 'fullVisitorId' in self.x_test.columns:
            self.x_test.drop('fullVisitorId', axis=1, inplace=True)
        #if 'fullVisitorId' in self.y_train.columns:
        #self.y_train.drop('fullVisitorId', axis=1, inplace=True)

        oof_preds = np.zeros(self.x_train.shape[0])
        preds_test = np.empty((nfolds, self.x_test.shape[0]))

        self.logfile.write('param: {}\n'.format(self.param))
        self.logfile.write('fold: {}\n'.format(nfolds))
        self.logfile.write('data shape: {}\n'.format(self.x_train.shape))
        self.logfile.write('features: {}\n'.format(
            self.x_train.columns.tolist()))

        if self.comment is not None:
            self.logfile.write('comment: {}\n'.format(self.comment))

        self.logfile.write('output: ../output/{}.csv\n'.format(self.name))
        self.logfile.flush()

        for n_fold, (train_idx, valid_idx) in enumerate(folds):
            fstart = time.time()
            train_x, train_y = self.x_train.iloc[train_idx], self.y_train.iloc[
                train_idx]
            valid_x, valid_y = self.x_train.iloc[valid_idx], self.y_train.iloc[
                valid_idx]

            # lgbRegressor parameters found by Bayesian optimization
            clf = LGBMRegressor(**self.param)
            clf.fit(train_x,
                    np.log1p(train_y),
                    eval_set=[(valid_x, np.log1p(valid_y))],
                    eval_metric='rmse',
                    verbose=100,
                    early_stopping_rounds=200)

            oof_preds[valid_idx] = clf.predict(
                valid_x, num_iteration=clf.best_iteration_)
            preds_test[n_fold, :] = clf.predict(
                self.x_test, num_iteration=clf.best_iteration_)

            # clip negative predictions to zero (predictions are on the log1p scale)
            oof_preds[oof_preds < 0] = 0
            preds_test[preds_test < 0] = 0

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = self.x_train.columns.tolist()
            fold_importance_df["importance"] = clf.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            self.feature_importance_df = pd.concat(
                [self.feature_importance_df, fold_importance_df], axis=0)

            strlog = '[{}][{:.1f} sec] Fold {} RMSE : {:.6f}'.format(
                str(datetime.now()),
                time.time() - fstart, n_fold + 1,
                mean_squared_error(np.log1p(valid_y),
                                   oof_preds[valid_idx])**.5)
            print(strlog)
            self.logfile.write(strlog + '\n')
            self.logfile.flush()

            self.regressors.append(clf)
            del clf, train_x, train_y, valid_x, valid_y
            gc.collect()

        full_rmse = mean_squared_error(np.log1p(self.y_train), oof_preds)**.5
        strlog = 'Full RMSE score {:.6f}'.format(full_rmse)
        print(strlog)
        self.logfile.write(strlog + '\n')

        preds = preds_test.mean(axis=0)

        if submission:
            #sub = pd.read_csv('../input/sample_submission.csv')
            #sub['PredictedLogRevenue'] = preds
            pd.Series(preds, name='PredictedLogRevenue').to_csv(
                '../output/submission/{}.csv'.format(self.name), index=True)

            cols = self.feature_importance_df[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:100].index
            self.logfile.write('top features:\n')
            for c in cols:
                self.logfile.write('{}\n'.format(c))

            self.logfile.flush()

            self.display_importances(self.feature_importance_df, self.name)

        # for stack
        np.save('../output/feats/{}_trn_prd_feats'.format(self.name),
                oof_preds)
        np.save('../output/feats/{}_tes_prd_feats'.format(self.name), preds)
        return self.feature_importance_df, full_rmse, oof_preds, preds
Example #10
def find_regression_model(X_train, X_test, y_train, y_test, ensembling=True):
    models = []
    overall_accuracies = []

    if ensembling:
        print("Ensembling is enabled.")
        learning_rate_list = [0.1, 0.01, 0.001]
        gamma_list = [0, 1, 5]
        colsample_bytree_list = [0.3, 0.5, 0.8, 1]
        # Creating list of cv scores
        scores = []
        params = []

        for lr in learning_rate_list:
            for g in gamma_list:
                for cb in colsample_bytree_list:
                    xgb = XGBRegressor(learning_rate=lr,
                                       gamma=g,
                                       colsample_bytree=cb,
                                       objective="reg:squarederror")
                    xgb.fit(X_train, y_train)
                    y_pred = xgb.predict(X_test)
                    scores.append(explained_variance_score(y_test, y_pred))
                    params.append([lr, g, cb])
        XGB_max_scores = max(scores)
        print(
            "XGBoost regressor explained variance score is:", XGB_max_scores,
            "with the following values\nfor learning rate, g, and number of columns used by each tree:",
            params[scores.index(XGB_max_scores)])
        lr_best = params[scores.index(XGB_max_scores)][0]
        g_best = params[scores.index(XGB_max_scores)][1]
        cb_best = params[scores.index(XGB_max_scores)][2]
        models.append(
            XGBRegressor(learning_rate=lr_best,
                         gamma=g_best,
                         colsample_bytree=cb_best,
                         objective="reg:squarederror"))
        overall_accuracies.append(XGB_max_scores)

        # Use LightGBM
        learning_rate_list = [0.1, 0.01, 0.001]
        colsample_bytree_list = [0.3, 0.5, 0.8, 1]
        # Creating list of cv scores
        scores = []
        params = []

        for lr in learning_rate_list:
            for cb in colsample_bytree_list:
                lgbm = LGBMRegressor(learning_rate=lr, colsample_bytree=cb)
                lgbm.fit(X_train, y_train)
                y_pred = lgbm.predict(X_test)
                scores.append(explained_variance_score(y_test, y_pred))
                params.append([lr, cb])
        LGBM_max_scores = max(scores)
        print("LightGBM regressor explained variance score is:",
              LGBM_max_scores,
              "with the following values\nfor learning rate and objective:",
              params[scores.index(LGBM_max_scores)])
        lr_best = params[scores.index(LGBM_max_scores)][0]
        cb_best = params[scores.index(LGBM_max_scores)][1]
        models.append(
            LGBMRegressor(learning_rate=lr_best, colsample_bytree=cb_best))
        overall_accuracies.append(LGBM_max_scores)

    else:
        print("Ensembling is disabled.")
        alpha_list = [100, 50, 25, 10, 5, 1, 0.75, 0.5, 0.25, 0.1, 1e-5]
        max_iter_list = [100000, 50000, 10000, 5000, 1000]

        # Use lasso regression
        scores = []
        params = []

        for a in alpha_list:
            for i in max_iter_list:
                lasso = Lasso(alpha=a, max_iter=i)
                lasso.fit(X_train, y_train)
                y_pred = lasso.predict(X_test)
                scores.append(explained_variance_score(y_test, y_pred))
                params.append([a, i])
        lasso_max_scores = max(scores)
        print("Lasso model explained variance score is:", lasso_max_scores,
              "with the values for alpha and max iterations: ",
              params[scores.index(lasso_max_scores)])
        a_best = params[scores.index(lasso_max_scores)][0]
        i_best = params[scores.index(lasso_max_scores)][1]
        models.append(Lasso(alpha=a_best, max_iter=i_best))
        overall_accuracies.append(lasso_max_scores)

        # Use ridge regression
        scores = []
        params = []

        for a in alpha_list:
            for i in max_iter_list:
                ridge = Ridge(alpha=a, max_iter=i)
                ridge.fit(X_train, y_train)
                y_pred = ridge.predict(X_test)
                scores.append(explained_variance_score(y_test, y_pred))
                params.append([a, i])
        ridge_max_scores = max(scores)
        print("Ridge model explained variance score is:", ridge_max_scores,
              "with the values for alpha and max iterations: ",
              params[scores.index(ridge_max_scores)])
        a_best = params[scores.index(ridge_max_scores)][0]
        i_best = params[scores.index(ridge_max_scores)][1]
        models.append(Ridge(alpha=a_best, max_iter=i_best))
        overall_accuracies.append(ridge_max_scores)

        # Use elastic net
        scores = []
        params = []
        for a in alpha_list:
            for i in max_iter_list:
                try:
                    elastic = ElasticNet(alpha=a, max_iter=i, l1_ratio=0.5)
                    elastic.fit(X_train, y_train)
                    y_pred = elastic.predict(X_test)
                    scores.append(explained_variance_score(y_test, y_pred))
                    params.append([a, i])
                except:
                    continue
        elastic_max_scores = max(scores)
        a_best = params[scores.index(elastic_max_scores)][0]
        i_best = params[scores.index(elastic_max_scores)][1]
        l1_best = 0.5
        models.append(ElasticNet(alpha=a_best, max_iter=i_best, l1_ratio=0.5))
        overall_accuracies.append(elastic_max_scores)
        print("Elastic net regression explained variance score is:",
              elastic_max_scores,
              "with the values\nfor alpha, max iterations, and l1 ratio: ",
              params[scores.index(elastic_max_scores)])

        # Use linear regression
        linear = LinearRegression()
        linear.fit(X_train, y_train)
        y_pred = linear.predict(X_test)
        print("Linear regression explained variance score is:",
              explained_variance_score(y_test, y_pred), "using\n", linear)
        models.append(LinearRegression())
        overall_accuracies.append(explained_variance_score(y_test, y_pred))

        # Use SVM
        # Creating lists of gamma and c for SVM
        gamma_list = [1e-3, 1e-5, 1e-7, 1e-9]
        c_list = [1, 10, 100, 1000]
        kernel_list = ["linear", "rbf", "poly"]
        # Creating list of cv scores
        scores = []
        params = []

        # Perform gridsearch on SVR model
        for c in c_list:
            for g in gamma_list:
                for k in kernel_list:
                    svr = SVR(gamma=g, C=c, kernel=k)
                    svr.fit(X_train, y_train)
                    y_pred = svr.predict(X_test)
                    scores.append(explained_variance_score(y_test, y_pred))
                    params.append([g, c, k])
        SVR_max_scores = max(scores)
        print("Support vector regressor explained variance score is:",
              SVR_max_scores, "with the following values\nfor g, c, and k:",
              params[scores.index(SVR_max_scores)])
        g_best = params[scores.index(SVR_max_scores)][0]
        c_best = params[scores.index(SVR_max_scores)][1]
        k_best = params[scores.index(SVR_max_scores)][2]
        models.append(SVR(gamma=g_best, C=c_best, kernel=k_best))
        overall_accuracies.append(SVR_max_scores)

    top_accuracy = max(overall_accuracies)
    best_model = models[overall_accuracies.index(top_accuracy)]
    print(
        "\nThe best model found for the regression problem on the dataset is:\n",
        best_model)
Example #11
def train(save_pickles=True):

    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    # --- Read in ETL Pickle ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- Sanitize ---
    ColumnSanitize = tycho.ColumnSanitizer()
    clean = ColumnSanitize.sanitize(merged)

    # --- Create average lookup tables ---
    avg_table = tycho.calc_average_y_vals_per_MW(clean)

    # --- Split ---
    Splitter = tycho.FourWaySplit()
    X_train_df, X_test_df, y_train_all, y_test_all = Splitter.split(clean)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~ Pipeline ~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    pandas_pipe = Pipeline(steps=[
        ('capacity', tycho.CapacityFeatures()),
        ('date', tycho.DateFeatures()),
        ('avg_values', tycho.ApplyAvgY(avg_table)),
        ('dropnull', tycho.DropNullColumns()),
        ('onehot', tycho.OneHotEncodeWithThresh()),
    ])

    numpy_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer()),
        ('scaler', tycho.LowMemoryMinMaxScaler()),
    ])

    preprocess_pipe = Pipeline(steps=[
        ('pd', pandas_pipe),
        ('np', numpy_pipe),
    ])

    # --- Fit/transform ---
    X_train = preprocess_pipe.fit_transform(X_train_df)
    X_test = preprocess_pipe.transform(X_test_df)

    # --- Create complete dfs for output ---
    train_out_df = X_train_df[[
        'datetime_utc', 'plant_id_wri', 'estimated_generation_gwh',
        'primary_fuel'
    ]]
    train_out_df = pd.concat([train_out_df, y_train_all], axis='columns')

    test_out_df = X_test_df[[
        'datetime_utc', 'plant_id_wri', 'estimated_generation_gwh',
        'primary_fuel'
    ]]
    test_out_df = pd.concat([test_out_df, y_test_all], axis='columns')

    # --- output preprocessing pipe ---
    if save_pickles:
        out_path = os.path.join('models', config.TRAIN_MODEL)
        with open(os.path.join(out_path, 'pipe.pkl'), 'wb') as handle:
            pickle.dump(preprocess_pipe, handle)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ Train Model ~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for y_col in config.ML_Y_COLS:
        log.info('\n')
        log.info(f'....beginning fit for {y_col} using {config.TRAIN_MODEL}')

        # --- Subset y ---
        y_train = np.array(y_train_all[y_col])
        y_test = np.array(y_test_all[y_col])

        # --- Initialize Model ---
        if config.TRAIN_MODEL == 'lr':
            model = LinearRegression(fit_intercept=True,
                                     normalize=False,
                                     n_jobs=-1)

        elif config.TRAIN_MODEL == 'bayes-lgbm':

            estimator = LGBMRegressor(
                random_state=1,
                n_jobs=12,
                verbose=-1,
                num_iterations=1000,
                boosting_type=None,
                learning_rate=0.03,
                subsample=0.7,
                boosting='dart',
            )

            lgbm_pbounds = {
                # 'boosting':['gbdt','dart'],
                # 'learning_rate': (0.01, 1.),
                # 'n_estimators': (2, 2000),
                'max_depth': (3, 12),
                # 'min_child_weight': (0., 100.),
                # 'min_data_in_leaf' : (1, 40),
                'num_leaves': (
                    2, 2000
                ),  # large num_leaves helps improve accuracy but might lead to over-fitting
                # 'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
                'objective': ['rmse', 'mae', 'tweedie'],
                'max_bin': (
                    128, 10000
                ),  # large max_bin helps improve accuracy but might slow down training progress
                # 'colsample_bytree' : (0.3,1),
                # 'subsample' : (0.3, 1.),
                # 'reg_alpha' : (0., 300.),
                # 'reg_lambda' : (0., 300.),
            }

            model = tycho.BayesRegressor(estimator=estimator,
                                         pbounds=lgbm_pbounds)

        elif config.TRAIN_MODEL == 'bayes-xgb':

            estimator = XGBRegressor(random_state=1,
                                     nthread=12,
                                     tree_method='gpu_hist',
                                     single_precision_histogram=True,
                                     validate_parameters=True)

            xgb_pbounds = {
                'booster': ['dart', 'gbtree', 'gblinear'],
                'max_depth': (3, 11),
                # 'learning_rate': (0.1, 0.5),
                'subsample': (0.1, 1.),
                # 'sampling_metod':['uniform','gradient_based'],
                'colsample_bytree': (0.1, 1.),
                # 'colsample_bylevel': (0.1, 1.),
                'max_bin': (
                    2, 10000
                ),  # large max_bin helps improve accuracy but might slow down training progress
                # 'grow_policy':['depthwise','lossguide'],
                # 'min_child_weight': (0., 100),
                'reg_alpha': (0., 250.),
                'reg_lambda': (0., 250.),
                'gamma': (0., 10.),
                # 'objective': ['reg:tweedie'],
            }

            model = tycho.BayesRegressor(estimator=estimator,
                                         pbounds=xgb_pbounds)

        # --- Fit  ---
        model.fit(X_train, y_train)

        # --- Get best estimator ---
        y_train_pred = model.predict(X_train)
        log.info(
            f'........best train MAE for {y_col}: {mae(y_train, y_train_pred)}'
        )
        log.info(
            f'........best train mape for {y_col}: {mape(y_train, y_train_pred)}'
        )
        log.info(
            f'........average value for {y_col} is {y_train.mean()}, MAE as a percent is {mae(y_train, y_train_pred) / y_train.mean()}'
        )

        # --- Predict on test ---
        y_pred = model.predict(X_test)
        log.info(f'........best test mae for {y_col}: {mae(y_test, y_pred)}')
        log.info(f'........best test mape for {y_col}: {mape(y_test, y_pred)}')
        log.info(
            f'........average value for {y_col} is {y_test.mean()}, MAE as a percent is {mae(y_test, y_pred) / y_test.mean()}'
        )

        if save_pickles:
            if config.TRAIN_MODEL == 'tpot':
                # --- Output model pipeline ---
                model.export(
                    os.path.join(out_path, f'tpot_best_pipe_{y_col}.py'))
                best = model.fitted_pipeline_
                with open(
                        os.path.join(
                            out_path,
                            f'model_{y_col}_{config.TRAIN_MODEL}.pkl'),
                        'wb') as handle:
                    pickle.dump(best, handle)

            else:
                # --- Output model ---
                with open(
                        os.path.join(
                            out_path,
                            f'model_{y_col}_{config.TRAIN_MODEL}.pkl'),
                        'wb') as handle:
                    pickle.dump(model, handle)

        # --- save predictions to out dfs ---
        train_out_df[f'pred_{y_col}'] = y_train_pred
        test_out_df[f'pred_{y_col}'] = y_pred

    return train_out_df, test_out_df
Example #12
def create_model(hyperparams):
    from lightgbm import LGBMRegressor

    return LGBMRegressor(**hyperparams)
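# Usage sketch: the factory simply forwards keyword hyperparameters to
# LGBMRegressor (the values shown here are illustrative):
lgbm_model = create_model({'n_estimators': 500, 'learning_rate': 0.05, 'num_leaves': 31})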
Example #13
gc.collect()

drop_cols = ["codmes"]
test_preds = []
train_preds = []
y_train["target"] = y_train["margen"].astype("float32")
for mes in X_train.codmes.unique():
    print("*" * 10, mes, "*" * 10)
    Xt = X_train[X_train.codmes != mes]
    yt = y_train.loc[Xt.index, "target"]
    Xt = Xt.drop(drop_cols, axis=1)

    Xv = X_train[X_train.codmes == mes]
    yv = y_train.loc[Xv.index, "target"]

    learner = LGBMRegressor(n_estimators=5000)
    learner.fit(Xt,
                yt,
                early_stopping_rounds=50,
                eval_metric="mae",
                eval_set=[(Xt, yt), (Xv.drop(drop_cols, axis=1), yv)],
                verbose=50)
    gc.collect()
    test_preds.append(
        pd.Series(learner.predict(X_test.drop(drop_cols, axis=1)),
                  index=X_test.index,
                  name="fold_" + str(mes)))
    train_preds.append(
        pd.Series(learner.predict(Xv.drop(drop_cols, axis=1)),
                  index=Xv.index,
                  name="probs"))
Example #14










#############################################

from lightgbm import LGBMRegressor

lgbm = LGBMRegressor()
lgbm_param_grid = {
        # LightGBM: n_estimators, max_depth, num_leaves, subsample, colsample_bytree
        'n_estimators': [50],
        'learning_rate': [0.1],
        'colsample_bytree': [0.6],
        'max_depth': [-1],
        'num_leaves': [31],
        'reg_alpha': [1.5],
        'reg_lambda': [0],
        'min_split_gain': [0],
        'subsample': [0.2],
        'subsample_freq': [0]
        }
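# Sketch of how this grid might be searched; the GridSearchCV import and the
# X_train / y_train names are assumptions, not part of this example.
from sklearn.model_selection import GridSearchCV

lgbm_search = GridSearchCV(lgbm, lgbm_param_grid, cv=5, scoring='neg_mean_squared_error')
lgbm_search.fit(X_train, y_train)
print(lgbm_search.best_params_)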
"""Module that contains the model configuration used in the training pipeline."""
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklego.preprocessing import ColumnSelector
from sklearn.compose import ColumnTransformer

from lightgbm import LGBMRegressor

from src.config import config

RUN_NAME = "lightgbm"

#Prepare pipeline
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

column_selector = ColumnSelector(config.FEATURES)

preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer,
                                                config.NUMERIC_FEATURES),
                                               ('cat', categorical_transformer,
                                                config.CATEGORICAL_FEATURES)])

#Create model
lgbm_model = LGBMRegressor()

model = Pipeline([('column_selector', column_selector),
                  ("preprocessor", preprocessor), ("regressor", lgbm_model)])
Example #16
 def test_lightgbm_regressor2(self):
     model = LGBMRegressor(n_estimators=2, max_depth=1, min_child_samples=1)
     dump_single_regression(model, suffix="2")
Example #17
 # XGB is tuned using the result obtained from xsede2.py
 "xgb": XGBRegressor(n_estimators=1000,
                     learning_rate=0.01, 
                     min_child_weight=3,
                     max_depth=6, 
                     gamma=0, 
                     subsample=0.8, 
                     colsample_bytree=0.85, 
                     random_state=0), 
 "lgb": LGBMRegressor(objective='regression', 
                      num_leaves=5, 
                      learning_rate=0.05, 
                      n_estimators=720, 
                      max_bin=55, 
                      bagging_fraction=0.8, 
                      bagging_freq=5, 
                      feature_fraction=0.2319, 
                      feature_fraction_seed=9, 
                      bagging_seed=9, 
                      min_data_in_leaf=6, 
                      min_sum_hessian_in_leaf=11,
                      random_state=0),
 "rf": RandomForestRegressor(n_estimators=1000, 
                             bootstrap=True, 
                             max_features='sqrt', 
                             max_depth=6, 
                             min_samples_split=3, 
                             min_samples_leaf=1, 
                             random_state=0), 
 "knn": KNeighborsRegressor(n_neighbors = 10), 
 "ada": AdaBoostRegressor(n_estimators=1000,
Example #18
                             catb_params,
                             cv=5,
                             n_jobs=-1,
                             verbose=2).fit(X_train, y_train)
catb_cv_model.best_params_ = {
    'depth': 3,
    'iterations': 500,
    'learning_rate': 0.1
}
# Final Model
catb_tuned = CatBoostRegressor(**catb_cv_model.best_params_).fit(
    X_train, y_train)
y_pred = catb_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

# LightGBM: Model & Prediction
lgb_model = LGBMRegressor().fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))
# Model Tuning
lgb_model = LGBMRegressor()
lgbm_params = {
    "learning_rate": [0.01, 0.001, 0.1, 0.5, 1],
    "n_estimators": [200, 500, 1000, 5000],
    "max_depth": [6, 8, 10, 15, 20],
    "colsample_bytree": [1, 0.8, 0.5, 0.4]
}
lgbm_cv_model = GridSearchCV(lgb_model,
                             lgbm_params,
                             cv=10,
                             n_jobs=-1,
                             verbose=2).fit(X_train, y_train)
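# Tuned model (sketch), mirroring the CatBoost flow above:
lgb_tuned = LGBMRegressor(**lgbm_cv_model.best_params_).fit(X_train, y_train)
y_pred = lgb_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))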
Example #19
learning_rate and n_estimators usually move together.
Since this is not a scikit-learn package, installing the GPU build also makes GPU training possible.
"""

## LightGBM
"""
Key characteristics
    Not part of the scikit-learn package.
    Excellent performance.
    Also very fast.
"""
from lightgbm import LGBMRegressor, LGBMClassifier

lgbm = LGBMRegressor(random_state=42,
                     learning_rate=0.01,
                     n_estimators=2000,
                     colsample_bytree=0.8,
                     subsample=0.8,
                     max_depth=7)
lgbm.fit(x_train, y_train)
lgbm_pred = lgbm.predict(x_test)
mse_eval('LGBM Ensemble', lgbm_pred, y_test)
"""
# Key hyperparameters
    - random_state : fixes the random seed. Keep it fixed while tuning.
    - n_jobs : number of CPU cores to use
    - learning_rate : learning rate. Too large a value hurts performance, and too small a value makes training slow; an appropriate value has to be found. default = 0.1
    - n_estimators : number of boosting stages; the default is 100
    - max_depth : depth of the trees. Helps prevent overfitting. default = 3
    - colsample_bytree : fraction of features sampled per tree (similar in concept to max_features). Helps prevent overfitting. default = 1.0

learning_rate and n_estimators usually move together.
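# Illustration of the rule of thumb above (values are illustrative): when the
# learning rate is halved, n_estimators is roughly doubled to compensate.
lgbm_a = LGBMRegressor(random_state=42, learning_rate=0.02, n_estimators=1000)
lgbm_b = LGBMRegressor(random_state=42, learning_rate=0.01, n_estimators=2000)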
Example #20

xgb = XGBRegressor(booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=4, min_child_weight=1.5, n_estimators=2400,
             n_jobs=1, nthread=None, objective='reg:linear',
             reg_alpha=0.6, reg_lambda=0.6, scale_pos_weight=1, 
             silent=None, subsample=0.8, verbosity=1)


lgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=12000, 
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.4, 
                                       )


#Fitting
xgb.fit(x_train, y_train)
lgbm.fit(x_train, y_train, eval_metric='rmse')


predict1 = xgb.predict(x_test)
predict = lgbm.predict(x_test)

Example #21

# In[174]:


cv_rmse(svr_fit).mean()
"""

from lightgbm import LGBMRegressor

lgbm_model = LGBMRegressor(objective='regression',
                           num_leaves=31,
                           learning_rate=0.1,
                           n_estimators=200,
                           max_bin=100,
                           bagging_fraction=0.8,
                           bagging_freq=5,
                           feature_fraction=0.8,
                           feature_fraction_seed=9,
                           bagging_seed=9,
                           min_data_in_leaf=20,
                           min_sum_hessian_in_leaf=11)
lgbm_fit = lgbm_model.fit(X_train, y_train)

# In[198]:

cv_rmse(lgbm_fit).mean()

from mlxtend.regressor import StackingCVRegressor
from sklearn.pipeline import make_pipeline

#setup models
Example #22
    gbr = GradientBoostingRegressor(n_estimators=3000,
                                    learning_rate=0.05,
                                    max_depth=4,
                                    max_features='sqrt',
                                    min_samples_leaf=15,
                                    min_samples_split=10,
                                    loss='huber',
                                    random_state=42)

    lightgbm = LGBMRegressor(
        objective='regression',
        num_leaves=4,
        learning_rate=0.01,
        n_estimators=5000,
        max_bin=200,
        bagging_fraction=0.75,
        bagging_freq=5,
        bagging_seed=7,
        feature_fraction=0.2,
        feature_fraction_seed=7,
        verbose=-1,
    )

    xgboost = XGBRegressor(learning_rate=0.01,
                           n_estimators=3460,
                           max_depth=3,
                           min_child_weight=0,
                           gamma=0,
                           subsample=0.7,
                           colsample_bytree=0.7,
                           objective='reg:linear',
Example #23
        'colsample_bytree': [0.6, 0.7, 0.8],
        'n_estimators': [500, 1000],
        'random_state': [0]
    },
    'lightGBM': {
        'min_data_in_leaf': [100, 300, 500, 1000, 1500],
        'num_leaves': [15, 30, 40, 50, 60],
        'max_depth': [15, 30, 45],
        'random_state': [0]
    }
}

model_map = {
    'random_forest': RandomForestRegressor(),
    'xgboost': XGBRegressor(),
    'lightGBM': LGBMRegressor()
}

opt = docopt(__doc__)


# Label-encode the categorical features and split the data
def preprocess(full_train, full_test):
    X_train = full_train.drop(['price'], axis=1)
    y_train = full_train['price']
    X_test = full_test.drop(['price'], axis=1)
    y_test = full_test['price']

    for feature in categorical_features:
        le = LabelEncoder()
        le.fit(X_train[feature])
Example #24
X_train, X_test, y_train, y_test = train_test_split(reg_features,
                                                    reg_target,
                                                    test_size=0.25,
                                                    random_state=42)
'''
'''

reg_features = data_reg_process(reg_features)

reg = LGBMRegressor(num_leaves=40,
                    max_depth=7,
                    n_estimators=10000,
                    min_child_weight=10,
                    subsample=0.7,
                    colsample_bytree=0.7,
                    reg_alpha=0,
                    learning_rate=0.1,
                    reg_lambda=0.5,
                    bagging_fraction=0.8,
                    bagging_freq=5,
                    feature_fraction=0.2319,
                    feature_fraction_seed=9,
                    bagging_seed=9)
'''
Actual regression modeling
'''

reg.fit(reg_features,
        reg_target,
        eval_set=[(reg_features, reg_target)],
        eval_metric='rmse',
        early_stopping_rounds=100)
Example #25
#!/usr/bin/env python

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error
from lightgbm import LGBMRegressor

features = ['Market', 'Day', 'Stock', 'x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6']

df = pd.read_csv('train.csv', index_col=0)
df = df.fillna(0) # replace NaN entries
df_test = pd.read_csv('test.csv', index_col=0)
df_test = df_test.fillna(0) # replace NaN entries

weights = []
for index, row in df.iterrows():
	weights.append(float(row['Weight']))


X = df[features]
Y = df['y']

model = LGBMRegressor(n_estimators=10000, learning_rate=0.01, min_data_in_leaf=40, num_leaves=80, num_iterations=110)

model.fit(X,Y, sample_weight = weights)
yp = pd.Series(model.predict(df_test[features])).rename('y')
yp.index.name = 'Index'
print(yp.head())

yp.to_csv('GradientBoostedRegressor4c.csv', header=True)
Example #26
        wlist = {'train': d_train, 'eval': d_val}
        model = lgbm.train(params=params,
                           train_set=d_train,
                           valid_sets=d_val,
                           evals_result=wlist)
        models.append(model)

    return models


final_y_test_pred = []
final_y_pred = []

# Fit one model per target column (4 in total)
for i in range(4):
    model = LGBMRegressor(**params)
    model.fit(x_train,
              y_train[:, i],
              eval_set=[(x_train, y_train[:, i]), (x_test, y_test[:, i])],
              verbose=True)

    y_test_pred = model.predict(x_test)
    score = model.score(x_test, y_test[:, i])
    mae = MAE(y_test[:, i], y_test_pred)
    print("r2 : ", score)
    print("mae :", mae)

    thresholds = np.sort(model.feature_importances_)[[
        i for i in range(0, len(model.feature_importances_), 30)
    ]]
    print("model.feature_importances_ : ", model.feature_importances_)
Example #27
dtest = xgb.DMatrix(test)

predictions = final_gb.predict(dtest)
pd.DataFrame(predictions,
             columns=['Fees']).to_csv('prediction_DOC_XGBCV_14.csv')

############################################################################

lightgbm = LGBMRegressor(objective='regression',
                         num_leaves=450,
                         learning_rate=0.1,
                         n_estimators=1200,
                         max_bin=30,
                         bagging_fraction=0.8,
                         bagging_freq=9,
                         feature_fraction=0.129,
                         feature_fraction_seed=9,
                         bagging_seed=9,
                         min_data_in_leaf=3,
                         min_sum_hessian_in_leaf=6,
                         random_state=10)

xgb = XGBRegressor(learning_rate=0.1,
                   n_estimators=1500,
                   max_depth=30,
                   min_child_weight=12,
                   gamma=0,
                   reg_alpha=2e-5,
                   subsample=0.8,
                   colsample_bytree=0.8,
Example #28
y = data['Airfare(NZ$)']
X = data.drop(['Airfare(NZ$)'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create cross-validation
cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=24)

# Define a scoring system
def mse(y, y_pred):
    return mean_squared_error(y, y_pred)


# Define models
lgbm = LGBMRegressor(
        learning_rate=0.1,
        max_bin=150,
        boosting_type='goss'
)

svr = SVR(kernel='rbf')

rf = RandomForestRegressor(n_jobs=-1,
                           oob_score=True)

# Grid-search
param_grid_lgbm = {
    'num_leaves': [80],
    'max_depth': [7, 10],
    'n_estimators': [200],
    'min_data_in_leaf': [100, 300]
}
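# Sketch of running the grid search with the CV splitter and models defined above
# (the GridSearchCV / make_scorer imports are assumptions):
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

grid_lgbm = GridSearchCV(lgbm, param_grid_lgbm, cv=cv,
                         scoring=make_scorer(mse, greater_is_better=False),
                         n_jobs=-1)
grid_lgbm.fit(X_train, y_train)
print(grid_lgbm.best_params_)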
Example #29
 def test_lightgbm_regressor(self):
     model = LGBMRegressor(n_estimators=3, min_child_samples=1)
     dump_single_regression(model)
Example #30
def run(dt,
        rttFlag=True,
        method="xgboost",
        dir="../../data/throughputRelation/data/1"):

    train = pd.read_csv(dir + "/train/sample.csv", delimiter=' ')
    test = pd.read_csv(dir + "/test/sample.csv", delimiter=' ')
    test2 = pd.read_csv(dir + "/test/sample.csv", delimiter=' ')
    len_train = train.shape[0]
    datas = pd.concat([train, test], sort=False)
    if HIT_FLAG == True:
        if rttFlag:
            train.drop(['hTime', 'maxMT', 'reqCount', 'avgMT', 'totalMT'],
                       axis=1,
                       inplace=True)
            test.drop(['hTime', 'maxMT', 'reqCount', 'avgMT', 'totalMT'],
                      axis=1,
                      inplace=True)
        else:
            train.drop(
                ['hTime', 'rtt', 'maxMT', 'reqCount', 'avgMT', 'totalMT'],
                axis=1,
                inplace=True)
            test.drop(
                ['hTime', 'rtt', 'maxMT', 'reqCount', 'avgMT', 'totalMT'],
                axis=1,
                inplace=True)
    else:
        if rttFlag:
            train.drop(['mTime', 'maxMT', 'reqCount', 'avgMT', 'totalMT'],
                       axis=1,
                       inplace=True)
            test.drop(['mTime', 'maxMT', 'reqCount', 'avgMT', 'totalMT'],
                      axis=1,
                      inplace=True)
        else:
            train.drop(
                ['mTime', 'rtt', 'maxMT', 'reqCount', 'avgMT', 'totalMT'],
                axis=1,
                inplace=True)
            test.drop(
                ['mTime', 'rtt', 'maxMT', 'reqCount', 'avgMT', 'totalMT'],
                axis=1,
                inplace=True)

    len_train = train.shape[0]

    datas = pd.concat([train, test], sort=False)
    #
    # skew_ = datas.select_dtypes(include=['int', 'float']).apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
    # skew_df = pd.DataFrame({'Skew': skew_})
    # skewed_df = skew_df[(skew_df['Skew'] > 0.5) | (skew_df['Skew'] < -0.5)]
    #
    # print(skewed_df.index)
    if HIT_FLAG == True:
        if rttFlag:
            skew_column = ['mTime', 'size', 'rtt', 'mThroughput']
        else:
            skew_column = ['mTime', 'size', 'mThroughput']
    else:
        if rttFlag:
            skew_column = ['hTime', 'size', 'rtt', 'hThroughput']
        else:
            skew_column = ['hTime', 'size', 'hThroughput']

    lam = 0.1
    for col in skew_column:
        train[col] = boxcox1p(train[col], lam)
        test[col] = boxcox1p(test[col], lam)
    if HIT_FLAG == True:
        train['hThroughput'] = np.log(train['hThroughput'])
        x = train.drop('hThroughput', axis=1)
        y = train['hThroughput']
        x_test = test.drop('hThroughput', axis=1)
    else:
        train['mThroughput'] = np.log(train['mThroughput'])
        x = train.drop('mThroughput', axis=1)
        y = train['mThroughput']
        x_test = test.drop('mThroughput', axis=1)

    print(x.columns)
    if method == "lasso":
        model = Lasso(max_iter=1e7, alpha=0.0001, random_state=1)
    elif method == "ridge":
        model = Ridge(alpha=14.5)
    elif method == "lgbm":
        model = LGBMRegressor(
            objective='regression',
            max_depth=6,
            num_leaves=4,
            learning_rate=0.05,
            n_estimators=5000,
            max_bin=200,
            bagging_fraction=0.75,
            bagging_freq=5,
            bagging_seed=7,
            feature_fraction=0.2,
            feature_fraction_seed=7,
            verbose=-1,
        )
    elif method == "LinearRegression":
        model = LinearRegression()
    else:
        # model = XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=500, silent=False, objective='reg:gamma')
        model = XGBRegressor(max_depth=6,
                             learning_rate=0.05,
                             n_estimators=5000,
                             silent=False,
                             objective='reg:gamma')
    startTime = time.time()
    model.fit(x, y)
    endTime = time.time()
    trainTime = endTime - startTime
    if HIT_FLAG:
        modelDir = dir + "/m2h/model/" + dt
    else:
        modelDir = dir + "/h2m/model/" + dt
    if rttFlag:
        modelDir += "/rtt"
    else:
        modelDir += "/nortt"

    if not os.path.exists(modelDir):
        os.makedirs(modelDir)

    joblib.dump(model, modelDir + "/" + method + ".m")

    startTime = time.time()
    y_pred = model.predict(x_test)
    endTime = time.time()

    plt.rcParams['figure.figsize'] = (4.0, 4.0)
    fig, ax = plt.subplots()
    if HIT_FLAG:
        y_test = test2['hThroughput'] / 1024 / 1024
    else:
        y_test = test2['mThroughput'] / 1024 / 1024

    y_pred = np.exp(y_pred) / 1024 / 1024
    # ax.scatter(y_test, y_pred, s=2, alpha=0.1)
    # ax.plot([y_pred.min(), y_pred.max()], [y_pred.min(), y_pred.max()], 'k--', lw=4)
    # ax.set_xlabel('Real throughput/Mbps')
    # ax.set_ylabel('Predicted throughput/Mbps')
    # plt.title(method)
    # plt.savefig("../plot/"+method+".pdf", bbox_inches = 'tight')
    # plt.show()

    MSE_loss = np.average(np.square(y_pred - y_test))
    print(method, "MSE_loss =", MSE_loss, "testTime=", endTime - startTime,
          "trainTime=", trainTime)
    return method, MSE_loss, endTime - startTime, trainTime
ica2_results_test = ica.transform(test)



# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]

    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]


y = np.array(train["y"])
X=np.array(train.drop('y', axis=1))

#LightGBM Regressor
model = LGBMRegressor(boosting_type='gbdt', num_leaves=10, max_depth=4, learning_rate=0.005, n_estimators=675,
                      max_bin=25, subsample_for_bin=50000, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                      subsample=0.995, subsample_freq=1, colsample_bytree=1, reg_alpha=0, reg_lambda=0, seed=0,
                      nthread=-1, silent=True)

#Fit to training data
model.fit(X, y)

# Generate Predictions
test['y']=model.predict(test)
# output=test[['ID', 'y']]
#
# #Save predictions to 'output.csv'
# output.to_csv('output.csv', index=False)