Example no. 1
import pandas as pd  # required by data_acquisition() below
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV

CAT_FEATS = ['SIGNIFICANT', 'SERIOUS']

NUM_FEATS = ['FATAL', 'INJURE', 'UNINTENTIONAL_RELEASE_BBLS', 'ACCIDENT_PSIG', 'MOP_PSIG', 'RECOVERED_BBLS',
             'PIPE_DIAMETER', 'PIPE_SMYS', 'EX_HYDROTEST_PRESSURE', 'MANUFACTURED_YEAR', 'NORMAL_PSIG',
             'ACCOMPANYING_LIQUID']

FEATS = NUM_FEATS + CAT_FEATS

TARGET = 'TOTAL_COST_CURRENT'

model_type = LGBMRegressor()

lgbm_param_grid = {'regressor__num_leaves': (20, 100),
                   'regressor__n_estimators': (20, 500),
                   'regressor__learning_rate': (0.05, 0.3),
                   'regressor__feature_fraction': (0.1, 0.9),
                   'regressor__bagging_fraction': (0.8, 1),
                   'regressor__max_depth': (15, 25),
                   'regressor__min_split_gain': (0.001, 0.1),
                   'regressor__min_child_weight': (10, 50),
                   'regressor__preprocessor__num__imputer__strategy': ['mean', 'median']}


def data_acquisition():

    return pd.read_csv('./data/processed/pipelines_incident_for_modelling.csv', low_memory=False)
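# Hedged sketch (not part of the original snippet): one plausible way to assemble the
# preprocessing/model pipeline that the '__'-separated keys in lgbm_param_grid point at.
# Step names, CV settings and scoring are assumptions; the step names must match the key
# prefixes, and with this flat layout the imputer key would read
# 'preprocessor__num__imputer__strategy' rather than the nested form used above.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                      ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, NUM_FEATS),
    ('cat', OneHotEncoder(handle_unknown='ignore'), CAT_FEATS)])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', model_type)])

search = RandomizedSearchCV(pipeline, lgbm_param_grid, n_iter=20, cv=5,
                            scoring='neg_root_mean_squared_error', random_state=42)
# df = data_acquisition()
# search.fit(df[FEATS], df[TARGET])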
Example no. 2
# The imports and the start of this call are truncated in the original snippet; the lines
# added here (variable name inferred from xgb.fit below) make the fragment well-formed.
import math
from sklearn import metrics
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

xgb = XGBRegressor(n_jobs=1,
                   nthread=None,
                   objective='reg:linear',
                   reg_alpha=0.6,
                   reg_lambda=0.6,
                   scale_pos_weight=1,
                   silent=None,
                   subsample=0.8,
                   verbosity=1)

lgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=12000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.4,
)

#Fitting
xgb.fit(x_train, y_train)
lgbm.fit(x_train, y_train, eval_metric='rmse')

predict1 = xgb.predict(x_test)
predict = lgbm.predict(x_test)

print('Root Mean Square Error test = ' +
      str(math.sqrt(metrics.mean_squared_error(y_test, predict1))))
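# For comparison, the same metric for the LightGBM predictions computed above
# ('predict' is otherwise unused in this fragment).
print('Root Mean Square Error test (LGBM) = ' +
      str(math.sqrt(metrics.mean_squared_error(y_test, predict))))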
Example no. 3
    df_train, targets, df_test = data.split_data(df, logerror)

    dtrain = xgb.DMatrix(df_train.values, targets)
    dtest = xgb.DMatrix(df_test.values)

    sub_model = xgb.train(
            model_params.get_xtune11k(),
            dtrain, num_boost_round=105,
            )
    xgb_preds = sub_model.predict(dtest)
    print( "\n XGB predictions:" )
    print( pd.DataFrame(xgb_preds).head() )

    ################
    ##  LightGBM  ##
    ################
    sub_model = LGBMRegressor(**model_params.get_ltune7k())
    sub_model.fit(df_train, targets)
    lgb_preds = sub_model.predict(df_test)
    print( "\n LGB predictions:" )
    print( pd.DataFrame(lgb_preds).head() )

    weights = (xgb_weight, 1-xgb_weight)
    final_preds = tools.ensemble_preds([xgb_preds, lgb_preds], weights)
    data.generate_simple_kaggle_file(final_preds, 'ensemble')

    print( "\n Ensemble predictions:" )
    print( pd.DataFrame(final_preds).head() )
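# tools.ensemble_preds is a project-specific helper; a minimal sketch of what a weighted
# blend of prediction arrays could look like (an assumption, not the repo's implementation):
import numpy as np

def ensemble_preds(pred_list, weights):
    """Weighted average of several prediction arrays; weights are expected to sum to 1."""
    return np.average(np.vstack(pred_list), axis=0, weights=weights)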

Example no. 4
 def objective(param):
     tuning_pipeline = make_pipeline(RobustScaler(), LGBMRegressor(**param))
     loss = -nm_penalty(tuning_pipeline, [tunning_train_x, train_y],
                        np.ones(tunning_train_x.shape[1]))
     return loss
# After trying various runs of grid search the following set of parameter grid values gave good results
lgbm_models = []
# nFix Model
lgbm_models.append(
    LGBMRegressor(boosting_type='gbdt',
                  class_weight=None,
                  colsample_bytree=1.0,
                  importance_type='split',
                  lambda_l1=4.6,   # lambda_l1/lambda_l2 are aliases of reg_alpha/reg_lambda (also set further down)
                  lambda_l2=8.6,
                  learning_rate=0.1,
                  max_depth=-1,
                  min_child_samples=20,
                  min_child_weight=0.001,
                  min_split_gain=0.0,
                  n_estimators=100,
                  n_jobs=-1,
                  num_leaves=75,
                  objective=None,
                  random_state=10,
                  reg_alpha=0,
                  reg_lambda=0,
                  silent=True,
                  subsample=1.0,
                  subsample_for_bin=200000,
                  subsample_freq=0))
# FFD Model
lgbm_models.append(
    LGBMRegressor(boosting_type='gbdt',
                  class_weight=None,
                  colsample_bytree=1.0,
Example no. 6
# time 481.330000162

# predict using GBM measured by mape


def mape(y_pred, y_true):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


X_train, X_test, y_train, y_test = train_test_split(full_data.drop('Stake',
                                                                   axis=1),
                                                    full_data['Stake'],
                                                    test_size=0.33)


def valid_mape(clf):
    clf = clf.fit(X_train, np.log(y_train))
    y_pred = clf.predict(X_test)
    acc = 100 - mape(np.exp(y_pred), y_test)
    print('Accuracy is {0}%'.format(acc))
    return acc


valid_mape(GradientBoostingRegressor(learning_rate=.01,
                                     n_estimators=300))  # 50%
valid_mape(LGBMRegressor(learning_rate=.01, n_estimators=1200))

plt.hist(np.log(full_data['Stake']))
plt.show()
Example no. 7
df.head()
df.shape
check_df(df)

df.dropna(inplace=True)

y = df["SALARY"]  # dependent variable
X = df.drop(["SALARY"], axis=1)  # independent variables

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=17)

#######################################
# LightGBM: Model & Prediction
#######################################

lgb_model = LGBMRegressor().fit(X_train, y_train)
y_pred = lgb_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))  # 313

#######################################
# Model Tuning
#######################################

lgb_model = LGBMRegressor()

lgbm_params = {"learning_rate": [0.01, 0.1],
               "n_estimators": [500, 1000],
               "max_depth": [3, 5, 8],
               "colsample_bytree": [1, 0.8, 0.5]}

lgbm_cv_model = GridSearchCV(lgb_model,           # the remaining arguments of this call are cut off
                             lgbm_params,          # in the original snippet; the values below are assumptions
                             cv=5,
                             n_jobs=-1,
                             verbose=1).fit(X_train, y_train)
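# Hedged follow-up sketch: refit with the best parameters found and score on the test split.
lgbm_final = LGBMRegressor(**lgbm_cv_model.best_params_).fit(X_train, y_train)
y_pred = lgbm_final.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))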
Example no. 8
rand_param_xgb = {  # opening of this dict is truncated in the original snippet
    'n_estimators': range(100, 300, 50),
    'eta': [0.1, 0.2],
    'max_depth': range(3, 10, 1),
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
rand_search_xgb = RandomizedSearchCV(estimator=xgbreg,
                                     param_distributions=rand_param_xgb,
                                     verbose=1,
                                     n_jobs=-1,
                                     n_iter=200,
                                     cv=8)
rand_search_xgb.fit(X_train, y_train)
best_param = rand_search_xgb.best_params_
best_param

xgbreg = XGBRegressor(subsample=0.5, n_estimators=150, max_depth=4, eta=0.1)
xgbreg.fit(X_train, y_train)
xgbreg.score(X_test, y_test)

# LightGBM Regressor
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
lgbm.score(X_test, y_test)
lgbm.score(X_train, y_train)

X_train.columns
#Saving models
joblib.dump(ranreg, 'RFReg_model.ml')
joblib.dump(xgbreg, 'XGBReg_model.ml')
joblib.dump(lgbm, 'LGBMReg_model.ml')
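# Hedged usage note: the persisted estimators can be restored later with joblib.load,
# e.g. lgbm = joblib.load('LGBMReg_model.ml').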
Example no. 9
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    global keras_imported

    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {},
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'learning_rate': 0.1,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'learning_rate': 0.1,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {},
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search == True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
        )
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(
            calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(
            calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if keras_imported == False:
            # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:
                pass

            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        raise (e)

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
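# Hedged usage sketch for the factory above: training_params overrides the stock defaults
# defined in all_model_params (the n_estimators value here is only illustrative).
lgbm = get_model_from_name('LGBMRegressor', training_params={'n_estimators': 500})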
Example no. 10
def get_model(data, target, use_ensemble=True):

    params1 = {
        'el__alpha': np.logspace(-5, 2, 30),
        'el__l1_ratio': np.linspace(0, 1, 3),
        'pca__n_components': [2, 5, 10]
    }

    params2 = {
        'rf__n_estimators': range(10, 101, 30),
        'rf__max_depth': [2, 5, 9],
        'pca__n_components': [2, 5, 10]
    }

    params3 = {
        'lgb__learning_rate': np.logspace(-6, 0, 5),
        'lgb__n_estimators': range(10, 101, 30),
        'lgb__max_depth': [6, 9, 12],
        'pca__n_components': [2, 5, 10],
        'lgb__num_leaves': [100]
    }

    rf = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                   ('rf', RandomForestRegressor())])
    el = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                   ('el', ElasticNet(max_iter=5000))])
    lgb = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                    ('lgb', LGBMRegressor())])

    gr_lgb = GridSearchCV(lgb,
                          params3,
                          cv=TimeSeriesSplit(),
                          scoring='neg_mean_squared_error',
                          refit=True)
    gr_lgb.fit(data, target)
    logger.info('Booster params discovered')

    gr_el = GridSearchCV(el,
                         params1,
                         cv=TimeSeriesSplit(),
                         scoring='neg_mean_squared_error',
                         refit=True)
    gr_el.fit(data, target)
    logger.info('ElasticNet params discovered')

    gr_rf = GridSearchCV(rf,
                         params2,
                         cv=TimeSeriesSplit(),
                         scoring='neg_mean_squared_error',
                         refit=True)
    gr_rf.fit(data, target)
    logger.info('RandomForest params discovered')

    res_scores = {
        'elastic': gr_el.best_score_,
        'random_forest': gr_rf.best_score_,
        'lgbm': gr_lgb.best_score_
    }

    res_est = {
        'elastic': gr_el.best_estimator_,
        'random_forest': gr_rf.best_estimator_,
        'lgbm': gr_lgb.best_estimator_
    }
    if use_ensemble:
        estimators = [('elastic', gr_el.best_estimator_),
                      ('random_forest', gr_rf.best_estimator_),
                      ('lgbm', gr_lgb.best_estimator_)]

        stacked = StackingRegressor(estimators=estimators,
                                    final_estimator=RandomForestRegressor(
                                        n_estimators=100, max_depth=3),
                                    passthrough=True)
        stacked.fit(data, target)
        logger.info('Ensemble fitted')
        return stacked
    return res_est[sorted(res_scores, key=lambda x: (-res_scores[x], x))[0]]
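# Hedged usage sketch (variable names are illustrative):
# model = get_model(X_history, y_history, use_ensemble=True)
# forecast = model.predict(X_future)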
Example no. 11
from xgboost import XGBRegressor
xgb_model = XGBRegressor(learning_rate=0.01, n_estimators=3300,
                         objective="reg:linear",
                         max_depth=3, min_child_weight=2,
                         gamma=0, subsample=0.6,
                         colsample_bytree=0.7,
                         scale_pos_weight=1, seed=0,
                         reg_alpha=0, reg_lambda=1)
xgb_model.fit(x_train, y_train)

#4-4 LGBMRegressor
from lightgbm import LGBMRegressor
lgbm_model = LGBMRegressor(learning_rate=0.01, n_estimators=2900,
                           objective='regression',
                           max_depth=3, min_child_weight=0,
                           gamma=0,  # note: 'gamma' is an XGBoost name; LightGBM's analogue is min_split_gain
                           subsample=0.6, colsample_bytree=0.6,
                           scale_pos_weight=1, seed=0,
                           reg_alpha=0.1, reg_lambda=0)
lgbm_model.fit(x_train, y_train)

#4-5SVR
from sklearn.svm import SVR
SVR_model  = SVR(C = 10, epsilon = 0.1, gamma = 1e-06)
SVR_model.fit(x_train, y_train)

#4-6ElasticNetCV
from sklearn.linear_model import ElasticNetCV
alphas = [0.0001, 0.0002, 0.0003]
l1ratio = [0.5, 0.6, 0.7, 0.8, 0.7]
elastic_model = ElasticNetCV(max_iter=1e7, alphas = alphas, cv = kfolds, l1_ratio = l1ratio)
plt.matshow(train.corr())

#encoding string value to integer
train['Depth'] = LabelEncoder().fit_transform(train['Depth'].astype(str))
test['Depth'] = LabelEncoder().fit_transform(test['Depth'].astype(str))

# setting training and output columns value
output = ['Ca', 'P', 'pH', 'SOC', 'Sand']
output_val = train[output].values
train.drop(output, axis=1, inplace=True)

#classifiers
rf = RandomForestRegressor(n_estimators=300, min_samples_leaf=30)
ada = AdaBoostRegressor(random_state=42)
grad = GradientBoostingRegressor(n_estimators=101, random_state=42)
lgb = LGBMRegressor(max_depth=7, learning_rate=0.08)

#multiple output regressor
multi_regressor = MultiOutputRegressor(lgb)

#training classifiers
multi_regressor.fit(train, output_val)

#prediction
pred = multi_regressor.predict(test[train.columns])

#saving the value to csv
sample = pd.read_csv('sample_submission.csv')
sample[['Ca', 'P', 'pH', 'SOC', 'Sand']] = pred
sample.to_csv('abc.csv', index=None)
Example no. 13
print("Time:", time.time() - start_time)
y_pred = svm_reg.predict(X_test)

time.time()

print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))
"""# Gradient Boosted Trees"""

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

lgbm_reg = LGBMRegressor(
    n_estimators=1000,
    random_state=42,
    objective='mape',
    num_iterations=5000,  # note: num_iterations is an alias of n_estimators, so it conflicts with the 1000 above
)

start_time = time.time()
# Training the GBT model
lgbm_reg.fit(X_train,
             y_train,
             eval_set=(X_valid, y_valid),
             eval_metric='mape',
             early_stopping_rounds=200)
print("Time: ", time.time() - start_time)

y_pred = lgbm_reg.predict(X_test)

print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
from pandas import DataFrame
# import matplotlib.pyplot as plt
from tqdm import tqdm
from lightgbm import LGBMRegressor
from onnxruntime import InferenceSession
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_regressor_output_shapes  # noqa
from onnxmltools import __version__ as oml_version
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm  # noqa

import numpy

N = 1000
X = numpy.random.randn(N, 20)
y = (numpy.random.randn(N) +
     numpy.random.randn(N) * 100 * numpy.random.randint(0, 1, 1000))

reg = LGBMRegressor(n_estimators=1000)
reg.fit(X, y)

######################################
# Register the converter for LGBMRegressor
# ++++++++++++++++++++++++++++++++++++++++
#
# The converter is implemented in :epkg:`onnxmltools`:
# `onnxmltools...LightGbm.py
# <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
# lightgbm/operator_converters/LightGbm.py>`_.
# and the shape calculator:
# `onnxmltools...Regressor.py
# <https://github.com/onnx/onnxmltools/blob/master/onnxmltools/convert/
# lightgbm/shape_calculators/Regressor.py>`_.
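# Hedged sketch of the registration and conversion steps the comment above refers to
# (alias string and float32 cast follow the skl2onnx/onnxmltools examples; treat the
# exact values as assumptions):
update_registered_converter(
    LGBMRegressor, 'LightGbmLGBMRegressor',
    calculate_linear_regressor_output_shapes,
    convert_lightgbm)
# model_onnx = to_onnx(reg, X[:1].astype(numpy.float32))
# sess = InferenceSession(model_onnx.SerializeToString())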
fi = []
test_probs = []
train_probs = []
aux = 0  # assumed initialization (not shown in the original snippet): skip the first four months
for mes in X_train.codmes.unique():
    if aux < 4:
        aux += 1
        continue
    print("*" * 10, mes, "*" * 10)
    Xt = X_train[X_train.codmes != mes]
    yt = y_train.loc[Xt.index, "target"]
    Xt = Xt.drop(drop_cols, axis=1)

    Xv = X_train[X_train.codmes == mes]
    yv = y_train.loc[Xv.index, "target"]

    learner = LGBMRegressor(n_estimators=10000)
    learner.fit(Xt,
                yt,
                early_stopping_rounds=100,
                eval_metric="mae",
                eval_set=[(Xt, yt), (Xv.drop(drop_cols, axis=1), yv)],
                verbose=50)
    gc.collect()
    test_probs.append(
        pd.Series(learner.predict(X_final_test.drop(drop_cols, axis=1)),
                  index=X_final_test.index,
                  name="fold_" + str(mes)))
    train_probs.append(
        pd.Series(learner.predict(Xv.drop(drop_cols, axis=1)),
                  index=Xv.index,
                  name="probs"))
    #     'early_stopping_rounds': [100, 200, 300],
}

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'verbose': -1,
    'learning_rate': 0.02,
    'n_estimators': 200,
    'num_leaves': 100,
}

if APPLY_GRID:
    print("Starting Grid")
    best_score, best_params = grid_search_tscv(LGBMRegressor(**params),
                                               X,
                                               y,
                                               grid,
                                               tscv=10)
    params.update(best_params)
    print(f"Best params {best_params}")
    print("Done")

model = LGBMRegressor(**params)
model.fit(X_train, y_train)
predict = model.predict(X_valid)
mean_absolute_percentage_error(y_valid, predict)

if FEATURE_SELECTION:
    print("start features selector")
Example no. 17
# Evaluate each model
lr_reg = LinearRegression()
ridge_reg = Ridge(alpha=10)
lasso_reg = Lasso(alpha=0.01)

for model in [lr_reg, ridge_reg, lasso_reg]:
    get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=True)

coef = pd.Series(lr_reg.coef_, index=X_features_ohe.columns)
coef_sort = coef.sort_values(ascending=False)[:20]
sns.barplot(x=coef_sort.values, y=coef_sort.index)

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Evaluate the Random Forest, GBM, XGBoost, and LightGBM models
rf_reg = RandomForestRegressor(n_estimators=500)
gbm_reg = GradientBoostingRegressor(n_estimators=500)
xgb_reg = XGBRegressor(n_estimators=500)
lgbm_reg = LGBMRegressor(n_estimators=500)

for model in [rf_reg, gbm_reg, xgb_reg, lgbm_reg]:
    # Depending on the XGBoost version, passing a DataFrame can raise an error, so convert to ndarray.
    get_model_predict(model,
                      X_train.values,
                      X_test.values,
                      y_train.values,
                      y_test.values,
                      is_expm1=True)
Example no. 18
def automl_train(train_csv, model_dir, mode):

    start_time = time.time()

    df = pd.read_csv(train_csv)
    df_y = df.target

    df_X = df.drop('target', axis=1)
    is_big = df_X.memory_usage().sum() > BIG_DATASET_SIZE

    print('Dataset read, shape {}'.format(df_X.shape))

    # drop constant features
    constant_columns = [
        col_name for col_name in df_X.columns if df_X[col_name].nunique() == 1
    ]
    df_X.drop(constant_columns, axis=1, inplace=True)

    # dict with data necessary to make predictions
    model_config = {}
    model_config['categorical_values'] = {}
    model_config['is_big'] = is_big

    if is_big:
        # missing values
        if df_X.isnull().values.any():
            model_config['missing'] = True
            df_X.fillna(-1, inplace=True)

        new_feature_count = min(
            df_X.shape[1],
            int(df_X.shape[1] /
                (df_X.memory_usage().sum() / BIG_DATASET_SIZE)))
        # take only high correlated features
        correlations = np.abs([
            np.corrcoef(df_y, df_X[col_name])[0, 1]
            for col_name in df_X.columns if col_name.startswith('number')
        ])
        new_columns = df_X.columns[np.argsort(correlations)
                                   [-new_feature_count:]]
        df_X = df_X[new_columns]

    else:
        # features from datetime
        df_X = transform_datetime_features(df_X)

        # categorical encoding
        categorical_values = {}
        mean_encoding = {"unknown": df_y.mean()}
        mean_encoding_values = {}

        for col_name in list(df_X.columns):
            col_unique_values = df_X[col_name].unique()

            # Mean target encoding
            if 2 < len(col_unique_values) <= TARGET_ENCODING_MAX_VALUES:
                mean_encoding_values[col_name] = col_unique_values
                df_X["number_%s_mean_encoding" %
                     col_name] = mean_encoding["unknown"]
                for unique_value in col_unique_values:
                    mean_encoding["%s_%s" % (col_name, unique_value)] = df_y[
                        df_X[col_name] == unique_value].mean()
                    df_X.loc[df_X[col_name] == unique_value, "number_%s_mean_encoding" % col_name] = \
                        mean_encoding["%s_%s" % (col_name, unique_value)]

            # One hot encoding
            if 2 < len(col_unique_values) <= ONEHOT_MAX_UNIQUE_VALUES:
                categorical_values[col_name] = col_unique_values
                for unique_value in col_unique_values:
                    df_X['onehot_{}={}'.format(col_name, unique_value)] = (
                        df_X[col_name] == unique_value).astype(int)

        model_config['categorical_values'] = categorical_values
        model_config['mean_encoding'] = mean_encoding
        model_config['mean_encoding_values'] = mean_encoding_values

        # missing values
        if df_X.isnull().values.any():
            model_config['missing'] = True
            df_X.fillna(-1, inplace=True)

    # use only numeric columns
    used_columns = [
        col_name for col_name in df_X.columns
        if col_name.startswith('number') or col_name.startswith('onehot')
    ]
    df_X = df_X[used_columns]

    # Finding the most informative features from the coefficients of a linear model
    if mode == 'regression':
        model = Ridge(alpha=0.3, copy_X=False)
    else:
        model = LogisticRegression(C=0.3, n_jobs=-1)

    model.fit(df_X, df_y)

    # Generate new features from most informative features by pair-wise division
    feature_generation_columns = []
    for r, i in sorted(zip(np.ravel(model.coef_), df_X.columns))[:10]:
        feature_generation_columns.append(i)

    for i in feature_generation_columns:
        for j in feature_generation_columns:
            if i == j:
                continue
            k = df_X[j]
            k[k == 0] = 0.0001

            df_X["number_%s_%s" % (i, j)] = df_X[i] / k

    df_X = df_X.values
    model_config['used_columns'] = used_columns
    model_config['feature_generation_columns'] = feature_generation_columns

    # scaling
    scaler = StandardScaler(copy=False)
    df_X = scaler.fit_transform(df_X)
    model_config['scaler'] = scaler

    # fitting
    model_config['mode'] = mode

    kf = KFold(n_splits=3)
    models = []

    if not is_big:
        for i, (train, test) in enumerate(kf.split(df_X)):
            print("FOLD ", i)

            if mode == 'regression':
                model = LGBMRegressor(reg_alpha=0.3,
                                      reg_lambda=0.1,
                                      min_child_weight=10,
                                      zero_as_missing=True,
                                      learning_rate=0.01,
                                      num_leaves=100,
                                      feature_fraction=0.7,
                                      bagging_fraction=0.7,
                                      n_estimators=800,
                                      n_jobs=-1,
                                      min_child_samples=30)
            else:
                model = LGBMClassifier(reg_alpha=0.3,
                                       reg_lambda=0.1,
                                       min_child_weight=10,
                                       zero_as_missing=True,
                                       learning_rate=0.01,
                                       num_leaves=100,
                                       feature_fraction=0.7,
                                       bagging_fraction=0.7,
                                       n_estimators=800,
                                       n_jobs=-1,
                                       min_child_samples=30)

            train_x, test_x, train_y, test_y = df_X[train], df_X[test], df_y[
                train], df_y[test]
            # train_test_split(df_X, df_y, test_size=0.15)

            model.fit(train_x,
                      train_y,
                      eval_set=(test_x, test_y),
                      early_stopping_rounds=7)
            models.append(model)

        model = ModelsEnsemble(models)
    else:
        if TIME_LIMIT > 5 * 60:
            if mode == 'regression':
                model = LGBMRegressor(reg_alpha=0.3,
                                      reg_lambda=0.1,
                                      min_child_weight=10,
                                      zero_as_missing=True,
                                      learning_rate=0.01,
                                      num_leaves=200,
                                      feature_fraction=0.7,
                                      bagging_fraction=0.7,
                                      n_estimators=800,
                                      n_jobs=-1,
                                      min_child_samples=60)
            else:
                model = LGBMClassifier(reg_alpha=0.3,
                                       reg_lambda=0.1,
                                       min_child_weight=10,
                                       zero_as_missing=True,
                                       learning_rate=0.01,
                                       num_leaves=200,
                                       feature_fraction=0.7,
                                       bagging_fraction=0.7,
                                       n_estimators=800,
                                       n_jobs=-1,
                                       min_child_samples=60)

            train_x, test_x, train_y, test_y = train_test_split(df_X,
                                                                df_y,
                                                                test_size=0.15)
            model.fit(train_x,
                      train_y,
                      eval_set=(test_x, test_y),
                      early_stopping_rounds=7)
        else:
            if mode == 'regression':
                model = Ridge(alpha=0.2, copy_X=False)
            else:
                model = LogisticRegression(C=0.2, n_jobs=-1)

            model.fit(df_X, df_y)

    model_config['model'] = model

    model_config_filename = os.path.join(model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))
Example no. 19
        # Returns a boxplot of the features; set display=True to view the plots (use False for automation)
        feature_selector.plot(
            x_size=12,
            figsize=(12, 8),
            y_scale="log",
            which_features="all",
            display=False,
        )


if __name__ == "__main__":
    tree_classifiers = {
        "tree-classifier": DecisionTreeClassifier(),
        "forest-classifier": RandomForestClassifier(),
        "xgboost-classifier": XGBClassifier(),
        "lightgbm-classifier": LGBMClassifier(),
        "catboost-classifier": CatBoostClassifier(),
    }

    tree_regressors = {
        "tree-regressor": DecisionTreeRegressor(),
        "forest-regressor": RandomForestRegressor(),
        "xgboost-regressor": XGBRegressor(),
        "lightgbm-regressor": LGBMRegressor(),
        "catboost-regressor": CatBoostRegressor(),
    }

    test_models("regression", tree_regressors)
    test_models("classification", tree_classifiers)
Example no. 20
def main(input_file_path, output_file_path, tgt="Oil_norm", n_splits=5):
    input_file_name = os.path.join(input_file_path, "Train_final.pck")
    input_file_name_test = os.path.join(input_file_path, "Test_final.pck")

    output_file_name = os.path.join(output_file_path, f"models_lgbm_{tgt}.pck")

    df = pd.read_pickle(input_file_name).drop(exclude_cols, axis=1)
    df_test = pd.read_pickle(input_file_name_test)
    ids = df_test["EPAssetsId"]
    ids_uwi = df_test["UWI"]

    df_test = df_test.drop(exclude_cols, axis=1)

    cv = KFold(n_splits=n_splits, shuffle=False)
    models = []
    scores = []
    scores_dm = []
    y = df.loc[~df[tgt].isna(), tgt]
    X = df.loc[~df[tgt].isna(), keep_only_cols]
    X_test = df_test.copy().loc[:, keep_only_cols]

    preds_test = np.zeros((n_splits, df_test.shape[0]))
    k = 0
    for train_index, test_index in cv.split(X):
        X_train, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
        model = LGBMRegressor(
            num_leaves=16,
            learning_rate=0.1,
            n_estimators=300,
            reg_lambda=30,
            reg_alpha=30,
            objective="mae",
            random_state=123,
        )
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        geom_mean = gmean(y_train)
        dm = DummyRegressor(strategy="constant", constant=geom_mean)

        model.fit(
            X_train, y_train, categorical_feature=["Field", "WellTypeStandardised"]
        )
        # model.fit(X_train, y_train)
        dm.fit(X_train, y_train)

        score = mean_absolute_error(y_val, model.predict(X_val))
        score_dm = mean_absolute_error(y_val, dm.predict(X_val))

        # logging.info(f' Score = {score}')
        models.append(model)
        scores.append(score)
        scores_dm.append(score_dm)
        preds_test[k, :] = model.predict(X_test).reshape(1, -1)
        k += 1  # move to the next fold's row of preds_test

    with open(output_file_name, "wb") as f:
        pickle.dump(models, f)
    logging.info(scores)
    logging.info(f"Mean scores LGBM = {np.mean(scores)}")
    logging.info(f"Mean scores Dummy = {np.mean(scores_dm)}")

    preds_df = pd.DataFrame(
        {"EPAssetsID": ids, "UWI": ids_uwi, tgt: preds_test.mean(axis=0)}
    )
    return preds_df
Example no. 21
    return best


if __name__ == "__main__":

    # 1 Load total_data_cache
    train_x = data_help.train_x
    train_y = data_help.train_y
    total_data_cache = [train_x, train_y]
    feature_names = train_x.columns

    # 2 Define model and parameters
    robust_lightgbm = make_pipeline(
        RobustScaler(),
        LGBMRegressor(random_state=42, objective="huber", n_jobs=2))
    LGBMRegressor_dic = {
        'learning_rate': hyperopt.hp.uniform('learning_rate', 0, 2),
        'reg_alpha': hyperopt.hp.uniform('reg_alpha', 0, 2),
        'reg_lambda': hyperopt.hp.uniform('reg_lambda', 0, 2),
        'n_jobs': hyperopt.hp.choice('n_jobs', [1]),
        'random_state': hyperopt.hp.choice('random_state', [42]),
        'max_depth': hyperopt.hp.randint('max_depth', 11) + 1,
        'n_estimators': hyperopt.hp.randint('n_estimators', 250) + 1,
        'subsample': hyperopt.hp.uniform('subsample', 0, 1)
    }

    # 3 Genetic Algorithm for feature selection
    key = "lightgbm"
    probability = 0.7
    bench_model = robust_lightgbm
Example no. 22
train_x, valid_x, train_y, valid_y = train_test_split(X,
                                                      Y,
                                                      test_size=0.2,
                                                      random_state=0)
xgb_params = {}
# xgb_params['n_estimators'] = 50
xgb_params['min_child_weight'] = 12
xgb_params['learning_rate'] = 0.37
xgb_params['max_depth'] = 6
xgb_params['subsample'] = 0.77
xgb_params['reg_lambda'] = 0.8
xgb_params['reg_alpha'] = 0.4
xgb_params['base_score'] = 0
# xgb_params['seed'] = 400
xgb_params['silent'] = 1
xgb_model = LGBMRegressor(**xgb_params)  # note: despite the xgb_ names, this fits a LightGBM model
xgb_model.fit(train_x, train_y)
pre_test = xgb_model.predict(valid_x)
print("the result of 1582 dimensional features")
#metrics model
mse = metrics.mean_squared_error(pre_test, valid_y)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(valid_y, pre_test)  # r2_score expects (y_true, y_pred)
import math


def calcMean(x, y):
    sum_x = sum(x)
    sum_y = sum(y)
    n = len(x)
    x_mean = float(sum_x + 0.0) / n
Example no. 23
n_splits = 5
k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=7)
n_fold = 0
perds = np.zeros(X_test.shape[0])
for train_index, val_index in k_fold.split(X, y):
    x_train, y_train = X[train_index], y[train_index]
    x_val, y_val = X[val_index], y[val_index]

    clf0 = ('clf0', LinearRegression(n_jobs=-1))
    clf1 = ('clf1', SGDRegressor())
    clf2 = ('clf2', RandomForestRegressor(n_jobs=-1))
    clf3 = ('clf3', SVR())
    clf4 = ('clf4', CatBoostRegressor())
    clf5 = ('clf5', KNeighborsRegressor(n_jobs=-1))
    clf6 = ('clf6', XGBRegressor(n_jobs=-1))
    clf7 = ('clf7', LGBMRegressor(n_jobs=-1))

    model = VotingRegressor([clf0, clf1, clf2, clf3, clf4, clf5, clf6, clf7],
                            n_jobs=-1)
    model.fit(x_train, y_train)
    pred = model.predict(X_test)
    perds += pred  # VotingRegressor.predict returns a 1-D array
    print('Validation set MAE: {}'.format(mean_absolute_error(model.predict(x_val),
                                                               y_val)))
    print('Training set MAE: {}'.format(
        mean_absolute_error(model.predict(x_train), y_train)))

    dataframe = pd.DataFrame({
        'index': range(len(X_test)),
        'label': model.predict(X_test)
    })
Example no. 24
def train():

    seed = 0

    df = pd.read_csv('listings.csv')

    train, test = train_test_split(df,
                                   test_size=0.2,
                                   random_state=seed,
                                   shuffle=True)

    # Drop unnecessary columns
    train = train[[
        'neighbourhood_group', 'neighbourhood', 'room_type', 'minimum_nights',
        'price'
    ]]
    test = test[[
        'neighbourhood_group', 'neighbourhood', 'room_type', 'minimum_nights',
        'price'
    ]]

    # Power Transform
    X_train = train.drop(['price'], axis=1)
    y_train = train['price'].values

    X_test = test.drop(['price'], axis=1)
    y_test = test['price'].values

    num_cols = X_train._get_numeric_data().columns.tolist()

    pt = PowerTransformer(method='yeo-johnson')

    X_train[num_cols] = pt.fit_transform(X_train[num_cols])
    X_test[num_cols] = pt.transform(X_test[num_cols])

    # saving transformer first
    joblib.dump(pt.fit(y_train.reshape(-1, 1)), 'powerTransform.joblib')

    y_train = pt.fit_transform(y_train.reshape(-1, 1))
    y_test = pt.transform(y_test.reshape(-1, 1))

    # Label Encoder
    le = LabelEncoder()

    cat_cols_train = X_train.select_dtypes(
        include=['string', 'object']).columns.tolist()

    cat_cols_test = X_test.select_dtypes(
        include=['string', 'object']).columns.tolist()

    for col in cat_cols_train:

        joblib.dump(le.fit(X_train[col].astype('string')),
                    'le_{}.joblib'.format(col))

        X_train[col] = le.fit_transform(X_train[col].astype('string'))

    # I fit the test dataset because it contains previously unseen labels in the train dataset
    for col in cat_cols_test:
        X_test[col] = le.fit_transform(X_test[col].astype('string'))

    # Outliers
    X_train['price'] = y_train.ravel().tolist()

    X_train.drop(X_train[(X_train['price'] < -4)].index, inplace=True)

    y_train = X_train['price']

    X_train.drop('price', axis=1, inplace=True)

    # Model
    X_train = X_train.values

    y_train = y_train.values

    model = LGBMRegressor(max_depth=10, num_leaves=20, random_state=0)

    model.fit(X_train, y_train)

    joblib.dump(model, "model.joblib")
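# Hedged inference sketch (feature preparation omitted, X_new is illustrative):
# model = joblib.load('model.joblib')
# pt = joblib.load('powerTransform.joblib')
# price = pt.inverse_transform(model.predict(X_new).reshape(-1, 1))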
copyfile(config.path+'/config.py', path+'/config.py')

logging.basicConfig(level=logging.DEBUG,
    format='%(asctime)s:%(name)s:%(levelname)s:%(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[logging.FileHandler(path + '/train.log', mode='w'), logging.StreamHandler()])

train = pd.read_csv(config.path+'/data/train_modified.tsv', sep=' ', index_col=0)
test = pd.read_csv(config.path+'/data/test_modified.tsv', sep=' ', index_col=0)

train.fillna('', inplace=True)
test.fillna('', inplace=True)

prep = pickle.load(open(config.features_path+'/all_tfidf_svd.pickle.dat', 'rb'))
splits = pickle.load(open(config.features_path+'/all_tfidf_svd_splits.pickle.dat', 'rb'))
lgbm = LGBMRegressor(**config.params)

fold, cv_score, cv_mse, cv_mae = 0, 0, 0, 0
models = {k:{'counter':v, 'model':lgbm} for k, v in prep.items()}

for train_index, test_index in splits:
    logging.info('make features...')
    train_data = pickle.load(open(config.features_path+'/train_fold_%i.pickle.dat' % fold, 'rb'))
    test_data = pickle.load(open(config.features_path+'/test_fold_%i.pickle.dat' % fold, 'rb'))

    logging.info(train_data.shape)
    logging.info(test_data.shape)
    
    logging.info('fitting model...')
    if config.early_stopping:
        models[fold]['model'].fit(train_data, train['target'].loc[train_index],
Example no. 26
                'BN_') or col.startswith('MU_'):
        X[col] = X[col].astype(bool)

# In[87]:

for col in X.columns:
    if col.startswith('hr_'):
        X.drop(col, axis=1, inplace=True)

# In[88]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# In[89]:

clf = LGBMRegressor(n_jobs=-1, n_estimators=500)
clf.fit(X_train, y_train)

# In[90]:

explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_train)

# In[95]:

shap.summary_plot(shap_values, X_train, max_display=100)

# In[92]:

shap.summary_plot(shap_values, X, plot_type="bar", max_display=50)
Example no. 27
print('Model RMSE: ', np.sqrt(mse(y_train,xgb_pred_train)))

print('test')
print('Model score on the test set: ',xgb_model.score(x_test,y_test))
print('Model MAE: ', mae(y_test,xgb_pred))
print('Model RMSE: ', np.sqrt(mse(y_test,xgb_pred)))


# ## LGBM

# In[19]:


from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(random_state=42)
lgbm_model = lgbm.fit(x_train,y_train)
lgbm_pred = lgbm_model.predict(x_test)
lgbm_pred_train = lgbm_model.predict(x_train)


# Checking model performance

print('train')
print('Model score on the training set: ',lgbm_model.score(x_train,y_train))
print('Model MAE: ', mae(y_train,lgbm_pred_train))
print('Model RMSE: ', np.sqrt(mse(y_train,lgbm_pred_train)))

print('test')
print('Model score on the test set: ',lgbm_model.score(x_test,y_test))
print('Model MAE: ', mae(y_test,lgbm_pred))
Example no. 28
			now_time = repay_time + relativedelta(days=10)
			tmp_df['due_amount_sum_10'] = tmp_table.loc[
				(tmp_table['due_date'] >= repay_time) & (tmp_table['due_date'] <= now_time), 'due_amt'].sum()
			# due_amount_sum_20: total amount due on loans whose first repayment date falls between day T and T+20
			now_time = repay_time + relativedelta(days=20)
			tmp_df['due_amount_sum_20'] = tmp_table.loc[
				(tmp_table['due_date'] >= repay_time) & (tmp_table['due_date'] <= now_time), 'due_amt'].sum()

			feature_tabel = pd.concat([feature_tabel, tmp_df], axis=0)
	feature_tabel.reset_index(drop=True, inplace=True)
	return feature_tabel


time_range_test = pd.date_range(start='2019-2-1', end='2019-3-31')
feature_tabel_test = generate_feature_test(test, time_range_test)
LGBMRegressor()

param_grid = {
	'boosting_type': ['gbdt'],
	'n_estimators': list(np.arange(100, 3000, step=10)),
	'num_leaves': list(range(20, 150)),
	'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base=10, num=1000)),
	'subsample_for_bin': list(range(20000, 300000, 20000)),
	'min_child_samples': list(range(20, 500, 5)),
	'reg_alpha': list(np.linspace(0, 1)),
	'reg_lambda': list(np.linspace(0, 1)),
	'colsample_bytree': list(np.linspace(0.6, 1, 10)),
	'subsample': list(np.linspace(0.5, 1, 100)),
}
predictors = list(filter(lambda x: x not in ['repay_time', 'flag', 'label'], feature_tabel.columns))
model = LGBMRegressor()
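# Hedged sketch of how param_grid might be searched (n_iter/cv/scoring are assumptions,
# and RandomizedSearchCV is assumed to be imported elsewhere in the original script):
# search = RandomizedSearchCV(model, param_grid, n_iter=50, cv=3,
#                             scoring='neg_mean_absolute_error', n_jobs=-1)
# search.fit(feature_tabel[predictors], feature_tabel['label'])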
Example no. 29
        ############# Pipeline Predict ################
        preds_val = pipeline.predict(X_val)
        # preds_test = clf.predict(X_test)

        fold_score = np.sqrt(mean_squared_error(y_val, preds_val))
        fold_model_fitment = round((r2_score(y_val, preds_val)) * 100)

        print(f'\nRMSE for validation set is {fold_score}')
        print(f'Model Fitment for validation set is {fold_model_fitment} %')

        oofs[val_idx] = preds_val
        # preds += preds_test / N_SPLITS

    oofs_score = np.sqrt(mean_squared_error(target, oofs))
    oofs_model_fitment = round((r2_score(target, oofs)) * 100)

    print(f'\n\n------------- Overall  -------------')
    print(f' So, RMSE for oofs is {oofs_score}')
    print(f' & Overall Model Fitment for oofs is {oofs_model_fitment} %')

    #print(oofs)
    return oofs


#clf = XGBRegressor(n_estimators=200, n_jobs=-1)
#xgb_oofs = run_clf_kfold(clf, train, cat_num_cols)

clf = LGBMRegressor()
lgb_oofs = run_clf_kfold(clf, train, cat_num_cols)
Example no. 30
                                                    target,
                                                    test_size=0.33,
                                                    random_state=0)

# In[157]:

xgb = XGBRFRegressor(colsample_bynode=1,
                     colsample_bytree=0.6,
                     learning_rate=0.01,
                     max_delta_step=4,  # assumed intent: 'max_delta' in the original is not an XGBoost parameter
                     min_child_weight=1.5,
                     n_estimators=2400,
                     reg_alpha=0.6,
                     reg_lambda=0.6)
lgbm = LGBMRegressor(objective='regression',
                     num_leaves=4,
                     learning_rate=0.01,
                     n_estimators=12000)

# In[158]:

xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train, eval_metric='rmse')

# In[162]:

predict1 = xgb.predict(X_test)
predict2 = lgbm.predict(X_test)

# In[164]:

print('Root Mean Square Error test = ' +