Example #1
0
def get_importances_from_model(X,
                               y,
                               features=None,
                               verbose=50,
                               early_stopping_rounds=200):
    """Fit a binary LightGBM classifier on a holdout split and return it.

    Args:
        X (pandas.DataFrame): Feature matrix.
        y: Binary target aligned with ``X``.
        features (list[str] | None): Feature names passed to the Dataset;
            defaults to ``X.columns`` when None.
        verbose (int): Period (in rounds) for evaluation logging.
        early_stopping_rounds (int): Stop when the validation AUC has not
            improved for this many rounds.

    Returns:
        lightgbm.Booster: Trained booster; feature importances can be
        read via ``feature_importance()``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.03,
        'metric': 'auc',
        'num_iterations': 10000,  # upper bound; early stopping decides the real count
        'colsample_bytree': 0.5,
        'subsample': 0.8,
        'reg_alpha': 0.3,
        'reg_lambda': 0.3,
        'max_depth': 8,
    }

    # BUG FIX: use identity comparison with None. ``features == None`` would
    # broadcast element-wise if a numpy/pandas object were ever passed.
    if features is None:
        features = X.columns.tolist()

    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=2017)

    lgb_train = Dataset(data=train_X, label=train_y, feature_name=features)
    lgb_val = Dataset(data=val_X, label=val_y, feature_name=features)

    lgb_booster = train(params=lgb_params,
                        train_set=lgb_train,
                        valid_sets=[lgb_train, lgb_val],
                        valid_names=["train", "validation"],
                        verbose_eval=verbose,
                        early_stopping_rounds=early_stopping_rounds)

    return lgb_booster
Example #2
0
def fit_lgb(x_tr, y_tr, x_va, y_va, cat_feats, args):
    """Train a LightGBM regressor and return ``(model, train_rmse, valid_rmse)``.

    In ``full``/``fold`` modes no validation set is used (the reported
    validation RMSE is 0.0); otherwise early stopping on ``(x_va, y_va)``
    is enabled with a patience of 200 rounds.
    """
    from lightgbm import Dataset

    # Optionally cap the regression target before training.
    if args.clip_target != -1:
        y_tr = y_tr.clip(upper=args.clip_target)

    use_valid = args.mode not in ['full', 'fold']

    tr_ds = Dataset(x_tr, label=y_tr, free_raw_data=False)
    va_ds = Dataset(x_va, label=y_va, free_raw_data=False) if use_valid else None
    valid_sets = [tr_ds, va_ds] if use_valid else [tr_ds]

    params = dict(
        learning_rate=0.02,
        max_depth=-1,
        boosting='gbdt',
        objective='regression',
        metric='rmse',
        is_training_metric=True,
        num_leaves=args.num_leaves,
        feature_fraction=0.9,
        bagging_fraction=0.7,
        lambda_l2=0.7,
        bagging_freq=5,
        seed=42,
    )

    train_kwargs = dict(
        train_set=tr_ds,
        categorical_feature=cat_feats,
        verbose_eval=args.verbose_eval,
        num_boost_round=args.num_boost_round,
    )
    if use_valid:
        train_kwargs['early_stopping_rounds'] = 200
        train_kwargs['valid_sets'] = valid_sets
    if args.lr_decay:
        # Per-iteration learning-rate schedule via a reset_parameter callback.
        train_kwargs['callbacks'] = [
            lgb.reset_parameter(
                learning_rate=learning_rate_010_decay_power_0995)
        ]

    m = lgb.train(params, **train_kwargs)

    # RMSE on (clipped) training predictions.
    tr_pred = np.clip(m.predict(tr_ds.data), 0, 361)
    tr_score = np.sqrt(mean_squared_error(tr_pred, tr_ds.label))

    if use_valid:
        va_pred = np.clip(m.predict(va_ds.data), 0, 361)
        va_score = np.sqrt(mean_squared_error(va_pred, va_ds.label))
    else:
        va_score = 0.

    return m, tr_score, va_score
Example #3
0
def get_importances_from_model(X, y, features=None):
    """Fit a binary LightGBM classifier on all of ``(X, y)`` and return it.

    Args:
        X (pandas.DataFrame): Feature matrix.
        y: Binary target aligned with ``X``.
        features (list[str] | None): Feature names passed to the Dataset;
            defaults to ``X.columns`` when None.

    Returns:
        lightgbm.Booster: Trained booster; feature importances can be
        read via ``feature_importance()``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.02,
        'metric': 'auc',
        'colsample_bytree': 0.75,
        'subsample': 0.75,
        'n_estimators': 1500,
    }

    # BUG FIX: use identity comparison with None. ``features == None`` would
    # broadcast element-wise if a numpy/pandas object were ever passed.
    if features is None:
        features = X.columns.tolist()

    lgb_train = Dataset(data=X, label=y, feature_name=features)

    # ``num_boost_round`` mirrors the 1500 rounds set in ``n_estimators``.
    lgb_booster = train(params=lgb_params,
                        train_set=lgb_train,
                        verbose_eval=50,
                        num_boost_round=1500)

    return lgb_booster
Example #4
0
    def test_onnxrt_python_lightgbm_categorical_iris(self):
        """Check LightGBM -> ONNX conversion on int32 iris features, both
        for the sklearn classifier and for a raw booster trained with
        categorical features, comparing probabilities to the native ones."""
        iris = load_iris()
        X, y = iris.data, iris.target
        # Scale and cast to int32 so each feature takes a small set of
        # integer values (suitable as categories below).
        X = (X * 10).astype(numpy.int32)
        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
        # Pad the training set with random integer rows labelled 3/4/5 so
        # feature values seen at prediction time also occur in training.
        other_x = numpy.random.randint(0,
                                       high=10,
                                       size=(1500, X_train.shape[1]))
        X_train = numpy.vstack([X_train, other_x]).astype(dtype=numpy.int32)
        y_train = numpy.hstack([
            y_train,
            numpy.zeros(500) + 3,
            numpy.zeros(500) + 4,
            numpy.zeros(500) + 5
        ]).astype(dtype=numpy.int32)
        self.assertEqual(y_train.shape, (X_train.shape[0], ))
        # Collapse the labels to a binary target.
        y_train = y_train % 2

        # Classic
        gbm = LGBMClassifier()
        gbm.fit(X_train, y_train)
        exp = gbm.predict_proba(X_test)
        onx = to_onnx(gbm,
                      initial_types=[('X',
                                      Int64TensorType([None,
                                                       X_train.shape[1]]))])
        self.assertIn('ZipMap', str(onx))
        oif = OnnxInference(onx)
        got = oif.run({'X': X_test})
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values, decimal=5)

        # categorical_feature=[0, 1]
        train_data = Dataset(X_train,
                             label=y_train,
                             feature_name=['c1', 'c2', 'c3', 'c4'],
                             categorical_feature=['c1', 'c2'])

        params = {
            "boosting_type": "gbdt",
            "learning_rate": 0.05,
            "n_estimators": 2,
            "objective": "binary",
            "max_bin": 5,
            "min_child_samples": 100,
            'verbose': -1,
        }

        booster = lgb_train(params, train_data)
        exp = booster.predict(X_test)

        onx = to_onnx(booster,
                      initial_types=[('X',
                                      Int64TensorType([None,
                                                       X_train.shape[1]]))])
        self.assertIn('ZipMap', str(onx))
        oif = OnnxInference(onx)
        got = oif.run({'X': X_test})
        values = pandas.DataFrame(got['output_probability']).values
        # Binary Booster.predict returns P(class=1) only, hence values[:, 1].
        self.assertEqualArray(exp, values[:, 1], decimal=5)
Example #5
0
    def test_lightgbm_booster_multi_classifier(self):
        """Convert a multiclass LightGBM Booster to ONNX (zipmap disabled)
        and compare the ONNX probabilities with the native prediction."""
        X = [[0, 1], [1, 1], [2, 0], [1, 2], [-1, 2], [1, -2]]
        X = numpy.array(X, dtype=numpy.float32)
        y = [0, 1, 0, 1, 2, 2]
        data = Dataset(X, label=y)
        model = train(
            {
                'boosting_type': 'gbdt',
                'objective': 'multiclass',
                'n_estimators': 3,
                'min_child_samples': 1,
                'num_class': 3
            }, data)

        # Register shape calculator / converter / parser for the wrapper
        # classes a raw Booster may be wrapped in during conversion.
        update_registered_converter(WrappedLightGbmBoosterClassifier,
                                    'WrappedLightGbmBoosterClassifier',
                                    calculate_lightgbm_output_shapes,
                                    convert_lightgbm,
                                    parser=lightgbm_parser,
                                    options={
                                        'zipmap': [False, True],
                                        'nocl': [False, True]
                                    })
        update_registered_converter(WrappedBooster,
                                    'WrappedBooster',
                                    calculate_lightgbm_output_shapes,
                                    convert_lightgbm,
                                    parser=lightgbm_parser,
                                    options={
                                        'zipmap': [False, True],
                                        'nocl': [False, True]
                                    })
        update_registered_converter(Booster,
                                    'LightGbmBooster',
                                    calculate_lightgbm_output_shapes,
                                    convert_lightgbm,
                                    parser=lightgbm_parser)

        # zipmap=False keeps the probability output as a plain tensor
        # instead of a list of dictionaries.
        model_onnx = to_onnx(
            model,
            initial_types=[('X', FloatTensorType([None, 2]))],
            options={WrappedLightGbmBoosterClassifier: {
                'zipmap': False
            }},
            target_opset={
                '': TARGET_OPSET,
                'ai.onnx.ml': TARGET_OPSET_ML
            })

        try:
            sess = InferenceSession(model_onnx.SerializeToString())
        except InvalidArgument as e:
            raise AssertionError("Cannot load model\n%r" %
                                 str(model_onnx)) from e
        expected = model.predict(X)
        res = sess.run(None, {'X': X})
        # res[0] holds the labels, res[1] the per-class probabilities.
        assert_almost_equal(expected, res[1])
Example #6
0
def lightgbm_trainer(training_data, label, model_params):
    """Train LightGBM model on training data.

    Args:
        training_data (pandas.DataFrame): Training data including the
            target column.
        label (str): Target column in training data.
        model_params (dict): Training parameters.

    Returns:
        lightgbm.Booster: Trained LightGBM model.
    """
    # BUG FIX: the label column was read via a global ``LABEL`` name instead
    # of the ``label`` argument, so the parameter was silently ignored (and
    # the call failed whenever ``LABEL`` was undefined or different).
    training_data = Dataset(data=training_data.drop(label, axis=1),
                            label=training_data[label])
    return train(train_set=training_data, params=model_params)
Example #7
0
    def test_lightgbm_booster_classifier(self):
        """Convert a random-forest-mode LightGBM booster to ONNX and check
        the conversion produces a non-empty model."""
        from lightgbm import Dataset, train as lgb_train

        features = numpy.array([[0, 1], [1, 1], [2, 0], [1, 2]],
                               dtype=numpy.float32)
        labels = [0, 1, 0, 1]
        train_set = Dataset(features, label=labels)
        booster_params = {
            'boosting_type': 'rf',
            'objective': 'binary',
            'n_estimators': 3,
            'min_child_samples': 1,
            'subsample_freq': 1,
            'bagging_fraction': 0.5,
            'feature_fraction': 0.5,
        }
        model = lgb_train(booster_params, train_set)
        model_onnx = to_onnx(model, features, verbose=0, rewrite_ops=True,
                             target_opset=TARGET_OPSET)
        self.assertNotEmpty(model_onnx)
Example #8
0
    def test_onnxrt_python_lightgbm_categorical_iris_booster3_real(self):
        """Check LightGBM -> ONNX conversion on float32 iris data for the
        sklearn classifier's booster and for a multiclass booster trained
        with categorical features c1/c2."""
        from lightgbm import LGBMClassifier, Dataset, train as lgb_train

        iris = load_iris()
        X, y = iris.data, iris.target
        X = (X * 10).astype(numpy.float32)
        X_train, X_test, y_train, _ = train_test_split(
            X, y, random_state=11)

        # Classic
        gbm = LGBMClassifier()
        gbm.fit(X_train, y_train)
        exp = gbm.predict_proba(X_test)
        # Convert the underlying booster, not the sklearn wrapper.
        onx = to_onnx(gbm.booster_, initial_types=[
            ('X', FloatTensorType([None, X_train.shape[1]]))],
            target_opset=TARGET_OPSET)
        self.assertIn('ZipMap', str(onx))
        oif = OnnxInference(onx)
        got = oif.run({'X': X_test})
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values, decimal=5)

        # categorical_feature=[0, 1]
        train_data = Dataset(
            X_train, label=y_train,
            feature_name=['c1', 'c2', 'c3', 'c4'],
            categorical_feature=['c1', 'c2'])

        params = {
            "boosting_type": "gbdt", "learning_rate": 0.05,
            "n_estimators": 2, "objective": "multiclass",
            "max_bin": 5, "min_child_samples": 100,
            'verbose': -1, 'num_class': 3}

        booster = lgb_train(params, train_data)
        # Multiclass Booster.predict returns an (n, num_class) matrix,
        # so the full probability table is compared below.
        exp = booster.predict(X_test)

        onx = to_onnx(booster, initial_types=[
            ('X', FloatTensorType([None, X_train.shape[1]]))],
            target_opset=TARGET_OPSET)
        self.assertIn('ZipMap', str(onx))
        oif = OnnxInference(onx)
        got = oif.run({'X': X_test})
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values, decimal=5)
Example #9
0
        def objective(params, n_folds=self.n_folds):
            """Hyperopt objective: cross-validate LightGBM with ``params``
            and return ``1 - best CV AUC`` as the loss to minimize.

            ``x_train`` / ``y_train`` come from the enclosing scope.
            """
            self.iteration += 1

            # ``boosting_type`` arrives as a nested dict from the search
            # space; flatten it and pull out its conditional ``subsample``.
            subsample = params['boosting_type'].get('subsample', 1.0)
            params['boosting_type'] = params['boosting_type']['boosting_type']
            params['subsample'] = subsample
            params['verbose'] = -1
            # Hyperopt samples these as floats; LightGBM expects ints.
            for p in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
                params[p] = int(params[p])

            params['histogram_pool_size'] = 1024
            # NOTE: Above parameter is introduced to reduce memory consumption
            self.logger.debug("Parameters: {}".format(params))

            start = timer()
            train_set = Dataset(x_train, label=y_train)

            # Perform n_folds cross validation
            cv_results = cv(params,
                            train_set,
                            num_boost_round=10000,
                            nfold=n_folds,
                            early_stopping_rounds=100,
                            metrics='auc',
                            seed=self.seed)
            run_time = timer() - start

            # Loss must be minimized
            best_score = np.max(cv_results['auc-mean'])
            loss = 1 - best_score

            # Boosting rounds that returned the highest cv score
            n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)

            return {
                'loss': loss,
                'params': params,
                'iteration': self.iteration,
                'estimators': n_estimators,
                'train_time': run_time,
                'status': STATUS_OK
            }
Example #10
0
    def fit_lightgbm(self, x, y, early_stopping_rounds):
        """Fit a LightGBM model, optionally with early stopping.

        Args:
            x: Feature matrix.
            y: Target aligned with ``x``.
            early_stopping_rounds (int | None): When not None, a stratified
                holdout split of ``(x, y)`` is used as the eval set for
                early stopping; otherwise the model is fit without one.
        """
        self.model = LGBMModel(**self.optimized_params)

        if early_stopping_rounds is not None:

            # BUG FIX: the original unpacked ``train_test_split(x, ...)``
            # into (x_valid, y_valid) — with only one array passed, that
            # yields two slices of ``x`` and no labels at all. Split both
            # x and y so the eval set has real targets.
            _, x_valid, _, y_valid = train_test_split(
                x,
                y,
                stratify=y,
                shuffle=True,
                test_size=self.test_size,
                random_state=self.random_state)

            # BUG FIX: LGBMModel.fit expects ``eval_set`` as a list of
            # (X, y) pairs, not a lightgbm.Dataset.
            # NOTE(review): the model is still fit on all of (x, y), so the
            # eval set overlaps the training data — confirm this is intended.
            self.model.fit(x,
                           y,
                           eval_set=[(x_valid, y_valid)],
                           early_stopping_rounds=early_stopping_rounds,
                           verbose=self.verbose)

        else:
            self.model.fit(x, y)
Example #11
0
# seeds=[i for i in range(100)]
seed = None
datafilepath = './data/cleanData.csv'
label_flag = 'categorical'
test_size = 0.2
if __name__ == '__main__':
    x, y = loadXY(datafilepath, label_flag)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=seed,
                                                        shuffle=True)
    # x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=test_size, random_state=seed, shuffle=True)

    # '''
    train_data = Dataset(x_train, label=y_train)
    # val_data = Dataset(x_val, label=y_val)
    test_data = Dataset(x_test, label=y_test)

    param = {
        'num_leaves': 100,
        'num_trees': 300,
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss']
    }
    num_round = 10
    # bstcv = lgb.cv(param, train0_data, num_round, nfold=10)
    bst = lgb.train(param,
                    train_data,
                    num_round,
                    valid_sets=[train_data],
Example #12
0
    def test_onnxrt_python_lightgbm_categorical_iris_dataframe(self):
        """Train a booster on a DataFrame whose columns use the pandas
        'category' dtype and check ONNX conversion against three runtimes:
        the python runtime, onnxruntime, and the 'cast' option variant."""
        iris = load_iris()
        X, y = iris.data, iris.target
        # Integer-valued features so they can act as categories.
        X = (X * 10).astype(numpy.int32)
        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
        # Pad training data with random rows labelled 3/4/5 (same scheme as
        # the non-dataframe variant of this test).
        other_x = numpy.random.randint(0,
                                       high=10,
                                       size=(1500, X_train.shape[1]))
        X_train = numpy.vstack([X_train, other_x]).astype(dtype=numpy.int32)
        y_train = numpy.hstack([
            y_train,
            numpy.zeros(500) + 3,
            numpy.zeros(500) + 4,
            numpy.zeros(500) + 5
        ]).astype(dtype=numpy.int32)
        self.assertEqual(y_train.shape, (X_train.shape[0], ))
        # Collapse the labels to a binary target.
        y_train = y_train % 2

        # All four columns become pandas categoricals, so LightGBM treats
        # them as categorical features automatically.
        df_train = pandas.DataFrame(X_train)
        df_train.columns = ['c1', 'c2', 'c3', 'c4']
        df_train['c1'] = df_train['c1'].astype('category')
        df_train['c2'] = df_train['c2'].astype('category')
        df_train['c3'] = df_train['c3'].astype('category')
        df_train['c4'] = df_train['c4'].astype('category')

        df_test = pandas.DataFrame(X_test)
        df_test.columns = ['c1', 'c2', 'c3', 'c4']
        df_test['c1'] = df_test['c1'].astype('category')
        df_test['c2'] = df_test['c2'].astype('category')
        df_test['c3'] = df_test['c3'].astype('category')
        df_test['c4'] = df_test['c4'].astype('category')

        # categorical_feature=[0, 1]
        train_data = Dataset(df_train, label=y_train)

        params = {
            "boosting_type": "gbdt",
            "learning_rate": 0.05,
            "n_estimators": 2,
            "objective": "binary",
            "max_bin": 5,
            "min_child_samples": 100,
            'verbose': -1,
        }

        booster = lgb_train(params, train_data)
        exp = booster.predict(X_test)

        onx = to_onnx(booster, df_train)
        self.assertIn('ZipMap', str(onx))

        # Python runtime.
        oif = OnnxInference(onx)
        got = oif.run(df_test)
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values[:, 1], decimal=5)

        # onnxruntime requires a compatible IR version.
        onx.ir_version = get_ir_version_from_onnx()
        oif = OnnxInference(onx, runtime='onnxruntime1')
        got = oif.run(df_test)
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values[:, 1], decimal=5)

        # The 'cast' option inserts an explicit Cast node in the graph.
        onx = to_onnx(booster,
                      df_train,
                      options={booster.__class__: {
                          'cast': True
                      }})
        self.assertIn('op_type: "Cast"', str(onx))
        oif = OnnxInference(onx)
        got = oif.run(df_test)
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values[:, 1], decimal=5)
Example #13
0
import torch
import pandas as pd
from lightgbm import Dataset
import numpy as np

# Inference script: load the preprocessed frame and the trained booster,
# predict on the test split and write a submission CSV.
df = torch.load('prep/vanilla0.pt')
dtest = df[df.split == 'test']
ids = dtest.pop('id')
m = torch.load('model_full.pt')
# NOTE(review): ``cols`` is computed but never used below — ``x_te`` is
# built from the model's own feature names instead.
cols = [
    c for c in dtest.columns if c not in ['fold', 'id', 'split', 'ad_periods']
]

# Select/order the test columns exactly as the model expects.
x_te = dtest[m.feature_name()]
te_ds = Dataset(x_te, free_raw_data=False)
te_pred = m.predict(te_ds.data)

sub = pd.read_csv('data/sample_submission.csv')
# Sanity check: submission rows line up with the test ids.
assert np.all(sub.id.values == ids.values)

sub.ad_periods = te_pred
sub.to_csv('pred/nov12.csv', index=False)
# corr.to_csv("ModelCorr.csv")

# # for col in corr.columns:
# #     print(corr[col][corr[col] < 0.9].index)

# selected_columns = ["preds_01", "preds_03", "preds_07", "preds_10"]

# df = preds[selected_columns]

# print(df.head(10))
# df["TARGET"] = df.mean(axis=1)
# df[["TARGET"]].to_csv("Ensemble_LowCorr.csv")

# Convert data to DMatrix
lgb_train = Dataset(X_train, y_train)
lgb_test = Dataset(X_test)

# Define estimator parameters
params = {
    'task': 'train',
    'objective': 'binary',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': 8,
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 0.001,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'scale_pos_weight': 1,
    'metric': 'auc',
Example #15
0
test_size = 0.4
if __name__ == '__main__':
    x, y = loadXY(datafilepath, label_flag)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=seed,
                                                        shuffle=True)
    x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                      y_train,
                                                      test_size=test_size,
                                                      random_state=seed,
                                                      shuffle=True)

    # '''
    train_data = Dataset(x_train, label=y_train)
    val_data = Dataset(x_val, label=y_val)
    test_data = Dataset(x_test, label=y_test)

    param = {
        'num_leaves': 31,
        'num_trees': 100,
        'objective': 'binary',
        'metric': ['auc', 'binary_logloss']
    }
    num_round = 10
    bstcv = lgb.cv(param, train_data, num_round, nfold=10)
    bst = lgb.train(param,
                    train_data,
                    num_round,
                    valid_sets=[val_data],
Example #16
0
def train_lgb(model_output_path,
              valid_limit=500,
              thread_num=2,
              save_rounds=100,
              num_boost_round=2000,
              former_model_path=None,
              max_epochs=100,
              batch_size=100,
              nb_worker=4,
              mini_batch_size=3000,
              limit=2000,
              iteration_per_epoch=100):
    """Incrementally train a LightGBM regressor over chunked gzip data.

    Repeatedly draws 50k-row chunks from ``gzip_sample_generator``, trains
    ``save_rounds`` boosting rounds per chunk (continuing from the previous
    booster via ``init_model``) and saves the model after each epoch.

    Args:
        model_output_path (str): Where to save the booster each epoch.
        former_model_path (str | None): Optional pickled booster to resume from.
        thread_num (int): LightGBM ``num_threads``.
        save_rounds (int): Boosting rounds trained per chunk.
        max_epochs (int): Number of passes over the training file list.
        iteration_per_epoch (int): Chunks consumed per epoch.
        (valid_limit, num_boost_round, batch_size, nb_worker,
        mini_batch_size, limit are kept for interface compatibility but
        are unused here.)
    """
    import pickle

    # PY3 FIX: ``range(...) + range(...)`` is Python 2 only — range objects
    # do not support '+' in Python 3. Build explicit lists instead.
    train_file_numbers = (list(range(1, 540)) + list(range(750, 800)) +
                          list(range(870, 920)) + list(range(970, 1020)) +
                          list(range(1100, 1200)))
    valid_file_numbers = (list(range(400, 440)) + list(range(700, 750)) +
                          list(range(845, 870)) + list(range(945, 970)) +
                          list(range(1045, 1100)))
    # Unused below; kept to document the train/valid/test file split.
    test_file_numbers = (list(range(540, 640)) + list(range(800, 845)) +
                         list(range(920, 945)) + list(range(1020, 1045)) +
                         list(range(1200, 1214)))
    DATA_ROOT = '/media/user/Data0/hjw/datas/Quant_Datas_v4.0//gzip_datas_norm'
    train_filepath_list = [
        os.path.join(DATA_ROOT, '%s_trans_norm.gz' % fn)
        for fn in train_file_numbers
    ]
    valid_filepath_list = [
        os.path.join(DATA_ROOT, '%s_trans_norm.gz' % fn)
        for fn in valid_file_numbers
    ]
    valid_generator = gzip_sample_generator(valid_filepath_list,
                                            batch_size=50000,
                                            total_limit=100000,
                                            per_file_limit=10000)
    params = {
        'objective': 'regression_l2',
        'num_leaves': 128,
        'boosting': 'gbdt',
        'feature_fraction': 0.9,
        'bagging_fraction': 0.7,
        'bagging_freq': 100,
        'verbose': 0,
        'is_unbalance': False,
        'metric': 'l1,l2,huber',
        'num_threads': thread_num
    }
    if former_model_path:
        # FIX: close the file handle (the original leaked it) and use the
        # Python 3 ``pickle`` module instead of Python 2's ``cPickle``.
        with open(former_model_path, 'rb') as model_file:
            former_model = pickle.load(model_file)
    else:
        former_model = None
    tmp_model = former_model
    # Fix one validation chunk for the whole run so metrics are comparable.
    valid_x, valid_y = next(valid_generator)
    valid_set = Dataset(valid_x, valid_y, free_raw_data=False)
    gbm = None
    eval_res = {}
    for epoch in range(max_epochs):  # PY3 FIX: xrange -> range
        train_generator = gzip_sample_generator(train_filepath_list,
                                                batch_size=50000,
                                                total_limit=1000000,
                                                per_file_limit=100000)
        for iter_n in range(iteration_per_epoch):
            train_x, train_y = next(train_generator)
            tmp_dataset = Dataset(train_x, train_y, free_raw_data=False)
            # Continue training from the previous booster (init_model);
            # keep_training_booster allows further incremental updates.
            gbm = lgb.train(params,
                            tmp_dataset,
                            num_boost_round=save_rounds,
                            early_stopping_rounds=30,
                            keep_training_booster=True,
                            learning_rates=lambda iter_num: max(
                                1 * (0.98**iter_num /
                                     (iteration_per_epoch * 0.05)), 0.008),
                            valid_sets=[valid_set],
                            init_model=tmp_model,
                            evals_result=eval_res)
            tmp_model = gbm
        print('saving model')  # PY3 FIX: print statement -> function
        gbm.save_model(model_output_path)
Example #17
0
def _train(params: dict,
           x_train: np.ndarray,
           y_train: np.ndarray,
           loss_func,
           *args,
           x_valid: np.ndarray = None,
           y_valid: np.ndarray = None,
           loss_func_grad: Callable[[float, float], float] = None,
           loss_func_eval: Callable[[float, float], float] = None,
           use_custom_dataset: bool = False,
           func_train=None,
           **kwargs):
    """
    Train a LightGBM model via ``func_train``, supporting both built-in
    (string) and custom (callable) losses and metrics.

    Params::
        loss_func: custom loss or string
            string:
                Follows the official LightGBM objective names.
                https://lightgbm.readthedocs.io/en/latest/Parameters.html#core-parameters
                binary, multiclass, regression_l1, huber, ...
            custom loss: a callable.
        loss_func_grad:
            If a function producing grad and hess values exists, set it here.
            If None and loss_func is a custom callable, grad/hess are
            computed automatically via scipy.
            If None and loss_func is a string, the built-in implementation
            is used as-is.
        loss_func_eval:
            eval function or string.
            If a function:
                it is wrapped with func_embed using calc_type="mean".
            If a string:
                follows the official LightGBM metric names.
                https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric-parameters
            If None:
                if loss_func is a string:
                    loss_func itself is used as the metric.
                if loss_func is a function:
                    loss_func is wrapped with func_embed and used.
    """
    logger.info("START")
    # Build the training dataset; the custom dataset variant carries the
    # label for the custom objective/eval callbacks.
    dataset = None
    if use_custom_dataset:
        dataset = KkLgbDataset(x_train)
        dataset.set_culstom_label(y_train)
    else:
        dataset = Dataset(x_train, label=y_train)
    # Normalize x_valid / y_valid into parallel lists of validation folds.
    if not (isinstance(x_valid, list) or isinstance(x_valid, tuple)):
        x_valid = [] if x_valid is None else [x_valid]
        y_valid = [] if y_valid is None else [y_valid]
    # The training set itself is always evaluated first (named "train").
    list_dataset_valid = [dataset]
    for _x_valid, _y_valid in zip(x_valid, y_valid):
        if use_custom_dataset:
            list_dataset_valid.append(KkLgbDataset(_x_valid))
            list_dataset_valid[-1].set_culstom_label(_y_valid)
        else:
            list_dataset_valid.append(Dataset(_x_valid, label=_y_valid))
    # Custom objective: derive grad/hess numerically when not provided.
    fobj = None
    if loss_func_grad is None and (not isinstance(loss_func, str)):
        loss_func_grad = partial(calc_grad_hess, loss_func=loss_func)
    if loss_func_grad is not None:
        fobj = lambda x, y: lgb_custom_objective(
            x, y, loss_func_grad, is_lgbdataset=True)
    # Custom eval metric, following the precedence documented above.
    feval = None
    if loss_func_eval is not None and (not isinstance(loss_func_eval, str)):
        feval = lambda x, y: lgb_custom_eval(x,
                                             y,
                                             func_embed(loss_func_eval,
                                                        calc_type="mean"),
                                             "myloss",
                                             is_higher_better=False,
                                             is_lgbdataset=True)
    elif loss_func_eval is None:
        if isinstance(loss_func, str):
            loss_func_eval = loss_func
        else:
            feval = lambda x, y: lgb_custom_eval(x,
                                                 y,
                                                 func_embed(loss_func,
                                                            calc_type="mean"),
                                                 "myloss",
                                                 is_higher_better=False,
                                                 is_lgbdataset=True)
    # Built-in losses/metrics go through params instead of fobj/feval.
    if fobj is None and isinstance(loss_func, str):
        params["objective"] = loss_func
    if feval is None and isinstance(loss_func_eval, str):
        params["metric"] = loss_func_eval
    evals_result = {}  # per-iteration history of metric values
    logger.info(
        f"params: {params}, dataset: {dataset}, fobj: {fobj}, feval: {feval}")
    obj = func_train(
        params,
        dataset,
        valid_sets=list_dataset_valid,
        valid_names=["train"] +
        ["valid" + str(i) for i in range(len(list_dataset_valid) - 1)],
        fobj=fobj,
        feval=feval,
        evals_result=evals_result,
        **kwargs)
    logger.info("END")
    return obj
Example #18
0
def kfold_lightgbm(df,
                   num_folds,
                   stratified=False,
                   epochs=1,
                   corr_save=False,
                   importance_save=False):
    """Train LightGBM with K-fold CV, averaging predictions over epochs.

    Splits ``df`` into train rows (non-null TARGET) and test rows (null
    TARGET), runs ``num_folds``-fold cross-validation ``epochs`` times,
    averages out-of-fold and test predictions across epochs, writes a
    submission CSV and optionally correlation / feature-importance CSVs.

    :param df: combined train+test frame; test rows have TARGET == NaN.
        Must contain 'SK_ID_CURR', 'TARGET' and an 'Unnamed: 0' column.
    :param num_folds: number of CV folds per epoch.
    :param stratified: use StratifiedKFold instead of plain KFold.
    :param epochs: number of independent CV repetitions to average over.
    :param corr_save: also write a TARGET-correlation CSV.
    :param importance_save: also write the averaged importance CSV.
    :return: None — results are written to disk and printed.
    """
    df = df.drop('Unnamed: 0', axis=1)
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    # Optionally dump the correlation of every feature with TARGET.
    if corr_save == True:
        target_corr = train_df.corr()['TARGET'].sort_values()
        corr_df = pd.DataFrame()
        corr_df['feature'] = target_corr.index
        corr_df['corr'] = target_corr.values
        corr_df = corr_df[corr_df['feature'] != 'feature']
        corr_df.to_csv('../output/correlation.csv')
        del target_corr, corr_df

    # Delete the combined frame from memory; only the splits are needed.
    del df

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))
    # Initialise predictions and importance dataframes and epoch weights
    sub_df = test_df[['SK_ID_CURR']].copy()
    sub_df['TARGET'] = 0
    ep_ave = 1 / epochs  # each epoch contributes equally to the average
    epv_preds = np.zeros(train_df.shape[0])
    epv_df = train_df[['SK_ID_CURR']].copy()
    epv_df['TARGET'] = 0
    feature_importance_df = pd.DataFrame()

    for n in range(epochs):

        print('Epoch number {} of {} starting'.format(n + 1, epochs))
        # Fixed seed for a single epoch (reproducible); unseeded folds
        # when averaging several epochs so the splits differ per epoch.
        if epochs == 1:
            if stratified:
                folds = StratifiedKFold(n_splits=num_folds,
                                        shuffle=True,
                                        random_state=1001)
            else:
                folds = KFold(n_splits=num_folds,
                              shuffle=True,
                              random_state=1001)
        else:
            if stratified:
                folds = StratifiedKFold(n_splits=num_folds, shuffle=True)
            else:
                folds = KFold(n_splits=num_folds, shuffle=True)
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        # Use every column except identifiers / target as a feature.
        feats = [
            f for f in train_df.columns if f not in
            ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
        ]

        for n_fold, (train_idx, valid_idx) in enumerate(
                folds.split(train_df[feats], train_df['TARGET'])):
            dtrain = Dataset(data=train_df[feats].iloc[train_idx],
                             label=train_df['TARGET'].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
            dvalid = Dataset(data=train_df[feats].iloc[valid_idx],
                             label=train_df['TARGET'].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)

            # LightGBM parameters found by Bayesian optimization
            params = {
                'objective': 'binary',
                'boosting_type': 'gbdt',  # 'goss'
                'nthread': 4,
                'learning_rate': 0.02,  # 02,
                'num_leaves': 20,
                'colsample_bytree': 0.9497036,
                'subsample': 0.8715623,
                'subsample_freq': 1,
                'max_depth': 8,
                'reg_alpha': 0.041545473,
                'reg_lambda': 0.0735294,
                'min_split_gain': 0.0222415,
                'min_child_weight': 60,  #39.3259775
                'seed': 0,
                'verbose': -1,
                'metric': 'auc',
            }

            # Large num_boost_round; early stopping on the validation AUC
            # determines the effective number of trees.
            clf = train(params=params,
                        train_set=dtrain,
                        num_boost_round=10000,
                        valid_sets=[dtrain, dvalid],
                        early_stopping_rounds=200,
                        verbose_eval=100)

            # params = {
            #     'objective': 'binary',
            #     'boosting_type': 'gbdt', # 'goss'
            #     'nthread': 4,
            #     'learning_rate': 0.1,  # 02,
            #     'num_leaves': 35,
            #     'colsample_bytree': 0.2,
            #     'subsample': 1,
            #     'subsample_freq': 1,
            #     'max_depth': -1,
            #     'reg_alpha': 0.0,
            #     'reg_lambda': 100.0,
            #     'min_split_gain': 0.5,
            #     'min_child_weight': 60, #39.3259775
            #     'seed': 0,
            #     'verbose': -1,
            #     'metric': 'auc',
            #     'scale_pos_weight': 1,
            #     'min_child_samples': 50,
            #     'subsample_for_bin': 300
            # }

            # clf = train(
            #     params=params,
            #     train_set=dtrain,
            #     num_boost_round=5000,
            #     valid_sets=[dtrain, dvalid],
            #     early_stopping_rounds= 100,
            #     verbose_eval=100
            # )

            # Out-of-fold predictions for this fold; test predictions are
            # averaged over folds.
            oof_preds[valid_idx] = clf.predict(dvalid.data)
            sub_preds += clf.predict(test_df[feats]) / folds.n_splits

            # Record per-fold gain importances for later aggregation.
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type='gain')
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1, roc_auc_score(dvalid.label,
                                             oof_preds[valid_idx])))
            del clf, dtrain, dvalid

        print('Full AUC score %.6f' %
              roc_auc_score(train_df['TARGET'], oof_preds))
        # Fold-averaged predictions are epoch-weighted into the totals.
        sub_df['TARGET'] += ep_ave * sub_preds
        epv_preds += ep_ave * oof_preds
        # epv_df['TARGET'] += ep_ave*oof_preds
        print('Epoch number {} of {} ended'.format(n + 1, epochs))

    # NOTE(review): `submission_file_name` and `display_importances` are
    # module-level names defined outside this function — confirm they exist.
    sub_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    # epv_df[['SK_ID_CURR', 'TARGET']].to_csv('../data/lightgbm-fast-train.csv', index= False)
    print('Full AUC score over all epochs %.6f' %
          roc_auc_score(train_df['TARGET'], epv_preds))
    display_importances(feature_importance_df)

    # Save feature importance df as csv
    if importance_save == True:
        feature_importance_df = feature_importance_df.groupby('feature').agg(
            'mean').drop('fold', axis=1).sort_values('importance')
        feature_importance_df.to_csv('../output/importance_3.9.csv')
Example #19
0
#tr_df = tr_df[cols]
#te_df = te_df[cols]
# svr_43_int is not fit at all .. ignored
#cols = [c for c in tr_df.columns if c.startswith('lgb')]
#tr_df = tr_df[cols]
#te_df = te_df[cols]
#tr_df.drop(cols_to_drop, axis=1, inplace=True)
#te_df.drop(cols_to_drop, axis=1, inplace=True)

# Hold out 10% of the training frame for validation (fixed seed for
# reproducibility).
x_tr, x_va, y_tr, y_va = train_test_split(tr_df,
                                          y,
                                          test_size=0.1,
                                          shuffle=True,
                                          random_state=42)

# free_raw_data=False keeps the raw pandas data accessible on the
# Dataset objects after construction.
tr_ds = Dataset(x_tr, label=y_tr, free_raw_data=False)
va_ds = Dataset(x_va, label=y_va, free_raw_data=False)

valid_sets = [tr_ds, va_ds]

#hpsearch_lgb(x_tr, y_tr, x_va, y_va)
params = {
    'learning_rate': 0.001,
    'max_depth': -1,
    'boosting': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'is_training_metric': True,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'lambda_l2': 0.2,
Example #20
0
              min_child_weight=0.001,
              min_split_gain=0.0,
              n_estimators=5,
              n_jobs=1,
              num_leaves=31,
              objective='mse',
              random_state=0,
              reg_alpha=0.0,
              reg_lambda=0.0,
              silent=True,
              subsample=1.,
              subsample_for_bin=200000,
              subsample_freq=0)

# Load a small regression dataset and wrap it in a LightGBM Dataset.
X, y = load_boston(return_X_y=True)
dataset = Dataset(data=X, label=y, free_raw_data=False)

# Two disjoint index ranges: the first 100 rows for training, the next
# 100 for prediction.
train_index = np.arange(0, 100)
test_index = np.arange(100, 200)

# Build the same training subset two ways — directly from sliced arrays
# and via Dataset.subset() — to verify they are equivalent.
train_set0 = Dataset(X[train_index], y[train_index],
                     free_raw_data=False).construct()
train_set1 = dataset.subset(used_indices=train_index).construct()

assert_array_equal(train_set0.data, train_set1.data)
assert_array_equal(train_set0.label, train_set1.label)

# Training on either representation should behave identically.
# NOTE(review): `params` is defined earlier in the file, outside this
# snippet — confirm its contents before relying on this comparison.
booster0 = train(params=params, train_set=train_set0, num_boost_round=5)
booster1 = train(params=params, train_set=train_set1, num_boost_round=5)

pred0 = booster0.predict(X[test_index])
Example #21
0
def lgbm_feat_selector(train_x,
                       train_y,
                       valid_x,
                       valid_y,
                       params=PARAMS,
                       drop_size=1,
                       keep_size=20):
    """Iteratively prune the weakest features with LightGBM CV.

    Each round: run ``lgb.cv`` on the surviving features, record the best
    AUC and its iteration count, train a model at that iteration count,
    then drop the ``drop_size`` features with the lowest importance.
    Stops once ``keep_size`` or fewer features remain.

    :param train_x: training features (pandas DataFrame).
    :param train_y: training labels (pandas Series).
    :param valid_x: validation features (pandas DataFrame).
    :param valid_y: validation labels (pandas Series).
    :param params: LightGBM parameter dict (AUC metric expected by cv).
    :param drop_size: number of least-important features dropped per round.
    :param keep_size: stop when this many (or fewer) features remain.
    :return:list[dict({"round":select_round,
                    "features":left_features_this_round,
                    "train_round":best_iteration_of_model,
                    "auc":auc-score})]
    """
    res = []
    feat_list = list(train_x.columns)
    round_ = 0
    while len(feat_list) > keep_size:
        print("-" * 25 + "selector round {}".format(round_) + "-" * 25)
        sub_col_train_x = train_x[feat_list]
        sub_col_valid_x = valid_x[feat_list]
        data_train = Dataset(sub_col_train_x.values, train_y.values)
        data_valid = Dataset(sub_col_valid_x.values, valid_y.values)

        print("-" * 25 + "training" + "-" * 25)
        cv_res = lgb.cv(params=params,
                        train_set=data_train,
                        nfold=4,
                        stratified=True,
                        shuffle=True,
                        early_stopping_rounds=10,
                        num_boost_round=1000)

        # Best CV AUC and the number of boosting rounds that produced it.
        train_score = max(cv_res["auc-mean"])
        iteration = len(cv_res["auc-mean"])
        print("-" * 25 + "saving result" + "-" * 25)
        # BUG FIX: snapshot the surviving features with list(); the
        # original stored a reference that no longer matched this round
        # once feat_list was rebound (harmless today, fragile tomorrow).
        res.append({
            "round": round_,
            "features": list(feat_list),
            "train_round": iteration,
            "auc": train_score
        })
        # Fit at the CV-selected iteration count, then drop tail features.
        model = lgb.train(params=params,
                          train_set=data_train,
                          num_boost_round=iteration,
                          valid_sets=[data_train, data_valid])
        del data_train, data_valid
        print("-" * 25 + "dropping tail {} features".format(drop_size) +
              "-" * 25)
        feature_importance = pd.Series(
            model.feature_importance(),
            index=sub_col_train_x.columns).sort_values()
        tail_features = set(feature_importance.head(drop_size).index)
        # BUG FIX: preserve the original column order. The previous
        # ``list(set(feat_list) - set(tail_features))`` produced a
        # nondeterministic feature order (set iteration order), making
        # successive runs irreproducible.
        feat_list = [f for f in feat_list if f not in tail_features]

        round_ += 1

    return res
# Run the converted ONNX model with onnxruntime and print its predictions.
sess = rt.InferenceSession(onx.SerializeToString())
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name],
                    {input_name: X_test.astype(numpy.float32)})[0]
print(pred_onx)

###############################################
# With Dataset
# ++++++++++++
#
# Huge datasets cannot be handled with the scikit-learn API.
# LightGBM's native Dataset must be used. Let's see how to convert the
# trained model.

dtrain = Dataset(X_train, label=y_train)

param = {'objective': 'multiclass', 'num_class': 3}
bst = train_lgbm(param, dtrain, 10)

# Declare the model's input signature (4 float features, variable batch)
# and convert the trained booster to ONNX.
initial_type = [('float_input', FloatTensorType([None, 4]))]
onx = convert_lightgbm(bst, initial_types=initial_type)

# Score the ONNX graph the same way as above to sanity-check parity.
sess = rt.InferenceSession(onx.SerializeToString())
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name],
                    {input_name: X_test.astype(numpy.float32)})[0]
print(pred_onx)

##################################
Example #23
0
# Collect per-model out-of-fold prediction files and stack them
# column-wise into a single train / test feature frame.
tr_dfs = []
for fn in baseroot.glob("*_tr.csv"):
    tr_dfs.append(pd.read_csv(fn))

tr_df = pd.concat(tr_dfs, axis=1)

# BUG FIX: te_dfs was used without ever being initialised, raising
# NameError on the first iteration; mirror the tr_dfs initialisation.
te_dfs = []
for fn in baseroot.glob("*_te.csv"):
    te_dfs.append(pd.read_csv(fn))

te_df = pd.concat(te_dfs, axis=1)

# svr_43_int is not fit at all .. ignored
tr_df.drop('svr_43_int', axis=1, inplace=True)
te_df.drop('svr_43_int', axis=1, inplace=True)

# free_raw_data=False keeps the pandas frames accessible on the Datasets.
tr_ds = Dataset(tr_df, label=y, free_raw_data=False)
te_ds = Dataset(te_df, free_raw_data=False)

#hpsearch_lgb(x_tr, y_tr, x_va, y_va)
#params = {
#    'learning_rate': 0.02,
#    'max_depth': 3,
#    'boosting': 'gbdt',
#    'objective': 'regression',
#    'metric': 'rmse',
#    'is_training_metric': True,
#    'feature_fraction': 0.9,
#    'bagging_fraction': 0.7,
#    'lambda_l2': 0.7,
#    'bagging_freq': 5,
#    'seed':42
Example #24
0
    def __call__(self, trial):
        """Optuna objective: run hyper-parameter trials over k-fold data.

        For each fold, loads LETOR data as libsvm files, builds LightGBM
        Datasets, runs one trial of the per-fold Optuna study, reloads
        the saved model from disk and evaluates validation nDCG@k.
        Returns the k-fold average validation score.
        """
        # Sample this trial's model parameters from the search space.
        model_id = self.model_parameter.model_id
        para_dict = self.model_parameter.grid_search(trial)
        self.setup_eval(data_dict=self.data_dict, eval_dict=self.eval_dict)

        k_flod_average = 0.
        for i in range(self.fold_num):  # evaluation over k-fold data
            fold_k = i + 1
            study = self.k_studies[i]

            train_data, test_data, vali_data = self.load_data(
                self.eval_dict, self.data_dict, fold_k)

            data_id = self.data_dict['data_id']

            train_presort, validation_presort, test_presort = self.data_dict['train_presort'], self.data_dict['validation_presort'],\
                                                                self.data_dict['test_presort']

            file_train, file_vali, file_test = self.determine_files(
                data_dict=self.data_dict, fold_k=fold_k)

            # Point self.save_model_dir at this fold's output directory.
            self.update_save_model_dir(data_dict=self.data_dict, fold_k=fold_k)

            # Prepare the training dataset: features, labels and query
            # group sizes (LightGBM ranking needs per-query group sizes).
            file_train_data, file_train_group = load_letor_data_as_libsvm_data(
                file_train,
                split_type=SPLIT_TYPE.Train,
                data_dict=self.data_dict,
                eval_dict=self.eval_dict,
                presort=train_presort)
            x_train, y_train = load_svmlight_file(file_train_data)
            group_train = np.loadtxt(file_train_group)
            train_set = Dataset(data=x_train, label=y_train, group=group_train)

            # Test split is loaded but not evaluated here — presumably
            # consumed elsewhere; NOTE(review): x_test/y_test/group_test
            # are unused in this method, confirm whether that is intended.
            file_test_data, file_test_group = load_letor_data_as_libsvm_data(
                file_test,
                split_type=SPLIT_TYPE.Test,
                data_dict=self.data_dict,
                eval_dict=self.eval_dict,
                presort=test_presort)
            x_test, y_test = load_svmlight_file(file_test_data)
            group_test = np.loadtxt(file_test_group)

            # Validation dataset used for early stopping / scoring.
            file_vali_data, file_vali_group = load_letor_data_as_libsvm_data(
                file_vali,
                split_type=SPLIT_TYPE.Validation,
                data_dict=self.data_dict,
                eval_dict=self.eval_dict,
                presort=validation_presort)
            x_valid, y_valid = load_svmlight_file(file_vali_data)
            group_valid = np.loadtxt(file_vali_group)
            valid_set = Dataset(data=x_valid, label=y_valid, group=group_valid)

            # Choose between the sklearn-style LGBMRanker wrapper and the
            # bare lightgbm module (used via lgbm.train downstream).
            if para_dict['custom_dict']['custom'] and para_dict['custom_dict'][
                    'use_LGBMRanker']:
                lgbm_ranker = lgbm.LGBMRanker()
            else:
                lgbm_ranker = lgbm

            # One Optuna trial per fold per outer call.
            study.optimize(TreeLTRObjective(model_id=model_id, data_id = data_id, x_train=x_train , y_train=y_train, group_train=group_train, train_set=train_set, x_valid=x_valid , y_valid=y_valid, group_valid=group_valid, valid_set=valid_set, \
                ranker=lgbm_ranker, fold_k=fold_k, para_dict=para_dict, data_dict=self.data_dict, eval_dict=self.eval_dict, save_model_dir=self.save_model_dir),
                           n_trials=1) # ??? the meaning of n_trials
            # Reload the model the trial saved to disk; Yahoo LTR data
            # uses a flat file name, other datasets are fold-suffixed.
            if data_id in YAHOO_LTR:
                model_file = self.save_model_dir + 'model.txt'
            else:
                model_file = self.save_model_dir + '_'.join(
                    ['fold', str(fold_k), 'model']) + '.txt'

            lgbm_ranker = lgbm.Booster(model_file=model_file)

            # Validation nDCG@k for this fold, accumulated for averaging.
            vali_eval_tmp = ndcg_at_k(ranker=lgbm_ranker,
                                      test_data=vali_data,
                                      k=self.vali_k,
                                      label_type=vali_data.label_type,
                                      gpu=self.gpu,
                                      device=self.device)
            vali_eval_v = vali_eval_tmp.data.numpy()
            k_flod_average += vali_eval_v

        # Average the validation score over the k folds.
        k_flod_average /= self.fold_num

        return k_flod_average
Example #25
0
    def run(self,
            fold_k,
            file_train,
            file_vali,
            file_test,
            data_dict=None,
            eval_dict=None,
            save_model_dir=None):
        """
        Run a lambdaMART (LightGBM ranking) model on the given datasets.

        Trains with or without a validation set (per eval_dict's
        'do_validation'), optionally with a custom objective and/or the
        sklearn-style LGBMRanker wrapper (per self.custom_dict), saves
        the model to disk, and predicts on the test split.

        :param fold_k: fold index, used in the saved model's file name.
        :param file_train: path to the training LETOR file.
        :param file_vali: path to the validation LETOR file.
        :param file_test: path to the test LETOR file.
        :param data_dict: dataset configuration (must hold 'data_id').
        :param eval_dict: evaluation configuration ('do_validation',
            'epochs' — used both as round budget and as early-stopping
            patience depending on the branch).
        :param save_model_dir: directory the trained model is written to.
        :return: (y_test, group_test, y_pred) — test labels, per-query
            group sizes, and fold-wise predictions.
        """
        data_id, do_validation = data_dict['data_id'], eval_dict[
            'do_validation']

        # Prepare the training dataset: features, labels and per-query
        # group sizes (required for LightGBM's ranking objectives).
        file_train_data, file_train_group = \
            load_letor_data_as_libsvm_data(file_train, train=True, data_dict=data_dict, eval_dict=eval_dict)
        x_train, y_train = load_svmlight_file(file_train_data)
        group_train = np.loadtxt(file_train_group)
        train_set = Dataset(data=x_train, label=y_train, group=group_train)

        # Test split: only raw arrays are needed for prediction/return.
        file_test_data, file_test_group = \
            load_letor_data_as_libsvm_data(file_test, data_dict=data_dict, eval_dict=eval_dict)
        x_test, y_test = load_svmlight_file(file_test_data)
        group_test = np.loadtxt(file_test_group)
        # test_set = Dataset(data=x_test, label=y_test, group=group_test)

        if do_validation:  # prepare validation dataset if needed
            file_vali_data, file_vali_group = \
                load_letor_data_as_libsvm_data(file_vali, data_dict=data_dict, eval_dict=eval_dict)
            x_valid, y_valid = load_svmlight_file(file_vali_data)
            group_valid = np.loadtxt(file_vali_group)
            valid_set = Dataset(data=x_valid, label=y_valid, group=group_valid)

            # Branch 1: custom objective via the sklearn-style wrapper.
            if self.custom_dict['custom'] and self.custom_dict[
                    'use_LGBMRanker']:
                lgbm_ranker = lgbm.LGBMRanker()
                lgbm_ranker.set_params(**self.lightgbm_para_dict)
                '''
                objective : string, callable or None, optional (default=None)
                Specify the learning task and the corresponding learning objective or
                a custom objective function to be used (see note below).
                Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
                '''
                custom_obj_dict = dict(objective=self.get_custom_obj(
                    custom_obj_id=self.custom_dict['custom_obj_id']))
                lgbm_ranker.set_params(**custom_obj_dict)
                '''
                eval_set (list or None, optional (default=None)) – A list of (X, y) tuple pairs to use as validation sets.
                cf. https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html
                '''
                # NOTE(review): eval_dict['epochs'] is passed as the
                # early-stopping patience here but as num_boost_round in
                # the no-validation branches — confirm this is intended.
                lgbm_ranker.fit(x_train,
                                y_train,
                                group=group_train,
                                eval_set=[(x_valid, y_valid)],
                                eval_group=[group_valid],
                                eval_at=[5],
                                early_stopping_rounds=eval_dict['epochs'],
                                verbose=10)

            elif self.custom_dict['custom']:
                # Branch 2: custom objective via lgbm.train's fobj hook.
                lgbm_ranker = lgbm.train(
                    params=self.lightgbm_para_dict,
                    verbose_eval=10,
                    train_set=train_set,
                    valid_sets=[valid_set],
                    early_stopping_rounds=eval_dict['epochs'],
                    fobj=self.get_custom_obj(
                        custom_obj_id=self.custom_dict['custom_obj_id'],
                        fobj=True))
            else:  # Branch 3: stock objective; trained booster as ranker.
                lgbm_ranker = lgbm.train(
                    params=self.lightgbm_para_dict,
                    verbose_eval=10,
                    train_set=train_set,
                    valid_sets=[valid_set],
                    early_stopping_rounds=eval_dict['epochs'])
        else:  # without validation
            if self.custom_dict['custom'] and self.custom_dict[
                    'use_LGBMRanker']:
                # Custom objective + sklearn wrapper, no eval_set.
                lgbm_ranker = lgbm.LGBMRanker()
                lgbm_ranker.set_params(**self.lightgbm_para_dict)

                custom_obj_dict = dict(objective=self.get_custom_obj(
                    custom_obj_id=self.custom_dict['custom_obj_id']))
                lgbm_ranker.set_params(**custom_obj_dict)

                lgbm_ranker.fit(x_train,
                                y_train,
                                group=group_train,
                                verbose=10,
                                eval_at=[5],
                                early_stopping_rounds=eval_dict['epochs'])

            elif self.custom_dict['custom']:  # use the argument of fobj
                lgbm_ranker = lgbm.train(
                    params=self.lightgbm_para_dict,
                    verbose_eval=10,
                    train_set=train_set,
                    num_boost_round=eval_dict['epochs'],
                    fobj=self.get_custom_obj(
                        custom_obj_id=self.custom_dict['custom_obj_id'],
                        fobj=True))

            else:  # trained booster as ranker
                lgbm_ranker = lgbm.train(params=self.lightgbm_para_dict,
                                         verbose_eval=10,
                                         train_set=train_set,
                                         num_boost_round=eval_dict['epochs'])

        # Yahoo LTR data uses a flat model file name; other datasets get
        # a fold-suffixed name.
        if data_id in YAHOO_LTR:
            model_file = save_model_dir + 'model.txt'
        else:
            model_file = save_model_dir + '_'.join(
                ['fold', str(fold_k), 'model']) + '.txt'

        # The sklearn wrapper stores the underlying booster on .booster_.
        if self.custom_dict['custom'] and self.custom_dict['use_LGBMRanker']:
            lgbm_ranker.booster_.save_model(model_file)
        else:
            lgbm_ranker.save_model(model_file)

        y_pred = lgbm_ranker.predict(x_test)  # fold-wise prediction

        return y_test, group_test, y_pred
Example #26
0
        score = roc_auc_score(y_train[val_idx], y_pred)
        #print('\t[XGBoost ] best iteration: \033[92m%i\033[0m' % xgb_model.best_iteration)
        print('\t[XGBoost ] oof ROC-AUC is: \033[92m%.4f\033[0m' % score)

        y_train_xgb.append(xgb_model.predict(xgb_alltrain))
        y_pred_xgb.append(xgb_model.predict(xgb_alltest))
        xgb_scores.append(score)

        xgb_feature_importances.append(
            xgb_model.get_fscore())


        # Then LightGBM
        lgbm_train = Dataset(
            data=X_train[trn_idx, :],
            label=y_train[trn_idx],
            feature_name=predictors,
            categorical_feature=categorical_features)

        lgbm_val = Dataset(
            data=X_train[val_idx, :],
            label=y_train[val_idx],
            feature_name=predictors,
            categorical_feature=categorical_features)

        print('\t[LightGBM] training...')
        lgbm_model = train_lgb(
            lgbm_params,
            lgbm_train,
            num_boost_round=15000,
            early_stopping_rounds=100,
Example #27
0
def main(verbose=True, force=False, test=False):
    import datetime

    IGNORE_FEATURES = []

    os.makedirs(ANALYSIS_PATH, exist_ok=True)
    os.makedirs(TRAIN_PATH, exist_ok=True)

    raw_df_name = os.path.join(TRAIN_PATH, 'data_raw.pyt')
    scaled_df_name = os.path.join(TRAIN_PATH, 'data_scaled.pyt')

    st_time = datetime.datetime.now()
    print('Loading the data...')
    if not os.path.isfile(raw_df_name) or force:
        df = read()
        df.set_index(ID, inplace=True)
        print('\tWriting \033[92m%s\033[0m' % (raw_df_name))
        with open(raw_df_name, 'wb') as pyt:
            joblib.dump(df, pyt)
    else:
        print('\tLoading data from \033[92m%s\033[0m' % (raw_df_name))
        df = joblib.load(raw_df_name)
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    # log-scale the predictors & predictand
    bins_target = np.logspace(np.log10(df[PREDICTAND].min()),
                              np.log10(df[PREDICTAND].max()), 20)
    df[PREDICTAND] = np.log1p(df[PREDICTAND])
    predictors = [c for c in df.columns if c not in ['isTrain', PREDICTAND]]

    # Counts of 0s or non-0s is very different between test and train sets !
    pstep = 5
    percs = np.arange(pstep, 100, pstep)

    calculated_cols = []
    columns_then = df.columns

    # Add the info relative to the leak as it affects the training / test processes
    leak_file = os.path.join(TRAIN_PATH, "df_leaked_%s.pyt" % N_LAGS)
    if os.path.isfile(leak_file):
        df_leaked = joblib.load(leak_file)
    else:
        df_ = df[predictors].reset_index(level=0)
        df_[PREDICTAND] = df[PREDICTAND]
        df_ = df_[['ID', PREDICTAND] + predictors]

        df_leaked = get_all_leak(df_, COLUMNS_LEAK, N_LAGS)
        leak_cols = [c for c in df_leaked if c.startswith('leak')]
        df_leaked = df_leaked[leak_cols]
        with open(leak_file, 'wb') as pyt:
            joblib.dump(df_leaked, pyt)

    df_leaked.index = df.index
    leak_cols = df_leaked.columns
    df['nb_potential_leaks'] = df_leaked.notnull().sum(axis=1)
    df['leak_mean'] = df_leaked.mean(axis=1).fillna(0)
    df['leak_median'] = df_leaked.median(axis=1).fillna(0)
    df['leak_max'] = df_leaked.max(axis=1).fillna(0)
    df['leak_min'] = df_leaked.min(axis=1).fillna(0)

    # Clustering on sorted dataframe (row by row) to detect similar entries
    df_ = df[predictors].copy()
    for row in range(len(df_)):
        arr = df_.iloc[row, :]
        df_.iloc[row, :] = np.sort(arr)

    # Hierarchical clustering seems to have a predictive power
    #distance = "euclidean"
    n_clusters = 12
    for distance in [
            "hamming", "jaccard", "sokalmichener", "sokalsneath", "euclidean"
    ]:
        st_time = datetime.datetime.now()
        print(
            'Finding \033[92m%i clusters\033[0m with \033[92m%s distance\033[0m'
            % (n_clusters, distance))
        dist_fname = os.path.join(TRAIN_PATH, "%s_dists.pyt" % distance)
        if os.path.isfile(dist_fname):
            dist = joblib.load(dist_fname)
            print('-- Pairwise distance loading took %i seconds.' %
                  (datetime.datetime.now() - st_time).total_seconds())
        else:
            if distance == "euclidean":
                dist = ss.distance.pdist(df_[predictors].values, distance)
            else:
                dist = ss.distance.pdist(df[predictors].values.astype(bool),
                                         distance)
            print('-- Pairwise distance computation took %i seconds.' %
                  (datetime.datetime.now() - st_time).total_seconds())
            with open(dist_fname, 'wb') as pyt:
                joblib.dump(dist, pyt)

        ward_linkage = hierarchy.ward(dist)
        tree = hierarchy.to_tree(ward_linkage)
        cluster_colname = 'cluster_%s' % distance
        df[cluster_colname] = hierarchy.fcluster(ward_linkage,
                                                 _get_height_at(
                                                     tree, n_clusters),
                                                 criterion="distance")
        print('-- Took %i seconds.' %
              (datetime.datetime.now() - st_time).total_seconds())
        CATEGORICAL_FEATURES.append(cluster_colname)
        sns.catplot(x=cluster_colname,
                    y=PREDICTAND,
                    data=df.groupby('isTrain').get_group(True),
                    kind="violin")
        plt.savefig(os.path.join(ANALYSIS_PATH, '%s.png' % cluster_colname))
        plt.close()

    # Keep euclidean clusters as 'cluster_colname' for K-fold grouping
    cluster_colname = "cluster_euclidean"

    print('Mojena stopping rule')

    clusters_for_plot = np.arange(1, 101)
    heights = np.array(
        [_get_height_at(tree, n_clusters) for n_clusters in clusters_for_plot])

    plt.figure()
    plt.plot(clusters_for_plot, heights, 'ko--')
    plt.grid()
    plt.xlabel('Number of clusters')
    plt.ylabel('Dendrogram height')
    plt.savefig(os.path.join(ANALYSIS_PATH, '%s_mojena.png' % cluster_colname))
    plt.close()

    print('Dendrogram for Euclidean distance')
    dn = hierarchy.dendrogram(ward_linkage,
                              no_labels=True,
                              above_threshold_color='k')
    plt.ylabel('height')
    plt.xlabel('samples')
    plt.savefig(
        os.path.join(ANALYSIS_PATH, '%s_dendrogram.png' % cluster_colname))
    plt.close()

    df[predictors] = np.log1p(df[predictors])

    st_time = datetime.datetime.now()

    def func_agg(row):
        r = row[row > 0]
        return np.append([
            (row > 0).sum(),
            r.mean(),
            (r**2).mean(),
            r.std(),
            r.max(),
            r.min(),
            r.skew(),
            r.kurtosis(),
        ], r.quantile(q=percs / 100))

    print('Computing non-zero aggregates...')
    df[[
        'count_nonzero',
        'mean_nonzero',
        'meansq_nonzero',
        'std_nonzero',
        'max_nonzero',
        'min_nonzero',
        'skew_nonzero',
        'kurt_nonzero',
    ] + ['p%i' % p for p in percs]] = df[predictors].apply(
        func_agg, axis=1, result_type="expand").fillna(0)
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    st_time = datetime.datetime.now()

    def func_agg(row):
        r = row[row > 0].diff().abs()
        return np.append([
            r.mean(),
            (r**2).mean(),
            r.std(),
            r.max(),
            r.min(),
            r.skew(),
            r.kurtosis(),
        ], r.quantile(q=percs / 100))

    print('Computing diff aggregates...')
    df[[
        'diff_mean_nonzero',
        'diff_meansq_nonzero',
        'diff_std_nonzero',
        'diff_max_nonzero',
        'diff_min_nonzero',
        'diff_skew_nonzero',
        'diff_kurtosis_nonzero',
    ] + ['diff_p%i' % p for p in percs]] = df[predictors].apply(
        func_agg, axis=1, result_type="expand").fillna(0)
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    # add occurrences (will it help ?)
    print('Computing distributions...')

    def func_epd(row):
        epd = np.histogram(np.exp(row[row > 0].values) - 1,
                           bins=bins_target,
                           normed=True)[0]
        return epd / np.sum(epd)

    # One feature per target bin: the normalised histogram mass of the row's
    # back-transformed nonzero values, named after the left bin edge.
    df[['epd_%i' % b for b in bins_target[:-1]
        ]] = df[predictors].apply(func_epd, axis=1,
                                  result_type="expand").fillna(0)

    # Every column created since `columns_then` counts as an engineered feature.
    columns_now = df.columns
    calculated_cols.extend([c for c in columns_now if c not in columns_then])

    ## Scale the features
    #st_time = datetime.datetime.now()
    #print('Scaling (log) the features')
    #for col in df.columns:
    #if col not in [PREDICTAND, ID, 'isTrain']:
    #df[col] = np.log(df[col] + 1)
    #print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())

    # Drop the raw predictor columns; only engineered features still present
    # in df remain in the predictor list from here on.
    df.drop(predictors, axis=1, inplace=True)
    predictors = [c for c in calculated_cols if c in df.columns]

    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    #with open(os.path.join(TRAIN_PATH, 'predictors_%s.pyt' % datetime.datetime.now().strftime('%Y%m%d%H')),
    #'wb') as pyt:
    #joblib.dump(df, pyt)

    st_time = datetime.datetime.now()
    print('Transforming the features')
    cols_to_remove = []
    cols_to_add = []
    # Categorical predictors are one-hot encoded into new columns; numerical
    # ones are quantile-transformed in place.
    for col in predictors:
        if col in CATEGORICAL_FEATURES:
            print('\tFeature %s is categorical -> OneHot' % col)
            transf = OneHotEncoder()
            transf.fit(df[col].values.reshape(-1, 1))

            res = transf.transform(df[col].values.reshape(-1, 1))
            # One new column per category level (1-based suffix).
            for i, ax in enumerate(res.transpose(), 1):
                onehot = '{}_{}'.format(col, i)
                df[onehot] = ax.toarray().squeeze()
                cols_to_add.append(onehot)
            # The raw categorical column is superseded by its dummies.
            cols_to_remove.append(col)

        else:
            print('\tFeature %s is numerical -> QuantileTransformer' % col)
            try:
                df[col] = QuantileTransformer().fit_transform(
                    df[col].values.reshape(-1, 1))
            # BUGFIX: was a bare `except:`, which also swallows
            # KeyboardInterrupt/SystemExit; catch Exception only so the
            # best-effort behaviour is kept but the run stays interruptible.
            except Exception:
                print("\033[91mQuantileTransformer failed on %s\033[0m" % col)
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    #df.drop(cols_to_remove, axis=1, inplace=True)
    # Raw categorical columns stay in df but are excluded from modelling via
    # IGNORE_FEATURES; their one-hot columns take their place as predictors.
    IGNORE_FEATURES.extend(cols_to_remove)
    for col in cols_to_remove:
        predictors.remove(col)
        calculated_cols.remove(col)
    predictors.extend(cols_to_add)

    # T-SNE embedding of the engineered features (plot saved to tsne.png).
    st_time = datetime.datetime.now()
    print('Running T-SNE...')
    fname = os.path.join(ANALYSIS_PATH, "tsne", "tsne.png")
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    tsne_comps = tsne(
        df[predictors + [PREDICTAND, 'isTrain']],
        fname,
        nb=len(df),
        perplexity=40,
        title=None,
        visu_tsne=None,
        cmap='viridis',
        predictand=PREDICTAND,
        binary=False,
        #do_not_plot=[c for c in predictors if not c in calculated_cols + ['isTrain', PREDICTAND]],
    )

    # Persist the embedding so later runs can reuse it.
    with open(
            os.path.join(
                TRAIN_PATH, "tsne_%s.pyt" %
                (datetime.datetime.now().strftime('%Y%m%d%H%M'))),
            'wb') as pyt:
        joblib.dump(tsne_comps, pyt)

    # Append each embedding axis as a feature column (tsne1, tsne2, ...).
    try:
        for i, tsne_ax in enumerate(tsne_comps.transpose(), 1):
            df['tsne%i' % i] = tsne_ax
            calculated_cols.append('tsne%i' % i)
    # BUGFIX: was a bare `except:` (also catches KeyboardInterrupt/SystemExit)
    # followed by a redundant `pass`; narrowed to Exception.
    except Exception:
        print('\033[91mWARNING ! could not add t-sne values\033[0m')
        print_exc()
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    # Univariate analysis/plots of the engineered features.
    analyze(df, calculated_cols, step='preprocessed')
    #analyze_bivariate(df, cfgs, step='preprocessed')

    # Split the combined frame back into train / test partitions.
    df_train = select_sample(df, "train")
    df_test = select_sample(df, "test")

    fname = os.path.join(TRAIN_PATH, 'df_train.pyt')
    print('Saving df_train to \033[92m%s\033[0m' % fname)
    with open(fname, 'wb') as pyt:
        joblib.dump(df_train, pyt)

    # Final predictor list: everything except ignored columns and the target.
    predictors = [c for c in df_train.columns if c not in IGNORE_FEATURES]
    predictors.remove(PREDICTAND)

    X_train = df_train[predictors].values
    y_train = df_train[PREDICTAND].values
    X_test = df_test[predictors].values
    test_rows = df_test.index

    # "Leaked" target: per test row, the median across the leak columns and
    # the number of non-null leak columns (0 -> no leak for that row).
    leaked_target = df_leaked.loc[test_rows, leak_cols].median(axis=1)
    leaked_count = df_leaked.loc[test_rows, leak_cols].notnull().sum(axis=1)
    leak_inds = np.where(leaked_count > 0)[0]

    #reg, _ = train_and_validate(
    #df_train,
    #predictors,
    #PREDICTAND,
    #wdir=TRAIN_PATH,
    #kind='regression',
    #MLP_options={'hidden_layer_sizes': (100, 100)},
    #GradientBoosting_options={'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 600, 'random_state': 42},
    #XGBoost_options={'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 600, 'random_state': 42},
    #LightGBM_options={'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 600, 'random_state': 42, 'verbose': -1, 'num_leaves': 124},
    #RandomForest_options={'max_depth': None, 'n_estimators': 900, 'max_features': 1, 'min_samples_leaf': 3, 'min_samples_split': 10, 'criterion': 'mse', 'random_state': 42},
    #)

    #os.makedirs(OUTPUT_PATH, exist_ok=True)
    #for name, regdict in reg.items():
    #model = regdict['model']
    #fname = os.path.join(OUTPUT_PATH, '%s.csv' % name)
    #y_pred = model.predict(X_test)
    #y_pred = np.expm1(y_pred)

    ##y_pred[leak_inds] = leaked_target.values[leak_inds]

    #df_result = pd.DataFrame({ID: df_test.index,
    #PREDICTAND: y_pred})
    #df_result.to_csv(fname, index=False)
    #print('Wrote prediction file: \033[94;1m%s\033[0m' % fname)

    def save_model(model, name, y_pred=None, replace_leak=False):
        """Pickle a fitted model (if given) and write its test prediction CSV.

        Parameters
        ----------
        model : fitted estimator or None
            If not None, pickled to TRAIN_PATH/<name>.pyt and, when `y_pred`
            is None, used to predict on X_test.
        name : str
            Base name for the .pyt and .csv files.
        y_pred : array-like or None
            Precomputed predictions in log1p space; computed from `model`
            when omitted.
        replace_leak : bool
            If True, overwrite predictions at `leak_inds` with the leaked
            target values and write to <name>_leak.csv instead.
        """
        if model is not None:
            fname = os.path.join(TRAIN_PATH, "%s.pyt" % name)
            os.makedirs(TRAIN_PATH, exist_ok=True)
            with open(fname, "wb") as pyt:
                joblib.dump({'model': model}, pyt)
            print('\tSaved model to \033[92m%s\033[0m' % fname)

        # BUGFIX: ensure the output directory exists before writing the CSV
        # (the makedirs call was lost with the commented-out block above).
        os.makedirs(OUTPUT_PATH, exist_ok=True)
        fname = os.path.join(OUTPUT_PATH, "%s.csv" % name)
        if y_pred is None:
            y_pred = model.predict(X_test)
        # Predictions are in log1p space; map back to the original scale.
        y_pred = np.expm1(y_pred)

        if replace_leak:
            y_pred[leak_inds] = leaked_target.values[leak_inds]
            fname = fname.replace('.csv', '_leak.csv')

        df_result = pd.DataFrame({ID: df_test.index, PREDICTAND: y_pred})
        df_result.to_csv(fname, index=False)
        print('\tSaved prediction to \033[92m%s\033[0m' % fname)

    from lightgbm import Dataset
    from lightgbm import train as train_lgb

    # Grouped CV: rows sharing a `cluster_colname` value never appear in
    # both the train and validation split of a fold.
    nfolds = 10
    #folds = KFold(n_splits=nfolds, shuffle=True, random_state=21)
    folds = GroupKFold(n_splits=nfolds)

    # Fold-averaged test predictions and in-sample fitted values.
    y_pred_xgb = np.zeros(len(X_test))
    y_train_xgb = np.zeros(len(X_train))
    y_pred_lgbm = np.zeros(len(X_test))
    y_train_lgbm = np.zeros(len(X_train))

    # NOTE(review): num_boost_round / early_stopping_rounds are also passed
    # explicitly to train_lgb below, so the entries here are redundant.
    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mse'},
        'num_leaves': 124,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'verbose': -1,
        'num_boost_round': 15000,
        'early_stopping_rounds': 100,
        'nthread': 26
    }

    def _rmse_func(predictions, ground_truth):
        """Root-mean-squared error of `predictions` against `ground_truth`."""
        mse = mean_squared_error(predictions, ground_truth)
        return np.sqrt(mse)

    def rmse(predictions, train_data):
        """Custom LightGBM feval: returns (name, value, is_higher_better)."""
        y_true = train_data.get_label()
        score = _rmse_func(predictions, y_true)
        return 'RMSE', score, False

    # Cross-validated training: per fold, fit XGBoost and LightGBM on the
    # train split and accumulate 1/nfolds of each model's predictions.
    for ifold, (trn_idx, val_idx) in enumerate(
            folds.split(X_train, y_train, df_train[cluster_colname].values)):

        print("Fold nb. %i" % ifold)

        lgb_train = Dataset(data=X_train[trn_idx, :],
                            label=y_train[trn_idx],
                            feature_name=predictors)

        lgb_val = Dataset(data=X_train[val_idx, :],
                          label=y_train[val_idx],
                          feature_name=predictors)

        # -- XGBoost: fit on the fold's train split, report out-of-fold RMSE.
        reg = XGBRegressor(n_estimators=600,
                           max_depth=5,
                           learning_rate=0.05,
                           random_state=42)
        reg.fit(df_train[predictors].iloc[trn_idx, :].values,
                df_train[[PREDICTAND]].iloc[trn_idx, :].values.squeeze())
        pred_fold = reg.predict(df_train[predictors].iloc[val_idx].values)

        print('\t[XGBoost] oof RMSE is: \033[92m%.4f\033[0m' % np.sqrt(
            mean_squared_error(
                df_train[[PREDICTAND]].iloc[val_idx].values.squeeze(),
                pred_fold)))
        # Average this fold's predictions into the ensemble outputs.
        y_train_xgb += reg.predict(X_train) / nfolds
        y_pred_xgb += reg.predict(X_test) / nfolds

        # -- LightGBM: early stopping on the fold's validation split, using
        # the custom RMSE feval defined above.
        # NOTE(review): verbose_eval / early_stopping_rounds keyword args
        # were removed in lightgbm 4.x (callbacks replace them) -- this
        # assumes an older lightgbm; confirm the pinned version.
        reg = train_lgb(lgb_params,
                        lgb_train,
                        num_boost_round=15000,
                        early_stopping_rounds=100,
                        verbose_eval=100,
                        valid_sets=[lgb_train, lgb_val],
                        feval=rmse)

        y_pred = reg.predict(X_train[val_idx, :],
                             num_iteration=reg.best_iteration)
        score = np.sqrt(mean_squared_error(y_train[val_idx], y_pred))

        print('\t[LGBM] Best iteration: \033[92m%i\033[0m' %
              reg.best_iteration)
        print('\t[LGBM] oof RMSE is: \033[92m%.4f\033[0m' % score)

        y_train_lgbm += reg.predict(X_train,
                                    num_iteration=reg.best_iteration) / nfolds
        y_pred_lgbm += reg.predict(X_test,
                                   num_iteration=reg.best_iteration) / nfolds

    # The replace_leak variant writes to LightGBM_folded_leak.csv, so it
    # does not clash with the plain LightGBM_folded file written below.
    save_model(None, "LightGBM_folded", y_pred_lgbm, replace_leak=True)

    save_model(None, "XGBoost_folded", y_pred_xgb)
    save_model(None, "LightGBM_folded", y_pred_lgbm)
    # Simple 50/50 blend of the two fold-averaged models.
    save_model(None, "XGB-LGBM_folded", 0.5 * (y_pred_xgb + y_pred_lgbm))

    # Collects the fitted GridSearchCV objects by model name.
    gsDict = {}

    ## AdaBoost
    #print('\033[1mGridSearch - AdaBoostRegressor\033[0m')
    #reg_base = DecisionTreeRegressor()
    #reg = AdaBoostRegressor(reg_base, random_state=42)
    #ada_param_grid = {
    #"base_estimator__criterion": ["mse", "mae"],
    #"base_estimator__splitter": ["best", "random"],
    #"algorithm": ["SAMME", "SAMME.R"],
    #"n_estimators": [2, 10, 50],
    #"learning_rate":  [0.001, 0.01, 0.1]}

    #gsAdaBoost = GridSearchCV(reg, param_grid=ada_param_grid,
    #cv=nfolds, scoring="neg_mean_squared_error",
    #n_jobs=20, verbose=1)
    #gsAdaBoost.fit(X_train, y_train)

    #ada_best = gsAdaBoost.best_estimator_
    #print('\tBest score: \033[92m%.4f\033[0m' % gsAdaBoost.best_score_)
    #ada_best.fit(X_train, y_train)
    #save_model(ada_best, "AdaBoost")
    #gsDict["AdaBoost"] = gsAdaBoost

    ## ExtraTrees
    #print('\033[1mGridSearch - ExtraTreesRegressor\033[0m')
    #reg = ExtraTreesRegressor()

    ## Search grid for optimal parameters
    #ex_param_grid = {
    #"max_depth": [None],
    #"max_features": [1, 3, 10],
    #"min_samples_split": [2, 3, 10],
    #"min_samples_leaf": [1, 3, 10],
    #"bootstrap": [False],
    #"n_estimators": [100, 300, 900],
    #"criterion": ["mse", "mae"]}

    #gsExtraTrees = GridSearchCV(reg, param_grid=ex_param_grid,
    #cv=nfolds, scoring="neg_mean_squared_error",
    #n_jobs=20, verbose=1)
    #gsExtraTrees.fit(X_train, y_train)
    #etc_best = gsExtraTrees.best_estimator_
    #print('\tBest score: \033[92m%.4f\033[0m' % gsExtraTrees.best_score_)
    #etc_best.fit(X_train, y_train)
    #save_model(etc_best, "ExtraTrees")
    #gsDict["ExtraTrees"] = gsExtraTrees

    ## RF Parameters
    #print('\033[1mGridSearch - RandomForestRegressor\033[0m')
    #reg = RandomForestRegressor()

    ## Search grid for optimal parameters
    #rf_param_grid = {
    #"max_depth": [None, 4, 5],
    #"max_features": [1, 3, 10],
    #"min_samples_split": [2, 3, 10],
    #"min_samples_leaf": [1, 3, 10],
    #"bootstrap": [False],
    #"n_estimators": [100, 300, 900],
    #"criterion": ["mse", "mae"]}

    #gsRandomForest = GridSearchCV(
    #reg, param_grid=rf_param_grid,
    #cv=nfolds, scoring="neg_mean_squared_error",
    #n_jobs=36, verbose=1)
    #gsRandomForest.fit(X_train, y_train)
    #rfc_best = gsRandomForest.best_estimator_
    #print('\tBest score: \033[92m%.4f\033[0m' % gsRandomForest.best_score_)
    #for key in rf_param_grid.keys():
    #print('\t\t%s: \033[92m%s\033[0m' % (key, getattr(rfc_best, key, '-')))

    #rfc_best.fit(X_train, y_train)
    #save_model(rfc_best, "RandomForest")
    #gsDict["RandomForest"] = gsRandomForest

    ## Gradient boosting
    #print('\033[1mGridSearch - GradientBoostingRegressor\033[0m')
    #reg = GradientBoostingRegressor()
    #gb_param_grid = {
    #'loss' : ["ls", "lad", "huber"],
    #'n_estimators' : [600, 300, 900],
    #'learning_rate': [0.1, 0.05, 0.01],
    #'max_depth': [5, 4, 6],
    #'min_samples_leaf': [10, 50],
    #'max_features': ["sqrt", "auto"]
    #}

    #gsGradientBoosting = GridSearchCV(
    #reg, param_grid=gb_param_grid,
    #cv=nfolds, scoring="neg_mean_squared_error",
    #n_jobs=36, verbose=1)
    #gsGradientBoosting.fit(X_train, y_train)
    #gbc_best = gsGradientBoosting.best_estimator_
    #print('\tBest score: \033[92m%.4f\033[0m' % gsGradientBoosting.best_score_)
    #for key in gb_param_grid.keys():
    #print('\t\t%s: \033[92m%s\033[0m' % (key, getattr(gbc_best, key, '-')))

    #gbc_best.fit(X_train, y_train)
    #save_model(gbc_best, "GradientBoosting")
    #gsDict["GradientBoosting"] = gsGradientBoosting

    # Grid search over XGBoost hyperparameters (MSE scoring, nfolds CV).
    print('\033[1mGridSearch - XGBRegressor\033[0m')
    reg = XGBRegressor()
    # NOTE(review): `missing=None` is only accepted by older xgboost
    # versions; recent releases require a float (default np.nan) -- confirm
    # against the pinned xgboost version.
    xgb_param_grid = {
        'n_estimators': [600, 300, 900],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [5, 4, 6],
        'missing': [None, 0.],
        'booster': ["gbtree", "gblinear", "dart"],
    }

    gsXGBoost = GridSearchCV(reg,
                             param_grid=xgb_param_grid,
                             cv=nfolds,
                             scoring="neg_mean_squared_error",
                             n_jobs=36,
                             verbose=1)
    gsXGBoost.fit(X_train, y_train)
    gbc_best = gsXGBoost.best_estimator_
    print('\tBest score: \033[92m%.4f\033[0m' % gsXGBoost.best_score_)
    for key in xgb_param_grid.keys():
        print('\t\t%s: \033[92m%s\033[0m' % (key, getattr(gbc_best, key, '-')))

    # NOTE(review): GridSearchCV(refit=True, the default) already refits
    # best_estimator_ on the full data, so this fit is redundant (harmless).
    gbc_best.fit(X_train, y_train)
    save_model(gbc_best, "XGBoost")
    gsDict["XGBoost"] = gsXGBoost
Example #28
0
from Estimators import LGBM
from Utils import Profiler
import pandas as pd
from IPython.display import display
import lightgbm as lgb
import Gather_Data

# Start the script-level wall-clock profiler.
profile = Profiler()
profile.Start()

# Gather Data
#train_X, test_X, train_Y = dataset.Load('AllData_v3')
train_X, test_X, train_Y = Gather_Data.AllData_v4()

# Convert data to lightgbm Dataset objects.
# BUGFIX: only `import lightgbm as lgb` is in scope -- the unqualified name
# `Dataset` was never imported and raised NameError; qualify it with `lgb.`.
lgb_train = lgb.Dataset(train_X, train_Y)
lgb_test = lgb.Dataset(test_X)

# Define estimator parameters
params = {
    'task': 'train',
    'objective': 'binary',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': 8,
    'min_data_in_leaf': 20,
    'min_sum_hessian_in_leaf': 0.001,
    'lambda_l1': 0,
    'lambda_l2': 0,
    'scale_pos_weight': 1,
    'metric': 'auc',