Example #1
0
def get_importances_from_model(X,
                               y,
                               features=None,
                               verbose=50,
                               early_stopping_rounds=200):
    """Train a LightGBM binary classifier on a train/validation split.

    Args:
        X: Feature data. When ``features`` is None it must expose
            ``columns.tolist()`` (e.g. a pandas DataFrame).
        y: Target values aligned with ``X``.
        features: Feature names passed as ``feature_name`` to both the
            training and the validation dataset. Defaults to the columns
            of ``X``.
        verbose: Forwarded to ``train`` as ``verbose_eval``.
        early_stopping_rounds: Forwarded to ``train``.

    Returns:
        The booster returned by ``train``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.03,
        'metric': 'auc',
        'num_iterations': 10000,
        'colsample_bytree': 0.5,
        'subsample': 0.8,
        'reg_alpha': 0.3,
        'reg_lambda': 0.3,
        'max_depth': 8,
    }

    # Identity check: ``features == None`` is evaluated element-wise when a
    # numpy array or pandas Index is passed, which makes the ``if`` raise.
    if features is None:
        features = X.columns.tolist()

    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=2017)

    lgb_train = Dataset(data=train_X, label=train_y, feature_name=features)
    lgb_val = Dataset(data=val_X, label=val_y, feature_name=features)

    # Both splits are monitored; they are reported as "train" and "validation".
    return train(params=lgb_params,
                 train_set=lgb_train,
                 valid_sets=[lgb_train, lgb_val],
                 valid_names=["train", "validation"],
                 verbose_eval=verbose,
                 early_stopping_rounds=early_stopping_rounds)
Example #2
0
    def _evaluate(self, scores: np.ndarray, clases: lgb.Dataset) -> Tuple[str, int, bool]:
        """Evaluate ``scores`` with the gain function at the configured cut-off.

        The labels and weights are read from ``clases`` and handed, together
        with ``self.prob_corte``, to ``self._evaluar_funcion_ganancia``.

        Returns:
            ``(name, value, True)`` where name and value come from the gain
            function and the trailing ``True`` is a constant.
        """
        nombre, valor = self._evaluar_funcion_ganancia(
            scores,
            clases.get_label(),
            clases.get_weight(),
            self.prob_corte,
        )
        return nombre, valor, True
Example #3
0
def fit_lgb(x_tr, y_tr, x_va, y_va, cat_feats, args):
    """Fit a LightGBM regressor and report clipped RMSE scores.

    Args:
        x_tr, y_tr: Training features and target. The target is clipped from
            above at ``args.clip_target`` unless that value is ``-1``.
        x_va, y_va: Validation features and target; only used when
            ``args.mode`` is neither ``'full'`` nor ``'fold'``.
        cat_feats: Passed to ``lgb.train`` as ``categorical_feature``.
        args: Settings object; reads ``clip_target``, ``mode``, ``num_leaves``,
            ``verbose_eval``, ``num_boost_round`` and ``lr_decay``.

    Returns:
        ``(model, train_rmse, valid_rmse)``; ``valid_rmse`` is ``0.`` when no
        validation set is used.
    """
    from lightgbm import Dataset

    if args.clip_target != -1:
        y_tr = y_tr.clip(upper=args.clip_target)

    # Raw data is kept on the datasets because it is read back for scoring.
    tr_ds = Dataset(x_tr, label=y_tr, free_raw_data=False)

    # Validation (and with it early stopping) is skipped in 'full'/'fold' mode.
    use_validation = args.mode not in ['full', 'fold']
    va_ds = None
    if use_validation:
        va_ds = Dataset(x_va, label=y_va, free_raw_data=False)

    params = dict(
        learning_rate=0.02,
        max_depth=-1,
        boosting='gbdt',
        objective='regression',
        metric='rmse',
        is_training_metric=True,
        num_leaves=args.num_leaves,
        feature_fraction=0.9,
        bagging_fraction=0.7,
        lambda_l2=0.7,
        bagging_freq=5,
        seed=42,
    )

    train_kwargs = dict(
        train_set=tr_ds,
        categorical_feature=cat_feats,
        verbose_eval=args.verbose_eval,
        num_boost_round=args.num_boost_round,
    )
    if use_validation:
        train_kwargs['early_stopping_rounds'] = 200
        train_kwargs['valid_sets'] = [tr_ds, va_ds]
    if args.lr_decay:
        decay_callback = lgb.reset_parameter(
            learning_rate=learning_rate_010_decay_power_0995)
        train_kwargs['callbacks'] = [decay_callback]

    m = lgb.train(params, **train_kwargs)

    def _clipped_rmse(dataset):
        # Predictions are limited to [0, 361] before computing the error.
        clipped = np.clip(m.predict(dataset.data), 0, 361)
        return np.sqrt(mean_squared_error(clipped, dataset.label))

    tr_score = _clipped_rmse(tr_ds)
    va_score = _clipped_rmse(va_ds) if use_validation else 0.

    return m, tr_score, va_score
Example #4
0
    def Dist_Objective(predt: np.ndarray, data: lgb.Dataset):
        """Custom objective giving gradient and hessian per Gaussian parameter.

        The derivatives of the negative log-likelihood are obtained with
        ``auto_grad``, multiplied by the observation weights and passed
        through ``stabilize_derivative``.

        Args:
            predt: Raw predictions. They are reshaped column-major to
                ``(-1, Gaussian.n_dist_param())``; column 0 feeds the
                location response function and column 1 the scale one.
            data: Dataset supplying the labels and, optionally, weights.

        Returns:
            ``(grad, hess)`` as arrays flattened in column-major order.
        """
        target = torch.tensor(data.get_label())

        # Each row of the reshaped array holds the raw predictions (leaf
        # weights, before the response function) of one observation.
        predt = predt.reshape(-1, Gaussian.n_dist_param(), order="F")
        preds_location = Gaussian.param_dict()["location"](predt[:, 0])
        preds_location = torch.tensor(preds_location, requires_grad=True)

        preds_scale = Gaussian.param_dict()["scale"](predt[:, 1])
        preds_scale = torch.tensor(preds_scale, requires_grad=True)

        # ``== None`` on a weight array compares element-wise and makes the
        # ``if`` raise, so test identity. Unit weights are used when none
        # are specified.
        weights = data.get_weight()
        if weights is None:
            weights = np.ones_like(target, dtype=float)

        # Initialize gradient and hessian matrices.
        grad = np.zeros(shape=(len(target), Gaussian.n_dist_param()))
        hess = np.zeros(shape=(len(target), Gaussian.n_dist_param()))

        # Metric that is differentiated automatically.
        dGaussian = Normal(preds_location, preds_scale)
        autograd_metric = -dGaussian.log_prob(target).nansum()

        # Column 0 is the location, column 1 the scale.
        for col, parameter in enumerate((preds_location, preds_scale)):
            grad[:, col] = stabilize_derivative(
                auto_grad(metric=autograd_metric, parameter=parameter, n=1) *
                weights, Gaussian.stabilize)
            hess[:, col] = stabilize_derivative(
                auto_grad(metric=autograd_metric, parameter=parameter, n=2) *
                weights, Gaussian.stabilize)

        # Flatten back to the column-major layout of the incoming predictions.
        return grad.ravel(order="F"), hess.ravel(order="F")
Example #5
0
def _make_validation_labels_purchase_only(valid_ds: lgb.Dataset):
    """Zero out every label of ``valid_ds`` that is not the purchase label.

    The dataset is constructed first, two counts are logged (non-purchase
    labels that are also not ``_NOTHING_LABEL``, and all non-purchase
    labels), and the modified labels are written back onto the dataset.
    """
    valid_ds.construct()
    labels = np.array(valid_ds.get_label())

    non_purchase = labels != _PURCHASE_LABEL
    not_nothing = labels != _NOTHING_LABEL
    non_purchase_interaction = non_purchase & not_nothing

    logging.info(
        f"Number of non-purchase interactions in valid: {non_purchase_interaction.sum()}"
    )
    logging.info(
        f"Number of total non-purchases in valid: {non_purchase.sum()}")

    # Everything except purchases collapses to label 0.
    labels[non_purchase] = 0.0
    valid_ds.set_label(labels)
Example #6
0
    def Dist_Objective(predt: np.ndarray, data: lgb.Dataset):
        """Custom objective giving gradient and hessian per Gaussian parameter.

        Gradients and hessians are taken from the ``Gaussian`` helper
        methods (``gradient_location``, ``hessian_location``,
        ``gradient_scale``, ``hessian_scale``).

        Args:
            predt: Raw predictions. They are reshaped column-major to
                ``(-1, Gaussian.n_dist_param())``; column 0 feeds the
                location response function and column 1 the scale one.
            data: Dataset supplying the labels and, optionally, weights.

        Returns:
            ``(grad, hess)`` as arrays flattened in column-major order.
        """
        target = data.get_label()

        # Each row of the reshaped array holds the raw predictions (leaf
        # weights, before the response function) of one observation.
        predt = predt.reshape(-1, Gaussian.n_dist_param(), order="F")
        preds_location = Gaussian.param_dict()["location"](predt[:, 0])
        preds_scale = Gaussian.param_dict()["scale"](predt[:, 1])

        # ``== None`` on a weight array compares element-wise and makes the
        # ``if`` raise, so test identity. Unit weights are used when none
        # are specified.
        weights = data.get_weight()
        if weights is None:
            weights = np.ones_like(target, dtype=float)

        # Initialize gradient and hessian matrices.
        grad = np.zeros(shape=(len(target), Gaussian.n_dist_param()))
        hess = np.zeros(shape=(len(target), Gaussian.n_dist_param()))

        # Location
        grad[:, 0] = Gaussian.gradient_location(y=target,
                                                location=preds_location,
                                                scale=preds_scale,
                                                weights=weights)
        hess[:, 0] = Gaussian.hessian_location(scale=preds_scale,
                                               weights=weights)

        # Scale
        grad[:, 1] = Gaussian.gradient_scale(y=target,
                                             location=preds_location,
                                             scale=preds_scale,
                                             weights=weights)
        hess[:, 1] = Gaussian.hessian_scale(scale=preds_scale,
                                            weights=weights)

        # Flatten back to the column-major layout of the incoming predictions.
        return grad.ravel(order="F"), hess.ravel(order="F")
Example #7
0
def lightgbm_trainer(training_data, label, model_params):
    """Train LightGBM model on training data.

    Args:
        training_data: Table holding the features and the target column. It
            must support ``drop(label, axis=1)`` and column selection by
            name (presumably a pandas DataFrame; confirm against callers).
        label (str): Target column in training data.
        model_params (dict): Training parameters.

    Returns:
        lightgbm.Booster: Trained LightGBM model.
    """
    # The target is looked up with the ``label`` argument, the same name that
    # is dropped from the features, instead of a module-level constant. The
    # column removed and the column used as target therefore always agree.
    features = training_data.drop(label, axis=1)
    dataset = Dataset(data=features, label=training_data[label])
    return train(train_set=dataset, params=model_params)
Example #8
0
def lgb_compatible_f1_score(y_hat: np.ndarray,
                            data: lgb.Dataset) -> Tuple[str, float, bool]:
    """F1 score in the ``(name, value, is_higher_better)`` metric format.

    Labels are cast to int and the predictions are rounded to integers
    first, because scikit's f1 doesn't work with probabilities.
    """
    truth = data.get_label().astype(int)
    hard_predictions = np.round(y_hat).astype(int)
    score = f1_score(truth, hard_predictions)
    return "f1_score", score, True
Example #9
0
def get_importances_from_model(X, y, features=None):
    """Train a LightGBM binary classifier on all of ``X``/``y``.

    Args:
        X: Feature data. When ``features`` is None it must expose
            ``columns.tolist()`` (e.g. a pandas DataFrame).
        y: Target values aligned with ``X``.
        features: Feature names passed as ``feature_name`` to the dataset.
            Defaults to the columns of ``X``.

    Returns:
        The booster returned by ``train``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'learning_rate': 0.02,
        'metric': 'auc',
        #    'num_leaves': 34,
        'colsample_bytree': 0.75,
        'subsample': 0.75,
        'n_estimators': 1500,
        #    'max_depth': 8,
        #    'reg_alpha': 0.041545473,
        #    'reg_lambda': 0.0735294,
        #    'min_split_gain': 0.0735294,
        #    'min_child_weight': 0.0735294,
        #    'silent': False,
    }

    # Identity check: ``features == None`` is evaluated element-wise when a
    # numpy array or pandas Index is passed, which makes the ``if`` raise.
    if features is None:
        features = X.columns.tolist()

    lgb_train = Dataset(data=X, label=y, feature_name=features)

    return train(params=lgb_params,
                 train_set=lgb_train,
                 verbose_eval=50,
                 num_boost_round=1500)
Example #10
0
def lgb_custom_eval(y_pred: np.ndarray,
                    data: lgb.Dataset,
                    func_loss,
                    func_name: str,
                    is_higher_better: bool,
                    is_lgbdataset: bool = True):
    """
    Common wrapper for LightGBM customized evaluation metrics.
    Params::
        y_pred:
            Predictions. In the multi-class case the array has length
            n_sample * n_class and is laid out as
            array([sample 0 / label 0, ..., sample N / label 0,
                   sample 0 / label 1, ...]).
        data:
            The value that was set as train_set.
        func_loss:
            Callable taking (y_pred, y_true); its result is the metric value.
        func_name:
            Name reported for the metric.
        is_higher_better:
            Returned unchanged as the third element.
        is_lgbdataset:
            When false, the two leading arguments arrive swapped: the first
            one holds the true values and ``data`` holds the predictions.
    Returns::
        (func_name, value, is_higher_better)
    """
    if is_lgbdataset:
        y_true = data.label
        # Datasets exposing "ndf_label" provide their own label conversion.
        if is_callable(data, "ndf_label"):
            y_true = data.get_culstom_label(y_true.astype(int))
    else:
        # Inputs are swapped in this mode (see the docstring).
        y_true = y_pred.copy()
        y_pred = data
    n_samples = y_true.shape[0]
    if y_pred.shape[0] != n_samples:
        # Multi-class case: one row per sample, one column per class.
        y_pred = y_pred.reshape(-1, n_samples).T
    value = func_loss(y_pred, y_true)
    return func_name, value, is_higher_better
Example #11
0
def lgb_custom_objective(y_pred: np.ndarray,
                         data: lgb.Dataset,
                         func_loss,
                         is_lgbdataset: bool = True):
    """
    Common wrapper for LightGBM customized objectives.
    Params::
        y_pred:
            Predictions. In the multi-class case the array has length
            n_sample * n_class and is laid out as
            array([sample 0 / label 0, ..., sample N / label 0,
                   sample 0 / label 1, ...]).
        data:
            The value that was set as train_set.
        func_loss:
            Callable taking (y_pred, y_true) and returning (grad, hess),
            each with the same shape as the y_pred it received.
        is_lgbdataset:
            When false, the two leading arguments arrive swapped: the first
            one holds the true values and ``data`` holds the predictions.
    Returns::
        (grad, hess), each flattened to one dimension.
    """
    if is_lgbdataset:
        y_true = data.label
        # Datasets exposing "ndf_label" provide their own label conversion.
        if is_callable(data, "ndf_label"):
            y_true = data.get_culstom_label(y_true.astype(int))
    else:
        # Inputs are swapped in this mode (see the docstring).
        y_true = y_pred.copy()
        y_pred = data
    n_samples = y_true.shape[0]
    if y_pred.shape[0] != n_samples:
        # Multi-class case: one row per sample, one column per class.
        y_pred = y_pred.reshape(-1, n_samples).T
    grad, hess = func_loss(y_pred, y_true)
    # Transposing before flattening restores the class-major input layout.
    return grad.T.reshape(-1), hess.T.reshape(-1)
Example #12
0
    def test_onnxrt_python_lightgbm_categorical_iris(self):
        """Compare ONNX probabilities with LightGBM on integer iris data.

        Two models are checked: a default ``LGBMClassifier`` and a booster
        trained on a ``Dataset`` that declares two categorical features.
        """
        iris = load_iris()
        scaled = (iris.data * 10).astype(numpy.int32)
        X_train, X_test, y_train, _ = train_test_split(
            scaled, iris.target, random_state=11)

        # Pad the training set with 1500 random integer rows and three extra
        # label values, then fold all labels into a binary target.
        n_features = X_train.shape[1]
        filler = numpy.random.randint(0, high=10, size=(1500, n_features))
        X_train = numpy.vstack([X_train, filler]).astype(dtype=numpy.int32)
        filler_labels = [numpy.zeros(500) + value for value in (3, 4, 5)]
        y_train = numpy.hstack([y_train] + filler_labels).astype(
            dtype=numpy.int32)
        self.assertEqual(y_train.shape, (X_train.shape[0], ))
        y_train = y_train % 2

        def onnx_probabilities(model):
            # Convert, make sure a ZipMap node is present, run on X_test.
            onx = to_onnx(
                model,
                initial_types=[('X', Int64TensorType([None, n_features]))])
            self.assertIn('ZipMap', str(onx))
            got = OnnxInference(onx).run({'X': X_test})
            return pandas.DataFrame(got['output_probability']).values

        # Classic
        gbm = LGBMClassifier()
        gbm.fit(X_train, y_train)
        exp = gbm.predict_proba(X_test)
        self.assertEqualArray(exp, onnx_probabilities(gbm), decimal=5)

        # categorical_feature=[0, 1]
        train_data = Dataset(X_train,
                             label=y_train,
                             feature_name=['c1', 'c2', 'c3', 'c4'],
                             categorical_feature=['c1', 'c2'])
        params = dict(
            boosting_type="gbdt",
            learning_rate=0.05,
            n_estimators=2,
            objective="binary",
            max_bin=5,
            min_child_samples=100,
            verbose=-1,
        )
        booster = lgb_train(params, train_data)
        exp = booster.predict(X_test)

        # The booster prediction is compared with the second probability
        # column of the ONNX output.
        values = onnx_probabilities(booster)
        self.assertEqualArray(exp, values[:, 1], decimal=5)
Example #13
0
def lgb_f1_loss_multiclass(preds: np.ndarray,
                           train_data: lgb.Dataset,
                           clip: float = 1e-5):
    """Custom multiclass loss used to optimize f1.

    Args:
        preds: Flat predictions, reshaped column-major to one row per label.
        train_data: LightGBM dataset supplying the integer class labels.
        clip: Softmax outputs are limited to ``[clip, 1 - clip]``.

    Returns:
        Gradient and hessian, both flattened in column-major order.

    """
    y_true = train_data.get_label().astype(np.int32)
    n_rows = y_true.shape[0]

    # Class probabilities per row, kept away from exactly 0 and 1.
    scores = preds.reshape((n_rows, -1), order='F')
    proba = np.clip(softmax_ax1(scores), clip, 1 - clip)

    # One-hot encoding of the labels.
    one_hot = np.zeros_like(proba)
    np.add.at(one_hot, (np.arange(n_rows), y_true), 1)

    grad = (proba - one_hot) * proba
    # The last factor is floored at 1e-3.
    floored = np.clip((2 * proba - one_hot), 1e-3, np.inf)
    hess = (1 - proba) * proba * floored

    return grad.reshape((-1, ), order='F'), hess.reshape((-1, ), order='F')
Example #14
0
def lgb_f1_loss_multiclass(
        preds: np.ndarray,
        train_data: lgb.Dataset,
        clip: float = 1e-5) -> Tuple[np.ndarray, np.ndarray]:
    """Custom multiclass loss used to optimize f1.

    Args:
        preds: Flat predictions, reshaped column-major to one row per label.
        train_data: Dataset in LightGBM format supplying the class labels.
        clip: Clipping constant; softmax outputs stay in ``[clip, 1 - clip]``.

    Returns:
        Gradient, hessian (both flattened in column-major order).

    """
    y_true = train_data.get_label().astype(np.int32)
    n_rows = y_true.shape[0]

    # Class probabilities per row, kept away from exactly 0 and 1.
    logits = preds.reshape((n_rows, -1), order="F")
    proba = np.clip(softmax_ax1(logits), clip, 1 - clip)

    # One-hot encoding of the labels.
    targets = np.zeros_like(proba)
    np.add.at(targets, (np.arange(n_rows), y_true), 1)

    grad = (proba - targets) * proba
    # The last factor is floored at 1e-3.
    hess = (1 - proba) * proba * np.clip((2 * proba - targets), 1e-3, np.inf)

    flat_grad = grad.reshape((-1, ), order="F")
    flat_hess = hess.reshape((-1, ), order="F")
    return flat_grad, flat_hess
Example #15
0
    def test_lightgbm_booster_multi_classifier(self):
        """Convert a 3-class booster to ONNX and compare its predictions."""
        X = numpy.array(
            [[0, 1], [1, 1], [2, 0], [1, 2], [-1, 2], [1, -2]],
            dtype=numpy.float32)
        y = [0, 1, 0, 1, 2, 2]
        booster_params = {
            'boosting_type': 'gbdt',
            'objective': 'multiclass',
            'n_estimators': 3,
            'min_child_samples': 1,
            'num_class': 3
        }
        model = train(booster_params, Dataset(X, label=y))

        # The two wrapper classes are registered with the same options; a
        # fresh options dict is built for each registration.
        wrapped_converters = (
            (WrappedLightGbmBoosterClassifier,
             'WrappedLightGbmBoosterClassifier'),
            (WrappedBooster, 'WrappedBooster'),
        )
        for wrapped_cls, alias in wrapped_converters:
            update_registered_converter(wrapped_cls,
                                        alias,
                                        calculate_lightgbm_output_shapes,
                                        convert_lightgbm,
                                        parser=lightgbm_parser,
                                        options={
                                            'zipmap': [False, True],
                                            'nocl': [False, True]
                                        })
        update_registered_converter(Booster,
                                    'LightGbmBooster',
                                    calculate_lightgbm_output_shapes,
                                    convert_lightgbm,
                                    parser=lightgbm_parser)

        opsets = {'': TARGET_OPSET, 'ai.onnx.ml': TARGET_OPSET_ML}
        model_onnx = to_onnx(
            model,
            initial_types=[('X', FloatTensorType([None, 2]))],
            options={WrappedLightGbmBoosterClassifier: {'zipmap': False}},
            target_opset=opsets)

        try:
            sess = InferenceSession(model_onnx.SerializeToString())
        except InvalidArgument as e:
            raise AssertionError("Cannot load model\n%r" %
                                 str(model_onnx)) from e

        # The second session output is compared with the booster prediction.
        expected = model.predict(X)
        outputs = sess.run(None, {'X': X})
        assert_almost_equal(expected, outputs[1])
Example #16
0
    def Dist_Objective(predt: np.ndarray, data: lgb.Dataset):
        """Custom objective giving gradient and hessian for every expectile.

        Args:
            predt: Raw predictions. They are reshaped column-major to
                ``(-1, Expectile.n_dist_param())``; column ``i`` is paired
                with ``Expectile.expectiles[i]``.
            data: Dataset supplying the labels and, optionally, weights.

        Returns:
            ``(grad, hess)`` as arrays flattened in column-major order.
        """
        target = data.get_label()

        # Each row of the reshaped array holds the raw predictions (leaf
        # weights, before the response function) of one observation.
        preds_expectile = predt.reshape(-1,
                                        Expectile.n_dist_param(),
                                        order="F")

        # ``== None`` on a weight array compares element-wise and makes the
        # ``if`` raise, so test identity. Unit weights are used when none
        # are specified.
        weights = data.get_weight()
        if weights is None:
            weights = np.ones_like(target, dtype=float)

        # Initialize gradient and hessian matrices, one column per expectile.
        grad = np.zeros(shape=(len(target), len(Expectile.expectiles)))
        hess = np.zeros(shape=(len(target), len(Expectile.expectiles)))

        for i, tau in enumerate(Expectile.expectiles):
            grad[:, i] = Expectile.gradient_expectile(
                y=target,
                expectile=preds_expectile[:, i],
                tau=tau,
                weights=weights)
            hess[:, i] = Expectile.hessian_expectile(
                y=target,
                expectile=preds_expectile[:, i],
                tau=tau,
                weights=weights)

        # Flatten back to the column-major layout of the incoming predictions.
        return grad.ravel(order="F"), hess.ravel(order="F")
Example #17
0
    def __call__(self, pred: np.ndarray,
                 dtrain: lgb.Dataset) -> Tuple[str, float, bool]:
        """Evaluate ``self.metric_func`` on predictions for ``dtrain``.

        When the prediction length differs from the label length, the
        predictions are reshaped column-major to one row per label and the
        labels are cast to int32. ``self.bw_func`` is applied to the
        predictions before the metric is computed.

        Returns:
            ``('Opt metric', value, self.greater_is_better)``.
        """
        label, weights = dtrain.get_label(), dtrain.get_weight()

        n_rows = label.shape[0]
        if pred.shape[0] != n_rows:
            pred = pred.reshape((n_rows, -1), order='F')
            label = label.astype(np.int32)

        pred = self.bw_func(pred)

        # Try the weighted call first; on TypeError the metric is called
        # again without ``sample_weight``.
        try:
            val = self.metric_func(label, pred, sample_weight=weights)
        except TypeError:
            val = self.metric_func(label, pred)

        # TODO: what if grouped case

        return 'Opt metric', val, self.greater_is_better
Example #18
0
    def test_lightgbm_booster_classifier(self):
        """Convert a small binary 'rf' booster to ONNX and check the result."""
        from lightgbm import Dataset, train as lgb_train

        features = numpy.array([[0, 1], [1, 1], [2, 0], [1, 2]],
                               dtype=numpy.float32)
        labels = [0, 1, 0, 1]
        rf_params = {
            'boosting_type': 'rf',
            'objective': 'binary',
            'n_estimators': 3,
            'min_child_samples': 1,
            'subsample_freq': 1,
            'bagging_fraction': 0.5,
            'feature_fraction': 0.5,
        }
        booster = lgb_train(rf_params, Dataset(features, label=labels))
        converted = to_onnx(booster, features, verbose=0, rewrite_ops=True,
                            target_opset=TARGET_OPSET)
        self.assertNotEmpty(converted)
Example #19
0
def lgb_mape(preds: np.ndarray, lgb_train: Dataset) -> Tuple[str, float, bool]:
    """
    Mean absolute percentage error metric for evaluation in lightgbm.

    Rows whose label is exactly 0 are excluded from the mean.

    Args:
        preds: Array of predictions
        lgb_train: LightGBM Dataset

    Returns:
        Tuple of error name (str), error (float) and the constant ``False``
        in the is-higher-better position
    """
    labels = lgb_train.get_label()
    mask = labels != 0
    # Select the non-zero labels before dividing, so no division by zero
    # (and no RuntimeWarning / inf intermediate) is ever produced.
    kept_labels = labels[mask]
    kept_preds = np.asarray(preds)[mask]
    # NOTE(review): the denominator is the signed label, so negative labels
    # contribute negative terms; confirm labels are non-negative.
    return "mape", (np.fabs(kept_labels - kept_preds) / kept_labels).mean(), False
Example #20
0
def lgb_pr_auc(preds: np.ndarray,
               lgb_train: Dataset) -> Tuple[str, float, bool]:
    """
    Area under the precision-recall curve as a lightgbm evaluation metric.

    Args:
        preds: Array of predictions
        lgb_train: LightGBM Dataset providing the labels

    Returns:
        Tuple ``("pr_auc", area, True)``
    """
    curve = precision_recall_curve(lgb_train.get_label(), preds)
    precision, recall, _ = curve
    # Recall is the x axis and precision the y axis of the integrated curve.
    area = auc(recall, precision)
    return "pr_auc", area, True
Example #21
0
def top2_accuray_lgb(
    predt: np.ndarray,
    data: lgb.Dataset,
    threshold: float = 0.5,
) -> Tuple[str, float, bool]:
    """Share of rows whose label is among the two highest-scored classes.

    The flat predictions are reshaped to 31 rows (one per class) with one
    column per sample. ``threshold`` is accepted but not used.

    Returns:
        ``('top2_accuray', accuracy, True)``.
    """
    n_classes = 31
    n_samples = int(len(predt) / n_classes)
    class_scores = predt.reshape(n_classes, n_samples)

    labels = data.get_label()

    # Class indices per sample, ordered from highest to lowest score.
    ranking = class_scores.argsort(axis=0)[::-1, :]
    best, runner_up = ranking[0, :], ranking[1, :]
    hits = (labels == best) | (labels == runner_up)

    # eval_name, eval_result, is_higher_better
    return 'top2_accuray', float(hits.mean()), True
Example #22
0
    def Dist_Metric(predt: np.ndarray, data: lgb.Dataset):
        """Custom evaluation metric: Gaussian negative log-likelihood.

        Returns:
            ``("NegLogLikelihood", nll, False)``; the last element marks the
            metric as lower-is-better.
        """
        observed = data.get_label()

        # With a custom objective the metric receives raw predictions, which
        # have to go through the corresponding response functions first.
        raw = predt.reshape(-1, Gaussian.n_dist_param(), order="F")
        location = Gaussian.param_dict()["location"](raw[:, 0])
        scale = Gaussian.param_dict()["scale"](raw[:, 1])

        # NaN log-densities are ignored by the sum.
        log_density = norm.logpdf(x=observed, loc=location, scale=scale)
        return "NegLogLikelihood", -np.nansum(log_density), False
Example #23
0
def lgb_mape_exp(preds: np.ndarray,
                 lgb_train: Dataset) -> Tuple[str, float, bool]:
    """
    Mean absolute percentage error metric for evaluation in lightgbm.
    NOTE: This will exponentiate the predictions first, in the case where our actual is logged

    Rows whose label is exactly 0 are excluded from the mean.

    Args:
        preds: Array of predictions
        lgb_train: LightGBM Dataset

    Returns:
        Tuple of error name (str), error (float) and the constant ``False``
        in the is-higher-better position
    """
    labels = lgb_train.get_label()
    mask = labels != 0
    # Select the non-zero labels before dividing, so no division by zero
    # (and no RuntimeWarning / inf intermediate) is ever produced. Only the
    # predictions that are kept get exponentiated.
    kept_labels = labels[mask]
    kept_preds = np.exp(np.asarray(preds)[mask])
    return "mape_exp", (np.fabs(kept_labels - kept_preds) /
                        kept_labels).mean(), False
Example #24
0
def corr_sharpe_lgb(
    time_id_fold,
    y_pred: np.array,
    dtrain: lgb.Dataset,
) -> Tuple[str, float, bool]:
    """
    Pearson correlation coefficient metric (sharpe variant).

    Builds a frame with ``time_id``, ``y_pred`` and ``y_true`` columns and
    reports the first element returned by ``calculate_corr(..., sharpe=True)``.
    """
    columns = {
        'time_id': time_id_fold,
        'y_pred': y_pred,
        'y_true': dtrain.get_label(),
    }
    result = calculate_corr(pd.DataFrame(columns), sharpe=True)
    return 'pearson_corr_sharpe', result[0], True
Example #25
0
    def test_onnxrt_python_lightgbm_categorical_iris_booster3_real(self):
        """Compare ONNX probabilities with LightGBM on float iris data.

        Two models are checked: the booster of a default ``LGBMClassifier``
        and a 3-class booster trained on a ``Dataset`` that declares two
        categorical features.
        """
        from lightgbm import LGBMClassifier, Dataset, train as lgb_train

        iris = load_iris()
        scaled = (iris.data * 10).astype(numpy.float32)
        X_train, X_test, y_train, _ = train_test_split(
            scaled, iris.target, random_state=11)
        n_features = X_train.shape[1]

        def onnx_probabilities(model):
            # Convert, make sure a ZipMap node is present, run on X_test.
            onx = to_onnx(
                model,
                initial_types=[('X', FloatTensorType([None, n_features]))],
                target_opset=TARGET_OPSET)
            self.assertIn('ZipMap', str(onx))
            got = OnnxInference(onx).run({'X': X_test})
            return pandas.DataFrame(got['output_probability']).values

        # Classic
        gbm = LGBMClassifier()
        gbm.fit(X_train, y_train)
        exp = gbm.predict_proba(X_test)
        self.assertEqualArray(exp, onnx_probabilities(gbm.booster_),
                              decimal=5)

        # categorical_feature=[0, 1]
        train_data = Dataset(
            X_train, label=y_train,
            feature_name=['c1', 'c2', 'c3', 'c4'],
            categorical_feature=['c1', 'c2'])
        params = dict(
            boosting_type="gbdt",
            learning_rate=0.05,
            n_estimators=2,
            objective="multiclass",
            max_bin=5,
            min_child_samples=100,
            verbose=-1,
            num_class=3,
        )
        booster = lgb_train(params, train_data)
        exp = booster.predict(X_test)
        self.assertEqualArray(exp, onnx_probabilities(booster), decimal=5)
Example #26
0
        def objective(params, n_folds=self.n_folds):
            """Cross-validate one hyper-parameter set and report its loss.

            ``params`` is modified in place: the nested ``boosting_type``
            entry is flattened, integer-valued settings are cast to int and
            fixed ``verbose`` / ``histogram_pool_size`` values are added.

            Returns:
                Dict with ``loss`` (1 - best mean AUC), the parameters, the
                iteration counter, the best boosting round count, the
                run time and ``STATUS_OK``.
            """
            self.iteration += 1

            # The boosting type arrives as a nested dict that may also carry
            # a 'subsample' value; both are lifted to the top level.
            boosting_choice = params['boosting_type']
            params.update(
                boosting_type=boosting_choice['boosting_type'],
                subsample=boosting_choice.get('subsample', 1.0),
                verbose=-1,
            )
            for int_key in ('num_leaves', 'subsample_for_bin',
                            'min_child_samples'):
                params[int_key] = int(params[int_key])

            # NOTE: This parameter is introduced to reduce memory consumption
            params['histogram_pool_size'] = 1024
            self.logger.debug("Parameters: {}".format(params))

            started = timer()
            train_set = Dataset(x_train, label=y_train)

            # Perform n_folds cross validation
            cv_results = cv(params,
                            train_set,
                            num_boost_round=10000,
                            nfold=n_folds,
                            early_stopping_rounds=100,
                            metrics='auc',
                            seed=self.seed)
            run_time = timer() - started

            # Loss must be minimized, so the best mean AUC is inverted.
            auc_means = cv_results['auc-mean']
            loss = 1 - np.max(auc_means)

            # Boosting rounds that returned the highest cv score
            n_estimators = int(np.argmax(auc_means) + 1)

            return {
                'loss': loss,
                'params': params,
                'iteration': self.iteration,
                'estimators': n_estimators,
                'train_time': run_time,
                'status': STATUS_OK
            }
Example #27
0
    def fit_lightgbm(self, x, y, early_stopping_rounds):
        """Fit ``self.model`` as an ``LGBMModel`` built from ``self.optimized_params``.

        Args:
            x: Feature data.
            y: Target values; also used to stratify the validation split.
            early_stopping_rounds: When None, the model is fitted on all of
                ``x``/``y``. Otherwise a stratified hold-out of size
                ``self.test_size`` is split off, the model is fitted on the
                remaining rows and the hold-out drives early stopping.
        """
        self.model = LGBMModel(**self.optimized_params)

        if early_stopping_rounds is None:
            self.model.fit(x, y)
            return

        # Both x and y must be passed so that four splits come back; with
        # only ``x`` the second returned value is a feature split, not
        # validation labels.
        x_train, x_valid, y_train, y_valid = train_test_split(
            x,
            y,
            stratify=y,
            shuffle=True,
            test_size=self.test_size,
            random_state=self.random_state)

        # ``eval_set`` takes a list of (X, y) pairs. The hold-out rows are
        # kept out of the fit so that early stopping is measured on data
        # the model has not seen.
        self.model.fit(x_train,
                       y_train,
                       eval_set=[(x_valid, y_valid)],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=self.verbose)
Example #28
0
    def Dist_Metric(predt: np.ndarray, data: lgb.Dataset):
        """Custom evaluation metric: Gaussian negative log-likelihood (torch).

        Returns:
            ``("NegLogLikelihood", nll, False)`` with ``nll`` rounded to five
            decimals; the last element marks the metric as lower-is-better.
        """
        observed = torch.tensor(data.get_label())

        # With a custom objective the metric receives raw predictions, which
        # have to go through the corresponding response functions first.
        raw = predt.reshape(-1, Gaussian.n_dist_param(), order="F")
        location = torch.tensor(
            Gaussian.param_dict()["location"](raw[:, 0]), requires_grad=True)
        scale = torch.tensor(
            Gaussian.param_dict()["scale"](raw[:, 1]), requires_grad=True)

        # NaN log-probabilities are ignored by the sum.
        total = -Normal(location, scale).log_prob(observed).nansum()
        nll = np.round(total.detach().numpy(), 5)

        return "NegLogLikelihood", nll, False
Example #29
0
    def Dist_Metric(predt: np.ndarray, data: lgb.Dataset):
        """Custom evaluation metric: NaN-ignoring mean of the expectile losses.

        Returns:
            ``("NegLogLikelihood", value, False)``; the last element marks
            the metric as lower-is-better.
        """
        observed = data.get_label()

        # With a custom objective the metric receives raw predictions; they
        # are reshaped column-major to one column per distribution parameter.
        per_expectile = predt.reshape(-1,
                                      Expectile.n_dist_param(),
                                      order="F")

        # One loss entry per configured expectile, paired with its column.
        losses = [
            Expectile.expectile_loss(y=observed,
                                     expectile=per_expectile[:, col],
                                     tau=tau)
            for col, tau in enumerate(Expectile.expectiles)
        ]

        return "NegLogLikelihood", np.nanmean(losses), False
Example #30
0
def main(verbose=True, force=False, test=False):
    import datetime

    IGNORE_FEATURES = []

    os.makedirs(ANALYSIS_PATH, exist_ok=True)
    os.makedirs(TRAIN_PATH, exist_ok=True)

    raw_df_name = os.path.join(TRAIN_PATH, 'data_raw.pyt')
    scaled_df_name = os.path.join(TRAIN_PATH, 'data_scaled.pyt')

    st_time = datetime.datetime.now()
    print('Loading the data...')
    if not os.path.isfile(raw_df_name) or force:
        df = read()
        df.set_index(ID, inplace=True)
        print('\tWriting \033[92m%s\033[0m' % (raw_df_name))
        with open(raw_df_name, 'wb') as pyt:
            joblib.dump(df, pyt)
    else:
        print('\tLoading data from \033[92m%s\033[0m' % (raw_df_name))
        df = joblib.load(raw_df_name)
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    # log-scale the predictors & predictand
    bins_target = np.logspace(np.log10(df[PREDICTAND].min()),
                              np.log10(df[PREDICTAND].max()), 20)
    df[PREDICTAND] = np.log1p(df[PREDICTAND])
    predictors = [c for c in df.columns if c not in ['isTrain', PREDICTAND]]

    # Counts of 0s or non-0s is very different between test and train sets !
    pstep = 5
    percs = np.arange(pstep, 100, pstep)

    calculated_cols = []
    columns_then = df.columns

    # Add the info relative to the leak as it affects the training / test processes
    leak_file = os.path.join(TRAIN_PATH, "df_leaked_%s.pyt" % N_LAGS)
    if os.path.isfile(leak_file):
        df_leaked = joblib.load(leak_file)
    else:
        df_ = df[predictors].reset_index(level=0)
        df_[PREDICTAND] = df[PREDICTAND]
        df_ = df_[['ID', PREDICTAND] + predictors]

        df_leaked = get_all_leak(df_, COLUMNS_LEAK, N_LAGS)
        leak_cols = [c for c in df_leaked if c.startswith('leak')]
        df_leaked = df_leaked[leak_cols]
        with open(leak_file, 'wb') as pyt:
            joblib.dump(df_leaked, pyt)

    df_leaked.index = df.index
    leak_cols = df_leaked.columns
    df['nb_potential_leaks'] = df_leaked.notnull().sum(axis=1)
    df['leak_mean'] = df_leaked.mean(axis=1).fillna(0)
    df['leak_median'] = df_leaked.median(axis=1).fillna(0)
    df['leak_max'] = df_leaked.max(axis=1).fillna(0)
    df['leak_min'] = df_leaked.min(axis=1).fillna(0)

    # Clustering on sorted dataframe (row by row) to detect similar entries
    df_ = df[predictors].copy()
    for row in range(len(df_)):
        arr = df_.iloc[row, :]
        df_.iloc[row, :] = np.sort(arr)

    # Hierarchical clustering seems to have a predictive power
    #distance = "euclidean"
    n_clusters = 12
    for distance in [
            "hamming", "jaccard", "sokalmichener", "sokalsneath", "euclidean"
    ]:
        st_time = datetime.datetime.now()
        print(
            'Finding \033[92m%i clusters\033[0m with \033[92m%s distance\033[0m'
            % (n_clusters, distance))
        dist_fname = os.path.join(TRAIN_PATH, "%s_dists.pyt" % distance)
        if os.path.isfile(dist_fname):
            dist = joblib.load(dist_fname)
            print('-- Pairwise distance loading took %i seconds.' %
                  (datetime.datetime.now() - st_time).total_seconds())
        else:
            if distance == "euclidean":
                dist = ss.distance.pdist(df_[predictors].values, distance)
            else:
                dist = ss.distance.pdist(df[predictors].values.astype(bool),
                                         distance)
            print('-- Pairwise distance computation took %i seconds.' %
                  (datetime.datetime.now() - st_time).total_seconds())
            with open(dist_fname, 'wb') as pyt:
                joblib.dump(dist, pyt)

        ward_linkage = hierarchy.ward(dist)
        tree = hierarchy.to_tree(ward_linkage)
        cluster_colname = 'cluster_%s' % distance
        df[cluster_colname] = hierarchy.fcluster(ward_linkage,
                                                 _get_height_at(
                                                     tree, n_clusters),
                                                 criterion="distance")
        print('-- Took %i seconds.' %
              (datetime.datetime.now() - st_time).total_seconds())
        CATEGORICAL_FEATURES.append(cluster_colname)
        sns.catplot(x=cluster_colname,
                    y=PREDICTAND,
                    data=df.groupby('isTrain').get_group(True),
                    kind="violin")
        plt.savefig(os.path.join(ANALYSIS_PATH, '%s.png' % cluster_colname))
        plt.close()

    # Keep euclidean clusters as 'cluster_colname' for K-fold grouping
    cluster_colname = "cluster_euclidean"

    print('Mojena stopping rule')

    clusters_for_plot = np.arange(1, 101)
    heights = np.array(
        [_get_height_at(tree, n_clusters) for n_clusters in clusters_for_plot])

    plt.figure()
    plt.plot(clusters_for_plot, heights, 'ko--')
    plt.grid()
    plt.xlabel('Number of clusters')
    plt.ylabel('Dendrogram height')
    plt.savefig(os.path.join(ANALYSIS_PATH, '%s_mojena.png' % cluster_colname))
    plt.close()

    print('Dendrogram for Euclidean distance')
    dn = hierarchy.dendrogram(ward_linkage,
                              no_labels=True,
                              above_threshold_color='k')
    plt.ylabel('height')
    plt.xlabel('samples')
    plt.savefig(
        os.path.join(ANALYSIS_PATH, '%s_dendrogram.png' % cluster_colname))
    plt.close()

    df[predictors] = np.log1p(df[predictors])

    st_time = datetime.datetime.now()

    def func_agg(row):
        """Summarise the strictly positive entries of one row.

        Returns a flat array holding, in order: the number of positive
        entries, then the mean, mean of squares, standard deviation, maximum,
        minimum, skewness and kurtosis of those entries, followed by their
        quantiles at ``percs / 100``.
        """
        is_positive = row > 0
        positive = row[is_positive]

        summary = [
            is_positive.sum(),
            positive.mean(),
            (positive**2).mean(),
            positive.std(),
            positive.max(),
            positive.min(),
            positive.skew(),
            positive.kurtosis(),
        ]
        quantiles = positive.quantile(q=percs / 100)

        return np.append(summary, quantiles)

    print('Computing non-zero aggregates...')
    df[[
        'count_nonzero',
        'mean_nonzero',
        'meansq_nonzero',
        'std_nonzero',
        'max_nonzero',
        'min_nonzero',
        'skew_nonzero',
        'kurt_nonzero',
    ] + ['p%i' % p for p in percs]] = df[predictors].apply(
        func_agg, axis=1, result_type="expand").fillna(0)
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    st_time = datetime.datetime.now()

    def func_agg(row):
        """Summarise absolute differences between consecutive positive entries.

        The strictly positive entries of the row are kept in order, their
        successive differences are taken in absolute value, and a flat array
        is returned with the mean, mean of squares, standard deviation,
        maximum, minimum, skewness and kurtosis of those differences,
        followed by their quantiles at ``percs / 100``.
        """
        positive = row[row > 0]
        # The first difference is NaN; the pandas reductions below skip it.
        steps = positive.diff().abs()

        summary = [
            steps.mean(),
            (steps**2).mean(),
            steps.std(),
            steps.max(),
            steps.min(),
            steps.skew(),
            steps.kurtosis(),
        ]
        quantiles = steps.quantile(q=percs / 100)

        return np.append(summary, quantiles)

    print('Computing diff aggregates...')
    df[[
        'diff_mean_nonzero',
        'diff_meansq_nonzero',
        'diff_std_nonzero',
        'diff_max_nonzero',
        'diff_min_nonzero',
        'diff_skew_nonzero',
        'diff_kurtosis_nonzero',
    ] + ['diff_p%i' % p for p in percs]] = df[predictors].apply(
        func_agg, axis=1, result_type="expand").fillna(0)
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    # add occurrences (will it help ?)
    print('Computing distributions...')

    def func_epd(row):
        """Fraction of a row's positive values falling in each ``bins_target`` bin.

        The strictly positive entries of ``row`` are mapped through
        ``exp(x) - 1`` and histogrammed over the edges in ``bins_target``.
        The counts are then divided by their total, so the returned array
        (one entry per bin) sums to 1.

        If no value lands in any bin the total is zero and every entry is
        NaN.
        """
        values = np.exp(row[row > 0].values) - 1

        # The original passed ``normed=True``, which NumPy deprecated and then
        # removed in 1.24 (it now raises TypeError). Because the result was
        # re-normalised by its own sum, the old output was exactly
        # counts / counts.sum(), which is computed directly here.
        counts = np.histogram(values, bins=bins_target)[0]

        # 0 / 0 is the expected outcome for rows with nothing in range; keep
        # the NaN result but do not emit a RuntimeWarning for every such row.
        with np.errstate(invalid="ignore"):
            return counts / np.sum(counts)

    df[['epd_%i' % b for b in bins_target[:-1]
        ]] = df[predictors].apply(func_epd, axis=1,
                                  result_type="expand").fillna(0)

    columns_now = df.columns
    calculated_cols.extend([c for c in columns_now if c not in columns_then])

    ## Scale the features
    #st_time = datetime.datetime.now()
    #print('Scaling (log) the features')
    #for col in df.columns:
    #if col not in [PREDICTAND, ID, 'isTrain']:
    #df[col] = np.log(df[col] + 1)
    #print('-- Took %i seconds.' % (datetime.datetime.now() - st_time).total_seconds())

    df.drop(predictors, axis=1, inplace=True)
    predictors = [c for c in calculated_cols if c in df.columns]

    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    #with open(os.path.join(TRAIN_PATH, 'predictors_%s.pyt' % datetime.datetime.now().strftime('%Y%m%d%H')),
    #'wb') as pyt:
    #joblib.dump(df, pyt)

    st_time = datetime.datetime.now()
    print('Transforming the features')
    cols_to_remove = []
    cols_to_add = []
    for col in predictors:
        if col in CATEGORICAL_FEATURES:
            print('\tFeature %s is categorical -> OneHot' % col)
            transf = OneHotEncoder()
            transf.fit(df[col].values.reshape(-1, 1))

            res = transf.transform(df[col].values.reshape(-1, 1))
            for i, ax in enumerate(res.transpose(), 1):
                onehot = '{}_{}'.format(col, i)
                df[onehot] = ax.toarray().squeeze()
                cols_to_add.append(onehot)
            cols_to_remove.append(col)

        else:
            print('\tFeature %s is numerical -> QuantileTransformer' % col)
            try:
                df[col] = QuantileTransformer().fit_transform(
                    df[col].values.reshape(-1, 1))
            except:
                print("\033[91mQuantileTransformer failed on %s\033[0m" % col)
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    #df.drop(cols_to_remove, axis=1, inplace=True)
    IGNORE_FEATURES.extend(cols_to_remove)
    for col in cols_to_remove:
        predictors.remove(col)
        calculated_cols.remove(col)
    predictors.extend(cols_to_add)

    # T-SNE
    st_time = datetime.datetime.now()
    print('Running T-SNE...')
    fname = os.path.join(ANALYSIS_PATH, "tsne", "tsne.png")
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    tsne_comps = tsne(
        df[predictors + [PREDICTAND, 'isTrain']],
        fname,
        nb=len(df),
        perplexity=40,
        title=None,
        visu_tsne=None,
        cmap='viridis',
        predictand=PREDICTAND,
        binary=False,
        #do_not_plot=[c for c in predictors if not c in calculated_cols + ['isTrain', PREDICTAND]],
    )

    with open(
            os.path.join(
                TRAIN_PATH, "tsne_%s.pyt" %
                (datetime.datetime.now().strftime('%Y%m%d%H%M'))),
            'wb') as pyt:
        joblib.dump(tsne_comps, pyt)

    try:
        for i, tsne_ax in enumerate(tsne_comps.transpose(), 1):
            df['tsne%i' % i] = tsne_ax
            calculated_cols.append('tsne%i' % i)
    except:
        print('\033[91mWARNING ! could not add t-sne values\033[0m')
        print_exc()
        pass
    print('-- Took %i seconds.' %
          (datetime.datetime.now() - st_time).total_seconds())

    analyze(df, calculated_cols, step='preprocessed')
    #analyze_bivariate(df, cfgs, step='preprocessed')

    df_train = select_sample(df, "train")
    df_test = select_sample(df, "test")

    fname = os.path.join(TRAIN_PATH, 'df_train.pyt')
    print('Saving df_train to \033[92m%s\033[0m' % fname)
    with open(fname, 'wb') as pyt:
        joblib.dump(df_train, pyt)

    predictors = [c for c in df_train.columns if c not in IGNORE_FEATURES]
    predictors.remove(PREDICTAND)

    X_train = df_train[predictors].values
    y_train = df_train[PREDICTAND].values
    X_test = df_test[predictors].values
    test_rows = df_test.index

    # Load the "leaked" target
    leaked_target = df_leaked.loc[test_rows, leak_cols].median(axis=1)
    leaked_count = df_leaked.loc[test_rows, leak_cols].notnull().sum(axis=1)
    leak_inds = np.where(leaked_count > 0)[0]

    #reg, _ = train_and_validate(
    #df_train,
    #predictors,
    #PREDICTAND,
    #wdir=TRAIN_PATH,
    #kind='regression',
    #MLP_options={'hidden_layer_sizes': (100, 100)},
    #GradientBoosting_options={'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 600, 'random_state': 42},
    #XGBoost_options={'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 600, 'random_state': 42},
    #LightGBM_options={'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 600, 'random_state': 42, 'verbose': -1, 'num_leaves': 124},
    #RandomForest_options={'max_depth': None, 'n_estimators': 900, 'max_features': 1, 'min_samples_leaf': 3, 'min_samples_split': 10, 'criterion': 'mse', 'random_state': 42},
    #)

    #os.makedirs(OUTPUT_PATH, exist_ok=True)
    #for name, regdict in reg.items():
    #model = regdict['model']
    #fname = os.path.join(OUTPUT_PATH, '%s.csv' % name)
    #y_pred = model.predict(X_test)
    #y_pred = np.expm1(y_pred)

    ##y_pred[leak_inds] = leaked_target.values[leak_inds]

    #df_result = pd.DataFrame({ID: df_test.index,
    #PREDICTAND: y_pred})
    #df_result.to_csv(fname, index=False)
    #print('Wrote prediction file: \033[94;1m%s\033[0m' % fname)

    def save_model(model, name, y_pred=None, replace_leak=False):
        """Persist a model (if given) and write its test-set predictions to CSV.

        Parameters
        ----------
        model:
            Fitted estimator, or ``None`` when only precomputed predictions
            are to be written. When given, it is dumped with joblib to
            ``TRAIN_PATH/<name>.pyt`` wrapped in ``{'model': model}``.
        name:
            Base name (no extension) for the model dump and the CSV file.
        y_pred:
            Predictions for the test rows. When ``None`` they are obtained
            from ``model.predict(X_test)``. ``np.expm1`` is applied before
            writing.
        replace_leak:
            When true, the entries at ``leak_inds`` are overwritten with the
            corresponding ``leaked_target`` values and the file is named
            ``<name>_leak.csv`` instead of ``<name>.csv``.

        Raises
        ------
        ValueError
            If both ``model`` and ``y_pred`` are ``None``.
        """
        if model is None and y_pred is None:
            # Previously this fell through to ``None.predict`` and died with
            # an unhelpful AttributeError.
            raise ValueError(
                "save_model needs either a fitted model or precomputed y_pred")

        if model is not None:
            fname = os.path.join(TRAIN_PATH, "%s.pyt" % name)
            os.makedirs(TRAIN_PATH, exist_ok=True)
            with open(fname, "wb") as pyt:
                joblib.dump({'model': model}, pyt)
            print('\tSaved model to \033[92m%s\033[0m' % fname)

        if y_pred is None:
            y_pred = model.predict(X_test)
        # expm1 returns a new array, so the in-place leak substitution below
        # never touches the caller's array.
        y_pred = np.expm1(y_pred)

        # Build the final file name directly rather than patching '.csv' with
        # str.replace, which would also rewrite a '.csv' occurring elsewhere
        # in the path or in ``name``.
        suffix = ""
        if replace_leak:
            y_pred[leak_inds] = leaked_target.values[leak_inds]
            suffix = "_leak"

        # Nothing else guarantees the output directory exists.
        os.makedirs(OUTPUT_PATH, exist_ok=True)
        fname = os.path.join(OUTPUT_PATH, "%s%s.csv" % (name, suffix))

        df_result = pd.DataFrame({ID: df_test.index, PREDICTAND: y_pred})
        df_result.to_csv(fname, index=False)
        print('\tSaved prediction to \033[92m%s\033[0m' % fname)

    from lightgbm import Dataset
    from lightgbm import train as train_lgb

    nfolds = 10
    #folds = KFold(n_splits=nfolds, shuffle=True, random_state=21)
    folds = GroupKFold(n_splits=nfolds)

    y_pred_xgb = np.zeros(len(X_test))
    y_train_xgb = np.zeros(len(X_train))
    y_pred_lgbm = np.zeros(len(X_test))
    y_train_lgbm = np.zeros(len(X_train))

    lgb_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'mse'},
        'num_leaves': 124,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'verbose': -1,
        'num_boost_round': 15000,
        'early_stopping_rounds': 100,
        'nthread': 26
    }

    def _rmse_func(predictions, ground_truth):
        """Return the root of ``mean_squared_error(predictions, ground_truth)``."""
        mse = mean_squared_error(predictions, ground_truth)
        return np.sqrt(mse)

    def rmse(predictions, train_data):
        """Custom evaluation callback reporting RMSE against the dataset's labels.

        Returns ``('RMSE', value, False)``; the trailing ``False`` marks the
        metric as lower-is-better.
        """
        score = _rmse_func(predictions, train_data.get_label())
        return 'RMSE', score, False

    for ifold, (trn_idx, val_idx) in enumerate(
            folds.split(X_train, y_train, df_train[cluster_colname].values)):

        print("Fold nb. %i" % ifold)

        lgb_train = Dataset(data=X_train[trn_idx, :],
                            label=y_train[trn_idx],
                            feature_name=predictors)

        lgb_val = Dataset(data=X_train[val_idx, :],
                          label=y_train[val_idx],
                          feature_name=predictors)

        reg = XGBRegressor(n_estimators=600,
                           max_depth=5,
                           learning_rate=0.05,
                           random_state=42)
        reg.fit(df_train[predictors].iloc[trn_idx, :].values,
                df_train[[PREDICTAND]].iloc[trn_idx, :].values.squeeze())
        pred_fold = reg.predict(df_train[predictors].iloc[val_idx].values)

        print('\t[XGBoost] oof RMSE is: \033[92m%.4f\033[0m' % np.sqrt(
            mean_squared_error(
                df_train[[PREDICTAND]].iloc[val_idx].values.squeeze(),
                pred_fold)))
        y_train_xgb += reg.predict(X_train) / nfolds
        y_pred_xgb += reg.predict(X_test) / nfolds

        reg = train_lgb(lgb_params,
                        lgb_train,
                        num_boost_round=15000,
                        early_stopping_rounds=100,
                        verbose_eval=100,
                        valid_sets=[lgb_train, lgb_val],
                        feval=rmse)

        y_pred = reg.predict(X_train[val_idx, :],
                             num_iteration=reg.best_iteration)
        score = np.sqrt(mean_squared_error(y_train[val_idx], y_pred))

        print('\t[LGBM] Best iteration: \033[92m%i\033[0m' %
              reg.best_iteration)
        print('\t[LGBM] oof RMSE is: \033[92m%.4f\033[0m' % score)

        y_train_lgbm += reg.predict(X_train,
                                    num_iteration=reg.best_iteration) / nfolds
        y_pred_lgbm += reg.predict(X_test,
                                   num_iteration=reg.best_iteration) / nfolds

    save_model(None, "LightGBM_folded", y_pred_lgbm, replace_leak=True)

    save_model(None, "XGBoost_folded", y_pred_xgb)
    save_model(None, "LightGBM_folded", y_pred_lgbm)
    save_model(None, "XGB-LGBM_folded", 0.5 * (y_pred_xgb + y_pred_lgbm))

    gsDict = {}

    ## AdaBoost
    #print('\033[1mGridSearch - AdaBoostRegressor\033[0m')
    #reg_base = DecisionTreeRegressor()
    #reg = AdaBoostRegressor(reg_base, random_state=42)
    #ada_param_grid = {
    #"base_estimator__criterion": ["mse", "mae"],
    #"base_estimator__splitter": ["best", "random"],
    #"algorithm": ["SAMME", "SAMME.R"],
    #"n_estimators": [2, 10, 50],
    #"learning_rate":  [0.001, 0.01, 0.1]}

    #gsAdaBoost = GridSearchCV(reg, param_grid=ada_param_grid,
    #cv=nfolds, scoring="neg_mean_squared_error",
    #n_jobs=20, verbose=1)
    #gsAdaBoost.fit(X_train, y_train)

    #ada_best = gsAdaBoost.best_estimator_
    #print('\tBest score: \033[92m%.4f\033[0m' % gsAdaBoost.best_score_)
    #ada_best.fit(X_train, y_train)
    #save_model(ada_best, "AdaBoost")
    #gsDict["AdaBoost"] = gsAdaBoost

    ## ExtraTrees
    #print('\033[1mGridSearch - ExtraTreesRegressor\033[0m')
    #reg = ExtraTreesRegressor()

    ## Search grid for optimal parameters
    #ex_param_grid = {
    #"max_depth": [None],
    #"max_features": [1, 3, 10],
    #"min_samples_split": [2, 3, 10],
    #"min_samples_leaf": [1, 3, 10],
    #"bootstrap": [False],
    #"n_estimators": [100, 300, 900],
    #"criterion": ["mse", "mae"]}

    #gsExtraTrees = GridSearchCV(reg, param_grid=ex_param_grid,
    #cv=nfolds, scoring="neg_mean_squared_error",
    #n_jobs=20, verbose=1)
    #gsExtraTrees.fit(X_train, y_train)
    #etc_best = gsExtraTrees.best_estimator_
    #print('\tBest score: \033[92m%.4f\033[0m' % gsExtraTrees.best_score_)
    #etc_best.fit(X_train, y_train)
    #save_model(etc_best, "ExtraTrees")
    #gsDict["ExtraTrees"] = gsExtraTrees

    ## RF Parameters
    #print('\033[1mGridSearch - RandomForestRegressor\033[0m')
    #reg = RandomForestRegressor()

    ## Search grid for optimal parameters
    #rf_param_grid = {
    #"max_depth": [None, 4, 5],
    #"max_features": [1, 3, 10],
    #"min_samples_split": [2, 3, 10],
    #"min_samples_leaf": [1, 3, 10],
    #"bootstrap": [False],
    #"n_estimators": [100, 300, 900],
    #"criterion": ["mse", "mae"]}

    #gsRandomForest = GridSearchCV(
    #reg, param_grid=rf_param_grid,
    #cv=nfolds, scoring="neg_mean_squared_error",
    #n_jobs=36, verbose=1)
    #gsRandomForest.fit(X_train, y_train)
    #rfc_best = gsRandomForest.best_estimator_
    #print('\tBest score: \033[92m%.4f\033[0m' % gsRandomForest.best_score_)
    #for key in rf_param_grid.keys():
    #print('\t\t%s: \033[92m%s\033[0m' % (key, getattr(rfc_best, key, '-')))

    #rfc_best.fit(X_train, y_train)
    #save_model(rfc_best, "RandomForest")
    #gsDict["RandomForest"] = gsRandomForest

    ## Gradient boosting
    #print('\033[1mGridSearch - GradientBoostingRegressor\033[0m')
    #reg = GradientBoostingRegressor()
    #gb_param_grid = {
    #'loss' : ["ls", "lad", "huber"],
    #'n_estimators' : [600, 300, 900],
    #'learning_rate': [0.1, 0.05, 0.01],
    #'max_depth': [5, 4, 6],
    #'min_samples_leaf': [10, 50],
    #'max_features': ["sqrt", "auto"]
    #}

    #gsGradientBoosting = GridSearchCV(
    #reg, param_grid=gb_param_grid,
    #cv=nfolds, scoring="neg_mean_squared_error",
    #n_jobs=36, verbose=1)
    #gsGradientBoosting.fit(X_train, y_train)
    #gbc_best = gsGradientBoosting.best_estimator_
    #print('\tBest score: \033[92m%.4f\033[0m' % gsGradientBoosting.best_score_)
    #for key in gb_param_grid.keys():
    #print('\t\t%s: \033[92m%s\033[0m' % (key, getattr(gbc_best, key, '-')))

    #gbc_best.fit(X_train, y_train)
    #save_model(gbc_best, "GradientBoosting")
    #gsDict["GradientBoosting"] = gsGradientBoosting

    # Gradient boosting
    print('\033[1mGridSearch - XGBRegressor\033[0m')
    reg = XGBRegressor()
    xgb_param_grid = {
        'n_estimators': [600, 300, 900],
        'learning_rate': [0.1, 0.05, 0.01],
        'max_depth': [5, 4, 6],
        'missing': [None, 0.],
        'booster': ["gbtree", "gblinear", "dart"],
    }

    gsXGBoost = GridSearchCV(reg,
                             param_grid=xgb_param_grid,
                             cv=nfolds,
                             scoring="neg_mean_squared_error",
                             n_jobs=36,
                             verbose=1)
    gsXGBoost.fit(X_train, y_train)
    gbc_best = gsXGBoost.best_estimator_
    print('\tBest score: \033[92m%.4f\033[0m' % gsXGBoost.best_score_)
    for key in xgb_param_grid.keys():
        print('\t\t%s: \033[92m%s\033[0m' % (key, getattr(gbc_best, key, '-')))

    gbc_best.fit(X_train, y_train)
    save_model(gbc_best, "XGBoost")
    gsDict["XGBoost"] = gsXGBoost