Example #1
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        self._le = _LGBMLabelEncoder().fit(y)
        training_labels = self._le.transform(y)
        xgdmat = lgbm.Dataset(X, label=training_labels)
        #xgdmat.construct() 
        self.param_map.update({'objective':'binary'})
        # print('before lgbm.cv')
        # print(self.param_map)
        # TODO: verify -- note that 'multi:softprob' is XGBoost's objective
        # name; LightGBM's equivalent is 'multiclass'
#        if self.n_classes_ > 2:
#            self.param_map.update({'num_class':self.n_classes_})
#            self.param_map.update({'objective':'multi:softprob'})
        # Note: lgbm.cv resets the value of max_bin to 255
        self.results = lgbm.cv(self.param_map,
                               xgdmat,
                               num_boost_round=self.num_boost_round,
                               folds=self.folds,
                               nfold=self.nfold,
                               stratified=self.stratified,
                               shuffle=self.shuffle,
                               metrics=self.metrics,
                               fobj=self.fobj,
                               feval=self.feval,
                               init_model=self.init_model,
                               feature_name=self.feature_name,
                               categorical_feature=self.categorical_feature,
                               early_stopping_rounds=self.early_stopping_rounds,
                               fpreproc=self.fpreproc,
                               verbose_eval=self.verbose_eval,
                               show_stdv=self.show_stdv,
                               seed=self.seed,
                               callbacks=self.callbacks)
Example #2
 def fit(self, X, y):
     self.classes_ = np.unique(y)
     self.n_classes_ = len(self.classes_)
     self._le = _LGBMLabelEncoder().fit(y)
     training_labels = self._le.transform(y)
     xgdmat = lgbm.Dataset(X, label=training_labels)
     #xgdmat.construct()
     self.param_map.update({'objective': 'binary'})
     # print('before lgbm.cv')
     # print(self.param_map)
     # TODO: verify -- 'multi:softprob' is XGBoost's objective name;
     # LightGBM's equivalent is 'multiclass'
     #        if self.n_classes_ > 2:
     #            self.param_map.update({'num_class':self.n_classes_})
     #            self.param_map.update({'objective':'multi:softprob'})
     # Note: lgbm.cv resets the value of max_bin to 255
     self.results = lgbm.cv(self.param_map, xgdmat,
                            num_boost_round=self.num_boost_round,
                            folds=self.folds, nfold=self.nfold,
                            stratified=self.stratified, shuffle=self.shuffle,
                            metrics=self.metrics, fobj=self.fobj,
                            feval=self.feval, init_model=self.init_model,
                            feature_name=self.feature_name,
                            categorical_feature=self.categorical_feature,
                            early_stopping_rounds=self.early_stopping_rounds,
                            fpreproc=self.fpreproc,
                            verbose_eval=self.verbose_eval,
                            show_stdv=self.show_stdv, seed=self.seed,
                            callbacks=self.callbacks)
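
A minimal, self-contained sketch of the pattern in Examples #1 and #2 (label-encode the target, wrap it in a Dataset, cross-validate) using keyword arguments and only public APIs; sklearn's LabelEncoder stands in for the private _LGBMLabelEncoder, and the data is synthetic:

import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder  # stand-in for _LGBMLabelEncoder

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
labels = LabelEncoder().fit_transform(y)   # encode targets to 0..k-1
dtrain = lgb.Dataset(X, label=labels)

params = {'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1}
results = lgb.cv(params, dtrain, num_boost_round=50, nfold=5,
                 stratified=True, shuffle=True, seed=42)

# results maps '<metric>-mean' / '<metric>-stdv' to one value per round;
# the exact key prefix varies across LightGBM versions.
print(sorted(results.keys()))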
Example #3
    def fit(self,
            X,
            y,
            sample_weight=None,
            init_score=None,
            eval_set=None,
            eval_names=None,
            eval_sample_weight=None,
            eval_class_weight=None,
            eval_init_score=None,
            eval_metric=None,
            early_stopping_rounds=None,
            verbose=True,
            feature_name='auto',
            categorical_feature='auto',
            callbacks=None):
        """Docstring is inherited from the LGBMModel."""
        _LGBMAssertAllFinite(y)
        _LGBMCheckClassificationTargets(y)
        self._le = _LGBMLabelEncoder().fit(y)
        _y = self._le.transform(y)
        self._class_map = dict(
            zip_(self._le.classes_, self._le.transform(self._le.classes_)))
        if isinstance(self.class_weight, dict):
            self._class_weight = {
                self._class_map[k]: v
                for k, v in self.class_weight.items()
            }

        self._classes = self._le.classes_
        self._n_classes = len(self._classes)
        if self._n_classes > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            ova_aliases = ("multiclassova", "multiclass_ova", "ova", "ovr")
            if self._objective not in ova_aliases and not callable(
                    self._objective):
                self._objective = "multiclass"
            if eval_metric in ('logloss', 'binary_logloss'):
                eval_metric = "multi_logloss"
            elif eval_metric in ('error', 'binary_error'):
                eval_metric = "multi_error"
        else:
            if eval_metric in ('logloss', 'multi_logloss'):
                eval_metric = 'binary_logloss'
            elif eval_metric in ('error', 'multi_error'):
                eval_metric = 'binary_error'

        if eval_set is not None:
            if isinstance(eval_set, tuple):
                eval_set = [eval_set]
            for i, (valid_x, valid_y) in enumerate(eval_set):
                if valid_x is X and valid_y is y:
                    eval_set[i] = (valid_x, _y)
                else:
                    eval_set[i] = (valid_x, self._le.transform(valid_y))

        super(LGBMClassifier,
              self).fit(X,
                        _y,
                        sample_weight=sample_weight,
                        init_score=init_score,
                        eval_set=eval_set,
                        eval_names=eval_names,
                        eval_sample_weight=eval_sample_weight,
                        eval_class_weight=eval_class_weight,
                        eval_init_score=eval_init_score,
                        eval_metric=eval_metric,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=verbose,
                        feature_name=feature_name,
                        categorical_feature=categorical_feature,
                        callbacks=callbacks)
        return self
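
Example #3 is LightGBM's own sklearn wrapper: fit label-encodes y, remaps class_weight keys through _class_map, and swaps binary metric names for multiclass ones when more than two classes appear. A short usage sketch of that behavior (synthetic data; assumes only lightgbm and numpy):

import numpy as np
from lightgbm import LGBMClassifier

rng = np.random.default_rng(0)
X = rng.random((120, 4))
y = rng.choice(['bird', 'cat', 'dog'], size=120)  # string labels, 3 classes

clf = LGBMClassifier(n_estimators=20)
clf.fit(X, y)  # objective becomes 'multiclass'; labels encoded internally

print(clf.classes_)        # original labels: ['bird' 'cat' 'dog']
print(clf.predict(X)[:5])  # predictions decoded back to the original labels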
Example #4
    def cv(
            self,
            # Dataset
            data,
            cat_features=None,
            groups=None,
            folds=KFold(n_splits=5),
            # Model
            params={},
            fit_params={},
            convert_to_sklearn=True,
            # Optuna
            tune_model=False,
            optuna_params=None,
            maximize=None,
            eval_metric=None,
            n_trials=None,
            timeout=None,
            lgbm_n_trials=[7, 20, 10, 6, 20],
            # Misc
            logger=None,
            n_jobs=-1):

        self._data_check(data)
        self.n_features = data[0].shape[1]
        self.n_classes = len(np.unique(data[1]))
        if isinstance(data[0], pd.DataFrame):
            self.feature_names = data[0].columns.tolist()
        else:
            self.feature_names = [f'f{i}' for i in range(self.n_features)]

        main_metric, maximize = self._parse_eval_metric(params, maximize)
        if eval_metric is not None and maximize is None:
            raise ValueError('metric direction must be specified.')

        if isinstance(logger, (str, Path)):
            logger = LGBMLogger(logger, stdout=True, file=True)
        elif logger is None:
            logger = LGBMLogger(logger, stdout=True, file=False)
        assert isinstance(logger, LGBMLogger)

        # logger(f'params: \n{params}')
        # logger(f'fit_params: \n{fit_params}')

        if self.model_name in MODEL_ZOO['cat']:
            raise NotImplementedError('catboost is incompatible with .cv().')

        elif self.model_name in MODEL_ZOO['lgb']:
            ''' LightGBM '''
            dtrain = lgb.Dataset(data=data[0],
                                 label=data[1],
                                 weight=data[2] if len(data) == 3 else None,
                                 categorical_feature=cat_features)

            # Optuna integration
            if tune_model:
                _fit_params = fit_params.copy()
                _params = params.copy()
                _params.update({'metric': main_metric})
                if 'verbose_eval' in _fit_params.keys():
                    _fit_params.update({'verbose_eval': False})
                tuner = lgb_tune.LightGBMTunerCV(
                    _params,
                    dtrain,
                    folds=folds,
                    n_trials_config=lgbm_n_trials,
                    return_cvbooster=True,
                    optuna_callbacks=[logger.optuna],
                    show_progress_bar=False,
                    **_fit_params)
                tuner.run()
                self.model = tuner.get_best_booster().boosters
                self.best_iteration = tuner.get_best_booster().best_iteration
                self.best_score = tuner.best_score
                del tuner
            else:
                _params = params.copy()
                model_extractor = ModelExtractor()
                res = lgb.cv(_params,
                             train_set=dtrain,
                             folds=folds,
                             callbacks=[logger.lgbm, model_extractor],
                             **fit_params)
                self.model = model_extractor.get_model().boosters
                self.best_iteration = model_extractor.get_best_iteration()
                self.evals_result = res
                if maximize:
                    self.best_score = np.max(res[f'{main_metric}-mean'])
                else:
                    self.best_score = np.min(res[f'{main_metric}-mean'])

            for i in range(len(self.model)):
                self.model[i].best_iteration = self.best_iteration

            logger(
                f'[{self.best_iteration}]\tbest score is {self.best_score:.6f}'
            )

            if convert_to_sklearn:
                for i in range(len(self.model)):
                    self.model[i] = booster2sklearn(self.model[i],
                                                    self.model_type,
                                                    self.n_features,
                                                    self.n_classes)

                    if self.model_name == 'LGBMClassifier':
                        self.model[i]._le = _LGBMLabelEncoder().fit(data[1])

        elif self.model_name in MODEL_ZOO['xgb']:
            ''' XGBoost '''
            dtrain = xgb.DMatrix(data=data[0],
                                 label=data[1],
                                 weight=data[2] if len(data) == 3 else None,
                                 nthread=n_jobs)

            # Optuna integration
            if tune_model:
                _fit_params = fit_params.copy()
                if 'verbose_eval' in _fit_params.keys():
                    _fit_params.update({'verbose_eval': False})
                self.best_score = None
                self.best_model = None
                self.best_iteration = 0

                def xgb_objective(trial):
                    _params = params.copy()
                    if optuna_params is None:
                        _params = PARAMS_ZOO[self.model_name](trial, _params)
                    else:
                        _params = optuna_params(trial, _params)

                    model_extractor = ModelExtractor()
                    res = xgb.cv(_params,
                                 dtrain=dtrain,
                                 folds=folds,
                                 maximize=maximize,
                                 callbacks=[model_extractor],
                                 **_fit_params)
                    self.model = model_extractor.get_model()

                    if eval_metric is None:
                        if maximize:
                            score = np.max(res[f'test-{main_metric}-mean'])
                            best_iteration = np.argmax(
                                res[f'test-{main_metric}-mean'])
                        else:
                            score = np.min(res[f'test-{main_metric}-mean'])
                            best_iteration = np.argmin(
                                res[f'test-{main_metric}-mean'])
                    else:
                        raise NotImplementedError(
                            'Do not use custom eval_metric for .cv() :(')

                    if self.best_score is None:
                        self.best_score = score
                        self.best_model = self.model.copy()
                        self.best_iteration = best_iteration
                        self.evals_result = res
                    elif maximize is True and self.best_score < score:
                        self.best_score = score
                        self.best_model = self.model.copy()
                        self.best_iteration = best_iteration
                        self.evals_result = res
                    elif maximize is False and self.best_score > score:
                        self.best_score = score
                        self.best_model = self.model.copy()
                        self.best_iteration = best_iteration
                        self.evals_result = res

                    return score

                study = optuna.create_study(
                    direction='maximize' if maximize else 'minimize')
                study.optimize(xgb_objective,
                               n_trials=n_trials,
                               timeout=timeout,
                               callbacks=[logger.optuna],
                               n_jobs=1)
                self.model = self.best_model.copy()
                del self.best_model
            else:
                _params = params.copy()

                model_extractor = ModelExtractor()
                res = xgb.cv(_params,
                             dtrain=dtrain,
                             folds=folds,
                             maximize=maximize,
                             callbacks=[logger.lgbm, model_extractor],
                             **fit_params)
                self.model = model_extractor.get_model()

                if maximize:
                    self.best_score = np.max(res[f'test-{main_metric}-mean'])
                    self.best_iteration = np.argmax(
                        res[f'test-{main_metric}-mean'])
                else:
                    self.best_score = np.min(res[f'test-{main_metric}-mean'])
                    self.best_iteration = np.argmin(
                        res[f'test-{main_metric}-mean'])

                for i in range(len(self.model)):
                    self.model[i].best_ntree_limit = self.best_iteration

            logger(
                f'[{self.best_iteration}]\tbest score is {self.best_score:.6f}'
            )

            if convert_to_sklearn:
                for i in range(len(self.model)):
                    self.model[i] = booster2sklearn(self.model[i],
                                                    self.model_type,
                                                    self.n_features,
                                                    self.n_classes)
                    if self.model_name == 'XGBClassifier':
                        self.model[i]._le = XGBoostLabelEncoder().fit(data[1])

        else:
            raise NotImplementedError(
                f'{self.model_name} is incompatible with .cv().')

        self.is_trained = True
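
The ModelExtractor callback in Example #4 exists to pull the per-fold boosters and best iteration out of lgb.cv. LightGBM 3.0+ exposes the same information directly via return_cvbooster=True, which is a simpler route when a recent version is available; a hedged sketch:

import lightgbm as lgb
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=400, random_state=0)
dtrain = lgb.Dataset(X, label=y)
params = {'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1}

res = lgb.cv(params, dtrain, num_boost_round=50, nfold=5,
             return_cvbooster=True)   # requires LightGBM >= 3.0
cvbooster = res['cvbooster']          # an lgb.CVBooster
fold_models = cvbooster.boosters      # one Booster per fold
# best_iteration is only meaningful when early stopping ran (-1 otherwise)
print(len(fold_models), cvbooster.best_iteration)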
Example #5
    def train(
            self,
            # Dataset
            train_data,
            valid_data=(),
            cat_features=None,
            # Model
            params={},
            fit_params={},
            convert_to_sklearn=True,
            # Probability calibration
            calibration=False,
            calibration_method='isotonic',
            calibration_cv=5,
            # Optuna
            tune_model=False,
            optuna_params=None,
            maximize=None,
            eval_metric=None,
            n_trials=None,
            timeout=None,
            lgbm_n_trials=[7, 20, 10, 6, 20],
            # Misc
            logger=None,
            n_jobs=-1):

        self._data_check(train_data)
        self._data_check(valid_data)
        self.n_features = train_data[0].shape[1]
        self.n_classes = len(np.unique(train_data[1]))
        if isinstance(train_data[0], pd.DataFrame):
            self.feature_names = train_data[0].columns.tolist()
        else:
            self.feature_names = [f'f{i}' for i in range(self.n_features)]

        main_metric, maximize = self._parse_eval_metric(params, maximize)
        if eval_metric is not None and maximize is None:
            raise ValueError('metric direction must be specified.')

        if isinstance(logger, (str, Path)):
            logger = LGBMLogger(logger, stdout=True, file=True)
        elif logger is None:
            logger = LGBMLogger(logger, stdout=True, file=False)
        assert isinstance(logger, LGBMLogger)

        # logger(f'params: \n{params}')
        # logger(f'fit_params: \n{fit_params}')

        if self.model_name in MODEL_ZOO['cat']:
            ''' CatBoost '''
            dtrain = CatPool(
                data=train_data[0],
                label=train_data[1],
                weight=train_data[2] if len(train_data) == 3 else None,
                cat_features=cat_features,
                thread_count=n_jobs)
            if len(valid_data) > 0:
                dvalid = CatPool(
                    data=valid_data[0],
                    label=valid_data[1],
                    weight=valid_data[2] if len(valid_data) == 3 else None,
                    cat_features=cat_features,
                    thread_count=n_jobs)
            else:
                dvalid = dtrain

            # Optuna integration
            if tune_model:
                _fit_params = fit_params.copy()
                if 'verbose_eval' in _fit_params.keys():
                    _fit_params.update({'verbose_eval': False})
                self.best_score = None
                self.best_model = None

                def cat_objective(trial):
                    _params = params.copy()
                    if optuna_params is None:
                        _params = PARAMS_ZOO[self.model_name](trial, _params)
                    else:
                        _params = optuna_params(trial, _params)

                    self.model = self.model_type(**_params)
                    self.model.fit(X=dtrain, eval_set=dvalid, **_fit_params)

                    if eval_metric is None:
                        score = self.model.get_best_score(
                        )['validation'][main_metric]
                    else:
                        score = eval_metric(self.model, valid_data)

                    if self.best_score is None:
                        self.best_score = score
                        self.best_model = self.model.copy()
                        self.best_iteration = self.model.get_best_iteration()
                        self.evals_result = self.model.get_evals_result()

                    elif maximize is True and self.best_score < score:
                        self.best_model = self.model.copy()
                        self.best_score = score
                        self.best_iteration = self.model.get_best_iteration()
                        self.evals_result = self.model.get_evals_result()
                    elif maximize is False and self.best_score > score:
                        self.best_model = self.model.copy()
                        self.best_score = score
                        self.best_iteration = self.model.get_best_iteration()
                        self.evals_result = self.model.get_evals_result()

                    return score

                study = optuna.create_study(
                    direction='maximize' if maximize else 'minimize')
                study.optimize(cat_objective,
                               n_trials=n_trials,
                               timeout=timeout,
                               callbacks=[logger.optuna],
                               n_jobs=1)
                self.model = self.best_model.copy()
                del self.best_model

            else:
                _params = params.copy()
                self.model = self.model_type(**_params)
                self.model.fit(X=dtrain, eval_set=dvalid, **fit_params)
                self.best_score = self.model.get_best_score(
                )['validation'][main_metric]
                self.evals_result = self.model.get_evals_result()
                self.best_iteration = self.model.get_best_iteration()

        elif self.model_name in MODEL_ZOO['lgb']:
            ''' LightGBM '''
            dtrain = lgb.Dataset(
                data=train_data[0],
                label=train_data[1],
                weight=train_data[2] if len(train_data) == 3 else None,
                categorical_feature=cat_features)
            if len(valid_data) > 0:
                dvalid = lgb.Dataset(
                    data=valid_data[0],
                    label=valid_data[1],
                    weight=valid_data[2] if len(valid_data) == 3 else None,
                    categorical_feature=cat_features)
            else:
                dvalid = dtrain

            # Optuna integration
            if tune_model:
                _fit_params = fit_params.copy()
                _params = params.copy()
                if 'verbose_eval' in _fit_params.keys():
                    _fit_params.update({'verbose_eval': False})
                _params.update({'metric': main_metric})
                tuner = lgb_tune.LightGBMTuner(
                    _params,
                    train_set=dtrain,
                    valid_sets=[dtrain, dvalid],
                    n_trials_config=lgbm_n_trials,
                    show_progress_bar=False,
                    optuna_callbacks=[logger.optuna],
                    **_fit_params)
                tuner.run()
                self.model = tuner.get_best_booster()
                self.best_score = tuner.best_score
                del tuner
            else:
                _params = params.copy()

                res = {}
                self.model = lgb.train(_params,
                                       train_set=dtrain,
                                       valid_sets=[dtrain, dvalid],
                                       callbacks=[logger.lgbm],
                                       evals_result=res,
                                       **fit_params)
                if maximize:
                    self.best_score = np.max(res['valid_1'][main_metric])
                else:
                    self.best_score = np.min(res['valid_1'][main_metric])
                self.evals_result = res

            self.best_iteration = self.model.best_iteration

            if convert_to_sklearn:
                self.model = booster2sklearn(self.model, self.model_type,
                                             self.n_features, self.n_classes)
                if self.model_name == 'LGBMClassifier':
                    self.model._le = _LGBMLabelEncoder().fit(
                        train_data[1])  # internal label encoder

        elif self.model_name in MODEL_ZOO['xgb']:
            ''' XGBoost '''
            dtrain = xgb.DMatrix(
                data=train_data[0],
                label=train_data[1],
                weight=train_data[2] if len(train_data) == 3 else None,
                nthread=n_jobs)
            if len(valid_data) > 0:
                dvalid = xgb.DMatrix(
                    data=valid_data[0],
                    label=valid_data[1],
                    weight=valid_data[2] if len(valid_data) == 3 else None,
                    nthread=n_jobs)
            else:
                dvalid = dtrain

            # Optuna integration
            if tune_model:
                _fit_params = fit_params.copy()
                if 'verbose_eval' in _fit_params.keys():
                    _fit_params.update({'verbose_eval': False})
                self.best_score = None
                self.best_model = None

                def xgb_objective(trial):
                    _params = params.copy()
                    if optuna_params is None:
                        _params = PARAMS_ZOO[self.model_name](trial, _params)
                    else:
                        _params = optuna_params(trial, _params)

                    res = {}
                    pruning_callback = optuna.integration.XGBoostPruningCallback(
                        trial, f'valid-{main_metric}')
                    self.model = xgb.train(_params,
                                           dtrain=dtrain,
                                           evals=[(dtrain, 'train'),
                                                  (dvalid, 'valid')],
                                           evals_result=res,
                                           callbacks=[pruning_callback],
                                           **_fit_params)

                    if eval_metric is None:
                        if maximize:
                            score = np.max(res['valid'][main_metric])
                        else:
                            score = np.min(res['valid'][main_metric])
                    else:
                        score = eval_metric(self.model, dvalid)

                    if self.best_score is None:
                        self.best_score = score
                        self.best_model = self.model.copy()
                        self.evals_result = res

                    elif maximize is True and self.best_score < score:
                        self.best_score = score
                        self.best_model = self.model.copy()
                        self.evals_result = res
                    elif maximize is False and self.best_score > score:
                        self.best_score = score
                        self.best_model = self.model.copy()
                        self.evals_result = res

                    return score

                study = optuna.create_study(
                    direction='maximize' if maximize else 'minimize')
                study.optimize(xgb_objective,
                               n_trials=n_trials,
                               timeout=timeout,
                               callbacks=[logger.optuna],
                               n_jobs=1)
                self.model = self.best_model.copy()
                del self.best_model
            else:
                _params = params.copy()

                res = {}
                self.model = xgb.train(_params,
                                       dtrain=dtrain,
                                       evals=[(dtrain, 'train'),
                                              (dvalid, 'valid')],
                                       callbacks=[logger.lgbm],
                                       evals_result=res,
                                       **fit_params)
                if maximize:
                    self.best_score = np.max(res['valid'][main_metric])
                else:
                    self.best_score = np.min(res['valid'][main_metric])
                self.evals_result = res

            self.best_iteration = self.model.best_ntree_limit

            if convert_to_sklearn:
                self.model = booster2sklearn(self.model, self.model_type,
                                             self.n_features, self.n_classes)
                if self.model_name == 'XGBClassifier':
                    self.model._le = XGBoostLabelEncoder().fit(train_data[1])

        else:
            ''' Other sklearn models '''
            if eval_metric is None:
                if self.n_classes == 2:
                    eval_metric = auc_metric
                    maximize = True
                else:
                    eval_metric = mse_metric
                    maximize = False
                print('eval_metric automatically selected.')

            # Optuna integration
            if tune_model:
                self.best_score = None
                self.best_model = None

                def sklearn_objective(trial):
                    _params = params.copy()
                    if optuna_params is None:
                        _params = PARAMS_ZOO[self.model_name](trial, _params)
                    else:
                        _params = optuna_params(trial, _params)

                    self.model = self.model_type(**_params)
                    self.model.fit(train_data[0], train_data[1], **fit_params)
                    score = eval_metric(self.model, valid_data)

                    if self.best_score is None:
                        self.best_score = score
                        self.best_model = deepcopy(self.model)

                    elif maximize is True and self.best_score < score:
                        self.best_model = deepcopy(self.model)
                        self.best_score = score
                    elif maximize is False and self.best_score > score:
                        self.best_model = deepcopy(self.model)
                        self.best_score = score

                    return score

                study = optuna.create_study(
                    direction='maximize' if maximize else 'minimize')
                study.optimize(sklearn_objective,
                               n_trials=n_trials,
                               timeout=timeout,
                               callbacks=[logger.optuna],
                               n_jobs=1)
                self.model = deepcopy(self.best_model)
                del self.best_model
            else:
                self.model = self.model_type(**params)
                self.model.fit(train_data[0], train_data[1], **fit_params)
                self.best_score = eval_metric(self.model, valid_data)
                logger(f'[None]\tbest score is {self.best_score:.6f}')

        self.is_trained = True

        if calibration:
            # probability calibration options are accepted but not implemented
            pass
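
The calibration branch above is accepted but left unimplemented (pass). If it were filled in, the natural counterpart to the calibration_method/calibration_cv arguments would be scikit-learn's CalibratedClassifierCV; a standalone sketch under that assumption, not the author's code:

from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=0)

# method mirrors calibration_method ('isotonic' or 'sigmoid'),
# cv mirrors calibration_cv
calibrated = CalibratedClassifierCV(LGBMClassifier(n_estimators=50),
                                    method='isotonic', cv=5)
calibrated.fit(X, y)
print(calibrated.predict_proba(X[:3]))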