def fit(self, X, y):
    """Fit a binary LightGBM model via cross-validation.

    Encodes ``y`` with an internal label encoder, builds a ``lgbm.Dataset``
    and runs ``lgbm.cv`` with the hyper-parameters stored on the instance.
    The CV results are stored in ``self.results``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like of shape (n_samples,)
        Target labels (any hashable values; encoded internally).

    Returns
    -------
    self : object
        The fitted estimator (sklearn convention; previously nothing was
        returned, which broke call chaining).
    """
    self.classes_ = np.unique(y)
    self.n_classes_ = len(self.classes_)
    # Internal encoder maps arbitrary labels to contiguous integers.
    self._le = _LGBMLabelEncoder().fit(y)
    training_labels = self._le.transform(y)
    xgdmat = lgbm.Dataset(X, label=training_labels)
    # xgdmat.construct()
    # NOTE(review): objective is forced to 'binary' even when
    # self.n_classes_ > 2 — the commented-out multiclass handling below
    # suggests multiclass support was intended but never finished. Confirm.
    self.param_map.update({'objective': 'binary'})
    # print('before lgbm.cv')
    # print(self.param_map)
    # TODO confirm: multiclass handling
    # if self.n_classes_ > 2:
    #     self.param_map.update({'num_class': self.n_classes_})
    #     self.param_map.update({'objective': 'multi:softprob'})
    # Note: lgbm.cv resets the value of max_bin to 255
    # Keyword arguments instead of 19 positional ones: the original call
    # relied on the exact positional order of lightgbm.cv's signature and
    # would silently misbind if the library reordered parameters.
    self.results = lgbm.cv(self.param_map, xgdmat,
                           num_boost_round=self.num_boost_round,
                           folds=self.folds,
                           nfold=self.nfold,
                           stratified=self.stratified,
                           shuffle=self.shuffle,
                           metrics=self.metrics,
                           fobj=self.fobj,
                           feval=self.feval,
                           init_model=self.init_model,
                           feature_name=self.feature_name,
                           categorical_feature=self.categorical_feature,
                           early_stopping_rounds=self.early_stopping_rounds,
                           fpreproc=self.fpreproc,
                           verbose_eval=self.verbose_eval,
                           show_stdv=self.show_stdv,
                           seed=self.seed,
                           callbacks=self.callbacks)
    return self
def fit(self, X, y):
    """Fit a binary LightGBM model via cross-validation.

    Encodes ``y`` with an internal label encoder, builds a ``lgbm.Dataset``
    and runs ``lgbm.cv`` with the hyper-parameters stored on the instance.
    The CV results are stored in ``self.results``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like of shape (n_samples,)
        Target labels (any hashable values; encoded internally).

    Returns
    -------
    self : object
        The fitted estimator (sklearn convention; previously nothing was
        returned, which broke call chaining).
    """
    self.classes_ = np.unique(y)
    self.n_classes_ = len(self.classes_)
    # Internal encoder maps arbitrary labels to contiguous integers.
    self._le = _LGBMLabelEncoder().fit(y)
    training_labels = self._le.transform(y)
    xgdmat = lgbm.Dataset(X, label=training_labels)
    # xgdmat.construct()
    # NOTE(review): objective is forced to 'binary' even when
    # self.n_classes_ > 2 — the commented-out multiclass handling below
    # suggests multiclass support was intended but never finished. Confirm.
    self.param_map.update({'objective': 'binary'})
    # print('before lgbm.cv')
    # print(self.param_map)
    # TODO confirm: multiclass handling
    # if self.n_classes_ > 2:
    #     self.param_map.update({'num_class': self.n_classes_})
    #     self.param_map.update({'objective': 'multi:softprob'})
    # Note: lgbm.cv resets the value of max_bin to 255
    # Keyword arguments instead of 19 positional ones: the original call
    # relied on the exact positional order of lightgbm.cv's signature and
    # would silently misbind if the library reordered parameters.
    self.results = lgbm.cv(self.param_map, xgdmat,
                           num_boost_round=self.num_boost_round,
                           folds=self.folds,
                           nfold=self.nfold,
                           stratified=self.stratified,
                           shuffle=self.shuffle,
                           metrics=self.metrics,
                           fobj=self.fobj,
                           feval=self.feval,
                           init_model=self.init_model,
                           feature_name=self.feature_name,
                           categorical_feature=self.categorical_feature,
                           early_stopping_rounds=self.early_stopping_rounds,
                           fpreproc=self.fpreproc,
                           verbose_eval=self.verbose_eval,
                           show_stdv=self.show_stdv,
                           seed=self.seed,
                           callbacks=self.callbacks)
    return self
def fit(self, X, y,
        sample_weight=None, init_score=None,
        eval_set=None, eval_names=None, eval_sample_weight=None,
        eval_class_weight=None, eval_init_score=None, eval_metric=None,
        early_stopping_rounds=None, verbose=True,
        feature_name='auto', categorical_feature='auto', callbacks=None):
    """Docstring is inherited from the LGBMModel."""
    # Reject non-finite targets and non-classification target types early.
    _LGBMAssertAllFinite(y)
    _LGBMCheckClassificationTargets(y)
    # Encode arbitrary class labels into contiguous integers 0..n_classes-1;
    # the encoder is kept so predict() can map back to original labels.
    self._le = _LGBMLabelEncoder().fit(y)
    _y = self._le.transform(y)
    # Map from original label -> encoded label, used to re-key class_weight.
    self._class_map = dict(
        zip_(self._le.classes_, self._le.transform(self._le.classes_)))
    if isinstance(self.class_weight, dict):
        # class_weight was keyed by original labels; re-key to encoded ones.
        self._class_weight = {
            self._class_map[k]: v
            for k, v in self.class_weight.items()
        }
    self._classes = self._le.classes_
    self._n_classes = len(self._classes)
    if self._n_classes > 2:
        # Switch to using a multiclass objective in the underlying LGBM instance
        ova_aliases = ("multiclassova", "multiclass_ova", "ova", "ovr")
        if self._objective not in ova_aliases and not callable(
                self._objective):
            self._objective = "multiclass"
        # Translate binary metric aliases to their multiclass counterparts.
        if eval_metric in ('logloss', 'binary_logloss'):
            eval_metric = "multi_logloss"
        elif eval_metric in ('error', 'binary_error'):
            eval_metric = "multi_error"
    else:
        # Binary problem: translate multiclass metric aliases to binary ones.
        if eval_metric in ('logloss', 'multi_logloss'):
            eval_metric = 'binary_logloss'
        elif eval_metric in ('error', 'multi_error'):
            eval_metric = 'binary_error'
    if eval_set is not None:
        # Accept a single (X, y) tuple as well as a list of them.
        if isinstance(eval_set, tuple):
            eval_set = [eval_set]
        # Encode validation labels with the same encoder as training labels.
        # NOTE(review): when eval_set is a list this rewrites the caller's
        # list in place — confirm that mutation is acceptable to callers.
        for i, (valid_x, valid_y) in enumerate(eval_set):
            if valid_x is X and valid_y is y:
                # The validation set is the training set itself:
                # reuse the already-encoded labels.
                eval_set[i] = (valid_x, _y)
            else:
                eval_set[i] = (valid_x, self._le.transform(valid_y))
    super(LGBMClassifier, self).fit(X, _y,
                                    sample_weight=sample_weight,
                                    init_score=init_score,
                                    eval_set=eval_set,
                                    eval_names=eval_names,
                                    eval_sample_weight=eval_sample_weight,
                                    eval_class_weight=eval_class_weight,
                                    eval_init_score=eval_init_score,
                                    eval_metric=eval_metric,
                                    early_stopping_rounds=early_stopping_rounds,
                                    verbose=verbose,
                                    feature_name=feature_name,
                                    categorical_feature=categorical_feature,
                                    callbacks=callbacks)
    return self
def cv(self,
       # Dataset
       data, cat_features=None, groups=None, folds=KFold(n_splits=5),
       # Model
       params=None, fit_params=None, convert_to_sklearn=True,
       # Optuna
       tune_model=False, optuna_params=None, maximize=None,
       eval_metric=None, n_trials=None, timeout=None,
       lgbm_n_trials=None,
       # Misc
       logger=None, n_jobs=-1):
    """Cross-validate the wrapped model, optionally tuning it with Optuna.

    Parameters
    ----------
    data : tuple
        (X, y) or (X, y, sample_weight).
    cat_features : list, optional
        Categorical feature names/indices (LightGBM only).
    groups : optional
        Accepted but currently unused.  # NOTE(review): confirm intent.
    folds : cross-validator
        Fold generator passed to the underlying cv routine.
    params, fit_params : dict, optional
        Model / fit keyword arguments (default: empty dicts).
    convert_to_sklearn : bool
        Wrap each fold booster in its sklearn estimator class.
    tune_model : bool
        Run Optuna hyper-parameter search instead of a single cv run.
    maximize : bool or None
        Metric direction; required when a custom eval_metric is given.
    lgbm_n_trials : list, optional
        Per-step trial counts for LightGBMTunerCV (default [7,20,10,6,20]).

    Side effects: sets self.model, self.best_iteration, self.best_score,
    self.evals_result, self.is_trained.

    Raises
    ------
    ValueError
        If eval_metric is given without a metric direction.
    NotImplementedError
        For catboost, custom eval_metric in the xgb path, or unknown models.
    """
    # Fix: mutable default arguments ({} / []) were shared across calls;
    # use None sentinels and create fresh objects per call.
    params = {} if params is None else params
    fit_params = {} if fit_params is None else fit_params
    lgbm_n_trials = [7, 20, 10, 6, 20] if lgbm_n_trials is None else lgbm_n_trials

    self._data_check(data)
    self.n_features = data[0].shape[1]
    self.n_classes = len(np.unique(data[1]))
    if isinstance(data[0], pd.DataFrame):
        self.feature_names = data[0].columns.tolist()
    else:
        self.feature_names = [f'f{i}' for i in range(self.n_features)]
    main_metric, maximize = self._parse_eval_metric(params, maximize)
    if eval_metric is not None and maximize is None:
        raise ValueError('metric direction must be specified.')
    # Normalize logger: path-like -> file+stdout logger, None -> stdout only.
    if isinstance(logger, (str, Path)):
        logger = LGBMLogger(logger, stdout=True, file=True)
    elif logger is None:
        logger = LGBMLogger(logger, stdout=True, file=False)
    assert isinstance(logger, LGBMLogger)
    # logger(f'params: \n{params}')
    # logger(f'fit_params: \n{fit_params}')

    if self.model_name in MODEL_ZOO['cat']:
        raise NotImplementedError('catboost is incompatible with .cv().')

    elif self.model_name in MODEL_ZOO['lgb']:
        ''' LightGBM '''
        dtrain = lgb.Dataset(data=data[0], label=data[1],
                             weight=data[2] if len(data) == 3 else None,
                             categorical_feature=cat_features)
        # Optuna integration
        if tune_model:
            _fit_params = fit_params.copy()
            _params = params.copy()
            _params.update({'metric': main_metric})
            # The tuner logs through its own callback; silence lgb output.
            if 'verbose_eval' in _fit_params.keys():
                _fit_params.update({'verbose_eval': False})
            tuner = lgb_tune.LightGBMTunerCV(_params, dtrain,
                                             folds=folds,
                                             n_trials_config=lgbm_n_trials,
                                             return_cvbooster=True,
                                             optuna_callbacks=[logger.optuna],
                                             show_progress_bar=False,
                                             **_fit_params)
            tuner.run()
            self.model = tuner.get_best_booster().boosters
            self.best_iteration = tuner.get_best_booster().best_iteration
            self.best_score = tuner.best_score
            del tuner
        else:
            _params = params.copy()
            # ModelExtractor callback captures the per-fold boosters.
            model_extractor = ModelExtractor()
            res = lgb.cv(_params, train_set=dtrain, folds=folds,
                         callbacks=[logger.lgbm, model_extractor],
                         **fit_params)
            self.model = model_extractor.get_model().boosters
            self.best_iteration = model_extractor.get_best_iteration()
            self.evals_result = res
            if maximize:
                self.best_score = np.max(res[f'{main_metric}-mean'])
            else:
                self.best_score = np.min(res[f'{main_metric}-mean'])
        # Stamp the chosen iteration on each fold booster.
        for i in range(len(self.model)):
            self.model[i].best_iteration = self.best_iteration
        logger(f'[{self.best_iteration}]\tbest score is {self.best_score:.6f}')
        if convert_to_sklearn:
            for i in range(len(self.model)):
                self.model[i] = booster2sklearn(self.model[i],
                                                self.model_type,
                                                self.n_features,
                                                self.n_classes)
                if self.model_name == 'LGBMClassifier':
                    self.model[i]._le = _LGBMLabelEncoder().fit(data[1])

    elif self.model_name in MODEL_ZOO['xgb']:
        ''' XGBoost '''
        dtrain = xgb.DMatrix(data=data[0], label=data[1],
                             weight=data[2] if len(data) == 3 else None,
                             nthread=n_jobs)
        # Optuna integration
        if tune_model:
            _fit_params = fit_params.copy()
            if 'verbose_eval' in _fit_params.keys():
                _fit_params.update({'verbose_eval': False})
            self.best_score = None
            self.best_model = None
            self.best_iteration = 0

            def xgb_objective(trial):
                _params = params.copy()
                if optuna_params is None:
                    # Fix: was PARAMS_ZOO[...](trial, params) — passed the
                    # shared original dict instead of the local copy.
                    _params = PARAMS_ZOO[self.model_name](trial, _params)
                else:
                    _params = optuna_params(trial, _params)
                model_extractor = ModelExtractor()
                res = xgb.cv(_params, dtrain=dtrain, folds=folds,
                             maximize=maximize,
                             callbacks=[model_extractor],
                             **_fit_params)
                self.model = model_extractor.get_model()
                if eval_metric is None:
                    if maximize:
                        score = np.max(res[f'test-{main_metric}-mean'])
                        best_iteration = np.argmax(
                            res[f'test-{main_metric}-mean'])
                    else:
                        score = np.min(res[f'test-{main_metric}-mean'])
                        best_iteration = np.argmin(
                            res[f'test-{main_metric}-mean'])
                else:
                    raise NotImplementedError(
                        'Do not use custom eval_metric for .cv() :(')
                if self.best_score is None:
                    # First trial: record everything, including the iteration.
                    # Fix: best_iteration was not recorded here, so if the
                    # first trial stayed best it was reported as 0.
                    self.best_score = score
                    self.best_model = self.model.copy()
                    self.best_iteration = best_iteration
                    self.evals_result = res
                if maximize == True and self.best_score < score:
                    self.best_score = score
                    self.best_model = self.model.copy()
                    self.best_iteration = best_iteration
                    self.evals_result = res
                elif maximize == False and self.best_score > score:
                    self.best_score = score
                    self.best_model = self.model.copy()
                    self.best_iteration = best_iteration
                    self.evals_result = res
                return score

            study = optuna.create_study(
                direction='maximize' if maximize else 'minimize')
            study.optimize(xgb_objective, n_trials=n_trials, timeout=timeout,
                           callbacks=[logger.optuna], n_jobs=1)
            self.model = self.best_model.copy()
            del self.best_model
        else:
            _params = params.copy()
            model_extractor = ModelExtractor()
            res = xgb.cv(_params, dtrain=dtrain, folds=folds,
                         maximize=maximize,
                         callbacks=[logger.lgbm, model_extractor],
                         **fit_params)
            self.model = model_extractor.get_model()
            if maximize:
                self.best_score = np.max(res[f'test-{main_metric}-mean'])
                self.best_iteration = np.argmax(
                    res[f'test-{main_metric}-mean'])
            else:
                self.best_score = np.min(res[f'test-{main_metric}-mean'])
                self.best_iteration = np.argmin(
                    res[f'test-{main_metric}-mean'])
        # Stamp the chosen tree limit on each fold booster.
        for i in range(len(self.model)):
            self.model[i].best_ntree_limit = self.best_iteration
        logger(f'[{self.best_iteration}]\tbest score is {self.best_score:.6f}')
        if convert_to_sklearn:
            for i in range(len(self.model)):
                self.model[i] = booster2sklearn(self.model[i],
                                                self.model_type,
                                                self.n_features,
                                                self.n_classes)
                if self.model_name == 'XGBClassifier':
                    self.model[i]._le = XGBoostLabelEncoder().fit(data[1])

    else:
        raise NotImplementedError(
            f'{self.model_name} is incompatible with .cv().')

    self.is_trained = True
def train(self,
          # Dataset
          train_data, valid_data=(), cat_features=None,
          # Model
          params=None, fit_params=None, convert_to_sklearn=True,
          # Probability calibration
          calibration=False, calibration_method='isotonic', calibration_cv=5,
          # Optuna
          tune_model=False, optuna_params=None, maximize=None,
          eval_metric=None, n_trials=None, timeout=None,
          lgbm_n_trials=None,
          # Misc
          logger=None, n_jobs=-1):
    """Train the wrapped model on (X, y[, w]) tuples, optionally tuning it.

    Parameters
    ----------
    train_data : tuple
        (X, y) or (X, y, sample_weight).
    valid_data : tuple
        Validation data in the same shape; empty tuple reuses train_data.
    params, fit_params : dict, optional
        Model / fit keyword arguments (default: empty dicts).
    tune_model : bool
        Run an Optuna search (or LightGBMTuner) instead of a single fit.
    maximize : bool or None
        Metric direction; required when a custom eval_metric is given.
    calibration, calibration_method, calibration_cv :
        Accepted but not implemented yet (see TODO at the bottom).

    Side effects: sets self.model, self.best_score, self.best_iteration,
    self.evals_result, self.is_trained.

    Raises
    ------
    ValueError
        If eval_metric is given without a metric direction.
    """
    # Fix: mutable default arguments ({} / []) were shared across calls.
    params = {} if params is None else params
    fit_params = {} if fit_params is None else fit_params
    lgbm_n_trials = [7, 20, 10, 6, 20] if lgbm_n_trials is None else lgbm_n_trials

    self._data_check(train_data)
    self._data_check(valid_data)
    self.n_features = train_data[0].shape[1]
    self.n_classes = len(np.unique(train_data[1]))
    if isinstance(train_data[0], pd.DataFrame):
        self.feature_names = train_data[0].columns.tolist()
    else:
        self.feature_names = [f'f{i}' for i in range(self.n_features)]
    main_metric, maximize = self._parse_eval_metric(params, maximize)
    if eval_metric is not None and maximize is None:
        raise ValueError('metric direction must be specified.')
    # Normalize logger: path-like -> file+stdout logger, None -> stdout only.
    if isinstance(logger, (str, Path)):
        logger = LGBMLogger(logger, stdout=True, file=True)
    elif logger is None:
        logger = LGBMLogger(logger, stdout=True, file=False)
    assert isinstance(logger, LGBMLogger)
    # logger(f'params: \n{params}')
    # logger(f'fit_params: \n{fit_params}')

    if self.model_name in MODEL_ZOO['cat']:
        ''' Catboost '''
        dtrain = CatPool(
            data=train_data[0], label=train_data[1],
            weight=train_data[2] if len(train_data) == 3 else None,
            cat_features=cat_features, thread_count=n_jobs)
        if len(valid_data) > 0:
            # Fix: weight guard previously tested len(train_data) == 3,
            # which crashed (or silently dropped weights) whenever train
            # and valid tuples had different lengths.
            dvalid = CatPool(
                data=valid_data[0], label=valid_data[1],
                weight=valid_data[2] if len(valid_data) == 3 else None,
                cat_features=cat_features, thread_count=n_jobs)
        else:
            dvalid = dtrain
        # Optuna integration
        if tune_model:
            _fit_params = fit_params.copy()
            if 'verbose_eval' in _fit_params.keys():
                _fit_params.update({'verbose_eval': False})
            self.best_score = None
            self.best_model = None

            def cat_objective(trial):
                _params = params.copy()
                if optuna_params is None:
                    _params = PARAMS_ZOO[self.model_name](trial, _params)
                else:
                    _params = optuna_params(trial, _params)
                self.model = self.model_type(**_params)
                self.model.fit(X=dtrain, eval_set=dvalid, **_fit_params)
                if eval_metric is None:
                    score = self.model.get_best_score(
                        )['validation'][main_metric]
                else:
                    score = eval_metric(self.model, valid_data)
                if self.best_score is None:
                    self.best_score = score
                    self.best_model = self.model.copy()
                    self.best_iteration = self.model.get_best_iteration()
                    self.evals_result = self.model.get_evals_result()
                if maximize == True and self.best_score < score:
                    self.best_model = self.model.copy()
                    self.best_score = score
                    self.best_iteration = self.model.get_best_iteration()
                    self.evals_result = self.model.get_evals_result()
                elif maximize == False and self.best_score > score:
                    self.best_model = self.model.copy()
                    self.best_score = score
                    self.best_iteration = self.model.get_best_iteration()
                    self.evals_result = self.model.get_evals_result()
                return score

            study = optuna.create_study(
                direction='maximize' if maximize else 'minimize')
            study.optimize(cat_objective, n_trials=n_trials, timeout=timeout,
                           callbacks=[logger.optuna], n_jobs=1)
            self.model = self.best_model.copy()
            del self.best_model
        else:
            _params = params.copy()
            self.model = self.model_type(**_params)
            self.model.fit(X=dtrain, eval_set=dvalid, **fit_params)
            self.best_score = self.model.get_best_score(
                )['validation'][main_metric]
            self.evals_result = self.model.get_evals_result()
            self.best_iteration = self.model.get_best_iteration()

    elif self.model_name in MODEL_ZOO['lgb']:
        ''' LightGBM '''
        dtrain = lgb.Dataset(
            data=train_data[0], label=train_data[1],
            weight=train_data[2] if len(train_data) == 3 else None,
            categorical_feature=cat_features)
        if len(valid_data) > 0:
            # Fix: see the CatPool branch — guard on valid_data's length.
            dvalid = lgb.Dataset(
                data=valid_data[0], label=valid_data[1],
                weight=valid_data[2] if len(valid_data) == 3 else None,
                categorical_feature=cat_features)
        else:
            dvalid = dtrain
        # Optuna integration
        if tune_model:
            _fit_params = fit_params.copy()
            _params = params.copy()
            if 'verbose_eval' in _fit_params.keys():
                _fit_params.update({'verbose_eval': False})
            _params.update({'metric': main_metric})
            tuner = lgb_tune.LightGBMTuner(_params, train_set=dtrain,
                                           valid_sets=[dtrain, dvalid],
                                           n_trials_config=lgbm_n_trials,
                                           show_progress_bar=False,
                                           optuna_callbacks=[logger.optuna],
                                           **_fit_params)
            tuner.run()
            self.model = tuner.get_best_booster()
            self.best_score = tuner.best_score
            del tuner
        else:
            _params = params.copy()
            res = {}
            self.model = lgb.train(_params, train_set=dtrain,
                                   valid_sets=[dtrain, dvalid],
                                   callbacks=[logger.lgbm],
                                   evals_result=res,
                                   **fit_params)
            # 'valid_1' is the second entry of valid_sets, i.e. dvalid.
            if maximize:
                self.best_score = np.max(res['valid_1'][main_metric])
            else:
                self.best_score = np.min(res['valid_1'][main_metric])
            self.evals_result = res
        self.best_iteration = self.model.best_iteration
        if convert_to_sklearn:
            self.model = booster2sklearn(self.model, self.model_type,
                                         self.n_features, self.n_classes)
            if self.model_name == 'LGBMClassifier':
                # internal label encoder
                self.model._le = _LGBMLabelEncoder().fit(train_data[1])

    elif self.model_name in MODEL_ZOO['xgb']:
        ''' XGBoost '''
        dtrain = xgb.DMatrix(
            data=train_data[0], label=train_data[1],
            weight=train_data[2] if len(train_data) == 3 else None,
            nthread=n_jobs)
        if len(valid_data) > 0:
            # Fix: see the CatPool branch — guard on valid_data's length.
            dvalid = xgb.DMatrix(
                data=valid_data[0], label=valid_data[1],
                weight=valid_data[2] if len(valid_data) == 3 else None,
                nthread=n_jobs)
        else:
            dvalid = dtrain
        # Optuna integration
        if tune_model:
            _fit_params = fit_params.copy()
            if 'verbose_eval' in _fit_params.keys():
                _fit_params.update({'verbose_eval': False})
            self.best_score = None
            self.best_model = None

            def xgb_objective(trial):
                _params = params.copy()
                if optuna_params is None:
                    _params = PARAMS_ZOO[self.model_name](trial, _params)
                else:
                    _params = optuna_params(trial, _params)
                res = {}
                pruning_callback = optuna.integration.XGBoostPruningCallback(
                    trial, f'valid-{main_metric}')
                self.model = xgb.train(_params, dtrain=dtrain,
                                       evals=[(dtrain, 'train'),
                                              (dvalid, 'valid')],
                                       evals_result=res,
                                       callbacks=[pruning_callback],
                                       **_fit_params)
                if eval_metric is None:
                    if maximize:
                        score = np.max(res['valid'][main_metric])
                    else:
                        score = np.min(res['valid'][main_metric])
                else:
                    score = eval_metric(self.model, dvalid)
                if self.best_score is None:
                    self.best_score = score
                    self.best_model = self.model.copy()
                    self.evals_result = res
                if maximize == True and self.best_score < score:
                    self.best_score = score
                    self.best_model = self.model.copy()
                    self.evals_result = res
                elif maximize == False and self.best_score > score:
                    self.best_score = score
                    self.best_model = self.model.copy()
                    self.evals_result = res
                return score

            study = optuna.create_study(
                direction='maximize' if maximize else 'minimize')
            study.optimize(xgb_objective, n_trials=n_trials, timeout=timeout,
                           callbacks=[logger.optuna], n_jobs=1)
            self.model = self.best_model.copy()
            del self.best_model
        else:
            _params = params.copy()
            res = {}
            self.model = xgb.train(_params, dtrain=dtrain,
                                   evals=[(dtrain, 'train'),
                                          (dvalid, 'valid')],
                                   callbacks=[logger.lgbm],
                                   evals_result=res,
                                   **fit_params)
            if maximize:
                self.best_score = np.max(res['valid'][main_metric])
            else:
                self.best_score = np.min(res['valid'][main_metric])
            self.evals_result = res
        self.best_iteration = self.model.best_ntree_limit
        if convert_to_sklearn:
            self.model = booster2sklearn(self.model, self.model_type,
                                         self.n_features, self.n_classes)
            if self.model_name == 'XGBClassifier':
                self.model._le = XGBoostLabelEncoder().fit(train_data[1])

    else:
        ''' Other sklearn models '''
        if eval_metric is None:
            # NOTE(review): mse_metric for the non-binary case looks like a
            # regression default applied to multiclass — confirm intent.
            if self.n_classes == 2:
                eval_metric = auc_metric
                maximize = True
            else:
                eval_metric = mse_metric
                maximize = False
            print('eval_metric automatically selected.')
        # Optuna integration
        if tune_model:
            self.best_score = None
            self.best_model = None

            def sklearn_objective(trial):
                _params = params.copy()
                if optuna_params is None:
                    # Fix: was PARAMS_ZOO[...](trial, params) — passed the
                    # shared original dict instead of the local copy,
                    # inconsistent with the other branches.
                    _params = PARAMS_ZOO[self.model_name](trial, _params)
                else:
                    _params = optuna_params(trial, _params)
                self.model = self.model_type(**_params)
                self.model.fit(train_data[0], train_data[1], **fit_params)
                score = eval_metric(self.model, valid_data)
                if self.best_score is None:
                    self.best_score = score
                    self.best_model = deepcopy(self.model)
                if maximize == True and self.best_score < score:
                    self.best_model = deepcopy(self.model)
                    self.best_score = score
                elif maximize == False and self.best_score > score:
                    self.best_model = deepcopy(self.model)
                    self.best_score = score
                return score

            study = optuna.create_study(
                direction='maximize' if maximize else 'minimize')
            study.optimize(sklearn_objective, n_trials=n_trials,
                           timeout=timeout,
                           callbacks=[logger.optuna], n_jobs=1)
            self.model = deepcopy(self.best_model)
            del self.best_model
        else:
            self.model = self.model_type(**params)
            self.model.fit(train_data[0], train_data[1], **fit_params)
            self.best_score = eval_metric(self.model, valid_data)
        logger(f'[None]\tbest score is {self.best_score:.6f}')

    self.is_trained = True
    if calibration:
        # TODO: probability calibration (calibration_method / calibration_cv)
        # is accepted by the signature but not implemented yet.
        pass