import copy
import os
import time
from datetime import datetime
from logging import Logger, getLogger
from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union

import numpy as np
import pandas as pd
from catboost import CatBoost
from lightgbm import LGBMModel
from optuna.integration import lightgbm as optuna_lgb
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import BaseCrossValidator, check_cv
from sklearn.utils import multiclass

# nyaggle-internal helpers used below (e.g. Experiment, ExperimentProxy,
# ExperimentResult, CVResult, _dispatch_models, _check_input, autoprep_gbdt,
# load_features, make_submission_df, plot_importance, _save_model,
# _get_gbdt_importance, convert_input, convert_input_vector) are imported
# from elsewhere in the package and omitted here.


def run_experiment(model_params: Dict[str, Any],
                   X_train: pd.DataFrame,
                   y: pd.Series,
                   X_test: Optional[pd.DataFrame] = None,
                   logging_directory: str = 'output/{time}',
                   if_exists: str = 'error',
                   eval_func: Optional[Callable] = None,
                   algorithm_type: Union[str, Type[BaseEstimator]] = 'lgbm',
                   fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
                   cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                   groups: Optional[pd.Series] = None,
                   categorical_feature: Optional[List[str]] = None,
                   sample_submission: Optional[pd.DataFrame] = None,
                   submission_filename: Optional[str] = None,
                   type_of_target: str = 'auto',
                   feature_list: Optional[List[Union[int, str]]] = None,
                   feature_directory: Optional[str] = None,
                   inherit_experiment: Optional[Experiment] = None,
                   with_auto_hpo: bool = False,
                   with_auto_prep: bool = False,
                   with_mlflow: bool = False):
    """
    Evaluate metrics by cross-validation and store the results (log, out-of-fold
    prediction, test prediction, feature importance plot and submission file)
    under the specified directory.

    One of the following estimators is used (automatically dispatched by
    ``type_of_target(y)`` and ``algorithm_type``):

    * LGBMClassifier
    * LGBMRegressor
    * CatBoostClassifier
    * CatBoostRegressor
    * XGBClassifier
    * XGBRegressor

    The output files are laid out as follows:

    .. code-block:: none

      <logging_directory>/
          log.txt                  <== Logging file
          importance.png           <== Feature importance plot generated by nyaggle.util.plot_importance
          oof_prediction.npy       <== Out-of-fold prediction in numpy array format
          test_prediction.npy      <== Test prediction in numpy array format
          submission.csv           <== Submission csv file
          metrics.json             <== Metrics
          params.json              <== Parameters
          models/
              fold1                <== The trained model in fold 1
              ...

    Args:
        model_params:
            Parameters passed to the constructor of the classifier/regressor object
            (e.g. LGBMRegressor).
        X_train:
            Training data. Categorical features should be cast to the pandas
            categorical type or encoded to integers.
        y:
            Target.
        X_test:
            Test data (optional). If specified, prediction on the test data is
            performed using an ensemble of the fold models.
        logging_directory:
            Path to the directory where the output of the experiment is stored.
        if_exists:
            How to behave if the logging directory already exists.

            - error: Raise a ValueError.
            - replace: Delete the logging directory before logging.
            - append: Append to the existing experiment.
            - rename: Rename the current directory by adding a "_1", "_2"... suffix.
        fit_params:
            Parameters passed to the fit method of the estimator. If a dict is passed,
            the same parameters (except ``eval_set``) are used for each fold.
            If a callable is passed, the return value of
            ``fit_params(fold_id, train_index, test_index)`` is used for each fold.
        eval_func:
            Function used for logging and for calculating the returned scores.
            This parameter isn't passed to the GBDT itself, so you should set
            ``objective`` and ``eval_metric`` separately if needed.
            If ``eval_func`` is None, ``roc_auc_score`` or ``mean_squared_error``
            is used by default.
        algorithm_type:
            Type of gradient boosting library used: "lgbm" (lightgbm),
            "cat" (catboost) or "xgb" (xgboost).
        cv:
            int, cross-validation generator or an iterable which determines the
            cross-validation splitting strategy.

            - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
            - integer, to specify the number of folds in a ``(Stratified)KFold``,
            - CV splitter (an instance of ``BaseCrossValidator``),
            - An iterable yielding (train, test) splits as arrays of indices.
        groups:
            Group labels for the samples. Only used in conjunction with a
            "Group" cv instance (e.g., ``GroupKFold``).
        sample_submission:
            A sample dataframe aligned with the test data (in Kaggle, usually
            available as sample_submission.csv). The submission file will be
            created with the same schema as this dataframe.
        submission_filename:
            The name of the submission file created under the logging directory.
            If ``None``, the basename of the logging directory is used as the filename.
        categorical_feature:
            List of categorical column names. If ``None``, categorical columns
            are automatically determined by dtype.
        type_of_target:
            The type of the target variable. If ``auto``, the type is inferred by
            ``sklearn.utils.multiclass.type_of_target``. Otherwise, ``binary``,
            ``continuous``, or ``multiclass`` are supported.
        feature_list:
            The list of feature ids saved through the nyaggle.feature_store module.
        feature_directory:
            The location of the stored features. Only used if feature_list is not empty.
        inherit_experiment:
            An experiment object used to log results. If not ``None``, all logs
            in this function are treated as a part of that experiment.
        with_auto_prep:
            If True, the input datasets will be copied and automatic preprocessing
            will be performed on them. For example, if ``algorithm_type = 'cat'``,
            all missing values in categorical features will be filled.
        with_auto_hpo:
            If True, model parameters will be automatically tuned using optuna
            (only available for lightgbm).
        with_mlflow:
            If True, `mlflow tracking <https://www.mlflow.org/docs/latest/tracking.html>`_ is used.
            One instance of ``nyaggle.experiment.Experiment`` corresponds to one run in mlflow.
            Note that all output files are located both in ``logging_directory``
            and in mlflow's directory (``mlruns`` by default).
    :return:
        Namedtuple with the following members

        * oof_prediction:
            numpy array, shape (len(X_train),)
            Predicted values on the out-of-fold validation data.
        * test_prediction:
            numpy array, shape (len(X_test),)
            Predicted values on the test data. ``None`` if X_test is ``None``.
        * metrics:
            list of float, shape (nfolds+1,)
            ``metrics[i]`` denotes the validation score in the i-th fold.
            ``metrics[-1]`` is the overall score.
        * models:
            list of objects, shape (nfolds,)
            Trained model for each fold.
        * importance:
            list of pd.DataFrame, feature importance for each fold (type="gain").
        * time:
            Training time in seconds.
        * submit_df:
            The dataframe saved as submission.csv.
    """
    start_time = time.time()
    cv = check_cv(cv, y)

    if feature_list:
        X = pd.concat([X_train, X_test]) if X_test is not None else X_train
        X.reset_index(drop=True, inplace=True)
        X = load_features(X, feature_list, directory=feature_directory)
        ntrain = len(X_train)
        X_train, X_test = X.iloc[:ntrain, :], X.iloc[ntrain:, :].reset_index(drop=True)

    _check_input(X_train, y, X_test)

    if categorical_feature is None:
        categorical_feature = [c for c in X_train.columns if X_train[c].dtype.name in ['object', 'category']]

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    model_type, eval_func, cat_param_name = _dispatch_models(algorithm_type, type_of_target, eval_func)

    if with_auto_prep:
        assert algorithm_type in ('cat', 'xgb', 'lgbm'), "with_auto_prep is only supported for gbdt"
        X_train, X_test = autoprep_gbdt(algorithm_type, X_train, X_test, categorical_feature)

    logging_directory = logging_directory.format(time=datetime.now().strftime('%Y%m%d_%H%M%S'))

    if inherit_experiment is not None:
        experiment = ExperimentProxy(inherit_experiment)
    else:
        experiment = Experiment(logging_directory, if_exists=if_exists, with_mlflow=with_mlflow)

    with experiment as exp:
        exp.log('Algorithm: {}'.format(algorithm_type))
        exp.log('Experiment: {}'.format(exp.logging_directory))
        exp.log('Params: {}'.format(model_params))
        exp.log('Features: {}'.format(list(X_train.columns)))
        exp.log_param('algorithm_type', algorithm_type)
        exp.log_param('num_features', X_train.shape[1])
        if callable(fit_params):
            exp.log_param('fit_params', str(fit_params))
        else:
            exp.log_dict('fit_params', fit_params)
        exp.log_dict('model_params', model_params)
        if feature_list is not None:
            exp.log_param('features', feature_list)

        if with_auto_hpo:
            assert algorithm_type == 'lgbm', 'auto-tuning is only supported for LightGBM'
            model_params = find_best_lgbm_parameter(model_params, X_train, y, cv=cv, groups=groups,
                                                    type_of_target=type_of_target)
            exp.log_param('model_params_tuned', model_params)

        exp.log('Categorical: {}'.format(categorical_feature))

        models = [model_type(**model_params) for _ in range(cv.get_n_splits())]

        if fit_params is None:
            fit_params = {}
        if cat_param_name is not None and not callable(fit_params) and cat_param_name not in fit_params:
            fit_params[cat_param_name] = categorical_feature

        if isinstance(fit_params, dict):
            exp.log_params(fit_params)

        result = cross_validate(models, X_train=X_train, y=y, X_test=X_test, cv=cv, groups=groups,
                                logger=exp.get_logger(), eval_func=eval_func, fit_params=fit_params,
                                type_of_target=type_of_target)

        # save oof
        exp.log_numpy('oof_prediction', result.oof_prediction)
        exp.log_numpy('test_prediction', result.test_prediction)

        for i in range(cv.get_n_splits()):
            exp.log_metric('Fold {}'.format(i + 1), result.scores[i])
        exp.log_metric('Overall', result.scores[-1])

        # save importance plot
        if result.importance:
            importance = pd.concat(result.importance)
            plot_file_path = os.path.join(exp.logging_directory, 'importance.png')
            plot_importance(importance, plot_file_path)
            exp.log_artifact(plot_file_path)

        # save trained model
        for i, model in enumerate(models):
            _save_model(model, exp.logging_directory, i + 1, exp)

        # save submission.csv
        submit_df = None
        if X_test is not None:
            submit_df = make_submission_df(result.test_prediction, sample_submission, y)
            exp.log_dataframe(submission_filename or os.path.basename(exp.logging_directory), submit_df, 'csv')

        elapsed_time = time.time() - start_time

        return ExperimentResult(result.oof_prediction, result.test_prediction,
                                result.scores, models, result.importance, elapsed_time, submit_df)
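# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library): a minimal end-to-end
# call of ``run_experiment``. ``train_df``/``test_df`` and the ``target``
# column are hypothetical; any valid LightGBM parameters work here.
#
#   from nyaggle.experiment import run_experiment
#
#   params = {'max_depth': 8, 'learning_rate': 0.05, 'n_estimators': 10000}
#   result = run_experiment(params,
#                           X_train=train_df.drop(columns=['target']),
#                           y=train_df['target'],
#                           X_test=test_df,
#                           logging_directory='output/{time}',
#                           algorithm_type='lgbm',
#                           cv=5)
#   print(result.metrics[-1])   # overall CV score
#   print(result.submit_df)     # submission dataframe saved as submission.csv
# ---------------------------------------------------------------------------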
def find_best_lgbm_parameter(base_param: Dict, X: pd.DataFrame, y: pd.Series,
                             cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                             groups: Optional[pd.Series] = None,
                             time_budget: Optional[int] = None,
                             type_of_target: str = 'auto') -> Dict:
    """
    Search hyperparameters for LightGBM using optuna.

    Args:
        base_param:
            Base parameters passed to lgb.train.
        X:
            Training data.
        y:
            Target.
        cv:
            int, cross-validation generator or an iterable which determines the
            cross-validation splitting strategy.
        groups:
            Group labels for the samples. Only used in conjunction with a
            "Group" cv instance (e.g., ``GroupKFold``).
        time_budget:
            Time budget for tuning (in seconds).
        type_of_target:
            The type of the target variable. If ``auto``, the type is inferred by
            ``sklearn.utils.multiclass.type_of_target``. Otherwise, ``binary``,
            ``continuous``, or ``multiclass`` are supported.

    Returns:
        The best parameters found.
    """
    cv = check_cv(cv, y)

    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)

    # tuning is performed on the first CV fold only
    train_index, test_index = next(cv.split(X, y, groups))
    dtrain = optuna_lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    dvalid = optuna_lgb.Dataset(X.iloc[test_index], y.iloc[test_index])

    params = copy.deepcopy(base_param)
    if 'early_stopping_rounds' not in params:
        params['early_stopping_rounds'] = 100

    if not any([p in params for p in ('num_iterations', 'num_iteration', 'num_trees', 'num_tree',
                                      'num_rounds', 'num_round')]):
        params['num_iterations'] = params.get('n_estimators', 10000)

    if 'objective' not in params:
        tot_to_objective = {
            'binary': 'binary',
            'continuous': 'regression',
            'multiclass': 'multiclass'
        }
        params['objective'] = tot_to_objective[type_of_target]

    if 'metric' not in params and 'objective' in params:
        if params['objective'] in ['regression', 'regression_l2', 'l2', 'mean_squared_error', 'mse',
                                   'l2_root', 'root_mean_squared_error', 'rmse']:
            params['metric'] = 'l2'
        if params['objective'] in ['regression_l1', 'l1', 'mean_absolute_error', 'mae']:
            params['metric'] = 'l1'
        if params['objective'] in ['binary']:
            params['metric'] = 'binary_logloss'
        if params['objective'] in ['multiclass']:
            params['metric'] = 'multi_logloss'

    if not any([p in params for p in ('verbose', 'verbosity')]):
        params['verbosity'] = -1

    best_params, tuning_history = dict(), list()
    optuna_lgb.train(params, dtrain, valid_sets=[dvalid], verbose_eval=0,
                     best_params=best_params, tuning_history=tuning_history,
                     time_budget=time_budget)

    result_param = copy.deepcopy(base_param)
    result_param.update(best_params)
    return result_param
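# Usage sketch (illustrative): tuning base parameters before training manually.
# ``X``/``y`` are hypothetical training data and the 600-second budget is
# arbitrary. Note that tuning runs on the first CV fold only, as implemented
# via ``next(cv.split(...))`` above.
#
#   base_params = {'objective': 'binary', 'learning_rate': 0.05}
#   tuned_params = find_best_lgbm_parameter(base_params, X, y, cv=5, time_budget=600)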
def cross_validate(estimator: Union[BaseEstimator, List[BaseEstimator]],
                   X_train: Union[pd.DataFrame, np.ndarray],
                   y: Union[pd.Series, np.ndarray],
                   X_test: Optional[Union[pd.DataFrame, np.ndarray]] = None,
                   cv: Optional[Union[int, Iterable, BaseCrossValidator]] = None,
                   groups: Optional[pd.Series] = None,
                   predict_proba: bool = False,
                   eval_func: Optional[Callable] = None,
                   logger: Optional[Logger] = None,
                   on_each_fold: Optional[Callable[[int, BaseEstimator, pd.DataFrame, pd.Series], None]] = None,
                   fit_params: Optional[Union[Dict[str, Any], Callable]] = None,
                   importance_type: str = 'gain',
                   early_stopping: bool = True,
                   type_of_target: str = 'auto') -> CVResult:
    """
    Evaluate metrics by cross-validation. It also records the out-of-fold
    prediction and the test prediction.

    Args:
        estimator:
            The object to be used in cross-validation. For a list input,
            ``estimator[i]`` is trained on the i-th fold.
        X_train:
            Training data.
        y:
            Target.
        X_test:
            Test data (optional). If specified, prediction on the test data is
            performed using an ensemble of the fold models.
        cv:
            int, cross-validation generator or an iterable which determines the
            cross-validation splitting strategy.

            - None, to use the default ``KFold(5, random_state=0, shuffle=True)``,
            - integer, to specify the number of folds in a ``(Stratified)KFold``,
            - CV splitter (an instance of ``BaseCrossValidator``),
            - An iterable yielding (train, test) splits as arrays of indices.
        groups:
            Group labels for the samples. Only used in conjunction with a
            "Group" cv instance (e.g., ``GroupKFold``).
        predict_proba:
            If True, call ``predict_proba`` instead of ``predict`` when
            computing the out-of-fold and test predictions.
        eval_func:
            Function used for logging and returning scores.
        logger:
            Logger.
        on_each_fold:
            Called for each fold with (idx_fold, model, X_fold, y_fold).
        fit_params:
            Parameters passed to the fit method of the estimator. A dict or a
            callable ``fit_params(fold_id, train_index, valid_index)``.
        importance_type:
            The type of feature importance used to calculate the result.
            Used only in ``LGBMClassifier`` and ``LGBMRegressor``.
        early_stopping:
            If ``True``, ``eval_set`` will be added to ``fit_params`` for each fold.
            ``early_stopping_rounds = 100`` will also be appended to fit_params
            if it does not already have one.
        type_of_target:
            The type of the target variable. If ``auto``, the type is inferred by
            ``sklearn.utils.multiclass.type_of_target``.

    Returns:
        Namedtuple with the following members

        * oof_prediction (numpy array, shape (len(X_train),)):
            The predicted value on the out-of-fold validation data.
        * test_prediction (numpy array, shape (len(X_test),)):
            The predicted value on the test data. ``None`` if X_test is ``None``.
        * scores (list of float, shape (nfolds+1,)):
            ``scores[i]`` denotes the validation score in the i-th fold.
            ``scores[-1]`` is the overall score. ``None`` if ``eval_func`` is not specified.
        * importance (list of pandas DataFrame, shape (nfolds,)):
            ``importance[i]`` denotes the feature importance in the i-th fold model.
            If the estimator is not a GBDT, an empty list is returned.
    Example:
        >>> from sklearn.datasets import make_regression
        >>> from sklearn.linear_model import Ridge
        >>> from sklearn.metrics import mean_squared_error
        >>> from nyaggle.validation import cross_validate

        >>> X, y = make_regression(n_samples=8)
        >>> model = Ridge(alpha=1.0)
        >>> pred_oof, pred_test, scores, _ = \
        ...     cross_validate(model,
        ...                    X_train=X[:3, :],
        ...                    y=y[:3],
        ...                    X_test=X[3:, :],
        ...                    cv=3,
        ...                    eval_func=mean_squared_error)
        >>> print(pred_oof)
        [-101.1123267 ,   26.79300693,   17.72635528]
        >>> print(pred_test)
        [-10.65095894 -12.18909059 -23.09906427 -17.68360714 -20.08218267]
        >>> print(scores)
        [71912.80290003832, 15236.680239881942, 15472.822033121925, 34207.43505768073]
    """
    cv = check_cv(cv, y)

    n_output_cols = 1
    if type_of_target == 'auto':
        type_of_target = multiclass.type_of_target(y)
    if type_of_target == 'multiclass':
        n_output_cols = y.nunique(dropna=True)

    if isinstance(estimator, list):
        assert len(estimator) == cv.get_n_splits(), "Number of estimators should be equal to the number of folds."

    X_train = convert_input(X_train)
    y = convert_input_vector(y, X_train.index)
    if X_test is not None:
        X_test = convert_input(X_test)

    if not isinstance(estimator, list):
        estimator = [estimator] * cv.get_n_splits()

    assert len(estimator) == cv.get_n_splits()

    if logger is None:
        logger = getLogger(__name__)

    def _predict(model: BaseEstimator, x: pd.DataFrame, _predict_proba: bool):
        if _predict_proba:
            proba = model.predict_proba(x)
            # binary classification: keep only the positive-class probability
            return proba[:, 1] if proba.shape[1] == 2 else proba
        else:
            return model.predict(x)

    oof = np.zeros((len(X_train), n_output_cols)) if n_output_cols > 1 else np.zeros(len(X_train))
    evaluated = np.full(len(X_train), False)
    test = None
    if X_test is not None:
        test = np.zeros((len(X_test), n_output_cols)) if n_output_cols > 1 else np.zeros(len(X_test))

    scores = []
    eta_all = []
    importance = []

    for n, (train_idx, valid_idx) in enumerate(cv.split(X_train, y, groups)):
        start_time = time.time()

        train_x, train_y = X_train.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = X_train.iloc[valid_idx], y.iloc[valid_idx]

        if fit_params is None:
            fit_params_fold = {}
        elif callable(fit_params):
            fit_params_fold = fit_params(n, train_idx, valid_idx)
        else:
            fit_params_fold = copy.copy(fit_params)

        if isinstance(estimator[n], (LGBMModel, CatBoost)):
            if early_stopping:
                if 'eval_set' not in fit_params_fold:
                    fit_params_fold['eval_set'] = [(valid_x, valid_y)]
                if 'early_stopping_rounds' not in fit_params_fold:
                    fit_params_fold['early_stopping_rounds'] = 100
            estimator[n].fit(train_x, train_y, **fit_params_fold)
        else:
            estimator[n].fit(train_x, train_y, **fit_params_fold)

        oof[valid_idx] = _predict(estimator[n], valid_x, predict_proba)
        evaluated[valid_idx] = True

        if X_test is not None:
            test += _predict(estimator[n], X_test, predict_proba)

        if on_each_fold is not None:
            on_each_fold(n, estimator[n], train_x, train_y)

        if isinstance(estimator[n], (LGBMModel, CatBoost)):
            importance.append(_get_gbdt_importance(estimator[n], list(X_train.columns), importance_type))

        if eval_func is not None:
            score = eval_func(valid_y, oof[valid_idx])
            scores.append(score)
            logger.info('Fold {} score: {}'.format(n, score))

        elapsed = time.time() - start_time
        eta_all.append(elapsed)
        logger.debug('{:.3f} sec / fold'.format(elapsed))

    if eval_func is not None:
        score = eval_func(y.loc[evaluated], oof[evaluated])
        scores.append(score)
        logger.info('Overall score: {}'.format(score))

    if X_test is not None:
        # average the per-fold test predictions
        predicted = test / cv.get_n_splits(X_train, y, groups)
    else:
        predicted = None

    return CVResult(oof, predicted, scores, importance)
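# Usage sketch (illustrative): per-fold fit parameters via a callable, as
# handled in the fold loop above. ``model``, ``X_train``, ``y`` and
# ``sample_weight`` (a pandas Series aligned with X_train) are hypothetical.
#
#   from sklearn.metrics import mean_squared_error
#
#   def fit_params_per_fold(fold_id, train_index, valid_index):
#       # weight only the rows used for training in this fold
#       return {'sample_weight': sample_weight.iloc[train_index].values}
#
#   result = cross_validate(model, X_train, y, cv=5,
#                           eval_func=mean_squared_error,
#                           fit_params=fit_params_per_fold)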
    def _pre_train(self, y):
        self.cv = check_cv(self.cv, y)
        self.n_splits = self.cv.get_n_splits()

        # One transformer per fold, plus one extra (the last element, fitted on
        # the full training data) for transforming data outside the CV folds.
        self.transformers = [clone(self.base_transformer) for _ in range(self.n_splits + 1)]
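# Context sketch (assumption, not taken from this file): ``_pre_train`` sets up
# the transformers that a fold-wise, leakage-free encoder wrapper would then
# use roughly as follows, with ``oof_encoded`` as a hypothetical output frame:
#
#   for i, (train_idx, valid_idx) in enumerate(self.cv.split(X, y)):
#       self.transformers[i].fit(X.iloc[train_idx], y.iloc[train_idx])
#       oof_encoded.iloc[valid_idx] = self.transformers[i].transform(X.iloc[valid_idx])
#   self.transformers[-1].fit(X, y)   # later used to transform unseen test data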