Example #1
    def __init__(self, clf, scoring="roc_auc", verbose=0):
        """
        Initializes the class.

        Args:
            clf (binary classifier):
                Model fitted on X_train.

            scoring (string or probatus.utils.Scorer, optional):
                Metric for which the model performance is calculated. It can be either a metric name aligned with
                the predefined
                [classification scorer names in sklearn](https://scikit-learn.org/stable/modules/model_evaluation.html).
                Another option is using probatus.utils.Scorer to define a custom metric.

            verbose (int, optional):
                Controls verbosity of the output:

                - 0 - neither prints nor warnings are shown
                - 1 - 50 - only most important warnings
                - 51 - 100 - shows other warnings and prints
                - above 100 - presents all prints and all warnings (including SHAP warnings).
        """
        self.clf = clf
        self.scorer = get_single_scorer(scoring)
        self.verbose = verbose
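
A minimal usage sketch for this constructor. The class name ModelInspector is a placeholder (the example shows only __init__), and probatus.utils.Scorer is assumed here to accept an sklearn metric name:

from sklearn.ensemble import RandomForestClassifier
from probatus.utils import Scorer

# ModelInspector is a hypothetical name; only __init__ is shown above.
# The docstring expects clf to already be fitted on X_train.
clf = RandomForestClassifier()
inspector = ModelInspector(clf=clf, scoring="roc_auc", verbose=0)

# Alternatively, pass a probatus.utils.Scorer instead of a metric name string.
inspector = ModelInspector(clf=clf, scoring=Scorer("accuracy"))
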
Example #2
    def __init__(
        self,
        clf,
        scoring="roc_auc",
        test_prc=0.25,
        n_jobs=1,
        verbose=0,
        random_state=None,
    ):
        """
        Initializes the class.

        Args:
            clf (model object):
                Binary classification model or pipeline.

            scoring (string or probatus.utils.Scorer, optional):
                Metric for which the model performance is calculated. It can be either a metric name aligned with
                predefined
                [classification scorer names in sklearn](https://scikit-learn.org/stable/modules/model_evaluation.html).
                Another option is using probatus.utils.Scorer to define a custom metric. The recommended option for this
                class is 'roc_auc'.

            test_prc (float, optional):
                Fraction of the data used to test the model. By default 0.25 is used.

            n_jobs (int, optional):
                Number of parallel executions. If -1, all available cores are used. By default 1.

            verbose (int, optional):
                Controls verbosity of the output:

                - 0 - neither prints nor warnings are shown
                - 1 - 50 - only most important warnings
                - 51 - 100 - shows other warnings and prints
                - above 100 - presents all prints and all warnings (including SHAP warnings).

            random_state (int, optional):
                Random state set at each round of feature elimination. If it is None, the results will not be
                reproducible, and in random search different hyperparameters might be tested at each iteration. For
                reproducible results set it to an integer.
        """  # noqa
        self.clf = clf
        self.test_prc = test_prc
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.scorer = get_single_scorer(scoring)
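
A hedged usage sketch for this constructor; EvaluatorClass is a placeholder name, since the example shows only __init__:

from sklearn.linear_model import LogisticRegression

# EvaluatorClass is a hypothetical name standing in for the class this __init__ belongs to.
evaluator = EvaluatorClass(
    clf=LogisticRegression(),
    scoring="roc_auc",   # recommended metric for this class per the docstring
    test_prc=0.25,       # fraction of the data held out for testing
    n_jobs=-1,           # use all available cores
    random_state=42,     # fix the seed for reproducible results
)
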
Example #3
    def __init__(
        self,
        clf,
        step=1,
        min_features_to_select=1,
        cv=None,
        scoring="roc_auc",
        n_jobs=-1,
        verbose=0,
        random_state=None,
    ):
        """
        This method initializes the class.

        Args:
            clf (binary classifier, sklearn compatible search CV e.g. GridSearchCV, RandomizedSearchCV or BayesSearchCV):
                A model that will be optimized and trained at each round of feature elimination. The recommended model
                is [LGBMClassifier](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html),
                because by default it handles missing values and categorical variables. This parameter also supports
                any hyperparameter search schema that is consistent with the sklearn API e.g.
                [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html),
                [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
                or [BayesSearchCV](https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV).

            step (int or float, optional):
                Number of lowest importance features removed each round. If it is an int, that number of features is
                discarded each round. If it is a float, that fraction of the remaining features (rounded down) is
                removed each iteration. Using a float is recommended: it is faster when many features remain, and
                becomes slower but more precise as fewer features are left. Note: the last round may remove fewer
                features in order to reach min_features_to_select.
                If the columns_to_keep parameter is specified in the fit method, step is the number of features to
                remove after keeping those columns.

            min_features_to_select (int, optional):
                Minimum number of features to be kept. This is a stopping criterion of the feature elimination. By
                default the process stops when one feature is left. If columns_to_keep is specified in the fit method,
                this parameter may be overridden by the maximum of its value and the length of columns_to_keep.

            cv (int, cross-validation generator or an iterable, optional):
                Determines the cross-validation splitting strategy. Compatible with sklearn
                [cv parameter](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html).
                If None, then cv of 5 is used.

            scoring (string or probatus.utils.Scorer, optional):
                Metric for which the model performance is calculated. It can be either a metric name aligned with predefined
                [classification scorer names in sklearn](https://scikit-learn.org/stable/modules/model_evaluation.html).
                Another option is using probatus.utils.Scorer to define a custom metric.

            n_jobs (int, optional):
                Number of cores to run in parallel while fitting across folds. None means 1 unless in a
                `joblib.parallel_backend` context. -1 means using all processors.

            verbose (int, optional):
                Controls verbosity of the output:

                - 0 - neither prints nor warnings are shown
                - 1 - 50 - only most important warnings
                - 51 - 100 - shows other warnings and prints
                - above 100 - presents all prints and all warnings (including SHAP warnings).

            random_state (int, optional):
                Random state set at each round of feature elimination. If it is None, the results will not be
                reproducible, and in random search different hyperparameters might be tested at each iteration. For
                reproducible results set it to an integer.
        """  # noqa
        self.clf = clf
        if isinstance(self.clf, BaseSearchCV):
            self.search_clf = True
        else:
            self.search_clf = False

        if isinstance(step, (int, float)) and step > 0:
            self.step = step
        else:
            raise ValueError(
                f"The current value of step = {step} is not allowed. "
                f"It needs to be a positive integer or positive float.")

        if isinstance(min_features_to_select, int) and min_features_to_select > 0:
            self.min_features_to_select = min_features_to_select
        else:
            raise ValueError(
                f"The current value of min_features_to_select = {min_features_to_select} is not allowed. "
                f"It needs to be a positive integer.")

        self.cv = cv
        self.scorer = get_single_scorer(scoring)
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.report_df = pd.DataFrame([])
        self.verbose = verbose
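
A sketch of passing a hyperparameter search wrapper to this constructor, as the clf docstring describes; FeatureEliminator is a placeholder class name:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Any sklearn-compatible search CV can be passed as clf.
search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions={"n_estimators": [50, 100, 200], "max_depth": [3, 5, None]},
    n_iter=5,
)

# FeatureEliminator is a hypothetical name; passing a BaseSearchCV instance sets self.search_clf to True.
selector = FeatureEliminator(
    clf=search,
    step=0.2,                  # remove 20% of the remaining features each round
    min_features_to_select=5,  # stop once 5 features are left
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    random_state=42,
)
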
Example #4
    def __init__(
        self,
        clf,
        strategies,
        scoring="roc_auc",
        cv=5,
        model_na_support=False,
        n_jobs=-1,
        verbose=0,
        random_state=None,
    ):
        """
        Initializes the class.

        Args:
            clf (binary classifier or sklearn.Pipeline):
                A binary classification model or pipeline that will be used to evaluate the various imputation strategies.

            strategies (dictionary of sklearn.impute objects or any other scikit-learn compatible imputers):
                Dictionary mapping strategy names to imputer objects, e.g.

                strategies = {
                    'KNN': KNNImputer(n_neighbors=3),
                    'Simple Median Imputer': SimpleImputer(strategy='median', add_indicator=True),
                    'Iterative Imputer': IterativeImputer(add_indicator=True, n_nearest_features=5,
                                                          sample_posterior=True),
                }

                This allows fine-grained control over the imputation method.

            scoring (string, list of strings, probatus.utils.Scorer or list of probatus.utils.Scorers, optional):
                Metrics for which the score is calculated. It can be either a metric name or a list of metric names,
                aligned with the predefined [classification scorer names in sklearn](https://scikit-learn.org/stable/modules/model_evaluation.html).
                Another option is using probatus.utils.Scorer to define a custom metric.
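
            cv (int, optional):
                Determines the number of cross-validation folds used to evaluate the model with each imputation
                strategy. By default 5-fold cross-validation is used.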

            model_na_support (boolean, optional):
                Whether the classifier supports missing values natively, e.g. LightGBM, XGBoost. By default False.
                If True, a baseline `No Imputation` result is added, indicating the model performance without any
                explicit imputation.
                If False, only the provided strategies are used.

            n_jobs (int, optional):
                Number of cores to run in parallel while fitting across folds. None means 1 unless in a
                `joblib.parallel_backend` context. -1 means using all processors.

            verbose (int, optional):
                Controls verbosity of the output:

                - 0 - neither prints nor warnings are shown
                - 1 - 50 - only most important warnings regarding data properties are shown (excluding SHAP warnings)
                - 51 - 100 - shows most important warnings, prints of the feature removal process
                - above 100 - presents all prints and all warnings (including SHAP warnings).

            random_state (int, optional):
                Random state set at each round of feature elimination. If it is None, the results will not be
                reproducible, and in random search different hyperparameters might be tested at each iteration. For
                reproducible results set it to an integer.
        """  # noqa
        self.clf = clf
        self.model_na_support = model_na_support
        self.cv = cv
        self.scorer = get_single_scorer(scoring)
        self.strategies = strategies
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.fitted = False
        self.report_df = pd.DataFrame([])
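
A sketch of building the strategies dictionary described above and passing it to this constructor; ImputationComparator is a placeholder class name:

from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (required before importing IterativeImputer)
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.linear_model import LogisticRegression

strategies = {
    "KNN": KNNImputer(n_neighbors=3),
    "Simple Median Imputer": SimpleImputer(strategy="median", add_indicator=True),
    "Iterative Imputer": IterativeImputer(add_indicator=True, n_nearest_features=5, sample_posterior=True),
}

# ImputationComparator is a hypothetical name; only __init__ is shown above.
comparator = ImputationComparator(
    clf=LogisticRegression(),
    strategies=strategies,
    scoring="roc_auc",
    cv=5,
    model_na_support=False,   # set True only for models that handle NaNs natively (e.g. LightGBM)
    random_state=42,
)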