Example #1
    def fit(self, X, y, column_names=None):
        """
        Fit.

        Bootstraps a number of random seeds, then splits the data based on the sampled seeds and estimates performance
            of the model based on the split data.

        Args:
            X (pandas.DataFrame or numpy.ndarray):
                Array with samples and features.

            y (pandas.Series or numpy.ndarray):
                Array with targets.

            column_names (list of str, optional):
                List of feature names of the provided samples. If provided it will be used to overwrite the existing
                feature names. If not provided the existing feature names are used or default feature names are
                generated.

        Returns:
            (TrainTestVolatility):
                Fitted object.
        """
        super().fit()

        self.X, self.column_names = preprocess_data(X, X_name="X", column_names=column_names, verbose=self.verbose)
        self.y = preprocess_labels(y, y_name="y", index=self.X.index, verbose=self.verbose)

        if self.sample_train_test_split_seed:
            # np.random.random_integers is deprecated; np.random.randint's upper bound is exclusive.
            random_seeds = np.random.randint(0, 1000000, self.iterations)
        else:
            random_seeds = np.ones(self.iterations).astype(int)
            if self.random_state:
                random_seeds = random_seeds * self.random_state

        if self.verbose > 0:
            random_seeds = tqdm(random_seeds)

        results_per_iteration = Parallel(n_jobs=self.n_jobs)(
            delayed(get_metric)(
                X=self.X,
                y=self.y,
                clf=self.clf,
                test_size=self.test_prc,
                split_seed=split_seed,
                scorers=self.scorers,
                train_sampling_type=self.train_sampling_type,
                test_sampling_type=self.test_sampling_type,
                train_sampling_fraction=self.train_sampling_fraction,
                test_sampling_fraction=self.test_sampling_fraction,
            )
            for split_seed in random_seeds
        )

        self.iterations_results = pd.concat(results_per_iteration, ignore_index=True)

        self._create_report()
        return self
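A minimal usage sketch for the fit above. The import path and constructor arguments are assumptions based on the attributes referenced in the method (scorers, iterations, n_jobs) and may differ between versions:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

from probatus.metric_volatility import TrainTestVolatility  # import path assumed

X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(10)])

# Constructor arguments are illustrative; check the signature in your version.
volatility = TrainTestVolatility(clf=RandomForestClassifier(max_depth=3, random_state=42),
                                 scorers=["roc_auc"], iterations=50, n_jobs=-1)
volatility.fit(X, pd.Series(y), column_names=list(X.columns))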
Example #2
def test_preprocess_labels():
    """
    Tests preprocess_labels on various input types and indexes.
    """
    import warnings
    y1 = pd.Series([1, 0, 1, 0, 1])
    index_1 = np.array([5, 4, 3, 2, 1])

    # pytest.warns(None) was removed in pytest 7+; record warnings manually instead.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        y1_output = preprocess_labels(y1,
                                      y_name="y1",
                                      index=index_1,
                                      verbose=150)

    pd.testing.assert_series_equal(y1_output,
                                   pd.Series([1, 0, 1, 0, 1], index=index_1))
    # Ensure that number of warnings is correct
    assert len(record) == 0

    y2 = [False, False, False, False, False]

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        y2_output = preprocess_labels(y2, y_name="y2", verbose=150)

    pd.testing.assert_series_equal(y2_output, pd.Series(y2))
    # Ensure that number of warnings is correct
    assert len(record) == 1

    y3 = np.array([0, 1, 2, 3, 4])
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        y3_output = preprocess_labels(y3, y_name="y3", verbose=150)

    pd.testing.assert_series_equal(y3_output, pd.Series(y3))
    # Ensure that number of warnings is correct
    assert len(record) == 1

    y4 = pd.Series(["2", "1", "3", "2", "1"])
    index4 = pd.Index([0, 2, 1, 3, 4])
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        y4_output = preprocess_labels(y4, y_name="y4", index=index4, verbose=0)
    pd.testing.assert_series_equal(
        y4_output, pd.Series(["2", "3", "1", "2", "1"], index=index4))
    # Ensure that number of warnings is correct
    assert len(record) == 0
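For orientation, a minimal sketch of a preprocess_labels helper that would satisfy the assertions above; the real probatus implementation may differ in its details:

import warnings

import numpy as np
import pandas as pd


def preprocess_labels_sketch(y, y_name="y", index=None, verbose=0):
    """Coerce y to a pd.Series and attach or align the requested index."""
    if not isinstance(y, pd.Series):
        # Conversions are reported as warnings when verbose is high enough.
        if verbose > 0:
            warnings.warn(f"Converting {y_name} to pd.Series.")
        y = pd.Series(y)
    if index is not None:
        index = pd.Index(index)
        if y.index.sort_values().equals(index.sort_values()):
            # Same labels in a different order: align values by label (the y4 case).
            y = y.reindex(index)
        else:
            # Different labels entirely: replace the index, keeping value order (the y1 case).
            y = y.set_axis(index)
    return y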
Example #3
    def fit(self,
            X,
            y,
            column_names=None,
            class_names=None,
            precalc_shap=None,
            **shap_kwargs):
        """
        Fits the plotter to the model and data by computing the shap values.

        If the shap_values are passed, they do not need to be computed.

        Args:
            X (pd.DataFrame): input variables.

            y (pd.Series): target variable.

            column_names (None, or list of str, optional):
                List of feature names for the dataset. If None, then column names from the X dataframe are used.

            class_names (None, or list of str, optional):
                List of class names, e.g. ['neg', 'pos']. If None, the defaults ['Negative Class', 'Positive Class']
                are used.

            precalc_shap (np.array, optional):
                Precalculated SHAP values. If provided, they are not recomputed.

            **shap_kwargs:
                keyword arguments passed to
                [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
                It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
                The `approximate=True` causes less accurate, but faster SHAP values calculation, while
                `check_additivity=False` disables the additivity check inside SHAP.
        """
        self.X, self.column_names = preprocess_data(X,
                                                    X_name="X",
                                                    column_names=column_names,
                                                    verbose=self.verbose)
        self.y = preprocess_labels(y,
                                   y_name="y",
                                   index=self.X.index,
                                   verbose=self.verbose)

        # Set class names
        self.class_names = class_names
        if self.class_names is None:
            self.class_names = ["Negative Class", "Positive Class"]

        self.shap_vals_df = shap_to_df(self.clf,
                                       self.X,
                                       precalc_shap=precalc_shap,
                                       verbose=self.verbose,
                                       **shap_kwargs)

        self.fitted = True
        return self
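A usage sketch for this plotter. The class name DependencePlotter, its import path, and its constructor are assumptions; any plotter exposing the fit above works the same way:

import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

from probatus.interpret import DependencePlotter  # class name and import path assumed

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
X, y = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)]), pd.Series(y)

clf = LGBMClassifier().fit(X, y)

plotter = DependencePlotter(clf)
# check_additivity is forwarded through **shap_kwargs, as documented above.
plotter.fit(X, y, class_names=["neg", "pos"], check_additivity=False)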
Example #4
    def fit(self,
            X,
            y,
            column_names=None,
            class_names=None,
            precalc_shap=None):
        """
        Fits the plotter to the model and data by computing the shap values. If the shap_values are passed, they do not
            need to be computed
        
        Args:
            X (pd.DataFrame):
                input variables.

            y (pd.Series):
                target variable.

            column_names (None, or list of str, optional):
                List of feature names for the dataset. If None, then column names from the X dataframe are used.

            class_names (None, or list of str, optional):
                List of class names, e.g. ['neg', 'pos']. If None, the defaults ['Negative Class', 'Positive Class']
                are used.

            precalc_shap (np.array, optional):
                Precalculated SHAP values. If provided, they are not recomputed.
        """

        self.X, self.column_names = preprocess_data(X,
                                                    X_name='X',
                                                    column_names=column_names,
                                                    verbose=self.verbose)
        self.y = preprocess_labels(y,
                                   y_name='y',
                                   index=self.X.index,
                                   verbose=self.verbose)

        # Set class names
        self.class_names = class_names
        if self.class_names is None:
            self.class_names = ['Negative Class', 'Positive Class']

        self.shap_vals_df = shap_to_df(self.clf,
                                       self.X,
                                       precalc_shap=precalc_shap,
                                       verbose=self.verbose)

        self.fitted = True
        return self
Example #5
def test_get_feature_shap_values_per_fold_early_stopping(complex_data):
    """
    Tests EarlyStoppingShapRFECV._get_feature_shap_values_per_fold with early stopping enabled.
    """
    from lightgbm import LGBMClassifier

    clf = LGBMClassifier(n_estimators=200, max_depth=3)
    X, y = complex_data
    y = preprocess_labels(y, y_name="y", index=X.index)

    shap_elimination = EarlyStoppingShapRFECV(clf, early_stopping_rounds=5)
    shap_values, train_score, test_score = shap_elimination._get_feature_shap_values_per_fold(
        X,
        y,
        clf,
        train_index=list(range(5, 50)),
        val_index=[0, 1, 2, 3, 4],
        scorer=get_scorer("roc_auc"),
    )
    assert test_score > 0.6
    assert train_score > 0.6
    assert shap_values.shape == (5, 5)
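The test above exercises a private helper. End to end, the early-stopping variant is normally driven through the public interface; a sketch, where the import path, constructor arguments, and fit_compute are assumptions based on the probatus-style API shown in these examples:

import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

from probatus.feature_elimination import EarlyStoppingShapRFECV  # import path assumed

X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
X, y = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)]), pd.Series(y)

clf = LGBMClassifier(n_estimators=200, max_depth=3)
shap_elimination = EarlyStoppingShapRFECV(clf, step=0.2, cv=5,
                                          scoring="roc_auc", early_stopping_rounds=5)
report = shap_elimination.fit_compute(X, y)  # fit_compute assumed; fit(X, y) alone also works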
Example #6
    def fit(
        self,
        X_train,
        X_test,
        y_train,
        y_test,
        column_names=None,
        class_names=None,
        approximate=False,
        **shap_kwargs,
    ):
        """
        Fits the object and calculates the shap values for the provided datasets.

        Args:
            X_train (pd.DataFrame):
                Dataframe containing training data.

            X_test (pd.DataFrame):
                Dataframe containing test data.

            y_train (pd.Series):
                Series of binary labels for train data.

            y_test (pd.Series):
                Series of binary labels for test data.

            column_names (None, or list of str, optional):
                List of feature names for the dataset. If None, then column names from the X_train dataframe are used.

            class_names (None, or list of str, optional):
                List of class names, e.g. ['neg', 'pos']. If None, the defaults ['Negative Class', 'Positive Class']
                are used.

            approximate (boolean, optional):
                If True, uses SHAP approximations: less accurate, but much faster.

            **shap_kwargs:
                keyword arguments passed to [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
        """

        self.X_train, self.column_names = preprocess_data(
            X_train, X_name="X_train", column_names=column_names, verbose=self.verbose
        )
        self.X_test, _ = preprocess_data(
            X_test, X_name="X_test", column_names=column_names, verbose=self.verbose
        )
        self.y_train = preprocess_labels(
            y_train, y_name="y_train", index=self.X_train.index, verbose=self.verbose
        )
        self.y_test = preprocess_labels(
            y_test, y_name="y_test", index=self.X_test.index, verbose=self.verbose
        )

        # Set class names
        self.class_names = class_names
        if self.class_names is None:
            self.class_names = ["Negative Class", "Positive Class"]

        # Calculate Metrics
        self.train_score = self.scorer.score(self.clf, self.X_train, self.y_train)
        self.test_score = self.scorer.score(self.clf, self.X_test, self.y_test)

        self.results_text = (
            f"Train {self.scorer.metric_name}: {np.round(self.train_score, 3)},\n"
            f"Test {self.scorer.metric_name}: {np.round(self.test_score, 3)}."
        )

        (
            self.shap_values_train,
            self.expected_value_train,
            self.tdp_train,
        ) = self._prep_shap_related_variables(
            clf=self.clf,
            X=self.X_train,
            y=self.y_train,
            approximate=approximate,
            column_names=self.column_names,
            class_names=self.class_names,
            verbose=self.verbose,
            **shap_kwargs,
        )

        (
            self.shap_values_test,
            self.expected_value_test,
            self.tdp_test,
        ) = self._prep_shap_related_variables(
            clf=self.clf,
            X=self.X_test,
            y=self.y_test,
            approximate=approximate,
            column_names=self.column_names,
            class_names=self.class_names,
            verbose=self.verbose,
            **shap_kwargs,
        )

        self.fitted = True
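A usage sketch for this train/test fit, assuming a probatus-style ShapModelInterpreter (the class name, import path, and constructor are assumptions):

import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from probatus.interpret import ShapModelInterpreter  # class name and import path assumed

X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
X, y = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)]), pd.Series(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

clf = LGBMClassifier().fit(X_train, y_train)

interpreter = ShapModelInterpreter(clf)
interpreter.fit(X_train, X_test, y_train, y_test, approximate=False)
print(interpreter.results_text)  # train/test scores, as set by the fit above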
Example #7
    def fit(self, X1, X2, column_names=None, class_names=None):
        """
        Base fit functionality that should be executed before each fit.

        Args:
            X1 (np.ndarray or pd.DataFrame):
                First sample to be compared. It needs to have the same number of columns as X2.

            X2 (np.ndarray or pd.DataFrame):
                Second sample to be compared. It needs to have the same number of columns as X1.

            column_names (list of str, optional):
                List of feature names of the provided samples. If provided it will be used to overwrite the existing
                feature names. If not provided the existing feature names are used or default feature names are
                generated.

            class_names (None, or list of str, optional):
                List of class names assigned to the provided samples, e.g. ['sample1', 'sample2']. If None, the
                defaults ['First Sample', 'Second Sample'] are used.

        Returns:
            (BaseResemblanceModel):
                Fitted object.
        """
        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # Set class names
        self.class_names = class_names
        if self.class_names is None:
            self.class_names = ['First Sample', 'Second Sample']

        # Ensure inputs are correct
        self.X1, self.column_names = preprocess_data(X1,
                                                     X_name='X1',
                                                     column_names=column_names,
                                                     verbose=self.verbose)
        self.X2, _ = preprocess_data(X2,
                                     X_name='X2',
                                     column_names=column_names,
                                     verbose=self.verbose)

        # Prepare dataset for modelling
        self.X = pd.DataFrame(pd.concat([self.X1, self.X2], axis=0),
                              columns=self.column_names).reset_index(drop=True)

        self.y = pd.Series(
            np.concatenate([
                np.zeros(self.X1.shape[0]),
                np.ones(self.X2.shape[0]),
            ])).reset_index(drop=True)

        # Ensure the correct type and number of classes for the target variable
        self.X, _ = preprocess_data(self.X,
                                    X_name='X',
                                    column_names=self.column_names,
                                    verbose=self.verbose)

        self.y = preprocess_labels(self.y,
                                   y_name='y',
                                   index=self.X.index,
                                   verbose=self.verbose)

        # Reinitialize output variables in case the object is fit multiple times
        self._init_output_variables()

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.test_prc,
            random_state=self.random_state,
            shuffle=True,
            stratify=self.y)
        self.clf.fit(self.X_train, self.y_train)

        self.train_score = np.round(
            self.scorer.score(self.clf, self.X_train, self.y_train), 3)
        self.test_score = np.round(
            self.scorer.score(self.clf, self.X_test, self.y_test), 3)

        self.results_text = f'Train {self.scorer.metric_name}: {self.train_score},\n' \
                            f'Test {self.scorer.metric_name}: {self.test_score}.'
        if self.verbose > 50:
            print(f'Finished model training: \n{self.results_text}')

        if self.verbose > 0:
            if self.train_score > self.test_score:
                warnings.warn(
                    f'Train {self.scorer.metric_name} > Test {self.scorer.metric_name}, which might indicate '
                    f'overfitting. Strong overfitting may lead to misleading conclusions when analysing '
                    f'feature importance. Consider retraining with more regularization applied to the model.'
                )
        self.fitted = True
        return self
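A usage sketch for the resemblance fit, assuming a probatus-style resemblance model such as SHAPImportanceResemblance (class name, import path, and constructor assumed):

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from probatus.sample_similarity import SHAPImportanceResemblance  # import path assumed

rng = np.random.default_rng(42)
columns = [f"f{i}" for i in range(5)]
X1 = pd.DataFrame(rng.normal(size=(500, 5)), columns=columns)
X2 = pd.DataFrame(rng.normal(loc=0.3, size=(500, 5)), columns=columns)

# A high test score means the classifier can tell the two samples apart,
# i.e. their distributions differ.
rm = SHAPImportanceResemblance(RandomForestClassifier(max_depth=2, random_state=42))
rm.fit(X1, X2, class_names=["sample1", "sample2"])
print(rm.results_text)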
Example #8
    def fit(self,
            X,
            y,
            sample_weight=None,
            columns_to_keep=None,
            column_names=None,
            **shap_kwargs):
        """
        Fits the object with the provided data.

        The algorithm starts with the entire dataset, and then sequentially
             eliminates features. If sklearn compatible search CV is passed as clf e.g.
             [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html),
             [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
             or [BayesSearchCV](https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html),
             the hyperparameter optimization is applied at each step of the elimination.
             Then, the SHAP feature importance is calculated using cross-validation,
             and the `step` lowest-importance features are removed.

        Args:
            X (pd.DataFrame):
                Provided dataset.

            y (pd.Series):
                Binary labels for X.

            sample_weight (pd.Series, np.ndarray, list, optional):
                array-like of shape (n_samples,) - only use if the model you're using supports
                sample weighting (check the corresponding scikit-learn documentation).
                Array of weights that are assigned to individual samples.
                Note that they're only used during fitting of the model, not during evaluation of metrics.
                If not provided, then each sample is given unit weight.

            columns_to_keep (list of str, optional):
                List of column names to keep. If given,
                these columns will not be eliminated by the feature elimination process.
                However, these features will still be used in the calculation of the SHAP values.

            column_names (list of str, optional):
                List of feature names of the provided samples. If provided it will be used to overwrite the existing
                feature names. If not provided the existing feature names are used or default feature names are
                generated.

            **shap_kwargs:
                keyword arguments passed to
                [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
                It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
                The `approximate=True` causes less accurate, but faster SHAP values calculation, while
                `check_additivity=False` disables the additivity check inside SHAP.

        Returns:
            (ShapRFECV): Fitted object.
        """
        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # If columns_to_keep is not provided, treat it as empty (length 0).
        # If provided, check that all its elements are strings.
        if columns_to_keep is None:
            len_columns_to_keep = 0
        else:
            if all(isinstance(x, str) for x in columns_to_keep):
                len_columns_to_keep = len(columns_to_keep)
            else:
                raise ValueError(
                    "The current values of columns_to_keep are not allowed. All the elements should be strings."
                )

        # If column_names is provided, check that it covers all the column names in X.
        if column_names is not None:
            if not all(x in column_names for x in list(X.columns)):
                raise ValueError(
                    "The column names in the parameters columns_to_keep and column_names are not matching."
                )

        # Check that the total number of columns to select is less than the total number of columns in the data,
        # only when both parameters are provided.
        if column_names is not None and columns_to_keep is not None:
            if (self.min_features_to_select + len_columns_to_keep) > len(column_names):
                raise ValueError(
                    "Minimum features to select is greater than the number of features. "
                    "Lower the value of min_features_to_select or the number of columns in columns_to_keep."
                )

        self.X, self.column_names = preprocess_data(X,
                                                    X_name="X",
                                                    column_names=column_names,
                                                    verbose=self.verbose)
        self.y = preprocess_labels(y,
                                   y_name="y",
                                   index=self.X.index,
                                   verbose=self.verbose)
        if sample_weight is not None:
            if self.verbose > 0:
                warnings.warn(
                    "sample_weight is passed only to the fit method of the model, not the evaluation metrics."
                )
            sample_weight = assure_pandas_series(sample_weight,
                                                 index=self.X.index)
        self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

        remaining_features = current_features_set = self.column_names
        round_number = 0

        # Stop when the stopping criterion is met.
        stopping_criteria = np.max(
            [self.min_features_to_select, len_columns_to_keep])

        # Set up the min_features_to_select parameter.
        if columns_to_keep is not None:
            # This ensures that, if columns_to_keep is provided,
            # the last features remaining are exactly the columns_to_keep.
            self.min_features_to_select = 0
            if self.verbose > 50:
                warnings.warn(
                    f"Minimum features to select: {stopping_criteria}")

        while len(current_features_set) > stopping_criteria:
            round_number += 1

            # Get current dataset info
            current_features_set = remaining_features
            if columns_to_keep is None:
                remaining_removeable_features = list(set(current_features_set))
            else:
                remaining_removeable_features = list(
                    set(current_features_set) | set(columns_to_keep))
            current_X = self.X[remaining_removeable_features]

            # Set seed for results reproducibility
            if self.random_state is not None:
                np.random.seed(self.random_state)

            # Optimize parameters
            if self.search_clf:
                current_search_clf = clone(self.clf).fit(current_X, self.y)
                current_clf = current_search_clf.estimator.set_params(
                    **current_search_clf.best_params_)
            else:
                current_clf = clone(self.clf)

            # Perform CV to estimate feature importance with SHAP
            results_per_fold = Parallel(n_jobs=self.n_jobs)(
                delayed(self._get_feature_shap_values_per_fold)(
                    X=current_X,
                    y=self.y,
                    clf=current_clf,
                    train_index=train_index,
                    val_index=val_index,
                    sample_weight=sample_weight,
                    **shap_kwargs,
                )
                for train_index, val_index in self.cv.split(current_X, self.y))

            shap_values = np.vstack(
                [current_result[0] for current_result in results_per_fold])
            scores_train = [
                current_result[1] for current_result in results_per_fold
            ]
            scores_val = [
                current_result[2] for current_result in results_per_fold
            ]

            # Calculate the SHAP importances from the remaining features, including the features to keep.
            shap_importance_df = calculate_shap_importance(
                shap_values, remaining_removeable_features)

            # Get features to remove
            features_to_remove = self._get_current_features_to_remove(
                shap_importance_df, columns_to_keep=columns_to_keep)
            remaining_features = list(
                set(current_features_set) - set(features_to_remove))

            # Report results
            self._report_current_results(
                round_number=round_number,
                current_features_set=current_features_set,
                features_to_remove=features_to_remove,
                train_metric_mean=np.round(np.mean(scores_train), 3),
                train_metric_std=np.round(np.std(scores_train), 3),
                val_metric_mean=np.round(np.mean(scores_val), 3),
                val_metric_std=np.round(np.std(scores_val), 3),
            )
            if self.verbose > 50:
                print(
                    f"Round: {round_number}, Current number of features: {len(current_features_set)}, "
                    f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                    f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                    f"Features left: {remaining_features}. "
                    f"Removed features at the end of the round: {features_to_remove}"
                )
        self.fitted = True
        return self
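A usage sketch for this fit. The import path and constructor arguments (step, cv, scoring, n_jobs, random_state) are assumptions inferred from the attributes the method uses:

import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

from probatus.feature_elimination import ShapRFECV  # import path assumed

X, y = make_classification(n_samples=1000, n_features=20, n_informative=5, random_state=42)
X, y = pd.DataFrame(X, columns=[f"f{i}" for i in range(20)]), pd.Series(y)

clf = LGBMClassifier(n_estimators=100, max_depth=3)
shap_elimination = ShapRFECV(clf, step=0.2, cv=5, scoring="roc_auc",
                             n_jobs=-1, random_state=42)
# approximate and check_additivity are forwarded through **shap_kwargs, as documented above.
shap_elimination = shap_elimination.fit(X, y, approximate=True, check_additivity=False)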
Example #9
    def fit(self, X, y, column_names=None):
        """
        Calculates the cross validated results for various imputation strategies.

        Args:
            X (pd.DataFrame):
                input variables.

            y (pd.Series):
                target variable.

            column_names (None, or list of str, optional):
                List of feature names for the dataset.
                If None, then column names from the X dataframe are used.
        """
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # Placeholder for the results.
        results = []

        self.X, self.column_names = preprocess_data(X, column_names=column_names, verbose=self.verbose)
        self.y = preprocess_labels(y, index=self.X.index, verbose=self.verbose)

        # Identify the categorical features.
        categorical_columns = X.select_dtypes(include=["category", "object"]).columns
        # Identify the numeric columns. Numeric columns are all columns except the categorical ones.
        numeric_columns = X.select_dtypes("number").columns

        for strategy in self.strategies:

            numeric_transformer = Pipeline(steps=[("imputer", self.strategies[strategy])])

            categorical_transformer = Pipeline(
                steps=[
                    (
                        "imp_cat",
                        SimpleImputer(
                            strategy="constant",
                            fill_value="missing",
                            add_indicator=True,
                        ),
                    ),
                    ("ohe_cat", OneHotEncoder(handle_unknown="ignore")),
                ]
            )

            preprocessor = ColumnTransformer(
                transformers=[
                    ("num", numeric_transformer, numeric_columns),
                    ("cat", categorical_transformer, categorical_columns),
                ],
                remainder="passthrough",
            )

            model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", self.clf)])

            temp_results = self._calculate_results(X, y, clf=model_pipeline, strategy=strategy)

            results.append(temp_results)

        # If model supports missing values by default, then calculate the scores
        # on raw data without any imputation.
        if self.model_na_support:

            categorical_transformer = Pipeline(
                steps=[
                    ("ohe_cat", OneHotEncoder(handle_unknown="ignore")),
                ]
            )

            preprocessor = ColumnTransformer(
                transformers=[("cat", categorical_transformer, categorical_columns)],
                remainder="passthrough",
            )

            model_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", self.clf)])

            temp_results = self._calculate_results(X, y, clf=model_pipeline, strategy="No Imputation")
            results.append(temp_results)

        self.report_df = pd.DataFrame(results)
        # Set the index of the dataframe to the imputation strategy;
        # set_index("strategy") also drops the column, replacing the earlier two-step approach.
        self.report_df = self.report_df.set_index("strategy")
        self.report_df.sort_values(by="mean_test_score", inplace=True)
        self.fitted = True
        return self
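A usage sketch for the imputation comparison, assuming a selector class wrapping this fit (the name ImputationSelector, its import path, and its constructor are assumptions):

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.impute import KNNImputer, SimpleImputer

from probatus.missing_values import ImputationSelector  # class name and import path assumed

X, y = make_classification(n_samples=500, n_features=6, random_state=0)
X, y = pd.DataFrame(X, columns=[f"f{i}" for i in range(6)]), pd.Series(y)
X.iloc[::7, 0] = np.nan  # inject missing values into one feature

strategies = {
    "Simple Mean": SimpleImputer(strategy="mean"),
    "KNN": KNNImputer(n_neighbors=3),
}
# model_na_support=True adds a "No Imputation" row, since LGBM handles NaNs natively.
selector = ImputationSelector(clf=LGBMClassifier(), strategies=strategies, model_na_support=True)
selector.fit(X, y)
print(selector.report_df)  # one row per strategy, sorted by mean_test_score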
Example #10
    def fit(self, X, y, column_names=None):
        """
        Fits the object with the provided data. The algorithm starts with the entire dataset, and then sequentially
             eliminates features. If a [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
             or [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
             object is assigned as clf, the hyperparameter optimization is applied first. Then, the SHAP feature
             importance is calculated using cross-validation, and the `step` lowest-importance features are removed.

        Args:
            X (pd.DataFrame):
                Provided dataset.

            y (pd.Series):
                Binary labels for X.

            column_names (list of str, optional):
                List of feature names of the provided samples. If provided it will be used to overwrite the existing
                feature names. If not provided the existing feature names are used or default feature names are
                generated.
        Returns:
            (ShapRFECV): Fitted object.
        """
        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        self.X, self.column_names = preprocess_data(X,
                                                    X_name='X',
                                                    column_names=column_names,
                                                    verbose=self.verbose)
        self.y = preprocess_labels(y,
                                   y_name='y',
                                   index=self.X.index,
                                   verbose=self.verbose)
        self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

        remaining_features = current_features_set = self.column_names
        round_number = 0

        while len(current_features_set) > self.min_features_to_select:
            round_number += 1

            # Get current dataset info
            current_features_set = remaining_features
            current_X = self.X[current_features_set]

            # Set seed for results reproducibility
            if self.random_state is not None:
                np.random.seed(self.random_state)

            # Optimize parameters
            if self.search_clf:
                current_search_clf = clone(self.clf).fit(current_X, self.y)
                current_clf = current_search_clf.estimator.set_params(
                    **current_search_clf.best_params_)
            else:
                current_clf = clone(self.clf)

            # Perform CV to estimate feature importance with SHAP
            results_per_fold = Parallel(n_jobs=self.n_jobs)(
                delayed(self._get_feature_shap_values_per_fold)(
                    X=current_X,
                    y=self.y,
                    clf=current_clf,
                    train_index=train_index,
                    val_index=val_index,
                    scorer=self.scorer.scorer,
                    verbose=self.verbose)
                for train_index, val_index in self.cv.split(current_X, self.y))

            shap_values = np.vstack(
                [current_result[0] for current_result in results_per_fold])
            scores_train = [
                current_result[1] for current_result in results_per_fold
            ]
            scores_val = [
                current_result[2] for current_result in results_per_fold
            ]

            shap_importance_df = calculate_shap_importance(
                shap_values, remaining_features)

            # Get features to remove
            features_to_remove = self._get_current_features_to_remove(
                shap_importance_df)
            remaining_features = list(
                set(current_features_set) - set(features_to_remove))

            # Report results
            self._report_current_results(
                round_number=round_number,
                current_features_set=current_features_set,
                features_to_remove=features_to_remove,
                train_metric_mean=np.round(np.mean(scores_train), 3),
                train_metric_std=np.round(np.std(scores_train), 3),
                val_metric_mean=np.round(np.mean(scores_val), 3),
                val_metric_std=np.round(np.std(scores_val), 3))
            if self.verbose > 50:
                print(
                    f'Round: {round_number}, Current number of features: {len(current_features_set)}, '
                    f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                    f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                    f'Num of features left: {len(remaining_features)}. '
                    f'Removed features at the end of the round: {features_to_remove}'
                )
        self.fitted = True
        return self
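Once fitted, per-round results can be read back from report_df, whose metric columns match the _report_current_results keyword arguments used above. A sketch, continuing from the ShapRFECV usage sketch earlier (no new imports needed):

# Assumes shap_elimination has been fitted as in the Example #8 sketch.
report = shap_elimination.report_df  # indexed by round number, as in the verbose printout
best_round = report["val_metric_mean"].idxmax()
print(report.loc[best_round, ["train_metric_mean", "val_metric_mean", "val_metric_std"]])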
Example #11
    def fit(self, X, y, columns_to_keep=None, column_names=None):
        """
        Fits the object with the provided data. The algorithm starts with the entire dataset, and then sequentially
             eliminates features. If a [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
             or [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
             object is assigned as clf, the hyperparameter optimization is applied first. Then, the SHAP feature
             importance is calculated using cross-validation, and the `step` lowest-importance features are removed.

        Args:
            X (pd.DataFrame):
                Provided dataset.

            y (pd.Series):
                Binary labels for X.

            columns_to_keep (list of str, optional):
                List of column names to keep. If given, these columns will not be eliminated by the feature
                elimination process. However, these features will still be used in the calculation of the SHAP values.

            column_names (list of str, optional):
                List of feature names of the provided samples. If provided it will be used to overwrite the existing
                feature names. If not provided the existing feature names are used or default feature names are
                generated.
        Returns:
            (ShapRFECV): Fitted object.
        """
        # Set seed for results reproducibility
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # If columns_to_keep is not provided, treat it as empty (length 0).
        # If provided, check that all its elements are strings.
        if columns_to_keep is None:
            len_columns_to_keep = 0
        else:
            if all(isinstance(x, str) for x in columns_to_keep):
                len_columns_to_keep = len(columns_to_keep)
            else:
                raise ValueError(
                    'The current values of columns_to_keep are not allowed. All the elements should be strings.'
                )

        # If column_names is provided, check that it covers all the column names in X.
        if column_names is not None:
            if not all(x in column_names for x in list(X.columns)):
                raise ValueError(
                    'The column names in the parameters columns_to_keep and column_names are not matching.'
                )

        # Check that the total number of columns to select is less than the total number of columns in the data,
        # only when both parameters are provided.
        if column_names is not None and columns_to_keep is not None:
            if (self.min_features_to_select + len_columns_to_keep) > len(column_names):
                raise ValueError(
                    'Minimum features to select is greater than the number of features. '
                    'Lower the value of min_features_to_select or the number of columns in columns_to_keep.'
                )

        self.X, self.column_names = preprocess_data(X,
                                                    X_name='X',
                                                    column_names=column_names,
                                                    verbose=self.verbose)
        self.y = preprocess_labels(y,
                                   y_name='y',
                                   index=self.X.index,
                                   verbose=self.verbose)
        self.cv = check_cv(self.cv, self.y, classifier=is_classifier(self.clf))

        remaining_features = current_features_set = self.column_names
        round_number = 0

        # Stop when the stopping criterion is met.
        stopping_criteria = np.max(
            [self.min_features_to_select, len_columns_to_keep])

        # Set up the min_features_to_select parameter.
        if columns_to_keep is not None:
            # This ensures that, if columns_to_keep is provided,
            # the last features remaining are exactly the columns_to_keep.
            self.min_features_to_select = 0
            if self.verbose > 50:
                warnings.warn(
                    f'Minimum features to select: {stopping_criteria}')

        while len(current_features_set) > stopping_criteria:
            round_number += 1

            # Get current dataset info
            current_features_set = remaining_features
            if columns_to_keep is None:
                remaining_removeable_features = list(set(current_features_set))
            else:
                remaining_removeable_features = list(
                    set(current_features_set) | set(columns_to_keep))
            current_X = self.X[remaining_removeable_features]

            # Set seed for results reproducibility
            if self.random_state is not None:
                np.random.seed(self.random_state)

            # Optimize parameters
            if self.search_clf:
                current_search_clf = clone(self.clf).fit(current_X, self.y)
                current_clf = current_search_clf.estimator.set_params(
                    **current_search_clf.best_params_)
            else:
                current_clf = clone(self.clf)

            # Perform CV to estimate feature importance with SHAP
            results_per_fold = Parallel(n_jobs=self.n_jobs)(
                delayed(self._get_feature_shap_values_per_fold)(
                    X=current_X,
                    y=self.y,
                    clf=current_clf,
                    train_index=train_index,
                    val_index=val_index,
                    scorer=self.scorer.scorer,
                    verbose=self.verbose)
                for train_index, val_index in self.cv.split(current_X, self.y))

            shap_values = np.vstack(
                [current_result[0] for current_result in results_per_fold])
            scores_train = [
                current_result[1] for current_result in results_per_fold
            ]
            scores_val = [
                current_result[2] for current_result in results_per_fold
            ]

            # Calculate the SHAP importances from the remaining features, including the features to keep.
            shap_importance_df = calculate_shap_importance(
                shap_values, remaining_removeable_features)

            # Get features to remove
            features_to_remove = self._get_current_features_to_remove(
                shap_importance_df, columns_to_keep=columns_to_keep)
            remaining_features = list(
                set(current_features_set) - set(features_to_remove))

            # Report results
            self._report_current_results(
                round_number=round_number,
                current_features_set=current_features_set,
                features_to_remove=features_to_remove,
                train_metric_mean=np.round(np.mean(scores_train), 3),
                train_metric_std=np.round(np.std(scores_train), 3),
                val_metric_mean=np.round(np.mean(scores_val), 3),
                val_metric_std=np.round(np.std(scores_val), 3))
            if self.verbose > 50:
                print(
                    f'Round: {round_number}, Current number of features: {len(current_features_set)}, '
                    f'Current performance: Train {self.report_df.loc[round_number]["train_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["train_metric_std"]}, CV Validation '
                    f'{self.report_df.loc[round_number]["val_metric_mean"]} '
                    f'+/- {self.report_df.loc[round_number]["val_metric_std"]}. \n'
                    f'Features left: {remaining_features}. '
                    f'Removed features at the end of the round: {features_to_remove}'
                )
        self.fitted = True
        return self
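A usage sketch for the columns_to_keep variant: the listed columns survive elimination but still enter the SHAP importance calculation. Import path and constructor arguments are assumptions, single-quoted to match this example's style:

import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

from probatus.feature_elimination import ShapRFECV  # import path assumed

X, y = make_classification(n_samples=1000, n_features=20, n_informative=5, random_state=42)
X, y = pd.DataFrame(X, columns=[f'f{i}' for i in range(20)]), pd.Series(y)

shap_elimination = ShapRFECV(LGBMClassifier(max_depth=3), step=0.2, cv=5, scoring='roc_auc')
shap_elimination = shap_elimination.fit(
    X, y,
    columns_to_keep=['f0', 'f1'],      # never eliminated
    column_names=list(X.columns),      # must cover every column in X, per the check above
)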