Esempio n. 1
0
def test_error_if_input_df_contains_na_in_transform(df_vartypes, df_na):
    """Check that ``transform`` raises ``ValueError`` on a dataset with NA.

    The discretiser is fitted on ``df_vartypes`` and then asked to transform
    a column subset of ``df_na``.
    """
    # test case 4: when dataset contains na, transform method
    # Set-up stays outside ``pytest.raises`` so that a ValueError raised while
    # constructing or fitting the transformer cannot make the test pass by
    # accident; only the ``transform`` call is expected to raise.
    transformer = EqualWidthDiscretiser()
    transformer.fit(df_vartypes)
    with pytest.raises(ValueError):
        transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]])
def test_non_fitted_error(df_vartypes):
    """Check that ``transform`` raises ``NotFittedError`` without a prior fit."""
    # Build the transformer outside the context manager so that only the
    # ``transform`` call is covered by the expected exception.
    transformer = EqualWidthDiscretiser()
    with pytest.raises(NotFittedError):
        transformer.transform(df_vartypes)
Esempio n. 3
0
    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        """
        Find features with high PSI values.

        The input dataframe is split into a basis and a test set. Each
        selected variable is then discretised with ``self.bins`` bins
        (equal-width or equal-frequency, depending on ``self.strategy``) and
        its PSI is computed as
        ``sum((test - basis) * ln(test / basis))`` over the per-bin
        observation frequencies. Features whose PSI is greater than
        ``self.threshold`` are collected in ``features_to_drop_``.

        Parameters
        ----------
        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset.

        y : pandas series. Default = None
            y is not needed in this transformer. You can pass y or None.

        Returns
        -------
        self
            The fitted transformer, with ``variables_``, ``psi_values_`` and
            ``features_to_drop_`` set.

        Raises
        ------
        ValueError
            If, after splitting, the basis or the test set has fewer rows
            than ``self.bins``.
        """
        # check input dataframe
        X = check_X(X)

        # If required exclude variables that are not in the input dataframe
        self._confirm_variables(X)

        # find numerical variables or check those entered are present in the dataframe
        self.variables_ = _find_or_check_numerical_variables(X, self.variables_)

        # Remove the split_col from the variables list. It might be added if the
        # variables are not defined at initialization.
        # A new list is built instead of calling ``list.remove`` so that the
        # object returned by the helper is never mutated in place.
        # NOTE(review): that object may be the very list the user passed at
        # initialization - confirm against ``_confirm_variables``.
        self.variables_ = [
            var for var in self.variables_ if var != self.split_col
        ]

        if self.missing_values == "raise":
            # check if dataset contains na or inf
            _check_contains_na(X, self.variables_)
            _check_contains_inf(X, self.variables_)

        # Split the dataframe into basis and test.
        basis_df, test_df = self._split_dataframe(X)

        # Check the shape of the returned dataframes for PSI calculations.
        # The number of observations must be at least equal to the
        # number of bins.
        if min(basis_df.shape[0], test_df.shape[0]) < self.bins:
            raise ValueError(
                "The number of rows in the basis and test datasets that will be used "
                f"in the PSI calculations must be at least equal to {self.bins}. "
                "After splitting the original dataset based on the given cut_off or "
                f"split_frac we have {basis_df.shape[0]} samples in the basis set, "
                f"and {test_df.shape[0]} samples in the test set. "
                "Please adjust the value of the cut_off or split_frac."
            )

        # Switch basis and test dataframes if required.
        if self.switch:
            test_df, basis_df = basis_df, test_df

        # set up the discretizer
        if self.strategy == "equal_width":
            bucketer = EqualWidthDiscretiser(bins=self.bins)
        else:
            bucketer = EqualFrequencyDiscretiser(q=self.bins)

        # Compute the PSI by looping over the features
        self.psi_values_ = {}
        self.features_to_drop_ = []

        for feature in self.variables_:
            # Discretize the features. The bins are learned on the basis set
            # only and then applied to the test set. Rows with NA in this
            # feature are dropped first; they can be present whenever
            # ``missing_values`` is not "raise".
            basis_discrete = bucketer.fit_transform(basis_df[[feature]].dropna())
            test_discrete = bucketer.transform(test_df[[feature]].dropna())

            # Determine percentage of observations per bin
            basis_distrib, test_distrib = self._observation_frequency_per_bin(
                basis_discrete, test_discrete
            )

            # Calculate the PSI value
            self.psi_values_[feature] = np.sum(
                (test_distrib - basis_distrib) * np.log(test_distrib / basis_distrib)
            )

            # Assess if feature should be dropped
            if self.psi_values_[feature] > self.threshold:
                self.features_to_drop_.append(feature)

        # save input features
        self._get_feature_names_in(X)

        return self