Ejemplo n.º 1
0
def test_find_numerical_variables(dataframe_vartypes):
    """Exercise `_find_numerical_variables` on explicit, missing and invalid input."""
    numerical = ['Age', 'Marks']
    mixed = ['Age', 'Marks', 'Name']

    # An explicit list of numerical columns is returned unchanged.
    selected = _find_numerical_variables(dataframe_vartypes, numerical)
    assert selected == numerical

    # With no list given, the numerical columns are discovered automatically.
    discovered = _find_numerical_variables(dataframe_vartypes, None)
    assert discovered == numerical

    # A list that includes a non-numerical column is rejected.
    with pytest.raises(TypeError):
        _find_numerical_variables(dataframe_vartypes, mixed)

    # A dataframe with no numerical columns and no list is rejected.
    only_text = dataframe_vartypes[['Name', 'City']]
    with pytest.raises(ValueError):
        _find_numerical_variables(only_text, None)
Ejemplo n.º 2
0
    def fit(self, X, y=None):
        """
        Fit the wrapped Scikit-learn transformer on the selected variables.

        If the transformer is a OneHotEncoder, OrdinalEncoder or SimpleImputer, all
        variables indicated in the variables parameter are used. When the variables
        parameter is None, all features in the dataset are selected, numerical or
        otherwise.

        For all other Scikit-learn transformers only numerical variables are used.
        The variables indicated in the variables parameter are checked to be
        numerical or, if variables is None, the numerical variables in the data set
        are selected automatically.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : pandas series or None, default=None
            Target values. Forwarded to the wrapped transformer's `fit`, so that
            transformers which learn from the target can be wrapped as well.

        Attributes
        ----------

        input_shape_ : tuple
            The shape of the dataframe seen during fit.

        Returns
        -------

        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        if isinstance(self.transformer, (OneHotEncoder, OrdinalEncoder, SimpleImputer)):
            self.variables = _find_all_variables(X, self.variables)
        else:
            self.variables = _find_numerical_variables(X, self.variables)

        # Forward y instead of dropping it: Scikit-learn transformers accept
        # fit(X, y=None), and supervised ones need the target to be fitted.
        self.transformer.fit(X[self.variables], y)

        self.input_shape_ = X.shape

        return self
Ejemplo n.º 3
0
    def fit(self, X, y=None):
        """
        Compute, per variable, the end-of-distribution value used for imputation.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples. The whole dataframe may be passed; only
            the selected numerical variables are used.

        y : None
            Not used by this imputer. Present for API compatibility.

        Attributes
        ----------

        imputer_dict_: dictionary
            Maps each variable to the value learned from its distribution:
            `max * fold`, `mean +/- fold * std`, or the 75th / 25th percentile
            plus / minus `fold * IQR`, depending on `distribution` and `tail`.

        input_shape_ : tuple
            The shape of the dataframe seen during fit.
        """
        X = _is_dataframe(X)

        # Validate the user-supplied variables, or pick the numerical ones.
        self.variables = _find_numerical_variables(X, self.variables)

        data = X[self.variables]
        method = self.distribution

        if method == 'max':
            self.imputer_dict_ = (data.max() * self.fold).to_dict()

        elif method == 'gaussian':
            if self.tail == 'right':
                end_values = data.mean() + self.fold * data.std()
                self.imputer_dict_ = end_values.to_dict()
            elif self.tail == 'left':
                end_values = data.mean() - self.fold * data.std()
                self.imputer_dict_ = end_values.to_dict()

        elif method == 'skewed':
            upper_quartile = data.quantile(0.75)
            lower_quartile = data.quantile(0.25)
            iqr = upper_quartile - lower_quartile
            if self.tail == 'right':
                end_values = upper_quartile + (iqr * self.fold)
                self.imputer_dict_ = end_values.to_dict()
            elif self.tail == 'left':
                end_values = lower_quartile - (iqr * self.fold)
                self.imputer_dict_ = end_values.to_dict()

        self.input_shape_ = X.shape

        return self
Ejemplo n.º 4
0
    def fit(self, X, y=None):
        """
        Checks that the variables are numerical and builds the imputation dictionary.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            User can pass the entire dataframe, not just the variables to impute.

        y : None
            y is not needed in this imputation. You can pass None or y.


        Attributes
        ----------

        imputer_dict_: dictionary
            The dictionary containing the values that will be used to replace each
            variable. It is `imputer_dict` when one was supplied, otherwise every
            selected variable is mapped to `arbitrary_number`.

        input_shape_ : tuple
            The shape of the dataframe seen during fit.
        """
        # check input dataframe
        X = _is_dataframe(X)

        if self.imputer_dict:
            # Pass the keys as a real list rather than a dict_keys view: a view
            # cannot be pickled and would silently follow later changes to the
            # user's dictionary if it ended up stored in self.variables.
            self.variables = _find_numerical_variables(
                X, list(self.imputer_dict.keys()))
            self.imputer_dict_ = self.imputer_dict
        else:
            # find or check for numerical variables
            self.variables = _find_numerical_variables(X, self.variables)
            self.imputer_dict_ = {
                var: self.arbitrary_number
                for var in self.variables
            }

        self.input_shape_ = X.shape

        return self
Ejemplo n.º 5
0
    def fit(self, X, y=None):
        """
        Validate the input and resolve the numerical variables to work on.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : None
            Not used here. Present for API compatibility.

        Returns
        -------

        X : pandas dataframe
            The dataframe returned by the input check. Note that this method
            returns the data, not `self` (presumably so that callers can keep
            working with the validated dataframe; confirm against callers).
        """
        # Input must be (or be converted to) a dataframe.
        X = _is_dataframe(X)

        # Validate the user-supplied variables, or pick the numerical ones.
        numerical = _find_numerical_variables(X, self.variables)
        self.variables = numerical

        # Missing values in the selected variables are not accepted.
        _check_contains_na(X, numerical)

        return X
Ejemplo n.º 6
0
    def fit(self, X, y=None):
        """
        Validates the input and stores the user-defined capping values.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : None
            y is not needed in this transformer. You can pass y or None.

        Attributes
        ----------

        right_tail_caps_: dictionary
            The dictionary containing the maximum values at which variables
            will be capped. Empty when `max_capping_dict` is None.

        left_tail_caps_ : dictionary
            The dictionary containing the minimum values at which variables
            will be capped. Empty when `min_capping_dict` is None.

        input_shape_ : tuple
            The shape of the dataframe seen during fit.
        """
        # check input dataframe
        X = _is_dataframe(X)

        # find or check for numerical variables; done before the NA check so
        # that the check always runs on the resolved list of variables
        self.variables = _find_numerical_variables(X, self.variables)

        if self.missing_values == 'raise':
            # check if dataset contains na
            _check_contains_na(X, self.variables)

        if self.max_capping_dict is not None:
            self.right_tail_caps_ = self.max_capping_dict
        else:
            self.right_tail_caps_ = {}

        if self.min_capping_dict is not None:
            self.left_tail_caps_ = self.min_capping_dict
        else:
            self.left_tail_caps_ = {}

        self.input_shape_ = X.shape

        return self
Ejemplo n.º 7
0
    def fit(self, X, y=None):
        """
        Compute the mean or median of each selected variable.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples. The whole dataframe may be passed; only
            the selected numerical variables are used.

        y : pandas series or None, default=None
            Not used by this imputer. Present for API compatibility.

        Attributes
        ----------

        imputer_dict_ : dictionary
            Maps each variable to its mean or median, according to
            `imputation_method`. These values replace missing data.

        input_shape_ : tuple
            The shape of the dataframe seen during fit.
        """
        X = _is_dataframe(X)

        # Validate the user-supplied variables, or pick the numerical ones.
        self.variables = _find_numerical_variables(X, self.variables)

        method = self.imputation_method
        if method == 'mean':
            statistics = X[self.variables].mean()
            self.imputer_dict_ = statistics.to_dict()
        elif method == 'median':
            statistics = X[self.variables].median()
            self.imputer_dict_ = statistics.to_dict()

        self.input_shape_ = X.shape

        return self
Ejemplo n.º 8
0
    def fit(self, X, y=None):
        """
        Compute the capping values for each selected variable.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.

        y : None
            Not used by this transformer. Present for API compatibility.

        Attributes
        ----------

        right_tail_caps_: dictionary
            Maps each variable to its upper cap. Left empty unless `tail` is
            'right' or 'both'.

        left_tail_caps_ : dictionary
            Maps each variable to its lower cap. Left empty unless `tail` is
            'left' or 'both'.

        input_shape_ : tuple
            The shape of the dataframe seen during fit.
        """
        X = _is_dataframe(X)

        # Validate the user-supplied variables, or pick the numerical ones.
        self.variables = _find_numerical_variables(X, self.variables)

        if self.missing_values == 'raise':
            _check_contains_na(X, self.variables)

        self.right_tail_caps_ = {}
        self.left_tail_caps_ = {}

        cap_right = self.tail in ['right', 'both']
        cap_left = self.tail in ['left', 'both']

        if cap_right or cap_left:
            data = X[self.variables]

            if self.distribution == 'gaussian':
                # caps sit `fold` standard deviations away from the mean
                if cap_right:
                    upper = data.mean() + self.fold * data.std()
                    self.right_tail_caps_ = upper.to_dict()
                if cap_left:
                    lower = data.mean() - self.fold * data.std()
                    self.left_tail_caps_ = lower.to_dict()

            elif self.distribution == 'skewed':
                # caps sit `fold` inter-quartile ranges beyond the quartiles
                upper_quartile = data.quantile(0.75)
                lower_quartile = data.quantile(0.25)
                iqr = upper_quartile - lower_quartile
                if cap_right:
                    upper = upper_quartile + (iqr * self.fold)
                    self.right_tail_caps_ = upper.to_dict()
                if cap_left:
                    lower = lower_quartile - (iqr * self.fold)
                    self.left_tail_caps_ = lower.to_dict()

            elif self.distribution == 'quantiles':
                # here `fold` is used directly as the tail quantile
                if cap_right:
                    upper = data.quantile(1 - self.fold)
                    self.right_tail_caps_ = upper.to_dict()
                if cap_left:
                    lower = data.quantile(self.fold)
                    self.left_tail_caps_ = lower.to_dict()

        self.input_shape_ = X.shape

        return self