コード例 #1
0
def test_check_input_parameter_variables():
    """Exercise ``_check_input_parameter_variables`` with valid and invalid input."""
    # Lists, None, a single string and a single integer must compare equal to
    # (or, for None, be) what was passed in.
    duplicated_names = ["var1", "var2", "var1"]
    assert _check_input_parameter_variables(duplicated_names) == [
        "var1",
        "var2",
        "var1",
    ]
    assert _check_input_parameter_variables([0, 1, 2, 3]) == [0, 1, 2, 3]
    assert _check_input_parameter_variables(None) is None
    assert _check_input_parameter_variables("var1") == "var1"
    assert _check_input_parameter_variables(0) == 0

    # Tuples, sets and dictionaries are expected to raise a ValueError.
    rejected_inputs = (
        ("var1", "var2"),
        {"var1", "var2"},
        {"var1": 1, "var2": 2},
    )
    for rejected in rejected_inputs:
        with pytest.raises(ValueError):
            assert _check_input_parameter_variables(rejected)
コード例 #2
0
    def __init__(
        self,
        estimator,
        scoring: str = "roc_auc",
        cv=3,
        threshold: Union[int, float] = None,
        variables: Variables = None,
        confirm_variables: bool = False,
    ):
        """
        Validate ``threshold`` and store the constructor arguments.

        Parameters
        ----------
        estimator :
            Stored unchanged on the instance.
        scoring : str, default="roc_auc"
            Metric name. ``threshold`` is range-checked only when this is
            "roc_auc" or "r2".
        cv : default=3
            Stored unchanged on the instance.
        threshold : int, float or None, default=None
            Must lie within [0.5, 1] for "roc_auc" and within [0, 1] for "r2".
            ``None`` skips the validation.
        variables : default=None
            Passed through ``_check_input_parameter_variables`` before storing.
        confirm_variables : bool, default=False
            Forwarded to the parent class initializer.

        Raises
        ------
        ValueError
            If ``threshold`` is not None and is neither int nor float, or if it
            lies outside the range allowed for ``scoring``.
        """

        # Test against None explicitly: a threshold of 0 is falsy, so a plain
        # truthiness check would let it bypass every check below.
        if threshold is not None:
            if not isinstance(threshold, (int, float)):
                raise ValueError(
                    "threshold can only be integer, float or None")

            if scoring == "roc_auc" and (threshold < 0.5 or threshold > 1):
                raise ValueError(
                    "roc-auc score should vary between 0.5 and 1. Pick a "
                    "threshold within this interval.")

            if scoring == "r2" and (threshold < 0 or threshold > 1):
                raise ValueError(
                    "r2 takes values between -1 and 1. To select features the "
                    "transformer considers the absolute value. Pick a threshold within "
                    "0 and 1.")

        super().__init__(confirm_variables)
        self.variables = _check_input_parameter_variables(variables)
        self.estimator = estimator
        self.scoring = scoring
        self.threshold = threshold
        self.cv = cv
コード例 #3
0
    def __init__(
        self,
        estimator=RandomForestClassifier(),
        scoring: str = "roc_auc",
        cv: int = 3,
        threshold: Union[int, float] = None,
        variables: Variables = None,
    ):
        """
        Validate ``cv`` and ``threshold`` and store the constructor arguments.

        Parameters
        ----------
        estimator : default=RandomForestClassifier()
            Stored unchanged on the instance.
        scoring : str, default="roc_auc"
            Metric name. ``threshold`` is range-checked only when this is
            "roc_auc" or "r2".
        cv : int, default=3
            Must be an integer; values below 1 are rejected.
        threshold : int, float or None, default=None
            Must lie within [0.5, 1] for "roc_auc" and within [0, 1] for "r2".
            ``None`` skips the validation.
        variables : default=None
            Passed through ``_check_input_parameter_variables`` before storing.

        Raises
        ------
        ValueError
            If ``cv`` is not an integer >= 1, or if ``threshold`` is not None
            and is either not numeric or outside the range for ``scoring``.
        """
        # NOTE(review): the default estimator object is created once, when this
        # function is defined, so every instance relying on the default receives
        # the same object. Confirm this is intended.

        if not isinstance(cv, int) or cv < 1:
            raise ValueError(
                "cv can only take positive integers bigger than 1")

        # Test against None explicitly: a threshold of 0 is falsy, so a plain
        # truthiness check would let it bypass every check below.
        if threshold is not None:
            if not isinstance(threshold, (int, float)):
                raise ValueError(
                    "threshold can only be integer, float or None")

            if scoring == "roc_auc" and (threshold < 0.5 or threshold > 1):
                raise ValueError(
                    "roc-auc score should vary between 0.5 and 1. Pick a "
                    "threshold within this interval.")

            if scoring == "r2" and (threshold < 0 or threshold > 1):
                raise ValueError(
                    "r2 takes values between -1 and 1. To select features the "
                    "transformer considers the absolute value. Pick a threshold within "
                    "0 and 1.")

        self.variables = _check_input_parameter_variables(variables)
        self.estimator = estimator
        self.scoring = scoring
        self.threshold = threshold
        self.cv = cv
コード例 #4
0
    def __init__(
        self,
        tol: float = 0.05,
        n_categories: int = 10,
        max_n_categories: Optional[int] = None,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        replace_with: str = "Rare",
    ) -> None:
        """
        Validate the arguments and store them on the instance.

        Parameters
        ----------
        tol : float, default=0.05
            Must lie between 0 and 1 (inclusive).
        n_categories : int, default=10
            Must be a non-negative integer.
        max_n_categories : int or None, default=None
            When given, must be a non-negative integer.
        variables : default=None
            Passed through ``_check_input_parameter_variables`` before storing.
        replace_with : str, default="Rare"
            Must be a string.

        Raises
        ------
        ValueError
            If any of the checks above fails.
        """

        if tol < 0 or tol > 1:
            raise ValueError("tol takes values between 0 and 1")

        # Check the type first: comparing a non-numeric value with 0 would
        # raise TypeError before the intended ValueError could be produced.
        if not isinstance(n_categories, int) or n_categories < 0:
            raise ValueError("n_categories takes only positive integer numbers")

        if max_n_categories is not None:
            if not isinstance(max_n_categories, int) or max_n_categories < 0:
                raise ValueError("max_n_categories takes only positive integer numbers")

        if not isinstance(replace_with, str):
            raise ValueError("replace_with takes only strings as values.")

        self.tol = tol
        self.n_categories = n_categories
        self.max_n_categories = max_n_categories
        self.variables = _check_input_parameter_variables(variables)
        self.replace_with = replace_with
コード例 #5
0
    def __init__(
        self,
        estimator=RandomForestClassifier(),
        scoring: str = "roc_auc",
        cv: int = 3,
        threshold: Union[int, float] = 0.5,
        variables: Variables = None,
    ):
        """Validate ``cv`` and ``threshold``, then store the arguments.

        Raises ``ValueError`` when ``cv`` is not an integer >= 1, when
        ``threshold`` is neither int nor float, or when ``scoring`` is
        "roc_auc" and ``threshold`` falls outside [0.5, 1].
        """

        cv_is_valid = isinstance(cv, int) and cv >= 1
        if not cv_is_valid:
            raise ValueError("cv can only take positive integers bigger than 1")

        threshold_is_numeric = isinstance(threshold, (int, float))
        if not threshold_is_numeric:
            raise ValueError("threshold can only be integer or float")

        if scoring == "roc_auc":
            if threshold < 0.5 or threshold > 1:
                raise ValueError(
                    "roc-auc score should vary between 0.5 and 1. "
                    "Pick a threshold within this interval."
                )

        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.estimator = estimator
        self.scoring = scoring
        self.threshold = threshold
        self.cv = cv
コード例 #6
0
    def __init__(
        self,
        capping_method: str = "gaussian",
        tail: str = "right",
        fold: Union[int, float] = 3,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        missing_values: str = "raise",
    ) -> None:
        """Validate the capping configuration and store it on the instance.

        Raises ``ValueError`` for an unknown ``capping_method``, ``tail`` or
        ``missing_values``, for a non-positive ``fold``, and for ``fold``
        greater than 0.2 when ``capping_method`` is "quantiles".
        """

        allowed_methods = ("gaussian", "iqr", "quantiles")
        if capping_method not in allowed_methods:
            raise ValueError(
                "capping_method takes only values 'gaussian', 'iqr' or 'quantiles'"
            )

        allowed_tails = ("right", "left", "both")
        if tail not in allowed_tails:
            raise ValueError("tail takes only values 'right', 'left' or 'both'")

        if fold <= 0:
            raise ValueError("fold takes only positive numbers")

        if capping_method == "quantiles":
            if fold > 0.2:
                raise ValueError(
                    "with capping_method ='quantiles', fold takes values "
                    "between 0 and 0.20 only."
                )

        allowed_missing = ("raise", "ignore")
        if missing_values not in allowed_missing:
            raise ValueError("missing_values takes only values 'raise' or 'ignore'")

        self.capping_method = capping_method
        self.tail = tail
        self.fold = fold
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.missing_values = missing_values
コード例 #7
0
    def __init__(
        self,
        tol: float = 0.05,
        n_categories: int = 10,
        max_n_categories: Optional[int] = None,
        replace_with: Union[str, int, float] = "Rare",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        ignore_format: bool = False,
    ) -> None:
        """
        Validate the arguments and store them on the instance.

        Parameters
        ----------
        tol : float, default=0.05
            Must lie between 0 and 1 (inclusive).
        n_categories : int, default=10
            Must be a non-negative integer.
        max_n_categories : int or None, default=None
            When given, must be a non-negative integer.
        replace_with : str, int or float, default="Rare"
            Stored unchanged; not validated here.
        variables : default=None
            Passed through ``_check_input_parameter_variables`` before storing.
        ignore_format : bool, default=False
            Must be a boolean.

        Raises
        ------
        ValueError
            If any of the checks above fails.
        """

        if tol < 0 or tol > 1:
            raise ValueError("tol takes values between 0 and 1")

        # Check the type first: comparing a non-numeric value with 0 would
        # raise TypeError before the intended ValueError could be produced.
        if not isinstance(n_categories, int) or n_categories < 0:
            raise ValueError(
                "n_categories takes only positive integer numbers")

        if max_n_categories is not None:
            if not isinstance(max_n_categories, int) or max_n_categories < 0:
                raise ValueError(
                    "max_n_categories takes only positive integer numbers")

        if not isinstance(ignore_format, bool):
            raise ValueError(
                "ignore_format takes only booleans True and False")

        self.tol = tol
        self.n_categories = n_categories
        self.max_n_categories = max_n_categories
        self.replace_with = replace_with
        self.variables = _check_input_parameter_variables(variables)
        self.ignore_format = ignore_format
コード例 #8
0
    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        random_state: Union[None, int, str, List[Union[str, int]]] = None,
        seed: str = "general",
        seeding_method: str = "add",
    ) -> None:
        """
        Validate the seeding configuration and store it on the instance.

        Parameters
        ----------
        variables : default=None
            Passed through ``_check_input_parameter_variables`` before storing.
        random_state : default=None
            With ``seed="general"`` it must be an integer or None. With
            ``seed="observation"`` it must name one or more variables.
        seed : str, default="general"
            Either "general" or "observation".
        seeding_method : str, default="add"
            Either "add" or "multiply".

        Raises
        ------
        ValueError
            If ``seed`` or ``seeding_method`` is not one of the allowed values,
            or if ``random_state`` does not fit the chosen ``seed``.
        """

        if seed not in ["general", "observation"]:
            raise ValueError(
                "seed takes only values 'general' or 'observation'")

        if seeding_method not in ["add", "multiply"]:
            raise ValueError(
                "seeding_method takes only values 'add' or 'multiply'")

        # Compare against None so that falsy non-integers (e.g. "" or [])
        # cannot slip past the type check.
        if seed == "general" and random_state is not None:
            if not isinstance(random_state, int):
                raise ValueError(
                    "if seed == 'general' then random_state must take an integer"
                )

        # The integer 0 is an acceptable variable name, so only None and empty
        # strings/containers count as "no seeding variables given".
        no_seed_variables = random_state is None or (
            not isinstance(random_state, int) and not random_state
        )
        if seed == "observation" and no_seed_variables:
            raise ValueError(
                "if seed == 'observation' the random state must take the name of one "
                "or more variables which will be used to seed the imputer")

        self.variables = _check_input_parameter_variables(variables)
        self.random_state = random_state
        self.seed = seed
        self.seeding_method = seeding_method
コード例 #9
0
ファイル: one_hot.py プロジェクト: thibaultbl/feature_engine
    def __init__(
        self,
        top_categories: Optional[int] = None,
        drop_last: bool = False,
        drop_last_binary: bool = False,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        ignore_format: bool = False,
    ) -> None:
        """Validate the argument types and store them on the instance.

        Raises ``ValueError`` when a truthy ``top_categories`` is not an
        integer, or when ``drop_last``, ``drop_last_binary`` or
        ``ignore_format`` is not a boolean.
        """

        if top_categories:
            if not isinstance(top_categories, int):
                raise ValueError(
                    "top_categories takes only integer numbers, 1, 2, 3, etc."
                )

        # Both flags share the same message template, so check them in a loop
        # (in the same order as they appear in the signature).
        boolean_flags = (
            ("drop_last", drop_last),
            ("drop_last_binary", drop_last_binary),
        )
        for flag_name, flag_value in boolean_flags:
            if not isinstance(flag_value, bool):
                raise ValueError(f"{flag_name} takes only True or False")

        if not isinstance(ignore_format, bool):
            raise ValueError("ignore_format takes only booleans True and False")

        self.top_categories = top_categories
        self.drop_last = drop_last
        self.drop_last_binary = drop_last_binary
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.ignore_format = ignore_format
コード例 #10
0
    def __init__(
        self,
        cv: int = 3,
        scoring: str = "neg_mean_squared_error",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        param_grid: Optional[Dict[str, Union[str, int, float,
                                             List[int]]]] = None,
        regression: bool = True,
        random_state: Optional[int] = None,
    ) -> None:
        """
        Validate ``cv`` and ``regression`` and store the configuration.

        Parameters
        ----------
        cv : int, default=3
            Must be a positive integer.
        scoring : str, default="neg_mean_squared_error"
            Stored unchanged on the instance.
        variables : default=None
            Passed through ``_check_input_parameter_variables`` before storing.
        param_grid : dict or None, default=None
            When None, ``{"max_depth": [1, 2, 3, 4]}`` is used.
        regression : bool, default=True
            Must be a boolean.
        random_state : int or None, default=None
            Stored unchanged on the instance.

        Raises
        ------
        ValueError
            If ``cv`` is not a positive integer or ``regression`` is not a
            boolean.
        """

        # Build the default inside the function so no dictionary is shared
        # between instances.
        if param_grid is None:
            param_grid = {"max_depth": [1, 2, 3, 4]}

        # Reject 0 as well as negatives: the message promises positive integers.
        if not isinstance(cv, int) or cv < 1:
            raise ValueError("cv can only take only positive integers")

        if not isinstance(regression, bool):
            raise ValueError("regression can only take True or False")

        self.cv = cv
        self.scoring = scoring
        self.regression = regression
        self.variables = _check_input_parameter_variables(variables)
        self.param_grid = param_grid
        self.random_state = random_state
コード例 #11
0
 def __init__(
     self,
     variables: Union[None, int, str, List[Union[str, int]]] = None,
     percent_threshold=0.02,
     other_val='_OTHER_',
 ):
     """Check ``variables`` and store all three arguments on the instance."""
     checked_variables = _check_input_parameter_variables(variables)
     self.variables = checked_variables
     self.percent_threshold = percent_threshold
     self.other_val = other_val
コード例 #12
0
 def __init__(
     self,
     variables: Union[None, int, str, List[Union[str, int]]] = None,
     max_levels=20,
     other_val='_OTHER_',
 ):
     """Check ``variables`` and store all three arguments on the instance."""
     checked_variables = _check_input_parameter_variables(variables)
     self.variables = checked_variables
     self.max_levels = max_levels
     self.other_val = other_val
コード例 #13
0
 def __init__(
     self,
     variables: Union[None, int, str, List[Union[str, int]]] = None,
     cum_percent=0.95,
     other_val='_OTHER_',
 ):
     """Check ``variables`` and store all three arguments on the instance."""
     checked_variables = _check_input_parameter_variables(variables)
     self.variables = checked_variables
     self.cum_percent = cum_percent
     self.other_val = other_val
コード例 #14
0
    def __init__(
        self,
        variables: Variables = None,
        method: str = "pearson",
        threshold: float = 0.8,
        missing_values: str = "ignore",
        selection_method: str = "missing_values",
        estimator=None,
        scoring: str = "roc_auc",
        cv: int = 3,
    ):
        """
        Validate the selector configuration and store it on the instance.

        Parameters
        ----------
        variables : default=None
            Passed through ``_check_input_parameter_variables`` before storing.
        method : str, default="pearson"
            One of "pearson", "spearman" or "kendall".
        threshold : float, default=0.8
            Must be a float between 0 and 1 (integers are rejected).
        missing_values : str, default="ignore"
            Either "raise" or "ignore".
        selection_method : str, default="missing_values"
            One of "missing_values", "cardinality", "variance" or
            "model_performance".
        estimator : default=None
            Required when ``selection_method`` is "model_performance".
        scoring : str, default="roc_auc"
            Stored unchanged on the instance.
        cv : int, default=3
            Must be an integer; values below 1 are rejected.

        Raises
        ------
        ValueError
            If any argument fails the checks above, or if the combination of
            ``selection_method`` with ``estimator`` / ``missing_values`` is
            inconsistent.
        """

        if method not in ["pearson", "spearman", "kendall"]:
            raise ValueError(
                "correlation method takes only values 'pearson', 'spearman', 'kendall'"
            )

        if not isinstance(threshold, float) or threshold < 0 or threshold > 1:
            raise ValueError("threshold must be a float between 0 and 1")

        if missing_values not in ["raise", "ignore"]:
            raise ValueError(
                "missing_values takes only values 'raise' or 'ignore'.")

        if selection_method not in [
                "missing_values",
                "cardinality",
                "variance",
                "model_performance",
        ]:
            raise ValueError(
                "selection_method takes only values 'missing_values', 'cardinality', "
                "'variance' or 'model_performance'.")

        if not isinstance(cv, int) or cv < 1:
            raise ValueError(
                "cv can only take positive integers bigger than 1")

        if selection_method == "model_performance" and estimator is None:
            raise ValueError("Please provide an estimator, e.g., "
                             "RandomForestClassifier or select another "
                             "selection_method")

        # Selecting by fewest missing values is incompatible with raising on
        # missing values, so that combination is rejected.
        if selection_method == "missing_values" and missing_values == "raise":
            raise ValueError(
                "To select the variables with least missing values, we "
                "need to allow this transformer to contemplate variables "
                "with NaN by setting missing_values to 'ignore'.")

        self.variables = _check_input_parameter_variables(variables)
        self.method = method
        self.threshold = threshold
        self.missing_values = missing_values
        self.selection_method = selection_method
        self.estimator = estimator
        self.scoring = scoring
        self.cv = cv
コード例 #15
0
    def __init__(
        self,
        variables: Variables = None,
        scoring: str = "roc_auc_score",
        threshold: float = 0.5,
        bins: int = 5,
        strategy: str = "equal_width",
        cv: int = 3,
        random_state: int = None,
    ):
        """Validate each argument in turn, then store them on the instance.

        Raises ``ValueError`` for an unsupported ``scoring`` or ``strategy``,
        a non-numeric or out-of-range ``threshold``, or a ``cv`` that is not
        an integer greater than 1. Raises ``TypeError`` when ``bins`` or a
        truthy ``random_state`` is not an integer.
        """

        supported_scores = ("roc_auc_score", "r2_score")
        if scoring not in supported_scores:
            raise ValueError(
                "At the moment, the selector can evaluate only the roc_auc and r2 "
                "scores. Please enter either 'roc_auc_score' or 'r2_score' for the "
                "parameter 'scoring'"
            )

        threshold_is_numeric = isinstance(threshold, (int, float))
        if not threshold_is_numeric:
            raise ValueError("threshold can only take integer or float")

        if scoring == "roc_auc_score":
            if threshold < 0.5 or threshold > 1:
                raise ValueError(
                    "roc-auc score should vary between 0.5 and 1. "
                    "Pick a threshold within this interval."
                )

        if scoring == "r2_score":
            if threshold < 0 or threshold > 1:
                raise ValueError(
                    "r2 score should vary between 0 and 1. "
                    "Pick a threshold within this interval."
                )

        if not isinstance(bins, int):
            raise TypeError("'bins' takes only integers")

        supported_strategies = ("equal_width", "equal_frequency")
        if strategy not in supported_strategies:
            raise ValueError(
                "'strategy' takes boolean values 'equal_width' and 'equal_frequency'."
            )

        cv_is_valid = isinstance(cv, int) and cv > 1
        if not cv_is_valid:
            raise ValueError("cv takes integers bigger than 1")

        if random_state:
            if not isinstance(random_state, int):
                raise TypeError("'random_state' takes only integers")

        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.scoring = scoring
        self.threshold = threshold
        self.bins = bins
        self.strategy = strategy
        self.cv = cv
        self.random_state = random_state
コード例 #16
0
    def __init__(
        self,
        variables: Variables = None,
        missing_values: str = "ignore",
    ):
        """Validate ``missing_values`` and store both arguments.

        Raises ``ValueError`` unless ``missing_values`` is "raise" or "ignore".
        """

        allowed_missing = ("raise", "ignore")
        if missing_values not in allowed_missing:
            raise ValueError("missing_values takes only values 'raise' or 'ignore'.")

        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.missing_values = missing_values
コード例 #17
0
    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        exp: Union[float, int] = 0.5,
    ):
        """Validate ``exp`` and store both arguments.

        Raises ``ValueError`` when ``exp`` is neither a float nor an int.
        """

        exp_is_numeric = isinstance(exp, (int, float))
        if not exp_is_numeric:
            raise ValueError("exp must be a float or an int")

        self.exp = exp
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
コード例 #18
0
    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        ignore_format: bool = False,
    ) -> None:
        """Validate ``ignore_format`` and store both arguments.

        Raises ``ValueError`` when ``ignore_format`` is not a boolean.
        """

        format_flag_is_bool = isinstance(ignore_format, bool)
        if not format_flag_is_bool:
            raise ValueError("ignore_format takes only booleans True and False")

        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.ignore_format = ignore_format
コード例 #19
0
    def __init__(
        self,
        imputation_method: str = "median",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:
        """Validate ``imputation_method`` and store both arguments.

        Raises ``ValueError`` unless ``imputation_method`` is "median" or
        "mean".
        """

        allowed_methods = ("median", "mean")
        if imputation_method not in allowed_methods:
            raise ValueError(
                "imputation_method takes only values 'median' or 'mean'"
            )

        self.imputation_method = imputation_method
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
コード例 #20
0
    def __init__(
        self,
        missing_only: bool = True,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:
        """Validate ``missing_only`` and store both arguments.

        Raises ``ValueError`` when ``missing_only`` is not a boolean.
        """

        flag_is_bool = isinstance(missing_only, bool)
        if not flag_is_bool:
            raise ValueError("missing_only takes values True or False")

        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.missing_only = missing_only
コード例 #21
0
ファイル: log.py プロジェクト: solegalli/feature_engine
    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        base: str = "e",
    ) -> None:
        """Validate ``base`` and store both arguments.

        Raises ``ValueError`` unless ``base`` is the string "e" or "10".
        """

        allowed_bases = ("e", "10")
        if base not in allowed_bases:
            raise ValueError("base can take only '10' or 'e' as values")

        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.base = base
コード例 #22
0
    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """
        Makes a copy of the train set. Only stores a copy of the variables to impute.
        This copy is then used to randomly extract the values to fill the missing data
        during transform.

        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training dataset. Only a copy of the indicated variables will be stored
            in the transformer.

        y : None
            y is not needed in this imputation. You can pass None or y.

        Raises
        ------
        TypeError
            If the input is not a Pandas DataFrame (presumably raised by
            ``_is_dataframe``; confirm against that helper).
        ValueError
            If ``seed`` is "observation" and any variable named in
            ``random_state`` is not a column of ``X``.

        Returns
        -------
        self
        """

        # check input dataframe
        X = _is_dataframe(X)

        # find variables to impute: fall back to every column when none were given
        if not self.variables:
            self.variables = list(X.columns)

        # take a copy of the selected variables
        self.X_ = X[self.variables].copy()

        # check the variables assigned to the random state
        if self.seed == "observation":
            self.random_state = _check_input_parameter_variables(self.random_state)
            # normalise a single variable name to a one-element list
            if isinstance(self.random_state, (int, str)):
                self.random_state = [self.random_state]
            # Test membership itself, not the truthiness of the missing names:
            # a missing column called 0 or "" must still be reported.
            if self.random_state and any(
                var not in X.columns for var in self.random_state
            ):
                raise ValueError(
                    "There are variables assigned as random state which are not part "
                    "of the training dataframe."
                )
        self.input_shape_ = X.shape

        return self
コード例 #23
0
    def __init__(
        self,
        encoding_method: str = "ordered",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:
        """Validate ``encoding_method`` and store both arguments.

        Raises ``ValueError`` unless ``encoding_method`` is "ordered" or
        "arbitrary".
        """

        allowed_methods = ("ordered", "arbitrary")
        if encoding_method not in allowed_methods:
            raise ValueError(
                "encoding_method takes only values 'ordered' and 'arbitrary'"
            )

        self.encoding_method = encoding_method
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
コード例 #24
0
    def __init__(
        self,
        transformer,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:
        """Validate ``transformer`` and store both arguments.

        Raises ``TypeError`` when ``transformer`` is not an instance of
        ``BaseEstimator``.
        """

        is_estimator = isinstance(transformer, BaseEstimator)
        if not is_estimator:
            raise TypeError(
                f"transformer expected a Scikit-learn transformer, got {transformer} "
                "instead."
            )

        self.transformer = transformer
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
コード例 #25
0
    def __init__(
        self,
        encoding_method: str = "count",
        variables: Union[None, int, str, List[Union[str, int]]] = None,
    ) -> None:
        """Validate ``encoding_method`` and store both arguments.

        Raises ``ValueError`` unless ``encoding_method`` is "count" or
        "frequency".
        """

        allowed_methods = ("count", "frequency")
        if encoding_method not in allowed_methods:
            raise ValueError(
                "encoding_method takes only values 'count' and 'frequency'"
            )

        self.encoding_method = encoding_method
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
コード例 #26
0
    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        transformer=None,
    ) -> None:
        """Store both arguments, then reject a sparse ``OneHotEncoder``.

        Raises ``AttributeError`` when ``transformer`` is a ``OneHotEncoder``
        whose ``sparse`` attribute is truthy.
        """
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.transformer = transformer

        # The attributes are assigned first, as before; the check runs last.
        if isinstance(transformer, OneHotEncoder):
            if transformer.sparse:
                raise AttributeError(
                    "The SklearnTransformerWrapper can only wrap the OneHotEncoder "
                    "if you set its sparse attribute to False"
                )
コード例 #27
0
    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        features_to_extract: Union[None, str, List[str]] = None,
        drop_original: bool = True,
        missing_values: str = "raise",
        dayfirst: bool = False,
        yearfirst: bool = False,
        utc: Union[None, bool] = None,
    ) -> None:
        """Validate the arguments and store them on the instance.

        Raises ``ValueError`` when a truthy ``features_to_extract`` is neither
        a list nor "all", when a listed feature is not in
        ``FEATURES_SUPPORTED``, when ``drop_original`` is not a boolean, when
        ``missing_values`` is not "raise" or "ignore", or when ``utc`` is
        neither None nor a boolean.
        """

        if features_to_extract:
            given_as_list = isinstance(features_to_extract, list)

            if not given_as_list and features_to_extract != "all":
                raise ValueError(
                    "features_to_extract must be a list of strings or 'all'. "
                    f"Got {features_to_extract} instead."
                )

            if given_as_list:
                for requested in features_to_extract:
                    if requested not in FEATURES_SUPPORTED:
                        supported = ", ".join(FEATURES_SUPPORTED)
                        raise ValueError(
                            "Some of the requested features are not supported. "
                            f"Supported features are {supported}."
                        )

        drop_flag_is_bool = isinstance(drop_original, bool)
        if not drop_flag_is_bool:
            raise ValueError(
                f"drop_original takes only booleans True or False. Got {drop_original} "
                "instead."
            )

        allowed_missing = ("raise", "ignore")
        if missing_values not in allowed_missing:
            raise ValueError(
                "missing_values takes only values 'raise' or 'ignore'. "
                f"Got {missing_values} instead."
            )

        if utc is not None:
            if not isinstance(utc, bool):
                raise ValueError(f"utc takes only booleans or None. Got {utc} instead.")

        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
        self.drop_original = drop_original
        self.missing_values = missing_values
        self.dayfirst = dayfirst
        self.yearfirst = yearfirst
        self.utc = utc
        self.features_to_extract = features_to_extract
コード例 #28
0
    def __init__(
        self,
        arbitrary_number: Union[int, float] = 999,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        imputer_dict: Optional[dict] = None,
    ) -> None:
        """Validate ``arbitrary_number`` and store the three arguments.

        ``variables`` goes through ``_check_input_parameter_variables`` and
        ``imputer_dict`` through ``_define_numerical_dict`` before storing.
        Raises ``ValueError`` when ``arbitrary_number`` is neither int nor
        float.
        """

        # Guard clause: reject non-numeric values before touching the instance.
        if not isinstance(arbitrary_number, (int, float)):
            raise ValueError("arbitrary_number must be numeric of type int or float")

        self.arbitrary_number = arbitrary_number

        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables

        numerical_dict = _define_numerical_dict(imputer_dict)
        self.imputer_dict = numerical_dict
コード例 #29
0
    def __init__(
        self,
        variables: Union[None, int, str, List[Union[str, int]]] = None,
        bins: int = 10,
        return_object: bool = False,
        return_boundaries: bool = False,
    ) -> None:
        """Validate ``bins``, initialise the parent class and store the arguments.

        ``return_object`` and ``return_boundaries`` are forwarded positionally
        to the parent initializer. Raises ``ValueError`` when ``bins`` is not
        an integer.
        """

        bins_is_int = isinstance(bins, int)
        if not bins_is_int:
            raise ValueError(f"bins must be an integer. Got {bins} instead.")

        super().__init__(return_object, return_boundaries)

        self.bins = bins
        checked_variables = _check_input_parameter_variables(variables)
        self.variables = checked_variables
コード例 #30
0
    def __init__(
        self,
        variables: Variables = None,
        bins: int = 5,
        strategy: str = "equal_width",
        scoring: str = "roc_auc",
        cv=3,
        threshold: Union[int, float] = None,
        regression: bool = False,
        confirm_variables: bool = False,
    ):
        """
        Validate the selector configuration and store it on the instance.

        Parameters
        ----------
        variables : default=None
            Passed through ``_check_input_parameter_variables`` before storing.
        bins : int, default=5
            Must be an integer.
        strategy : str, default="equal_width"
            Either "equal_width" or "equal_frequency".
        scoring : str, default="roc_auc"
            Must be in ``_REGRESSION_METRICS`` when ``regression`` is True and
            in ``_CLASSIFICATION_METRICS`` when it is False.
        cv : default=3
            Stored unchanged on the instance.
        threshold : int, float or None, default=None
            When given, must be an int or a float.
        regression : bool, default=False
            Selects which metric collection ``scoring`` is checked against.
        confirm_variables : bool, default=False
            Forwarded to the parent class initializer.

        Raises
        ------
        ValueError
            If any of the checks above fails.
        """

        if not isinstance(bins, int):
            raise ValueError(f"bins must be an integer. Got {bins} instead.")

        if strategy not in ["equal_width", "equal_frequency"]:
            raise ValueError(
                "strategy takes only values 'equal_width' or 'equal_frequency'. "
                f"Got {strategy} instead."
            )

        if threshold is not None and not isinstance(threshold, (int, float)):
            raise ValueError(
                "threshold can only take integer or float. " f"Got {threshold} instead."
            )

        if regression is True and scoring not in _REGRESSION_METRICS:
            raise ValueError(
                f"The metric {scoring} is not suitable for regression. Set the "
                "parameter regression to False or choose a different performance "
                "metric."
            )

        if regression is False and scoring not in _CLASSIFICATION_METRICS:
            # The first fragment ends with a space so the concatenated message
            # reads "Set the parameter ..." rather than "Set theparameter ...".
            raise ValueError(
                f"The metric {scoring} is not suitable for classification. Set the "
                "parameter regression to True or choose a different performance "
                "metric."
            )

        super().__init__(confirm_variables)
        self.variables = _check_input_parameter_variables(variables)
        self.bins = bins
        self.strategy = strategy
        self.scoring = scoring
        self.cv = cv
        self.threshold = threshold
        self.regression = regression