Example #1
0
 def test_preprocessing_invalid(self):
     """Check that a failing precomputation surfaces as a ``UserWarning``.

     ``process_precomp_groups`` is invoked with empty ``precomp_groups`` and
     ``groups`` tuples, ``MFETestClass`` as the custom class and the extra
     keyword ``raise_exception=True``. The call must emit at least one
     ``UserWarning`` for the test to pass. How the flag triggers the warning
     is decided outside this test (presumably inside ``MFETestClass`` --
     confirm against that class).
     """
     call_kwargs = {
         "precomp_groups": (),
         "groups": (),
         "custom_class_": MFETestClass,
         "raise_exception": True,
     }

     with pytest.warns(UserWarning):
         _internal.process_precomp_groups(**call_kwargs)
Example #2
0
 def test_mem_err_precompute(self):
     """Check that the ``raise_mem_err`` scenario yields a ``UserWarning``.

     ``process_precomp_groups`` is invoked with empty ``precomp_groups`` and
     ``groups`` tuples, ``MFETestClass`` as the custom class and the extra
     keyword ``raise_mem_err=True``. The call must emit at least one
     ``UserWarning`` for the test to pass. The flag name suggests a memory
     error is simulated during precomputation -- confirm against
     ``MFETestClass``.
     """
     call_kwargs = {
         "precomp_groups": (),
         "groups": (),
         "custom_class_": MFETestClass,
         "raise_mem_err": True,
     }

     with pytest.warns(UserWarning):
         _internal.process_precomp_groups(**call_kwargs)
Example #3
0
    def test_preprocessing_valid(self):
        """Check that a default precomputation call returns a non-empty result.

        ``process_precomp_groups`` is invoked with empty ``precomp_groups``
        and ``groups`` tuples and ``MFETestClass`` as the custom class; the
        returned collection must contain at least one item.
        """
        call_kwargs = {
            "precomp_groups": (),
            "groups": (),
            "custom_class_": MFETestClass,
        }

        cached_values = _internal.process_precomp_groups(**call_kwargs)

        assert len(cached_values) > 0
Example #4
0
    def fit(
            self,
            X: t.Sequence,
            y: t.Sequence,
            transform_num: bool = True,
            transform_cat: bool = True,
            rescale: t.Optional[str] = None,
            rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
            cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
            check_bool: bool = False,
            precomp_groups: t.Optional[str] = "all",
            wildcard: str = "all",
            suppress_warnings: bool = False,
    ) -> "MFE":
        """Fit a dataset into this MFE model.

        The data is validated, columns are split into categorical and numeric
        ones, both views of the data are prepared, and the argument
        dictionaries used later by the extraction, precomputation,
        postprocessing and summarization steps are stored on the instance.

        Parameters
        ----------
        X : :obj:`Sequence`
            Predictive attributes of the dataset.

        y : :obj:`Sequence`
            Target attributes of the dataset, assuming a supervised task.

        transform_num : :obj:`bool`, optional
            If True, numeric attributes are discretized with an
            equal-frequency histogram technique so they can be used together
            with categorical data when extracting categoric-only
            metafeatures. Numeric-only metafeatures still use the original
            numeric values, not the discretized ones. If False, numeric
            attributes are ignored for categoric-only metafeatures.

        transform_cat : :obj:`bool`, optional
            If True, categorical attributes are binarized using a model
            matrix so they can be used together with numeric data when
            extracting numeric-only metafeatures. Categoric-only metafeatures
            still use the original categoric values, not the binarized ones.
            If False, categorical attributes are ignored for numeric-only
            metafeatures.

            The formula used for this transformation is the union (+) of all
            categoric attributes, written in the formula language of the
            ``patsy`` package API with the intercept term removed:
            ``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of
            attributes and ``A_i`` is the i-th categoric attribute,
            1 <= i <= n.

        rescale : :obj:`str`, optional
            If :obj:`NoneType`, numeric data keeps its original values.
            Otherwise, one of the options below selects how all numeric
            values are rescaled:

                1. ``standard``: set numeric data to zero mean and unit
                   variance (also known as ``z-score`` normalization). See
                   the documentation of
                   ``sklearn.preprocessing.StandardScaler``.

                2. ``min-max``: set numeric data to the interval [a, b],
                   a < b. The values of ``a`` and ``b`` can be given through
                   ``rescale_args``; the defaults are a = 0.0 and b = 1.0.
                   See the documentation of
                   ``sklearn.preprocessing.MinMaxScaler``.

                3. ``robust``: rescale data using statistics that are robust
                   to the presence of outliers. See the documentation of
                   ``sklearn.preprocessing.RobustScaler``.

        rescale_args : :obj:`dict`, optional
            Parameters for rescaling the data, used only if ``rescale`` is
            not :obj:`NoneType`. Keys are parameter names (strings) and
            values are the corresponding parameter values.

        cat_cols : :obj:`Sequence` of :obj:`int` or :obj:`str`, optional
            Categorical columns of the dataset. If :obj:`NoneType` or an
            empty sequence, all columns are assumed numeric. If ``auto``, an
            automatic detection is attempted while fitting the dataset.

        check_bool : :obj:`bool`, optional
            If ``cat_cols`` is ``auto`` and this flag is True, every column
            with precisely two distinct values is also treated as a
            categorical (boolean) column, regardless of its data type.
            Otherwise, such columns may be considered numeric depending on
            their data type.

        precomp_groups : :obj:`str`, optional
            Metafeature groups whose common values should be cached and
            shared among the metafeature extraction methods (e.g.
            ``classes`` or ``covariance``). This may speed up extraction but
            also consumes more memory, so it may not be suitable for huge
            datasets.

        wildcard : :obj:`str`, optional
            Value used as ``select all`` for ``precomp_groups``.

        suppress_warnings : :obj:`bool`, optional
            If True, ignore all warnings invoked while fitting the dataset.
            In this method the flag is forwarded to the precomputation step.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the number of rows of X and the length of y do not match.
        TypeError
            If X or y (or both) is neither a :obj:`list` nor a
            :obj:`np.ndarray` object.

        Notes
        -----
        A strategy for handling missing values (``missing_data``) is not
        implemented and is not a parameter of this method.

        """
        checked_X, checked_y = _internal.check_data(X, y)
        self.X = checked_X
        self.y = checked_y

        # Validate the rescale option before any column bookkeeping happens.
        rescale_option = _internal.process_generic_option(
            value=rescale,
            group_name="rescale",
            allow_none=True,
        )

        # Column indexes must be classified before building the data views.
        self._fill_col_ind_by_type(cat_cols=cat_cols, check_bool=check_bool)

        categoric_view = self._set_data_categoric(transform_num=transform_num)
        numeric_view = self._set_data_numeric(
            transform_cat=transform_cat,
            rescale=rescale_option,
            rescale_args=rescale_args,
        )

        # Arguments made available to the metafeature extraction methods.
        self._custom_args_ft = {
            "X": self.X,
            "N": numeric_view,
            "C": categoric_view,
            "y": self.y,
            "folds": self.folds,
            "sample_size": self.sample_size,
            "score": self.score,
            "random_state": self.random_state,
            "cat_cols": self._attr_indexes_cat,
        }

        # Arguments produced by the precomputation methods; they receive the
        # same custom arguments as the extraction methods.
        precomp_options = {
            "precomp_groups": precomp_groups,
            "groups": self.groups,
            "wildcard": wildcard,
            "suppress_warnings": suppress_warnings,
        }
        self._precomp_args_ft = _internal.process_precomp_groups(
            **precomp_options, **self._custom_args_ft)

        # Arguments made available to the postprocessing methods.
        self._postprocess_args_ft = {
            "inserted_group_dep": self.inserted_group_dep,
        }

        # Arguments made available to the summarization methods.
        self._custom_args_sum = {
            "ddof": 1,
        }

        return self