Example #1
    def extract(
            self,
            remove_nan: bool = True,
            verbose: bool = False,
            enable_parallel: bool = False,
            suppress_warnings: bool = False,
            **kwargs) -> t.Tuple[t.Sequence, ...]:
        """Extracts metafeatures from the previously fitted dataset.

        Parameters
        ----------
        remove_nan : :obj:`bool`, optional
            If True, remove any non-numeric values (such as NaN) before
            summarizing the values gathered from the feature-extraction
            methods. Note that the summary methods may still remove
            non-numeric values by themselves; in that case, the user must
            adjust this behavior through the built-in arguments of those
            summary methods, passed via kwargs, if possible.

        verbose : :obj:`bool`, optional
            If True, print messages related to the metafeature extraction
            process. Note that warning messages are not affected by this option
            (see ``suppress_warnings`` argument below).

        enable_parallel : :obj:`bool`, optional
            If True, the metafeature extraction is performed using multiple
            processes. Currently, this argument has no effect (parallel
            extraction is yet to be implemented).

        by_class : :obj:`bool`, optional
            Not implemented yet.

        suppress_warnings : :obj:`bool`, optional
            If True, do not show warnings about unknown custom parameters
            passed via kwargs to the feature-extraction and summary methods.
            Note that these methods may still raise warnings of their own;
            in that case, just as with ``remove_nan``, the user must suppress
            them through the built-in arguments of those methods, passed via
            kwargs, if possible.

        kwargs:
            Used to pass custom arguments to both feature-extraction and
            summary methods. The expected format is the following:

            {``mtd_name``: {``arg_name``: arg_value, ...}, ...}

            In words, each key of ``**kwargs`` is the name of a target method
            that receives the custom arguments, and each value is another
            dictionary mapping that method's argument names to the
            corresponding values. See the ``Examples`` section below for a
            clearer explanation.

        Returns
        -------
        :obj:`tuple`(:obj:`list`, :obj:`list`)
            A tuple containing two lists (plus a third list with the elapsed
            times, if the model was configured to measure extraction time).

            The first field is the identifiers of each summarized value in the
            form ``feature_name.summary_mtd_name`` (i.e., the feature
            extraction name concatenated by the summary method name, separated
            by a dot).

            The second field is the summarized values.

            Both lists have a 1-1 correspondence by the index of each element
            (i.e., the value at index ``i`` in the second list has its
            identifier at the same index in the first list and vice-versa).

            Example:
                ([``attr_ent.mean``, ``attr_ent.sd``], [``0.983``, ``0.344``])
                is the return value for the feature ``attr_ent`` summarized by
                both ``mean`` and ``sd`` (standard deviation), giving the
                values ``0.983`` and ``0.344``, respectively.

        Raises
        ------
        TypeError
            If the ``extract`` method is called before the ``fit`` method.

        Examples
        --------
        Using kwargs. Option 1 to pass custom feature-extraction arguments:

        >>> args = {
        ...     'sd': {'ddof': 2},
        ...     '1NN': {'metric': 'minkowski', 'p': 2},
        ...     'leaves': {'max_depth': 4},
        ... }

        >>> model = MFE().fit(X=data, y=labels)
        >>> result = model.extract(**args)

        Option 2 (note: metafeature names starting with a number, such as
        ``1NN``, cannot be passed as keyword arguments):

        >>> model = MFE().fit(X=data, y=labels)
        >>> res = model.extract(sd={'ddof': 2}, leaves={'max_depth': 4})

        """
        if self.X is None or self.y is None:
            raise TypeError("Fitted data not found. Call "
                            '"fit" method before "extract".')

        if (not isinstance(self.X, np.ndarray)
                or not isinstance(self.y, np.ndarray)):
            self.X, self.y = _internal.check_data(self.X, self.y)

        if verbose:
            print("Started the metafeature extraction process.")

        results = self._call_feature_methods(
            remove_nan=remove_nan,
            verbose=verbose,
            enable_parallel=enable_parallel,
            suppress_warnings=suppress_warnings,
            **kwargs)

        _internal.post_processing(
            results=results,
            groups=self.groups,
            suppress_warnings=suppress_warnings,
            **self._postprocess_args_ft,
            **kwargs)

        if results and results[0]:
            # Sort results by metafeature name
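            # (zip(*results) groups each metafeature name with its value and
            # elapsed time, sorted() orders those tuples by name, and the
            # outer zip/map transposes them back into three parallel lists.)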
            results = tuple(
                map(list, zip(*sorted(zip(*results),
                                      key=lambda item: item[0]))))

        res_names, res_vals, res_times = results

        if verbose:
            if self._timeopt_type_is_avg():
                time_type = "average"
            else:
                time_type = "total"

            print(
                "Metafeature extraction process done.",
                "Total of {0} values obtained. Time elapsed "
                "({1}) = {2:.8f} seconds.".format(
                    len(res_vals), time_type, sum(res_times)),
                sep="\n")

        if self.timeopt:
            return res_names, res_vals, res_times

        return res_names, res_vals
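A short end-to-end sketch of the fit/extract workflow documented above. The
``pymfe.mfe`` import path, the ``summary`` constructor argument, and the iris
dataset are assumptions used only for illustration:

from sklearn.datasets import load_iris
from pymfe.mfe import MFE

data = load_iris()

# Custom arguments follow the {method_name: {arg_name: value}} format
# described for ``kwargs`` above.
model = MFE(summary=["mean", "sd"]).fit(X=data.data, y=data.target)
names, values = model.extract(sd={"ddof": 2})

# The two returned lists have a 1-1 correspondence by index.
for name, value in zip(names, values):
    print("{0}: {1}".format(name, value))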
Example #2
    def test_error_check_data(self):
        # check_data should reject a target that is neither a list nor a
        # numpy array (an empty string here) by raising TypeError.
        X, y = load_xy(0)
        with pytest.raises(TypeError):
            _internal.check_data(X, y='')
Example #3
    def fit(self,
            X: t.Sequence,
            y: t.Sequence,
            transform_num: bool = True,
            transform_cat: bool = True,
            rescale: t.Optional[str] = None,
            rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
            cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
            check_bool: bool = False,
            precomp_groups: t.Optional[str] = "all",
            wildcard: str = "all",
            suppress_warnings: bool = False,
            ) -> "MFE":
        """Fits dataset into an MFE model.

        Parameters
        ----------
        X : :obj:`Sequence`
            Predictive attributes of the dataset.

        y : :obj:`Sequence`
            Target attributes of the dataset, assuming that it is a supervised
            task.

        transform_num : :obj:`bool`, optional
            If True, numeric attributes are discretized using an
            equal-frequency histogram technique so that they can be used
            alongside categorical data when extracting categoric-only
            metafeatures (see the discretization sketch after this listing).
            Note that numeric-only features still use the original numeric
            values, not the discretized ones. If False, numeric attributes
            are ignored for categoric-only metafeatures.

        transform_cat : :obj:`bool`, optional
            If True, categorical attributes are binarized using a model
            matrix so that they can be used alongside numerical data when
            extracting numeric-only metafeatures. Note that categoric-only
            features still use the original categoric values, not the
            binarized ones. If False, categorical attributes are ignored for
            numeric-only metafeatures.

            The formula used for this transformation is simply the union (+)
            of all categoric attributes, written in the formula language of
            the ``patsy`` package API with the intercept term removed:
            ``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of categoric
            attributes and ``A_i`` is the i-th categoric attribute,
            1 <= i <= n (see the ``patsy`` sketch after this listing).

        rescale : :obj:`str`, optional
            If :obj:`NoneType`, the model keeps all numeric data with its
            original values. Otherwise, this argument can assume one of the
            string options below to rescale all numeric values:

                1. ``standard``: set numeric data to zero mean, unit variance.
                   Also known as ``z-score`` normalization. Check the
                   documentation of ``sklearn.preprocessing.StandardScaler``
                   for in-depth information.

                2. ``min-max``: set numeric data to interval [a, b], a < b. It
                   is possible to define values to ``a`` and ``b`` using
                   argument ``rescale_args``. The default values are a = 0.0
                   and b = 1.0. Check ``sklearn.preprocessing.MinMaxScaler``
                   documentation for more information.

                3. ``robust``: rescale data using statistics robust to the
                   presence of outliers. For in-depth information, check
                   documentation of ``sklearn.preprocessing.RobustScaler``.

        rescale_args : :obj:`dict`, optional
            Dictionary containing parameters for rescaling data. Used only if
            ``rescale`` argument is not :obj:`NoneType`. The dictionary keys
            are the parameter names (as strings) and the dictionary values
            are the corresponding parameter values.

        cat_cols : :obj:`Sequence` of :obj:`int` or :obj:`str`, optional
            Categorical columns of dataset. If given :obj:`NoneType` or an
            empty sequence, assume all columns as numeric. If given value
            ``auto``, then an attempt of automatic detection is performed while
            fitting the dataset.

        check_bool : :obj:`bool`, optional
            If ``cat_cols`` is ``auto`` and this flag is True, assume that
            every column with precisely two distinct values is also a
            categorical (boolean) column, regardless of its data type.
            Otherwise, such columns may be considered numeric depending on
            their data type.

        missing_data : :obj:`str`, optional
            Defines the strategy used to handle missing values in the data.
            Not implemented yet.

        precomp_groups : :obj:`str`, optional
            Defines which metafeature groups should have their common values
            (e.g., ``classes`` or ``covariance``) precomputed and cached to
            be shared among the various metafeature-extraction methods. This
            may speed up metafeature extraction but also consumes more
            memory, so it may not be suitable for huge datasets.

        wildcard : :obj:`str`, optional
            Value used as ``select all`` for ``precomp_groups``.

        suppress_warnings : :obj:`bool`, optional
            If True, ignore all warnings invoked while fitting dataset.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the number of rows of X does not match the length of y.
        TypeError
            If X or y (or both) is neither a :obj:`list` nor a
            :obj:`np.ndarray` object.

        """
        self.X, self.y = _internal.check_data(X, y)

        rescale = _internal.process_generic_option(
            value=rescale, group_name="rescale", allow_none=True)

        self._fill_col_ind_by_type(cat_cols=cat_cols, check_bool=check_bool)

        data_cat = self._set_data_categoric(transform_num=transform_num)
        data_num = self._set_data_numeric(
            transform_cat=transform_cat,
            rescale=rescale,
            rescale_args=rescale_args)

        # Custom arguments for metafeature extraction methods
        self._custom_args_ft = {
            "X": self.X,
            "N": data_num,
            "C": data_cat,
            "y": self.y,
            "folds": self.folds,
            "sample_size": self.sample_size,
            "score": self.score,
            "random_state": self.random_state,
            "cat_cols": self._attr_indexes_cat,
        }

        # Custom arguments from preprocessing methods
        self._precomp_args_ft = _internal.process_precomp_groups(
            precomp_groups=precomp_groups,
            groups=self.groups,
            wildcard=wildcard,
            suppress_warnings=suppress_warnings,
            **self._custom_args_ft)

        # Custom arguments for postprocessing methods
        self._postprocess_args_ft = {
            "inserted_group_dep": self.inserted_group_dep,
        }

        # Custom arguments for summarization methods
        self._custom_args_sum = {
            "ddof": 1,
        }

        return self
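The equal-frequency discretization mentioned for ``transform_num`` can be
illustrated with ``pandas.qcut``; this is only a rough sketch of the idea,
not necessarily the exact binning performed by the model:

import numpy as np
import pandas as pd

values = np.array([0.1, 0.3, 0.7, 1.5, 2.0, 2.2, 3.9, 4.1])

# qcut splits the values into bins holding (roughly) the same number of
# observations, i.e., an equal-frequency histogram.
bins = pd.qcut(values, q=4, labels=False)
print(bins)  # bin indices, e.g. [0 0 1 1 2 2 3 3]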
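The ``~ 0 + A_1 + ... + A_n`` model matrix described for ``transform_cat``
can be reproduced directly with ``patsy``; the column names below are
hypothetical and used only for illustration:

import pandas as pd
import patsy

frame = pd.DataFrame({
    "color": ["red", "blue", "red", "green"],
    "shape": ["circle", "square", "square", "circle"],
})

# Union (+) of all categoric attributes with the intercept removed ("0 + ...").
design = patsy.dmatrix("0 + color + shape", data=frame,
                       return_type="dataframe")
print(design.columns.tolist())
# One indicator column per category level (the exact labels depend on
# patsy's coding scheme).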
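The three ``rescale`` options correspond to the scikit-learn scalers cited
above; a quick side-by-side comparison on a toy column, for illustration only:

import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

column = np.array([[1.0], [2.0], [3.0], [100.0]])

print(StandardScaler().fit_transform(column).ravel())  # zero mean, unit variance
print(MinMaxScaler(feature_range=(0.0, 1.0)).fit_transform(column).ravel())
print(RobustScaler().fit_transform(column).ravel())    # robust to the outlier 100.0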