Example #1
0
    def test_feature_detection(self):
        """Test automatic detection of metafeature extraction methods.

        Runs ``_internal.process_features`` against ``MFETestClass`` with
        every feature selected and no group filter, then checks the size
        of each of the three returned collections.
        """
        detected = _internal.process_features(
            features="all",
            groups=tuple(),
            suppress_warnings=True,
            custom_class_=MFETestClass)

        ft_names, ft_methods, ft_groups = detected

        # Three feature names, three matching method entries and a single
        # group are expected for the custom test class.
        assert len(ft_names) == 3
        assert len(ft_methods) == 3
        assert len(ft_groups) == 1
Example #2
0
    def test_feature_warning2(self):
        """Test memory error handling of feature extraction.

        Selects the ``memory_error`` feature from ``MFETestClass`` and
        calls it through ``_internal.get_feat_value`` with
        ``raise_mem_err=True`` and warnings enabled. The call is expected
        to emit a ``RuntimeWarning``.
        """
        # Each returned collection is converted to an array so the first
        # feature name and its method entry can be picked by index below.
        name, mtd, _groups = map(
            np.asarray,
            _internal.process_features(
                features="memory_error",
                groups=tuple(),
                suppress_warnings=True,
                custom_class_=MFETestClass,
            ),
        )

        with pytest.warns(RuntimeWarning):
            _internal.get_feat_value(
                mtd_name=name[0],
                mtd_args={
                    "X": np.array([]),
                    # ``np.array([])`` builds a well-defined empty array.
                    # ``np.ndarray([])`` would instead allocate an
                    # uninitialised 0-d array (``shape=[]``).
                    "y": np.array([]),
                    "raise_mem_err": True,
                },
                # NOTE(review): index 1 of the method entry is taken to be
                # the callable itself - confirm against process_features.
                mtd_callable=mtd[0][1],
                suppress_warnings=False,
            )
Example #3
0
    def __init__(self,
                 groups: t.Union[str, t.Iterable[str]] = "all",
                 features: t.Union[str, t.Iterable[str]] = "all",
                 summary: t.Union[str, t.Iterable[str]] = ("mean", "sd"),
                 measure_time: t.Optional[str] = None,
                 wildcard: str = "all",
                 score="accuracy",
                 folds=10,
                 sample_size=1.0,
                 suppress_warnings: bool = False,
                 random_state: t.Optional[int] = None) -> None:
        """This class provides easy access for metafeature extraction from
        datasets.

        It is expected that the user first calls the ``fit`` method after
        instantiation and then ``extract`` to effectively extract the
        selected metafeatures. Check reference [1]_ for more information.

        Parameters
        ----------
        groups : :obj:`Iterable` of :obj:`str` or :obj:`str`
            A collection or a single metafeature group name representing the
            desired group of metafeatures for extraction. The supported groups
            are:

                1. ``general``: general/simple metafeatures.
                2. ``statistical``: statistical metafeatures.
                3. ``info-theory``: information-theoretic type of metafeature.
                4. ``model-based``: metafeatures based on machine learning
                   model characteristics.
                5. ``landmarking``: metafeatures representing performance
                   metrics from simple machine learning models or machine
                   learning models induced with sampled data.

            The value provided by the argument ``wildcard`` can be used to
            select all metafeature groups rapidly.

        features : :obj:`Iterable` of :obj:`str` or :obj:`str`, optional
            A collection or a single metafeature name desired for extraction.
            Keep in mind that the extraction only gathers features also in the
            selected ``groups``. Check this class ``features`` attribute to
            get a list of available metafeatures from selected groups.

            The value provided by the argument ``wildcard`` can be used to
            select all features from all selected groups rapidly.

        summary : :obj:`Iterable` of :obj:`str` or :obj:`str`, optional
            A collection or a single summary function to summarize a group of
            metafeature measures into a fixed-length group of values,
            typically a single value. The values must be one of the following:

                1. ``mean``: Average of the values.
                2. ``sd``: Standard deviation of the values.
                3. ``count``: Computes the cardinality of the measure. Suitable
                   for variable cardinality.
                4. ``histogram``: Describes the distribution of the measured
                   values. Suitable for high cardinality.
                5. ``iq_range``: Computes the interquartile range of the
                   measured values.
                6. ``kurtosis``: Describes the shape of the measured values
                   distribution.
                7. ``max``: Results in the maximum value of the measure.
                8. ``median``: Results in the central value of the measure.
                9. ``min``: Results in the minimum value of the measure.
                10. ``quantiles``: Results in the minimum, first quartile,
                    median, third quartile and maximum of the measured values.
                11. ``range``: Computes the range of the measured values.
                12. ``skewness``: Describes the shape of the measured values
                    distribution in terms of symmetry.

            If more than one summary function is selected, then all multivalued
            extracted metafeatures are summarized with each summary function.

            The particular value provided by the argument ``wildcard`` can be
            used to select all summary functions rapidly.

        measure_time : :obj:`str`, optional
            Options for measuring the time elapsed during metafeature
            extraction. If this argument value is :obj:`NoneType`, no time
            elapsed is measured. Otherwise, this argument must be a :obj:`str`
            valued as one of the options below:

                1. ``avg``: average time for each metafeature (total time
                   divided by the feature cardinality, i.e., number of features
                   extracted by a single feature-extraction related method),
                   without summarization time.
                2. ``avg_summ``: average time for each metafeature (total time
                   of extraction divided by feature cardinality) including
                   required time for summarization.
                3. ``total``: total time for each metafeature, without
                   summarization time.
                4. ``total_summ``: total time for each metafeature including
                   the required time for summarization.

            The ``cardinality`` of the feature is the number of values
            extracted by a single calculation method.

            For example, ``mean`` feature has cardinality equal to the number
            of numeric features in the dataset, where ``cor`` (from
            ``correlation``) has cardinality equal to (N - 1)/2, where N is
            the number of numeric features in the dataset.

            The cardinality is used to divide the total execution time of that
            method if an option starting with ``avg`` is selected.

            If a summary method has cardinality higher than one (more than one
            value returned after summarization and, thus, creating more than
            one entry in the result lists) like, for example, ``histogram``
            summary method, then the corresponding time of this summary will be
            inserted only in the first correspondent element of the time list.
            The remaining entries are all filled with 0 value, to keep
            consistency between the size of all lists returned and index
            correspondence between them.

        wildcard : :obj:`str`, optional
            Value used as ``select all`` for ``groups``, ``features`` and
            ``summary`` arguments.

        score : :obj:`str`, optional
            Score metric used to extract ``landmarking`` metafeatures.

        folds : :obj:`int`, optional
            Number of folds to create a Stratified K-Fold cross validation
            to produce the ``landmarking`` metafeatures.

        sample_size : :obj:`float`, optional
            Sample proportion used to produce the ``landmarking`` metafeatures.
            This argument must be in 0.5 and 1.0 (both inclusive) interval.
            An :obj:`int` value is converted to :obj:`float` before the
            interval check.

        suppress_warnings : :obj:`bool`, optional
            If True, then ignore all warnings invoked at the instantiation
            time.

        random_state : :obj:`int`, optional
            Must be :obj:`NoneType` or an :obj:`int`. The value is stored in
            the ``random_state`` attribute and also handed to
            ``numpy.random.seed`` during instantiation, which affects the
            global numpy random generator.

        Raises
        ------
        ValueError
            If ``random_state`` is neither :obj:`NoneType` nor an :obj:`int`,
            if ``folds`` is not an :obj:`int`, or if ``sample_size`` is not a
            number in the [0.5, 1.0] interval. The helper functions used to
            process the remaining arguments may raise their own exceptions.

        Notes
        -----
            .. [1] Rivolli et al. "Towards Reproducible Empirical
               Research in Meta-Learning,".
               Rivolli et al. URL: https://arxiv.org/abs/1808.10406

        Examples
        --------

        Load a dataset

        >>> from sklearn.datasets import load_iris
        >>> from pymfe.mfe import MFE

        >>> data = load_iris()
        >>> y = data.target
        >>> X = data.data

        Extract all measures

        >>> mfe = MFE()
        >>> mfe.fit(X, y)
        >>> ft = mfe.extract()
        >>> print(ft)

        Extract general, statistical and information-theoretic measures

        >>> mfe = MFE(groups=["general", "statistical", "info-theory"])
        >>> mfe.fit(X, y)
        >>> ft = mfe.extract()
        >>> print(ft)

        """
        # NOTE(review): ``wildcard`` is forwarded only to
        # ``process_features`` below; the group and summary helpers are
        # called without it. Confirm they fall back to a matching default.
        self.groups = _internal.process_generic_set(
            values=groups, group_name="groups")  # type: t.Tuple[str, ...]

        # The group selection may be changed by the dependency solver; the
        # second returned value is kept in ``inserted_group_dep``.
        self.groups, self.inserted_group_dep = (
            _internal.solve_group_dependencies(
                groups=self.groups))

        proc_feat = _internal.process_features(
            features=features,
            groups=self.groups,
            suppress_warnings=suppress_warnings,
            wildcard=wildcard,
        )  # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt, t.Tuple[str, ...]]

        # ``process_features`` also returns a group collection, which
        # replaces the previously stored one.
        self.features, self._metadata_mtd_ft, self.groups = proc_feat
        del proc_feat

        self.summary, self._metadata_mtd_sm = _internal.process_summary(
            summary)  # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt]

        self.timeopt = _internal.process_generic_option(
            value=measure_time, group_name="timeopt",
            allow_none=True)  # type: t.Optional[str]

        self.X = None  # type: t.Optional[np.ndarray]
        self.y = None  # type: t.Optional[np.ndarray]

        self._custom_args_ft = None  # type: t.Optional[t.Dict[str, t.Any]]
        """User-independent arguments for ft. methods (e.g. ``X`` and ``y``)"""

        self._custom_args_sum = None  # type: t.Optional[t.Dict[str, t.Any]]
        """User-independent arguments for summary functions methods."""

        self._attr_indexes_num = None  # type: t.Optional[t.Tuple[int, ...]]
        """Numeric column indexes from ``X`` (independent attributes)."""

        self._attr_indexes_cat = None  # type: t.Optional[t.Tuple[int, ...]]
        """Categoric column indexes from ``X`` (independent attributes)."""

        self._precomp_args_ft = None  # type: t.Optional[t.Dict[str, t.Any]]
        """Precomputed common feature-extraction method arguments."""

        self._postprocess_args_ft = {}  # type: t.Dict[str, t.Any]
        """User-independent arguments for post-processing methods."""

        if random_state is not None and not isinstance(random_state, int):
            raise ValueError(
                'Invalid "random_state" argument ({0}). '
                'Expecting None or an integer.'.format(random_state))

        self.random_state = random_state
        # Seeds the global numpy random generator (a None seed is accepted
        # by numpy as well).
        np.random.seed(random_state)

        if not isinstance(folds, int):
            # Report the offending ``folds`` value itself in the message.
            raise ValueError('Invalid "folds" argument ({0}). '
                             'Expecting an integer.'.format(folds))

        self.folds = folds

        # Integer proportions (e.g. ``1``) are accepted and normalized to
        # float before the interval check.
        if isinstance(sample_size, int):
            sample_size = float(sample_size)

        if not (isinstance(sample_size, float)
                and 0.5 <= sample_size <= 1.0):
            # Report the offending ``sample_size`` value in the message.
            raise ValueError('Invalid "sample_size" argument ({0}). '
                             'Expecting an float [0.5, 1].'
                             .format(sample_size))

        self.sample_size = sample_size

        self.score = _internal.check_score(score, self.groups)