    def test_error_process_generic_set_1(self, values, group_name,
                                         allow_none, allow_empty):
        with pytest.raises(ValueError):
            _internal.process_generic_set(values=values,
                                          group_name=group_name,
                                          allow_none=allow_none,
                                          allow_empty=allow_empty)

    def test_error_process_generic_set_2(self):
        with pytest.raises(TypeError):
            _internal.process_generic_set(values=[1, 2, 3], group_name=None)
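    # The error cases above have no "happy path" counterpart in this section.
    # The method below is a minimal sketch of one, under two assumptions not
    # confirmed here: that ``process_generic_set`` returns a tuple of
    # validated names (as its use in ``MFE.__init__`` with the
    # ``# type: t.Tuple[str, ...]`` annotation suggests) and that "general"
    # is a valid value for the "groups" group (per the ``MFE`` docstring).
    def test_process_generic_set_valid_value_sketch(self):
        result = _internal.process_generic_set(values="general",
                                               group_name="groups")
        assert isinstance(result, tuple)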
    def __init__(self,
                 groups: t.Union[str, t.Iterable[str]] = "all",
                 features: t.Union[str, t.Iterable[str]] = "all",
                 summary: t.Union[str, t.Iterable[str]] = ("mean", "sd"),
                 measure_time: t.Optional[str] = None,
                 wildcard: str = "all",
                 score="accuracy",
                 folds=10,
                 sample_size=1.0,
                 suppress_warnings: bool = False,
                 random_state: t.Optional[int] = None) -> None:
        """This class provides easy access to metafeature extraction from
        datasets.

        It is expected that the user first calls the ``fit`` method after
        instantiation and then ``extract`` to effectively extract the
        selected metafeatures. Check reference [1]_ for more information.

        Parameters
        ----------
        groups : :obj:`Iterable` of :obj:`str` or :obj:`str`
            A collection or a single metafeature group name representing the
            desired group of metafeatures for extraction. The supported
            groups are:

                1. ``general``: general/simple metafeatures.
                2. ``statistical``: statistical metafeatures.
                3. ``info-theory``: information-theoretic metafeatures.
                4. ``model-based``: metafeatures based on machine learning
                   model characteristics.
                5. ``landmarking``: metafeatures representing performance
                   metrics from simple machine learning models or from
                   machine learning models induced with sampled data.

            The value provided by the argument ``wildcard`` can be used to
            select all metafeature groups rapidly.

        features : :obj:`Iterable` of :obj:`str` or :obj:`str`, optional
            A collection or a single metafeature name desired for extraction.
            Keep in mind that the extraction only gathers features that are
            also in the selected ``groups``. Check this class' ``features``
            attribute to get a list of available metafeatures from the
            selected groups.

            The value provided by the argument ``wildcard`` can be used to
            select all features from all selected groups rapidly.

        summary : :obj:`Iterable` of :obj:`str` or :obj:`str`, optional
            A collection or a single summary function to summarize a group of
            metafeature measures into a fixed-length group of values,
            typically a single value. The values must be one of the
            following:

                1. ``mean``: Average of the values.
                2. ``sd``: Standard deviation of the values.
                3. ``count``: Computes the cardinality of the measure.
                   Suitable for variable cardinality.
                4. ``histogram``: Describes the distribution of the measured
                   values. Suitable for high cardinality.
                5. ``iq_range``: Computes the interquartile range of the
                   measured values.
                6. ``kurtosis``: Describes the shape of the measured values
                   distribution.
                7. ``max``: Results in the maximum value of the measure.
                8. ``median``: Results in the central value of the measure.
                9. ``min``: Results in the minimum value of the measure.
                10. ``quantiles``: Results in the minimum, first quartile,
                    median, third quartile and maximum of the measured
                    values.
                11. ``range``: Computes the range of the measured values.
                12. ``skewness``: Describes the shape of the measured values
                    distribution in terms of symmetry.

            If more than one summary function is selected, then all
            multivalued extracted metafeatures are summarized with each
            summary function.

            The particular value provided by the argument ``wildcard`` can be
            used to select all summary functions rapidly.

        measure_time : :obj:`str`, optional
            Options for measuring the time elapsed during metafeature
            extraction. If this argument value is :obj:`NoneType`, no elapsed
            time is measured. Otherwise, this argument must be a :obj:`str`
            valued as one of the options below:

                1. ``avg``: average time for each metafeature (total time
                   divided by the feature cardinality, i.e., the number of
                   features extracted by a single feature-extraction related
                   method), without summarization time.
                2. ``avg_summ``: average time for each metafeature (total
                   time of extraction divided by the feature cardinality)
                   including the required time for summarization.
                3. ``total``: total time for each metafeature, without
                   summarization time.
                4. ``total_summ``: total time for each metafeature including
                   the required time for summarization.

            The ``cardinality`` of a feature is the number of values
            extracted by a single calculation method. For example, the
            ``mean`` feature has cardinality equal to the number of numeric
            features in the dataset, while ``cor`` (from ``correlation``) has
            cardinality equal to N * (N - 1) / 2, where N is the number of
            numeric features in the dataset. The cardinality is used to
            divide the total execution time of that method if an option
            starting with ``avg`` is selected.

            If a summary method has cardinality higher than one (more than
            one value returned after summarization and, thus, more than one
            entry in the result lists), such as the ``histogram`` summary
            method, then the corresponding time of this summary is inserted
            only in the first corresponding element of the time list. The
            remaining entries are filled with 0 to keep the sizes of all
            returned lists consistent and their indexes in correspondence.

        wildcard : :obj:`str`, optional
            Value used as ``select all`` for the ``groups``, ``features`` and
            ``summary`` arguments.

        score : :obj:`str`, optional
            Score metric used to extract ``landmarking`` metafeatures.

        folds : :obj:`int`, optional
            Number of folds of the Stratified K-Fold cross-validation used to
            produce the ``landmarking`` metafeatures.

        sample_size : :obj:`float`, optional
            Sample proportion used to produce the ``landmarking``
            metafeatures. This argument must be in the interval [0.5, 1.0]
            (both ends inclusive).

        suppress_warnings : :obj:`bool`, optional
            If True, then ignore all warnings invoked at instantiation time.

        random_state : :obj:`int`, optional
            Seed for the pseudo-random number generator. Must be
            :obj:`NoneType` or an integer.

        Notes
        -----
        .. [1] Rivolli et al. "Towards Reproducible Empirical Research in
           Meta-Learning". URL: https://arxiv.org/abs/1808.10406

        Examples
        --------

        Load a dataset

        >>> from sklearn.datasets import load_iris
        >>> from pymfe.mfe import MFE

        >>> data = load_iris()
        >>> y = data.target
        >>> X = data.data

        Extract all measures

        >>> mfe = MFE()
        >>> mfe.fit(X, y)
        >>> ft = mfe.extract()
        >>> print(ft)

        Extract general, statistical and information-theoretic measures

        >>> mfe = MFE(groups=["general", "statistical", "info-theory"])
        >>> mfe.fit(X, y)
        >>> ft = mfe.extract()
        >>> print(ft)
        """
        self.groups = _internal.process_generic_set(
            values=groups, group_name="groups")  # type: t.Tuple[str, ...]
        self.groups, self.inserted_group_dep = (
            _internal.solve_group_dependencies(
                groups=self.groups))

        proc_feat = _internal.process_features(
            features=features,
            groups=self.groups,
            suppress_warnings=suppress_warnings,
            wildcard=wildcard,
        )  # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt, t.Tuple[str, ...]]

        self.features, self._metadata_mtd_ft, self.groups = proc_feat
        del proc_feat

        self.summary, self._metadata_mtd_sm = _internal.process_summary(
            summary)  # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt]

        self.timeopt = _internal.process_generic_option(
            value=measure_time, group_name="timeopt",
            allow_none=True)  # type: t.Optional[str]

        self.X = None  # type: t.Optional[np.ndarray]
        self.y = None  # type: t.Optional[np.ndarray]

        self._custom_args_ft = None  # type: t.Optional[t.Dict[str, t.Any]]
        """User-independent arguments for ft. methods (e.g. ``X`` and ``y``)"""

        self._custom_args_sum = None  # type: t.Optional[t.Dict[str, t.Any]]
        """User-independent arguments for summary functions methods."""

        self._attr_indexes_num = None  # type: t.Optional[t.Tuple[int, ...]]
        """Numeric column indexes from ``X`` (independent attributes)."""

        self._attr_indexes_cat = None  # type: t.Optional[t.Tuple[int, ...]]
        """Categoric column indexes from ``X`` (independent attributes)."""

        self._precomp_args_ft = None  # type: t.Optional[t.Dict[str, t.Any]]
        """Precomputed common feature-extraction method arguments."""

        self._postprocess_args_ft = {}  # type: t.Dict[str, t.Any]
        """User-independent arguments for post-processing methods."""

        if random_state is None or isinstance(random_state, int):
            self.random_state = random_state
            np.random.seed(random_state)

        else:
            raise ValueError(
                'Invalid "random_state" argument ({0}). '
                'Expecting None or an integer.'.format(random_state))

        if isinstance(folds, int):
            self.folds = folds

        else:
            raise ValueError('Invalid "folds" argument ({0}). '
                             'Expecting an integer.'.format(folds))

        if isinstance(sample_size, int):
            sample_size = float(sample_size)

        if isinstance(sample_size, float) and 0.5 <= sample_size <= 1.0:
            self.sample_size = sample_size

        else:
            raise ValueError('Invalid "sample_size" argument ({0}). '
                             'Expecting a float in [0.5, 1.0].'
                             .format(sample_size))

        self.score = _internal.check_score(score, self.groups)
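    # Hedged usage sketch (not from the original source): the validation
    # above accepts, for instance, the argument combination below. It mirrors
    # the docstring examples; ``measure_time="avg"``, ``folds=5``,
    # ``sample_size=0.8`` and ``random_state=42`` are illustrative choices
    # within the documented options and ranges.
    #
    #   >>> from sklearn.datasets import load_iris
    #   >>> from pymfe.mfe import MFE
    #   >>> data = load_iris()
    #   >>> mfe = MFE(groups=["general", "landmarking"], measure_time="avg",
    #   ...           folds=5, sample_size=0.8, random_state=42)
    #   >>> mfe.fit(data.data, data.target)
    #   >>> ft = mfe.extract()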