def test_error_process_generic_option_1(self, value, group_name, allow_none,
                                        allow_empty):
    """Invalid argument combinations must make the option check fail."""
    bad_kwargs = {
        "value": value,
        "group_name": group_name,
        "allow_none": allow_none,
        "allow_empty": allow_empty,
    }

    with pytest.raises(ValueError):
        _internal.process_generic_option(**bad_kwargs)
def test_error_process_generic_option_3(self):
    """An unknown keyword (plural ``values``) must raise TypeError."""
    with pytest.raises(TypeError):
        _internal.process_generic_option(
            group_name="timeopt", values=[1, 2, 3])
def fit(self,
        X: t.Sequence,
        y: t.Sequence,
        transform_num: bool = True,
        transform_cat: bool = True,
        rescale: t.Optional[str] = None,
        rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
        cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
        check_bool: bool = False,
        precomp_groups: t.Optional[str] = "all",
        wildcard: str = "all",
        suppress_warnings: bool = False,
        ) -> "MFE":
    """Fits dataset into an MFE model.

    Parameters
    ----------
    X : :obj:`Sequence`
        Predictive attributes of the dataset.

    y : :obj:`Sequence`
        Target attributes of the dataset, assuming that it is a
        supervised task.

    transform_num : :obj:`bool`, optional
        If True, numeric attributes are discretized using equal-frequency
        histogram technique to use alongside categorical data when
        extracting categoric-only metafeatures. Note that numeric-only
        features still uses the original numeric values, not the
        discretized ones. If False, then numeric attributes are ignored
        for categorical-only meta-features.

    transform_cat : :obj:`bool`, optional
        If True, categorical attributes are binarized using a model
        matrix to use when alongside numerical data while extracting
        numeric-only metafeatures. Note that categoric-only features
        still uses the original categoric values, not the binarized
        ones. If False, then categorical attributes are ignored for
        numeric-only metafeatures.

        The formula used for this transformation is just the union (+)
        of all categoric attributes using formula language from
        ``patsy`` package API, removing the intercept terms:
        ``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of
        attributes and A_i is the ith categoric attribute, 1 <= i <= n.

    rescale : :obj:`str`, optional
        If :obj:`NoneType`, the model keeps all numeric data with its
        original values. Otherwise, this argument can assume one of the
        string options below to rescale all numeric values:

        1. ``standard``: set numeric data to zero mean, unit variance.
           Also known as ``z-score`` normalization. Check the
           documentation of ``sklearn.preprocessing.StandardScaler``
           for in-depth information.

        2. ``min-max``: set numeric data to interval [a, b], a < b. It
           is possible to define values to ``a`` and ``b`` using
           argument ``rescale_args``. The default values are a = 0.0
           and b = 1.0. Check ``sklearn.preprocessing.MinMaxScaler``
           documentation for more information.

        3. ``robust``: rescale data using statistics robust to the
           presence of outliers. For in-depth information, check
           documentation of ``sklearn.preprocessing.RobustScaler``.

    rescale_args : :obj:`dict`, optional
        Dictionary containing parameters for rescaling data. Used only
        if ``rescale`` argument is not :obj:`NoneType`. These
        dictionary keys are the parameter names as strings and the
        values, the corresponding parameter value.

    cat_cols : :obj:`Sequence` of :obj:`int` or :obj:`str`, optional
        Categorical columns of dataset. If given :obj:`NoneType` or an
        empty sequence, assume all columns as numeric. If given value
        ``auto``, then an attempt of automatic detection is performed
        while fitting the dataset.

    check_bool : :obj:`bool`, optional
        If `cat_cols` is ``auto``, and this flag is True, assume that
        all columns with precisely two different values is also a
        categorical (boolean) column, independently of its data type.
        Otherwise, these columns may be considered numeric depending on
        their data type.

    precomp_groups : :obj:`str`, optional
        Defines which metafeature groups common values should be cached
        to share among various meta-feature extraction related methods
        (e.g. ``classes``, or ``covariance``). This argument may speed
        up meta-feature extraction but also consumes more memory, so it
        may not be suitable for huge datasets.

    wildcard : :obj:`str`, optional
        Value used as ``select all`` for ``precomp_groups``.

    suppress_warnings : :obj:`bool`, optional
        If True, ignore all warnings invoked while fitting dataset.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the number of rows of X and y length does not match.
    TypeError
        If X or y (or both) is neither a :obj:`list` or a
        :obj:`np.ndarray` object.
    """
    # Validate and normalize the raw data before anything else; raises
    # TypeError/ValueError on malformed input (see "Raises" above).
    self.X, self.y = _internal.check_data(X, y)

    # Normalize the ``rescale`` option (None is a valid value here).
    rescale = _internal.process_generic_option(
        value=rescale, group_name="rescale", allow_none=True)

    # Detect (or accept) categorical column indexes before building the
    # per-type views of the data below.
    self._fill_col_ind_by_type(cat_cols=cat_cols, check_bool=check_bool)

    data_cat = self._set_data_categoric(transform_num=transform_num)
    data_num = self._set_data_numeric(
        transform_cat=transform_cat,
        rescale=rescale,
        rescale_args=rescale_args)

    # Custom arguments for metafeature extraction methods
    self._custom_args_ft = {
        "X": self.X,
        "N": data_num,
        "C": data_cat,
        "y": self.y,
        "folds": self.folds,
        "sample_size": self.sample_size,
        "score": self.score,
        "random_state": self.random_state,
        "cat_cols": self._attr_indexes_cat,
    }

    # Custom arguments from preprocessing methods
    self._precomp_args_ft = _internal.process_precomp_groups(
        precomp_groups=precomp_groups,
        groups=self.groups,
        wildcard=wildcard,
        suppress_warnings=suppress_warnings,
        **self._custom_args_ft)

    # Custom arguments for postprocessing methods
    self._postprocess_args_ft = {
        "inserted_group_dep": self.inserted_group_dep,
    }

    # Custom arguments for summarization methods
    self._custom_args_sum = {
        "ddof": 1,
    }

    return self
def __init__(self,
             groups: t.Union[str, t.Iterable[str]] = "all",
             features: t.Union[str, t.Iterable[str]] = "all",
             summary: t.Union[str, t.Iterable[str]] = ("mean", "sd"),
             measure_time: t.Optional[str] = None,
             wildcard: str = "all",
             score="accuracy",
             folds=10,
             sample_size=1.0,
             suppress_warnings: bool = False,
             random_state: t.Optional[int] = None) -> None:
    """This class provides easy access for metafeature extraction from
    datasets.

    It expected that user first calls ``fit`` method after
    instantiation and then ``extract`` for effectively extract the
    selected metafeatures. Check reference [1]_ for more information.

    Parameters
    ----------
    groups : :obj:`Iterable` of :obj:`str` or :obj:`str`
        A collection or a single metafeature group name representing
        the desired group of metafeatures for extraction. The supported
        groups are:

        1. ``general``: general/simples metafeatures.
        2. ``statistical``: statistical metafeatures.
        3. ``info-theory``: information-theoretic type of metafeature.
        4. ``model-based``: metafeatures based on machine learning
           model characteristics.
        5. ``landmarking``: metafeatures representing performance
           metrics from simple machine learning models or machine
           learning models induced with sampled data.

        The value provided by the argument ``wildcard`` can be used to
        select all metafeature groups rapidly.

    features : :obj:`Iterable` of :obj:`str` or :obj:`str`, optional
        A collection or a single metafeature name desired for
        extraction. Keep in mind that the extraction only gathers
        features also in the selected ``groups``. Check this class
        ``feature`` attribute to get a list of available metafeatures
        from selected groups.

        The value provided by the argument ``wildcard`` can be used to
        select all features from all selected groups rapidly.

    summary : :obj:`Iterable` of :obj:`str` or :obj:`str`, optional
        A collection or a single summary function to summarize a group
        of metafeature measures into a fixed-length group of value,
        typically a single value. The values must be one of the
        following:

        1. ``mean``: Average of the values.
        2. ``sd``: Standard deviation of the values.
        3. ``count``: Computes the cardinality of the measure. Suitable
           for variable cardinality.
        4. ``histogram``: Describes the distribution of the measured
           values. Suitable for high cardinality.
        5. ``iq_range``: Computes the interquartile range of the
           measured values.
        6. ``kurtosis``: Describes the shape of the measures values
           distribution.
        7. ``max``: Results in the maximum value of the measure.
        8. ``median``: Results in the central value of the measure.
        9. ``min``: Results in the minimum value of the measure.
        10. ``quantiles``: Results in the minimum, first quartile,
            median, third quartile and maximum of the measured values.
        11. ``range``: Computes the range of the measured values.
        12. ``skewness``: Describes the shape of the measure values
            distribution in terms of symmetry.

        If more than one summary function is selected, then all
        multivalued extracted metafeatures are summarized with each
        summary function.

        The particular value provided by the argument ``wildcard`` can
        be used to select all summary functions rapidly.

    measure_time : :obj:`str`, optional
        Options for measuring the time elapsed during metafeature
        extraction. If this argument value is :obj:`NoneType`, no time
        elapsed is measured. Otherwise, this argument must be a
        :obj:`str` valued as one of the options below:

        1. ``avg``: average time for each metafeature (total time
           divided by the feature cardinality, i.e., number of features
           extracted by a single feature-extraction related method),
           without summarization time.
        2. ``avg_summ``: average time for each metafeature (total time
           of extraction divided by feature cardinality) including
           required time for summarization.
        3. ``total``: total time for each metafeature, without
           summarization time.
        4. ``total_summ``: total time for each metafeature including
           the required time for summarization.

        The ``cardinality`` of the feature is the number of values
        extracted by a single calculation method. For example, ``mean``
        feature has cardinality equal to the number of numeric features
        in the dataset, where ``cor`` (from ``correlation``) has
        cardinality equals to (N - 1)/2, where N is the number of
        numeric features in the dataset. The cardinality is used to
        divide the total execution time of that method if an option
        starting with ``avg`` is selected.

        If a summary method has cardinality higher than one (more than
        one value returned after summarization and, thus, creating more
        than one entry in the result lists) like, for example,
        ``histogram`` summary method, then the corresponding time of
        this summary will be inserted only in the first correspondent
        element of the time list. The remaining entries are all filled
        with 0 value, to keep consistency between the size of all lists
        returned and index correspondence between they.

    wildcard : :obj:`str`, optional
        Value used as ``select all`` for ``groups``, ``features`` and
        ``summary`` arguments.

    score : :obj:`str`, optional
        Score metric used to extract ``landmarking`` metafeatures.

    folds : :obj:`int`, optional
        Number of folds to create a Stratified K-Fold cross validation
        to produce the ``landmarking`` metafeatures.

    sample_size : :obj:`float`, optional
        Sample proportion used to produce the ``landmarking``
        metafeatures. This argument must be in 0.5 and 1.0 (both
        inclusive) interval.

    suppress_warnings : :obj:`bool`, optional
        If True, then ignore all warnings invoked at the instantiation
        time.

    random_state : :obj:`int`, optional
        If given, set the global ``numpy`` pseudo-random number
        generator seed, making the extraction of non-deterministic
        metafeatures reproducible. Must be :obj:`NoneType` or an
        integer.

    Notes
    -----
        .. [1] Rivolli et al. "Towards Reproducible Empirical Research
           in Meta-Learning". URL: https://arxiv.org/abs/1808.10406

    Examples
    --------

    Load a dataset

    >>> from sklearn.datasets import load_iris
    >>> from pymfe.mfe import MFE
    >>> data = load_iris()
    >>> y = data.target
    >>> X = data.data

    Extract all measures

    >>> mfe = MFE()
    >>> mfe.fit(X, y)
    >>> ft = mfe.extract()
    >>> print(ft)

    Extract general, statistical and information-theoretic measures

    >>> mfe = MFE(groups=["general", "statistical", "info-theory"])
    >>> mfe.fit(X, y)
    >>> ft = mfe.extract()
    >>> print(ft)

    Raises
    ------
    ValueError
        If ``random_state``, ``folds`` or ``sample_size`` receives an
        invalid value or type.
    """
    self.groups = _internal.process_generic_set(
        values=groups, group_name="groups")  # type: t.Tuple[str, ...]

    # Some groups implicitly require others (e.g. model-based data);
    # resolve those dependencies before processing features.
    self.groups, self.inserted_group_dep = (
        _internal.solve_group_dependencies(
            groups=self.groups))

    proc_feat = _internal.process_features(
        features=features,
        groups=self.groups,
        suppress_warnings=suppress_warnings,
        wildcard=wildcard,
    )  # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt, t.Tuple[str, ...]]

    self.features, self._metadata_mtd_ft, self.groups = proc_feat
    del proc_feat

    self.summary, self._metadata_mtd_sm = _internal.process_summary(
        summary)  # type: t.Tuple[t.Tuple[str, ...], _TypeSeqExt]

    self.timeopt = _internal.process_generic_option(
        value=measure_time, group_name="timeopt",
        allow_none=True)  # type: t.Optional[str]

    # ``X`` and ``y`` are filled by ``fit``.
    self.X = None  # type: t.Optional[np.ndarray]
    self.y = None  # type: t.Optional[np.ndarray]

    self._custom_args_ft = None  # type: t.Optional[t.Dict[str, t.Any]]
    """User-independent arguments for ft. methods (e.g. ``X`` and ``y``)"""

    self._custom_args_sum = None  # type: t.Optional[t.Dict[str, t.Any]]
    """User-independent arguments for summary functions methods."""

    self._attr_indexes_num = None  # type: t.Optional[t.Tuple[int, ...]]
    """Numeric column indexes from ``X`` (independent attributes)."""

    self._attr_indexes_cat = None  # type: t.Optional[t.Tuple[int, ...]]
    """Categoric column indexes from ``X`` (independent attributes)."""

    self._precomp_args_ft = None  # type: t.Optional[t.Dict[str, t.Any]]
    """Precomputed common feature-extraction method arguments."""

    self._postprocess_args_ft = {}  # type: t.Dict[str, t.Any]
    """User-independent arguments for post-processing methods."""

    if random_state is None or isinstance(random_state, int):
        self.random_state = random_state
        np.random.seed(random_state)

    else:
        raise ValueError(
            'Invalid "random_state" argument ({0}). '
            'Expecting None or an integer.'.format(random_state))

    if isinstance(folds, int):
        self.folds = folds

    else:
        # Bug fix: the message previously interpolated ``random_state``
        # instead of the offending ``folds`` value.
        raise ValueError('Invalid "folds" argument ({0}). '
                         'Expecting an integer.'.format(folds))

    # An integer proportion (e.g. 1) is accepted and coerced to float.
    if isinstance(sample_size, int):
        sample_size = float(sample_size)

    if isinstance(sample_size, float)\
            and 0.5 <= sample_size <= 1.0:
        self.sample_size = sample_size

    else:
        # Bug fix: the message previously interpolated ``random_state``
        # instead of the offending ``sample_size`` value.
        raise ValueError('Invalid "sample_size" argument ({0}). '
                         'Expecting a float [0.5, 1].'
                         .format(sample_size))

    self.score = _internal.check_score(score, self.groups)