def extract(self,
            remove_nan: bool = True,
            verbose: bool = False,
            enable_parallel: bool = False,
            suppress_warnings: bool = False,
            **kwargs) -> t.Tuple[t.Sequence, ...]:
    """Extract and summarize metafeatures from the fitted dataset.

    Parameters
    ----------
    remove_nan : :obj:`bool`, optional
        If True, non-numeric values are removed from the extracted
        features before they are summarized. Summary methods may still
        drop non-numeric values on their own; when that matters, tune
        them through their built-in arguments passed via ``kwargs``.

    verbose : :obj:`bool`, optional
        If True, print progress messages about the extraction process.
        Warning messages are not controlled by this flag (see
        ``suppress_warnings``).

    enable_parallel : :obj:`bool`, optional
        Meant to enable multi-process extraction. The value is forwarded
        to the extraction step, but it is documented as having no effect
        for now (to be implemented).

    suppress_warnings : :obj:`bool`, optional
        If True, do not show warnings about unknown user custom
        arguments given via ``kwargs``. Feature extraction and summary
        methods may still emit their own warnings; silence those with
        the built-in arguments of these methods, if available.

    kwargs:
        Custom arguments for both feature extraction and summary
        methods, in the format

            {``mtd_name``: {``arg_name``: arg_value, ...}, ...}

        i.e., each key is the name of the method that receives the
        custom arguments and each value is a dictionary mapping the
        argument names of that method to their values. See the
        ``Examples`` section.

    Returns
    -------
    :obj:`tuple`(:obj:`list`, :obj:`list`)
        A tuple of index-aligned lists, sorted by identifier. The first
        list holds the identifier of each summarized value in the form
        ``feature_name.summary_mtd_name``, and the second list holds the
        summarized values (the value at index ``i`` is described by the
        identifier at index ``i``). If the ``timeopt`` attribute of the
        model is set, a third list with the time measured for each
        value is appended to the tuple.

        Example:
            ([``attr_ent.mean``, ``attr_ent.sd``], [``0.983``, ``0.344``])

        is the return value for the feature ``attr_ent`` summarized by
        both ``mean`` and ``sd`` (standard deviation), giving the values
        ``0.983`` and ``0.344``, respectively.

    Raises
    ------
    TypeError
        If ``extract`` is called before ``fit``.

    Examples
    --------
    Using kwargs. Option 1 to pass feature extraction custom arguments:

    >>> args = {
    ...     'sd': {'ddof': 2},
    ...     '1NN': {'metric': 'minkowski', 'p': 2},
    ...     'leaves': {'max_depth': 4},
    ... }

    >>> model = MFE().fit(X=data, y=labels)
    >>> result = model.extract(**args)

    Option 2 (note: metafeature names starting with a digit can not be
    given this way):

    >>> model = MFE().fit(X=data, y=labels)
    >>> res = model.extract(sd={'ddof': 2}, leaves={'max_depth': 4})
    """
    # Guard: nothing to extract from until "fit" has stored the data.
    if self.X is None or self.y is None:
        raise TypeError("Fitted data not found. Call "
                        '"fit" method before "extract".')

    # Re-validate the stored data whenever either part is not an ndarray.
    data_is_ndarray = (isinstance(self.X, np.ndarray)
                       and isinstance(self.y, np.ndarray))
    if not data_is_ndarray:
        self.X, self.y = _internal.check_data(self.X, self.y)

    if verbose:
        print("Started the metafeature extraction process.")

    extracted = self._call_feature_methods(
        remove_nan=remove_nan,
        verbose=verbose,
        enable_parallel=enable_parallel,
        suppress_warnings=suppress_warnings,
        **kwargs)

    # The return value of the post-processing step is not used here; the
    # same ``extracted`` object is consumed afterwards.
    _internal.post_processing(
        results=extracted,
        groups=self.groups,
        suppress_warnings=suppress_warnings,
        **self._postprocess_args_ft,
        **kwargs)

    if extracted and extracted[0]:
        # Sort by metafeature name: transpose the parallel sequences into
        # per-value records, order them, then transpose back into lists.
        ordered_rows = sorted(zip(*extracted), key=lambda row: row[0])
        extracted = tuple(list(column) for column in zip(*ordered_rows))

    mtf_names, mtf_vals, mtf_times = extracted

    if verbose:
        time_type = "average" if self._timeopt_type_is_avg() else "total"
        summary_msg = (
            "Total of {0} values obtained. Time elapsed "
            "({1}) = {2:.8f} seconds.".format(
                len(mtf_vals), time_type, sum(mtf_times)))
        print("Metafeature extraction process done.", summary_msg, sep="\n")

    # Time measurements are part of the output only when requested.
    if not self.timeopt:
        return mtf_names, mtf_vals

    return mtf_names, mtf_vals, mtf_times
def test_error_check_data(self):
    """``check_data`` must raise ``TypeError`` for an empty-string ``y``."""
    # Only the predictive attributes of the loaded dataset are needed; the
    # real target is replaced by an invalid value on purpose.
    features, _ = load_xy(0)

    with pytest.raises(TypeError):
        _internal.check_data(features, y="")
def fit(self,
        X: t.Sequence,
        y: t.Sequence,
        transform_num: bool = True,
        transform_cat: bool = True,
        rescale: t.Optional[str] = None,
        rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
        cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
        check_bool: bool = False,
        precomp_groups: t.Optional[str] = "all",
        wildcard: str = "all",
        suppress_warnings: bool = False,
        ) -> "MFE":
    """Fit a dataset into the MFE model.

    Parameters
    ----------
    X : :obj:`Sequence`
        Predictive attributes of the dataset.

    y : :obj:`Sequence`
        Target attributes of the dataset, assuming a supervised task.

    transform_num : :obj:`bool`, optional
        If True, numeric attributes are discretized with an
        equal-frequency histogram technique so they can be used
        alongside categorical data by categoric-only metafeatures.
        Numeric-only metafeatures still use the original numeric
        values. If False, numeric attributes are ignored by
        categoric-only metafeatures.

    transform_cat : :obj:`bool`, optional
        If True, categorical attributes are binarized with a model
        matrix so they can be used alongside numeric data by
        numeric-only metafeatures. Categoric-only metafeatures still
        use the original categoric values. If False, categorical
        attributes are ignored by numeric-only metafeatures.

        The transformation formula is the union (+) of all categoric
        attributes in the formula language of the ``patsy`` package,
        without the intercept term: ``~ 0 + A_1 + ... + A_n``, where
        ``n`` is the number of attributes and ``A_i`` is the i-th
        categoric attribute, 1 <= i <= n.

    rescale : :obj:`str`, optional
        If :obj:`NoneType`, numeric data keeps its original values.
        Otherwise, one of the following options used to rescale all
        numeric values:

            1. ``standard``: zero mean, unit variance (``z-score``
            normalization). See the documentation of
            ``sklearn.preprocessing.StandardScaler``.

            2. ``min-max``: rescale to the interval [a, b], a < b. The
            values of ``a`` and ``b`` may be given via ``rescale_args``
            (defaults are a = 0.0 and b = 1.0). See the documentation
            of ``sklearn.preprocessing.MinMaxScaler``.

            3. ``robust``: rescale using statistics robust to the
            presence of outliers. See the documentation of
            ``sklearn.preprocessing.RobustScaler``.

    rescale_args : :obj:`dict`, optional
        Parameters for the rescaling step, as a dictionary mapping
        parameter names (strings) to their values. Used only if
        ``rescale`` is not :obj:`NoneType`.

    cat_cols : :obj:`Sequence` of :obj:`int` or :obj:`str`, optional
        Categorical columns of the dataset. If :obj:`NoneType` or an
        empty sequence, all columns are assumed to be numeric. If
        ``auto``, an automatic detection is attempted while fitting.

    check_bool : :obj:`bool`, optional
        If ``cat_cols`` is ``auto`` and this flag is True, every column
        with precisely two distinct values is also considered a
        categorical (boolean) column, regardless of its data type.
        Otherwise, such columns may be considered numeric depending on
        their data type.

    precomp_groups : :obj:`str`, optional
        Metafeature groups whose common values should be cached and
        shared among metafeature extraction related methods (e.g.
        ``classes`` or ``covariance``). This may speed up the
        extraction at the cost of extra memory, so it may not be
        suitable for huge datasets.

    wildcard : :obj:`str`, optional
        Value used as ``select all`` for ``precomp_groups``.

    suppress_warnings : :obj:`bool`, optional
        If True, ignore all warnings invoked while fitting the dataset.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the number of rows of X and the length of y do not match.
    TypeError
        If X or y (or both) is neither a :obj:`list` nor a
        :obj:`np.ndarray` object.
    """
    # Validate the given data before storing it in the model.
    self.X, self.y = _internal.check_data(X, y)

    rescale_option = _internal.process_generic_option(
        value=rescale,
        group_name="rescale",
        allow_none=True)

    # Must run before the categoric/numeric views below are built.
    self._fill_col_ind_by_type(cat_cols=cat_cols, check_bool=check_bool)

    categoric_data = self._set_data_categoric(transform_num=transform_num)

    numeric_data = self._set_data_numeric(
        transform_cat=transform_cat,
        rescale=rescale_option,
        rescale_args=rescale_args)

    # Custom arguments for metafeature extraction methods
    self._custom_args_ft = {
        "X": self.X,
        "N": numeric_data,
        "C": categoric_data,
        "y": self.y,
        "folds": self.folds,
        "sample_size": self.sample_size,
        "score": self.score,
        "random_state": self.random_state,
        "cat_cols": self._attr_indexes_cat,
    }

    # Custom arguments from preprocessing methods
    self._precomp_args_ft = _internal.process_precomp_groups(
        precomp_groups=precomp_groups,
        groups=self.groups,
        wildcard=wildcard,
        suppress_warnings=suppress_warnings,
        **self._custom_args_ft)

    # Custom arguments for postprocessing methods
    self._postprocess_args_ft = {
        "inserted_group_dep": self.inserted_group_dep,
    }

    # Custom arguments for summarization methods
    self._custom_args_sum = {
        "ddof": 1,
    }

    return self