def test_preprocessing_invalid(self):
    """Check that a failing precomputation is reported as a warning.

    ``process_precomp_groups`` is invoked with empty group selections,
    ``MFETestClass`` as the custom class and ``raise_exception=True``;
    the test passes only if a ``UserWarning`` is emitted during the call.
    """
    # NOTE(review): ``raise_exception`` is presumably consumed by the
    # precomputation methods of ``MFETestClass`` - confirm in the fixture.
    call_kwargs = {
        "precomp_groups": tuple(),
        "groups": tuple(),
        "custom_class_": MFETestClass,
        "raise_exception": True,
    }

    with pytest.warns(UserWarning):
        _internal.process_precomp_groups(**call_kwargs)
def test_mem_err_precompute(self):
    """Check that the ``raise_mem_err`` precomputation path warns.

    ``process_precomp_groups`` is invoked with empty group selections,
    ``MFETestClass`` as the custom class and ``raise_mem_err=True``;
    the test passes only if a ``UserWarning`` is emitted during the call.
    """
    # NOTE(review): ``raise_mem_err`` presumably makes a precomputation
    # method of ``MFETestClass`` raise ``MemoryError`` - confirm in the
    # fixture.
    call_kwargs = {
        "precomp_groups": tuple(),
        "groups": tuple(),
        "custom_class_": MFETestClass,
        "raise_mem_err": True,
    }

    with pytest.warns(UserWarning):
        _internal.process_precomp_groups(**call_kwargs)
def test_preprocessing_valid(self):
    """Check that a valid precomputation yields a non-empty result.

    ``process_precomp_groups`` is invoked with empty group selections
    and ``MFETestClass`` as the custom class; the returned collection
    of precomputed arguments must contain at least one item.
    """
    cached_values = _internal.process_precomp_groups(
        precomp_groups=tuple(),
        groups=tuple(),
        custom_class_=MFETestClass,
    )

    assert len(cached_values) > 0
def fit(self,
        X: t.Sequence,
        y: t.Sequence,
        transform_num: bool = True,
        transform_cat: bool = True,
        rescale: t.Optional[str] = None,
        rescale_args: t.Optional[t.Dict[str, t.Any]] = None,
        cat_cols: t.Optional[t.Union[str, t.Iterable[int]]] = "auto",
        check_bool: bool = False,
        precomp_groups: t.Optional[str] = "all",
        wildcard: str = "all",
        suppress_warnings: bool = False,
        ) -> "MFE":
    """Fit a dataset into this MFE model.

    Parameters
    ----------
    X : :obj:`Sequence`
        Predictive attributes of the dataset.

    y : :obj:`Sequence`
        Target attributes of the dataset, assuming a supervised task.

    transform_num : :obj:`bool`, optional
        If True, numeric attributes are discretized with an
        equal-frequency histogram so they can be used alongside
        categorical data by categoric-only metafeatures. Numeric-only
        metafeatures keep using the original (non-discretized) values.
        If False, numeric attributes are ignored by categoric-only
        metafeatures.

    transform_cat : :obj:`bool`, optional
        If True, categorical attributes are binarized through a model
        matrix so they can be used alongside numerical data by
        numeric-only metafeatures. Categoric-only metafeatures keep
        using the original (non-binarized) values. If False,
        categorical attributes are ignored by numeric-only
        metafeatures.

        The transformation formula is the union (+) of all categoric
        attributes, written in the formula language of the ``patsy``
        package with the intercept term removed:
        ``~ 0 + A_1 + ... + A_n``, where ``n`` is the number of
        attributes and ``A_i`` is the i-th categoric attribute,
        1 <= i <= n.

    rescale : :obj:`str`, optional
        If :obj:`NoneType`, numeric data keeps its original values.
        Otherwise, one of the following options selects how all
        numeric values are rescaled:

            1. ``standard``: zero mean and unit variance (``z-score``
               normalization). See the documentation of
               ``sklearn.preprocessing.StandardScaler``.

            2. ``min-max``: map numeric data to the interval [a, b],
               a < b. ``a`` and ``b`` can be given through
               ``rescale_args``; the defaults are a = 0.0 and b = 1.0.
               See the documentation of
               ``sklearn.preprocessing.MinMaxScaler``.

            3. ``robust``: rescale using statistics that are robust to
               outliers. See the documentation of
               ``sklearn.preprocessing.RobustScaler``.

    rescale_args : :obj:`dict`, optional
        Parameters for the rescaling step, used only when ``rescale``
        is not :obj:`NoneType`. Keys are parameter names (strings) and
        values are the corresponding parameter values.

    cat_cols : :obj:`Sequence` of :obj:`int` or :obj:`str`, optional
        Categorical columns of the dataset. If :obj:`NoneType` or an
        empty sequence, all columns are assumed to be numeric. If
        ``auto``, automatic detection is attempted while fitting.

    check_bool : :obj:`bool`, optional
        If ``cat_cols`` is ``auto`` and this flag is True, every column
        with exactly two distinct values is also treated as a
        categorical (boolean) column, regardless of its data type.
        Otherwise such columns may be considered numeric depending on
        their data type.

    precomp_groups : :obj:`str`, optional
        Metafeature groups whose common values should be cached and
        shared among metafeature extraction methods (e.g. ``classes``
        or ``covariance``). This may speed up the extraction at the
        cost of extra memory, so it may not suit huge datasets.

    wildcard : :obj:`str`, optional
        Value meaning ``select all`` for ``precomp_groups``.

    suppress_warnings : :obj:`bool`, optional
        If True, ignore all warnings raised while fitting the dataset.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If the number of rows of X and the length of y do not match.
    TypeError
        If X or y (or both) is neither a :obj:`list` nor a
        :obj:`np.ndarray` object.

    Notes
    -----
    A strategy for handling missing values in the data is not
    implemented; this method takes no ``missing_data`` argument.
    """
    # Statement order matters below: the helper methods are called
    # without the data, so they presumably read the instance state
    # set by the preceding statements.
    self.X, self.y = _internal.check_data(X, y)

    rescale_option = _internal.process_generic_option(
        value=rescale,
        group_name="rescale",
        allow_none=True,
    )

    self._fill_col_ind_by_type(cat_cols=cat_cols, check_bool=check_bool)

    categoric_view = self._set_data_categoric(transform_num=transform_num)
    numeric_view = self._set_data_numeric(
        transform_cat=transform_cat,
        rescale=rescale_option,
        rescale_args=rescale_args,
    )

    # Custom arguments for metafeature extraction methods
    extraction_args = {
        "X": self.X,
        "N": numeric_view,
        "C": categoric_view,
        "y": self.y,
        "folds": self.folds,
        "sample_size": self.sample_size,
        "score": self.score,
        "random_state": self.random_state,
        "cat_cols": self._attr_indexes_cat,
    }
    self._custom_args_ft = extraction_args

    # Custom arguments from preprocessing methods; the extraction
    # arguments are forwarded as extra keyword arguments.
    self._precomp_args_ft = _internal.process_precomp_groups(
        precomp_groups=precomp_groups,
        groups=self.groups,
        wildcard=wildcard,
        suppress_warnings=suppress_warnings,
        **extraction_args,
    )

    # Custom arguments for postprocessing methods
    postprocess_args = {"inserted_group_dep": self.inserted_group_dep}
    self._postprocess_args_ft = postprocess_args

    # Custom arguments for summarization methods
    summary_args = {"ddof": 1}
    self._custom_args_sum = summary_args

    return self