class MissingIndicatorImpl:
    """Wrapper around the underlying ``SKLModel`` missing-value indicator.

    Hyperparameters are captured at construction time; the wrapped estimator
    itself is only created when ``fit`` is called.
    """

    def __init__(
        self,
        missing_values="nan",
        features="missing-only",
        sparse="auto",
        error_on_new=True,
    ):
        # Keep the hyperparameters in a dict so they can be forwarded
        # verbatim to the wrapped estimator.
        self._hyperparams = {
            "missing_values": missing_values,
            "features": features,
            "sparse": sparse,
            "error_on_new": error_on_new,
        }

    def fit(self, X, y=None):
        # A fresh wrapped model is constructed on every fit call.
        self._wrapped_model = SKLModel(**self._hyperparams)
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
class MIAImputer(BaseEstimator, TransformerMixin):
    """MIA ("missing in attributes") imputation strategy.

    Duplicates the columns of ``X``, replacing each ``np.nan`` once with
    ``+fill_value`` and once with ``-fill_value``, so downstream (tree-based)
    models can route missing values to either side of a split.

    Parameters
    ----------
    add_indicator : bool, default=False
        If True, append binary missing-value indicator columns to the output.
    fill_value : int or float, default=10**5
        Magnitude of the constant used for the positive/negative imputation.
    """

    def __init__(self, add_indicator=False, fill_value=10**5):
        self.add_indicator = add_indicator
        self.fill_value = fill_value
        # BUG FIX: the original accepted ``fill_value`` but ignored it and
        # hard-coded 10**5 in both imputers.  Default behavior is unchanged.
        self.simple_imputer_max = SimpleImputer(strategy='constant',
                                                fill_value=fill_value)
        self.simple_imputer_min = SimpleImputer(strategy='constant',
                                                fill_value=-fill_value)

    def fit(self, X, y=None):
        """Fit both constant imputers (and the indicator when enabled)."""
        self.simple_imputer_max.fit(X, y)
        self.simple_imputer_min.fit(X, y)
        if self.add_indicator:
            self.indicator_ = MissingIndicator(missing_values=np.nan,
                                               error_on_new=False)
            self.indicator_.fit(X)
        return self

    def transform(self, X):
        """Return ``[X filled with +fill_value | X filled with -fill_value]``,
        followed by indicator columns when ``add_indicator`` is True."""
        if self.add_indicator:
            X_trans_indicator = self.indicator_.transform(X)
        X_max = self.simple_imputer_max.transform(X)
        X_min = self.simple_imputer_min.transform(X)
        X = np.hstack((X_max, X_min))
        if self.add_indicator:
            X = np.hstack((X, X_trans_indicator))
        return X
def test_missing_indicator_float_inputs_isnan_false_tvm(self):
    """MissingIndicator with missing_values=0 against the TVM backend."""
    data = np.array([[1, 2], [0, 3], [7, 6]], dtype=np.float32)
    for features in ("all", "missing-only"):
        model = MissingIndicator(features=features, missing_values=0)
        model.fit(data)
        self._test_sklearn_missing_indic(model, data, "tvm")
def test_missing_indicator_float_inputs(self):
    """Default (NaN) MissingIndicator against torch and torch.jit backends."""
    data = np.array([[1, 2], [np.nan, 3], [7, 6]], dtype=np.float32)
    for features in ("all", "missing-only"):
        model = MissingIndicator(features=features)
        model.fit(data)
        for backend in ("torch", "torch.jit"):
            self._test_sklearn_missing_indic(model, data, backend)
def indicate_missing(train_df, test_df):
    """Add a binary ``is_missing_<feature>`` column for every feature listed
    in the module-level ``cont_missing_features`` / ``cat_missing_features``.

    Each indicator is fitted on the union of train and test rows so both
    frames are masked consistently.  Mutates and returns both frames.
    """
    # PERF FIX: the original recomputed pd.concat inside the loop for every
    # feature; the concatenation is loop-invariant, so hoist it.
    combined = pd.concat([train_df, test_df])
    for missing_feature in cont_missing_features + cat_missing_features:
        imp = MissingIndicator(missing_values=np.nan)
        imp.fit(combined[[missing_feature]])
        col = "is_missing_" + missing_feature
        train_df[col] = imp.transform(train_df[[missing_feature]])
        test_df[col] = imp.transform(test_df[[missing_feature]])
    return train_df, test_df
class _MissingIndicatorImpl:
    """Thin wrapper delegating fit/transform to an ``Op`` estimator built
    from the given hyperparameters."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        # Forward y only when it was actually provided.
        fit_args = [X] if y is None else [X, y]
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
class MissingIndicatorComponent(AutoSklearnPreprocessingAlgorithm):
    """Auto-sklearn preprocessing component wrapping sklearn's
    ``MissingIndicator``."""

    def __init__(self, missing_values=np.nan, features: str = "missing-only",
                 random_state=None):
        super().__init__()
        self.features = features
        self.missing_values = missing_values
        self.random_state = random_state

    def fit(self, X, Y=None):
        # Deferred import: sklearn is only required once fitting happens.
        from sklearn.impute import MissingIndicator
        self.preprocessor = MissingIndicator(
            missing_values=self.missing_values, features=self.features)
        self.preprocessor.fit(X, Y)
        return self

    def transform(self, X):
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        # Keys and values below are part of the auto-sklearn component
        # contract and are kept verbatim.
        return {
            'shortname': 'MissingIndicator',
            'name': 'Missing Indicator',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            'is_deterministic': True,
            'input': (DENSE, UNSIGNED_DATA),
            'output': (INPUT, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        cs.add_hyperparameters([
            CategoricalHyperparameter("features", ["missing-only", "all"],
                                      default_value="missing-only")
        ])
        return cs
def test_missing_indicator():
    """A converted MissingIndicator must match the sklearn original on iris,
    for NaN and for concrete sentinel missing values."""
    X, y = load_iris(return_X_y=True)
    for missing_values in [np.nan, X[0][0], X[-1][1]]:
        # Reload so NaN injection from a previous iteration does not leak.
        X, y = load_iris(return_X_y=True)
        if np.isnan(missing_values):
            idx = np.random.choice(X.size, 20, replace=False)
            X.ravel()[idx] = np.nan
        X_as_list = X.tolist()
        for features in ("missing-only", "all"):
            imp = MissingIndicator(
                features=features,
                missing_values=missing_values,
                error_on_new=False,
            )
            imp.fit(X)
            converted = convert_estimator(imp)
            expected = imp.transform(X)
            actual = converted.transform(X_as_list)
            assert np.allclose(expected.shape, shape(actual))
            assert np.allclose(expected, actual)
class MissingIndicatorImpl():
    """Wrapper that stores hyperparameters and defers construction of the
    underlying ``SKLModel`` until ``fit``."""

    def __init__(self, missing_values='nan', features='missing-only',
                 sparse='auto', error_on_new=True):
        self._hyperparams = dict(
            missing_values=missing_values,
            features=features,
            sparse=sparse,
            error_on_new=error_on_new,
        )

    def fit(self, X, y=None):
        # Build the wrapped model anew for every fit.
        self._wrapped_model = SKLModel(**self._hyperparams)
        args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*args)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
    """Invalid parameter combinations must raise ValueError matching msg_err."""
    indicator = MissingIndicator(missing_values=-1)
    indicator.set_params(**params)
    with pytest.raises(ValueError, match=msg_err):
        fitted = indicator.fit(X_fit)
        fitted.transform(X_trans)
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
    """Fitting/transforming with the given bad params raises ValueError."""
    indicator = MissingIndicator(missing_values=-1)
    indicator.set_params(**params)
    with pytest.raises(ValueError, match=msg_err):
        indicator.fit(X_fit).transform(X_trans)
# Build indicator columns for the missing entries of X.
# FIX: use np.nan — np.NaN was removed in NumPy 2.0 and the rest of this
# file consistently uses np.nan.
indicator = MissingIndicator(missing_values=np.nan)
indicator = indicator.fit_transform(X)
# print(indicator)
indicator = pd.DataFrame(
    indicator,
    columns=['m1', 'm3'])  # The only two columns in which missing values are
print(indicator)

# MissingIndicator - more in depth
import numpy as np
from sklearn.impute import MissingIndicator

X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
indicator = MissingIndicator()
indicator.fit(X1)  # Creates the possible indicator columns (i.e., not all)
X2_tr = indicator.transform(X2)
X1_tr = indicator.transform(X1)
print('X2_tr')
print(X2_tr)
print('X1_tr')
print(X1_tr)

#####
# Imputation
#####
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
def indicator(self):
    """Return a MissingIndicator fitted on the undefined-feature columns of
    the training data."""
    # Local renamed so it no longer shadows the method name.
    fitted = MissingIndicator(features="all")
    fitted.fit(self.train_data[self.undefined_features])
    return fitted
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation: learn the per-column means at fit time, then use them
# to fill np.nan entries at transform time.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_data = [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
imp_mean.fit(imp_data)
imp_mean.statistics_  # per-column means learned from imp_data: [7., 3.5, 6.]
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
imp_mean.get_params()
imp_mean.transform(X)  # NaNs in X replaced by the fitted column means

# MissingIndicator: boolean mask of where values were missing.
from sklearn.impute import MissingIndicator
X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
indicator = MissingIndicator()
indicator.fit(X1)
indicator.features_  # indices of the features selected during fit on X1
X1
indicator.transform(X1)
X2
indicator.transform(X2)  # mask for X2 over the features chosen from X1
indicator_all = MissingIndicator(features='all')  # keep a column per feature
indicator_all.fit_transform(X1)
indicator_all.fit_transform(X2)
indicator_all.features_

# Binarizer example (unrelated to imputation).
from sklearn.preprocessing import Binarizer
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
transformer = Binarizer()
type(transformer)
class SimulationDataScaler(object):
    """Winsorize, standard-scale and constant-impute simulation data.

    ``transform`` returns both the processed matrix and a missing-value
    indicator matrix (as DataFrames when ``return_df`` is True).

    Parameters
    ----------
    l, u : float
        Lower/upper winsorization bounds passed to ``Winsorizer``.
    fill_value : float, default=0.0
        Constant used to impute missing values after scaling.
    winsorize : bool, default=False
        When False, ``self.l``/``self.u`` are reset to the full 0..1 range.
    return_df : bool, default=True
        Return pandas DataFrames instead of raw arrays.
    """

    def __init__(self, l=0.0, u=1.0, fill_value=0.0, winsorize=False,
                 return_df=True):
        self.l = l
        self.u = u
        self.columns = []
        self.is_fit = False
        self.return_df = return_df
        if not winsorize:
            self.l = 0.0
            self.u = 1.0
        self.winsorize = winsorize
        # NOTE(review): the Winsorizer is built from the *original* l/u even
        # when winsorize=False resets self.l/self.u — confirm this is intended.
        self.winsorizor = Winsorizer(l=l, u=u)
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='constant',
                                     fill_value=fill_value,
                                     add_indicator=False)
        self.indicator_imputer = MissingIndicator(features="all")

    def fit(self, X):
        """Fit the winsorizer, scaler, imputer and indicator on X.

        Note: mutates ``X`` in place, turning +/-inf into NaN.
        """
        X[~np.isfinite(X)] = np.nan
        self.is_fit = True
        if type(X) is pd.DataFrame:
            self.columns = list(X.columns)
        X_w = self.winsorizor.fit_transform(X)
        self.indicator_imputer.fit(X)
        self.scaler.fit(X_w)
        self.imputer.fit(X_w)
        return self

    def transform(self, X):
        """Winsorize, scale and impute X; also return the missing mask."""
        if not self.is_fit:
            # BUG FIX: the original executed ``raise "..."``, which is a
            # TypeError in Python 3 (string exceptions are illegal).
            raise RuntimeError("Please fit before running transform")
        X[~np.isfinite(X)] = np.nan
        X_w = self.winsorizor.transform(X)
        X_imp_ind = self.indicator_imputer.transform(X)
        X_w_s = self.scaler.transform(X_w)
        X_w_s_i = self.imputer.transform(X_w_s)
        if self.return_df:
            return (pd.DataFrame(X_w_s_i, columns=self.columns),
                    pd.DataFrame(X_imp_ind, columns=self.columns))
        return X_w_s_i, X_imp_ind

    def fit_transform(self, X):
        """Convenience: fit on X, then transform X."""
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X_w_s_i):
        """Undo the standard scaling (winsorization/imputation not undone)."""
        if not self.is_fit:
            # BUG FIX: same string-raise defect as transform.
            raise RuntimeError("Please fit before running inverse_transform")
        X_w_i = self.scaler.inverse_transform(X_w_s_i)
        if self.return_df:
            return pd.DataFrame(X_w_i, columns=self.columns)
        return X_w_i
class RobustMissingIndicator(BaseEstimator, TransformerMixin):
    """Binary indicators for missing values.

    Similar to ``sklearn.impute.MissingIndicator``, with one addition: a
    customizable ``mask_function`` determines the boolean missingness mask.
    The default is ``sagemaker_sklearn_extension.impute.is_finite_numeric``,
    which checks whether or not a value can be converted into a float.

    Note that this component typically should not be used in a vanilla
    :class:`sklearn.pipeline.Pipeline` consisting of transformers and a
    classifier, but rather could be added using a
    :class:`sklearn.pipeline.FeatureUnion` or
    :class:`sklearn.compose.ColumnTransformer`.

    Parameters
    ----------
    features : str, optional (default="all")
        Whether the imputer mask should represent all or a subset of features.

        - "missing-only": only features containing missing values at fit time.
        - "all" (default): every feature.

    error_on_new : boolean, optional (default=True)
        If True (default) and ``features="missing-only"``, transform raises
        when features that had no missing values during fit contain missing
        values.

    mask_function : callable -> np.array, dtype('bool') (default=None)
        Vectorized predicate; values where it returns False are treated as
        missing.  Use ``np.vectorize`` to vectorize a scalar function.

    Notes
    -----
    Only accepts 2D, non-sparse inputs.
    """

    def __init__(self, features="all", error_on_new=True, mask_function=None):
        self.features = features
        self.error_on_new = error_on_new
        self.mask_function = mask_function

    def _validate_input(self, X):
        # Reject complex data up front; everything else is coerced to an
        # object-dtype 2D array with non-finite entries allowed through.
        dtype_kind = getattr(getattr(X, "dtype", None), "kind", None)
        if dtype_kind == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))
        return check_array(X, dtype=np.dtype("O"), copy=True,
                           force_all_finite=False, ensure_2d=True)

    def fit(self, X, y=None):
        """Fit the transformer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : RobustMissingIndicator
        """
        validated = self._validate_input(X)
        self.vectorized_mask_function_ = self.mask_function or is_finite_numeric
        masked = _apply_mask(
            validated, _get_mask(validated, self.vectorized_mask_function_))
        self.missing_indicator_ = MissingIndicator(
            features=self.features, error_on_new=self.error_on_new)
        self.missing_indicator_.fit(masked)
        return self

    def transform(self, X):
        """Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray}, shape (n_samples, n_features)
            The missing indicator for input data; boolean dtype.
        """
        check_is_fitted(
            self, ["missing_indicator_", "vectorized_mask_function_"])
        validated = self._validate_input(X)
        masked = _apply_mask(
            validated, _get_mask(validated, self.vectorized_mask_function_))
        return self.missing_indicator_.transform(masked)

    def _more_tags(self):
        return {"allow_nan": True}