# --- Example 1 ---
class MissingIndicatorImpl:
    """Thin wrapper that defers building the underlying scikit-learn
    MissingIndicator (``SKLModel``) until :meth:`fit` is called.
    """

    def __init__(
        self,
        missing_values="nan",
        features="missing-only",
        sparse="auto",
        error_on_new=True,
    ):
        # Only record the constructor arguments; the wrapped estimator
        # is created lazily in fit().
        self._hyperparams = dict(
            missing_values=missing_values,
            features=features,
            sparse=sparse,
            error_on_new=error_on_new,
        )

    def fit(self, X, y=None):
        """Instantiate the wrapped model, fit it, and return ``self``."""
        self._wrapped_model = SKLModel(**self._hyperparams)
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate transformation to the fitted wrapped model."""
        return self._wrapped_model.transform(X)
# --- Example 2 ---
class MIAImputer(BaseEstimator, TransformerMixin):
    """MIA imputation strategy.

    Duplicates every column, replacing each np.nan once with
    ``+fill_value`` and once with ``-fill_value``.

    Parameters
    ----------
    add_indicator : bool, default=False
        If True, also append binary missing-value indicator columns.
    fill_value : int or float, default=10**5
        Magnitude of the constant substituted for missing values.
    """
    def __init__(self, add_indicator=False, fill_value=10**5):
        self.add_indicator = add_indicator
        # Bug fix: the original ignored ``fill_value`` and hard-coded
        # 10**5 / -10**5 in both imputers; honor the argument instead
        # (default unchanged, so existing callers behave identically).
        self.fill_value = fill_value
        self.simple_imputer_max = SimpleImputer(strategy='constant',
                                                fill_value=fill_value)
        self.simple_imputer_min = SimpleImputer(strategy='constant',
                                                fill_value=-fill_value)

    def fit(self, X, y=None):
        """Fit both constant imputers (and the indicator if requested)."""
        self.simple_imputer_max.fit(X, y)
        self.simple_imputer_min.fit(X, y)
        if self.add_indicator:
            # error_on_new=False: features that only become missing at
            # transform time must not raise.
            self.indicator_ = MissingIndicator(missing_values=np.nan,
                                               error_on_new=False)
            self.indicator_.fit(X)
        return self

    def transform(self, X):
        """Return ``[X filled with +fill_value | X filled with -fill_value]``,
        with optional indicator columns appended on the right."""
        if self.add_indicator:
            X_trans_indicator = self.indicator_.transform(X)

        X_max = self.simple_imputer_max.transform(X)
        X_min = self.simple_imputer_min.transform(X)
        X = np.hstack((X_max, X_min))

        if self.add_indicator:
            X = np.hstack((X, X_trans_indicator))

        return X
    def test_missing_indicator_float_inputs_isnan_false_tvm(self):
        """Check the TVM backend when 0 (not NaN) marks missingness."""
        for feature_mode in ("all", "missing-only"):
            model = MissingIndicator(features=feature_mode, missing_values=0)
            data = np.array([[1, 2], [0, 3], [7, 6]], dtype=np.float32)
            model.fit(data)

            self._test_sklearn_missing_indic(model, data, "tvm")
    def test_missing_indicator_float_inputs(self):
        """Check NaN-based indicators on the torch backends."""
        for feature_mode in ("all", "missing-only"):
            model = MissingIndicator(features=feature_mode)
            data = np.array([[1, 2], [np.nan, 3], [7, 6]], dtype=np.float32)
            model.fit(data)

            # Exercise both the eager and the scripted torch backends.
            for backend in ("torch", "torch.jit"):
                self._test_sklearn_missing_indic(model, data, backend)
# --- Example 5 ---
def indicate_missing(train_df, test_df):
    """Append a boolean ``is_missing_<col>`` column for every feature
    listed in the module-level missing-feature lists.

    The indicator is fit on train and test combined so both frames get
    the same columns.
    """
    for column in cont_missing_features + cat_missing_features:
        detector = MissingIndicator(missing_values=np.nan)
        detector.fit(pd.concat([train_df, test_df])[[column]])
        flag_name = "is_missing_" + column
        train_df[flag_name] = detector.transform(train_df[[column]])
        test_df[flag_name] = detector.transform(test_df[[column]])
    return train_df, test_df
# --- Example 6 ---
class _MissingIndicatorImpl:
    """Eager wrapper around ``Op``: the wrapped estimator is built at
    construction time and re-fit on every :meth:`fit` call.
    """

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (forwarding ``y`` only when given)."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate to the wrapped model's transform."""
        return self._wrapped_model.transform(X)
# --- Example 7 ---
class MissingIndicatorComponent(AutoSklearnPreprocessingAlgorithm):
    """Auto-sklearn preprocessing component wrapping
    :class:`sklearn.impute.MissingIndicator`.
    """

    def __init__(self,
                 missing_values=np.nan,
                 features: str = "missing-only",
                 random_state=None):
        super().__init__()
        self.missing_values = missing_values
        self.features = features
        self.random_state = random_state

    def fit(self, X, Y=None):
        """Build and fit the underlying MissingIndicator; returns self."""
        # Imported locally so sklearn is only needed at fit time.
        from sklearn.impute import MissingIndicator
        self.preprocessor = MissingIndicator(
            missing_values=self.missing_values,
            features=self.features,
        )
        self.preprocessor.fit(X, Y)
        return self

    def transform(self, X):
        """Apply the fitted indicator; fit() must have been called."""
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        # Static component metadata consumed by auto-sklearn.
        return {
            'shortname': 'MissingIndicator',
            'name': 'Missing Indicator',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            'is_deterministic': True,
            'input': (DENSE, UNSIGNED_DATA),
            'output': (INPUT, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        # Only the ``features`` mode is tunable.
        cs = ConfigurationSpace()
        cs.add_hyperparameters([
            CategoricalHyperparameter("features",
                                      ["missing-only", "all"],
                                      default_value="missing-only"),
        ])
        return cs
# --- Example 8 ---
def test_missing_indicator():
    """The converted estimator must match sklearn's MissingIndicator on
    iris data for NaN and value-based missingness markers."""
    X, y = load_iris(return_X_y=True)
    for missing_values in [np.nan, X[0][0], X[-1][1]]:
        X, y = load_iris(return_X_y=True)
        if np.isnan(missing_values):
            # Sprinkle 20 NaNs at random positions for the NaN case.
            X.ravel()[np.random.choice(X.size, 20, replace=False)] = np.nan
        X_ = X.tolist()
        for features in ("missing-only", "all"):
            imp = MissingIndicator(features=features,
                                   missing_values=missing_values,
                                   error_on_new=False)
            imp.fit(X)
            imp_ = convert_estimator(imp)

            X_t = imp.transform(X)
            X_t_ = imp_.transform(X_)
            # Same shape, same boolean content.
            assert np.allclose(X_t.shape, shape(X_t_))
            assert np.allclose(X_t, X_t_)
# --- Example 9 ---
class MissingIndicatorImpl():
    """Lazy wrapper: stores MissingIndicator hyper-parameters and builds
    the ``SKLModel`` estimator when fit() runs."""

    def __init__(self, missing_values='nan', features='missing-only', sparse='auto', error_on_new=True):
        # Keyword arguments are kept verbatim for the later SKLModel call.
        self._hyperparams = {
            'missing_values': missing_values,
            'features': features,
            'sparse': sparse,
            'error_on_new': error_on_new}

    def fit(self, X, y=None):
        """Create the wrapped estimator and fit it on X (and y if given)."""
        self._wrapped_model = SKLModel(**self._hyperparams)
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Forward to the fitted estimator."""
        return self._wrapped_model.transform(X)
# --- Example 10 ---
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
    """Invalid parameter combinations must raise a ValueError whose
    message matches ``msg_err``."""
    indicator = MissingIndicator(missing_values=-1)
    indicator.set_params(**params)
    with pytest.raises(ValueError, match=msg_err):
        fitted = indicator.fit(X_fit)
        fitted.transform(X_trans)
# --- Example 11 ---
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
    """Fit/transform with a bad configuration raises a matching ValueError."""
    ind = MissingIndicator(missing_values=-1)
    ind.set_params(**params)
    with pytest.raises(ValueError, match=msg_err):
        ind.fit(X_fit).transform(X_trans)
# --- Example 12 ---
# Build binary missing-value indicator columns for X.
# Bug fix: the np.NaN alias was removed in NumPy 2.0 -- use np.nan.
indicator = MissingIndicator(missing_values=np.nan)
indicator = indicator.fit_transform(X)
indicator = pd.DataFrame(
    indicator,
    columns=['m1', 'm3'])  # The only two columns in which missing values are
print(indicator)

# MissingIndicator - more in depth
import numpy as np
from sklearn.impute import MissingIndicator

X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
indicator = MissingIndicator()
indicator.fit(X1)  # Creates the possible indicator columns (i.e., not all)

# transform() reports against the columns learned from X1, so X2's
# missingness is mapped onto X1's pattern.
X2_tr = indicator.transform(X2)
X1_tr = indicator.transform(X1)

print('X2_tr')
print(X2_tr)
print('X1_tr')
print(X1_tr)

#####
# Imputation
#####
from sklearn.impute import SimpleImputer

# Replace each NaN with the column mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# --- Example 13 ---
 def indicator(self):
     # Build a MissingIndicator over every undefined-feature column of
     # the training data; "all" yields one indicator column per input
     # column regardless of whether it actually contains missing values.
     # NOTE(review): assumes self.train_data is a DataFrame and
     # self.undefined_features is a list of its column names — confirm.
     indicator = MissingIndicator(features="all")
     indicator.fit(self.train_data[self.undefined_features])
     return indicator
import numpy as np
from sklearn.impute import SimpleImputer
# Mean imputation: each NaN is replaced by its column's mean.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_data = [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
imp_mean.fit(imp_data)
# Per-column statistics (the means) learned during fit.
imp_mean.statistics_
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
imp_mean.get_params()
# NaNs in X are filled with the means learned from imp_data, not from X.
imp_mean.transform(X)

from sklearn.impute import MissingIndicator
X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
# Default features="missing-only": only columns that had NaNs in X1
# get an indicator column.
indicator = MissingIndicator()
indicator.fit(X1)
# Indices of the columns selected during fit.
indicator.features_
X1
indicator.transform(X1)
X2
# X2 is reported against the columns learned from X1.
indicator.transform(X2)

# features='all': one indicator column per input column, always.
indicator_all = MissingIndicator(features='all')
indicator_all.fit_transform(X1)
indicator_all.fit_transform(X2)
indicator_all.features_

from sklearn.preprocessing import Binarizer
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
# Thresholds values at 0 (default): positive -> 1, else 0.
transformer = Binarizer()
type(transformer)
# --- Example 15 ---
class SimulationDataScaler(object):
    """Winsorize (optionally), standard-scale and constant-impute data,
    while also producing an all-features missing-value indicator.

    Parameters
    ----------
    l, u : float
        Lower/upper winsorization quantile bounds.
    fill_value : float, default=0.0
        Constant substituted for missing values after scaling.
    winsorize : bool, default=False
        Whether to clip extreme values before scaling.
    return_df : bool, default=True
        If True, transform() returns pandas DataFrames.
    """
    def __init__(self,
                 l=0.0,
                 u=1.0,
                 fill_value=0.0,
                 winsorize=False,
                 return_df=True):
        self.l = l
        self.u = u
        self.columns = []
        self.is_fit = False
        self.return_df = return_df

        if not winsorize:
            # Neutralize the stored bounds when winsorizing is disabled.
            # NOTE(review): the Winsorizer below is still constructed
            # with the caller's original l/u, not these reset values --
            # confirm this asymmetry is intentional.
            self.l = 0.0
            self.u = 1.0

        self.winsorize = winsorize
        self.winsorizor = Winsorizer(l=l, u=u)
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='constant',
                                     fill_value=fill_value,
                                     add_indicator=False)
        self.indicator_imputer = MissingIndicator(features="all")

    def fit(self, X, y=None):
        """Fit winsorizer, indicator, scaler and imputer on X.

        Non-finite entries (+/-inf) are converted to NaN *in place*.
        """
        X[~np.isfinite(X)] = np.nan
        self.is_fit = True
        if type(X) is pd.DataFrame:
            self.columns = list(X.columns)

        X_w = self.winsorizor.fit_transform(X)
        self.indicator_imputer.fit(X)
        self.scaler.fit(X_w)
        self.imputer.fit(X_w)

        return self

    def transform(self, X):
        """Return (winsorized+scaled+imputed X, missing indicator).

        Raises
        ------
        RuntimeError
            If called before fit().
        """
        if not self.is_fit:
            # Bug fix: raising a plain string is a TypeError on Python 3;
            # raise a proper exception instead.
            raise RuntimeError("Please fit before running")
        X[~np.isfinite(X)] = np.nan
        X_w = self.winsorizor.transform(X)
        X_imp_ind = self.indicator_imputer.transform(X)
        X_w_s = self.scaler.transform(X_w)
        X_w_s_i = self.imputer.transform(X_w_s)

        if self.return_df:
            return pd.DataFrame(X_w_s_i, columns=self.columns), pd.DataFrame(
                X_imp_ind, columns=self.columns)
        else:
            return X_w_s_i, X_imp_ind

    def fit_transform(self, X):
        """Convenience: fit on X, then transform X."""
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X_w_s_i):
        """Undo only the scaling step (imputation/winsorizing are not
        inverted).

        Raises
        ------
        RuntimeError
            If called before fit().
        """
        if not self.is_fit:
            # Bug fix: same string-raise defect as transform().
            raise RuntimeError("Please fit before running")
        X_w_i = self.scaler.inverse_transform(X_w_s_i)
        if self.return_df:
            return pd.DataFrame(X_w_i, columns=self.columns)
        else:
            return X_w_i
class RobustMissingIndicator(BaseEstimator, TransformerMixin):
    """Binary indicators for missing values.

    Note that this component typically should not be used in a vanilla
    :class:`sklearn.pipeline.Pipeline` consisting of transformers and a classifier,
    but rather could be added using a :class:`sklearn.pipeline.FeatureUnion` or
    :class:`sklearn.compose.ColumnTransformer`.

    Similar to sklearn.impute.MissingIndicator with added functionality
    - RobustMissingIndicator uses a custom mask_function to determine the boolean mask.
      The default mask_function is sagemaker_sklearn_extension.impute.is_finite_numeric
      which checks whether or not a value can be converted into a float.

    Parameters
    ----------
    features : str, optional (default="all")
        Whether the imputer mask should represent all or a subset of
        features: "missing-only" restricts the mask to features that
        contained missing values at fit time, "all" covers every feature.

    error_on_new : boolean, optional (default=True)
        If True, transform raises when features have missing values that
        had none at fit time. Only relevant when ``features="missing-only"``.

    mask_function : callable -> np.array, dtype('bool') (default=None)
        A vectorized python function mapping np.array -> boolean np.array.
        Values for which mask_function(val) == False are treated as
        missing and imputed. Defaults to
        sagemaker_sklearn_extension.impute.is_finite_numeric.
        Use np.vectorize to vectorize singular python functions.

    Notes
    -----
    only accepts 2D, non-sparse inputs
    """

    def __init__(self, features="all", error_on_new=True, mask_function=None):
        self.features = features
        self.error_on_new = error_on_new
        self.mask_function = mask_function

    def _validate_input(self, X):
        # Complex-valued inputs are rejected outright.
        has_complex_dtype = (
            hasattr(X, "dtype")
            and X.dtype is not None
            and hasattr(X.dtype, "kind")
            and X.dtype.kind == "c"
        )
        if has_complex_dtype:
            raise ValueError("Complex data not supported\n{}\n".format(X))

        # Object dtype + copy: values may be arbitrary (non-numeric)
        # and the mask step must not mutate the caller's array.
        return check_array(
            X,
            dtype=np.dtype("O"),
            copy=True,
            force_all_finite=False,
            ensure_2d=True,
        )

    def fit(self, X, y=None):
        """Fit the transformer on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : RobustMissingIndicator
        """
        validated = self._validate_input(X)

        # Fall back to the library default when no mask was supplied.
        self.vectorized_mask_function_ = self.mask_function or is_finite_numeric
        masked = _apply_mask(
            validated, _get_mask(validated, self.vectorized_mask_function_))

        self.missing_indicator_ = MissingIndicator(
            features=self.features, error_on_new=self.error_on_new)
        self.missing_indicator_.fit(masked)

        return self

    def transform(self, X):
        """Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            The input data to complete.

        Returns
        -------
        Xt : {ndarray}, shape (n_samples, n_features)
            The missing indicator for input data; boolean dtype.
        """
        check_is_fitted(
            self, ["missing_indicator_", "vectorized_mask_function_"])
        validated = self._validate_input(X)

        masked = _apply_mask(
            validated, _get_mask(validated, self.vectorized_mask_function_))

        return self.missing_indicator_.transform(masked)

    def _more_tags(self):
        # Tell sklearn's checks that NaN inputs are acceptable.
        return {"allow_nan": True}