Beispiel #1
0
def indicate_missing(train_df, test_df):
    """Add a boolean ``is_missing_<feature>`` column for each tracked feature.

    The tracked features come from the module-level lists
    ``cont_missing_features`` and ``cat_missing_features``. Both frames are
    modified in place and are also returned.

    Args:
        train_df: Training DataFrame containing every tracked feature.
        test_df: Test DataFrame containing every tracked feature.

    Returns:
        The tuple ``(train_df, test_df)`` with the indicator columns added.
    """
    for missing_feature in cont_missing_features + cat_missing_features:
        # features="all" guarantees exactly one mask column even when the
        # feature holds no NaN in either frame; the default "missing-only"
        # would produce a zero-column array that cannot be assigned.
        imp = MissingIndicator(missing_values=np.nan, features="all")
        imp.fit(pd.concat([train_df, test_df])[[missing_feature]])
        indicator_name = "is_missing_" + missing_feature
        # ravel() flattens the (n_samples, 1) mask into a plain 1-D column.
        train_df[indicator_name] = imp.transform(
            train_df[[missing_feature]]).ravel()
        test_df[indicator_name] = imp.transform(
            test_df[[missing_feature]]).ravel()
    return train_df, test_df
Beispiel #2
0
def data_missing_indicator(data_train,var_type_dict,data_test=None):
    '''
    Derive missing-value indicator variables.

    data_train: training set to transform
    var_type_dict: dict recording variable information; its 'numeric_var'
        entry is overwritten in place with the extended feature list
    data_test: test set to transform; optional, no test transform is done
        when it is not given

    return:
    data_train_completed: training set with the derived indicator columns
    var_type_dict: the updated variable-information dict
    data_test_completed: test set with the derived indicator columns
        (only returned when data_test is given)
    '''
    # Pull the feature lists out of the dict.
    numeric_feature = var_type_dict.get('numeric_var',[])
    category_feature = var_type_dict.get('category_var',[])
    print('开始进行特缺失值标记变量衍生'.center(50, '='))
    source_feature = numeric_feature+category_feature
    is_miss_feature = ['is_'+i+'_missing' for i in source_feature]
    print('原始数据维度:',data_train.shape)
    print('新增数据维度:',len(is_miss_feature))
    check_unique(numeric_feature+is_miss_feature)

    # features='all' yields one indicator column per source feature, so the
    # mask width always matches is_miss_feature.
    miss_indicator = MissingIndicator(features='all')
    train_mask = miss_indicator.fit_transform(data_train[source_feature])
    # Reuse the input index: pd.concat(axis=1) aligns on index labels, so a
    # default RangeIndex on the mask frame would misalign (and pad with NaN)
    # whenever data_train carries a non-default index.
    train_indicator = pd.DataFrame(train_mask,columns=is_miss_feature,
                                   index=data_train.index)
    data_train_completed = pd.concat([data_train,train_indicator],axis=1)
    print('变量衍生完成:',data_train_completed.shape)
    # Update var_type_dict: all indicator columns are registered as numeric.
    var_type_dict['numeric_var'] = numeric_feature+is_miss_feature
    # When test data is supplied, transform it with the fitted indicator too.
    if data_test is not None:
        test_mask = miss_indicator.transform(data_test[source_feature])
        test_indicator = pd.DataFrame(test_mask,columns=is_miss_feature,
                                      index=data_test.index)
        data_test_completed = pd.concat([data_test,test_indicator],axis=1)
        return data_train_completed,var_type_dict,data_test_completed
    return data_train_completed,var_type_dict
Beispiel #3
0
def test_missing_indicator_sparse_param(arr_type, missing_values,
                                        param_sparse):
    """Check the output container type for every ``sparse`` setting."""
    fit_rows = [[missing_values, missing_values, 1],
                [4, missing_values, 2]]
    trans_rows = [[missing_values, missing_values, 1], [4, 12, 10]]
    X_fit = arr_type(np.array(fit_rows)).astype(np.float64)
    X_trans = arr_type(np.array(trans_rows)).astype(np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    masks = (indicator.fit_transform(X_fit), indicator.transform(X_trans))

    # Decide once which container both masks are expected to use.
    if param_sparse is True:
        expect_csc = True
    elif param_sparse == 'auto' and missing_values == 0:
        expect_csc = False
    elif param_sparse is False:
        expect_csc = False
    else:
        # Remaining case: the output follows the sparsity of the input.
        expect_csc = sparse.issparse(X_fit)

    for mask in masks:
        if expect_csc:
            assert mask.format == 'csc'
        else:
            assert isinstance(mask, np.ndarray)
Beispiel #4
0
class MissingIndicatorImpl:
    """Thin wrapper that defers construction of ``SKLModel`` until ``fit``."""

    def __init__(
        self,
        missing_values="nan",
        features="missing-only",
        sparse="auto",
        error_on_new=True,
    ):
        # Only record the hyperparameters here; the wrapped model is
        # instantiated from them on every call to fit().
        self._hyperparams = dict(
            missing_values=missing_values,
            features=features,
            sparse=sparse,
            error_on_new=error_on_new,
        )

    def fit(self, X, y=None):
        """Build a fresh wrapped model, fit it and return ``self``."""
        self._wrapped_model = SKLModel(**self._hyperparams)
        # Forward y only when the caller actually supplied it.
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate to the wrapped model's ``transform``."""
        wrapped = self._wrapped_model
        return wrapped.transform(X)
Beispiel #5
0
def test_missing_indicator_sparse_param(arr_type, missing_values,
                                        param_sparse):
    """Verify the mask container produced for each ``sparse`` option."""

    def _assert_container(mask, want_csc):
        # A CSC sparse matrix is expected when want_csc, else a dense ndarray.
        if want_csc:
            assert mask.format == 'csc'
        else:
            assert isinstance(mask, np.ndarray)

    raw_fit = np.array([[missing_values, missing_values, 1],
                        [4, missing_values, 2]])
    raw_trans = np.array([[missing_values, missing_values, 1],
                          [4, 12, 10]])
    X_fit = arr_type(raw_fit).astype(np.float64)
    X_trans = arr_type(raw_trans).astype(np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    X_fit_mask = indicator.fit_transform(X_fit)
    X_trans_mask = indicator.transform(X_trans)

    if param_sparse is True:
        want_csc = True
    elif (param_sparse == 'auto' and missing_values == 0) \
            or param_sparse is False:
        want_csc = False
    else:
        # Otherwise the output mirrors whether the fitted input was sparse.
        want_csc = sparse.issparse(X_fit)

    _assert_container(X_fit_mask, want_csc)
    _assert_container(X_trans_mask, want_csc)
Beispiel #6
0
class MIAImputer(BaseEstimator, TransformerMixin):
    """MIA imputation strategy.

    Every input column is duplicated: in the first copy each np.nan is
    replaced by ``+fill_value`` and in the second copy by ``-fill_value``.
    With ``add_indicator=True`` the missing-value mask is appended as well.

    Parameters
    ----------
    add_indicator : bool, default=False
        Whether to append the output of a ``MissingIndicator`` fitted on the
        training data to the transformed array.
    fill_value : number, default=10**5
        Magnitude of the constant used in place of missing values.
    """
    def __init__(self, add_indicator=False, fill_value=10**5):
        self.add_indicator = add_indicator
        # Stored so that BaseEstimator.get_params()/clone()/repr() can read
        # every constructor argument back from the instance.
        self.fill_value = fill_value
        self.simple_imputer_max = SimpleImputer(strategy='constant',
                                                fill_value=fill_value)
        self.simple_imputer_min = SimpleImputer(strategy='constant',
                                                fill_value=-fill_value)

    def fit(self, X, y=None):
        """Fit both constant imputers (and the indicator, if enabled) on X."""
        # Re-sync the constants so a later set_params(fill_value=...) is
        # honoured by the already-constructed imputers.
        self.simple_imputer_max.set_params(fill_value=self.fill_value)
        self.simple_imputer_min.set_params(fill_value=-self.fill_value)
        self.simple_imputer_max.fit(X, y)
        self.simple_imputer_min.fit(X, y)
        if self.add_indicator:
            # error_on_new=False: tolerate NaNs at transform time in columns
            # that had none during fit.
            self.indicator_ = MissingIndicator(missing_values=np.nan,
                                               error_on_new=False)
            self.indicator_.fit(X)
        return self

    def transform(self, X):
        """Return ``[X_max | X_min]`` (plus the mask when ``add_indicator``)."""
        if self.add_indicator:
            # Compute the mask from the raw input, before any imputation.
            X_trans_indicator = self.indicator_.transform(X)

        X_max = self.simple_imputer_max.transform(X)
        X_min = self.simple_imputer_min.transform(X)
        X = np.hstack((X_max, X_min))

        if self.add_indicator:
            X = np.hstack((X, X_trans_indicator))

        return X
Beispiel #7
0
def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type):
    """Sparse input combined with ``missing_values=0`` must raise ValueError."""
    missing_values = 0
    expected_message = "Sparse input with missing_values=0"

    dense_fit = np.array([[missing_values, missing_values, 1], [4, missing_values, 2]])
    dense_trans = np.array([[missing_values, missing_values, 1], [4, 12, 10]])

    # Same data, converted to the container type under test.
    converted_fit = arr_type(dense_fit)
    converted_trans = arr_type(dense_trans)

    indicator = MissingIndicator(missing_values=missing_values)

    with pytest.raises(ValueError, match=expected_message):
        indicator.fit_transform(converted_fit)

    # Fit on the dense array first so that the transform check below runs
    # against an estimator that has been fitted.
    indicator.fit_transform(dense_fit)
    with pytest.raises(ValueError, match=expected_message):
        indicator.transform(converted_trans)
Beispiel #8
0
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    """Check dense and sparse masks from fit_transform/transform."""
    raw_fit = np.array([[missing_values, missing_values, 1],
                        [4, missing_values, 2]])
    raw_trans = np.array([[missing_values, missing_values, 1],
                          [4, 12, 10]])

    # Cast the inputs and the expected masks to the requested container
    # type and dtype.
    X_fit = arr_type(raw_fit).astype(dtype)
    X_trans = arr_type(raw_trans).astype(dtype)
    X_fit_expected = np.array([[1, 1, 0], [0, 1, 0]]).astype(dtype)
    X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]]).astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features,
                                 sparse=False)
    dense_fit = indicator.fit_transform(X_fit)
    dense_trans = indicator.transform(X_trans)
    dense_masks = (dense_fit, dense_trans)

    for mask in dense_masks:
        assert mask.shape[1] == n_features

    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(dense_fit, X_fit_expected[:, features_indices])
    assert_allclose(dense_trans, X_trans_expected[:, features_indices])

    for mask in dense_masks:
        assert mask.dtype == bool
    for mask in dense_masks:
        assert isinstance(mask, np.ndarray)

    # Repeat with sparse output and compare against the dense masks.
    indicator.set_params(sparse=True)
    sparse_fit = indicator.fit_transform(X_fit)
    sparse_trans = indicator.transform(X_trans)
    sparse_masks = (sparse_fit, sparse_trans)

    for mask in sparse_masks:
        assert mask.dtype == bool
    for mask in sparse_masks:
        assert mask.format == 'csc'
    assert_allclose(sparse_fit.toarray(), dense_fit)
    assert_allclose(sparse_trans.toarray(), dense_trans)
Beispiel #9
0
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    """Check dense and sparse masks when every column has a missing entry."""
    raw_fit = np.array([[missing_values, missing_values, 1],
                        [4, 2, missing_values]])
    raw_trans = np.array([[missing_values, missing_values, 1],
                          [4, 12, 10]])

    # Cast the inputs and the expected masks to the requested container
    # type and dtype.
    X_fit = arr_type(raw_fit).astype(dtype)
    X_trans = arr_type(raw_trans).astype(dtype)
    X_fit_expected = np.array([[1, 1, 0], [0, 0, 1]]).astype(dtype)
    X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]]).astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features,
                                 sparse=False)
    dense_fit = indicator.fit_transform(X_fit)
    dense_trans = indicator.transform(X_trans)
    dense_masks = (dense_fit, dense_trans)

    for mask in dense_masks:
        assert mask.shape[1] == n_features

    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(dense_fit, X_fit_expected[:, features_indices])
    assert_allclose(dense_trans, X_trans_expected[:, features_indices])

    for mask in dense_masks:
        assert mask.dtype == bool
    for mask in dense_masks:
        assert isinstance(mask, np.ndarray)

    # Repeat with sparse output and compare against the dense masks.
    indicator.set_params(sparse=True)
    sparse_fit = indicator.fit_transform(X_fit)
    sparse_trans = indicator.transform(X_trans)
    sparse_masks = (sparse_fit, sparse_trans)

    for mask in sparse_masks:
        assert mask.dtype == bool
    for mask in sparse_masks:
        assert mask.format == 'csc'
    assert_allclose(sparse_fit.toarray(), dense_fit)
    assert_allclose(sparse_trans.toarray(), dense_trans)
Beispiel #10
0
def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type):
    """``missing_values=0`` with sparse input raises in fit and in transform."""
    missing_values = 0
    error_pattern = "Sparse input with missing_values=0"

    fit_rows = [[missing_values, missing_values, 1],
                [4, missing_values, 2]]
    trans_rows = [[missing_values, missing_values, 1],
                  [4, 12, 10]]
    X_fit = np.array(fit_rows)
    X_trans = np.array(trans_rows)

    # Convert the dense arrays to the container type under test.
    X_fit_sparse, X_trans_sparse = arr_type(X_fit), arr_type(X_trans)

    indicator = MissingIndicator(missing_values=missing_values)

    with pytest.raises(ValueError, match=error_pattern):
        indicator.fit_transform(X_fit_sparse)

    # A dense fit must succeed; only the converted transform input is
    # expected to be rejected afterwards.
    indicator.fit_transform(X_fit)
    with pytest.raises(ValueError, match=error_pattern):
        indicator.transform(X_trans_sparse)
Beispiel #11
0
class _MissingIndicatorImpl:
    """Wrapper that builds ``Op`` from keyword hyperparameters at init time."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        # The wrapped operator is created eagerly from the stored settings.
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped operator and return ``self``."""
        # Forward y only when the caller actually supplied it.
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate to the wrapped operator's ``transform``."""
        wrapped = self._wrapped_model
        return wrapped.transform(X)
Beispiel #12
0
class MissingIndicatorComponent(AutoSklearnPreprocessingAlgorithm):
    """Preprocessing component wrapping ``sklearn.impute.MissingIndicator``."""

    def __init__(self,
                 missing_values=np.nan,
                 features: str = "missing-only",
                 random_state=None):
        super().__init__()
        self.missing_values = missing_values
        self.features = features
        # Stored only; not read by any method of this class.
        self.random_state = random_state

    def fit(self, X, Y=None):
        """Create the sklearn indicator from the stored settings and fit it."""
        from sklearn.impute import MissingIndicator
        indicator = MissingIndicator(missing_values=self.missing_values,
                                     features=self.features)
        self.preprocessor = indicator
        self.preprocessor.fit(X, Y)
        return self

    def transform(self, X):
        """Return the missing-value mask of X from the fitted indicator."""
        # NOTE(review): relies on `preprocessor` existing before fit();
        # presumably the base-class __init__ sets it to None - confirm.
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static capability description of this component."""
        properties = {
            'shortname': 'MissingIndicator',
            'name': 'Missing Indicator',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            'is_deterministic': True,
            'input': (DENSE, UNSIGNED_DATA),
            'output': (INPUT, ),
        }
        return properties

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Return a configuration space with the single ``features`` choice."""
        features_choice = CategoricalHyperparameter(
            "features",
            ["missing-only", "all"],
            default_value="missing-only")

        config_space = ConfigurationSpace()
        config_space.add_hyperparameters([features_choice])
        return config_space
Beispiel #13
0
class MissingIndicatorImpl():
    """Wrapper that records hyperparameters and builds ``SKLModel`` in ``fit``."""

    def __init__(self, missing_values='nan', features='missing-only', sparse='auto', error_on_new=True):
        # Keep the settings only; the wrapped model is constructed in fit().
        names = ('missing_values', 'features', 'sparse', 'error_on_new')
        values = (missing_values, features, sparse, error_on_new)
        self._hyperparams = dict(zip(names, values))

    def fit(self, X, y=None):
        """Instantiate the wrapped model, fit it and return ``self``."""
        model = SKLModel(**self._hyperparams)
        self._wrapped_model = model
        # y is forwarded only when it was provided.
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def transform(self, X):
        """Delegate to the wrapped model's ``transform``."""
        wrapped = self._wrapped_model
        return wrapped.transform(X)
Beispiel #14
0
class SimulationDataScaler(object):
    """Winsorize, standardize and constant-impute data, and flag missing cells.

    The pipeline is: replace non-finite entries by NaN, winsorize, standard
    scale, then fill the remaining NaNs with ``fill_value``. A separate
    ``MissingIndicator(features="all")`` reports which cells were NaN.

    Args:
        l: Lower bound passed to ``Winsorizer``; forced to 0.0 when
            ``winsorize`` is False.
        u: Upper bound passed to ``Winsorizer``; forced to 1.0 when
            ``winsorize`` is False.
        fill_value: Constant used by the imputer for missing values.
        winsorize: Whether the supplied ``l``/``u`` bounds are applied.
        return_df: If True, ``transform``/``inverse_transform`` return
            DataFrames, otherwise the raw arrays.

    Note:
        ``fit`` and ``transform`` overwrite non-finite entries of the passed
        ``X`` with NaN in place.
    """

    def __init__(self,
                 l=0.0,
                 u=1.0,
                 fill_value=0.0,
                 winsorize=False,
                 return_df=True):
        # With winsorizing disabled fall back to the (0.0, 1.0) bounds for
        # the Winsorizer too, not only for the stored attributes.
        # NOTE(review): presumably (0.0, 1.0) makes Winsorizer a no-op -
        # confirm against its implementation.
        if not winsorize:
            l, u = 0.0, 1.0
        self.l = l
        self.u = u
        self.columns = []
        self.is_fit = False
        self.return_df = return_df

        self.winsorize = winsorize
        self.winsorizor = Winsorizer(l=self.l, u=self.u)
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='constant',
                                     fill_value=fill_value,
                                     add_indicator=False)
        self.indicator_imputer = MissingIndicator(features="all")

    def _require_fit(self):
        """Raise RuntimeError unless fit() has completed."""
        if not self.is_fit:
            # Raising a plain string is itself a TypeError in Python 3.
            raise RuntimeError("Please fit the scaler before running")

    def _frame_columns(self):
        """Return the stored column labels, or None to use default labels."""
        return self.columns or None

    def fit(self, X):
        """Fit the winsorizer, indicator, scaler and imputer on X.

        Returns:
            self
        """
        X[~np.isfinite(X)] = np.nan
        # Remember the labels for DataFrame output; reset them for other
        # inputs so stale labels from an earlier fit are never reused.
        self.columns = list(X.columns) if isinstance(X, pd.DataFrame) else []

        X_w = self.winsorizor.fit_transform(X)
        self.indicator_imputer.fit(X)
        self.scaler.fit(X_w)
        self.imputer.fit(X_w)

        # Flag as fitted only once every component was fitted successfully.
        self.is_fit = True
        return self

    def transform(self, X):
        """Return ``(scaled_and_imputed, missing_mask)`` for X.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        self._require_fit()
        X[~np.isfinite(X)] = np.nan
        X_w = self.winsorizor.transform(X)
        X_imp_ind = self.indicator_imputer.transform(X)
        X_w_s = self.scaler.transform(X_w)
        X_w_s_i = self.imputer.transform(X_w_s)

        if self.return_df:
            columns = self._frame_columns()
            return (pd.DataFrame(X_w_s_i, columns=columns),
                    pd.DataFrame(X_imp_ind, columns=columns))
        return X_w_s_i, X_imp_ind

    def fit_transform(self, X):
        """Fit on X and return ``transform(X)``."""
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X_w_s_i):
        """Undo the standard scaling (winsorizing and imputation are kept).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        self._require_fit()
        X_w_i = self.scaler.inverse_transform(X_w_s_i)
        if self.return_df:
            return pd.DataFrame(X_w_i, columns=self._frame_columns())
        return X_w_i
Beispiel #15
0
# Notebook-style exploration script: bare expressions are meant to be
# displayed interactively. `train`, `test`, `data_info`, `np`, `pd` and
# `SimpleImputer` must already be defined before this point.
data_info(train)
data_info(test)
 
# Check the event rate of the target: raw counts, then proportions.
from collections import Counter
Counter(train.target)
train.target.value_counts(normalize=True)


# Create NA-indicator columns for all the columns containing NAs.
# error_on_new=False: do not raise when transform sees NaNs in a column
# that had none during fit.
mindicator = MissingIndicator(missing_values=np.nan,error_on_new=False)
z = mindicator.fit_transform(train.drop('target',axis = 1))
# NOTE(review): features_ indexes the columns of train.drop('target'), but
# is applied to train.columns here - the names only line up if 'target'
# comes after every flagged column; confirm.
cols_na_ind = [x+'_na_ind' for x in train.columns[mindicator.features_]]
# 1*z turns the boolean mask into integers.
# NOTE(review): the new frame gets a default RangeIndex and concat(axis=1)
# aligns on index - assumes train also has a default index; confirm.
train = pd.concat([train,pd.DataFrame(1*z,columns = cols_na_ind)],axis = 1)
train.head(1)
# Apply the indicator fitted on train to the test frame.
z = mindicator.transform(test)
cols_na_ind = [x+'_na_ind' for x in test.columns[mindicator.features_]]
test = pd.concat([test,pd.DataFrame(1*z,columns = cols_na_ind)],axis = 1)
test.head(1)

# Treating null values: per-variable distribution (NaN included) and the
# target rate per category, with NaN shown as its own 'Nan' category.
var = 'gender'
#f'count of NULLs in {var} : {train[[var]].isna().sum()[0]}'
train[var].value_counts(dropna = False,normalize = True)
pd.crosstab(index = train[var].fillna('Nan'), columns = train.target,margins = True,normalize='index',)

var = 'enrolled_university'
train[var].value_counts(dropna = False,normalize=True)
pd.crosstab(index = train[var].fillna('Nan'), columns = train.target,margins = True,normalize='index',)

# `var` is reassigned here but not used by the statements that follow.
var = 'education_level'
# Toy SimpleImputer example: fit column means on imp_data, then fill the
# NaNs of X with those means.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_data = [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
imp_mean.fit(imp_data)
imp_mean.statistics_
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
imp_mean.get_params()
imp_mean.transform(X)

# Toy MissingIndicator example: fit on X1, then transform both X1 and X2
# with the indicator fitted on X1.
from sklearn.impute import MissingIndicator
X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
indicator = MissingIndicator()
indicator.fit(X1)
indicator.features_
X1
indicator.transform(X1)
X2
indicator.transform(X2)

# Same data with features='all'; the indicator is refit on each call, so
# features_ below reflects the fit on X2.
indicator_all = MissingIndicator(features='all')
indicator_all.fit_transform(X1)
indicator_all.fit_transform(X2)
indicator_all.features_

# Toy Binarizer example with its default settings.
from sklearn.preprocessing import Binarizer
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]
transformer = Binarizer()
type(transformer)
transformer.fit(X)

transformer.transform(X)
class RobustMissingIndicator(BaseEstimator, TransformerMixin):
    """Binary missing-value indicators driven by a custom validity check.

    This transformer is usually combined with other features through a
    :class:`sklearn.pipeline.FeatureUnion` or a
    :class:`sklearn.compose.ColumnTransformer` rather than placed directly in
    a plain :class:`sklearn.pipeline.Pipeline` of transformers followed by a
    classifier.

    It behaves like sklearn.impute.MissingIndicator, except that the values
    treated as missing are selected by ``mask_function``. When none is given,
    sagemaker_sklearn_extension.impute.is_finite_numeric is used, which
    checks whether or not a value can be converted into a float.

    Parameters
    ----------
    features : str, optional (default="all")
        Which features the indicator mask covers.

        - "missing-only": only the features that contained missing values
          at fit time.
        - "all" (default): every feature.

    error_on_new : boolean, optional (default=True)
        If True (default), transform raises an error when a feature has
        missing values at transform time but had none at fit time. Only
        relevant when ``features="missing-only"``.

    mask_function : callable -> np.array, dtype('bool') (default=None)
        Vectorized function taking an np.array and returning an np.array of
        dtype('bool'). Every value for which mask_function(val) == False is
        treated as a value to impute, i.e. it is flagged by the indicator.

        Wrap a scalar python function with np.vectorize to use it here.

        Defaults to sagemaker_sklearn_extension.impute.is_finite_numeric.

    Notes
    -----
    only accepts 2D, non-sparse inputs
    """
    def __init__(self, features="all", error_on_new=True, mask_function=None):
        self.features = features
        self.error_on_new = error_on_new
        self.mask_function = mask_function

    def _validate_input(self, X):
        """Reject complex data and return X as a copied 2D object array."""
        dtype = getattr(X, "dtype", None)
        if dtype is not None and getattr(dtype, "kind", None) == "c":
            raise ValueError("Complex data not supported\n{}\n".format(X))

        # Non-finite entries must pass validation: they are what gets flagged.
        return check_array(X,
                           dtype=np.dtype("O"),
                           copy=True,
                           force_all_finite=False,
                           ensure_2d=True)

    def _mask_invalid(self, X):
        """Run the stored mask function over X and apply the resulting mask."""
        # NOTE(review): _apply_mask presumably replaces the values rejected
        # by the mask with NaN - confirm against its definition.
        mask = _get_mask(X, self.vectorized_mask_function_)
        return _apply_mask(X, mask)

    def fit(self, X, y=None):
        """Fit the indicator on X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Training data with ``n_samples`` rows and ``n_features`` columns.

        Returns
        -------
        self : RobustMissingIndicator
        """
        validated = self._validate_input(X)

        # Fall back to the default check when no mask_function was supplied.
        self.vectorized_mask_function_ = self.mask_function or is_finite_numeric
        masked = self._mask_invalid(validated)

        self.missing_indicator_ = MissingIndicator(
            features=self.features, error_on_new=self.error_on_new)
        self.missing_indicator_.fit(masked)

        return self

    def transform(self, X):
        """Compute the missing-value indicator for X.

        Parameters
        ----------
        X : {array-like}, shape (n_samples, n_features)
            Data for which the indicator is computed.

        Returns
        -------
        Xt : {ndarray}, shape (n_samples, n_features)
            Boolean missing-value indicator of the input data.
        """
        fitted_attributes = ["missing_indicator_", "vectorized_mask_function_"]
        check_is_fitted(self, fitted_attributes)

        validated = self._validate_input(X)
        masked = self._mask_invalid(validated)

        return self.missing_indicator_.transform(masked)

    def _more_tags(self):
        # Estimator tag: NaN values are valid input for this transformer.
        return {"allow_nan": True}
Beispiel #18
0
# Continuation of a larger script: `indicator` and `pd` are defined before
# this point (not visible here).
#print(indicator)
# Wrap the indicator output in a DataFrame with one named column per
# flagged feature.
indicator = pd.DataFrame(
    indicator,
    columns=['m1', 'm3'])  # presumably the only two columns that contain missing values
print(indicator)

# MissingIndicator - more in depth
import numpy as np
from sklearn.impute import MissingIndicator

X1 = np.array([[np.nan, 1, 3], [4, 0, np.nan], [8, 1, 0]])
X2 = np.array([[5, 1, np.nan], [np.nan, 2, 3], [2, 4, 0]])
indicator = MissingIndicator()
indicator.fit(X1)  # Creates the possible indicator columns (i.e., not all)

# Both arrays are transformed with the indicator that was fitted on X1.
X2_tr = indicator.transform(X2)
X1_tr = indicator.transform(X1)

print('X2_tr')
print(X2_tr)
print('X1_tr')
print(X1_tr)

#####
# Imputation
#####
from sklearn.impute import SimpleImputer

# Replace NaN entries with the per-column mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
# NOTE(review): X is not defined in the visible part of this script -
# presumably it comes from earlier code; confirm.
Y = imp.fit_transform(X)
print(Y)