Beispiel #1
0
    def transform(self, X):
        """Replace the missing values of every sample by interpolation.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data with missing values.

        Returns
        -------
        X_new : array-like, shape = (n_samples, n_timestamps)
            Data without missing values.

        """
        missing_values, force_all_finite = self._check_params()
        X = check_array(X, dtype='float64', force_all_finite=force_all_finite)
        timestamps = np.arange(X.shape[1])

        # Boolean mask that is True wherever a value is present.
        mask_builder = MissingIndicator(
            missing_values=missing_values, features='all', sparse=False
        )
        observed = ~mask_builder.fit_transform(X)

        # Each row is imputed independently from its own observed points.
        imputed_rows = [
            self._impute_one_sample(sample, is_observed, timestamps)
            for sample, is_observed in zip(X, observed)
        ]
        return np.asarray(imputed_rows)
    def test_missing_indicator_float_inputs_isnan_false_tvm(self):
        """Run the tvm backend check with 0 as the missing-value marker."""
        for feature_mode in ("all", "missing-only"):
            samples = np.array([[1, 2], [0, 3], [7, 6]], dtype=np.float32)
            fitted = MissingIndicator(features=feature_mode, missing_values=0)
            fitted.fit(samples)

            self._test_sklearn_missing_indic(fitted, samples, "tvm")
Beispiel #3
0
    def impute_dynamic_dataframe(self, df_dynamic):
        """Impute the ``self.feat_name`` columns and add measurement flags.

        Steps: (1) interpolate/back-fill the feature columns but keep the
        filled value only inside short gaps, (2) add one ``if_<feature>``
        column per feature, (3) replace every remaining NaN with 0 and
        reorder the columns.

        Parameters
        ----------
        df_dynamic : pandas.DataFrame
            Frame containing the columns listed in ``self.feat_name``
            (a list). It is copied, not modified.
            NOTE(review): presumably also holds 'pid' and 'ts' -- confirm
            against callers.

        Returns
        -------
        pandas.DataFrame
            Columns ``['index', 'pid', 'ts']`` followed by the features and
            the ``if_<feature>`` flags.
        """

        # interpolate only where the gap is shorter than
        # self.missing_gap_thresh rows
        mask = df_dynamic[self.feat_name].copy()
        df_dynamic_imp = df_dynamic.copy()
        for column in self.feat_name:
            df = pd.DataFrame(df_dynamic_imp[column])
            # id that increases every time the null/non-null state flips,
            # i.e. one id per consecutive run
            df['new'] = ((df.notnull() != df.shift().notnull()).cumsum())
            # helper column so that a groupby count yields the run length
            df['ones'] = 1
            # True where the run is short enough or the value was observed
            mask[column] = (
                df.groupby('new')['ones'].transform('count') <
                self.missing_gap_thresh) | df_dynamic_imp[column].notnull()
        # positions where mask is False become NaN again after indexing
        df_dynamic_imp[self.feat_name] = df_dynamic_imp[
            self.feat_name].interpolate().bfill()[mask]

        # add dummy variables: 1 if a value is present after the
        # interpolation step above, 0 otherwise
        indicator = MissingIndicator(missing_values=np.nan, features='all')
        X = df_dynamic_imp[self.feat_name].values
        if_missing = indicator.fit_transform(X)
        if_measured = 1 - if_missing.astype(int)
        dummy_names = []
        for ind, feat in enumerate(self.feat_name):
            dummy_name = 'if_' + feat
            df_dynamic_imp[dummy_name] = if_measured[:, ind]
            dummy_names.append(dummy_name)

        # replace all remaining NaN with 0, then select/reorder columns.
        # NOTE(review): reindex runs after fillna, so an 'index' column that
        # does not already exist would be created as NaN -- confirm intent.
        df_dynamic_imp = df_dynamic_imp.fillna(value=0)
        df_dynamic_imp = df_dynamic_imp.reindex(['index', 'pid', 'ts'] +
                                                self.feat_name + dummy_names,
                                                axis=1)

        return df_dynamic_imp
Beispiel #4
0
class MIAImputer(BaseEstimator, TransformerMixin):
    """MIA imputation strategy.

    Every column is duplicated: in the first copy each np.nan is replaced
    by ``+fill_value`` and in the second copy by ``-fill_value`` (large
    finite stand-ins for +inf and -inf).

    Parameters
    ----------
    add_indicator : bool, default=False
        When True, the output of a ``MissingIndicator`` fitted in ``fit``
        is appended to the transformed array.
    fill_value : number, default=10**5
        Magnitude of the constant substituted for missing values.
    """
    def __init__(self, add_indicator=False, fill_value=10**5):
        self.add_indicator = add_indicator
        # Stored as an attribute so BaseEstimator.get_params()/clone() can
        # read it back; previously the argument was accepted but ignored.
        self.fill_value = fill_value
        self.simple_imputer_max = SimpleImputer(strategy='constant',
                                                fill_value=fill_value)
        self.simple_imputer_min = SimpleImputer(strategy='constant',
                                                fill_value=-fill_value)

    def fit(self, X, y=None):
        """Fit both constant imputers and, if requested, the indicator.

        Returns ``self``.
        """
        self.simple_imputer_max.fit(X, y)
        self.simple_imputer_min.fit(X, y)
        if self.add_indicator:
            # error_on_new=False: columns that only show missing values at
            # transform time must not raise.
            self.indicator_ = MissingIndicator(missing_values=np.nan,
                                               error_on_new=False)
            self.indicator_.fit(X)
        return self

    def transform(self, X):
        """Return ``[X filled with +fill_value | X filled with -fill_value]``.

        The missing-indicator columns are appended on the right when
        ``add_indicator`` is True.
        """
        # Compute the indicator before imputing, while NaNs are still there.
        if self.add_indicator:
            X_trans_indicator = self.indicator_.transform(X)

        X_max = self.simple_imputer_max.transform(X)
        X_min = self.simple_imputer_min.transform(X)
        X = np.hstack((X_max, X_min))

        if self.add_indicator:
            X = np.hstack((X, X_trans_indicator))

        return X
Beispiel #5
0
def data_missing_indicator(data_train,var_type_dict,data_test=None):
    '''
    Derive one missing-value indicator column per numeric/categorical feature.

    data_train: training set to transform
    var_type_dict: dict describing the variables; the keys 'numeric_var' and
        'category_var' are read, and 'numeric_var' is updated in place
    data_test: optional test set; when omitted no test transform is done

    return:
    data_train_completed: training set with the indicator columns appended
    var_type_dict: the updated variable-information dict
    data_test_completed: test set with the indicator columns appended
        (only returned when data_test is given)
    '''
    numeric_feature = var_type_dict.get('numeric_var',[])
    category_feature = var_type_dict.get('category_var',[])
    print('开始进行特缺失值标记变量衍生'.center(50, '='))
    # Pull the feature lists out of the dict and name the new columns.
    source_feature = numeric_feature+category_feature
    is_miss_feature = ['is_'+i+'_missing' for i in source_feature]
    print('原始数据维度:',data_train.shape)
    print('新增数据维度:',len(is_miss_feature))
    check_unique(numeric_feature+is_miss_feature)

    # features='all' keeps one indicator per input column so the column
    # count always matches is_miss_feature.
    miss_indicator = MissingIndicator(features='all')
    train_flags = miss_indicator.fit_transform(data_train[source_feature])
    # Reuse the input index: a default RangeIndex would misalign the rows in
    # the axis=1 concat whenever data_train is not indexed 0..n-1.
    train_flags = pd.DataFrame(train_flags,columns=is_miss_feature,
                               index=data_train.index)
    data_train_completed = pd.concat([data_train,train_flags],axis=1)
    print('变量衍生完成:',data_train_completed.shape)
    # Update var_type_dict: all new indicator columns are added to numeric_var.
    var_type_dict['numeric_var'] = numeric_feature+is_miss_feature
    # If test data was supplied, transform it with the fitted indicator too.
    if data_test is not None:
        test_flags = miss_indicator.transform(data_test[source_feature])
        test_flags = pd.DataFrame(test_flags,columns=is_miss_feature,
                                  index=data_test.index)
        data_test_completed = pd.concat([data_test,test_flags],axis=1)
        return data_train_completed,var_type_dict,data_test_completed
    return data_train_completed,var_type_dict
Beispiel #6
0
def test_missing_indicator_sparse_param(arr_type, missing_values,
                                        param_sparse):
    """Check the container type of the mask for each ``sparse`` setting."""
    fit_rows = [[missing_values, missing_values, 1],
                [4, missing_values, 2]]
    trans_rows = [[missing_values, missing_values, 1], [4, 12, 10]]
    X_fit = arr_type(np.array(fit_rows)).astype(np.float64)
    X_trans = arr_type(np.array(trans_rows)).astype(np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    masks = (indicator.fit_transform(X_fit), indicator.transform(X_trans))

    # Decide once which output format is expected, then check both masks.
    if param_sparse is True:
        expect_csc = True
    elif (param_sparse == 'auto' and missing_values == 0) \
            or param_sparse is False:
        expect_csc = False
    else:
        expect_csc = sparse.issparse(X_fit)

    for mask in masks:
        if expect_csc:
            assert mask.format == 'csc'
        else:
            assert isinstance(mask, np.ndarray)
Beispiel #7
0
class MissingIndicatorImpl:
    """Wrapper that builds an ``SKLModel`` from stored hyperparameters."""

    def __init__(
        self,
        missing_values="nan",
        features="missing-only",
        sparse="auto",
        error_on_new=True,
    ):
        # Kept as a dict so it can be splatted into SKLModel in fit().
        self._hyperparams = dict(
            missing_values=missing_values,
            features=features,
            sparse=sparse,
            error_on_new=error_on_new,
        )

    def fit(self, X, y=None):
        """Create a fresh wrapped model, fit it, and return ``self``."""
        self._wrapped_model = SKLModel(**self._hyperparams)
        # y is forwarded only when the caller supplied it.
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Delegate to the wrapped model's ``transform``."""
        wrapped = self._wrapped_model
        return wrapped.transform(X)
Beispiel #8
0
def test_missing_indicator_sparse_param(arr_type, missing_values,
                                        param_sparse):
    """Verify the output format produced for each ``sparse`` parameter."""
    mv = missing_values
    X_fit = arr_type(np.array([[mv, mv, 1], [4, mv, 2]])).astype(np.float64)
    X_trans = arr_type(np.array([[mv, mv, 1], [4, 12, 10]])).astype(
        np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    X_fit_mask = indicator.fit_transform(X_fit)
    X_trans_mask = indicator.transform(X_trans)

    def assert_csc():
        assert X_fit_mask.format == 'csc'
        assert X_trans_mask.format == 'csc'

    def assert_dense():
        assert isinstance(X_fit_mask, np.ndarray)
        assert isinstance(X_trans_mask, np.ndarray)

    if param_sparse is True:
        assert_csc()
    elif param_sparse == 'auto' and missing_values == 0:
        assert_dense()
    elif param_sparse is False:
        assert_dense()
    elif sparse.issparse(X_fit):
        # remaining settings follow the format of the input
        assert_csc()
    else:
        assert_dense()
def impute_data(X, feature_name_in):
    """Mean-impute numeric data and append missing-value flag columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input data. String columns containing the text 'na' have exact
        'na' entries replaced by NaN; X is modified in place.
    feature_name_in : list of str
        Column names, in the same order as the columns of X.

    Returns
    -------
    X_out : numpy.ndarray
        Imputed features followed by one 0/1 column for every feature that
        had at least one missing value.
    feature_name_out : list of str
        ``feature_name_in`` followed by ``<name>_miss`` for each flag column.
    """
    to_replace_dict = {'na': np.nan}
    for name in feature_name_in:
        na_cnt = 0
        if pd.api.types.is_string_dtype(X[name]):
            na_cnt = X[name].str.contains('na').sum()
        if na_cnt > 0:
            # Replace inside this column only; the previous code assigned
            # the whole replaced DataFrame to a single column.
            X[name] = X[name].replace(to_replace_dict)

    indicator = MissingIndicator(error_on_new=True,
                                 features='all',
                                 missing_values=np.nan,
                                 sparse=False)
    X_binary_miss = indicator.fit_transform(X).astype(int)
    X_binary_miss_sum = np.sum(X_binary_miss, axis=0)

    feature_name_out = feature_name_in.copy()
    to_del = []
    for i, n_missing in enumerate(X_binary_miss_sum):
        if n_missing > 0:
            feature_name_out.append(feature_name_in[i] + "_miss")
        else:
            # Flag columns that are all zero carry no information.
            to_del.append(i)
    X_binary_miss = np.delete(X_binary_miss, to_del, axis=1)

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_tr = imp.fit(X).transform(X)
    X_out = np.concatenate((X_tr, X_binary_miss), axis=1)
    return X_out, feature_name_out
Beispiel #10
0
 def fit(self, X, y=None):
     """Fit both constant imputers and, if enabled, the missing indicator."""
     for imputer in (self.simple_imputer_max, self.simple_imputer_min):
         imputer.fit(X, y)
     if not self.add_indicator:
         return self
     self.indicator_ = MissingIndicator(error_on_new=False,
                                        missing_values=np.nan)
     self.indicator_.fit(X)
     return self
    def test_missing_indicator_float_inputs(self):
        """Check float32 input with NaN markers on both torch backends."""
        for feature_mode in ("all", "missing-only"):
            samples = np.array([[1, 2], [np.nan, 3], [7, 6]],
                               dtype=np.float32)
            fitted = MissingIndicator(features=feature_mode)
            fitted.fit(samples)

            for backend_name in ("torch", "torch.jit"):
                self._test_sklearn_missing_indic(fitted, samples,
                                                 backend_name)
Beispiel #12
0
def test_missing_indicator_no_missing():
    """All features are dropped when nothing is missing (#13491).

    Applies to ``features='missing-only'``.
    """
    data = np.array([[1, 1], [1, 1]])
    indicator = MissingIndicator(missing_values=-1, features='missing-only')

    mask = indicator.fit_transform(data)

    n_kept_features = mask.shape[1]
    assert n_kept_features == 0
Beispiel #13
0
def test_missing_indicator_sparse_no_explicit_zeros():
    """Sparse masks must not store explicit zeros (#13491).

    If every stored entry is a True, the number of stored entries equals
    the sum of the mask.
    """
    rows = [[0, 1, 2], [1, 2, 0], [2, 0, 1]]
    X = sparse.csr_matrix(rows)

    indicator = MissingIndicator(missing_values=1, features='all')
    mask = indicator.fit_transform(X)

    stored_entries = mask.getnnz()
    assert stored_entries == mask.sum()
def test_missing_indicator_no_missing():
    """With ``features='missing-only'`` and no missing value, no column
    is kept (#13491)."""
    X = np.array([[1, 1],
                  [1, 1]])

    transformed = MissingIndicator(
        features='missing-only', missing_values=-1
    ).fit_transform(X)

    _, n_columns = transformed.shape[0], transformed.shape[1]
    assert n_columns == 0
Beispiel #15
0
def get_indicators(data):
    """Return a missing-value mask for every column of ``data`` but the last.

    The result is a DataFrame whose columns are named ``ind_1``, ``ind_2``,
    ... in input column order.
    """
    features = data.iloc[:, :-1]
    flags = MissingIndicator(missing_values=np.nan,
                             features='all').fit_transform(features)

    # Positional labels 0..k-1 become 1-based names with an 'ind_' prefix.
    mask_data = pd.DataFrame(flags).rename(
        columns=lambda position: f"ind_{position + 1}")
    return mask_data
def test_missing_indicator_sparse_no_explicit_zeros():
    """Non-missing entries must not show up as explicit zeros in the
    sparse mask (#13491)."""
    X = sparse.csr_matrix([[0, 1, 2],
                           [1, 2, 0],
                           [2, 0, 1]])

    mask = MissingIndicator(features='all',
                            missing_values=1).fit_transform(X)

    # Equal only if every stored element is True.
    n_true = mask.sum()
    assert mask.getnnz() == n_true
def get_results(dataset):
    """Score a random forest on full data and on three imputation variants.

    Zeros are written into one randomly chosen feature of 75% of the rows
    and act as the missing-value marker. Uses the module-level ``rng``.

    Parameters
    ----------
    dataset : object with ``data`` and ``target`` attributes

    Returns
    -------
    tuple of four ``(mean, std)`` pairs
        Cross-validated negative MSE for: the full data, zeros left as is,
        mean imputation + indicator, chained imputation + indicator.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Estimate the score on the entire dataset, with no missing values
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    # Builtin bool: the np.bool alias was removed from NumPy.
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples,
                                         dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Same corrupted data is used by all three strategies below.
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()

    # Estimate the score after replacing missing values by 0
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing values
    estimator = make_pipeline(
        make_union(SimpleImputer(missing_values=0, strategy="mean"),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    # NOTE(review): ChainedImputer appears to exist only in old scikit-learn
    # development versions -- confirm which version this targets.
    estimator = make_pipeline(
        make_union(ChainedImputer(missing_values=0, random_state=0),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
Beispiel #18
0
class _MissingIndicatorImpl:
    """Wrapper that forwards ``fit``/``transform`` to an ``Op`` instance."""

    def __init__(self, **hyperparams):
        # The wrapped operator is built eagerly from the keyword arguments.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped operator (passing y only if given); return self."""
        model = self._wrapped_model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def transform(self, X):
        """Return the wrapped operator's transform of X."""
        model = self._wrapped_model
        return model.transform(X)
Beispiel #19
0
def encode_with_labels_and_impute(df, strategy='mean') -> pd.DataFrame:
    """Label-encode ``df`` and impute its original NaNs with ``strategy``.

    The missing positions are recorded first, a constant is imputed so the
    encoder sees no NaN, and after encoding the recorded positions are set
    back to NaN so the final imputation acts on the encoded labels.
    """
    was_missing = MissingIndicator(features='all').fit_transform(df)

    # Temporary placeholder imputation, then label encoding.
    encoded = encode_with_labels(impute(df, strategy='constant'))

    # Restore real np.nan at the originally missing positions.
    for col_pos in range(0, encoded.shape[1]):
        encoded.iloc[was_missing[:, col_pos], col_pos] = np.nan

    return impute(encoded, strategy=strategy)
def numeric_feature_pipeline(X, numeric_features, binarize=True,
                             binarize_cutoff=0.5):
    """
    Build the processing pipeline for numeric features.

    Parameters
    ------------
    X: {array-like} data indexed by column name. Shape {observations} x {features}
    numeric_features: {list of str} column names in X representing continuously-valued features
    binarize: {bool} add MissingIndicator columns for high-missingness features?
    binarize_cutoff: {float} features whose value from rank_missingness exceeds
                    this threshold get a binary missingness column

    Returns
    ------------
    The median-impute + StandardScaler pipeline when binarize is False,
    otherwise a FeatureUnion of that pipeline and the missingness binarizer.
    """
    # -- base processor: median imputer followed by standard scaling
    median_imputer = ColumnTransformer(
        [('impute_num', SimpleImputer(strategy='median'), numeric_features)]
    )
    base_pipeline = make_pipeline(median_imputer, StandardScaler())

    if not binarize:
        return base_pipeline

    # -- pick the features above the missingness cutoff
    missingness = rank_missingness(X[numeric_features])
    flagged_features = missingness[missingness > binarize_cutoff].index.tolist()
    indicator_step = ColumnTransformer(
        [('missing_num_binarizer', MissingIndicator(), flagged_features)]
    )

    # -- column-concatenate the scaled features and the missingness flags
    return FeatureUnion([
        ('num_pipeline', base_pipeline),
        ('missing_num_binarizer', indicator_step),
    ])
Beispiel #21
0
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    """A union of SimpleImputer and MissingIndicator yields the expected array."""
    imputer = SimpleImputer(strategy='most_frequent',
                            missing_values=missing_values)
    indicator = MissingIndicator(missing_values=missing_values)
    union = make_union(imputer, indicator)

    actual = union.fit_transform(X)
    assert_array_equal(actual, X_trans_exp)
Beispiel #22
0
def example():
    """Print two small demonstrations of sklearn imputation.

    First a ``SimpleImputer`` with the mean strategy on a toy array, then a
    ``FeatureUnion`` of ``SimpleImputer`` and ``MissingIndicator`` on the
    iris data with randomly inserted NaNs, alone and inside a pipeline with
    a decision tree. Nothing is returned.
    """
    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.impute import MissingIndicator, SimpleImputer
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.tree import DecisionTreeClassifier

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit([[1, 2], [np.nan, 3], [7, 6]])

    X = [[np.nan, 2], [6, np.nan], [7, 6]]
    print(imp.transform(X))

    ######################################
    X, y = load_iris(return_X_y=True)
    # Builtin bool: the np.bool alias was removed from NumPy.
    mask = np.random.randint(0, 2, size=X.shape).astype(bool)
    X[mask] = np.nan
    X_train, X_test, y_train, _ = train_test_split(X,
                                                   y,
                                                   test_size=100,
                                                   random_state=0)

    transformer = FeatureUnion(
        transformer_list=[('features', SimpleImputer(
            strategy='mean')), ('indicators', MissingIndicator())])
    transformer = transformer.fit(X_train, y_train)
    results = transformer.transform(X_test)
    print(results.shape)

    clf = make_pipeline(transformer, DecisionTreeClassifier())
    clf = clf.fit(X_train, y_train)
    results = clf.predict(X_test)
    print(results.shape)
Beispiel #23
0
def fit_transform_missing_indicator(input_data: pd.DataFrame, db_name: str,
                                    sql: None) -> pd.DataFrame:
    """Append 0/1 ``missing_<column>`` flags and record their names.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Data to scan for missing values.
    db_name : str
        Path of the SQLite database; a ("missing", pickled name list) row is
        inserted into its ``features`` table.
    sql : None
        Unused by this function.

    Returns
    -------
    pandas.DataFrame
        ``input_data`` merged on the index with the flag columns.
    """
    indicator = MissingIndicator()
    flags = indicator.fit_transform(input_data)
    missing_features = [
        f"missing_{input_data.columns[ii]}" for ii in indicator.features_
    ]
    # Share the input's index so the index merge below lines up row by row,
    # and convert booleans to 0/1 (the previous .replace() result was
    # discarded, so the conversion never happened).
    missing_indicator_df = pd.DataFrame(flags,
                                        columns=missing_features,
                                        index=input_data.index).astype(int)

    # "with conn" only commits or rolls back; close explicitly.
    conn = sqlite3.connect(db_name)
    try:
        with conn:
            query = "INSERT INTO features VALUES (?,?)"
            conn.execute(query,
                         ("missing", cloudpickle.dumps(missing_features)))
    finally:
        conn.close()
    return input_data.merge(missing_indicator_df,
                            left_index=True,
                            right_index=True)
def prep_dat(df,ylabels=None):
    """Clean the frame, impute with 0 and append missing-value flags.

    Rows with a missing GENDER or ETHNICITY are dropped, HS_GPA values of 0
    are treated as missing, every missing value is filled with 0 and five
    indicator columns are appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain UNIQUE_ID, GENDER, ETHNICITY and HS_GPA.
        NOTE(review): the five hard-coded indicator names assume exactly
        five columns contain missing values -- confirm against the data.
    ylabels : pandas object, optional
        Labels indexed like ``df``; the same rows are dropped from it.

    Returns
    -------
    imputed_df, or (imputed_df, ylabels) when ylabels is given.
    """
    indicator_cols = ['Ind_Mothers','Ind_Fathers','Ind_HS_GPA',
                      'Ind_SATRead','Ind_SATMath']
    df = df.drop(columns = ['UNIQUE_ID'])

    i_one = df.loc[pd.isnull(df['GENDER'])].index
    i_two = df.loc[pd.isnull(df['ETHNICITY'])].index
    labels_update = list(np.append(i_one,i_two))
    print(labels_update)

    # Keep only rows with both gender and ethnicity present. The follow-up
    # notnull filters were removed: they were no-ops after this drop.
    df = df.drop(labels=labels_update)
    df.reset_index(inplace=True,drop=True)

    col_names = df.columns
    df['HS_GPA'] = df['HS_GPA'].replace(0,np.nan)

    imputer_transformer = FeatureUnion(
    transformer_list = [
        ('features',SimpleImputer(strategy = 'constant',fill_value = 0)),
        ('indicators',MissingIndicator())])
    out_df = imputer_transformer.fit_transform(df)

    # Rebuild a DataFrame: original columns followed by the indicators.
    col_names = np.append(col_names,indicator_cols)
    imputed_df = pd.DataFrame(out_df,columns = col_names)
    imputed_df[indicator_cols] = imputed_df[indicator_cols].astype(int)

    # "is not None" rather than np.any(): an all-zero label vector is valid
    # input and must still be returned.
    if ylabels is not None:
        ylabels = ylabels.drop(labels=labels_update)
        ylabels.reset_index(inplace=True,drop=True)
        return imputed_df,ylabels
    return imputed_df
def build_null_mapper(df, cols_with_na):
    """Build a DataFrameMapper that imputes and flags each listed column.

    For every column in ``cols_with_na`` the mapper gets a SimpleImputer
    step, followed (after all imputer steps) by a MissingIndicator step
    aliased ``<col>_is_na``. ``df`` is not used.
    """
    steps = []
    for col in cols_with_na:
        steps.append(([col], SimpleImputer()))
    for col in cols_with_na:
        steps.append(([col], MissingIndicator(), {'alias': f'{col}_is_na'}))
    return DataFrameMapper(steps, df_out=True)
Beispiel #26
0
    def transform(self, X, y=None):
        """Impute, scale and append a missing-value mask.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain 'h_user_rating', 'c_horse_weight',
            'c_horse_weight_diff' and 'c_previous_order_of_finish'.
            The caller's frame is left untouched.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Standard-scaled features followed by one mask column per input
            column.
        """
        # Work on a copy: the assignments below used to modify the caller's
        # DataFrame in place.
        X = X.copy()

        # A rating of 0 means "no rating"; mark it as missing.
        X.loc[X['h_user_rating'] == 0, 'h_user_rating'] = np.nan

        # Ensure number of MissingIndicator features stays constant by specifying
        # features='all'. Variability in the number of columns causes a mismatch in
        # feature names and actual features.
        imputer_mask = MissingIndicator(features='all',
                                        sparse=False).fit_transform(X)

        # Impute the values manually because each value is handled differently
        for weight_col in ('c_horse_weight', 'c_horse_weight_diff'):
            X[weight_col] = X[weight_col].fillna(X[weight_col].mean())
        for zero_col in ('c_previous_order_of_finish', 'h_user_rating'):
            X[zero_col] = X[zero_col].fillna(0)

        # NOTE(review): the scaler is re-fitted on every call, so scaling
        # depends on the data passed to transform -- confirm this is intended.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        return np.c_[X_scaled, imputer_mask]
Beispiel #27
0
 def fit(self, X, y=None):
     """Build a new SKLModel from the stored hyperparameters and fit it."""
     model = SKLModel(**self._hyperparams)
     self._wrapped_model = model
     # y is forwarded only when the caller supplied it.
     if y is None:
         model.fit(X)
     else:
         model.fit(X, y)
     return self
    def missing_indicator(self):
        """
        Return the sklearn.impute.MissingIndicator mask of ``self.input_data``
        as a pandas DataFrame with the same index and ``<column>_flag`` names.
        """
        frame = self.input_data
        mask = MissingIndicator(features='all').fit_transform(frame)
        flag_names = [name + '_flag' for name in frame.columns]
        return pd.DataFrame(mask, index=frame.index, columns=flag_names)
Beispiel #29
0
class MissingIndicatorComponent(AutoSklearnPreprocessingAlgorithm):
    """Preprocessing component backed by sklearn's ``MissingIndicator``."""

    def __init__(self,
                 missing_values=np.nan,
                 features: str = "missing-only",
                 random_state=None):
        super().__init__()
        self.missing_values = missing_values
        self.features = features
        self.random_state = random_state

    def fit(self, X, Y=None):
        """Create and fit the underlying MissingIndicator; return ``self``."""
        # Imported lazily, inside the method.
        from sklearn.impute import MissingIndicator
        indicator = MissingIndicator(features=self.features,
                                     missing_values=self.missing_values)
        self.preprocessor = indicator
        self.preprocessor.fit(X, Y)
        return self

    def transform(self, X):
        """Apply the fitted indicator; raise if no preprocessor is set."""
        fitted = self.preprocessor
        if fitted is None:
            raise NotImplementedError()
        return fitted.transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static property dict describing this component."""
        properties = {
            'shortname': 'MissingIndicator',
            'name': 'Missing Indicator',
            'handles_regression': True,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': True,
            'is_deterministic': True,
            'input': (DENSE, UNSIGNED_DATA),
            'output': (INPUT, )
        }
        return properties

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Return a configuration space with the single ``features`` choice."""
        search_space = ConfigurationSpace()
        features_choice = CategoricalHyperparameter(
            "features", ["missing-only", "all"],
            default_value="missing-only")
        search_space.add_hyperparameters([features_choice])
        return search_space
Beispiel #30
0
def test_missing_indicator_feature_names_out():
    """Feature names out are the input names with a ``missingindicator_`` prefix."""
    pd = pytest.importorskip("pandas")

    missing_values = np.nan
    rows = [
        [missing_values, missing_values, 1, missing_values],
        [4, missing_values, 2, 10],
    ]
    X = pd.DataFrame(rows, columns=["a", "b", "c", "d"])

    indicator = MissingIndicator(missing_values=missing_values)
    indicator = indicator.fit(X)

    # Column "c" has no missing value, so it is absent from the names.
    expected_names = ["missingindicator_a", "missingindicator_b", "missingindicator_d"]
    assert_array_equal(expected_names, indicator.get_feature_names_out())
def get_scores_for_imputer(imputer, X_missing, y_missing):
    """Cross-validate REGRESSOR on ``imputer`` output plus missing flags.

    Zeros are the missing-value marker for the indicator. Returns the
    array of negative-MSE scores over N_SPLITS folds.
    """
    features = make_union(imputer, MissingIndicator(missing_values=0))
    estimator = make_pipeline(features, REGRESSOR)
    return cross_val_score(estimator,
                           X_missing,
                           y_missing,
                           scoring='neg_mean_squared_error',
                           cv=N_SPLITS)
Beispiel #32
0
def test_missing_indicator():
    """Converted MissingIndicator matches sklearn on iris for several markers."""
    X, y = load_iris(return_X_y=True)
    markers = [np.nan, X[0][0], X[-1][1]]
    for missing_values in markers:
        # Reload so every marker starts from unmodified data.
        X, y = load_iris(return_X_y=True)
        if np.isnan(missing_values):
            positions = np.random.choice(X.size, 20, replace=False)
            X.ravel()[positions] = np.nan
        X_ = X.tolist()
        for features in ("missing-only", "all"):
            imp = MissingIndicator(
                features=features, missing_values=missing_values, error_on_new=False
            )
            imp.fit(X)
            imp_ = convert_estimator(imp)

            expected = imp.transform(X)
            converted = imp_.transform(X_)
            assert np.allclose(expected.shape, shape(converted))
            assert np.allclose(expected, converted)
Beispiel #33
0
def run(datai, missing_rate):
    """Train an impute+flag+SVC pipeline on NaN-corrupted data; return accuracy.

    ``datai`` unpacks to (train_data, train_target, test_data, test_target).
    Both splits are corrupted with ``random_replace_with_nan`` at
    ``missing_rate`` before use.
    """
    train_data, train_target, test_data, test_target = datai
    imputer = IterativeImputer(ExtraTreesRegressor())
    flags = MissingIndicator(features='all')
    model = make_pipeline(StandardScaler(),
                          FeatureUnion([('1', imputer), ('2', flags)]),
                          SVC())

    noisy_train = random_replace_with_nan(train_data, missing_rate)
    model.fit(noisy_train, train_target)

    noisy_test = random_replace_with_nan(test_data, missing_rate)
    predictions = model.predict(noisy_test)
    return accuracy_score(test_target, predictions)
Beispiel #34
0
def test_missing_indicator_new(missing_values, arr_type, dtype, param_features,
                               n_features, features_indices):
    """Check shape, selected features, values and dtype of the masks,
    first dense and then sparse."""
    mv = missing_values
    # convert the input to the right array format and right dtype
    X_fit = arr_type(np.array([[mv, mv, 1], [4, mv, 2]])).astype(dtype)
    X_trans = arr_type(np.array([[mv, mv, 1], [4, 12, 10]])).astype(dtype)
    X_fit_expected = np.array([[1, 1, 0], [0, 1, 0]]).astype(dtype)
    X_trans_expected = np.array([[1, 1, 0], [0, 0, 0]]).astype(dtype)

    indicator = MissingIndicator(missing_values=missing_values,
                                 features=param_features,
                                 sparse=False)
    X_fit_mask = indicator.fit_transform(X_fit)
    X_trans_mask = indicator.transform(X_trans)
    dense_masks = (X_fit_mask, X_trans_mask)

    for mask in dense_masks:
        assert mask.shape[1] == n_features

    assert_array_equal(indicator.features_, features_indices)
    assert_allclose(X_fit_mask, X_fit_expected[:, features_indices])
    assert_allclose(X_trans_mask, X_trans_expected[:, features_indices])

    for mask in dense_masks:
        assert mask.dtype == bool
    for mask in dense_masks:
        assert isinstance(mask, np.ndarray)

    # Same estimator, now asked for sparse output.
    indicator.set_params(sparse=True)
    X_fit_mask_sparse = indicator.fit_transform(X_fit)
    X_trans_mask_sparse = indicator.transform(X_trans)
    sparse_masks = (X_fit_mask_sparse, X_trans_mask_sparse)

    for mask in sparse_masks:
        assert mask.dtype == bool
    for mask in sparse_masks:
        assert mask.format == 'csc'
    assert_allclose(X_fit_mask_sparse.toarray(), X_fit_mask)
    assert_allclose(X_trans_mask_sparse.toarray(), X_trans_mask)
Beispiel #35
0
def test_missing_indicator_raise_on_sparse_with_missing_0(arr_type):
    """Sparse input combined with ``missing_values=0`` must raise."""
    missing_values = 0
    expected_msg = "Sparse input with missing_values=0"
    X_fit = np.array([[missing_values, missing_values, 1],
                      [4, missing_values, 2]])
    X_trans = np.array([[missing_values, missing_values, 1],
                        [4, 12, 10]])

    # convert the input to the right array format
    X_fit_sparse, X_trans_sparse = arr_type(X_fit), arr_type(X_trans)

    indicator = MissingIndicator(missing_values=missing_values)

    with pytest.raises(ValueError, match=expected_msg):
        indicator.fit_transform(X_fit_sparse)

    # Fitting on dense data works; transforming sparse data still raises.
    indicator.fit_transform(X_fit)
    with pytest.raises(ValueError, match=expected_msg):
        indicator.transform(X_trans_sparse)
Beispiel #36
0
def test_missing_indicator_error(X_fit, X_trans, params, msg_err):
    """fit + transform with the given params raises ValueError(msg_err)."""
    indicator = MissingIndicator(missing_values=-1)
    indicator.set_params(**params)
    with pytest.raises(ValueError, match=msg_err):
        fitted = indicator.fit(X_fit)
        fitted.transform(X_trans)
Beispiel #37
0
def test_missing_indicator_string():
    """A string can serve as the missing-value marker on object arrays."""
    X = np.array([['a', 'b', 'c'], ['b', 'c', 'a']], dtype=object)
    expected = np.array([[True, False, False],
                         [False, False, True]])

    mask = MissingIndicator(features='all',
                            missing_values='a').fit_transform(X)

    assert_array_equal(mask, expected)