def get_results(dataset):
    """Compare regressor scores on full data vs. three imputation strategies.

    Cross-validated negative-MSE scores are computed for:
      1. the complete dataset,
      2. missing entries replaced by 0,
      3. mean imputation (SimpleImputer) + missing-indicator features,
      4. chained imputation (ChainedImputer) + missing-indicator features.

    Parameters
    ----------
    dataset : object with ``data`` and ``target`` attributes (sklearn Bunch).

    Returns
    -------
    tuple of four ``(mean, std)`` score pairs, in the order listed above.

    Notes
    -----
    Relies on a module-level ``rng`` (numpy RandomState) for reproducible
    placement of the simulated missing values.
    """
    X_full, y_full = dataset.data, dataset.target
    n_samples = X_full.shape[0]
    n_features = X_full.shape[1]

    # Baseline: score on the entire dataset, with no missing values.
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    full_scores = cross_val_score(estimator, X_full, y_full,
                                  scoring='neg_mean_squared_error')

    # Simulate missing values in 75% of the rows (one feature per row).
    missing_rate = 0.75
    n_missing_samples = int(np.floor(n_samples * missing_rate))
    # BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the correct dtype here.
    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                          dtype=bool),
                                 np.ones(n_missing_samples,
                                         dtype=bool)))
    rng.shuffle(missing_samples)
    missing_features = rng.randint(0, n_features, n_missing_samples)

    # Estimate the score after replacing missing values by 0.
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after imputation (mean strategy) of the missing
    # values, augmenting the features with a binary missingness mask.
    X_missing = X_full.copy()
    X_missing[np.where(missing_samples)[0], missing_features] = 0
    y_missing = y_full.copy()
    estimator = make_pipeline(
        make_union(SimpleImputer(missing_values=0, strategy="mean"),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                         scoring='neg_mean_squared_error')

    # Estimate the score after chained imputation of the missing values
    # (reuses the X_missing/y_missing built for the mean-imputation case).
    estimator = make_pipeline(
        make_union(ChainedImputer(missing_values=0, random_state=0),
                   MissingIndicator(missing_values=0)),
        RandomForestRegressor(random_state=0, n_estimators=100))
    chained_impute_scores = cross_val_score(estimator, X_missing, y_missing,
                                            scoring='neg_mean_squared_error')

    return ((full_scores.mean(), full_scores.std()),
            (zero_impute_scores.mean(), zero_impute_scores.std()),
            (mean_impute_scores.mean(), mean_impute_scores.std()),
            (chained_impute_scores.mean(), chained_impute_scores.std()))
    def missing_indicator(self):
        """Return the boolean missing-value mask of ``self.input_data``.

        The mask produced by ``sklearn.impute.MissingIndicator`` is wrapped
        in a DataFrame aligned to the input's index, with each column named
        after the corresponding input column plus a ``_flag`` suffix.
        """
        mask = MissingIndicator(features='all').fit_transform(self.input_data)
        flag_columns = [name + '_flag' for name in self.input_data.columns]
        return pd.DataFrame(mask,
                            index=self.input_data.index,
                            columns=flag_columns)
Beispiel #3
0
    def transform(self, X, y=None):
        """Impute racing features, scale them, and append a missingness mask.

        Returns an ndarray of the scaled features column-stacked with the
        boolean missing-indicator mask (one indicator column per feature).

        BUG FIX: works on a copy so the caller's DataFrame is no longer
        mutated in place (standard sklearn transformer convention).
        """
        X = X.copy()

        # A rating of 0 encodes "no rating"; convert it to NaN so it is
        # picked up by the MissingIndicator below.
        null_rating_mask = X['h_user_rating'] == 0
        null_indices = X[null_rating_mask].index
        X.loc[null_indices, 'h_user_rating'] = np.nan

        # Ensure number of MissingIndicator features stays constant by specifying
        # features='all'. Variability in the number of columns causes a mismatch in
        # feature names and actual features.
        imputer_mask = MissingIndicator(features='all',
                                        sparse=False).fit_transform(X)

        # Impute the values manually because each value is handled differently
        mean_horse_weight = X['c_horse_weight'].mean()
        X['c_horse_weight'] = X['c_horse_weight'].fillna(mean_horse_weight)

        mean_horse_weight_diff = X['c_horse_weight_diff'].mean()
        X['c_horse_weight_diff'] = X['c_horse_weight_diff'].fillna(
            mean_horse_weight_diff)

        X['c_previous_order_of_finish'] = X[
            'c_previous_order_of_finish'].fillna(0)
        X['h_user_rating'] = X['h_user_rating'].fillna(0)

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        return np.c_[X_scaled, imputer_mask]
Beispiel #4
0
def test_missing_indicator_sparse_param(arr_type, missing_values,
                                        param_sparse):
    # Verify the container type of the indicator output for each value of
    # the `sparse` parameter.
    X_fit = arr_type(np.array([[missing_values, missing_values, 1],
                               [4, missing_values, 2]])).astype(np.float64)
    X_trans = arr_type(np.array([[missing_values, missing_values, 1],
                                 [4, 12, 10]])).astype(np.float64)

    indicator = MissingIndicator(missing_values=missing_values,
                                 sparse=param_sparse)
    fit_mask = indicator.fit_transform(X_fit)
    trans_mask = indicator.transform(X_trans)

    if param_sparse is True:
        # Explicit sparse output is always CSC.
        assert fit_mask.format == 'csc'
        assert trans_mask.format == 'csc'
    elif param_sparse == 'auto' and missing_values == 0:
        # A zero sentinel would densify the mask, so 'auto' yields ndarray.
        assert isinstance(fit_mask, np.ndarray)
        assert isinstance(trans_mask, np.ndarray)
    elif param_sparse is False:
        assert isinstance(fit_mask, np.ndarray)
        assert isinstance(trans_mask, np.ndarray)
    else:
        # sparse='auto' with a non-zero sentinel mirrors the input format.
        if sparse.issparse(X_fit):
            assert fit_mask.format == 'csc'
            assert trans_mask.format == 'csc'
        else:
            assert isinstance(fit_mask, np.ndarray)
            assert isinstance(trans_mask, np.ndarray)
Beispiel #5
0
    def impute_dynamic_dataframe(self, df_dynamic):
        """Impute time-series features and append per-feature measured flags.

        Gaps shorter than ``self.missing_gap_thresh`` timesteps are filled by
        linear interpolation (plus a backfill for leading gaps); longer gaps
        are left as NaN and later zero-filled.  For every feature in
        ``self.feat_name`` an ``if_<feat>`` column is added: 1 where the value
        was actually measured, 0 where it was missing before imputation.

        Returns a copy of ``df_dynamic`` reordered to
        ``['index', 'pid', 'ts'] + feat_name + dummy columns``.
        """

        # interpolate if gap is less than 10 timestep
        mask = df_dynamic[self.feat_name].copy()
        df_dynamic_imp = df_dynamic.copy()
        for column in self.feat_name:
            df = pd.DataFrame(df_dynamic_imp[column])
            # 'new' increments at every NaN<->non-NaN transition, giving each
            # contiguous run of missing (or present) values a unique group id.
            df['new'] = ((df.notnull() != df.shift().notnull()).cumsum())
            df['ones'] = 1
            # Keep interpolated values only inside runs shorter than the gap
            # threshold; observed (non-null) values are always kept.
            mask[column] = (
                df.groupby('new')['ones'].transform('count') <
                self.missing_gap_thresh) | df_dynamic_imp[column].notnull()
        df_dynamic_imp[self.feat_name] = df_dynamic_imp[
            self.feat_name].interpolate().bfill()[mask]

        # add dummy variables (1 = measured, 0 = was missing)
        indicator = MissingIndicator(missing_values=np.nan, features='all')
        X = df_dynamic_imp[self.feat_name].values
        if_missing = indicator.fit_transform(X)
        if_measured = 1 - if_missing.astype(int)
        dummy_names = []
        for ind, feat in enumerate(self.feat_name):
            dummy_name = 'if_' + feat
            df_dynamic_imp[dummy_name] = if_measured[:, ind]
            dummy_names.append(dummy_name)

        # impute missing invasive variables with 0 and add column "index"
        df_dynamic_imp = df_dynamic_imp.fillna(value=0)
        df_dynamic_imp = df_dynamic_imp.reindex(['index', 'pid', 'ts'] +
                                                self.feat_name + dummy_names,
                                                axis=1)

        return df_dynamic_imp
Beispiel #6
0
def data_missing_indicator(data_train,var_type_dict,data_test=None):
    '''
    Derive missing-value indicator features.

    data_train: training set to transform
    var_type_dict: dict recording variable/type information
    data_test: optional test set; when omitted, no test-set transform is done

    return:
    data_train_completed: training set with the derived indicator columns
    var_type_dict: updated variable-information dict
    data_test_completed: transformed test set (only when data_test is given)
    '''
    numeric_feature = var_type_dict.get('numeric_var',[])
    category_feature = var_type_dict.get('category_var',[])
    print('开始进行特缺失值标记变量衍生'.center(50, '='))
    ## Build one 'is_<col>_missing' indicator name per input feature.
    is_miss_feature = ['is_'+i+'_missing' for i in numeric_feature+category_feature]
    print('原始数据维度:',data_train.shape)
    print('新增数据维度:',len(is_miss_feature))
    check_unique(numeric_feature+is_miss_feature)
    ## Mark missing entries for both numeric and categorical columns.
    
    miss_indicator = MissingIndicator(features='all')
    data_train_completed = miss_indicator.fit_transform(data_train[numeric_feature+category_feature])
    data_train_completed = pd.concat([data_train,pd.DataFrame(data_train_completed,columns=is_miss_feature)],axis=1)
    print('变量衍生完成:',data_train_completed.shape)
    ## Update var_type_dict: indicator columns are all added to numeric_var.
    var_type_dict['numeric_var'] = numeric_feature+is_miss_feature
    ## If a test set was supplied, transform it with the fitted indicator too.
    if data_test is not None:
        data_test_completed = miss_indicator.transform(data_test[numeric_feature+category_feature])
        data_test_completed = pd.concat([data_test,pd.DataFrame(data_test_completed,columns=is_miss_feature)],axis=1)
        return data_train_completed,var_type_dict,data_test_completed
    return data_train_completed,var_type_dict
def numeric_feature_pipeline(X, numeric_features, binarize=True,
                             binarize_cutoff=0.5):
    """
    Build the processing pipeline for continuously-valued features.

    Parameters
    ------------
    X: {array-like} pd.DataFrame or numpy.ndarray. Shape {observations} x {features}
    numeric_features: {list of str} column names in X holding numeric features
    binarize: {bool} add MissingIndicator flags for high-missingness features?
    binarize_cutoff: {float in [0, 1]} missingness rate above which a feature
                    gets its own binary missingness flag

    Returns
    ------------
    sklearn.pipeline object
    """
    # Core processing: median imputation followed by standard scaling.
    base = make_pipeline(
        ColumnTransformer([('impute_num',
                            SimpleImputer(strategy='median'),
                            numeric_features)]),
        StandardScaler())

    if not binarize:
        return base

    # Identify features whose missingness rate exceeds the cutoff and
    # column-concatenate their missingness flags onto the base pipeline.
    missing_rates = rank_missingness(X[numeric_features])
    flagged = missing_rates[missing_rates > binarize_cutoff].index.tolist()
    binarizer = ColumnTransformer(
        [('missing_num_binarizer', MissingIndicator(), flagged)])
    return FeatureUnion([('num_pipeline', base),
                         ('missing_num_binarizer', binarizer)])
def prep_dat(df, ylabels=None):
    """Drop rows lacking GENDER/ETHNICITY, impute the rest, and add NA flags.

    Rows with a missing GENDER or ETHNICITY are removed (and the same rows
    are dropped from ``ylabels`` when given).  HS_GPA values of 0 are treated
    as missing.  Remaining NaNs are filled with 0 and one ``Ind_*`` indicator
    column per missing-prone feature is appended.

    Returns the imputed DataFrame, plus the filtered labels when ``ylabels``
    is provided.
    """
    df = df.drop(columns=['UNIQUE_ID'])

    # Collect the row labels missing either demographic field.
    i_one = df.loc[pd.isnull(df['GENDER'])].index
    i_two = df.loc[pd.isnull(df['ETHNICITY'])].index
    labels_update = list(np.append(i_one, i_two))
    print(labels_update)

    df = df.drop(labels=labels_update)
    df.reset_index(inplace=True, drop=True)
    # (The previous redundant notnull re-filtering was removed: after the
    # drop above, no GENDER/ETHNICITY NaNs remain.)

    col_names = df.columns
    # A GPA of 0 encodes "not reported"; convert to NaN so it is imputed.
    df['HS_GPA'] = df['HS_GPA'].replace(0, np.nan)

    imputer_transformer = FeatureUnion(
        transformer_list=[
            ('features', SimpleImputer(strategy='constant', fill_value=0)),
            ('indicators', MissingIndicator())])
    out_df = imputer_transformer.fit_transform(df)

    # Rebuild a DataFrame: original columns followed by the indicator flags.
    ind_cols = ['Ind_Mothers', 'Ind_Fathers', 'Ind_HS_GPA',
                'Ind_SATRead', 'Ind_SATMath']
    col_names = np.append(col_names, ind_cols)
    imputed_df = pd.DataFrame(out_df, columns=col_names)
    imputed_df[ind_cols] = imputed_df[ind_cols].astype(int)

    # BUG FIX: the old `if np.any(ylabels)` check misfired when ylabels was
    # provided but entirely falsy (e.g. all-zero labels); test for presence.
    if ylabels is not None:
        ylabels = ylabels.drop(labels=labels_update)
        ylabels.reset_index(inplace=True, drop=True)
        return imputed_df, ylabels
    else:
        return imputed_df
Beispiel #9
0
def example():
    """Demonstrate SimpleImputer alone and FeatureUnion(imputer + indicator).

    First fits a mean imputer on a toy matrix and prints the transform of a
    matrix with NaNs; then builds an imputer+indicator union on a randomly
    NaN-corrupted iris dataset and runs it standalone and inside a pipeline
    with a decision tree.
    """
    import numpy as np
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit([[1, 2], [np.nan, 3], [7, 6]])

    X = [[np.nan, 2], [6, np.nan], [7, 6]]
    print(imp.transform(X))

    ######################################
    from sklearn.datasets import load_iris
    from sklearn.impute import SimpleImputer, MissingIndicator
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.tree import DecisionTreeClassifier
    X, y = load_iris(return_X_y=True)
    # BUG FIX: np.bool was removed in NumPy 1.24; use the builtin bool.
    mask = np.random.randint(0, 2, size=X.shape).astype(bool)
    X[mask] = np.nan
    X_train, X_test, y_train, _ = train_test_split(X,
                                                   y,
                                                   test_size=100,
                                                   random_state=0)

    # Imputed features stacked side-by-side with the missingness mask.
    transformer = FeatureUnion(
        transformer_list=[('features', SimpleImputer(
            strategy='mean')), ('indicators', MissingIndicator())])
    transformer = transformer.fit(X_train, y_train)
    results = transformer.transform(X_test)
    print(results.shape)

    clf = make_pipeline(transformer, DecisionTreeClassifier())
    clf = clf.fit(X_train, y_train)
    results = clf.predict(X_test)
    print(results.shape)
    def test_missing_indicator_float_inputs_isnan_false_tvm(self):
        """Exercise the TVM backend with a 0 sentinel (isnan path disabled)."""
        data = np.array([[1, 2], [0, 3], [7, 6]], dtype=np.float32)
        for features in ("all", "missing-only"):
            model = MissingIndicator(features=features, missing_values=0)
            model.fit(data)
            self._test_sklearn_missing_indic(model, data, "tvm")
def impute_data(X, feature_name_in):
    """Impute numeric data and append per-feature missingness columns.

    String 'na' entries are converted to NaN, NaNs are mean-imputed, and for
    every feature that actually contained missing values a ``<name>_miss``
    binary column is appended.

    Returns the imputed/augmented array and the extended feature-name list.
    """
    to_replace_dict = {'na': np.nan}
    for name in feature_name_in:
        na_cnt = 0
        if pd.api.types.is_string_dtype(X[name]):
            na_cnt = X[name].str.contains('na').sum()
        if na_cnt > 0:
            # BUG FIX: previously `X[name] = X.replace(...)` assigned a
            # whole-DataFrame replace result to a single column; replace
            # within the affected column only.
            X[name] = X[name].replace(to_replace_dict)

    indicator = MissingIndicator(error_on_new=True,
                                 features='all',
                                 missing_values=np.nan,
                                 sparse=False)
    X_binary_miss = indicator.fit_transform(X).astype(int)
    X_binary_miss_sum = np.sum(X_binary_miss, axis=0)
    feature_name_out = feature_name_in.copy()
    to_del = []
    # Keep indicator columns only for features that had at least one NaN.
    for idx, col_missing in enumerate(X_binary_miss_sum):
        if col_missing > 0:
            feature_name_out.append(feature_name_in[idx] + "_miss")
        else:
            to_del.append(idx)
    X_binary_miss = np.delete(X_binary_miss, to_del, axis=1)

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(X)
    X_tr = imp.transform(X)
    X_out = np.concatenate((X_tr, X_binary_miss), axis=1)
    return X_out, feature_name_out
def build_null_mapper(df, cols_with_na):
    """Map each NA-bearing column to an imputed copy plus a `*_is_na` flag."""
    steps = [([col], SimpleImputer()) for col in cols_with_na]
    steps += [([col], MissingIndicator(), {'alias': f'{col}_is_na'})
              for col in cols_with_na]
    return DataFrameMapper(steps, df_out=True)
Beispiel #13
0
    def transform(self, X):
        """Perform imputation using interpolation.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data with missing values.

        Returns
        -------
        X_new : array-like, shape = (n_samples, n_timestamps)
            Data without missing values.

        """
        missing_values, force_all_finite = self._check_params()
        X = check_array(X, dtype='float64', force_all_finite=force_all_finite)
        n_samples, n_timestamps = X.shape

        # Boolean mask of the observed (non-missing) entries.
        observed = ~MissingIndicator(
            missing_values=missing_values,
            features='all',
            sparse=False,
        ).fit_transform(X)

        # Interpolate each sample independently over the full timestamp grid.
        grid = np.arange(n_timestamps)
        imputed_rows = [self._impute_one_sample(X[i], observed[i], grid)
                        for i in range(n_samples)]
        return np.asarray(imputed_rows)
Beispiel #14
0
def test_missing_indicator_with_imputer(X, missing_values, X_trans_exp):
    # Imputed features stacked side-by-side with the missingness mask.
    union = make_union(
        SimpleImputer(missing_values=missing_values, strategy='most_frequent'),
        MissingIndicator(missing_values=missing_values),
    )
    assert_array_equal(union.fit_transform(X), X_trans_exp)
Beispiel #15
0
 def fit(self, X, y=None):
     """Fit the min/max simple imputers and, optionally, a MissingIndicator.

     When ``self.add_indicator`` is set, a fitted ``MissingIndicator`` is
     stored as ``self.indicator_``.  Returns self for sklearn-style chaining.
     """
     self.simple_imputer_max.fit(X, y)
     self.simple_imputer_min.fit(X, y)
     if self.add_indicator:
         # error_on_new=False: features with previously unseen missing
         # values at transform time will not raise.
         self.indicator_ = MissingIndicator(missing_values=np.nan,
                                            error_on_new=False)
         self.indicator_.fit(X)
     return self
    def test_missing_indicator_float_inputs(self):
        """Exercise the torch backends with NaN sentinels on float32 data."""
        data = np.array([[1, 2], [np.nan, 3], [7, 6]], dtype=np.float32)
        for features in ("all", "missing-only"):
            model = MissingIndicator(features=features)
            model.fit(data)
            for backend in ("torch", "torch.jit"):
                self._test_sklearn_missing_indic(model, data, backend)
Beispiel #17
0
def test_missing_indicator_no_missing():
    # With features='missing-only' and no entries equal to the sentinel,
    # every feature is dropped and the mask has zero columns (#13491).
    X = np.array([[1, 1], [1, 1]])
    mask = MissingIndicator(features='missing-only',
                            missing_values=-1).fit_transform(X)
    assert mask.shape[1] == 0
Beispiel #18
0
def run(datai, missing_rate):
    """Train an SVC on data with injected NaNs and return test accuracy."""
    train_data, train_target, test_data, test_target = datai
    features = FeatureUnion([
        ('1', IterativeImputer(ExtraTreesRegressor())),
        ('2', MissingIndicator(features='all')),
    ])
    model = make_pipeline(StandardScaler(), features, SVC())
    model.fit(random_replace_with_nan(train_data, missing_rate), train_target)
    predictions = model.predict(random_replace_with_nan(test_data,
                                                        missing_rate))
    return accuracy_score(test_target, predictions)
Beispiel #19
0
def indicate_missing(train_df, test_df):
    """Add an `is_missing_<feature>` flag column per missing-prone feature."""
    for feature in cont_missing_features + cat_missing_features:
        # Fit on train+test combined so both frames use the same indicator.
        indicator = MissingIndicator(missing_values=np.nan)
        indicator.fit(pd.concat([train_df, test_df])[[feature]])
        flag = "is_missing_" + feature
        train_df[flag] = indicator.transform(train_df[[feature]])
        test_df[flag] = indicator.transform(test_df[[feature]])
    return train_df, test_df
Beispiel #20
0
def test_missing_indicator_sparse_no_explicit_zeros():
    # Non-missing values must not become explicitly stored zeros in the
    # sparse mask produced by MissingIndicator (#13491).
    X = sparse.csr_matrix([[0, 1, 2], [1, 2, 0], [2, 0, 1]])
    mask = MissingIndicator(features='all', missing_values=1).fit_transform(X)
    # Every stored entry is True, i.e. no explicit False/0 values remain.
    assert mask.getnnz() == mask.sum()
def get_scores_for_imputer(imputer, X_missing, y_missing):
    """Cross-validate REGRESSOR on imputed features plus a missingness mask."""
    pipeline = make_pipeline(
        make_union(imputer, MissingIndicator(missing_values=0)),
        REGRESSOR,
    )
    return cross_val_score(pipeline, X_missing, y_missing,
                           scoring='neg_mean_squared_error',
                           cv=N_SPLITS)
Beispiel #22
0
    def __init__(self, imputer_strategy='median'):
        """
        Build a feature union of a log1p+imputation pipeline and an
        all-features missingness indicator, stored as ``self.union``.

        :imputer_strategy: strategy for missing value imputation
        :return: None
        """
        pipe = Pipeline([('log1p', Log1PTransformer()),
                         ('imp_nan', SimpleImputer(strategy=imputer_strategy))])

        self.union = FeatureUnion([('log1p', pipe),
                                   ('nan_ind',MissingIndicator(features='all'))])
Beispiel #23
0
def get_indicators(data):
    """Return a DataFrame of NaN indicators for all but the last column.

    Columns are renamed ``ind_1 .. ind_k`` (1-based positions).
    """
    indicator = MissingIndicator(missing_values=np.nan, features='all')
    mask = pd.DataFrame(indicator.fit_transform(data.iloc[:, :-1]))

    # Shift to 1-based column numbers, then prefix with 'ind_'.
    mask.columns = mask.columns + 1
    return mask.add_prefix('ind_')
Beispiel #24
0
def teach_tree(data, labels, max_depth, max_features, min_samples_leaf):
    """Fit a decision tree on mean-imputed features plus missingness flags."""
    preprocessing = FeatureUnion(transformer_list=[
        ('features', SimpleImputer(strategy='mean')),
        ('indicators', MissingIndicator()),
    ])
    classifier = tree.DecisionTreeClassifier(
        max_depth=max_depth,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf)
    clf = make_pipeline(preprocessing, classifier)
    clf.fit(data, labels)
    return clf
Beispiel #25
0
 def __init__(self, random_state=0):
     """Initialize the imputer+indicator preprocessing primitive.

     Sets up a FeatureUnion of SimpleImputer and MissingIndicator in
     ``self.imp`` plus the primitive's metadata fields.
     """
     super(ImputerIndicatorPrim, self).__init__(name='imputerIndicator')
     self.id = 3
     self.hyperparams = []
     self.type = 'data preprocess'
     # Typo fix in the user-facing description ("the the" -> "the").
     self.description = "All features will be imputed using SimpleImputer, in order to enable classifiers to work with this data. Additionally, it adds the indicator variables from MissingIndicator."
     self.hyperparams_run = {'default': True}
     self.random_state = random_state
     self.imp = FeatureUnion(transformer_list=[(
         'features', SimpleImputer()), ('indicators', MissingIndicator())])
     self.num_cols = None
     self.imp_cols = None
     self.accept_type = 'b'
Beispiel #26
0
def encode_with_labels_and_impute(df, strategy='mean') -> pd.DataFrame:
    """ Encode with simple labels and impute mean (of labels) in NaNs"""
    # Remember where the NaNs were before the dummy imputation below.
    nan_mask = MissingIndicator(features='all').fit_transform(df)

    df = impute(df, strategy='constant')  # impute dummy str NaNs
    df = encode_with_labels(df)

    # Restore np.nan at the originally-missing positions, column by column.
    for col_idx in range(df.shape[1]):
        df.iloc[nan_mask[:, col_idx], col_idx] = np.nan

    return impute(df, strategy=strategy)
Beispiel #27
0
def pipe_num(df, cols):
    """Build a DataFrameMapper: NA flags for NA-bearing columns, then
    median-imputed + standard-scaled copies of every column."""
    na_steps = [(col,
                 [FunctionTransformer(reshape, validate=False),
                  MissingIndicator()],
                 {'alias': f'{col}_na'})
                for col in cols if df[col].isnull().sum() > 0]
    num_steps = [(col,
                  [FunctionTransformer(reshape, validate=False),
                   SimpleImputer(strategy='median'),
                   StandardScaler()])
                 for col in cols]
    return DataFrameMapper(na_steps + num_steps, df_out=True)
Beispiel #28
0
def fit_transform_missing_indicator(input_data: pd.DataFrame, db_name: str,
                                    sql: None) -> pd.DataFrame:
    """Append 0/1 `missing_*` columns and persist the feature-name list.

    A MissingIndicator is fit on ``input_data``; one ``missing_<col>`` column
    is added per feature that contains missing values.  The generated names
    are pickled into the ``features`` table of the SQLite db ``db_name``.
    """
    indicator = MissingIndicator()
    mask = indicator.fit_transform(input_data)
    missing_features = [
        f"missing_{input_data.columns[ii]}" for ii in list(indicator.features_)
    ]
    missing_indicator_df = pd.DataFrame(mask, columns=missing_features)
    # BUG FIX: DataFrame.replace is not in-place; the previous code discarded
    # the result, leaving booleans instead of the intended 0/1 integers.
    missing_indicator_df = missing_indicator_df.replace({True: 1, False: 0})

    with sqlite3.connect(db_name) as conn:
        query = "INSERT INTO features VALUES (?,?)"
        conn.execute(query, ("missing", cloudpickle.dumps(missing_features)))
    return input_data.merge(missing_indicator_df,
                            left_index=True,
                            right_index=True)
Beispiel #29
0
def test_missing_indicator_feature_names_out():
    """Check that missing indicator return the feature names with a prefix."""
    pd = pytest.importorskip("pandas")

    nan = np.nan
    X = pd.DataFrame(
        [[nan, nan, 1, nan],
         [4, nan, 2, 10]],
        columns=["a", "b", "c", "d"],
    )

    names = MissingIndicator(missing_values=nan).fit(X).get_feature_names_out()
    # Column "c" has no missing values, so it is excluded from the output.
    assert_array_equal(
        ["missingindicator_a", "missingindicator_b", "missingindicator_d"],
        names,
    )
Beispiel #30
0
def test_missing_indicator():
    X, y = load_iris(return_X_y=True)
    for missing_values in [np.nan, X[0][0], X[-1][1]]:
        X, y = load_iris(return_X_y=True)
        if np.isnan(missing_values):
            # Scatter 20 NaNs so the nan sentinel actually occurs in X.
            X.ravel()[np.random.choice(X.size, 20, replace=False)] = np.nan
        X_list = X.tolist()
        for features in ["missing-only", "all"]:
            imp = MissingIndicator(
                features=features, missing_values=missing_values, error_on_new=False
            )
            imp.fit(X)
            converted = convert_estimator(imp)

            # Converted estimator must reproduce shape and values exactly.
            expected = imp.transform(X)
            actual = converted.transform(X_list)
            assert np.allclose(expected.shape, shape(actual))
            assert np.allclose(expected, actual)