Esempio n. 1
0
def test_default_sparse(input_data_imputation):
    """Check imputation of sparse input against the fixture's mask.

    The fixture supplies ``(X, mask)``. ``X`` is converted to CSC format, run
    through ``CategoricalImputation().fit_transform`` and densified; the
    result must be 0 exactly at the positions where ``mask`` is set.
    """
    X, mask = input_data_imputation
    X = sparse.csc_matrix(X)
    Y = CategoricalImputation().fit_transform(X)
    Y = Y.todense()
    # np.array_equal compares shapes as well as values. A plain ``==`` would
    # broadcast, so an empty index array compared against a single-row one
    # yields an empty result whose ``.all()`` is vacuously True.
    assert np.array_equal(np.argwhere(Y == 0), np.argwhere(mask))
    assert np.array_equal(np.argwhere(Y != 0), np.argwhere(np.logical_not(mask)))
 def test_default_sparse(self):
     """Check imputation of sparse input against the dataset's mask.

     ``self._get_dataset()`` supplies ``(X, mask)``. ``X`` is converted to
     CSC format, run through ``CategoricalImputation().fit_transform`` and
     densified; the result must be 2 exactly where ``mask`` is set.
     """
     X, mask = self._get_dataset()
     X = sparse.csc_matrix(X)
     Y = CategoricalImputation().fit_transform(X)
     Y = Y.todense()
     # np.array_equal compares shapes as well as values. A plain ``==`` would
     # broadcast, so an empty index array compared against a single-row one
     # yields an empty result whose ``.all()`` is vacuously True.
     self.assertTrue(np.array_equal(np.argwhere(Y == 2), np.argwhere(mask)))
     self.assertTrue(
         np.array_equal(np.argwhere(Y != 2),
                        np.argwhere(np.logical_not(mask))))
Esempio n. 3
0
def test_nonzero_numerical_imputation(format_type):
    """Check the fill value chosen for numerical data with 0 as a category.

    Each scenario builds a 10x10 all-NaN array, writes one category per
    leading row, optionally wraps it in a DataFrame, and compares the
    transformed output with the input whose NaNs are replaced by the
    expected fill value.
    """
    # (categories written to rows 0, 1, ...; value expected in place of NaN)
    scenarios = (
        # With 0 as the only valid category the test expects -1.
        ((0,), -1),
        # With -1 also present as a category the test expects -2.
        ((0, -1), -2),
    )

    for categories, expected_fill in scenarios:
        X = np.full(fill_value=np.nan, shape=(10, 10))
        for row, category in enumerate(categories):
            X[row, :] = category

        if 'pandas' in format_type:
            X = pd.DataFrame(X)
        elif 'numpy' not in format_type:
            # Unknown container type requested by the parametrization.
            pytest.fail(format_type)

        Y = CategoricalImputation().fit_transform(X.copy())
        expected = np.nan_to_num(X, nan=expected_fill, copy=True)
        np.testing.assert_equal(expected, Y)
    def _get_pipeline_steps(self):
        """Return the ordered ``[name, component]`` pairs of this pipeline.

        The steps are, in order: category shift, imputation, category
        coalescence and categorical encoding. The two choice components
        receive ``self.dataset_properties``.
        """
        return [
            ["category_shift", CategoryShift()],
            ["imputation", CategoricalImputation()],
            ["category_coalescence",
             CoalescenseChoice(self.dataset_properties)],
            ["categorical_encoding", OHEChoice(self.dataset_properties)],
        ]
Esempio n. 5
0
def test_default_imputation(input_data_imputation, categorical):
    """
    Makes sure that imputation works for both numerical and categorical data.
    This also has to be guaranteed for numpy and pandas like objects.

    The fixture supplies ``(X, mask)``. In the categorical case ``X`` is cast
    to an object array of strings with NaN re-inserted at the masked
    positions, and the expected fill value is ``'missing_value'``; otherwise
    the expected fill value is 0.
    """
    X, mask = input_data_imputation
    if categorical:
        imputation_value = 'missing_value'
        X = X.astype('str').astype('object')
        # The string cast turned NaN into text, so restore real missing values.
        X[mask] = np.nan
    else:
        imputation_value = 0
    Y = CategoricalImputation().fit_transform(X.copy())
    # np.array_equal compares shapes as well as values. A plain ``==`` would
    # broadcast, so an empty index array compared against a single-row one
    # yields an empty result whose ``.all()`` is vacuously True.
    assert np.array_equal(np.argwhere(Y == imputation_value), np.argwhere(mask))
    assert np.array_equal(
        np.argwhere(Y != imputation_value),
        np.argwhere(np.logical_not(mask)),
    )
Esempio n. 6
0
    def _get_pipeline_steps(self,
                            dataset_properties: Optional[Dict[str, str]] = None,
                            ) -> List[Tuple[str, BaseEstimator]]:
        """Return the ordered ``(name, component)`` steps of this pipeline.

        Args:
            dataset_properties: Optional properties forwarded to the choice
                components. Anything that is not a ``dict`` (including
                ``None``) is ignored and an empty dict is used instead.

        Returns:
            The steps in order: imputation, encoding, category shift,
            category coalescence and categorical encoding.
        """
        # Copy into a fresh dict so the choice components do not share the
        # caller's object.
        props = {}
        if isinstance(dataset_properties, dict):
            props.update(dataset_properties)

        return [
            ("imputation", CategoricalImputation()),
            ("encoding", OrdinalEncoding()),
            ("category_shift", CategoryShift()),
            ("category_coalescence", CoalescenseChoice(props)),
            ("categorical_encoding", OHEChoice(props)),
        ]
    def _get_pipeline_steps(self, dataset_properties=None):
        """Return the ordered ``[name, component]`` pairs of this pipeline.

        ``dataset_properties`` is forwarded to the choice components when it
        is a ``dict``; any other value (including ``None``) is ignored and an
        empty dict is used instead. The steps are, in order: category shift,
        imputation, category coalescence and categorical encoding.
        """
        # Copy into a fresh dict so the choice components do not share the
        # caller's object.
        props = {}
        if isinstance(dataset_properties, dict):
            props.update(dataset_properties)

        return [
            ["category_shift", CategoryShift()],
            ["imputation", CategoricalImputation()],
            ["category_coalescence", CoalescenseChoice(props)],
            ["categorical_encoding", OHEChoice(props)],
        ]
 def test_default(self):
     """Check default imputation against the dataset's mask.

     ``self._get_dataset()`` supplies ``(X, mask)``; after
     ``CategoricalImputation().fit_transform`` the result must be 2 exactly
     where ``mask`` is set.
     """
     X, mask = self._get_dataset()
     Y = CategoricalImputation().fit_transform(X)
     # np.array_equal compares shapes as well as values. A plain ``==`` would
     # broadcast, so an empty index array compared against a single-row one
     # yields an empty result whose ``.all()`` is vacuously True.
     self.assertTrue(np.array_equal(np.argwhere(Y == 2), np.argwhere(mask)))
     self.assertTrue(
         np.array_equal(np.argwhere(Y != 2),
                        np.argwhere(np.logical_not(mask))))