Esempio n. 1
0
    def test_imputation_shape(self):
        """Check that imputation preserves the input shape.

        Each strategy is run on a dense array and on a CSR matrix built
        from the same data.
        """
        expected_shape = (10, 2)
        data = np.random.randn(*expected_shape)
        # Every second row is replaced by NaN, so each column has gaps.
        data[::2] = np.nan

        for strategy in ('mean', 'median', 'most_frequent'):
            imputer = Imputer(strategy=strategy)
            dense_result = imputer.fit_transform(data)
            assert_equal(dense_result.shape, expected_shape)
            sparse_result = imputer.fit_transform(sparse.csr_matrix(data))
            assert_equal(sparse_result.shape, expected_shape)
Esempio n. 2
0
    def test_imputation_shape(self):
        """Imputed output must keep the (10, 2) input shape for all strategies."""
        X = np.random.randn(10, 2)
        X[::2] = np.nan  # rows 0, 2, 4, ... become entirely missing

        # Applied in order: first the dense array as-is, then a CSR version.
        containers = (lambda array: array, sparse.csr_matrix)

        for strategy in ['mean', 'median', 'most_frequent']:
            imputer = Imputer(strategy=strategy)
            for to_container in containers:
                X_imputed = imputer.fit_transform(to_container(X))
                assert_equal(X_imputed.shape, (10, 2))
Esempio n. 3
0
    def test_imputation_pickle(self):
        """Check that a fitted imputer survives a pickle round trip.

        For every strategy, the unpickled imputer has to transform the data
        exactly like the imputer it was created from.
        """
        import pickle

        size = 100
        data = sparse_random_matrix(size, size, density=0.10)

        for strategy in ("mean", "median", "most_frequent"):
            fitted = Imputer(missing_values=0, strategy=strategy)
            fitted.fit(data)

            restored = pickle.loads(pickle.dumps(fitted))

            # Both imputers get their own copy of the input.
            expected = fitted.transform(data.copy())
            actual = restored.transform(data.copy())
            failure_msg = ("Fail to transform the data after pickling "
                           "(strategy = %s)" % (strategy))
            assert_array_equal(expected, actual, failure_msg)
Esempio n. 4
0
    def test_imputation_pipeline_grid_search(self):
        """Smoke test: grid-search imputer settings inside a pipeline.

        Only checks that fitting completes; no result is asserted.
        """
        steps = [
            ('imputer', Imputer(missing_values=0)),
            ('tree', tree.DecisionTreeRegressor(random_state=0)),
        ]
        pipeline = Pipeline(steps)

        # Every strategy is combined with both imputation axes.
        param_grid = {
            'imputer__strategy': ["mean", "median", "most_frequent"],
            'imputer__axis': [0, 1]
        }

        n_samples = 100
        X = sparse_random_matrix(n_samples, n_samples, density=0.10)
        Y = sparse_random_matrix(n_samples, 1, density=0.10).toarray()
        search = grid_search.GridSearchCV(pipeline, param_grid)
        search.fit(X, Y)
Esempio n. 5
0
def calculate_all_metafeatures(X,
                               y,
                               categorical,
                               dataset_name,
                               calculate=None,
                               dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures.

    The registered metafeatures are processed from a work queue. Those
    listed in ``npy_metafeatures`` are computed on a transformed copy of
    the data (one-hot encoded, mean-imputed, standardized and shuffled with
    a fixed seed), which is built lazily the first time it is needed. All
    other metafeatures are computed on ``X`` and ``y`` as passed in.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Feature matrix of the dataset.
    y : array-like
        Targets; must support indexing with an integer index array.
    categorical : sequence
        Passed to ``OneHotEncoder`` as ``categorical_features``. Assumed to
        hold one boolean per column of ``X`` (it is negated with ``~`` to
        count the numerical columns) -- TODO confirm against callers.
    dataset_name : str
        Used in the log messages and passed on to ``DatasetMetafeatures``.
    calculate : container, optional
        If given, only metafeatures whose name is in it are calculated.
    dont_calculate : container, optional
        If given, metafeatures whose name is in it are skipped.
    densify_threshold : number
        Size limit in megabytes (1000 * 1000 bytes). If ``X`` is dense but
        the transformed matrix is sparse, it is densified only when its
        dense size would stay below this limit.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict mapping each calculated
        metafeature (and each helper function evaluated along the way) to
        its value.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as metafeature and as helper
        function.
    ValueError
        If a dependency is registered as neither of the two.
    """
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                input_is_sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical metafeatures
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = [True] * (X_transformed.shape[1] -
                                                    number_numerical) + \
                                          [False] * number_numerical

                # Densify the transformed matrix
                if not input_is_sparse and scipy.sparse.issparse(
                        X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[
                        0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                # Defer this metafeature until its dependency was calculated.
                # NOTE(review): if the dependency is excluded through
                # `calculate`/`dont_calculate` it is never calculated, so this
                # re-queueing looks like it could loop forever -- confirm.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
Esempio n. 6
0
def calculate_all_metafeatures(X, y, categorical, dataset_name,
        calculate=None, dont_calculate=None, densify_threshold=1000):
    """Calculate all metafeatures.

    Metafeatures named in ``npy_metafeatures`` are evaluated on a
    preprocessed version of the data: one-hot encoding, mean imputation,
    standard scaling and a shuffle with a fixed random seed. That version is
    created once, on first use. Every other metafeature is evaluated on the
    unmodified ``X`` and ``y``.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Feature matrix of the dataset.
    y : array-like
        Targets; indexed with an integer index array when shuffling.
    categorical : sequence
        Handed to ``OneHotEncoder`` as ``categorical_features``. Presumably
        one boolean per column of ``X`` (``~np.array(categorical)`` is
        summed to count numerical columns) -- verify against callers.
    dataset_name : str
        Appears in the log output and is passed to ``DatasetMetafeatures``.
    calculate : container, optional
        When not None, names missing from it are skipped.
    dont_calculate : container, optional
        When not None, names contained in it are skipped.
    densify_threshold : number
        Limit in megabytes (1000 * 1000 bytes). A sparse transformed matrix
        that originates from a dense ``X`` is converted to dense only if
        its dense size stays below this limit.

    Returns
    -------
    DatasetMetafeatures
        Wraps ``dataset_name`` and a dict with the value of every calculated
        metafeature and of every helper function that had to be evaluated.

    Raises
    ------
    NotImplementedError
        If a dependency is known as metafeature and as helper function.
    ValueError
        If a dependency is known as neither.
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                input_is_sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical, sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical metafeatures
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = [True] * (X_transformed.shape[1] -
                                                    number_numerical) + \
                                          [False] * number_numerical

                # Densify the transformed matrix
                if not input_is_sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                # Put the metafeature back at the far end of the queue so it
                # is retried after its dependency.
                # NOTE(review): a dependency filtered out by `calculate` or
                # `dont_calculate` is never calculated; this branch then
                # appears to repeat endlessly -- confirm.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name,
                    name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
Esempio n. 7
0
    def _check_statistics(self, X, X_true, strategy, statistics,
                          missing_values):
        """Run one imputation strategy through every axis/format combination.

        The imputer is exercised with ``axis=0`` and ``axis=1``, on the dense
        input and on a CSC matrix built from it. The ``axis=1`` runs work on
        the transposed data.

        Verifies that:
            - the fitted ``statistics_`` equal ``statistics`` (``axis=0`` runs)
            - the transformed data equals ``X_true``
            - with ``axis=1``, ``transform`` raises ``ValueError`` whenever
              ``statistics`` contains NaN"""

        # strategy/missing_values are filled in right away; the {0}/{1}
        # placeholders are completed per scenario through str.format.
        template = ("Parameters: strategy = %s, missing_values = %s, "
                    "axis = {0}, sparse = {1}") % (strategy, missing_values)

        def as_dense(result):
            """Convert ``result`` to a dense array if it is sparse."""
            if sparse.issparse(result):
                return result.toarray()
            return result

        # Dense input, axis = 0
        imputer = Imputer(missing_values, strategy=strategy, axis=0)
        transformed = imputer.fit(X).transform(X.copy())
        assert_array_equal(imputer.statistics_, statistics,
                           template.format(0, False))
        assert_array_equal(transformed, X_true, template.format(0, False))

        # Dense input, axis = 1
        imputer = Imputer(missing_values, strategy=strategy, axis=1)
        imputer.fit(X.transpose())
        if not np.isnan(statistics).any():
            transformed = imputer.transform(X.copy().transpose())
            assert_array_equal(transformed, X_true.transpose(),
                               template.format(1, False))
        else:
            assert_raises(ValueError, imputer.transform, X.copy().transpose())

        # Sparse input, axis = 0
        imputer = Imputer(missing_values, strategy=strategy, axis=0)
        imputer.fit(sparse.csc_matrix(X))
        transformed = as_dense(imputer.transform(sparse.csc_matrix(X.copy())))

        assert_array_equal(imputer.statistics_, statistics,
                           template.format(0, True))
        assert_array_equal(transformed, X_true, template.format(0, True))

        # Sparse input, axis = 1
        imputer = Imputer(missing_values, strategy=strategy, axis=1)
        imputer.fit(sparse.csc_matrix(X.transpose()))
        if not np.isnan(statistics).any():
            sparse_input = sparse.csc_matrix(X.copy().transpose())
            transformed = as_dense(imputer.transform(sparse_input))

            assert_array_equal(transformed, X_true.transpose(),
                               template.format(1, True))
        else:
            assert_raises(ValueError, imputer.transform,
                          sparse.csc_matrix(X.copy().transpose()))
Esempio n. 8
0
    def test_imputation_copy(self):
        """Check when ``transform`` copies its input and when it does not.

        Most scenarios overwrite one entry of the transformed output and
        then compare it with the input: if both are still equal, no copy
        was made.
        """
        X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

        # copy=True, dense => copy
        data = X_orig.copy().toarray()
        imputer = Imputer(missing_values=0, strategy="mean", copy=True)
        result = imputer.fit(data).transform(data)
        result[0, 0] = -1
        assert_false(np.all(data == result))

        # copy=True, sparse csr => copy
        data = X_orig.copy()
        imputer = Imputer(missing_values=data.data[0], strategy="mean",
                          copy=True)
        result = imputer.fit(data).transform(data)
        result.data[0] = -1
        assert_false(np.all(data.data == result.data))

        # copy=False, dense => no copy
        data = X_orig.copy().toarray()
        imputer = Imputer(missing_values=0, strategy="mean", copy=False)
        result = imputer.fit(data).transform(data)
        result[0, 0] = -1
        assert_true(np.all(data == result))

        # copy=False on sparse input: (convert to csc?, axis, no copy?)
        sparse_scenarios = (
            (False, 1, True),   # sparse csr, axis=1 => no copy
            (True, 0, True),    # sparse csc, axis=0 => no copy
            (False, 0, False),  # sparse csr, axis=0 => copy
            (True, 1, False),   # sparse csc, axis=1 => copy
        )
        for use_csc, axis, expect_no_copy in sparse_scenarios:
            data = X_orig.copy()
            if use_csc:
                data = data.tocsc()
            imputer = Imputer(missing_values=data.data[0],
                              strategy="mean",
                              copy=False,
                              axis=axis)
            result = imputer.fit(data).transform(data)
            result.data[0] = -1
            unchanged = np.all(data.data == result.data)
            if expect_no_copy:
                assert_true(unchanged)
            else:
                assert_false(unchanged)

        # copy=False, sparse csr, axis=1, missing_values=0 => copy
        data = X_orig.copy()
        imputer = Imputer(missing_values=0,
                          strategy="mean",
                          copy=False,
                          axis=1)
        result = imputer.fit(data).transform(data)
        assert_false(sparse.issparse(result))
Esempio n. 9
0
    def _check_statistics(self, X, X_true,
                          strategy, statistics, missing_values):
        """Shared checks for a single imputation strategy.

        Scenarios, in this order:
            - dense input, axis = 0
            - dense input, axis = 1 (transposed data)
            - sparse (CSC) input, axis = 0
            - sparse (CSC) input, axis = 1 (transposed data)

        The axis = 0 scenarios compare ``statistics_`` with ``statistics``
        and the transformed data with ``X_true``. The axis = 1 scenarios
        expect a ``ValueError`` from ``transform`` if ``statistics``
        contains NaN and otherwise compare against ``X_true`` transposed."""

        err_msg = "Parameters: strategy = %s, missing_values = %s, " \
                  "axis = {0}, sparse = {1}" % (strategy, missing_values)

        def check_axis_1(wrap, is_sparse):
            """Run the axis = 1 scenario on ``wrap``-ed transposed data."""
            imputer = Imputer(missing_values, strategy=strategy, axis=1)
            imputer.fit(wrap(X.transpose()))
            if np.isnan(statistics).any():
                assert_raises(ValueError, imputer.transform,
                              wrap(X.copy().transpose()))
                return

            X_trans = imputer.transform(wrap(X.copy().transpose()))
            # Only the sparse scenario may need converting back to dense.
            if is_sparse and sparse.issparse(X_trans):
                X_trans = X_trans.toarray()
            assert_array_equal(X_trans, X_true.transpose(),
                               err_msg.format(1, is_sparse))

        # Normal matrix, axis = 0
        imputer = Imputer(missing_values, strategy=strategy, axis=0)
        X_trans = imputer.fit(X).transform(X.copy())
        assert_array_equal(imputer.statistics_, statistics,
                           err_msg.format(0, False))
        assert_array_equal(X_trans, X_true, err_msg.format(0, False))

        # Normal matrix, axis = 1
        check_axis_1(lambda array: array, False)

        # Sparse matrix, axis = 0
        imputer = Imputer(missing_values, strategy=strategy, axis=0)
        imputer.fit(sparse.csc_matrix(X))
        X_trans = imputer.transform(sparse.csc_matrix(X.copy()))
        if sparse.issparse(X_trans):
            X_trans = X_trans.toarray()
        assert_array_equal(imputer.statistics_, statistics,
                           err_msg.format(0, True))
        assert_array_equal(X_trans, X_true, err_msg.format(0, True))

        # Sparse matrix, axis = 1
        check_axis_1(sparse.csc_matrix, True)
Esempio n. 10
0
    def test_imputation_copy(self):
        """Test the ``copy`` parameter for dense and sparse inputs.

        After transforming, one entry of the output is overwritten; the
        input then either differs from the output (a copy was made) or
        still equals it (no copy).
        """
        X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

        def fit_transform_same(data, **imputer_params):
            """Fit a mean imputer on ``data`` and transform that same object."""
            imputer = Imputer(strategy="mean", **imputer_params)
            return imputer.fit(data).transform(data)

        # copy=True, dense => copy
        X = X_orig.copy().toarray()
        Xt = fit_transform_same(X, missing_values=0, copy=True)
        Xt[0, 0] = -1
        assert_false(np.all(X == Xt))

        # copy=True, sparse csr => copy
        X = X_orig.copy()
        Xt = fit_transform_same(X, missing_values=X.data[0], copy=True)
        Xt.data[0] = -1
        assert_false(np.all(X.data == Xt.data))

        # copy=False, dense => no copy
        X = X_orig.copy().toarray()
        Xt = fit_transform_same(X, missing_values=0, copy=False)
        Xt[0, 0] = -1
        assert_true(np.all(X == Xt))

        # copy=False, sparse csr, axis=1 => no copy
        X = X_orig.copy()
        Xt = fit_transform_same(X, missing_values=X.data[0],
                                copy=False, axis=1)
        Xt.data[0] = -1
        assert_true(np.all(X.data == Xt.data))

        # copy=False, sparse csc, axis=0 => no copy
        X = X_orig.copy().tocsc()
        Xt = fit_transform_same(X, missing_values=X.data[0],
                                copy=False, axis=0)
        Xt.data[0] = -1
        assert_true(np.all(X.data == Xt.data))

        # copy=False, sparse csr, axis=0 => copy
        X = X_orig.copy()
        Xt = fit_transform_same(X, missing_values=X.data[0],
                                copy=False, axis=0)
        Xt.data[0] = -1
        assert_false(np.all(X.data == Xt.data))

        # copy=False, sparse csc, axis=1 => copy
        X = X_orig.copy().tocsc()
        Xt = fit_transform_same(X, missing_values=X.data[0],
                                copy=False, axis=1)
        Xt.data[0] = -1
        assert_false(np.all(X.data == Xt.data))

        # copy=False, sparse csr, axis=1, missing_values=0 => copy
        X = X_orig.copy()
        Xt = fit_transform_same(X, missing_values=0, copy=False, axis=1)
        assert_false(sparse.issparse(Xt))