예제 #1
0
    def setUp(self):
        """Load the ARFF test dataset and precompute shared helper values.

        The working directory is changed to the directory containing this
        test module (the previous one is remembered in ``self.cwd``), then
        ``datasets/dataset.arff`` is decoded. The features are stored both
        raw (``self.X``) and as a one-hot-encoded, imputed, standardised and
        densified copy (``self.X_transformed``). Finally the results of
        several helper functions are stored through
        ``self.helpers.set_value``.
        """
        self.cwd = os.getcwd()
        # Resolve to an absolute path first: for a bare relative ``__file__``
        # ``os.path.dirname`` returns '' and ``os.chdir('')`` raises.
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        os.chdir(tests_dir)

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class.
        # An attribute whose declared type is a list is treated as nominal.
        self.attribute_types = [
            'nominal' if isinstance(type_, list) else 'numeric'
            for _, type_ in dataset['attributes'][:-1]]
        self.categorical = [attribute == 'nominal'
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        ohe = OneHotEncoder(self.categorical)
        X_transformed = ohe.fit_transform(X)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler()
        X_transformed = standard_scaler.fit_transform(X_transformed)
        X_transformed = X_transformed.todense()

        # Transform the array which indicates the categorical metafeatures:
        # the leading columns are flagged categorical, the trailing
        # ``number_numerical`` columns are flagged numerical.
        number_numerical = np.sum(~np.array(self.categorical))
        number_encoded = X_transformed.shape[1] - number_numerical
        self.categorical_transformed = ([True] * number_encoded +
                                        [False] * number_numerical)

        self.X = X
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value(
            "PCA", self.helpers["PCA"](self.X_transformed, self.y))
        self.helpers.set_value(
            "MissingValues",
            self.helpers["MissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "NumSymbols",
            self.helpers["NumSymbols"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "ClassOccurences",
            self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value(
            "Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value(
            "Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
예제 #2
0
    def test_scaler_1d(self):
        """Scale 1d input given as an ndarray, a list and a sparse column."""

        def assert_standardised(values):
            # Zero mean and unit standard deviation along the first axis.
            assert_array_almost_equal(values.mean(axis=0), 0.0)
            assert_array_almost_equal(values.std(axis=0), 1.0)

        rng = np.random.RandomState(0)
        X = rng.randn(5)
        untouched = X.copy()

        scaler = StandardScaler()
        fitted = scaler.fit(X)
        X_scaled = fitted.transform(X, copy=False)
        assert_standardised(X_scaled)

        # The inverse transform has to reproduce the original values
        restored = scaler.inverse_transform(X_scaled)
        assert_array_almost_equal(restored, untouched)

        # Same checks with a plain 1D list as input
        X = [0., 1., 2, 0.4, 1.]
        scaler = StandardScaler()
        fitted = scaler.fit(X)
        X_scaled = fitted.transform(X, copy=False)
        assert_standardised(X_scaled)

        # ... and through the scale() function
        X_scaled = scale(X)
        assert_standardised(X_scaled)

        # Sparse input: a single column with ten stored values
        values = np.random.random((10,))
        rows = [i**2 for i in range(10)]
        cols = [0] * 10
        X = scipy.sparse.coo_matrix((values, (rows, cols)))
        X = X.tocsr()
        scaler = StandardScaler()
        fitted = scaler.fit(X)
        X_scaled = fitted.transform(X, copy=False)

        self.assertFalse(np.any(np.isnan(X_scaled.data)))
        self.assertAlmostEqual(X_scaled.mean(), 0)
        self.assertAlmostEqual(np.sqrt(X_scaled.data.var()), 1)

        # The identity check (X_scaled is X) is left disabled for the
        # sparse case; only verify that the input still stores ten entries,
        # i.e. that it is still sparse.
        self.assertEqual(len(X.indices), 10)
예제 #3
0
    def test_standard_scaler_sparse_boston_data(self):
        """Check per-slice mean and variance after scaling sparse boston data."""
        X_train, Y_train, _, _ = get_dataset('boston', make_sparse=True)
        stored_before = len(X_train.data)

        scaler = StandardScaler()
        scaler.fit(X_train, Y_train)
        scaled = scaler.transform(X_train)

        def stored_slice(i):
            # Stored values between two consecutive index pointers.
            return scaled.data[scaled.indptr[i]:scaled.indptr[i + 1]]

        # Test this for every single dimension!
        means = np.array([stored_slice(i).mean() for i in range(13)])
        variances = np.array([stored_slice(i).var() for i in range(13)])

        # Slice 0 is not checked and slice 3 gets its own expectations below.
        for i in range(1, 13):
            if i == 3:
                continue
            self.assertAlmostEqual(means[i], 0, 2)
            self.assertAlmostEqual(variances[i], 1, 2)
        self.assertAlmostEqual(means[3], 1)
        self.assertAlmostEqual(variances[3], 0)

        # The result must still be sparse with the same number of stored
        # values as the input
        self.assertTrue(scipy.sparse.issparse(scaled))
        self.assertEqual(stored_before, len(scaled.data))
예제 #4
0
    def setUp(self):
        """Load the ARFF test dataset and precompute shared helper values.

        The working directory is changed to the directory containing this
        test module (the previous one is remembered in ``self.cwd``), then
        ``datasets/dataset.arff`` is decoded. The features are stored both
        raw (``self.X``) and as a one-hot-encoded, imputed, standardised and
        densified copy (``self.X_transformed``). Finally the results of
        several helper functions are stored through
        ``self.helpers.set_value``.
        """
        self.cwd = os.getcwd()
        # Resolve to an absolute path first: for a bare relative ``__file__``
        # ``os.path.dirname`` returns '' and ``os.chdir('')`` raises.
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        os.chdir(tests_dir)

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class.
        # An attribute whose declared type is a list is treated as nominal.
        self.attribute_types = [
            'nominal' if isinstance(type_, list) else 'numeric'
            for _, type_ in dataset['attributes'][:-1]
        ]
        self.categorical = [
            attribute == 'nominal' for attribute in self.attribute_types
        ]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1, ))

        ohe = OneHotEncoder(self.categorical)
        X_transformed = ohe.fit_transform(X)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler()
        X_transformed = standard_scaler.fit_transform(X_transformed)
        X_transformed = X_transformed.todense()

        # Transform the array which indicates the categorical metafeatures:
        # the leading columns are flagged categorical, the trailing
        # ``number_numerical`` columns are flagged numerical.
        number_numerical = np.sum(~np.array(self.categorical))
        number_encoded = X_transformed.shape[1] - number_numerical
        self.categorical_transformed = ([True] * number_encoded +
                                        [False] * number_numerical)

        self.X = X
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value("PCA", self.helpers["PCA"](self.X_transformed,
                                                          self.y))
        self.helpers.set_value(
            "MissingValues", self.helpers["MissingValues"](self.X, self.y,
                                                           self.categorical))
        self.helpers.set_value(
            "NumSymbols", self.helpers["NumSymbols"](self.X, self.y,
                                                     self.categorical))
        self.helpers.set_value("ClassOccurences",
                               self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value(
            "Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value(
            "Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
예제 #5
0
def calculate_all_metafeatures(X,
                               y,
                               categorical,
                               dataset_name,
                               calculate=None,
                               dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures.

    Both registries (``helper_functions`` and ``metafeatures``) are cleared
    first. Every registered metafeature that passes the ``calculate`` /
    ``dont_calculate`` filters is then evaluated once its dependency (a
    helper function or another metafeature) is available. Metafeatures
    listed in ``npy_metafeatures`` are evaluated on a one-hot-encoded,
    mean-imputed, standardised and row-shuffled copy of the data, which is
    built on first use; all others receive ``X``, ``y`` and ``categorical``
    unchanged.

    Parameters
    ----------
    X : array or scipy sparse matrix
        Feature matrix.
    y : array
        Targets; must support indexing with an integer index array.
    categorical : sequence of bool
        Column flags handed to the one-hot encoder; ``False`` entries are
        counted as numerical columns.
    dataset_name
        Name used in log messages and passed to ``DatasetMetafeatures``.
    calculate : container, optional
        If given, only metafeatures whose name is contained are calculated.
    dont_calculate : container, optional
        If given, metafeatures whose name is contained are skipped.
    densify_threshold : number, optional
        The transformed matrix is densified only if ``X`` was not sparse and
        its dense size in megabytes (elements * itemsize / 1000 / 1000) is
        below this value.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict holding every calculated
        metafeature and helper-function value by name.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as metafeature and as helper.
    ValueError
        If a dependency is registered as neither.
    """
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    # Built lazily by the first metafeature that needs the transformed data.
    X_transformed = None
    y_transformed = None
    categorical_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures: leading columns are flagged categorical, the
                # trailing ``number_numerical`` columns numerical.
                number_numerical = np.sum(~np.array(categorical))
                number_encoded = X_transformed.shape[1] - number_numerical
                categorical_transformed = ([True] * number_encoded +
                                           [False] * number_numerical)

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = (
                        num_elements * bytes_per_float / 1000 / 1000)
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                # Postpone: put it at the far end of the queue so that the
                # dependency is popped first.
                # NOTE(review): if ``dependency`` is itself filtered out by
                # calculate/dont_calculate it is never computed here and
                # ``name`` looks like it would be re-queued indefinitely --
                # confirm callers never request that combination.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
예제 #6
0
def calculate_all_metafeatures(X, y, categorical, dataset_name,
        calculate=None, dont_calculate=None, densify_threshold=1000):
    """Calculate all metafeatures.

    Both registries (``helper_functions`` and ``metafeatures``) are cleared
    first. Every registered metafeature that passes the ``calculate`` /
    ``dont_calculate`` filters is then evaluated once its dependency (a
    helper function or another metafeature) is available. Metafeatures
    listed in ``npy_metafeatures`` are evaluated on a one-hot-encoded,
    mean-imputed, standardised and row-shuffled copy of the data, which is
    built on first use; all others receive ``X``, ``y`` and ``categorical``
    unchanged.

    Parameters
    ----------
    X : array or scipy sparse matrix
        Feature matrix.
    y : array
        Targets; must support indexing with an integer index array.
    categorical : sequence of bool
        Column flags handed to the one-hot encoder; ``False`` entries are
        counted as numerical columns.
    dataset_name
        Name used in log messages and passed to ``DatasetMetafeatures``.
    calculate : container, optional
        If given, only metafeatures whose name is contained are calculated.
    dont_calculate : container, optional
        If given, metafeatures whose name is contained are skipped.
    densify_threshold : number, optional
        The transformed matrix is densified only if ``X`` was not sparse and
        its dense size in megabytes (elements * itemsize / 1000 / 1000) is
        below this value.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict holding every calculated
        metafeature and helper-function value by name.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as metafeature and as helper.
    ValueError
        If a dependency is registered as neither.
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    # Built lazily by the first metafeature that needs the transformed data.
    X_transformed = None
    y_transformed = None
    categorical_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical, sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures: leading columns are flagged categorical, the
                # trailing ``number_numerical`` columns numerical.
                number_numerical = np.sum(~np.array(categorical))
                number_encoded = X_transformed.shape[1] - number_numerical
                categorical_transformed = ([True] * number_encoded +
                                           [False] * number_numerical)

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                # Postpone: put it at the far end of the queue so that the
                # dependency is popped first.
                # NOTE(review): if ``dependency`` is itself filtered out by
                # calculate/dont_calculate it is never computed here and
                # ``name`` looks like it would be re-queued indefinitely --
                # confirm callers never request that combination.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name,
                    name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
예제 #7
0
 def __init__(self, random_state):
     """Create the wrapped ``StandardScaler``; ``random_state`` is not used here."""
     # The implementation is imported inside the constructor rather than at
     # module level.
     from autosklearn.pipeline.implementations.StandardScaler import (
         StandardScaler,
     )
     scaler = StandardScaler()
     self.preprocessor = scaler
예제 #8
0
    def test_scaler_2d_arrays(self):
        """Scale 2d arrays (dense and sparse) along the first axis."""
        zero_mean = [0.0] * 5
        # The constant first feature keeps a standard deviation of zero.
        unit_std = [0., 1., 1., 1., 1.]

        rng = np.random.RandomState(0)
        X = rng.randn(4, 5)
        X[:, 0] = 0.0  # first feature is constant zero

        scaler = StandardScaler()
        fitted = scaler.fit(X)
        X_scaled = fitted.transform(X, copy=True)
        self.assertFalse(np.any(np.isnan(X_scaled)))

        assert_array_almost_equal(X_scaled.mean(axis=0), zero_mean)
        assert_array_almost_equal(X_scaled.std(axis=0), unit_std)
        # copy=True: the result must be a new object
        self.assertTrue(X_scaled is not X)

        # The inverse transform returns yet another object which matches X
        restored = scaler.inverse_transform(X_scaled)
        self.assertTrue(restored is not X)
        self.assertTrue(restored is not X_scaled)
        assert_array_almost_equal(restored, X)

        # scale() along the second axis, centering only
        X_scaled = scale(X, axis=1, with_std=False)
        self.assertFalse(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=1), [0.0] * 4)
        # ... and with scaling to unit standard deviation as well
        X_scaled = scale(X, axis=1, with_std=True)
        self.assertFalse(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=1), [0.0] * 4)
        assert_array_almost_equal(X_scaled.std(axis=1), [1.0] * 4)
        # scale() must return a new object
        self.assertTrue(X_scaled is not X)

        # copy=False: the very same object is handed back
        fitted = scaler.fit(X)
        X_scaled = fitted.transform(X, copy=False)
        self.assertFalse(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=0), zero_mean)
        assert_array_almost_equal(X_scaled.std(axis=0), unit_std)
        self.assertTrue(X_scaled is X)

        X = rng.randn(4, 5)
        X[:, 0] = 1.0  # first feature is a constant, non zero feature
        scaler = StandardScaler()
        fitted = scaler.fit(X)
        X_scaled = fitted.transform(X, copy=True)
        self.assertFalse(np.any(np.isnan(X_scaled)))
        assert_array_almost_equal(X_scaled.mean(axis=0), zero_mean)
        assert_array_almost_equal(X_scaled.std(axis=0), unit_std)
        # copy=True: X has been copied
        self.assertTrue(X_scaled is not X)

        # Same thing for sparse matrices...
        values = np.random.random((12,))
        rows = list(range(12))
        cols = [i // 3 for i in range(12)]
        X = scipy.sparse.coo_matrix((values, (rows, cols)))
        X = X.tocsr()
        scaler = StandardScaler()
        fitted = scaler.fit(X)
        X_scaled = fitted.transform(X, copy=False)

        def stored_slice(i):
            # Stored values between two consecutive index pointers.
            return X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]]

        self.assertFalse(np.any(np.isnan(X_scaled.data)))
        slice_means = [stored_slice(i).mean()
                       for i in range(X_scaled.shape[1])]
        assert_array_almost_equal(slice_means,
                                  np.zeros((4, ), dtype=np.float64))
        slice_vars = [stored_slice(i).var()
                      for i in range(X_scaled.shape[1])]
        assert_array_almost_equal(np.sqrt(slice_vars),
                                  np.ones((4, ), dtype=np.float64))

        # Because the sparse format is changed to csc, identity of X_scaled
        # and X cannot be asserted; only check that the input still stores
        # twelve entries, i.e. that it is still sparse.
        self.assertEqual(len(X.indices), 12)