def setUp(self):
        """Load the ARFF test dataset and precompute shared helper values.

        Changes the working directory to the directory of this file (the
        previous one is kept in ``self.cwd``), decodes
        ``datasets/dataset.arff`` and stores on ``self``:

        * ``X`` / ``y``: the raw feature matrix and the last column as target,
        * ``X_transformed``: ``X`` after one-hot encoding, imputation and
          scaling,
        * ``categorical`` / ``categorical_transformed``: boolean masks for the
          columns of ``X`` and ``X_transformed`` respectively,
        * ``mf`` / ``helpers``: the metafeature and helper-function registries.
        """
        self.cwd = os.getcwd()
        # abspath protects against a bare relative ``__file__`` whose dirname
        # would be '' (os.chdir('') raises).
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        os.chdir(tests_dir)

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        self.attribute_types = [
            'nominal' if isinstance(type_, list) else 'numeric'
            for _, type_ in dataset['attributes'][:-1]]
        self.categorical = [attribute == 'nominal'
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        ohe = OneHotEncoder(self.categorical)
        X_transformed = ohe.fit_transform(X)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        # Only request mean-centering when the data is not a sparse matrix.
        center = not scipy.sparse.isspmatrix(X_transformed)
        standard_scaler = StandardScaler(with_mean=center)
        X_transformed = standard_scaler.fit_transform(X_transformed)
        # Dense arrays have no todense(); only densify a sparse result.
        if scipy.sparse.issparse(X_transformed):
            X_transformed = X_transformed.todense()

        # Transform the array which indicates the categorical metafeatures:
        # every column that is not one of the numerical ones is flagged True.
        number_numerical = np.sum(~np.array(self.categorical))
        number_encoded = X_transformed.shape[1] - number_numerical
        self.categorical_transformed = \
            [True] * number_encoded + [False] * number_numerical

        self.X = X
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value(
            "PCA", self.helpers["PCA"](self.X_transformed, self.y))
        self.helpers.set_value(
            "MissingValues",
            self.helpers["MissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "NumSymbols",
            self.helpers["NumSymbols"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "ClassOccurences",
            self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value(
            "Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value(
            "Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
Esempio n. 2
0
    def perform1HotEncoding(self):
        """One-hot encode the loaded data splits in place.

        An encoder is fitted on ``self.data['X_train']`` and applied to
        ``X_valid`` / ``X_test`` when those keys exist. Features typed
        ``categorical`` are encoded; ``binary`` features are encoded too when
        ``self.info['has_missing']`` is truthy. Sparse output is forced when
        the predicted RAM usage, divided by 1024 twice, exceeds 1000.

        When no feature needs encoding, the data, ``self.encoder`` and
        ``self.info['is_sparse']`` are left untouched.

        Raises:
            ValueError: if the instance has no ``data`` attribute, or if it
                already has an ``encoder_`` attribute.
        """
        if not hasattr(self, 'data'):
            raise ValueError('perform1HotEncoding can only be called when '
                             'data is loaded')
        # NOTE(review): the guard looks for ``encoder_`` while this method
        # stores ``_encoder`` / ``encoder`` -- confirm the intended name.
        if hasattr(self, 'encoder_'):
            raise ValueError('perform1HotEncoding can only be called on '
                             'non-encoded data.')
        self._encoder = None

        sparse = bool(self.info['is_sparse'] == 1)
        has_missing = bool(self.info['has_missing'])

        encoded_kinds = ['categorical', 'binary'] if has_missing \
            else ['categorical']
        feature_kinds = [feat_type.lower() for feat_type in self.feat_type]
        encoding_mask = [kind in encoded_kinds for kind in feature_kinds]
        categorical = [kind == 'categorical' for kind in feature_kinds]

        scaled_ram_usage = float(
            predict_RAM_usage(self.data['X_train'], categorical)) / 1024 / 1024
        if scaled_ram_usage > 1000:
            sparse = True

        # Nothing to encode: leave data, encoder and info as they are.
        if not any(encoding_mask):
            return

        encoder = OneHotEncoder(categorical_features=encoding_mask,
                                dtype=np.float32,
                                sparse=sparse)
        self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
        optional_splits = [key for key in ('X_valid', 'X_test')
                           if key in self.data]
        for key in optional_splits:
            self.data[key] = encoder.transform(self.data[key])

        # Dense output was requested but the encoder returned sparse data.
        if not sparse and scipy.sparse.issparse(self.data['X_train']):
            for key in ['X_train'] + optional_splits:
                self.data[key] = self.data[key].todense()

        self.encoder = encoder
        self.info['is_sparse'] = 1 if sparse else 0
Esempio n. 3
0
    def test_transform_with_unknown_value(self):
        """Check the encoded sum when transform sees values unseen in fit.

        The encoder is fitted on two columns holding 0..5 and then applied
        to data that also contains 6 and 7. The expected number of set
        entries is 5 for dense input and 3 for CSR input (presumably because
        zeros are implicit in the sparse input -- confirm against the
        encoder implementation).
        """
        fit_columns = ((0, 1, 2, 3, 4, 5), (0, 1, 2, 3, 4, 5))
        query_columns = ((0, 1, 2, 6), (0, 1, 6, 7))

        # Dense input
        dense_fit = np.array(fit_columns).transpose()
        dense_encoder = OneHotEncoder()
        dense_encoder.fit(dense_fit)
        dense_query = np.array(query_columns).transpose()
        dense_result = dense_encoder.transform(dense_query).todense()
        self.assertEqual(5, np.sum(dense_result))

        # Sparse (CSR) input; arrays are rebuilt rather than reused
        sparse_fit = scipy.sparse.csr_matrix(
            np.array(fit_columns).transpose())
        sparse_encoder = OneHotEncoder()
        sparse_encoder.fit(sparse_fit)
        sparse_query = scipy.sparse.csr_matrix(
            np.array(query_columns).transpose())
        sparse_result = sparse_encoder.transform(sparse_query).todense()
        self.assertEqual(3, np.sum(sparse_result))
    def perform1HotEncoding(self):
        """Apply one-hot encoding to ``self.data`` in place.

        The encoder is fitted on ``X_train`` and reused for ``X_valid`` and
        ``X_test`` if present. ``categorical`` features are always encoded;
        ``binary`` ones only when ``self.info['has_missing']`` is truthy.
        Sparse output is forced once the predicted RAM usage, divided by
        1024 twice, is above 1000. On success ``self.encoder`` and
        ``self.info['is_sparse']`` are updated; if nothing needs encoding
        they are not touched.

        Raises:
            ValueError: if ``self`` has no ``data`` attribute or already has
                an ``encoder_`` attribute.
        """
        if not hasattr(self, 'data'):
            raise ValueError('perform1HotEncoding can only be called when '
                             'data is loaded')
        # NOTE(review): ``encoder_`` is checked here but ``_encoder`` and
        # ``encoder`` are what this method assigns -- verify the name.
        if hasattr(self, 'encoder_'):
            raise ValueError('perform1HotEncoding can only be called on '
                             'non-encoded data.')
        self._encoder = None

        use_sparse = bool(self.info['is_sparse'] == 1)
        encodable = {'categorical'}
        if self.info['has_missing']:
            encodable.add('binary')

        lowered = [feat_type.lower() for feat_type in self.feat_type]
        encoding_mask = [kind in encodable for kind in lowered]
        categorical = [kind == 'categorical' for kind in lowered]

        raw_prediction = predict_RAM_usage(self.data['X_train'], categorical)
        if float(raw_prediction) / 1024 / 1024 > 1000:
            use_sparse = True

        if any(encoding_mask):
            encoder = OneHotEncoder(categorical_features=encoding_mask,
                                    dtype=np.float32,
                                    sparse=use_sparse)
            self.data['X_train'] = encoder.fit_transform(self.data['X_train'])
            extra_keys = []
            for key in ('X_valid', 'X_test'):
                if key in self.data:
                    self.data[key] = encoder.transform(self.data[key])
                    extra_keys.append(key)

            # Densify every split when dense output was requested.
            must_densify = (not use_sparse
                            and scipy.sparse.issparse(self.data['X_train']))
            if must_densify:
                self.data['X_train'] = self.data['X_train'].todense()
                for key in extra_keys:
                    self.data[key] = self.data[key].todense()

            self.encoder = encoder
            self.info['is_sparse'] = 1 if use_sparse else 0
Esempio n. 5
0
    def fit_then_transform_dense(self, expected, input,
                                 categorical_features='all',
                                 minimum_fraction=None):
        """Assert the dense encoder output equals ``expected`` on both paths.

        A ``OneHotEncoder`` with ``sparse=False`` is exercised once through
        ``fit_transform`` and once through ``fit`` followed by ``transform``.
        Each call receives its own copy of ``input``; each result must be an
        ``np.ndarray`` almost equal to ``expected``.
        """
        encoder_kwargs = dict(categorical_features=categorical_features,
                              sparse=False,
                              minimum_fraction=minimum_fraction)

        # Path 1: fit and transform in a single call
        one_step_encoder = OneHotEncoder(**encoder_kwargs)
        one_step_result = one_step_encoder.fit_transform(input.copy())
        self.assertIsInstance(one_step_result, np.ndarray)
        assert_array_almost_equal(expected, one_step_result)

        # Path 2: fit first, transform afterwards
        two_step_encoder = OneHotEncoder(**encoder_kwargs)
        two_step_encoder.fit(input.copy())
        two_step_result = two_step_encoder.transform(input.copy())
        self.assertIsInstance(two_step_result, np.ndarray)
        assert_array_almost_equal(expected, two_step_result)
Esempio n. 6
0
def perform_one_hot_encoding(sparse, categorical, data):
    """One-hot encode a sequence of datasets with a shared encoder.

    The encoder is fitted on ``data[0]`` and applied to the remaining
    entries. Sparse output is forced when the predicted RAM usage for
    ``data[0]``, divided by 1024 twice, exceeds 1000.

    Returns a ``(rvals, sparse)`` tuple: ``rvals`` is a list of encoded
    datasets, or ``data`` itself when no entry of ``categorical`` is truthy;
    ``sparse`` is the possibly updated flag.
    """
    scaled_ram_usage = \
        float(predict_RAM_usage(data[0], categorical)) / 1024 / 1024
    if scaled_ram_usage > 1000:
        sparse = True

    # Without categorical features the input is handed back unchanged.
    if not any(categorical):
        return data, sparse

    encoder = OneHotEncoder(categorical_features=categorical,
                            dtype=np.float32,
                            sparse=sparse)
    encoded = [encoder.fit_transform(data[0])]
    encoded.extend(encoder.transform(remaining) for remaining in data[1:])

    # Dense output was requested but the encoder produced sparse matrices.
    if not sparse and scipy.sparse.issparse(encoded[0]):
        encoded = [matrix.todense() for matrix in encoded]

    return encoded, sparse
def perform_one_hot_encoding(sparse, categorical, data):
    """Fit a one-hot encoder on ``data[0]`` and apply it to all of ``data``.

    ``sparse`` is overridden to ``True`` when the predicted RAM usage of
    ``data[0]``, divided by 1024 twice, is greater than 1000. When no flag
    in ``categorical`` is truthy, ``data`` is returned as passed.

    Returns:
        tuple: ``(rvals, sparse)`` with the encoded datasets (or ``data``)
        and the sparse flag that was actually used.
    """
    raw_prediction = predict_RAM_usage(data[0], categorical)
    if float(raw_prediction) / 1024 / 1024 > 1000:
        sparse = True

    if any(categorical):
        encoder = OneHotEncoder(categorical_features=categorical,
                                dtype=np.float32,
                                sparse=sparse)
        rvals = [encoder.fit_transform(data[0])]
        for dataset in data[1:]:
            rvals.append(encoder.transform(dataset))

        # Convert back to dense when a dense result was asked for.
        must_densify = not sparse and scipy.sparse.issparse(rvals[0])
        if must_densify:
            for index, matrix in enumerate(rvals):
                rvals[index] = matrix.todense()
    else:
        rvals = data

    return rvals, sparse
Esempio n. 8
0
    def fit_then_transform(self, expected, input, categorical_features='all',
                           minimum_fraction=None):
        """Assert the sparse encoder output equals ``expected`` on both paths.

        A ``OneHotEncoder`` is exercised via ``fit_transform`` and via ``fit``
        followed by ``transform``, each on its own copy of ``input``. Both
        results must be ``scipy.sparse.csr_matrix`` instances whose dense
        form almost equals ``expected``.
        """
        encoder_kwargs = dict(categorical_features=categorical_features,
                              minimum_fraction=minimum_fraction)

        # Test fit_transform
        one_step_encoder = OneHotEncoder(**encoder_kwargs)
        one_step_result = one_step_encoder.fit_transform(input.copy())
        self.assertIsInstance(one_step_result, scipy.sparse.csr_matrix)
        expected_as_float = expected.astype(float)
        assert_array_almost_equal(expected_as_float,
                                  one_step_result.todense())

        # Test fit, and afterwards transform
        two_step_encoder = OneHotEncoder(**encoder_kwargs)
        two_step_encoder.fit(input.copy())
        two_step_result = two_step_encoder.transform(input.copy())
        self.assertIsInstance(two_step_result, scipy.sparse.csr_matrix)
        assert_array_almost_equal(expected, two_step_result.todense())
    def test_classification_workflow(self):
        """Fit an encoder + decision tree pipeline on OpenML task 254.

        The data is split 50/50 with a fixed random state; the pipeline is
        expected to reach an accuracy of exactly 1 on both halves.
        """
        task = openml.tasks.get_task(254)
        X, y = task.get_X_and_y()

        # NOTE(review): assumes the task has 22 features, all categorical --
        # confirm against the dataset.
        steps = (
            ('ohe', OneHotEncoder(categorical_features=[True] * 22)),
            ('tree', sklearn.tree.DecisionTreeClassifier(random_state=1)),
        )
        pipeline = sklearn.pipeline.Pipeline(steps)

        split = sklearn.model_selection.train_test_split(
            X, y, random_state=3, train_size=0.5, test_size=0.5)
        X_train, X_test, y_train, y_test = split
        pipeline.fit(X_train, y_train)

        train_accuracy = np.mean(y_train == pipeline.predict(X_train))
        self.assertEqual(train_accuracy, 1)
        # With an incorrect copy operation the OneHotEncoder would rearrange
        # the data in such a way that the accuracy would drop to 66%
        test_accuracy = np.mean(y_test == pipeline.predict(X_test))
        self.assertEqual(test_accuracy, 1)
Esempio n. 10
0
    def setUp(self):
        """Load the ARFF test dataset as sparse data and precompute helpers.

        Changes the working directory to the directory of this file (the
        previous one is kept in ``self.cwd``), decodes
        ``datasets/dataset.arff`` and stores on ``self``:

        * ``X``: the features as a CSR matrix with non-finite entries zeroed,
        * ``y``: the last data column,
        * ``X_transformed``: ``X`` after one-hot encoding, imputation and
          scaling without mean-centering,
        * ``categorical`` / ``categorical_transformed``: boolean column masks
          for ``X`` and ``X_transformed``,
        * ``mf`` / ``helpers``: the metafeature and helper-function registries.
        """
        self.cwd = os.getcwd()
        # abspath protects against a bare relative ``__file__`` whose dirname
        # would be '' (os.chdir('') raises).
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        os.chdir(tests_dir)

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        self.attribute_types = [
            'nominal' if isinstance(type_, list) else 'numeric'
            for _, type_ in dataset['attributes'][:-1]
        ]
        self.categorical = [
            attribute == 'nominal' for attribute in self.attribute_types
        ]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1, ))

        # Zero out non-finite entries (NaN/inf) on a copy before building the
        # CSR matrix, so the sparse data contains no missing-value markers.
        X_sparse = X.copy()
        non_finite = ~np.isfinite(X_sparse)
        X_sparse[non_finite] = 0
        X_sparse = sparse.csr_matrix(X_sparse)

        ohe = OneHotEncoder(self.categorical)
        X_transformed = X_sparse.copy()
        X_transformed = ohe.fit_transform(X_transformed)
        imp = SimpleImputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler(with_mean=False)
        X_transformed = standard_scaler.fit_transform(X_transformed)

        # Transform the array which indicates the categorical metafeatures:
        # every column that is not one of the numerical ones is flagged True.
        number_numerical = np.sum(~np.array(self.categorical))
        number_encoded = X_transformed.shape[1] - number_numerical
        self.categorical_transformed = \
            [True] * number_encoded + [False] * number_numerical

        self.X = X_sparse
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value(
            "PCA", self.helpers["PCA"](self.X_transformed, self.y))
        self.helpers.set_value(
            "MissingValues",
            self.helpers["MissingValues"](self.X, self.y, self.categorical))
        self.mf.set_value(
            "NumberOfMissingValues",
            self.mf["NumberOfMissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "NumSymbols",
            self.helpers["NumSymbols"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "ClassOccurences",
            self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value(
            "Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value(
            "Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
    def setUp(self):
        """Load the ARFF test dataset as sparse data and precompute helpers.

        Changes the working directory to the directory of this file (the
        previous one is kept in ``self.cwd``), decodes
        ``datasets/dataset.arff`` and stores on ``self``:

        * ``X``: the features as a CSR matrix with non-finite entries zeroed,
        * ``y``: the last data column,
        * ``X_transformed``: ``X`` after one-hot encoding, imputation and
          scaling,
        * ``categorical`` / ``categorical_transformed``: boolean column masks
          for ``X`` and ``X_transformed``,
        * ``mf`` / ``helpers``: the metafeature and helper-function registries.
        """
        self.cwd = os.getcwd()
        # abspath protects against a bare relative ``__file__`` whose dirname
        # would be '' (os.chdir('') raises).
        tests_dir = os.path.dirname(os.path.abspath(__file__))
        os.chdir(tests_dir)

        decoder = arff.ArffDecoder()
        with open(os.path.join("datasets", "dataset.arff")) as fh:
            dataset = decoder.decode(fh, encode_nominal=True)

        # -1 because the last attribute is the class
        self.attribute_types = [
            'nominal' if isinstance(type_, list) else 'numeric'
            for _, type_ in dataset['attributes'][:-1]]
        self.categorical = [attribute == 'nominal'
                            for attribute in self.attribute_types]

        data = np.array(dataset['data'], dtype=np.float64)
        X = data[:, :-1]
        y = data[:, -1].reshape((-1,))

        # Zero out non-finite entries (NaN/inf) on a copy before building the
        # CSR matrix, so the sparse data contains no missing-value markers.
        X_sparse = X.copy()
        non_finite = ~np.isfinite(X_sparse)
        X_sparse[non_finite] = 0
        X_sparse = sparse.csr_matrix(X_sparse)

        ohe = OneHotEncoder(self.categorical)
        X_transformed = X_sparse.copy()
        X_transformed = ohe.fit_transform(X_transformed)
        imp = Imputer(copy=False)
        X_transformed = imp.fit_transform(X_transformed)
        standard_scaler = StandardScaler()
        X_transformed = standard_scaler.fit_transform(X_transformed)

        # Transform the array which indicates the categorical metafeatures:
        # every column that is not one of the numerical ones is flagged True.
        number_numerical = np.sum(~np.array(self.categorical))
        number_encoded = X_transformed.shape[1] - number_numerical
        self.categorical_transformed = \
            [True] * number_encoded + [False] * number_numerical

        self.X = X_sparse
        self.X_transformed = X_transformed
        self.y = y
        self.mf = meta_features.metafeatures
        self.helpers = meta_features.helper_functions

        # Precompute some helper functions
        self.helpers.set_value(
            "PCA", self.helpers["PCA"](self.X_transformed, self.y))
        self.helpers.set_value(
            "MissingValues",
            self.helpers["MissingValues"](self.X, self.y, self.categorical))
        self.mf.set_value(
            "NumberOfMissingValues",
            self.mf["NumberOfMissingValues"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "NumSymbols",
            self.helpers["NumSymbols"](self.X, self.y, self.categorical))
        self.helpers.set_value(
            "ClassOccurences",
            self.helpers["ClassOccurences"](self.X, self.y))
        self.helpers.set_value(
            "Skewnesses",
            self.helpers["Skewnesses"](self.X_transformed, self.y,
                                       self.categorical_transformed))
        self.helpers.set_value(
            "Kurtosisses",
            self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                        self.categorical_transformed))
Esempio n. 12
0
def calculate_all_metafeatures(X,
                               y,
                               categorical,
                               dataset_name,
                               calculate=None,
                               dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures.

    Clears the ``metafeatures`` and ``helper_functions`` registries, then
    evaluates every registered metafeature that passes the ``calculate`` /
    ``dont_calculate`` filters. Metafeatures listed in ``npy_metafeatures``
    are computed on a one-hot-encoded, imputed, scaled and row-shuffled
    version of ``X`` (built once, on first use); all others receive ``X``,
    ``y`` and ``categorical`` as passed.

    Parameters
    ----------
    X
        Feature matrix; may be a scipy sparse matrix.
    y
        Targets; must support indexing with an integer index array.
    categorical
        Per-column boolean flags of ``X``, passed to the encoder.
    dataset_name
        Used in log messages and handed to ``DatasetMetafeatures``.
    calculate
        If not None, only names contained in it are calculated.
    dont_calculate
        If not None, names contained in it are skipped.
    densify_threshold
        A sparse transformed matrix is densified only if ``X`` itself was
        not sparse and ``rows * cols * itemsize / 1000 / 1000`` is below
        this value.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict holding every calculated
        metafeature plus the helper-function values computed on the way.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as metafeature and as helper.
    ValueError
        If a dependency is registered as neither, or if a metafeature
        depends on a metafeature that is not queued and not calculated
        (e.g. because it was filtered out).
    """
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None
    categorical_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                input_is_sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical metafeatures
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = [True] * (X_transformed.shape[1] -
                                                    number_numerical) + \
                                          [False] * number_numerical

                # Densify the transformed matrix
                if not input_is_sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = \
                        X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = \
                        num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                if dependency not in to_visit:
                    # The dependency is neither queued nor calculated (it was
                    # filtered out, or is `name` itself); re-queueing `name`
                    # would loop forever.
                    raise ValueError(
                        "Metafeature %s depends on metafeature %s, which "
                        "will not be calculated." % (name, dependency))
                # Retry once the remaining queue entries have been processed.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
Esempio n. 13
0
def calculate_all_metafeatures(X, y, categorical, dataset_name,
        calculate=None, dont_calculate=None, densify_threshold=1000):
    """Calculate all metafeatures.

    The ``metafeatures`` and ``helper_functions`` registries are cleared
    first. Every registered metafeature passing the ``calculate`` /
    ``dont_calculate`` filters is then evaluated. Names contained in
    ``npy_metafeatures`` are evaluated on a one-hot-encoded, imputed, scaled
    and row-shuffled version of ``X`` that is built once on first use; all
    other names are evaluated on the arguments as passed.

    Parameters
    ----------
    X
        Feature matrix; may be a scipy sparse matrix.
    y
        Targets; must support indexing with an integer index array.
    categorical
        Per-column boolean flags of ``X``, passed to the encoder.
    dataset_name
        Used in log messages and handed to ``DatasetMetafeatures``.
    calculate
        If not None, only names contained in it are calculated.
    dont_calculate
        If not None, names contained in it are skipped.
    densify_threshold
        A sparse transformed matrix is densified only if ``X`` itself was
        not sparse and ``rows * cols * itemsize / 1000 / 1000`` is below
        this value.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict holding every calculated
        metafeature plus the helper-function values computed on the way.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as metafeature and as helper.
    ValueError
        If a dependency is registered as neither, or if a metafeature
        depends on a metafeature that is not queued and not calculated
        (e.g. because it was filtered out).
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None
    categorical_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                input_is_sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical, sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical metafeatures
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = [True] * (X_transformed.shape[1] -
                                                    number_numerical) + \
                                          [False] * number_numerical

                # Densify the transformed matrix
                if not input_is_sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                if dependency not in to_visit:
                    # The dependency is neither queued nor calculated (it was
                    # filtered out, or is `name` itself); re-queueing `name`
                    # would loop forever.
                    raise ValueError(
                        "Metafeature %s depends on metafeature %s, which "
                        "will not be calculated." % (name, dependency))
                # Retry once the remaining queue entries have been processed.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name,
                    name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_