def setUp(self):
    self.cwd = os.getcwd()
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    self.attribute_types = [
        'numeric' if type(type_) != list else 'nominal'
        for name, type_ in dataset['attributes'][:-1]]
    self.categorical = [True if attribute == 'nominal' else False
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    ohe = OneHotEncoder(self.categorical)
    X_transformed = ohe.fit_transform(X)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    standard_scaler = StandardScaler()
    X_transformed = standard_scaler.fit_transform(X_transformed)
    X_transformed = X_transformed.todense()

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))
def test_scaler_1d(self):
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # Test with sparse list
    X = scipy.sparse.coo_matrix((np.random.random((10,)),
                                 ([i ** 2 for i in range(10)],
                                  [0 for i in range(10)])))
    X = X.tocsr()
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)

    self.assertFalse(np.any(np.isnan(X_scaled.data)))
    self.assertAlmostEqual(X_scaled.mean(), 0)
    self.assertAlmostEqual(np.sqrt(X_scaled.data.var()), 1)

    # Check that X has not been copied
    # self.assertTrue(X_scaled is X)

    # Check that the matrix is still sparse
    self.assertEqual(len(X.indices), 10)
def test_standard_scaler_sparse_boston_data(self):
    X_train, Y_train, X_test, Y_test = get_dataset('boston',
                                                   make_sparse=True)
    num_data_points = len(X_train.data)

    scaler = StandardScaler()
    scaler.fit(X_train, Y_train)
    tr = scaler.transform(X_train)

    # Test this for every single dimension!
    means = np.array([tr.data[tr.indptr[i]:tr.indptr[i + 1]].mean()
                      for i in range(13)])
    vars = np.array([tr.data[tr.indptr[i]:tr.indptr[i + 1]].var()
                     for i in range(13)])

    for i in chain(range(1, 3), range(4, 13)):
        self.assertAlmostEqual(means[i], 0, 2)
        self.assertAlmostEqual(vars[i], 1, 2)
    self.assertAlmostEqual(means[3], 1)
    self.assertAlmostEqual(vars[3], 0)

    # Test that the matrix is still sparse
    self.assertTrue(scipy.sparse.issparse(tr))
    self.assertEqual(num_data_points, len(tr.data))
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficiently as possible (no
                # copy for sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = \
                    [True] * (X_transformed.shape[1] - number_numerical) + \
                    [False] * number_numerical

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = \
                        num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)
        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
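# Hedged usage sketch, not part of the original source: how
# calculate_all_metafeatures might be invoked. The toy X, y and categorical
# mask below are made up for illustration; only the call signature and the
# boolean-mask convention for `categorical` are taken from the code above,
# and the call assumes it runs in the module that defines the metafeature
# and helper-function registries.
import numpy as np

X = np.array([[0., 1.2],
              [1., 0.7],
              [0., -0.3],
              [1., 1.9]])
y = np.array([0, 1, 0, 1])
categorical = [True, False]  # first column nominal, second numeric

mf = calculate_all_metafeatures(X, y, categorical, dataset_name="toy")
# mf is a DatasetMetafeatures object holding one value per computed metafeature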
def __init__(self, random_state):
    from autosklearn.pipeline.implementations.StandardScaler import \
        StandardScaler
    self.preprocessor = StandardScaler()
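# Hedged usage sketch, not part of the original source: the component above
# only wraps autosklearn's custom StandardScaler, so scaling goes through the
# preprocessor's fit/transform, mirroring the fit(X).transform(X, copy=...)
# pattern exercised by the tests in this file. The toy matrix is made up.
import numpy as np
from autosklearn.pipeline.implementations.StandardScaler import StandardScaler

X = np.array([[1., 2.], [3., 4.], [5., 6.]])
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X, copy=True)
# each column of X_scaled now has (approximately) zero mean and unit variance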
def test_scaler_2d_arrays(self):
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    self.assertTrue(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    self.assertTrue(X_scaled_back is not X)
    self.assertTrue(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    self.assertTrue(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    self.assertTrue(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non-zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    self.assertFalse(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    self.assertTrue(X_scaled is not X)

    # Same thing for sparse matrices...
    X = scipy.sparse.coo_matrix((np.random.random((12,)),
                                 ([i for i in range(12)],
                                  [int(i / 3) for i in range(12)])))
    X = X.tocsr()
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)

    self.assertFalse(np.any(np.isnan(X_scaled.data)))
    assert_array_almost_equal(
        [X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]].mean()
         for i in range(X_scaled.shape[1])],
        np.zeros((4,), dtype=np.float64))
    assert_array_almost_equal(np.sqrt([
        X_scaled.data[X_scaled.indptr[i]:X_scaled.indptr[i + 1]].var()
        for i in range(X_scaled.shape[1])]),
        np.ones((4,), dtype=np.float64))

    # Because we change the sparse format to csc, we cannot assert that
    # the matrix did not change!
    # self.assertTrue(X_scaled is X)

    # Check that the matrix is still sparse
    self.assertEqual(len(X.indices), 12)