def test_imputation_shape(self):
    """Check that imputing keeps the (10, 2) shape for every strategy.

    Each strategy is exercised on the dense array first and then on a CSR
    version of the same data.
    """
    expected_shape = (10, 2)
    data = np.random.randn(10, 2)
    data[::2] = np.nan  # every other row consists of missing values only

    # Identity first, CSR conversion second: the sparse matrix is only built
    # once the dense pass for the current strategy has finished.
    converters = (lambda array: array, sparse.csr_matrix)

    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = Imputer(strategy=strategy)
        for convert in converters:
            result = imputer.fit_transform(convert(data))
            assert_equal(result.shape, expected_shape)
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all registered metafeatures for one dataset.

    Every name in ``metafeatures`` is visited. Names listed in
    ``npy_metafeatures`` are computed on a preprocessed copy of the data
    (one-hot encoded, mean imputed, standard scaled and shuffled with a
    fixed seed); all other names are computed on ``X`` and ``y`` as passed.

    Dependencies are always computed before the metafeature that needs
    them, even when ``calculate`` / ``dont_calculate`` would exclude them,
    and their values are part of the result.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Feature matrix.
    y : array-like
        Targets; must support indexing with an integer index array.
    categorical : sequence of bool
        One entry per column of ``X``. Entries that are False are counted
        as numerical columns; presumably True marks categorical columns
        (TODO confirm against callers).
    dataset_name : str
        Used in log messages and passed on to ``DatasetMetafeatures``.
    calculate : container of str, optional
        If given, only these metafeatures are requested.
    dont_calculate : container of str, optional
        If given, these metafeatures are skipped.
    densify_threshold : number
        A sparse transformed matrix produced from dense input is converted
        to a dense one if that takes fewer megabytes (10**6 bytes) than
        this threshold.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict mapping each computed
        metafeature / helper function name to its value.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as a metafeature and as a
        helper function.
    ValueError
        If a dependency is registered as neither.
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    def is_filtered_out(candidate):
        # True if the calculate / dont_calculate arguments exclude candidate.
        if calculate is not None and candidate not in calculate:
            return True
        return dont_calculate is not None and candidate in dont_calculate

    visited = set()   # names whose value has already been computed
    required = set()  # filtered-out names that a requested name depends on
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if name in visited:
            # A forced dependency can be queued twice; compute it only once.
            continue
        if name not in required and is_filtered_out(name):
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no
                # copy for sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = \
                    [True] * (X_transformed.shape[1] - number_numerical) + \
                    [False] * number_numerical

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = \
                        num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(
                    dependency):
                # Retry this name once everything else has had its turn.
                to_visit.appendleft(name)
                if dependency not in required and is_filtered_out(dependency):
                    # The filters would skip the dependency, so this name
                    # would be re-queued forever. Force the dependency and
                    # put it on the side of the queue that is popped next.
                    required.add(dependency)
                    to_visit.append(dependency)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all registered metafeatures for one dataset.

    Every name in ``metafeatures`` is visited. Names listed in
    ``npy_metafeatures`` are computed on a preprocessed copy of the data
    (one-hot encoded, mean imputed, standard scaled and shuffled with a
    fixed seed); all other names are computed on ``X`` and ``y`` as passed.

    Dependencies are always computed before the metafeature that needs
    them, even when ``calculate`` / ``dont_calculate`` would exclude them,
    and their values are part of the result.

    Parameters
    ----------
    X : array-like or scipy sparse matrix
        Feature matrix.
    y : array-like
        Targets; must support indexing with an integer index array.
    categorical : sequence of bool
        One entry per column of ``X``. Entries that are False are counted
        as numerical columns; presumably True marks categorical columns
        (TODO confirm against callers).
    dataset_name : str
        Used in log messages and passed on to ``DatasetMetafeatures``.
    calculate : container of str, optional
        If given, only these metafeatures are requested.
    dont_calculate : container of str, optional
        If given, these metafeatures are skipped.
    densify_threshold : number
        A sparse transformed matrix produced from dense input is converted
        to a dense one if that takes fewer megabytes (10**6 bytes) than
        this threshold.

    Returns
    -------
    DatasetMetafeatures
        Built from ``dataset_name`` and a dict mapping each computed
        metafeature / helper function name to its value.

    Raises
    ------
    NotImplementedError
        If a dependency is registered both as a metafeature and as a
        helper function.
    ValueError
        If a dependency is registered as neither.
    """
    logger = get_logger(__name__)

    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    def is_filtered_out(candidate):
        # True if the calculate / dont_calculate arguments exclude candidate.
        if calculate is not None and candidate not in calculate:
            return True
        return dont_calculate is not None and candidate in dont_calculate

    visited = set()   # names whose value has already been computed
    required = set()  # filtered-out names that a requested name depends on
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while to_visit:
        name = to_visit.pop()
        if name in visited:
            # A forced dependency can be queued twice; compute it only once.
            continue
        if name not in required and is_filtered_out(name):
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no
                # copy for sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                ohe = OneHotEncoder(categorical_features=categorical,
                                    sparse=True)
                X_transformed = ohe.fit_transform(X)
                imputer = Imputer(strategy='mean', copy=False, dtype=X.dtype)
                X_transformed = imputer.fit_transform(X_transformed)
                standard_scaler = StandardScaler(copy=False)
                X_transformed = standard_scaler.fit_transform(X_transformed)

                # Transform the array which indicates the categorical
                # metafeatures
                number_numerical = np.sum(~np.array(categorical))
                categorical_transformed = \
                    [True] * (X_transformed.shape[1] - number_numerical) + \
                    [False] * number_numerical

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = (X_transformed.shape[0] *
                                    X_transformed.shape[1])
                    megabytes_required = \
                        num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases.
                # Because this is advanced indexing, a copy of the data is
                # returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(
                    dependency):
                # Retry this name once everything else has had its turn.
                to_visit.appendleft(name)
                if dependency not in required and is_filtered_out(dependency):
                    # The filters would skip the dependency, so this name
                    # would be re-queued forever. Force the dependency and
                    # put it on the side of the queue that is popped next.
                    required.add(dependency)
                    to_visit.append(dependency)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_