def test_make_column_transformer_remainder_transformer(): scaler = StandardScaler() norm = Normalizer() remainder = StandardScaler() ct = make_column_transformer((scaler, 'first'), (norm, ['second']), remainder=remainder) assert ct.remainder == remainder
def test_make_column_transformer_pandas(): pd = pytest.importorskip('pandas') X_array = np.array([[0, 1, 2], [2, 4, 6]]).T X_df = pd.DataFrame(X_array, columns=['first', 'second']) norm = Normalizer() ct1 = ColumnTransformer([('norm', Normalizer(), X_df.columns)]) ct2 = make_column_transformer((norm, X_df.columns)) assert_almost_equal(ct1.fit_transform(X_df), ct2.fit_transform(X_df))
def test_make_column_transformer(): scaler = StandardScaler() norm = Normalizer() ct = make_column_transformer((scaler, 'first'), (norm, ['second'])) names, transformers, columns = zip(*ct.transformers) assert names == ("standardscaler", "normalizer") assert transformers == (scaler, norm) assert columns == ('first', ['second'])
def test_column_transformer_remainder(): X_array = np.array([[0, 1, 2], [2, 4, 6]]).T X_res_first = np.array([0, 1, 2]).reshape(-1, 1) X_res_second = np.array([2, 4, 6]).reshape(-1, 1) X_res_both = X_array # default drop ct = ColumnTransformer([('trans1', Trans(), [0])]) assert_array_equal(ct.fit_transform(X_array), X_res_first) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] == 'remainder' assert ct.transformers_[-1][1] == 'drop' assert_array_equal(ct.transformers_[-1][2], [1]) # specify passthrough ct = ColumnTransformer([('trans', Trans(), [0])], remainder='passthrough') assert_array_equal(ct.fit_transform(X_array), X_res_both) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] == 'remainder' assert ct.transformers_[-1][1] == 'passthrough' assert_array_equal(ct.transformers_[-1][2], [1]) # column order is not preserved (passed through added to end) ct = ColumnTransformer([('trans1', Trans(), [1])], remainder='passthrough') assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1]) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both[:, ::-1]) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] == 'remainder' assert ct.transformers_[-1][1] == 'passthrough' assert_array_equal(ct.transformers_[-1][2], [0]) # passthrough when all actual transformers are skipped ct = ColumnTransformer([('trans1', 'drop', [0])], remainder='passthrough') assert_array_equal(ct.fit_transform(X_array), X_res_second) assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second) assert len(ct.transformers_) == 2 assert ct.transformers_[-1][0] == 'remainder' assert ct.transformers_[-1][1] == 'passthrough' assert_array_equal(ct.transformers_[-1][2], [1]) # error on invalid arg ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1) assert_raise_message( ValueError, "remainder keyword needs to be one of \'drop\', \'passthrough\', " "or estimator.", ct.fit, X_array) assert_raise_message( ValueError, "remainder keyword needs to be one of \'drop\', \'passthrough\', " "or estimator.", ct.fit_transform, X_array) # check default for make_column_transformer ct = make_column_transformer((Trans(), [0])) assert ct.remainder == 'drop'
def test_make_column_transformer_kwargs(): scaler = StandardScaler() norm = Normalizer() ct = make_column_transformer((scaler, 'first'), (norm, ['second']), n_jobs=3, remainder='drop', sparse_threshold=0.5) assert ct.transformers == make_column_transformer( (scaler, 'first'), (norm, ['second'])).transformers assert ct.n_jobs == 3 assert ct.remainder == 'drop' assert ct.sparse_threshold == 0.5 # invalid keyword parameters should raise an error message assert_raise_message(TypeError, 'Unknown keyword arguments: "transformer_weights"', make_column_transformer, (scaler, 'first'), (norm, ['second']), transformer_weights={ 'pca': 10, 'Transf': 1 })
def test_column_transformer_mixed_cols_sparse(): df = np.array([['a', 1, True], ['b', 2, False]], dtype='O') ct = make_column_transformer((OneHotEncoder(), [0]), ('passthrough', [1, 2]), sparse_threshold=1.0) # this shouldn't fail, since boolean can be coerced into a numeric # See: https://github.com/scikit-learn/scikit-learn/issues/11912 X_trans = ct.fit_transform(df) assert X_trans.getformat() == 'csr' assert_array_equal(X_trans.toarray(), np.array([[1, 0, 1, 1], [0, 1, 2, 0]])) ct = make_column_transformer((OneHotEncoder(), [0]), ('passthrough', [0]), sparse_threshold=1.0) with pytest.raises(ValueError, match="For a sparse output, all columns should"): # this fails since strings `a` and `b` cannot be # coerced into a numeric. ct.fit_transform(df)