def test_column_transformer_invalid_columns(remainder):
    """Check that unusable column specifications are rejected.

    Three situations are exercised on a plain numpy array: specifications
    that are invalid in general, string-based specifications (invalid for
    arrays), and a mismatch in the number of features between ``fit`` and
    ``transform``.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    def build(columns):
        # Single-step transformer using the parametrized ``remainder``.
        return ColumnTransformer([('trans', Trans(), columns)],
                                 remainder=remainder)

    # (column specification, expected fragment of the ValueError message)
    generally_invalid = [1.5, ['string', 1], slice(1, 's'), np.array([1.])]
    invalid_for_arrays = ['string', ['string', 'other'], slice('a', 'b')]
    rejected = [(spec, "No valid specification")
                for spec in generally_invalid]
    rejected += [(spec, "Specifying the columns")
                 for spec in invalid_for_arrays]

    for spec, fragment in rejected:
        ct = build(spec)
        assert_raise_message(ValueError, fragment, ct.fit, X_array)

    # transformed n_features does not match fitted n_features
    ct = build([0, 1])
    ct.fit(X_array)

    wider = np.array([[0, 1, 2], [2, 4, 6], [3, 6, 9]]).T
    msg = ("Given feature/column names or counts do not match the ones for "
           "the data given during fit.")
    with pytest.warns(DeprecationWarning, match=msg):
        ct.transform(wider)  # Should accept added columns, for now

    narrower = np.array([[0, 1, 2]]).T
    err_msg = 'Number of features'
    with pytest.raises(ValueError, match=err_msg):
        ct.transform(narrower)
def test_column_transformer_sparse_stacking():
    """Check how ``sparse_threshold`` affects the stacked output type.

    The same dense + sparse transformer pair is stacked twice: the test
    expects a sparse result with ``sparse_threshold=0.8`` and a dense
    result with ``sparse_threshold=0.1``, with identical shape and values.
    """
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    def make_steps():
        # Fresh transformer instances for each ColumnTransformer.
        return [('trans1', Trans(), [0]),
                ('trans2', SparseMatrixTrans(), 1)]

    # Threshold 0.8: the stacked output is expected to be sparse.
    col_trans = ColumnTransformer(make_steps(), sparse_threshold=0.8)
    col_trans.fit(X_array)
    stacked = col_trans.transform(X_array)
    n_rows = stacked.shape[0]
    assert sparse.issparse(stacked)
    assert stacked.shape == (n_rows, n_rows + 1)
    assert_array_equal(stacked.toarray()[:, 1:], np.eye(n_rows))
    # Both columns are consumed, so no 'remainder' entry is expected.
    assert len(col_trans.transformers_) == 2
    assert col_trans.transformers_[-1][0] != 'remainder'

    # Threshold 0.1: the stacked output is expected to be dense.
    col_trans = ColumnTransformer(make_steps(), sparse_threshold=0.1)
    col_trans.fit(X_array)
    stacked = col_trans.transform(X_array)
    n_rows = stacked.shape[0]
    assert not sparse.issparse(stacked)
    assert stacked.shape == (n_rows, n_rows + 1)
    assert_array_equal(stacked[:, 1:], np.eye(n_rows))
def test_column_transformer_reordered_column_names_remainder(explicit_colname):
    """Regression test for issue #14223.

    'Named col indexing fails with ColumnTransformer remainder on changing
    DataFrame column ordering'.

    With a ``remainder`` transformer, transforming a DataFrame whose columns
    are reordered relative to ``fit`` should raise an error. Extra columns
    appended after the fitted ones should still be accepted (with a
    deprecation warning), and a plain numpy array should fail with an
    explicit error.
    """
    pd = pytest.importorskip('pandas')

    fit_values = np.array([[0, 1, 2], [2, 4, 6]]).T
    fit_frame = pd.DataFrame(fit_values, columns=['first', 'second'])

    # Same data as ``fit_frame`` but with the two columns swapped.
    swapped_values = np.array([[2, 4, 6], [0, 1, 2]]).T
    swapped_frame = pd.DataFrame(swapped_values,
                                 columns=['second', 'first'])

    tf = ColumnTransformer([('bycol', Trans(), explicit_colname)],
                           remainder=Trans())
    tf.fit(fit_frame)

    warn_msg = ("Given feature/column names or counts do not match the ones "
                "for the data given during fit.")

    err_msg = 'Column ordering must be equal'
    with pytest.raises(ValueError, match=err_msg):
        tf.transform(swapped_frame)

    # No error for added columns if ordering is identical
    extended_frame = fit_frame.copy()
    extended_frame['third'] = [3, 6, 9]
    with pytest.warns(DeprecationWarning, match=warn_msg):
        tf.transform(extended_frame)  # No error should be raised, for now

    # No 'columns' AttributeError when transform input is a numpy array
    plain_array = fit_values.copy()
    err_msg = 'Specifying the columns'
    with pytest.raises(ValueError, match=err_msg):
        tf.transform(plain_array)
def test_feature_name_validation():
    """Check the warning/error raised when columns differ at transform time.

    A ``ColumnTransformer`` is fitted on a two-column DataFrame and then
    asked to transform inputs with an extra column. Depending on how the
    columns were specified, the test expects a ``DeprecationWarning`` or,
    for negative column indices, a ``RuntimeError``. Inputs matching the
    fitted data must not produce any warning.
    """
    pd = pytest.importorskip("pandas")

    X = np.ones(shape=(3, 2))
    X_extra = np.ones(shape=(3, 3))
    df = pd.DataFrame(X, columns=['a', 'b'])
    df_extra = pd.DataFrame(X_extra, columns=['a', 'b', 'c'])

    mismatch_msg = ("Given feature/column names or counts do not match the "
                    "ones for the data given during fit.")
    negative_msg = "At least one negative column was used to"

    # Columns selected by name; an extra column only warns.
    tf = ColumnTransformer([('bycol', Trans(), ['a', 'b'])])
    tf.fit(df)
    with pytest.warns(DeprecationWarning, match=mismatch_msg):
        tf.transform(df_extra)

    # Columns selected by non-negative position.
    tf = ColumnTransformer([('bycol', Trans(), [0])])
    tf.fit(df)
    with pytest.warns(DeprecationWarning, match=mismatch_msg):
        tf.transform(X_extra)

    with warnings.catch_warnings(record=True) as warns:
        # Record every warning regardless of the ambient filters; otherwise
        # a filtered-out warning would let the assertion pass vacuously.
        warnings.simplefilter("always")
        tf.transform(X)
    assert not warns

    # Named column combined with a remainder transformer.
    tf = ColumnTransformer([('bycol', Trans(), ['a'])],
                           remainder=Trans())
    tf.fit(df)
    with pytest.warns(DeprecationWarning, match=mismatch_msg):
        tf.transform(df_extra)

    # Negative indices: an extra column is an error, not a warning.
    tf = ColumnTransformer([('bycol', Trans(), [0, -1])])
    tf.fit(df)
    with pytest.raises(RuntimeError, match=negative_msg):
        tf.transform(df_extra)

    # Same expectation for a slice with negative bounds.
    tf = ColumnTransformer([('bycol', Trans(), slice(-1, -3, -1))])
    tf.fit(df)
    with pytest.raises(RuntimeError, match=negative_msg):
        tf.transform(df_extra)

    with warnings.catch_warnings(record=True) as warns:
        # See above: force recording so "no warning" is actually verified.
        warnings.simplefilter("always")
        tf.transform(df)
    assert not warns