def test_transformer_y_classes_regression_error(categorical_features, numerical_features, seed):
    """Check that y_classes() raises ValueError when target_type is "Numerical"."""
    regression_tr = Transformer(categorical_features, numerical_features, "Numerical", seed)

    with pytest.raises(ValueError):
        regression_tr.y_classes()
def test_transformer_create_preprocessor_y(categorical_features, numerical_features,
                                           target_type, expected_function):
    """Check that the default y transformer has the same class name as expected_function."""
    transformer = Transformer(categorical_features, numerical_features, target_type)
    created = transformer._create_default_transformer_y()

    # Only the class names are compared, not the instances themselves.
    actual_name = type(created).__name__
    expected_name = type(expected_function).__name__
    assert actual_name == expected_name
def test_transformer_create_preprocessor_y_invalid_target_type(
        categorical_features, numerical_features, target_type):
    """Testing if ._create_default_transformer_y raises ValueError for an invalid target_type.

    The error message is expected to contain "should be Categorical or Numerical".
    """
    # Start from a valid target_type, then overwrite the attribute with the
    # value under test.
    tr = Transformer(categorical_features, numerical_features, "Categorical")
    tr.target_type = target_type

    with pytest.raises(ValueError) as excinfo:
        tr._create_default_transformer_y()

    # Checked after the context manager exits: any statement placed after the
    # raising call inside the `with` body would never run.
    assert "should be Categorical or Numerical" in str(excinfo.value)
def transformer_multiclass(categorical_features, numerical_features, seed, preprocessor_X):
    """Return a "Categorical" Transformer for the data_multiclass test data.

    preprocessor_X is taken from the argument and preprocessor_y is set to a
    new LabelEncoder.
    """
    init_kwargs = {
        "categorical_features": categorical_features,
        "numerical_features": numerical_features,
        "target_type": "Categorical",
        "random_state": seed,
    }
    multiclass_tr = Transformer(**init_kwargs)

    multiclass_tr.preprocessor_X = preprocessor_X
    multiclass_tr.preprocessor_y = LabelEncoder()
    return multiclass_tr
def transformer_classification(categorical_features, numerical_features, seed, preprocessor_X):
    """Return a "Categorical" Transformer for the data_classification_balanced test data.

    "Target" is removed from categorical_features in place before the
    Transformer is built. preprocessor_X is taken from the argument and
    preprocessor_y is set to a new LabelEncoder.
    """
    # In-place removal: the caller's list loses "Target" as well.
    categorical_features.remove("Target")

    init_kwargs = {
        "categorical_features": categorical_features,
        "numerical_features": numerical_features,
        "target_type": "Categorical",
        "random_state": seed,
    }
    classification_tr = Transformer(**init_kwargs)

    classification_tr.preprocessor_X = preprocessor_X
    classification_tr.preprocessor_y = LabelEncoder()
    return classification_tr
def transformer_regression(categorical_features, numerical_features, seed, preprocessor_X):
    """Return a "Numerical" Transformer for the data_regression test data.

    "Price" is removed from numerical_features in place before the Transformer
    is built. preprocessor_X is taken from the argument and preprocessor_y is
    a FunctionTransformer wrapping an identity lambda.
    """
    # In-place removal: the caller's list loses "Price" as well.
    numerical_features.remove("Price")

    init_kwargs = {
        "categorical_features": categorical_features,
        "numerical_features": numerical_features,
        "target_type": "Numerical",
        "random_state": seed,
    }
    regression_tr = Transformer(**init_kwargs)

    regression_tr.preprocessor_X = preprocessor_X
    regression_tr.preprocessor_y = FunctionTransformer(lambda x: x)
    return regression_tr
def test_transformer_transform_y_numerical(data_classification_balanced, categorical_features,
                                           numerical_features, feature_name):
    """Check that fit_transform_y() returns a numerical y numerically equal to its input."""
    features_part = data_classification_balanced[0]
    target_part = data_classification_balanced[1]
    combined = pd.concat([features_part, target_part], axis=1)

    y_column = combined[feature_name]
    expected = y_column.array

    transformer = Transformer(categorical_features, numerical_features, "Numerical")
    actual = transformer.fit_transform_y(y_column)

    # NaNs at matching positions count as equal.
    assert np.allclose(actual, expected, equal_nan=True)
def test_transformer_create_preprocessor_X(categorical_features, numerical_features):
    """Check that the X preprocessor pairs each step name with the matching column list."""
    categorical_features.remove("Target")

    transformer = Transformer(categorical_features, numerical_features, "Categorical")
    column_preprocessor = transformer._create_preprocessor_X()

    wanted = [
        ("numerical", numerical_features),
        ("categorical", categorical_features),
    ]
    # Keep only the first and third element of every entry in .transformers.
    found = [(entry[0], entry[2]) for entry in column_preprocessor.transformers]

    assert all(step in found for step in wanted)
    assert len(found) == len(wanted)
def test_transformer_y_classes_classification(data_classification_balanced, categorical_features,
                                              numerical_features, y_column, expected_result,
                                              seed):
    """Testing if classification transformer returns correct classes from y.

    The Transformer is fitted on the y_column of the combined test data and
    the list form of y_classes() is compared with expected_result.
    """
    df = pd.concat(
        [data_classification_balanced[0], data_classification_balanced[1]], axis=1)

    # list.remove() mutates in place and returns None, so its return value must
    # not be passed on as the feature list. Build a filtered copy instead; this
    # also leaves the incoming categorical_features list untouched.
    cat_feats = [feature for feature in categorical_features if feature != y_column]

    tr = Transformer(cat_feats, numerical_features, "Categorical", random_state=seed)
    tr.fit_y(df[y_column])

    actual_result = tr.y_classes().tolist()
    assert actual_result == expected_result
def test_transformer_transform_y_classification_pos_label(
        data_classification_balanced,
        categorical_features,
        numerical_features,
        feature,
        classification_pos_label,
):
    """Check y encoding when an explicit classification_pos_label is given.

    Expected output is 1 where the raw value equals classification_pos_label
    and 0 everywhere else.
    """
    features_part = data_classification_balanced[0]
    target_part = data_classification_balanced[1]
    combined = pd.concat([features_part, target_part], axis=1)

    def to_binary(value):
        if value == classification_pos_label:
            return 1
        return 0

    expected = combined[feature].apply(to_binary)

    transformer = Transformer(categorical_features, numerical_features, "Categorical",
                              classification_pos_label=classification_pos_label)
    actual = transformer.fit_transform_y(combined[feature])

    assert np.array_equal(actual, expected)
def test_transformer_preprocessor_X_remainder_order(
        categorical_features, numerical_features, data_classification_balanced,
        expected_raw_mapping, transformed_features):
    """Check that undeclared columns come out last, in sorted order, with unchanged values.

    The columns in transformed_features are left out of both feature lists.
    The output is labelled with transformed_columns() followed by
    sorted(transformed_features), and each undeclared column is compared with
    its original values.
    """
    categorical_features.remove("Target")
    declared_categorical = [
        name for name in categorical_features if name not in transformed_features
    ]
    declared_numerical = [
        name for name in numerical_features if name not in transformed_features
    ]

    features = data_classification_balanced[0].drop(["Date"], axis=1)

    transformer = Transformer(declared_categorical, declared_numerical, "Numerical")
    transformer.fit(features)
    output = transformer.transform(features)
    try:
        output = output.toarray()
    except AttributeError:
        # No .toarray() available: keep the output exactly as returned.
        pass

    column_names = transformer.transformed_columns() + sorted(transformed_features)
    result_df = pd.DataFrame(output, columns=column_names)

    for name in transformed_features:
        assert np.allclose(result_df[name].to_numpy(), features[name].to_numpy(),
                           equal_nan=True)
def test_transformer_preprocessor_X_remainder(categorical_features, numerical_features,
                                              data_classification_balanced,
                                              expected_raw_mapping, transformed_feature):
    """Check that a column declared in neither feature list passes through unchanged.

    When expected_raw_mapping has an entry for the column, its values are
    replaced through that mapping before fitting. The output is labelled with
    transformed_columns() plus the undeclared column as the last label.
    """
    categorical_features.remove("Target")
    declared_categorical = [
        name for name in categorical_features if name != transformed_feature
    ]
    declared_numerical = [
        name for name in numerical_features if name != transformed_feature
    ]

    features = data_classification_balanced[0].drop(["Date"], axis=1)
    if transformed_feature in expected_raw_mapping:
        replacement = expected_raw_mapping[transformed_feature]
        features[transformed_feature] = features[transformed_feature].replace(replacement)

    transformer = Transformer(declared_categorical, declared_numerical, "Numerical")
    transformer.fit(features)
    output = transformer.transform(features)
    try:
        output = output.toarray()
    except AttributeError:
        # No .toarray() available: keep the output exactly as returned.
        pass

    column_names = transformer.transformed_columns() + [transformed_feature]
    result_df = pd.DataFrame(output, columns=column_names)

    assert np.allclose(result_df[transformed_feature].to_numpy(),
                       features[transformed_feature].to_numpy(),
                       equal_nan=True)

    # Exactly one output column name may contain the feature name (no derivations).
    matching_columns = [name for name in column_names if transformed_feature in name]
    assert len(matching_columns) == 1
def test_transformer_transform_y_categorical(data_classification_balanced, categorical_features,
                                             numerical_features, expected_raw_mapping,
                                             feature_name):
    """Check that fit_transform_y() encodes a categorical y as expected.

    The expected codes are the expected_raw_mapping values shifted down by one,
    with NaN mapped to one more than the largest shifted code.
    """
    features_part = data_classification_balanced[0]
    target_part = data_classification_balanced[1]
    combined = pd.concat([features_part, target_part], axis=1)
    y_column = combined[feature_name]

    raw_codes = expected_raw_mapping[feature_name]
    shifted_codes = {label: int(code - 1) for label, code in raw_codes.items()}
    shifted_codes[np.nan] = max(shifted_codes.values()) + 1
    expected = y_column.replace(shifted_codes).array

    transformer = Transformer(categorical_features, numerical_features, "Categorical")
    actual = transformer.fit_transform_y(y_column)

    assert np.array_equal(actual, expected)
def test_transformer_transform_X_numerical(data_classification_balanced, feature_name):
    """Compare Transformer output for one numerical column with a manual pipeline.

    The manual pipeline fills missing values with the column's 50% quantile,
    applies QuantileTransformer with normal output distribution, then
    StandardScaler.
    """
    fixed_seed = 1
    features_part = data_classification_balanced[0]
    target_part = data_classification_balanced[1]
    combined = pd.concat([features_part, target_part], axis=1)

    column = combined[feature_name]
    column_median = column.describe()["50%"]
    filled = column.fillna(column_median).to_numpy().reshape(-1, 1)

    quantile_step = QuantileTransformer(output_distribution="normal", random_state=fixed_seed)
    gaussianized = quantile_step.fit_transform(filled)
    expected = StandardScaler().fit_transform(gaussianized)

    transformer = Transformer([], [feature_name], "Categorical", random_state=fixed_seed)
    actual = transformer.fit_transform(pd.DataFrame(combined[feature_name]))

    assert np.allclose(actual, expected)
def test_transformer_transform_X_categorical(data_classification_balanced, feature_name,
                                             csr_matrix_flag):
    """Compare Transformer output for one categorical column with a manual pipeline.

    The manual pipeline fills missing values with the most frequent value
    (NaN included in the count), then one-hot encodes the column.
    """
    features_part = data_classification_balanced[0]
    target_part = data_classification_balanced[1]
    combined = pd.concat([features_part, target_part], axis=1)

    # bool values are swapped for 0/1 because SimpleImputer can't handle bool dtype
    combined["bool"] = combined["bool"].replace({False: 0, True: 1})

    column = combined[feature_name]
    top_value = column.value_counts(dropna=False).index[0]
    filled = column.fillna(top_value)

    encoder = OneHotEncoder(handle_unknown="ignore")
    expected = encoder.fit_transform(filled.to_numpy().reshape(-1, 1)).toarray()

    transformer = Transformer([feature_name], [], "Categorical")
    actual = transformer.fit_transform(pd.DataFrame(combined[feature_name]))

    # for n > 2 unique values, output is a csr_matrix
    if csr_matrix_flag:
        actual = actual.toarray()

    assert pd.DataFrame(actual).equals(pd.DataFrame(expected))
def test_transformer_transform_y_classification_pos_label_multiclass(
        data_multiclass,
        categorical_features,
        numerical_features,
        classification_pos_label,
):
    """Check y encoding for multiclass data when classification_pos_label is given.

    Expected output maps classification_pos_label to 1 and the remaining
    labels among "Fruits", "Sweets" and "Dairy" to 0.
    """
    target = data_multiclass[1]

    label_codes = dict.fromkeys(("Fruits", "Sweets", "Dairy"), 0)
    label_codes[classification_pos_label] = 1  # the chosen label overrides its 0
    expected = target.replace(label_codes)

    transformer = Transformer(categorical_features, numerical_features, "Categorical",
                              classification_pos_label=classification_pos_label)
    actual = transformer.fit_transform_y(target)

    assert np.array_equal(actual, expected)
def test_transformer_custom_transformers(categorical_features, numerical_features, seed):
    """Check that custom X and y transformers are stored on the Transformer as given."""
    custom_categorical = [SimpleImputer(strategy="most_frequent"), OrdinalEncoder()]
    custom_numerical = [SimpleImputer(), PowerTransformer()]
    custom_y = FunctionTransformer()

    transformer = Transformer(categorical_features, numerical_features, "Categorical", seed)
    transformer.set_custom_preprocessor_X(custom_categorical, custom_numerical)
    transformer.set_custom_preprocessor_y(custom_y)

    assert transformer.categorical_transformers == custom_categorical
    assert transformer.numerical_transformers == custom_numerical
    assert transformer.y_transformer == custom_y
def test_transformer_custom_transformer_none_transformers(
        categorical_features, numerical_features, seed, custom_transformers, tr_type):
    """Testing if setting custom transformers with only one type of transformers provided
    works correctly.

    The type that is not provided is expected to equal the corresponding
    ``_default_*_transformers`` attribute of the Transformer.

    Raises:
        ValueError: if tr_type is neither "categorical" nor "numerical".
    """
    tr = Transformer(categorical_features, numerical_features, "Categorical", seed)

    if tr_type == "categorical":
        tr.set_custom_preprocessor_X(categorical_transformers=custom_transformers)
        expected_categorical = custom_transformers
        expected_numerical = tr._default_numerical_transformers
    elif tr_type == "numerical":
        tr.set_custom_preprocessor_X(numerical_transformers=custom_transformers)
        expected_categorical = tr._default_categorical_transformers
        expected_numerical = custom_transformers
    else:
        # A bare `raise` with no active exception only produces
        # "RuntimeError: No active exception to reraise"; name the bad value instead.
        raise ValueError(
            "tr_type should be 'categorical' or 'numerical', got {!r}".format(tr_type))

    assert tr.categorical_transformers == expected_categorical
    assert tr.numerical_transformers == expected_numerical