def test_transformer_y_classes_regression_error(categorical_features,
                                                numerical_features, seed):
    """Check that y_classes() raises ValueError when target_type is "Numerical"."""
    transformer = Transformer(
        categorical_features,
        numerical_features,
        "Numerical",
        seed,
    )

    # The return value is irrelevant; only the raised exception matters.
    with pytest.raises(ValueError):
        transformer.y_classes()
def test_transformer_create_preprocessor_y(categorical_features,
                                           numerical_features, target_type,
                                           expected_function):
    """Check that the default y transformer has the expected class.

    Only class names are compared, so ``expected_function`` serves purely as
    a reference object for the expected type.
    """
    transformer = Transformer(
        categorical_features, numerical_features, target_type
    )
    created = transformer._create_default_transformer_y()

    actual_name = type(created).__name__
    expected_name = type(expected_function).__name__
    assert actual_name == expected_name
def test_transformer_create_preprocessor_y_invalid_target_type(
        categorical_features, numerical_features, target_type):
    """Check that _create_default_transformer_y() rejects an invalid target_type.

    The expected failure is a ValueError whose message contains
    "should be Categorical or Numerical".
    """
    # Construct with a valid type first, then overwrite the attribute with
    # the value under test.
    transformer = Transformer(
        categorical_features, numerical_features, "Categorical"
    )
    transformer.target_type = target_type

    with pytest.raises(ValueError) as excinfo:
        transformer._create_default_transformer_y()

    error_message = str(excinfo.value)
    assert "should be Categorical or Numerical" in error_message
def transformer_multiclass(categorical_features, numerical_features, seed, preprocessor_X):
    """Build a "Categorical"-target Transformer for the data_multiclass test data.

    The X preprocessor is taken from ``preprocessor_X`` and the y preprocessor
    is a fresh ``LabelEncoder``; both are assigned directly as attributes.
    """
    init_kwargs = {
        "categorical_features": categorical_features,
        "numerical_features": numerical_features,
        "target_type": "Categorical",
        "random_state": seed,
    }
    transformer = Transformer(**init_kwargs)

    transformer.preprocessor_X = preprocessor_X
    transformer.preprocessor_y = LabelEncoder()
    return transformer
def transformer_classification(categorical_features, numerical_features, seed, preprocessor_X):
    """Build a "Categorical"-target Transformer for the data_classification_balanced test data.

    "Target" is removed from ``categorical_features`` in place before the
    Transformer is created, so the caller's list is modified as well.
    """
    # In-place removal: raises ValueError if "Target" is not in the list.
    categorical_features.remove("Target")

    init_kwargs = {
        "categorical_features": categorical_features,
        "numerical_features": numerical_features,
        "target_type": "Categorical",
        "random_state": seed,
    }
    transformer = Transformer(**init_kwargs)

    transformer.preprocessor_X = preprocessor_X
    transformer.preprocessor_y = LabelEncoder()
    return transformer
def transformer_regression(categorical_features, numerical_features, seed, preprocessor_X):
    """Build a "Numerical"-target Transformer for the data_regression test data.

    "Price" is removed from ``numerical_features`` in place before the
    Transformer is created, so the caller's list is modified as well.
    """
    # In-place removal: raises ValueError if "Price" is not in the list.
    numerical_features.remove("Price")

    init_kwargs = {
        "categorical_features": categorical_features,
        "numerical_features": numerical_features,
        "target_type": "Numerical",
        "random_state": seed,
    }
    transformer = Transformer(**init_kwargs)

    transformer.preprocessor_X = preprocessor_X
    # Identity transform: y is passed through unchanged.
    transformer.preprocessor_y = FunctionTransformer(lambda x: x)
    return transformer
def test_transformer_transform_y_numerical(data_classification_balanced,
                                           categorical_features,
                                           numerical_features, feature_name):
    """Check that fit_transform_y() leaves a numerical y unchanged.

    The column ``feature_name`` of the combined X/y frame is used as the
    target; the result must match the input values (NaNs compare equal).
    """
    features = data_classification_balanced[0]
    labels = data_classification_balanced[1]
    combined = pd.concat([features, labels], axis=1)

    y = combined[feature_name]
    expected_result = y.array

    transformer = Transformer(
        categorical_features, numerical_features, "Numerical"
    )
    actual_result = transformer.fit_transform_y(y)

    assert np.allclose(actual_result, expected_result, equal_nan=True)
def test_transformer_create_preprocessor_X(categorical_features,
                                           numerical_features):
    """Check that the X preprocessor assigns the right columns to each step.

    Expects exactly two steps: "numerical" over ``numerical_features`` and
    "categorical" over ``categorical_features`` (with "Target" removed).
    """
    categorical_features.remove("Target")
    transformer = Transformer(
        categorical_features, numerical_features, "Categorical"
    )
    column_transformer = transformer._create_preprocessor_X()

    expected_steps = [
        ("numerical", numerical_features),
        ("categorical", categorical_features),
    ]
    # Keep only the first and third element (name and columns) of each entry.
    actual_steps = [
        (entry[0], entry[2]) for entry in column_transformer.transformers
    ]

    missing_steps = [
        step for step in expected_steps if step not in actual_steps
    ]
    assert not missing_steps

    assert len(actual_steps) == len(expected_steps)
def test_transformer_y_classes_classification(data_classification_balanced,
                                              categorical_features,
                                              numerical_features, y_column,
                                              expected_result, seed):
    """Check that a classification Transformer reports the correct classes of y.

    The column ``y_column`` of the combined X/y frame is used as the target.
    After ``fit_y()``, ``y_classes().tolist()`` must equal ``expected_result``.
    """
    df = pd.concat(
        [data_classification_balanced[0], data_classification_balanced[1]],
        axis=1)

    # list.remove() works in place and returns None, so assigning its result
    # would hand None to Transformer. Build the filtered list explicitly.
    cat_feats = [f for f in categorical_features if f != y_column]

    tr = Transformer(cat_feats,
                     numerical_features,
                     "Categorical",
                     random_state=seed)
    tr.fit_y(df[y_column])

    actual_result = tr.y_classes().tolist()

    assert actual_result == expected_result
def test_transformer_transform_y_classification_pos_label(
    data_classification_balanced,
    categorical_features,
    numerical_features,
    feature,
    classification_pos_label,
):
    """Check the y mapping when an explicit classification_pos_label is given.

    Values equal to ``classification_pos_label`` are expected to become 1 and
    all other values 0.
    """
    features = data_classification_balanced[0]
    labels = data_classification_balanced[1]
    combined = pd.concat([features, labels], axis=1)

    def _as_binary(value):
        return 1 if value == classification_pos_label else 0

    expected_result = combined[feature].apply(_as_binary)

    transformer = Transformer(
        categorical_features,
        numerical_features,
        "Categorical",
        classification_pos_label=classification_pos_label,
    )
    actual_result = transformer.fit_transform_y(combined[feature])

    assert np.array_equal(actual_result, expected_result)
def test_transformer_preprocessor_X_remainder_order(
        categorical_features, numerical_features, data_classification_balanced,
        expected_raw_mapping, transformed_features):
    """Check the column order of the remainder part of the transformed X.

    Features listed in ``transformed_features`` are withheld from both
    feature lists. They are then expected at the end of the output, in sorted
    order, with values equal to the input columns (NaNs compare equal).
    """
    categorical_features.remove("Target")

    def _is_declared(name):
        return name not in transformed_features

    declared_categorical = [f for f in categorical_features if _is_declared(f)]
    declared_numerical = [f for f in numerical_features if _is_declared(f)]

    X = data_classification_balanced[0].drop(["Date"], axis=1)

    transformer = Transformer(
        declared_categorical, declared_numerical, "Numerical"
    )
    transformer.fit(X)
    transformed = transformer.transform(X)
    try:
        transformed = transformed.toarray()
    except AttributeError:
        # No toarray() available: use the result as it is.
        pass

    column_names = (
        transformer.transformed_columns() + sorted(transformed_features)
    )
    actual_result = pd.DataFrame(transformed, columns=column_names)

    for name in transformed_features:
        actual_values = actual_result[name].to_numpy()
        expected_values = X[name].to_numpy()
        assert np.allclose(actual_values, expected_values, equal_nan=True)
def test_transformer_preprocessor_X_remainder(categorical_features,
                                              numerical_features,
                                              data_classification_balanced,
                                              expected_raw_mapping,
                                              transformed_feature):
    """Check that an undeclared feature passes through the X preprocessor unchanged.

    ``transformed_feature`` is withheld from both feature lists. It is
    expected as the last output column, with values equal to the input
    (NaNs compare equal), and with no other column name containing its name.
    """
    categorical_features.remove("Target")
    declared_categorical = [
        name for name in categorical_features if name != transformed_feature
    ]
    declared_numerical = [
        name for name in numerical_features if name != transformed_feature
    ]

    X = data_classification_balanced[0].drop(["Date"], axis=1)

    # Apply the raw mapping to the column first when one is defined for it.
    if transformed_feature in expected_raw_mapping:
        raw_mapping = expected_raw_mapping[transformed_feature]
        X[transformed_feature] = X[transformed_feature].replace(raw_mapping)

    transformer = Transformer(
        declared_categorical, declared_numerical, "Numerical"
    )
    transformer.fit(X)
    transformed = transformer.transform(X)
    try:
        transformed = transformed.toarray()
    except AttributeError:
        # No toarray() available: use the result as it is.
        pass

    cols = transformer.transformed_columns() + [transformed_feature]
    actual_result = pd.DataFrame(transformed, columns=cols)

    actual_values = actual_result[transformed_feature].to_numpy()
    expected_values = X[transformed_feature].to_numpy()
    assert np.allclose(actual_values, expected_values, equal_nan=True)

    # Exactly one column name may contain transformed_feature (no derivations).
    matching_columns = [col for col in cols if transformed_feature in col]
    assert len(matching_columns) == 1
def test_transformer_transform_y_categorical(data_classification_balanced,
                                             categorical_features,
                                             numerical_features,
                                             expected_raw_mapping,
                                             feature_name):
    """Check that fit_transform_y() encodes a categorical y as expected.

    The expected codes are the raw mapping values shifted down by one, with
    NaN mapped to the next code after the largest one.
    """
    features = data_classification_balanced[0]
    labels = data_classification_balanced[1]
    combined = pd.concat([features, labels], axis=1)
    y = combined[feature_name]

    mapping = {}
    for key, raw_code in expected_raw_mapping[feature_name].items():
        mapping[key] = int(raw_code - 1)
    highest_code = max(mapping.values())
    mapping[np.nan] = highest_code + 1

    expected_result = y.replace(mapping).array

    transformer = Transformer(
        categorical_features, numerical_features, "Categorical"
    )
    actual_result = transformer.fit_transform_y(y)

    assert np.array_equal(actual_result, expected_result)
def test_transformer_transform_X_numerical(data_classification_balanced,
                                           feature_name):
    """Check that a single numerical column is transformed as expected.

    The reference result is built by hand: fill missing values with the
    median, apply a QuantileTransformer with normal output, then a
    StandardScaler.
    """
    random_state = 1
    features = data_classification_balanced[0]
    labels = data_classification_balanced[1]
    combined = pd.concat([features, labels], axis=1)

    column = combined[feature_name]
    median = column.describe()["50%"]
    imputed = column.fillna(median).to_numpy().reshape(-1, 1)

    quantile_transformer = QuantileTransformer(
        output_distribution="normal", random_state=random_state
    )
    gaussianised = quantile_transformer.fit_transform(imputed)
    expected_result = StandardScaler().fit_transform(gaussianised)

    transformer = Transformer(
        [], [feature_name], "Categorical", random_state=random_state
    )
    single_column_frame = pd.DataFrame(combined[feature_name])
    actual_result = transformer.fit_transform(single_column_frame)

    assert np.allclose(actual_result, expected_result)
def test_transformer_transform_X_categorical(data_classification_balanced,
                                             feature_name, csr_matrix_flag):
    """Check that a single categorical column is transformed as expected.

    The reference result is built by hand: fill missing values with the most
    frequent value, then one-hot encode. ``csr_matrix_flag`` says whether the
    Transformer output must be densified before comparison.
    """
    features = data_classification_balanced[0]
    labels = data_classification_balanced[1]
    combined = pd.concat([features, labels], axis=1)

    # SimpleImputer cannot handle bool dtype, so recode it as 0/1.
    combined["bool"] = combined["bool"].replace({False: 0, True: 1})

    column = combined[feature_name]
    most_frequent = column.value_counts(dropna=False).index[0]
    imputed = column.fillna(most_frequent).to_numpy().reshape(-1, 1)

    encoder = OneHotEncoder(handle_unknown="ignore")
    expected_result = encoder.fit_transform(imputed).toarray()

    transformer = Transformer([feature_name], [], "Categorical")
    actual_result = transformer.fit_transform(
        pd.DataFrame(combined[feature_name])
    )

    # For n > 2 unique values the output is a csr_matrix.
    if csr_matrix_flag:
        actual_result = actual_result.toarray()

    actual_frame = pd.DataFrame(actual_result)
    expected_frame = pd.DataFrame(expected_result)
    assert actual_frame.equals(expected_frame)
def test_transformer_transform_y_classification_pos_label_multiclass(
    data_multiclass,
    categorical_features,
    numerical_features,
    classification_pos_label,
):
    """Check the y mapping for a multiclass y with an explicit classification_pos_label.

    The chosen label is expected to map to 1 and every other class to 0,
    which reduces the multiclass target to a binary one.
    """
    y = data_multiclass[1]

    # Every class starts at 0; the label under test is then overwritten with 1.
    mapping = dict.fromkeys(("Fruits", "Sweets", "Dairy"), 0)
    mapping[classification_pos_label] = 1
    expected_result = y.replace(mapping)

    transformer = Transformer(
        categorical_features,
        numerical_features,
        "Categorical",
        classification_pos_label=classification_pos_label,
    )
    actual_result = transformer.fit_transform_y(y)

    assert np.array_equal(actual_result, expected_result)
def test_transformer_custom_transformers(categorical_features,
                                         numerical_features, seed):
    """Check that custom X and y transformers are stored on the Transformer.

    After the setters are called, ``categorical_transformers``,
    ``numerical_transformers`` and ``y_transformer`` must equal the objects
    that were passed in.
    """
    custom_categorical = [
        SimpleImputer(strategy="most_frequent"),
        OrdinalEncoder(),
    ]
    custom_numerical = [SimpleImputer(), PowerTransformer()]
    custom_y = FunctionTransformer()

    transformer = Transformer(
        categorical_features, numerical_features, "Categorical", seed
    )
    transformer.set_custom_preprocessor_X(custom_categorical, custom_numerical)
    transformer.set_custom_preprocessor_y(custom_y)

    assert transformer.categorical_transformers == custom_categorical
    assert transformer.numerical_transformers == custom_numerical
    assert transformer.y_transformer == custom_y
def test_transformer_custom_transformer_none_transformers(
        categorical_features, numerical_features, seed, custom_transformers,
        tr_type):
    """Check set_custom_preprocessor_X() when only one kind of transformers is given.

    ``tr_type`` selects which keyword receives ``custom_transformers``
    ("categorical" or "numerical"). The other kind is expected to equal the
    Transformer's corresponding ``_default_*_transformers`` attribute.

    Raises:
        ValueError: if ``tr_type`` is neither "categorical" nor "numerical".
    """
    tr = Transformer(categorical_features, numerical_features, "Categorical",
                     seed)

    if tr_type == "categorical":
        tr.set_custom_preprocessor_X(
            categorical_transformers=custom_transformers)
        expected_numerical = tr._default_numerical_transformers
        expected_categorical = custom_transformers
    elif tr_type == "numerical":
        tr.set_custom_preprocessor_X(
            numerical_transformers=custom_transformers)
        expected_numerical = custom_transformers
        expected_categorical = tr._default_categorical_transformers
    else:
        # A bare `raise` has no active exception here and would only produce
        # "RuntimeError: No active exception to reraise".
        raise ValueError(
            "tr_type should be 'categorical' or 'numerical', got {!r}".format(
                tr_type))

    assert tr.categorical_transformers == expected_categorical
    assert tr.numerical_transformers == expected_numerical