def test_error_if_input_df_contains_categories_not_present_in_training_df(
        df_enc, df_enc_rare):
    """Check transform's reaction to categories that were not seen during fit.

    The encoder is fitted on ``df_enc`` and applied to ``df_enc_rare``
    (fixtures defined elsewhere; ``df_enc_rare`` is expected to hold
    categories missing from ``df_enc``).

    * ``errors="ignore"``: exactly one ``UserWarning`` with the expected
      message must be emitted.
    * ``errors="raise"``: a ``ValueError`` with the same message must be raised.
    """
    # test case 4: when dataset to be transformed contains categories not present
    # in training dataset
    msg = "During the encoding, NaN values were introduced in the feature(s) var_A."
    variables = ["var_A", "var_B"]

    # check for warning when errors equals 'ignore'
    encoder = OrdinalEncoder(errors="ignore")
    encoder.fit(df_enc[variables], df_enc["target"])
    # Only transform runs inside the context manager, so a warning emitted
    # while building or fitting the encoder is never counted by mistake.
    with pytest.warns(UserWarning) as warning_records:
        encoder.transform(df_enc_rare[variables])

    # check that only one warning was raised
    assert len(warning_records) == 1
    # check that the message matches
    assert warning_records[0].message.args[0] == msg

    # check for error when errors equals 'raise'
    encoder = OrdinalEncoder(errors="raise")
    encoder.fit(df_enc[variables], df_enc["target"])
    # Same reasoning: the ValueError must come from transform, not from fit.
    with pytest.raises(ValueError) as exc_info:
        encoder.transform(df_enc_rare[variables])

    # check that the error message matches
    assert str(exc_info.value) == msg
# Example #2
# 0
def test_error_if_input_df_contains_categories_not_present_in_training_df(
        df_enc, df_enc_rare):
    """Check that transform warns about categories not seen during fit.

    The encoder is fitted on ``df_enc`` with arbitrary encoding and then
    applied to ``df_enc_rare`` (fixtures defined elsewhere); the transform
    call must emit a ``UserWarning``.
    """
    # test case 4: when dataset to be transformed contains categories not present
    # in training dataset
    encoder = OrdinalEncoder(encoding_method="arbitrary")
    encoder.fit(df_enc)
    # Only transform is wrapped: a warning raised during construction or fit
    # must not be able to satisfy this check on its own.
    with pytest.warns(UserWarning):
        encoder.transform(df_enc_rare)
def feature_engineering_ordinal_encoding(X_train, y_train, X_test):
    """Ordinal-encode a fixed set of columns in the train and test sets.

    An ``OrdinalEncoder`` with ``encoding_method='ordered'`` is fitted on
    ``X_train`` / ``y_train`` only and then applied to both ``X_train`` and
    ``X_test``.

    Returns
    -------
    tuple
        ``(train_t, test_t)``: the transformed ``X_train`` and ``X_test``,
        in that order.
    """
    # 'hypertension' and 'heart_disease' are not part of the encoded set.
    columns_to_encode = [
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ]

    ordinal_encoder = OrdinalEncoder(encoding_method='ordered',
                                     variables=columns_to_encode)
    ordinal_encoder.fit(X_train, y_train)

    return (ordinal_encoder.transform(X_train),
            ordinal_encoder.transform(X_test))
def test_variables_cast_as_category(df_enc_category_dtypes):
    """Ordered encoding of ``var_A`` from the category-dtype fixture.

    Fits on ``var_A`` / ``var_B`` against ``target``, then checks both the
    encoded values and that the encoded ``var_A`` column has an int dtype.
    """
    columns = ["var_A", "var_B"]
    data = df_enc_category_dtypes.copy()

    ordinal = OrdinalEncoder(encoding_method="ordered", variables=["var_A"])
    ordinal.fit(data[columns], data["target"])
    encoded = ordinal.transform(data[columns])

    # expected output: six 1s, ten 0s, four 2s
    expected = data.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # test transform output (values only, dtype is checked separately below)
    pd.testing.assert_frame_equal(
        encoded,
        expected[columns],
        check_dtype=False,
    )
    assert encoded["var_A"].dtypes == int
# Example #5
# 0
def test_ordered_encoding_1_variable(df_enc):
    """Ordered encoding restricted to ``var_A``.

    Verifies the init parameters, the fitted attributes
    (``encoder_dict_``, ``input_shape_``) and the transformed frame.
    """
    # test case 1: 1 variable, ordered encoding
    columns = ["var_A", "var_B"]
    ordinal = OrdinalEncoder(encoding_method="ordered", variables=["var_A"])
    ordinal.fit(df_enc[columns], df_enc["target"])
    encoded = ordinal.transform(df_enc[columns])

    # expected output: six 1s, ten 0s, four 2s
    expected = df_enc.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # test init params
    assert ordinal.encoding_method == "ordered"
    assert ordinal.variables == ["var_A"]
    # test fit attr
    assert ordinal.encoder_dict_ == {"var_A": {"A": 1, "B": 0, "C": 2}}
    assert ordinal.input_shape_ == (20, 2)
    # test transform output
    pd.testing.assert_frame_equal(encoded, expected[columns])
def test_ordered_encoding_1_variable_ignore_format(df_enc_numeric):
    """Ordered encoding of ``var_A`` with ``ignore_format=True``.

    Uses the ``df_enc_numeric`` fixture and verifies the init parameters,
    the fitted attributes (``variables_``, ``encoder_dict_``,
    ``n_features_in_``) and the transformed frame.
    """
    columns = ["var_A", "var_B"]
    ordinal = OrdinalEncoder(
        encoding_method="ordered",
        variables=["var_A"],
        ignore_format=True,
    )
    ordinal.fit(df_enc_numeric[columns], df_enc_numeric["target"])
    encoded = ordinal.transform(df_enc_numeric[columns])

    # expected output: six 1s, ten 0s, four 2s
    expected = df_enc_numeric.copy()
    expected["var_A"] = [1] * 6 + [0] * 10 + [2] * 4

    # test init params
    assert ordinal.encoding_method == "ordered"
    assert ordinal.variables == ["var_A"]
    # test fit attr
    assert ordinal.variables_ == ["var_A"]
    assert ordinal.encoder_dict_ == {"var_A": {1: 1, 2: 0, 3: 2}}
    assert ordinal.n_features_in_ == 2
    # test transform output
    pd.testing.assert_frame_equal(encoded, expected[columns])
# Example #7
# 0
    # NOTE(review): this is the interior of a function whose header (and the
    # creation of `df`) is outside the visible source.

    # Rescale selected columns by fixed constants
    # (price_per_size / 10000, price / 1000000, rent / 1000).
    df.price_per_size = df.price_per_size / 10000
    df.price = df.price / 1000000
    df.rent = df.rent / 1000

    # Train/test split: features are every column except 'price', the target
    # is 'price'. 33% of the rows go to the test set.
    # NOTE(review): no random_state is passed, so the split presumably differs
    # between runs - confirm whether reproducibility is needed.
    X = df.drop(columns=['price'], axis=1)
    Y = df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

    # Encode the 'region' column with an ordered OrdinalEncoder that is fitted
    # on the training split only and then applied to both splits.
    # NOTE(review): regions_df is not used anywhere in the visible code.
    regions_df = np.asarray(X['region']).reshape(1, -1)
    enc = OrdinalEncoder(encoding_method='ordered', variables=['region'])
    enc.fit(X_train, y_train)
    X_train_enc = enc.transform(X_train)
    X_test_enc = enc.transform(X_test)

    # Fit the model on the training data
    regressor = xgboost.XGBRegressor(n_estimators=100,
                                     reg_lambda=1,
                                     gamma=0,
                                     max_depth=3)
    regressor.fit(X_train_enc, y_train)

    # make predictions for test data
    y_pred = regressor.predict(X_test_enc)

    # NOTE(review): every prediction is rounded to the nearest integer before
    # scoring; since 'price' was divided by 1000000 above, this drops all
    # sub-unit precision from a regression output - confirm this is intended.
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    mse = mean_squared_error(y_test, predictions)
def test_transform_raises_error_if_df_contains_na(df_enc, df_enc_na):
    """Check that transform raises ``ValueError`` on the ``df_enc_na`` fixture.

    The encoder is fitted on ``df_enc`` with arbitrary encoding; the fixtures
    are defined elsewhere (``df_enc_na`` is expected to contain missing values).
    """
    # test case 4: when dataset contains na, transform method
    encoder = OrdinalEncoder(encoding_method="arbitrary")
    encoder.fit(df_enc)
    # Only transform is wrapped: a ValueError raised while constructing or
    # fitting the encoder must not be able to satisfy this check.
    with pytest.raises(ValueError):
        encoder.transform(df_enc_na)
# Example #9
# 0
def test_non_fitted_error(df_enc):
    """Calling transform on an encoder that was never fitted raises NotFittedError."""
    with pytest.raises(NotFittedError):
        # default-constructed encoder, used without calling fit first
        OrdinalEncoder().transform(df_enc)