Ejemplo n.º 1
0
def test_seed_per_observation_plus_product_of_seeding_variables(df_na):
    # test case 3: observation seed, 2 variables as seed, product of seed variables
    imputer = RandomSampleImputer(
        variables=["City", "Studies"],
        random_state=["Marks", "Age"],
        seed="observation",
        seeding_method="multiply",
    )
    df_na[["Marks", "Age"]] = df_na[["Marks", "Age"]].fillna(1)
    X_transformed = imputer.fit_transform(df_na)

    # expected output
    ref = {
        "Name":
        ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"],
        "City": [
            "London",
            "Manchester",
            "London",
            "Manchester",
            "London",
            "London",
            "Bristol",
            "Manchester",
        ],
        "Studies": [
            "Bachelor",
            "Bachelor",
            "Bachelor",
            "Masters",
            "Bachelor",
            "PhD",
            "None",
            "Masters",
        ],
        "Age": [20, 21, 19, np.nan, 23, 40, 41, 37],
        "Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6],
        "dob":
        pd.date_range("2020-02-24", periods=8, freq="T"),
    }
    ref = pd.DataFrame(ref)

    assert imputer.variables == ["City", "Studies"]
    assert imputer.random_state == ["Marks", "Age"]
    assert imputer.seed == "observation"

    pd.testing.assert_frame_equal(imputer.X_[["City", "Studies"]],
                                  df_na[["City", "Studies"]])

    pd.testing.assert_frame_equal(
        X_transformed[["City", "Studies"]],
        ref[["City", "Studies"]],
        check_dtype=False,
    )
def test_general_seed_plus_automatically_select_variables(df_na):
    # set up transformer
    imputer = RandomSampleImputer(variables=None,
                                  random_state=5,
                                  seed="general")
    X_transformed = imputer.fit_transform(df_na)

    # expected output:
    # fillna based on seed used (found experimenting on Jupyter notebook)
    ref = {
        "Name":
        ["tom", "nick", "krish", "peter", "peter", "sam", "fred", "sam"],
        "City": [
            "London",
            "Manchester",
            "London",
            "Manchester",
            "London",
            "London",
            "Bristol",
            "Manchester",
        ],
        "Studies": [
            "Bachelor",
            "Bachelor",
            "PhD",
            "Masters",
            "Bachelor",
            "PhD",
            "None",
            "Masters",
        ],
        "Age": [20, 21, 19, 23, 23, 40, 41, 37],
        "Marks": [0.9, 0.8, 0.7, 0.3, 0.3, 0.6, 0.8, 0.6],
        "dob":
        pd.date_range("2020-02-24", periods=8, freq="T"),
    }
    ref = pd.DataFrame(ref)

    # test init params
    assert imputer.variables == [
        "Name", "City", "Studies", "Age", "Marks", "dob"
    ]
    assert imputer.random_state == 5
    assert imputer.seed == "general"

    # test fit attr
    assert imputer.input_shape_ == (8, 6)
    pd.testing.assert_frame_equal(imputer.X_, df_na)

    # test transform output
    pd.testing.assert_frame_equal(X_transformed, ref, check_dtype=False)
Ejemplo n.º 3
0
def test_seed_per_observation_with_only_1_variable_as_seed(df_na):
    # test case 4: observation seed, only variable indicated as seed, method: addition
    # Note the variable used as seed should not have missing data
    imputer = RandomSampleImputer(variables=["City", "Studies"],
                                  random_state="Age",
                                  seed="observation")
    df_na["Age"] = df_na["Age"].fillna(1)
    X_transformed = imputer.fit_transform(df_na)

    # expected output
    ref = {
        "Name":
        ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"],
        "City": [
            "London",
            "Manchester",
            "Manchester",
            "Manchester",
            "London",
            "London",
            "Bristol",
            "Manchester",
        ],
        "Studies": [
            "Bachelor",
            "Bachelor",
            "Masters",
            "Masters",
            "Bachelor",
            "PhD",
            "None",
            "Masters",
        ],
        "Age": [20, 21, 19, np.nan, 23, 40, 41, 37],
        "Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6],
        "dob":
        pd.date_range("2020-02-24", periods=8, freq="T"),
    }
    ref = pd.DataFrame(ref)

    assert imputer.random_state == ["Age"]

    pd.testing.assert_frame_equal(imputer.X_[["City", "Studies"]],
                                  df_na[["City", "Studies"]])

    pd.testing.assert_frame_equal(
        X_transformed[["City", "Studies"]],
        ref[["City", "Studies"]],
        check_dtype=False,
    )
Ejemplo n.º 4
0
def test_seed_per_observation_and_multiple_variables_in_random_state(df_na):
    # test case 2: imputer seed per observation using multiple variables to determine
    # the random_state
    # Note the variables used as seed should not have missing data
    imputer = RandomSampleImputer(variables=["City", "Studies"],
                                  random_state=["Marks", "Age"],
                                  seed="observation")
    df_na[["Marks", "Age"]] = df_na[["Marks", "Age"]].fillna(1)
    X_transformed = imputer.fit_transform(df_na)

    # expected output
    ref = {
        "Name":
        ["tom", "nick", "krish", np.nan, "peter", np.nan, "fred", "sam"],
        "City": [
            "London",
            "Manchester",
            "London",
            "London",
            "London",
            "London",
            "Bristol",
            "Manchester",
        ],
        "Studies": [
            "Bachelor",
            "Bachelor",
            "PhD",
            "Bachelor",
            "Bachelor",
            "PhD",
            "None",
            "Masters",
        ],
        "Age": [20, 21, 19, np.nan, 23, 40, 41, 37],
        "Marks": [0.9, 0.8, 0.7, np.nan, 0.3, np.nan, 0.8, 0.6],
        "dob":
        pd.date_range("2020-02-24", periods=8, freq="T"),
    }
    ref = pd.DataFrame(ref)

    assert imputer.variables == ["City", "Studies"]
    assert imputer.random_state == ["Marks", "Age"]
    assert imputer.seed == "observation"
    pd.testing.assert_frame_equal(imputer.X_[["City", "Studies"]],
                                  df_na[["City", "Studies"]])

    pd.testing.assert_frame_equal(X_transformed[["City", "Studies"]],
                                  ref[["City", "Studies"]])
def test_error_if_random_state_is_string(df_na):
    with pytest.raises(ValueError):
        imputer = RandomSampleImputer(seed="observation",
                                      random_state="arbitrary")
        imputer.fit(df_na)
def test_error_if_random_state_is_none_when_seed_is_observation():
    with pytest.raises(ValueError):
        RandomSampleImputer(seed="observation", random_state=None)
def test_error_if_random_state_takes_not_permitted_value():
    with pytest.raises(ValueError):
        RandomSampleImputer(seed="general", random_state="arbitrary")
def test_error_if_seeding_method_not_permitted_value():
    with pytest.raises(ValueError):
        RandomSampleImputer(seeding_method="arbitrary")
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)
from feature_engine.wrappers import SklearnTransformerWrapper


# imputation
@parametrize_with_checks([
    MeanMedianImputer(),
    ArbitraryNumberImputer(),
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    check(estimator)


# encoding
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    RareLabelEncoder(
        tol=0.00000000001,
Ejemplo n.º 10
0
def imputeField36Field40(numpy_array):
    dataFrame = pd.DataFrame(numpy_array[:, 47:49])
    dateImputer = RandomSampleImputer()
    dateImputer.fit(dataFrame)
    convertedField39_Field40 = dateImputer.transform(dataFrame)
    numpy_array[:, 47:49] = convertedField39_Field40
Ejemplo n.º 11
0
def imputeField26Ffield35(array):
    dataFrame = pd.DataFrame(array[:, 35:41])
    stringImputer = RandomSampleImputer()
    stringImputer.fit(dataFrame)
    convertedField35_Field40 = stringImputer.transform(dataFrame)
    array[:, 35:41] = convertedField35_Field40