Esempio n. 1
0
def test_transform_can_be_applied_to_training_frame_with_special_flag():
    """Compare ``transform`` with and without ``as_training=True`` on the training frame.

    For an encoder built with default parameters both calls must produce equal
    frames. For the encoder built with leave-one-out, blending and noise, the
    frame comparison is expected to fail.
    """
    dataset = load_dataset()

    default_te = H2OTargetEncoderEstimator()
    default_te.train(y=dataset.target, training_frame=dataset.train)
    as_training_frame = default_te.transform(dataset.train, as_training=True)
    plain_frame = default_te.transform(dataset.train)
    assert pu.compare_frames(plain_frame, as_training_frame, 0, tol_numeric=1e-5)

    # Same scenario, this time with non-default parameters.
    custom_params = dict(data_leakage_handling="leave_one_out",
                         blending=True,
                         inflection_point=5,
                         smoothing=17,
                         seed=seed,
                         noise=0.01)
    custom_te = H2OTargetEncoderEstimator(**custom_params)
    custom_te.train(y=dataset.target, training_frame=dataset.train)
    as_training_frame = custom_te.transform(dataset.train, as_training=True)
    plain_frame = custom_te.transform(dataset.train)
    try:
        assert pu.compare_frames(plain_frame, as_training_frame, 0, tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as err:
        # Only the sentinel assertion above carries this text; any other
        # AssertionError means the frames differ, which is the expected outcome.
        assert "should have raised" not in str(err)
def test_deprecated_k_param_is_alias_for_inflection_point():
    """Check that the deprecated ``k`` argument warns and matches ``inflection_point``.

    An encoder created with ``k=5`` must emit exactly one deprecation warning,
    produce predictions that differ from an encoder without blending, and match
    an encoder created with ``inflection_point=5``.
    """
    dataset = load_dataset(incl_test=True)

    baseline_te = H2OTargetEncoderEstimator(noise=0)
    baseline_te.train(y=dataset.target, training_frame=dataset.train)
    baseline_encoded = baseline_te.predict(dataset.test)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        legacy_te = H2OTargetEncoderEstimator(noise=0, k=5, blending=True)
        assert len(caught) == 1
        only_warning = caught[0]
        assert issubclass(only_warning.category, H2ODeprecationWarning)
        expected_text = "``k`` param of ``{}`` is deprecated".format(te_init_name)
        assert expected_text in str(only_warning.message)

    legacy_te.train(y=dataset.target, training_frame=dataset.train)
    legacy_encoded = legacy_te.predict(dataset.test)

    current_te = H2OTargetEncoderEstimator(noise=0, inflection_point=5, blending=True)
    current_te.train(y=dataset.target, training_frame=dataset.train)
    current_encoded = current_te.predict(dataset.test)

    try:
        pu.compare_frames(legacy_encoded, baseline_encoded, 0, tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as err:
        # The sentinel message must be absent: the comparison itself has to fail.
        assert "should have raised" not in str(err)
    assert pu.compare_frames(legacy_encoded, current_encoded, 0, tol_numeric=1e-5)
def test_deprecated_noise_level_param_is_alias_for_noise():
    """Check that the deprecated ``noise_level`` argument warns and matches ``noise``.

    An encoder created with ``noise_level=0`` must emit exactly one deprecation
    warning, produce predictions that differ from a default encoder, and match
    an encoder created with ``noise=0``.
    """
    dataset = load_dataset(incl_test=True)

    default_te = H2OTargetEncoderEstimator()
    default_te.train(y=dataset.target, training_frame=dataset.train)
    default_encoded = default_te.predict(dataset.test)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        legacy_te = H2OTargetEncoderEstimator(noise_level=0)
        assert len(caught) == 1
        only_warning = caught[0]
        assert issubclass(only_warning.category, H2ODeprecationWarning)
        expected_text = "``noise_level`` param of ``{}`` is deprecated".format(te_init_name)
        assert expected_text in str(only_warning.message)

    legacy_te.train(y=dataset.target, training_frame=dataset.train)
    legacy_encoded = legacy_te.predict(dataset.test)

    current_te = H2OTargetEncoderEstimator(noise=0)
    current_te.train(y=dataset.target, training_frame=dataset.train)
    current_encoded = current_te.predict(dataset.test)

    try:
        pu.compare_frames(legacy_encoded, default_encoded, 0, tol_numeric=1e-5)
        assert False, "should have raised"
    except AssertionError as err:
        # The sentinel message must be absent: the comparison itself has to fail.
        assert "should have raised" not in str(err)
    assert pu.compare_frames(legacy_encoded, current_encoded, 0, tol_numeric=1e-5)
def test_target_encoding_fit_method():
    """Smoke-test training, transforming and MOJO export of a target encoder.

    The test trains the encoder on the Titanic dataset with and without a fold
    column, checks the shape of the transformed frame, downloads the MOJO and
    finally re-trains using the ``x``/``y`` argument names.

    The scratch directory used for the MOJO download is removed before the test
    returns, so repeated runs do not accumulate temporary directories.
    """
    import shutil  # function-scope import: only needed for scratch-directory cleanup

    print("Check fit method of the TargetEncoder class")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"

    teColumns = ["home.dest", "cabin", "embarked"]
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5,
                                                               seed=1234)

    te = H2OTargetEncoderEstimator(k=0.7, f=0.3, data_leakage_handling="none")
    te.train(training_frame=trainingFrame,
             encoded_columns=teColumns,
             target_column=targetColumnName)
    print(te)
    transformed = te.transform(frame=trainingFrame)

    assert transformed is not None
    print(transformed.names)
    # One extra "<name>_te" column is expected for every encoded column.
    assert transformed.ncols == trainingFrame.ncols + len(teColumns)
    for te_col in teColumns:
        assert te_col + "_te" in transformed.names

    assert transformed.nrows == 1309

    # Test fold_column proper handling + kfold data leakage strategy defined
    te = H2OTargetEncoderEstimator(k=0.7, f=0.3)
    te.train(training_frame=trainingFrame,
             fold_column="pclass",
             target_column=targetColumnName,
             encoded_columns=teColumns)
    transformed = te.transform(trainingFrame,
                               data_leakage_handling="kfold",
                               seed=1234)

    te.train(training_frame=trainingFrame,
             fold_column="pclass",
             target_column=targetColumnName,
             encoded_columns=teColumns)

    assert transformed is not None
    assert transformed.nrows == 1309

    # Test MOJO download
    mojo_dir = tempfile.mkdtemp()
    try:
        mojo_file = te.download_mojo(mojo_dir)
        assert os.path.isfile(mojo_file)
        assert os.path.getsize(mojo_file) > 0
    finally:
        # mkdtemp() leaves deletion to the caller; clean up even when an
        # assertion above fails.
        shutil.rmtree(mojo_dir, ignore_errors=True)

    # Argument check
    te.train(training_frame=trainingFrame,
             fold_column="pclass",
             y=targetColumnName,
             x=teColumns)
def test_default_strategy_is_none():
    """Check that omitting ``data_leakage_handling`` matches passing ``"none"``."""
    dataset = load_dataset(incl_test=True)

    implicit_te = H2OTargetEncoderEstimator(noise=0)
    implicit_te.train(y=dataset.target, training_frame=dataset.train)
    implicit_encoded = implicit_te.predict(dataset.test)

    explicit_te = H2OTargetEncoderEstimator(data_leakage_handling="none", noise=0)
    explicit_te.train(y=dataset.target, training_frame=dataset.train)
    explicit_encoded = explicit_te.predict(dataset.test)

    assert pu.compare_frames(implicit_encoded, explicit_encoded, 0, tol_numeric=1e-5)
def test_target_encoded_frame_does_not_contain_fold_column():
    """Check that a KFold-trained encoder does not emit an encoded fold column.

    Also verifies that the model summary holds exactly two rows and that both
    encoded column names contain ``_te``.
    """
    print("Check that attached TargetEncoderModel is being used during training and scoring")
    target = "survived"
    fold_col = "kfold_column"
    columns_to_encode = ["cabin", "embarked"]

    titanic = h2o.import_file(pu.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[target] = titanic[target].asfactor()
    titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)

    encoder = H2OTargetEncoderEstimator(
        inflection_point=0.7,
        smoothing=0.3,
        data_leakage_handling="KFold",
        fold_column=fold_col,
        seed=1234,
    )
    encoder.train(training_frame=titanic, x=columns_to_encode, y=target)

    summary = encoder._model_json['output']['model_summary'].as_data_frame()
    encoded_names = summary['encoded_column_name']

    # The TwoDim table must not contain empty entries.
    assert len(summary) == 2

    has_te_suffix = encoded_names.str.contains('_te')
    assert len(summary[has_te_suffix]) == 2

    transformed = encoder.transform(titanic, as_training=True)

    # The fold column itself must not have been encoded.
    assert fold_col + "_te" not in transformed.col_names
Esempio n. 7
0
def test_transform_can_override_blending_parameters():
    """Check that blending arguments passed to ``transform`` change its output.

    Transforming with ``blending=True`` must differ from the default transform,
    and custom ``inflection_point``/``smoothing`` values must differ from
    blending with the encoder's own settings.
    """

    def expect_mismatch(left, right):
        # Passes only when the frame comparison itself raises an AssertionError.
        try:
            assert pu.compare_frames(left, right, 0, tol_numeric=1e-5)
            assert False, "should have raised"
        except AssertionError as err:
            assert "should have raised" not in str(err)

    dataset = load_dataset(incl_test=True)
    encoder = H2OTargetEncoderEstimator(noise=0)
    encoder.train(y=dataset.target, training_frame=dataset.train)

    unblended = encoder.transform(dataset.test)
    blended = encoder.transform(dataset.test, blending=True)
    expect_mismatch(unblended, blended)

    custom_blended = encoder.transform(dataset.test,
                                       blending=True,
                                       inflection_point=3,
                                       smoothing=17)
    expect_mismatch(custom_blended, blended)
def test_strategies_produce_same_results_when_applied_on_new_data():
    """Check that every leakage strategy encodes the test frame alike.

    Encoders using the default, leave-one-out and kfold strategies are trained
    on the same data; their transforms of the test frame are compared pairwise
    with a numeric tolerance of 1e-2.
    """
    dataset = load_dataset(incl_test=True, incl_foldc=True)

    # (constructor extras, train extras) for each strategy, in training order.
    setups = [
        ({}, {}),
        ({"data_leakage_handling": "leave_one_out"}, {}),
        ({"data_leakage_handling": "kfold"}, {"fold_column": 'foldc'}),
    ]
    encoded_frames = []
    for init_extras, train_extras in setups:
        encoder = H2OTargetEncoderEstimator(noise=0, **init_extras)
        encoder.train(y=dataset.target, training_frame=dataset.train, **train_extras)
        encoded_frames.append(encoder.transform(dataset.test))

    for left, right in itertools.combinations(encoded_frames, 2):
        assert pu.compare_frames(left, right, 0, tol_numeric=1e-2)
def test_fold_column_is_not_encoded():
    """Check that the fold column is kept but never encoded, for every strategy."""
    dataset = load_dataset(incl_foldc=True)

    for strategy in ("none", "kfold", "leave_one_out"):
        encoder = H2OTargetEncoderEstimator(data_leakage_handling=strategy)
        encoder.train(y=dataset.target, training_frame=dataset.train, fold_column="foldc")
        result_names = encoder.predict(dataset.train).names
        # The original fold column stays; no "_te" twin may be generated for it.
        assert "foldc" in result_names
        assert "foldc_te" not in result_names
Esempio n. 10
0
def test_transform_produces_the_same_result_as_predict_by_default():
    """Check that ``transform`` without extra arguments equals ``predict``.

    The equality is verified both for a default encoder and for one configured
    with leave-one-out, blending and noise.
    """
    dataset = load_dataset(incl_test=True)

    default_te = H2OTargetEncoderEstimator()
    default_te.train(y=dataset.target, training_frame=dataset.train)
    predicted = default_te.predict(dataset.test)
    transformed = default_te.transform(dataset.test)
    assert pu.compare_frames(predicted, transformed, 0, tol_numeric=1e-5)

    # Repeat with non-default parameters.
    custom_params = dict(data_leakage_handling="leave_one_out",
                         blending=True,
                         inflection_point=5,
                         smoothing=17,
                         seed=seed,
                         noise=0.01)
    custom_te = H2OTargetEncoderEstimator(**custom_params)
    custom_te.train(y=dataset.target, training_frame=dataset.train)
    predicted = custom_te.predict(dataset.test)
    transformed = custom_te.transform(dataset.test)
    assert pu.compare_frames(predicted, transformed, 0, tol_numeric=1e-5)
Esempio n. 11
0
def test_encoding_fails_if_there_is_no_categorical_column_to_encode():
    """Check that training on only ``int``/``real`` columns raises ``H2OResponseError``."""
    dataset = load_dataset()
    numeric_columns = {
        name
        for name, col_type in dataset.train.types.items()
        if col_type in ['int', 'real']
    }
    assert len(numeric_columns) > 0

    encoder = H2OTargetEncoderEstimator()
    try:
        encoder.train(x=numeric_columns, y=dataset.target, training_frame=dataset.train)
        assert False, "should have raised error"
    except H2OResponseError as err:
        expected_text = "Training data must have at least 2 features (incl. response)"
        assert expected_text in str(err)
def test_strategies_produce_different_results_for_training():
    """Check that leakage strategies disagree when encoding the training frame.

    Encoders using the default, leave-one-out and kfold strategies transform
    the training frame with ``as_training=True``; every pairwise comparison of
    the results is expected to fail.
    """
    dataset = load_dataset(incl_foldc=True)

    # (constructor extras, train extras) for each strategy, in training order.
    setups = [
        ({}, {}),
        ({"data_leakage_handling": "leave_one_out"}, {}),
        ({"data_leakage_handling": "kfold"}, {"fold_column": 'foldc'}),
    ]
    encoded_frames = []
    for init_extras, train_extras in setups:
        encoder = H2OTargetEncoderEstimator(noise=0, **init_extras)
        encoder.train(y=dataset.target, training_frame=dataset.train, **train_extras)
        encoded_frames.append(encoder.transform(dataset.train, as_training=True))

    for left, right in itertools.combinations(encoded_frames, 2):
        try:
            assert pu.compare_frames(left, right, 0, tol_numeric=1e-2)
            assert False, "should have raised"
        except AssertionError as err:
            # The sentinel message must be absent: the comparison itself has to fail.
            assert "should have raised" not in str(err)
def test_kfold_requires_fold_column():
    """Check that the kfold strategy rejects training without a fold column.

    Training with the fold column supplied must then succeed and allow
    predictions.
    """
    dataset = load_dataset(incl_foldc=True)
    encoder = H2OTargetEncoderEstimator(data_leakage_handling="kfold")

    expected_text = "Fold column is required when using KFold leakage handling strategy"
    try:
        encoder.train(y=dataset.target, training_frame=dataset.train)
        assert False, "should have raised"
    except Exception as err:
        # The sentinel AssertionError is caught here too; it then fails this check.
        assert expected_text in str(err)

    encoder.train(y=dataset.target, training_frame=dataset.train, fold_column="foldc")
    assert encoder.predict(dataset.train) is not None
def test_columns_to_encode_can_be_listed_in_dedicated_param():
    """Check that ``columns_to_encode`` limits encoding to the listed columns.

    Every second categorical column (excluding the target) is selected; the
    prediction frame must contain a ``_te`` column for exactly those.
    """
    dataset = load_dataset(incl_test=True)
    enum_columns = {name
                    for name, col_type in dataset.train.types.items()
                    if col_type == 'enum'}
    categoricals = enum_columns - {dataset.target}
    selected = {col for idx, col in enumerate(categoricals) if idx % 2}
    assert len(selected) > 0

    encoder = H2OTargetEncoderEstimator(columns_to_encode=list(selected))
    encoder.train(y=dataset.target, training_frame=dataset.train)
    encoded = encoder.predict(dataset.test)

    generated = [name for name in encoded.names if name.endswith("_te")]
    assert len(generated) == len(selected)
    expected = {"{}_te".format(col) for col in selected}
    assert expected == set(generated)
def test_regression_with_kfold():
    """Compare kfold-encoded training data against stored reference values.

    The first five ``sex_te`` values are checked against hard-coded numbers,
    and the whole frame is checked against ``./golden/regression_kfold.csv``.
    """
    dataset = load_dataset(incl_foldc=True)
    encoder = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    encoder.train(y=dataset.target, training_frame=dataset.train, fold_column="foldc")
    encoded = encoder.transform(dataset.train, as_training=True)
    print(encoded)

    expected_head = [45.05575, 24.68343, 45.00326, 27.65044, 45.00326]
    head_values = encoded['sex_te'].head(5).as_data_frame().values
    actual_head = head_values.reshape(-1).tolist()
    assert_allclose(actual_head, expected_head, atol=1e-5)

    # The golden file can be regenerated by writing encoded.get_frame_data()
    # to the path below.
    golden_path = "./golden/regression_kfold.csv"
    golden = h2o.import_file(golden_path)
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_original_features_can_be_automatically_removed_from_result_frame():
    """Check that ``keep_original_categorical_columns=False`` drops encoded source columns."""
    response = "survived"
    columns_to_encode = ["cabin", "embarked"]

    titanic = h2o.import_file(pu.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[response] = titanic[response].asfactor()

    encoder = H2OTargetEncoderEstimator(keep_original_categorical_columns=False)
    encoder.train(training_frame=titanic, x=columns_to_encode, y=response)

    result_names = encoder.transform(titanic).names
    for column in columns_to_encode:
        # The encoded twin must exist while the source column must be gone.
        assert "{}_te".format(column) in result_names
        assert column not in result_names
def test_original_features_are_kept_by_default():
    """Check that a default encoder keeps source columns next to their ``_te`` columns."""
    response = "survived"
    columns_to_encode = ["cabin", "embarked"]

    titanic = h2o.import_file(pu.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[response] = titanic[response].asfactor()

    encoder = H2OTargetEncoderEstimator()
    encoder.train(training_frame=titanic, x=columns_to_encode, y=response)

    result_names = encoder.transform(titanic).names
    for column in columns_to_encode:
        # Both the encoded twin and the source column must be present.
        assert "{}_te".format(column) in result_names
        assert column in result_names
def test_loo_requires_target_to_encode_training_frame():
    """Check that leave-one-out needs the response column when ``as_training=True``.

    Transforming a frame without the target as training data must fail, while
    ``predict`` on the same frame must still return a result.
    """
    dataset = load_dataset()
    encoder = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out")
    encoder.train(y=dataset.target, training_frame=dataset.train)

    without_target = h2o.assign(dataset.train.drop(dataset.target), "train_no_target")
    assert without_target is not None

    expected_text = "LeaveOneOut strategy requires a response column"
    try:
        encoder.transform(without_target, as_training=True)
        assert False, "should have raised"
    except Exception as err:
        # The sentinel AssertionError is caught here too; it then fails this check.
        assert expected_text in str(err)

    assert encoder.predict(without_target) is not None
def test_regression_with_loo():
    """Compare leave-one-out-encoded training data against stored reference values.

    The first five ``sex_te`` values are checked against hard-coded numbers,
    and the whole frame is checked against ``./golden/regression_loo.csv``.
    """
    dataset = load_dataset()
    encoder = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="leave_one_out")
    encoder.train(y=dataset.target, training_frame=dataset.train)
    encoded = encoder.transform(dataset.train, as_training=True)
    print(encoded)

    expected_head = [45.84229, 25.99816, 45.97086, 25.99816, 45.97086]
    head_values = encoded['sex_te'].head(5).as_data_frame().values
    actual_head = head_values.reshape(-1).tolist()
    assert_allclose(actual_head, expected_head, atol=1e-5)

    # The golden file can be regenerated by writing encoded.get_frame_data()
    # to the path below.
    golden_path = "./golden/regression_loo.csv"
    golden = h2o.import_file(golden_path)
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_te_model_does_nothing_if_there_is_no_categorical_column_to_encode():
    """Check that an encoder trained on only ``int``/``real`` columns adds nothing.

    ``transform`` must return a frame with the same names and key as the
    input, while ``predict`` must return the same names under a different key.
    """
    dataset = load_dataset()
    numeric_columns = {
        name
        for name, col_type in dataset.train.types.items()
        if col_type in ['int', 'real']
    }
    assert len(numeric_columns) > 0

    encoder = H2OTargetEncoderEstimator()
    encoder.train(x=numeric_columns, y=dataset.target, training_frame=dataset.train)

    transformed = encoder.transform(dataset.train)
    assert transformed.names == dataset.train.names
    assert transformed.key == dataset.train.key

    predicted = encoder.predict(dataset.train)
    assert predicted.names == dataset.train.names
    assert predicted.key != dataset.train.key
def columns_listed_in_columns_to_encode_should_not_be_ignored_in_x():
    """Train with an interaction whose first column is left out of ``x``.

    If training raises, the error text must name the missing column and the
    interaction it belongs to.
    """
    dataset = load_dataset(incl_test=True)
    enum_columns = {name
                    for name, col_type in dataset.train.types.items()
                    if col_type == 'enum'}
    categoricals = list(enum_columns - {dataset.target})
    assert len(categoricals) > 3

    ignored = categoricals[0]
    interaction = [ignored, categoricals[1]]
    encoder = H2OTargetEncoderEstimator(columns_to_encode=[interaction])
    predictors = list(set(dataset.train.names) - {ignored})

    # NOTE(review): if train() does not raise, this function finishes without
    # asserting anything. Confirm whether an explicit failure is intended here.
    try:
        encoder.train(x=predictors, y=dataset.target, training_frame=dataset.train)
    except Exception as err:
        expected_text = "Column `{}` from interaction [{}] is not categorical or is missing from the training frame".format(
            ignored, ', '.join(interaction))
        assert expected_text in str(err)
def test_all_categoricals_are_encoded_by_default():
    """Check that a default encoder encodes every categorical column except the target.

    The prediction frame must strictly contain all expected ``_te`` columns and
    all original column names.
    """
    dataset = load_dataset(incl_test=True)
    enum_columns = {name
                    for name, col_type in dataset.train.types.items()
                    if col_type == 'enum'}
    categoricals = enum_columns - {dataset.target}
    assert len(categoricals) > 0

    encoder = H2OTargetEncoderEstimator()
    encoder.train(y=dataset.target, training_frame=dataset.train)
    result_names = set(encoder.predict(dataset.test).names)

    expected_te_names = {"{}_te".format(name) for name in categoricals}
    assert expected_te_names < result_names, "some categoricals haven't been encoded"
    assert set(dataset.train.names) < result_names, "some original columns have been removed from predictions"
def test_multinomial_with_loo():
    """Compare leave-one-out-encoded training data against stored reference values.

    The first five ``sex_Class_2_te`` values are checked against hard-coded
    numbers; column names and the whole frame are checked against the
    ``multinomial_loo.csv`` golden file.
    """
    dataset = load_dataset()
    encoder = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="leave_one_out")
    encoder.train(y=dataset.target, training_frame=dataset.train)
    encoded = encoder.transform(dataset.train, as_training=True)
    print(encoded)

    expected_head = [0.22796, 0.20309, 0.22796, 0.20309, 0.22796]
    head_values = encoded['sex_Class_2_te'].head(5).as_data_frame().values
    actual_head = head_values.reshape(-1).tolist()
    assert_allclose(actual_head, expected_head, atol=1e-5)

    # The golden file can be regenerated by writing encoded.get_frame_data()
    # to the path below.
    golden_path = "{}/golden/multinomial_loo.csv".format(here)
    golden = h2o.import_file(golden_path)
    assert golden.names == encoded.names
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_multinomial_with_kfold():
    """Compare kfold-encoded training data against stored reference values.

    The first five ``sex_Class_2_te`` values are checked against hard-coded
    numbers; column names and the whole frame are checked against the
    ``multinomial_kfold.csv`` golden file.
    """
    dataset = load_dataset(incl_foldc=True)
    encoder = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    encoder.train(y=dataset.target, training_frame=dataset.train, fold_column="foldc")
    encoded = encoder.transform(dataset.train, as_training=True)
    print(encoded)

    expected_head = [0.22300, 0.20857, 0.23127, 0.19478, 0.23127]
    head_values = encoded['sex_Class_2_te'].head(5).as_data_frame().values
    actual_head = head_values.reshape(-1).tolist()
    assert_allclose(actual_head, expected_head, atol=1e-5)

    # The golden file can be regenerated by writing encoded.get_frame_data()
    # to the path below.
    golden_path = "{}/golden/multinomial_kfold.csv".format(here)
    golden = h2o.import_file(golden_path)
    assert golden.names == encoded.names
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_binomial_with_kfold():
    """Compare kfold-encoded training data against stored reference values.

    The first five ``sex_te`` values are checked against hard-coded numbers,
    and the whole frame is checked against ``./golden/binomial_kfold.csv``.
    """
    dataset = load_dataset(incl_foldc=True)
    encoder = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    encoder.train(y=dataset.target, training_frame=dataset.train, fold_column="foldc")
    encoded = encoder.transform(dataset.train, as_training=True)
    print(encoded)

    expected_head = [0.714286, 0.178771, 0.729642, 0.208696, 0.729642]
    head_values = encoded['sex_te'].head(5).as_data_frame().values
    actual_head = head_values.reshape(-1).tolist()
    assert_allclose(actual_head, expected_head, atol=1e-5)

    # The golden file can be regenerated by writing encoded.get_frame_data()
    # to the path below.
    golden_path = "./golden/binomial_kfold.csv"
    golden = h2o.import_file(golden_path)
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_binomial_with_none():
    """Compare training data encoded with the ``none`` strategy against reference values.

    The first five ``sex_te`` values are checked against hard-coded numbers,
    and the whole frame is checked against ``./golden/binomial_none.csv``.
    """
    dataset = load_dataset()
    encoder = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="none")
    encoder.train(y=dataset.target, training_frame=dataset.train)
    encoded = encoder.transform(dataset.train, as_training=True)
    print(encoded)

    expected_head = [0.72747, 0.19099, 0.72747, 0.19099, 0.72747]
    head_values = encoded['sex_te'].head(5).as_data_frame().values
    actual_head = head_values.reshape(-1).tolist()
    assert_allclose(actual_head, expected_head, atol=1e-5)

    # The golden file can be regenerated by writing encoded.get_frame_data()
    # to the path below.
    golden_path = "./golden/binomial_none.csv"
    golden = h2o.import_file(golden_path)
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_regression_with_none():
    """Compare training data encoded with the ``none`` strategy against reference values.

    The first five ``sex_te`` values are checked against hard-coded numbers;
    column names and the whole frame are checked against the
    ``regression_none.csv`` golden file.
    """
    dataset = load_dataset()
    encoder = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="none")
    encoder.train(y=dataset.target, training_frame=dataset.train)
    encoded = encoder.transform(dataset.train, as_training=True)
    print(encoded)

    expected_head = [46.19810, 26.14816, 46.19810, 26.14816, 46.19809]
    head_values = encoded['sex_te'].head(5).as_data_frame().values
    actual_head = head_values.reshape(-1).tolist()
    assert_allclose(actual_head, expected_head, atol=1e-5)

    # The golden file can be regenerated by writing encoded.get_frame_data()
    # to the path below.
    golden_path = "{}/golden/regression_none.csv".format(here)
    golden = h2o.import_file(golden_path)
    assert golden.names == encoded.names
    assert pu.compare_frames(golden, encoded, 0, tol_numeric=1e-5)
def test_use_kfold_strategy_to_train_a_model_with_cv():
    """Compare GBM test AUC for two ways of encoding the training frame.

    One GBM run uses the training frame transformed with ``as_training=True``,
    the other uses a plain ``transform``; the first AUC must be higher.
    """
    # XXX (original author's note, kept in substance):
    # The TE KFold strategy allows TE to be trained only once when a model is
    # built with CV, but it can't be applied just once on the training data.
    # Otherwise, when training CV1 for example (fold1 = cv_holdout,
    # f2-n = cv_train):
    #   - column `cat_te` for cv_holdout is obtained using fold_1, i.e. only
    #     with information collected from folds_2-n, which is what we want;
    #   - column `cat_te` for cv_train however is obtained using fold_i, and
    #     each of those contains information about fold_1: this is a data
    #     leakage from cv_holdout into cv_train;
    #   - on top of this, the current version of transform uses a global
    #     priorMean for NAs, creating an additional data leakage in CV context.
    # The priorMean issue can be fixed internally in the implementation of the
    # KFold strategy. However, for proper CCV, a deep integration with the CV
    # logic in ModelBuilder is needed (translated to Java of course):
    #   train TE using the KFold strategy on the entire train set;
    #   then during CV, for each fold:
    #     train_cv_i = te.transform(train_cv, fold=fold_i)  # so that train_cv_i is not encoded at all with encodings from other folds (they include info about current fold)
    #     test_cv_i = te.transform(test_cv, fold=fold_i)    # same
    #   finally, the final model is trained with TE applied on the whole training frame:
    #     train = te.transform(train, as_training=True)     # still using the fold column, this ensures that the final feature is equivalent to the one used in all the test_cv_i
    #     or
    #     train = te.transform(train)                       # ignoring the fold column, this way the final te feature uses the entire train set.

    dataset = load_dataset(incl_test=True, incl_foldc=True)
    encoder = H2OTargetEncoderEstimator(noise=0, data_leakage_handling="kfold")
    encoder.train(y=dataset.target, training_frame=dataset.train, fold_column="foldc")

    train_enc_cv = encoder.transform(dataset.train, as_training=True)
    # Source columns are the "_te" names with that three-character suffix stripped.
    source_columns = [name[:-3] for name in train_enc_cv.names if name.endswith("_te")]
    train_enc_cv = h2o.assign(train_enc_cv.drop(source_columns), "train_enc_cv")

    train_enc_no_cv = encoder.transform(dataset.train)
    train_enc_no_cv = h2o.assign(train_enc_no_cv.drop(source_columns), "train_enc_no_cv")

    test_enc = encoder.transform(dataset.test)
    test_enc = h2o.assign(test_enc.drop(source_columns), "test_enc")

    print(train_enc_cv)
    print(train_enc_no_cv)

    gbm = H2OGradientBoostingEstimator(seed=seed)

    gbm.train(y=dataset.target, training_frame=train_enc_cv, fold_column="foldc")
    auc_with_ccv = gbm.model_performance(test_enc).auc()
    print("AUC with CCV : %s" % auc_with_ccv)

    # The same estimator object is re-trained on the plainly transformed frame.
    gbm.train(y=dataset.target, training_frame=train_enc_no_cv, fold_column="foldc")
    auc_no_ccv = gbm.model_performance(test_enc).auc()
    print("AUC without CCV : %s" % auc_no_ccv)

    assert auc_with_ccv > auc_no_ccv
def test_transform_seed_param_raise_warning():
    """Check that passing ``seed`` to ``transform`` warns and leaves the result unchanged.

    Exactly one deprecation warning is expected, and both transforms (with and
    without the ``seed`` argument) must equal the ``predict`` output.
    """
    dataset = load_dataset(incl_test=True)
    encoder = H2OTargetEncoderEstimator(seed=42)
    encoder.train(y=dataset.target, training_frame=dataset.train)
    predicted = encoder.predict(dataset.test)

    without_seed = encoder.transform(dataset.test)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        with_seed = encoder.transform(dataset.test, seed=24)
        assert len(caught) == 1
        only_warning = caught[0]
        assert issubclass(only_warning.category, H2ODeprecationWarning)
        expected_text = "`seed` is deprecated in `transform` method and will be ignored"
        assert expected_text in str(only_warning.message)

    for candidate in (without_seed, with_seed):
        assert pu.compare_frames(predicted, candidate, 0, tol_numeric=1e-5)
def test_columns_groups_are_encoded_as_a_single_interaction():
    """Check that each column group in ``columns_to_encode`` yields one ``_te`` column.

    A single column, a pair and a triple are requested; the prediction frame
    must contain exactly three ``_te`` columns, with grouped names joined by
    ``:``.
    """
    dataset = load_dataset(incl_test=True)
    enum_columns = {name
                    for name, col_type in dataset.train.types.items()
                    if col_type == 'enum'}
    categoricals = list(enum_columns - {dataset.target})
    assert len(categoricals) > 3

    single = categoricals[0]
    pair = [categoricals[0], categoricals[1]]
    triple = [categoricals[0], categoricals[1], categoricals[2]]

    encoder = H2OTargetEncoderEstimator(columns_to_encode=[single, pair, triple])
    encoder.train(y=dataset.target, training_frame=dataset.train)
    encoded = encoder.predict(dataset.test)

    generated = [name for name in encoded.names if name.endswith("_te")]
    assert len(generated) == 3
    assert "{}_te".format(single) in generated
    assert "{}:{}_te".format(*pair) in generated
    assert "{}:{}:{}_te".format(*triple) in generated