Example #1
def test_ability_to_pass_column_parameters_as_indexes():
    print("Check that we can pass indices for specifying columns")
    targetColumnIdx = 1
    targetColumnName = "survived"
    foldColumnIdx = 14
    foldColumnName = "kfold_column"

    teColumns = [13]  # 13 stands for `home.dest`
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnIdx,
                                  fold_column=foldColumnIdx,
                                  blended_avg=True,
                                  inflection_point=3,
                                  smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5,
                                                               seed=1234)

    encodingMap = targetEncoder.fit(frame=trainingFrame)
    assert encodingMap.map_keys['string'] == ["home.dest"]
    assert encodingMap.frames[0]['num_rows'] == 583
def test_target_encoding_transform_none_blending():
    print("Check none strategy with and without blending")
    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = list(map(lambda x: x+"_te", teColumns))
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    
    targetEncoderWithBlending = TargetEncoder(x=teColumns, y=targetColumnName,
                                              blended_avg=True, inflection_point=3, smoothing=1)
    
    targetEncoderWithBlending.fit(frame=trainingFrame)

    encodedFrameWithBlending = targetEncoderWithBlending.transform(frame=trainingFrame, holdout_type="none", seed=1234)

    frameWithBlendedEncodingsOnly = encodedFrameWithBlending[teColumnsEncoded]

    targetEncoderWithoutBlending = TargetEncoder(x=teColumns, y=targetColumnName,
                                                 blended_avg=False, inflection_point=3, smoothing=1)

    targetEncoderWithoutBlending.fit(frame=trainingFrame)

    encodedFrameWithoutBlending = targetEncoderWithoutBlending.transform(frame=trainingFrame, holdout_type="none", seed=1234)
    encodedFrameWithoutBlendingOnly = encodedFrameWithoutBlending[teColumnsEncoded]

    try:
        pyunit_utils.compare_frames(frameWithBlendedEncodingsOnly, encodedFrameWithoutBlendingOnly, 10, tol_time=0, tol_numeric=1e-6)
        assert False
    except AssertionError:
        print('Good, encodings are different as expected. Hopefully because of the blending.')
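For reference, the blended average exercised above combines each level's target mean with the global prior. A minimal sketch of that weighting, assuming H2O follows the usual logistic smoothing driven by inflection_point (k) and smoothing (f); this is an illustration, not the library's actual implementation:

import math

def blended_level_encoding(level_mean, prior_mean, level_count, inflection_point=3, smoothing=1):
    # Illustrative only: the weight lam moves from the global prior toward the
    # per-level mean as the level's row count grows past the inflection point.
    lam = 1.0 / (1.0 + math.exp((inflection_point - level_count) / smoothing))
    return lam * level_mean + (1.0 - lam) * prior_mean

# A rare level (2 rows) stays close to the prior; a frequent level (50 rows) keeps its own mean.
print(blended_level_encoding(0.9, 0.38, 2), blended_level_encoding(0.9, 0.38, 50))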
Example #3
def pubdev_6474_test_more_than_two_columns_to_encode_case():
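    # Regression-style check: target-encode six categorical columns with a
    # 2-fold kfold strategy and zero noise, repeated across several random seeds.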
    import pandas as pd
    import random

    runs = 10
    seeds = random.sample(range(1, 10000), runs)
    for current_seed in seeds:
        df = pd.DataFrame({
            'x_0': ['a'] * 5 + ['b'] * 5,
            'x_1': ['c'] * 9 + ['d'] * 1,
            'x_2': ['e'] * 2 + ['f'] * 8,
            'x_3': ['h'] * 4 + ['i'] * 6,
            'x_4': ['g'] * 7 + ['k'] * 3,
            'x_5': ['l'] * 1 + ['m'] * 9,
            'y_0': [1, 1, 1, 1, 0, 1, 0, 0, 0, 0]
        })

        hf = h2o.H2OFrame(df)
        hf['cv_fold_te'] = hf.kfold_column(n_folds=2, seed=current_seed)
        hf['y_0'] = hf['y_0'].asfactor()

        full_features = ['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5']
        target_encoder = TargetEncoder(x=full_features,
                                       y='y_0',
                                       fold_column='cv_fold_te')
        target_encoder.fit(hf)
        hf = target_encoder.transform(frame=hf,
                                      holdout_type='kfold',
                                      seed=current_seed,
                                      noise=0.0)
Example #4
def test_that_encoding_maps_are_accessible_as_frames():
    print("Check that we can access encoding maps as data frames")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold

    teColumns = "home.dest"
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnName,
                                  fold_column=foldColumnName,
                                  blending_avg=True,
                                  inflection_point=3,
                                  smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5,
                                                               seed=1234)

    targetEncoder.fit(frame=trainingFrame)

    encodingMapFramesKeys = targetEncoder.encoding_map_frames()

    assert len([
        value
        for value in encodingMapFramesKeys[0].columns if value in teColumns
    ]) > 0
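As a follow-up to the example above, the encoding-map frames can be inspected directly. A minimal sketch that continues the snippet above, with the caveat that the exact contents of each map frame (running target sums and counts per level) are an assumption here:

# Print the schema and a few rows of each encoding-map frame to see what it holds.
em_frames = targetEncoder.encoding_map_frames()
for em in em_frames:
    print(em.columns)
    print(em.head(5))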
Example #5
def test_blending_params_are_within_valid_range():
    print("Check validation for blending hyperparameters")
    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    try:
        TargetEncoder(x=teColumns,
                      y=targetColumnName,
                      blended_avg=True,
                      inflection_point=0,
                      smoothing=1)
        assert False
    except ValueError:
        print(
            'Good, exception was thrown as `inflection_point` is outside of the valid range'
        )

    try:
        TargetEncoder(x=teColumns,
                      y=targetColumnName,
                      blended_avg=True,
                      inflection_point=1,
                      smoothing=0)
        assert False
    except ValueError:
        print('Good, exception was thrown as expected')
def test_target_encoding_transform_kfold():
    print("Check transform method (kfold strategy) of the TargetEncoder class")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold

    teColumns = ["home.dest", "cabin", "embarked"]
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnName,
                                  fold_column=foldColumnName,
                                  blending_avg=True,
                                  inflection_point=3,
                                  smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5,
                                                               seed=1234)

    targetEncoder.fit(trainingFrame)

    encodedFrame = targetEncoder.transform(frame=trainingFrame,
                                           holdout_type="kfold",
                                           seed=1234)

    teColumnsEncoded = list(map(lambda x: x + "_te", teColumns))
    frameWithEncodingsOnly = encodedFrame[teColumnsEncoded]
    assert frameWithEncodingsOnly.ncols == 3
Example #7
def test_target_encoding_transform_loo():
    print("Check transform (loo strategy) of the TargetEncoder class")
    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnName,
                                  fold_column='',
                                  blended_avg=True,
                                  inflection_point=3,
                                  smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoder.fit(frame=trainingFrame)

    encodedFrame = targetEncoder.transform(frame=trainingFrame,
                                           holdout_type="loo",
                                           seed=1234)

    teColumnsEncoded = list(map(lambda x: x + "_te", teColumns))
    frameWithEncodingsOnly = encodedFrame[teColumnsEncoded]
    assert frameWithEncodingsOnly.ncols == 3
Example #8
def test_teColumns_parameter_as_single_column_index():
    print("Check fit method can accept non-array single column to encode")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold

    teColumns = 13  # stands for "home.dest" column
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnName,
                                  fold_column=foldColumnName,
                                  blending_avg=True,
                                  inflection_point=3,
                                  smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5,
                                                               seed=1234)

    encodingMap = targetEncoder.fit(frame=trainingFrame)
    assert encodingMap.map_keys['string'] == [trainingFrame.columns[teColumns]]
    trainingFrame = targetEncoder.transform(trainingFrame,
                                            holdout_type="kfold",
                                            seed=1234)
    assert "home.dest_te" in trainingFrame.columns
def test_target_encoding_fit_method():
    print("Check fit method of the TargetEncoder class")
    targetColumnName = "survived"
    foldColumnName = "kfold_column" # it is strange that we can't set name for generated kfold

    teColumns = ["home.dest", "cabin", "embarked"]
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  fold_column=foldColumnName, blended_avg=True, inflection_point=3, smoothing=1)
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)

    encodingMap = targetEncoder.fit(frame=trainingFrame)
    assert encodingMap.map_keys['string'] == teColumns
    assert encodingMap.frames[0]['num_rows'] == 583
def test_target_encoding_transform_none():
    print("Check transform (none strategy) of the TargetEncoder class")
    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  blended_avg=True, inflection_point=3, smoothing=1)
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoder.fit(frame=trainingFrame)

    encodedFrame = targetEncoder.transform(frame=trainingFrame, holdout_type="none", seed=1234)

    teColumnsEncoded = list(map(lambda x: x+"_te", teColumns))
    frameWithEncodingsOnly = encodedFrame[teColumnsEncoded]
    assert frameWithEncodingsOnly.ncols == 3
def titanic_with_te_kfoldstrategy(frame=None, seeds=None):
    sum_of_aucs = 0
    for current_seed in seeds:
        ds = split_data(frame, current_seed)

        targetColumnName = "survived"

        foldColumnName = "kfold_column"
        ds['train'][foldColumnName] = ds['train'].kfold_column(
            n_folds=5, seed=current_seed)

        teColumns = ["home.dest", "cabin", "embarked"]
        targetEncoder = TargetEncoder(x=teColumns,
                                      y=targetColumnName,
                                      fold_column=foldColumnName,
                                      blending_avg=True,
                                      inflection_point=3,
                                      smoothing=1)
        targetEncoder.fit(frame=ds['train'])

        encodedTrain = targetEncoder.transform(frame=ds['train'],
                                               holdout_type="kfold",
                                               seed=1234)
        encodedValid = targetEncoder.transform(frame=ds['valid'],
                                               holdout_type="none",
                                               noise=0.0)
        encodedTest = targetEncoder.transform(frame=ds['test'],
                                              holdout_type="none",
                                              noise=0.0)

        myX = [
            "pclass", "sex", "age", "sibsp", "parch", "fare", "cabin_te",
            "embarked_te", "home.dest_te"
        ]
        air_model = H2OGradientBoostingEstimator(
            ntrees=1000,
            learn_rate=0.1,
            score_tree_interval=10,
            stopping_rounds=5,
            stopping_metric="AUC",
            stopping_tolerance=0.001,
            distribution="multinomial",
            # why is AUC different for quasibinomial and multinomial?
            seed=1234)
        air_model.train(x=myX,
                        y=targetColumnName,
                        training_frame=encodedTrain,
                        validation_frame=encodedValid)
        variable_importance = air_model._model_json['output'][
            'variable_importances'].as_data_frame()
        # print(variable_importance)

        my_gbm_metrics = air_model.model_performance(encodedTest)
        auc = my_gbm_metrics.auc()
        sum_of_aucs += auc
        print("AUC with kfold for seed: " + str(current_seed) + " = " +
              str(auc))
    return sum_of_aucs / len(seeds)
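The split_data helper used above comes from elsewhere in the original test suite. A hedged sketch of what it presumably does, where the split ratios are an assumption and only the returned keys are actually required by the function above:

def split_data(frame, seed):
    # Hypothetical stand-in for the original helper: a seeded three-way split
    # returned under the 'train'/'valid'/'test' keys expected by the caller.
    train, valid, test = frame.split_frame(ratios=[0.7, 0.15], seed=seed)
    return {'train': train, 'valid': valid, 'test': test}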
def test_that_both_deprecated_and_new_parameters_are_working_together():
    print("Check that both deprecated and new parameters are working together")
    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName, blending_avg=False)
    targetEncoderNewConstructor = TargetEncoder(x=teColumns, y=targetColumnName, blended_avg=False)
    targetEncoderDefault = TargetEncoder(x=teColumns, y=targetColumnName)

    assert targetEncoder._blending == False
    assert targetEncoder._blending == targetEncoderNewConstructor._blending
    assert targetEncoderDefault._blending == True

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        TargetEncoder(x=teColumns, y=targetColumnName, blending_avg=False)
        assert len(w) == 1
        assert issubclass(w[-1].category, DeprecationWarning)
        assert "Parameter blending_avg is deprecated; use blended_avg instead" == str(w[-1].message)
Example #18
def test_that_warning_will_be_shown_if_we_add_noise_for_none_strategy():
    print(
        "Check that warning will be shown if user is trying to apply noise for holdout_type = `none` case"
    )
    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnName,
                                  blended_avg=True,
                                  inflection_point=3,
                                  smoothing=1)

    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoder.fit(trainingFrame)

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        targetEncoder.transform(frame=trainingFrame,
                                holdout_type="none",
                                noise=0.1,
                                seed=1234)
        assert len(w) == 1
        assert issubclass(w[-1].category, UserWarning)
        assert "Attempt to apply noise with holdout_type=`none` strategy" == str(
            w[-1].message)
Example #19
def test_that_error_will_be_thrown_if_user_has_not_used_fold_column():
    print(
        "Check fold_column is being specified when we are attempting to use kfold strategy"
    )
    targetColumnName = "survived"
    foldColumnName = "kfold_column"

    teColumns = ["home.dest", "cabin", "embarked"]
    # Here we are not specifying `fold_column`
    targetEncoder = TargetEncoder(x=teColumns,
                                  y=targetColumnName,
                                  blended_avg=True,
                                  inflection_point=3,
                                  smoothing=1)

    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5,
                                                               seed=1234)

    targetEncoder.fit(trainingFrame)

    try:
        # We expect an error because we are using the kfold strategy but the encoding map was created without folds
        targetEncoder.transform(frame=trainingFrame,
                                holdout_type="kfold",
                                seed=1234)
        assert False
    except ValueError:
        print('Good, exception was thrown as expected')
def test_target_encoding_default_noise_is_applied():
    print("Check that seed is applied when we use noise. Noise is set to the same values. Only seed is different.")

    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = list(map(lambda x: x+"_te", teColumns))
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  blended_avg=True, inflection_point=3, smoothing=1)

    targetEncoder.fit(frame=trainingFrame)

    seedTest = 1234
    encodedFrame = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=0.0, seed=seedTest)

    encodingsOnly = encodedFrame[teColumnsEncoded]

    # Second transformation without specifying noise. Default will be applied.
    encodedFrame2 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", seed=seedTest)
    encodingsOnly2 = encodedFrame2[teColumnsEncoded]

    # Third  transformation with zero noise
    encodedFrame3 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=0.0, seed=seedTest)
    encodingsOnly3 = encodedFrame3[teColumnsEncoded]

    # Comparing results
    # Third encoding should be equal to the first one since no noise is applied in both cases
    assert pyunit_utils.compare_frames(encodingsOnly, encodingsOnly3, 10, tol_time=0, tol_numeric=1e-6)
    # First two encodings should be different since default noise will be applied to the second transformation
    try:
        pyunit_utils.compare_frames(encodingsOnly, encodingsOnly2, 10, tol_time=0, tol_numeric=1e-6)
        assert False
    except AssertionError:
        print('Good, encodings are different as expected. Default noise is working')
def test_target_encoding_seed_is_working():
    print("Check that seed is applied when we use noise. Noise is set to the same values. Only seed is different.")
    noiseTest = 0.02

    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = list(map(lambda x: x+"_te", teColumns))
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  blending_avg=True, inflection_point=3, smoothing=1)

    targetEncoder.fit(frame=trainingFrame)

    encodedFrame = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=noiseTest, seed=1234, is_train_or_valid=True)

    encodingsOnly = encodedFrame[teColumnsEncoded]

    # Second transformation with the same seed 1234
    encodedFrame2 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=noiseTest, seed=1234, is_train_or_valid=True)
    encodingsOnly2 = encodedFrame2[teColumnsEncoded]

    # Third  transformation with another seed 1235
    encodedFrame3 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=noiseTest, seed=1235, is_train_or_valid=True)
    encodingsOnly3 = encodedFrame3[teColumnsEncoded]

    # Comparing results
    # First two encodings should be equal
    assert pyunit_utils.compare_frames(encodingsOnly, encodingsOnly2, 10, tol_time=0, tol_numeric=1e-6)
    # Third encoding should be different from the first two ones
    try:
        pyunit_utils.compare_frames(encodingsOnly, encodingsOnly3, 10, tol_time=0, tol_numeric=1e-6)
        assert False
    except AssertionError:
        print('Good, encodings are different as expected. Seed is working.')
def test_that_old_te_is_helpful_for_titanic_gbm_xval():

    #Import the titanic dataset
    titanic = h2o.import_file(
        "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
    )

    # Set response column as a factor
    titanic['survived'] = titanic['survived'].asfactor()
    response = 'survived'

    # Split the dataset into train, test
    train, test = titanic.split_frame(ratios=[.8], seed=1234)

    # Choose which columns to encode
    encoded_columns = ["home.dest", "cabin", "embarked"]

    # Set target encoding parameters
    blended_avg = True
    inflection_point = 3
    smoothing = 10
    # In general, the less data you have the more regularisation you need
    noise = 0.15

    # For k_fold strategy we need to provide fold column
    fold_column = "kfold_column"
    train[fold_column] = train.kfold_column(n_folds=5, seed=3456)

    # Train a TE model
    titanic_te = TargetEncoder(x=encoded_columns,
                               y=response,
                               fold_column=fold_column,
                               blended_avg=blended_avg,
                               inflection_point=inflection_point,
                               smoothing=smoothing)

    titanic_te.fit(frame=train)

    train_te = titanic_te.transform(frame=train,
                                    holdout_type="kfold",
                                    seed=1234,
                                    noise=noise)
    test_te = titanic_te.transform(frame=test, holdout_type="none", noise=0.0)

    gbm_with_te = H2OGradientBoostingEstimator(max_depth=6,
                                               min_rows=1,
                                               fold_column=fold_column,
                                               score_tree_interval=5,
                                               ntrees=10000,
                                               sample_rate=0.8,
                                               col_sample_rate=0.8,
                                               seed=1234,
                                               stopping_rounds=5,
                                               stopping_metric="auto",
                                               stopping_tolerance=0.001,
                                               model_id="gbm_with_te")

    # Training is based on training data with early stopping based on xval performance
    x_with_te = [
        "pclass", "sex", "age", "sibsp", "parch", "fare", "cabin_te",
        "embarked_te", "home.dest_te"
    ]
    gbm_with_te.train(x=x_with_te, y=response, training_frame=train_te)

    # To prevent overly optimistic results (overfitting to xval metrics), the metric is computed on the as-yet-unseen test split
    my_gbm_metrics_train_auc = gbm_with_te.model_performance(train_te).auc()
    print("TE train:" + str(my_gbm_metrics_train_auc))

    my_gbm_metrics = gbm_with_te.model_performance(test_te)
    auc_with_te = my_gbm_metrics.auc()

    # auc_with_te = 0.89493
    print("TE test:" + str(auc_with_te))

    gbm_baseline = H2OGradientBoostingEstimator(max_depth=6,
                                                min_rows=1,
                                                fold_column=fold_column,
                                                score_tree_interval=5,
                                                ntrees=10000,
                                                sample_rate=0.8,
                                                col_sample_rate=0.8,
                                                seed=1234,
                                                stopping_rounds=5,
                                                stopping_metric="auto",
                                                stopping_tolerance=0.001,
                                                model_id="gbm_baseline")

    x_baseline = [
        "pclass", "sex", "age", "sibsp", "parch", "fare", "cabin", "embarked",
        "home.dest"
    ]
    gbm_baseline.train(x=x_baseline, y=response, training_frame=train)
    gbm_baseline_metrics = gbm_baseline.model_performance(test)
    auc_baseline = gbm_baseline_metrics.auc()

    # auc_baseline = 0.84174
    print("Baseline test:" + str(auc_baseline))

    assert auc_with_te > auc_baseline
def test_target_encoding_parameters():
    print("Check arguments to TargetEncoder class")
    targetEncoder = TargetEncoder(x=["teColumn1"])

    assert targetEncoder._teColumns == ["teColumn1"]
Example #26
#Create a fold column
fold = te_train.kfold_column(n_folds=5, seed=1234)
fold.set_names(["fold"])

te_train = te_train.cbind(fold)
te_train["fold"] = te_train["fold"].asfactor()

#Set list of columns to encode - drop DV from variable list
x = list(te_train.columns)
x = [i for i in x if i not in [dep_var, "fold"]]

#Initialize target encoder and fit to training set
target_encoder = TargetEncoder(x=x,
                               y=dep_var,
                               fold_column="fold",
                               blended_avg=True,
                               inflection_point=3,
                               smoothing=1,
                               seed=1234)
target_encoder.fit(te_train)

#Transform training set
encoded_train = target_encoder.transform(frame=te_train,
                                         holdout_type="kfold",
                                         noise=0.2,
                                         seed=1664)

#Transform test set
encoded_test = target_encoder.transform(frame=te_test,
                                        holdout_type="none",
                                        noise=0.0,