def pubdev_6474_test_more_than_two_columns_to_encode_case():
    """Regression test for PUBDEV-6474: kfold target encoding must handle more
    than two categorical columns at once, across several random seeds."""
    import pandas as pd
    import random

    n_runs = 10
    for seed in random.sample(range(1, 10000), n_runs):
        raw = pd.DataFrame({
            'x_0': ['a'] * 5 + ['b'] * 5,
            'x_1': ['c'] * 9 + ['d'] * 1,
            'x_2': ['e'] * 2 + ['f'] * 8,
            'x_3': ['h'] * 4 + ['i'] * 6,
            'x_4': ['g'] * 7 + ['k'] * 3,
            'x_5': ['l'] * 1 + ['m'] * 9,
            'y_0': [1, 1, 1, 1, 0, 1, 0, 0, 0, 0]
        })
        frame = h2o.H2OFrame(raw)
        frame['cv_fold_te'] = frame.kfold_column(n_folds=2, seed=seed)
        frame['y_0'] = frame['y_0'].asfactor()

        # Six columns to encode — the case PUBDEV-6474 covers.
        encoder = TargetEncoder(x=['x_0', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5'],
                                y='y_0', fold_column='cv_fold_te')
        encoder.fit(frame)
        frame = encoder.transform(frame=frame, holdout_type='kfold',
                                  seed=seed, noise=0.0)
def test_that_warning_will_be_shown_if_we_add_noise_for_none_strategy():
    """Applying noise with holdout_type='none' should emit exactly one UserWarning."""
    print(
        "Check that warning will be shown if user is trying to apply noise for holdout_type = `none` case"
    )
    target = "survived"
    columns_to_encode = ["home.dest", "cabin", "embarked"]
    encoder = TargetEncoder(x=columns_to_encode, y=target, blended_avg=True,
                            inflection_point=3, smoothing=1)
    titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[target] = titanic[target].asfactor()
    encoder.fit(titanic)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")  # make sure the warning is not suppressed
        encoder.transform(frame=titanic, holdout_type="none", noise=0.1, seed=1234)
        assert len(caught) == 1
        assert issubclass(caught[-1].category, UserWarning)
        assert "Attempt to apply noise with holdout_type=`none` strategy" == str(caught[-1].message)
def test_that_error_will_be_thrown_if_user_has_not_used_fold_column():
    """A kfold transform must raise ValueError when the encoder was fit without a fold column."""
    print(
        "Check fold_column is being specified when we are attempting to use kfold strategy"
    )
    target = "survived"
    fold_col = "kfold_column"
    columns_to_encode = ["home.dest", "cabin", "embarked"]
    # fold_column is deliberately NOT passed to the encoder here.
    encoder = TargetEncoder(x=columns_to_encode, y=target, blended_avg=True,
                            inflection_point=3, smoothing=1)
    titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[target] = titanic[target].asfactor()
    titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
    encoder.fit(titanic)
    try:
        # The encoding map was created without folds, so a kfold transform is invalid.
        encoder.transform(frame=titanic, holdout_type="kfold", seed=1234)
        assert False
    except ValueError:
        print('Good, exception was thrown as expected')
def test_that_encoding_maps_are_accessible_as_frames():
    """Encoding maps produced by fit() should be retrievable as H2O frames."""
    print("Check that we can access encoding maps as data frames")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold
    teColumns = "home.dest"
    # FIX: parameter renamed blending_avg -> blended_avg for consistency with the
    # TargetEncoder keyword used everywhere else in this file.
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  fold_column=foldColumnName, blended_avg=True,
                                  inflection_point=3, smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)
    targetEncoder.fit(frame=trainingFrame)
    encodingMapFramesKeys = targetEncoder.encoding_map_frames()
    # teColumns is a plain string here, so `value in teColumns` is a substring test:
    # at least one column of the first encoding frame must be part of "home.dest".
    assert len([
        value for value in encodingMapFramesKeys[0].columns if value in teColumns
    ]) > 0
def test_target_encoding_transform_kfold():
    """transform() with the kfold strategy should add one *_te column per encoded column."""
    print("Check transform method (kfold strategy) of the TargetEncoder class")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold
    teColumns = ["home.dest", "cabin", "embarked"]
    # FIX: parameter renamed blending_avg -> blended_avg for consistency with the
    # TargetEncoder keyword used everywhere else in this file.
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  fold_column=foldColumnName, blended_avg=True,
                                  inflection_point=3, smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)
    targetEncoder.fit(trainingFrame)
    encodedFrame = targetEncoder.transform(frame=trainingFrame, holdout_type="kfold", seed=1234)
    teColumnsEncoded = list(map(lambda x: x + "_te", teColumns))
    frameWithEncodingsOnly = encodedFrame[teColumnsEncoded]
    # One encoded column per each of the three source columns.
    assert frameWithEncodingsOnly.ncols == 3
def test_target_encoding_transform_loo():
    """Leave-one-out transform should produce one *_te column per encoded column."""
    print("Check transform (loo strategy) of the TargetEncoder class")
    target = "survived"
    encoded_columns = ["home.dest", "cabin", "embarked"]
    encoder = TargetEncoder(x=encoded_columns, y=target, fold_column='',
                            blended_avg=True, inflection_point=3, smoothing=1)
    titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[target] = titanic[target].asfactor()
    encoder.fit(frame=titanic)
    transformed = encoder.transform(frame=titanic, holdout_type="loo", seed=1234)
    encoded_names = [name + "_te" for name in encoded_columns]
    # Exactly one encoded column per source column.
    assert transformed[encoded_names].ncols == 3
def test_target_encoding_transform_none_blending():
    """Encodings computed with blending must differ from encodings computed without it."""
    print("Check none strategy with and without blending")
    targetColumnName = "survived"
    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = list(map(lambda x: x + "_te", teColumns))
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoderWithBlending = TargetEncoder(x=teColumns, y=targetColumnName,
                                              blended_avg=True, inflection_point=3, smoothing=1)
    targetEncoderWithBlending.fit(frame=trainingFrame)
    encodedFrameWithBlending = targetEncoderWithBlending.transform(frame=trainingFrame,
                                                                   holdout_type="none", seed=1234)
    frameWithBlendedEncodingsOnly = encodedFrameWithBlending[teColumnsEncoded]

    targetEncoderWithoutBlending = TargetEncoder(x=teColumns, y=targetColumnName,
                                                 blended_avg=False, inflection_point=3, smoothing=1)
    targetEncoderWithoutBlending.fit(frame=trainingFrame)
    encodedFrameWithoutBlending = targetEncoderWithoutBlending.transform(frame=trainingFrame,
                                                                         holdout_type="none", seed=1234)
    encodedFrameWithoutBlendingOnly = encodedFrameWithoutBlending[teColumnsEncoded]

    # BUGFIX: previously the failure-path `assert False` lived inside the try block,
    # so its AssertionError was swallowed by `except AssertionError` and this test
    # could never fail.  Track the comparison outcome instead and assert afterwards.
    frames_are_equal = True
    try:
        # compare_frames raises AssertionError when the frames differ.
        pyunit_utils.compare_frames(frameWithBlendedEncodingsOnly, encodedFrameWithoutBlendingOnly,
                                    10, tol_time=0, tol_numeric=1e-6)
    except AssertionError:
        frames_are_equal = False
        print('Good, encodings are different as expected. Hopefully because of the blending.')
    assert not frames_are_equal, "Encodings with and without blending should not match"
def titanic_with_te_kfoldstrategy(frame=None, seeds=None):
    """Train a GBM on kfold-target-encoded Titanic splits for every seed and
    return the mean test AUC.

    :param frame: full Titanic H2OFrame to split and encode
    :param seeds: iterable of seeds; one train/valid/test split is made per seed
    :returns: average test AUC over all seeds
    """
    sum_of_aucs = 0
    for current_seed in seeds:
        ds = split_data(frame, current_seed)
        targetColumnName = "survived"
        foldColumnName = "kfold_column"
        ds['train'][foldColumnName] = ds['train'].kfold_column(
            n_folds=5, seed=current_seed)
        teColumns = ["home.dest", "cabin", "embarked"]
        # FIX: parameter renamed blending_avg -> blended_avg for consistency with
        # the TargetEncoder keyword used everywhere else in this file.
        targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                      fold_column=foldColumnName, blended_avg=True,
                                      inflection_point=3, smoothing=1)
        targetEncoder.fit(frame=ds['train'])
        # kfold holdout for training data; leak-free noiseless encodings for valid/test.
        encodedTrain = targetEncoder.transform(frame=ds['train'], holdout_type="kfold", seed=1234)
        encodedValid = targetEncoder.transform(frame=ds['valid'], holdout_type="none", noise=0.0)
        encodedTest = targetEncoder.transform(frame=ds['test'], holdout_type="none", noise=0.0)
        myX = [
            "pclass", "sex", "age", "sibsp", "parch", "fare", "cabin_te",
            "embarked_te", "home.dest_te"
        ]
        air_model = H2OGradientBoostingEstimator(
            ntrees=1000,
            learn_rate=0.1,
            score_tree_interval=10,
            stopping_rounds=5,
            stopping_metric="AUC",
            stopping_tolerance=0.001,
            distribution="multinomial",  # why AUC is different for quasibinomial and multinomial?
            seed=1234)
        air_model.train(x=myX, y=targetColumnName,
                        training_frame=encodedTrain, validation_frame=encodedValid)
        variable_importance = air_model._model_json['output'][
            'variable_importances'].as_data_frame()
        # print(variable_importance)
        my_gbm_metrics = air_model.model_performance(encodedTest)
        auc = my_gbm_metrics.auc()
        sum_of_aucs += auc
        print("AUC with kfold for seed: " + str(current_seed) + " = " + str(auc))
    return sum_of_aucs / len(seeds)
def test_teColumns_parameter_as_single_column_index():
    """fit() should accept a single (non-list) column index for the x parameter."""
    print("Check fit method can accept non-array single column to encode")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold
    teColumns = 13  # stands for "home.dest" column
    # FIX: parameter renamed blending_avg -> blended_avg for consistency with the
    # TargetEncoder keyword used everywhere else in this file.
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  fold_column=foldColumnName, blended_avg=True,
                                  inflection_point=3, smoothing=1)
    trainingFrame = h2o.import_file(
        pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)
    encodingMap = targetEncoder.fit(frame=trainingFrame)
    # The single index must be resolved to the corresponding column name.
    assert encodingMap.map_keys['string'] == [trainingFrame.columns[teColumns]]
    trainingFrame = targetEncoder.transform(trainingFrame, holdout_type="kfold", seed=1234)
    assert "home.dest_te" in trainingFrame.columns
def test_ability_to_pass_column_parameters_as_indexes():
    """Column indices (instead of names) should be accepted for x, y and fold_column."""
    print("Check that we can pass indices for specifying columns")
    target_idx = 1
    target_name = "survived"
    fold_idx = 14
    fold_name = "kfold_column"
    columns_to_encode = [13]  # 13 stands for `home.dest`
    encoder = TargetEncoder(x=columns_to_encode, y=target_idx, fold_column=fold_idx,
                            blended_avg=True, inflection_point=3, smoothing=1)
    titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[target_name] = titanic[target_name].asfactor()
    titanic[fold_name] = titanic.kfold_column(n_folds=5, seed=1234)
    encoding_map = encoder.fit(frame=titanic)
    # Index 13 should have been resolved to its column name.
    assert encoding_map.map_keys['string'] == ["home.dest"]
    assert encoding_map.frames[0]['num_rows'] == 583
def test_target_encoding_transform_none():
    """transform() with the none strategy should add one *_te column per encoded column."""
    print("Check transform (none strategy) of the TargetEncoder class")
    target = "survived"
    encoded_columns = ["home.dest", "cabin", "embarked"]
    encoder = TargetEncoder(x=encoded_columns, y=target, blended_avg=True,
                            inflection_point=3, smoothing=1)
    titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[target] = titanic[target].asfactor()
    encoder.fit(frame=titanic)
    transformed = encoder.transform(frame=titanic, holdout_type="none", seed=1234)
    encoded_names = [name + "_te" for name in encoded_columns]
    # Exactly one encoded column per source column.
    assert transformed[encoded_names].ncols == 3
def test_target_encoding_transform_kfold():
    """transform() with the kfold strategy should add one *_te column per encoded column.

    NOTE(review): this redefines a function of the same name earlier in the file —
    looks like two test files were concatenated; confirm before merging.
    """
    print("Check transform method (kfold strategy) of the TargetEncoder class")
    target = "survived"
    fold_col = "kfold_column"  # it is strange that we can't set name for generated kfold
    encoded_columns = ["home.dest", "cabin", "embarked"]
    encoder = TargetEncoder(x=encoded_columns, y=target, fold_column=fold_col,
                            blended_avg=True, inflection_point=3, smoothing=1)
    titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[target] = titanic[target].asfactor()
    titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
    encoder.fit(titanic)
    transformed = encoder.transform(frame=titanic, holdout_type="kfold", seed=1234)
    encoded_names = [name + "_te" for name in encoded_columns]
    assert transformed[encoded_names].ncols == 3
def titanic_with_te_kfoldstrategy(frame=None, seeds=None):
    """Average the test AUC of a GBM trained on kfold-target-encoded Titanic
    splits, one split per seed.

    NOTE(review): this redefines a function of the same name earlier in the file —
    looks like two test files were concatenated; confirm before merging.
    """
    auc_total = 0
    for seed in seeds:
        ds = split_data(frame, seed)
        target = "survived"
        fold_col = "kfold_column"
        ds['train'][fold_col] = ds['train'].kfold_column(n_folds=5, seed=seed)
        encoded_columns = ["home.dest", "cabin", "embarked"]
        encoder = TargetEncoder(x=encoded_columns, y=target, fold_column=fold_col,
                                blended_avg=True, inflection_point=3, smoothing=1)
        encoder.fit(frame=ds['train'])
        # kfold holdout for training data; leak-free noiseless encodings for valid/test.
        train_enc = encoder.transform(frame=ds['train'], holdout_type="kfold", seed=1234)
        valid_enc = encoder.transform(frame=ds['valid'], holdout_type="none", noise=0.0)
        test_enc = encoder.transform(frame=ds['test'], holdout_type="none", noise=0.0)
        predictors = ["pclass", "sex", "age", "sibsp", "parch", "fare",
                      "cabin_te", "embarked_te", "home.dest_te"]
        model = H2OGradientBoostingEstimator(ntrees=1000, learn_rate=0.1,
                                             score_tree_interval=10, stopping_rounds=5,
                                             stopping_metric="AUC", stopping_tolerance=0.001,
                                             distribution="multinomial", seed=1234)
        model.train(x=predictors, y=target,
                    training_frame=train_enc, validation_frame=valid_enc)
        variable_importance = model._model_json['output']['variable_importances'].as_data_frame()
        # print(variable_importance)
        auc = model.model_performance(test_enc).auc()
        auc_total += auc
        print("AUC with kfold for seed: " + str(seed) + " = " + str(auc))
    return auc_total / len(seeds)
def test_target_encoding_default_noise_is_applied():
    """A transform without an explicit noise argument should apply default noise,
    so its encodings must differ from a noise=0.0 transform with the same seed."""
    print("Check that seed is applied when we use noise. Noise is set to the same values. Only seed is different.")
    targetColumnName = "survived"
    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = list(map(lambda x: x + "_te", teColumns))
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName, blended_avg=True,
                                  inflection_point=3, smoothing=1)
    targetEncoder.fit(frame=trainingFrame)
    seedTest = 1234
    encodedFrame = targetEncoder.transform(frame=trainingFrame, holdout_type="none",
                                           noise=0.0, seed=seedTest)
    encodingsOnly = encodedFrame[teColumnsEncoded]
    # Second transformation without specifying noise. Default will be applied.
    encodedFrame2 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", seed=seedTest)
    encodingsOnly2 = encodedFrame2[teColumnsEncoded]
    # Third transformation with zero noise
    encodedFrame3 = targetEncoder.transform(frame=trainingFrame, holdout_type="none",
                                            noise=0.0, seed=seedTest)
    encodingsOnly3 = encodedFrame3[teColumnsEncoded]

    # Third encoding should be equal to the first one since no noise is applied in both cases.
    assert pyunit_utils.compare_frames(encodingsOnly, encodingsOnly3, 10, tol_time=0, tol_numeric=1e-6)

    # First two encodings should be different since default noise is applied to the second one.
    # BUGFIX: previously the failure-path `assert False` lived inside the try block,
    # so its AssertionError was swallowed by `except AssertionError` and this check
    # could never fail.  Track the comparison outcome instead and assert afterwards.
    frames_are_equal = True
    try:
        # compare_frames raises AssertionError when the frames differ.
        pyunit_utils.compare_frames(encodingsOnly, encodingsOnly2, 10, tol_time=0, tol_numeric=1e-6)
    except AssertionError:
        frames_are_equal = False
        print('Good, encodings are different as expected. Default noise is working')
    assert not frames_are_equal, "Default noise should make the encodings differ"
def test_target_encoding_seed_is_working():
    """Same seed + same noise must reproduce identical encodings; a different seed must not."""
    print("Check that seed is applied when we use noise. Noise is set to the same values. Only seed is different.")
    noiseTest = 0.02
    targetColumnName = "survived"
    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = list(map(lambda x: x + "_te", teColumns))
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    # FIX: parameter renamed blending_avg -> blended_avg for consistency with the
    # TargetEncoder keyword used everywhere else in this file.
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName, blended_avg=True,
                                  inflection_point=3, smoothing=1)
    targetEncoder.fit(frame=trainingFrame)
    encodedFrame = targetEncoder.transform(frame=trainingFrame, holdout_type="none",
                                           noise=noiseTest, seed=1234, is_train_or_valid=True)
    encodingsOnly = encodedFrame[teColumnsEncoded]
    # Second transformation with the same seed 1234
    encodedFrame2 = targetEncoder.transform(frame=trainingFrame, holdout_type="none",
                                            noise=noiseTest, seed=1234, is_train_or_valid=True)
    encodingsOnly2 = encodedFrame2[teColumnsEncoded]
    # Third transformation with another seed 1235
    encodedFrame3 = targetEncoder.transform(frame=trainingFrame, holdout_type="none",
                                            noise=noiseTest, seed=1235, is_train_or_valid=True)
    encodingsOnly3 = encodedFrame3[teColumnsEncoded]

    # First two encodings should be equal
    assert pyunit_utils.compare_frames(encodingsOnly, encodingsOnly2, 10, tol_time=0, tol_numeric=1e-6)

    # Third encoding should be different from the first two ones.
    # BUGFIX: previously the failure-path `assert False` lived inside the try block,
    # so its AssertionError was swallowed by `except AssertionError` and this check
    # could never fail.  Track the comparison outcome instead and assert afterwards.
    frames_are_equal = True
    try:
        # compare_frames raises AssertionError when the frames differ.
        pyunit_utils.compare_frames(encodingsOnly, encodingsOnly3, 10, tol_time=0, tol_numeric=1e-6)
    except AssertionError:
        frames_are_equal = False
        print('Good, encodings are different as expected. Seed is working.')
    assert not frames_are_equal, "A different seed should produce different noisy encodings"
def test_target_encoding_fit_method():
    """fit() should build one encoding map per encoded column, keyed by column name."""
    print("Check fit method of the TargetEncoder class")
    target = "survived"
    fold_col = "kfold_column"  # it is strange that we can't set name for generated kfold
    encoded_columns = ["home.dest", "cabin", "embarked"]
    encoder = TargetEncoder(x=encoded_columns, y=target, fold_column=fold_col,
                            blended_avg=True, inflection_point=3, smoothing=1)
    titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    titanic[target] = titanic[target].asfactor()
    titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
    encoding_map = encoder.fit(frame=titanic)
    # Map keys mirror the encoded column names, in the same order.
    assert encoding_map.map_keys['string'] == encoded_columns
    assert encoding_map.frames[0]['num_rows'] == 583
def test_teColumns_parameter_as_single_column_index():
    """fit() should accept a single (non-list) column index for the x parameter.

    NOTE(review): this redefines a function of the same name earlier in the file —
    looks like two test files were concatenated; confirm before merging.
    """
    print("Check fit method can accept non-array single column to encode")
    targetColumnName = "survived"
    foldColumnName = "kfold_column"  # it is strange that we can't set name for generated kfold
    teColumns = 13  # stands for "home.dest" column
    # FIX: parameter renamed blending_avg -> blended_avg for consistency with the
    # TargetEncoder keyword used everywhere else in this file.
    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  fold_column=foldColumnName, blended_avg=True,
                                  inflection_point=3, smoothing=1)
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()
    trainingFrame[foldColumnName] = trainingFrame.kfold_column(n_folds=5, seed=1234)
    encodingMap = targetEncoder.fit(frame=trainingFrame)
    # The single index must be resolved to the corresponding column name.
    assert encodingMap.map_keys['string'] == [trainingFrame.columns[teColumns]]
    trainingFrame = targetEncoder.transform(trainingFrame, holdout_type="kfold", seed=1234)
    assert "home.dest_te" in trainingFrame.columns
def test_ability_to_pass_column_parameters_as_indexes():
    """Column indices (instead of names) should be accepted for x, y and fold_column.

    NOTE(review): this redefines a function of the same name earlier in the file —
    looks like two test files were concatenated; confirm before merging.
    """
    print("Check that we can pass indices for specifying columns")
    survived_idx = 1
    survived_name = "survived"
    kfold_idx = 14
    kfold_name = "kfold_column"
    te_column_indices = [13]  # 13 stands for `home.dest`
    encoder = TargetEncoder(x=te_column_indices, y=survived_idx, fold_column=kfold_idx,
                            blended_avg=True, inflection_point=3, smoothing=1)
    frame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    frame[survived_name] = frame[survived_name].asfactor()
    frame[kfold_name] = frame.kfold_column(n_folds=5, seed=1234)
    mapping = encoder.fit(frame=frame)
    # Index 13 should have been resolved to its column name.
    assert mapping.map_keys['string'] == ["home.dest"]
    assert mapping.frames[0]['num_rows'] == 583
def test_that_old_te_is_helpful_for_titanic_gbm_xval():
    """A GBM trained on target-encoded Titanic columns should beat the
    raw-column baseline on test AUC."""
    # Import the titanic dataset
    titanic = h2o.import_file(
        "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
    )
    # Set response column as a factor
    titanic['survived'] = titanic['survived'].asfactor()
    response = 'survived'
    # Split the dataset into train, test
    train, test = titanic.split_frame(ratios=[.8], seed=1234)
    # Choose which columns to encode
    encoded_columns = ["home.dest", "cabin", "embarked"]
    # Set target encoding parameters
    blended_avg = True
    inflection_point = 3
    smoothing = 10  # In general, the less data you have the more regularisation you need
    noise = 0.15
    # For k_fold strategy we need to provide fold column
    fold_column = "kfold_column"
    train[fold_column] = train.kfold_column(n_folds=5, seed=3456)
    # Train a TE model
    titanic_te = TargetEncoder(x=encoded_columns, y=response, fold_column=fold_column,
                               blended_avg=blended_avg, inflection_point=inflection_point,
                               smoothing=smoothing)
    titanic_te.fit(frame=train)
    # kfold holdout (noisy) for train; leak-free noiseless encoding for test.
    train_te = titanic_te.transform(frame=train, holdout_type="kfold", seed=1234, noise=noise)
    test_te = titanic_te.transform(frame=test, holdout_type="none", noise=0.0)
    gbm_with_te = H2OGradientBoostingEstimator(max_depth=6, min_rows=1, fold_column=fold_column,
                                               score_tree_interval=5, ntrees=10000,
                                               sample_rate=0.8, col_sample_rate=0.8, seed=1234,
                                               stopping_rounds=5, stopping_metric="auto",
                                               stopping_tolerance=0.001, model_id="gbm_with_te")
    # Training is based on training data with early stopping based on xval performance
    x_with_te = ["pclass", "sex", "age", "sibsp", "parch", "fare",
                 "cabin_te", "embarked_te", "home.dest_te"]
    gbm_with_te.train(x=x_with_te, y=response, training_frame=train_te)
    # To prevent overly optimistic results ( overfitting to xval metrics) metric is computed on yet unseen test split
    my_gbm_metrics_train_auc = gbm_with_te.model_performance(train_te).auc()
    print("TE train:" + str(my_gbm_metrics_train_auc))
    my_gbm_metrics = gbm_with_te.model_performance(test_te)
    auc_with_te = my_gbm_metrics.auc()  # auc_with_te = 0.89493
    print("TE test:" + str(auc_with_te))

    # Baseline model: identical GBM settings, raw categorical columns.
    gbm_baseline = H2OGradientBoostingEstimator(max_depth=6, min_rows=1, fold_column=fold_column,
                                                score_tree_interval=5, ntrees=10000,
                                                sample_rate=0.8, col_sample_rate=0.8, seed=1234,
                                                stopping_rounds=5, stopping_metric="auto",
                                                stopping_tolerance=0.001, model_id="gbm_baseline")
    x_baseline = ["pclass", "sex", "age", "sibsp", "parch", "fare",
                  "cabin", "embarked", "home.dest"]
    gbm_baseline.train(x=x_baseline, y=response, training_frame=train)
    gbm_baseline_metrics = gbm_baseline.model_performance(test)
    auc_baseline = gbm_baseline_metrics.auc()  # auc_baseline = 0.84174
    print("Baseline test:" + str(auc_baseline))
    assert auc_with_te > auc_baseline
# NOTE(review): fragment of a larger script — `te_train`, `te_test`, `fold` and
# `dep_var` are defined earlier, outside this excerpt; verify against the full file.
# Attach the fold assignment to the training set and make it categorical.
te_train = te_train.cbind(fold)
te_train["fold"] = te_train["fold"].asfactor()
#Set list of columns to encode - drop DV from variable list
x = list(te_train.columns)
x = [i for i in x if i not in [dep_var, "fold"]]
#Initialize target encoder and fit to training set
target_encoder = TargetEncoder(x=x, y=dep_var, fold_column="fold", blended_avg=True, inflection_point=3, smoothing=1, seed=1234)
target_encoder.fit(te_train)
#Transform training set
# kfold holdout with noise for the training split (presumably to limit target leakage — confirm).
encoded_train = target_encoder.transform(frame=te_train, holdout_type="kfold", noise=0.2, seed=1664)
#Transform test set
encoded_test = target_encoder.transform(frame=te_test, holdout_type="none", noise=0.0, seed=1)
#Union two sets backtogther and convert back to df
df_high_card_te = encoded_train.drop("fold", axis=1).rbind(encoded_test)