def test_permutation_importance_test():
    """Run ImputeNaN -> TargetMeanEncoding -> PermutationImportanceTest end to end.

    Fits the pipeline on the training frame and applies it to the test frame,
    then uses the ``_check_*`` helpers to verify row counts against the inputs,
    that the 'Name' and 'PassengerId' columns are not in the fitted output, and
    that train and test outputs agree on columns and their order.
    """
    train_df, test_df, target = _setup()
    steps = [
        dp.ImputeNaN(),
        dp.TargetMeanEncoding(),
        dp.PermutationImportanceTest(),
    ]
    pipeline = make_pipeline(*steps)

    train_out = pipeline.fit_transform(train_df, target)
    _check_equal_rows(train_df, train_out)
    for absent_col in ('Name', 'PassengerId'):
        _check_col_does_not_exist_in_df(train_out, absent_col)

    test_out = pipeline.transform(test_df)
    _check_equal_rows(test_df, test_out)
    _check_same_cols_and_order(train_out, test_out)
def test_append_cluster_target_mean():
    """Run ImputeNaN -> TargetMeanEncoding -> AppendClusterTargetMean end to end.

    After fitting on the training frame, the output is checked for an unchanged
    row count, a total of 12 columns, and the presence of a 'cluster_mean'
    column. The transformed test frame is checked for row count, the same
    12-column total, and the same columns/order as the training output.
    """
    expected_cols = 12
    train_df, test_df, target = _setup()
    steps = [
        dp.ImputeNaN(),
        dp.TargetMeanEncoding(),
        dp.AppendClusterTargetMean(),
    ]
    pipeline = make_pipeline(*steps)

    train_out = pipeline.fit_transform(train_df, target)
    _check_equal_rows(train_df, train_out)
    _check_number_of_cols_equal(train_out, expected_cols)
    _check_col_exist_in_df(train_out, 'cluster_mean')

    test_out = pipeline.transform(test_df)
    _check_equal_rows(test_df, test_out)
    _check_number_of_cols_equal(test_out, expected_cols)
    _check_same_cols_and_order(train_out, test_out)
def test_append_classification_model():
    """Exercise AppendClassificationModel with ``probability`` both True and False.

    For each setting, a RandomForestClassifier-backed AppendClassificationModel
    is placed after ImputeNaN and TargetMeanEncoding. The fitted output must
    keep the input row count and contain a 'Predicted_RandomForestClassifier'
    column; the transformed test frame must keep its row count and match the
    training output's columns and order.
    """
    train_df, test_df, target = _setup()

    for use_probability in (True, False):
        appender = dp.AppendClassificationModel(
            model=RandomForestClassifier(),
            probability=use_probability,
        )
        pipeline = make_pipeline(
            dp.ImputeNaN(),
            dp.TargetMeanEncoding(),
            appender,
        )

        train_out = pipeline.fit_transform(train_df, target)
        _check_equal_rows(train_df, train_out)
        _check_col_exist_in_df(train_out, 'Predicted_RandomForestClassifier')

        test_out = pipeline.transform(test_df)
        _check_equal_rows(test_df, test_out)
        _check_same_cols_and_order(train_out, test_out)
def test_pipelines():
    """Run a long preprocessing pipeline for every scaler/encoder combination.

    Each of the two scalers is paired with each of the six categorical
    encoders. For every pair the full pipeline is fitted on the training frame
    and applied to the test frame; both outputs must keep their input row
    counts and share the same columns in the same order.
    """
    train_df, test_df, target = _setup()

    encoders = [
        dp.OneHotEncoding(),
        dp.TargetMeanEncoding(),
        dp.CountEncoding(),
        dp.RankedCountEncoding(),
        dp.FrequencyEncoding(),
        dp.RankedTargetMeanEncoding(),
    ]
    scalers = [dp.StandardScaling(), dp.MinMaxScaling()]

    # Scaler is the outer loop, encoder the inner one; the same encoder and
    # scaler instances are reused (and refitted) across combinations.
    for scaler in scalers:
        for encoder in encoders:
            steps = [
                dp.DropColumns(drop_columns="PassengerId"),
                dp.DropNoVariance(),
                dp.GroupRareCategory(),
                dp.ClipData(),
                dp.DropHighCardinality(),
                dp.BinarizeNaN(),
                dp.CountRowNaN(),
                dp.ImputeNaN(),
                encoder,
                dp.DropNoVariance(),
                dp.DropHighCorrelation(),
                scaler,
                dp.AppendAnomalyScore(),
                dp.AppendCluster(),
                dp.AppendClusterDistance(),
                dp.AppendPrincipalComponent(),
                dp.DropHighCorrelation(),
                dp.DropLowAUC(),
            ]
            pipeline = make_pipeline(*steps)

            train_out = pipeline.fit_transform(train_df, target)
            test_out = pipeline.transform(test_df)

            _check_equal_rows(train_df, train_out)
            _check_equal_rows(test_df, test_out)
            _check_same_cols_and_order(train_out, test_out)
def test_target_mean_encoding():
    """Check TargetMeanEncoding output values and shape on train and test data.

    The mean of each encoded categorical column in the fitted training output
    is compared against a pinned reference value. The comparison uses
    ``math.isclose`` rather than ``==``: the references are 17-digit floats,
    and exact equality would fail on any last-bit difference in floating-point
    summation (different platform, NumPy/pandas version, or reduction order)
    even though the encoding itself is unchanged.

    Afterwards the ``_check_*`` helpers verify that rows and columns of the
    outputs match their inputs and that train/test outputs share columns and
    order.
    """
    import math

    X, X_test, y = _setup()
    trans = dp.TargetMeanEncoding()
    Xt = trans.fit_transform(X, y)

    # Pinned reference means of the encoded columns on the training frame.
    expected_means = {
        'Name': 0.38383838383838975,
        'Sex': 0.38383838383838054,
        'Cabin': 0.35791513764516214,
        'Ticket': 0.4306411823436723,
        'Embarked': 0.38367351680115463,
    }
    for column, expected in expected_means.items():
        actual = Xt[column].mean()
        assert math.isclose(actual, expected, rel_tol=1e-9), (
            "mean of encoded column %r is %r, expected %r"
            % (column, actual, expected)
        )

    _check_equal_rows(X, Xt)
    _check_equal_cols(X, Xt)

    Xt_test = trans.transform(X_test)
    _check_equal_rows(X_test, Xt_test)
    _check_equal_cols(X_test, Xt_test)
    _check_same_cols_and_order(Xt, Xt_test)
def test_cascaded_encoders():
    """Chain every encoder after ImputeNaN in a single pipeline.

    The pipeline is fitted on the training frame and applied to the test
    frame; both outputs must keep their input row counts and share the same
    columns in the same order.
    """
    train_df, test_df, target = _setup()

    steps = [
        dp.ImputeNaN(),
        dp.OneHotEncoding(),
        dp.TargetMeanEncoding(),
        dp.CountEncoding(),
        dp.RankedCountEncoding(),
        dp.FrequencyEncoding(),
        dp.RankedTargetMeanEncoding(),
    ]
    pipeline = make_pipeline(*steps)

    train_out = pipeline.fit_transform(train_df, target)
    test_out = pipeline.transform(test_df)

    _check_equal_rows(train_df, train_out)
    _check_equal_rows(test_df, test_out)
    _check_same_cols_and_order(train_out, test_out)
def test_append_encoder():
    """Wrap each encoder in AppendEncoder and check the resulting column counts.

    Six encoders are each expected to yield 16 columns on both the fitted
    training output and the transformed test output; OneHotEncoding is checked
    separately with an expected total of 1736 columns. In every case the row
    counts must match the inputs and train/test outputs must share columns and
    order.
    """
    train_df, test_df, target = _setup()

    def _verify(appender, expected_cols):
        # Fit on train, apply to test, and run the shared shape checks.
        train_out = appender.fit_transform(train_df, target)
        _check_equal_rows(train_df, train_out)
        _check_number_of_cols_equal(train_out, expected_cols)

        test_out = appender.transform(test_df)
        _check_equal_rows(test_df, test_out)
        _check_number_of_cols_equal(test_out, expected_cols)
        _check_same_cols_and_order(train_out, test_out)

    encoders = [
        dp.TargetMeanEncoding(),
        dp.CountEncoding(),
        dp.RankedCountEncoding(),
        dp.FrequencyEncoding(),
        dp.RankedTargetMeanEncoding(),
        dp.RankedEvaluationMetricEncoding(),
    ]
    for encoder in encoders:
        _verify(dp.AppendEncoder(encoder), 16)

    # One-hot encoding is handled on its own because its expected column
    # count differs from the other encoders.
    _verify(dp.AppendEncoder(dp.OneHotEncoding()), 1736)