def test_surrogate_error_tree_int_categorical(self):
    """Run the error analyzer end to end on the adult census dataset.

    The model returned by ``create_kneighbors_classifier`` is fitted on
    the training split; the test split, the training column names and the
    categorical feature list are then handed to ``run_error_analyzer``,
    which performs the actual validation.
    """
    dataset = create_adult_census_data()
    X_train, X_test, y_train, y_test, categorical_features = dataset
    model = create_kneighbors_classifier(X_train, y_train)
    feature_names = list(X_train.columns)
    run_error_analyzer(model, X_test, y_test, feature_names,
                       categorical_features)
def test_matrix_filter_adult_census_quantile_binning(
        self, string_labels, metric):
    """Check the duplicate-bin-edge warnings raised by quantile binning.

    The matrix filter is run twice with ``quantile_binning=True``:
    first on 'Capital Gain' alone, then on 'Capital Gain' together with
    'Capital Loss'. Each run must emit a ``UserWarning`` about duplicate
    bin edges for every requested feature.

    :param string_labels: Forwarded to ``create_adult_census_data``.
    :param metric: Forwarded to ``run_error_analyzer`` as ``metric``.
    """
    dataset = create_adult_census_data(string_labels)
    X_train, X_test, y_train, y_test, categorical_features = dataset
    feature_names = X_test.columns.tolist()
    model = create_kneighbors_classifier(X_train, y_train)

    def analyze(matrix_features):
        # Both runs differ only in the features fed to the matrix filter.
        run_error_analyzer(model, X_test, y_test, feature_names,
                           categorical_features,
                           model_task=ModelTask.CLASSIFICATION,
                           matrix_features=matrix_features,
                           quantile_binning=True,
                           metric=metric)

    err_capg = ("Removing duplicate bin edges for quantile binning of "
                "feature Capital Gain. There are too many duplicate "
                "values for the specified number of bins.")
    err_capl = ("Removing duplicate bin edges for quantile binning of "
                "feature Capital Loss. There are too many duplicate "
                "values for the specified number of bins.")

    # Quantile binning on a column with many zero values must print
    # the duplicate-bin-edge warning.
    with pytest.warns(UserWarning, match=err_capg):
        analyze(['Capital Gain'])

    # With two such columns, one warning per feature is expected; other
    # warnings may be recorded as well, so only a subset check is made.
    with pytest.warns(UserWarning) as warninfo:
        analyze(['Capital Gain', 'Capital Loss'])
    observed = {(record.category, record.message.args[0])
                for record in warninfo}
    expected = {(UserWarning, err_capg), (UserWarning, err_capl)}
    assert expected.issubset(observed)
def test_traverse_tree(self):
    """Validate the JSON tree produced by ``traverse`` for a surrogate model.

    A surrogate model is trained on the prediction errors of the model
    returned by ``create_kneighbors_classifier``. The structure of the
    first tree in its dumped booster is then traversed, and the result
    is checked against that structure with ``validate_traversed_tree``.
    """
    X_train, X_test, y_train, y_test, categorical_features = \
        create_adult_census_data()
    model = create_kneighbors_classifier(X_train, y_train)
    feature_names = list(X_train.columns)
    error_analyzer = ModelAnalyzer(model, X_test, y_test,
                                   feature_names,
                                   categorical_features)
    categorical_info = get_categorical_info(error_analyzer,
                                            feature_names)
    cat_ind_reindexed, categories_reindexed = categorical_info
    pred_y = model.predict(X_test)
    # Element-wise mask of the rows the model got wrong.
    diff = pred_y != y_test
    max_depth = 3
    num_leaves = 31
    surrogate = create_surrogate_model(error_analyzer,
                                       X_test,
                                       diff,
                                       max_depth,
                                       num_leaves,
                                       cat_ind_reindexed)
    model_json = surrogate._Booster.dump_model()
    tree_structure = model_json["tree_info"][0]['tree_structure']
    max_split_index = get_max_split_index(tree_structure) + 1
    # Work on a copy: adding the bookkeeping columns to X_test itself
    # would mutate the dataset the analyzer was constructed with.
    filtered_indexed_df = X_test.copy()
    filtered_indexed_df[DIFF] = diff
    filtered_indexed_df[TRUE_Y] = y_test
    filtered_indexed_df[PRED_Y] = pred_y
    json_tree = traverse(filtered_indexed_df,
                         tree_structure,
                         max_split_index,
                         (categories_reindexed,
                          cat_ind_reindexed),
                         [],
                         feature_names,
                         metric=error_analyzer.metric)
    # Index the traversed nodes by their id for validation.
    json_tree_dict = {entry['id']: entry for entry in json_tree}
    validate_traversed_tree(tree_structure, json_tree_dict,
                            max_split_index, feature_names)