def test_empty_cohort_diabetes_regression(self, analyzer_type):
        X_train, X_test, y_train, y_test, feature_names = \
            create_diabetes_data()

        model = create_kneighbors_classifier(X_train, y_train)

        composite_filters = [{
            COMPOSITE_FILTERS: [{
                COMPOSITE_FILTERS: [{
                    ARG: [0.06],
                    COLUMN: 's1',
                    METHOD: CohortFilterMethods.METHOD_GREATER
                }, {
                    ARG: [-0.01],
                    COLUMN: 's2',
                    METHOD: CohortFilterMethods.METHOD_LESS
                }],
                OPERATION:
                CohortFilterOps.AND
            }],
            OPERATION:
            CohortFilterOps.OR
        }]
        run_error_analyzer(model,
                           X_test,
                           y_test,
                           feature_names,
                           analyzer_type,
                           composite_filters=composite_filters)
    def test_empty_cohort_cancer_classification(self, analyzer_type):
        """Run the analyzer on breast-cancer data with a nested
        composite range filter (mean radius in [20.45, 22.27] AND
        mean texture in [10.88, 14.46]) that selects an empty cohort.
        """
        X_train, X_test, y_train, y_test, feature_names, _ = \
            create_cancer_data()

        model = create_kneighbors_classifier(X_train, y_train)

        # Inner AND group of two range filters on disjoint feature bands.
        range_filters = [
            {ARG: [20.45, 22.27],
             COLUMN: 'mean radius',
             METHOD: CohortFilterMethods.METHOD_RANGE},
            {ARG: [10.88, 14.46],
             COLUMN: 'mean texture',
             METHOD: CohortFilterMethods.METHOD_RANGE},
        ]
        composite_filters = [{
            COMPOSITE_FILTERS: [{
                COMPOSITE_FILTERS: range_filters,
                OPERATION: CohortFilterOps.AND
            }],
            OPERATION: CohortFilterOps.OR
        }]
        run_error_analyzer(model, X_test, y_test, feature_names,
                           analyzer_type,
                           composite_filters=composite_filters)
    def test_surrogate_error_tree_int_categorical(self):
        """Validate the analyzer on adult census data, which carries
        integer-encoded categorical features.
        """
        X_train, X_test, y_train, y_test, categorical_features = \
            create_adult_census_data()

        model = create_kneighbors_classifier(X_train, y_train)

        feature_names = list(X_train.columns)
        run_error_analyzer(model, X_test, y_test, feature_names,
                           categorical_features)
    def test_parameters(self, metric, min_child_samples, max_depth,
                        num_leaves, analyzer_type):
        """Exercise the analyzer on iris data with explicit tree
        hyperparameters (max_depth, num_leaves, min_child_samples)
        and an explicit metric.
        """
        X_train, X_test, y_train, y_test, feature_names, _ = create_iris_data()
        model = create_kneighbors_classifier(X_train, y_train)

        # Collect the tuning knobs once so the call site stays readable.
        analyzer_kwargs = dict(max_depth=max_depth,
                               num_leaves=num_leaves,
                               min_child_samples=min_child_samples,
                               metric=metric)
        run_error_analyzer(model, X_test, y_test, feature_names,
                           analyzer_type, **analyzer_kwargs)
    # NOTE(review): removed non-Python scraper artifact ("Ejemplo n.º 5" / "0")
    # that had been pasted between test methods and broke parsing.
    def test_matrix_filter_adult_census_quantile_binning(
            self, string_labels, metric):
        """Quantile binning on columns dominated by zero values should
        emit a UserWarning about duplicate bin edges being removed, once
        per affected matrix feature.
        """
        (X_train, X_test, y_train, y_test,
         categorical_features) = create_adult_census_data(string_labels)

        model = create_kneighbors_classifier(X_train, y_train)
        model_task = ModelTask.CLASSIFICATION
        feature_names = X_test.columns.tolist()

        def analyze(matrix_features):
            # Shared invocation; only the matrix features vary per case.
            run_error_analyzer(model,
                               X_test,
                               y_test,
                               feature_names,
                               categorical_features,
                               model_task=model_task,
                               matrix_features=matrix_features,
                               quantile_binning=True,
                               metric=metric)

        # Expected warning text, parameterized on the feature name.
        warn_template = ("Removing duplicate bin edges for quantile binning "
                         "of feature {}. There are too many duplicate "
                         "values for the specified number of bins.")
        err_capg = warn_template.format('Capital Gain')
        err_capl = warn_template.format('Capital Loss')

        # Single zero-heavy column: warning must be printed for it.
        with pytest.warns(UserWarning, match=err_capg):
            analyze(['Capital Gain'])

        # Two zero-heavy columns: one warning per column must appear.
        with pytest.warns(UserWarning) as warninfo:
            analyze(['Capital Gain', 'Capital Loss'])
        observed = {(warn.category, warn.message.args[0])
                    for warn in warninfo}
        for expected_warning in ((UserWarning, err_capg),
                                 (UserWarning, err_capl)):
            assert expected_warning in observed
 def test_traverse_tree(self):
     X_train, X_test, y_train, y_test, categorical_features = \
         create_adult_census_data()
     model = create_kneighbors_classifier(X_train, y_train)
     feature_names = list(X_train.columns)
     error_analyzer = ModelAnalyzer(model, X_test, y_test,
                                    feature_names,
                                    categorical_features)
     categorical_info = get_categorical_info(error_analyzer,
                                             feature_names)
     cat_ind_reindexed, categories_reindexed = categorical_info
     pred_y = model.predict(X_test)
     diff = pred_y != y_test
     max_depth = 3
     num_leaves = 31
     surrogate = create_surrogate_model(error_analyzer,
                                        X_test,
                                        diff,
                                        max_depth,
                                        num_leaves,
                                        cat_ind_reindexed)
     model_json = surrogate._Booster.dump_model()
     tree_structure = model_json["tree_info"][0]['tree_structure']
     max_split_index = get_max_split_index(tree_structure) + 1
     filtered_indexed_df = X_test
     filtered_indexed_df[DIFF] = diff
     filtered_indexed_df[TRUE_Y] = y_test
     filtered_indexed_df[PRED_Y] = pred_y
     json_tree = traverse(filtered_indexed_df,
                          tree_structure,
                          max_split_index,
                          (categories_reindexed,
                           cat_ind_reindexed),
                          [],
                          feature_names,
                          metric=error_analyzer.metric)
     # create dictionary from json tree id to values
     json_tree_dict = {}
     for entry in json_tree:
         json_tree_dict[entry['id']] = entry
     validate_traversed_tree(tree_structure, json_tree_dict,
                             max_split_index, feature_names)