def test_fast_permutation_importance_matches_sklearn_output(
        mock_supports_fast_importance, pipeline_class, parameters,
        has_minimal_dependencies):
    """The fast (sklearn-backed) permutation importance path must produce the
    same DataFrame as the slow per-column fallback path."""
    if has_minimal_dependencies and pipeline_class == LinearPipelineWithTargetEncoderAndOHE:
        pytest.skip(
            "Skipping test_fast_permutation_importance_matches_sklearn_output for target encoder cause "
            "dependency not installed.")
    X, y = load_fraud(100)

    if pipeline_class == LinearPipelineWithTextFeatures:
        X = X.set_types(logical_types={'provider': 'NaturalLanguage'})

    # Draw the seed exactly the way sklearn does internally for
    # random_state=0, so both code paths shuffle identically.
    seed_for_fast = np.random.RandomState(0).randint(np.iinfo(np.int32).max + 1)

    mock_supports_fast_importance.return_value = True
    parameters['Random Forest Classifier'] = {'n_jobs': 1}
    pipeline = pipeline_class(parameters=parameters)
    pipeline.fit(X, y)
    fast_scores = calculate_permutation_importance(
        pipeline, X, y,
        objective='Log Loss Binary',
        random_seed=seed_for_fast)

    mock_supports_fast_importance.return_value = False
    slow_scores = calculate_permutation_importance(
        pipeline, X, y,
        objective='Log Loss Binary',
        random_seed=0)

    pd.testing.assert_frame_equal(fast_scores, slow_scores)
# Example 2
def test_partial_dependence_more_categories_than_grid_resolution(logistic_regression_binary_pipeline_class):
    """A categorical feature yields one partial-dependence row per category,
    whether grid_resolution is below, equal to, or above the category count."""
    def round_dict_keys(dictionary, places=6):
        """Return a copy of *dictionary* with its float keys rounded to *places*."""
        return {round(key, places): value for key, value in dictionary.items()}

    X, y = load_fraud(1000)
    X = X.drop(columns=['datetime', 'expiration_date', 'country', 'region', 'provider'])
    pipeline = logistic_regression_binary_pipeline_class({})
    pipeline.fit(X, y)
    num_cat_features = len(set(X["currency"].to_series()))
    assert num_cat_features == 164

    expected_counts = round_dict_keys(
        {0.1432616813857269: 154, 0.1502346349971562: 1, 0.14487916687594762: 1,
         0.1573183451314127: 1, 0.11695462432136654: 1, 0.07950579532536253: 1, 0.006794444792966759: 1,
         0.17745270478939879: 1, 0.1666874487986626: 1, 0.13357573073236878: 1, 0.06778096366056789: 1})

    # Exercise grid_resolution < , == , and > the number of categories; all
    # three must produce identical per-category results.
    resolutions = (round(num_cat_features / 2),
                   round(num_cat_features),
                   round(num_cat_features * 2))
    for resolution in resolutions:
        part_dep = partial_dependence(pipeline, X, 'currency', grid_resolution=resolution)
        observed_counts = dict(part_dep["partial_dependence"].value_counts())
        assert expected_counts == round_dict_keys(observed_counts)
# Example 3
def test_fraud():
    """load_fraud returns woodwork structures by default, supports row
    sampling via n_rows, and returns pandas objects when return_pandas=True."""
    full_X, full_y = demos.load_fraud()
    assert isinstance(full_X, ww.DataTable)
    assert isinstance(full_y, ww.DataColumn)
    assert full_X.shape == (99992, 12)
    assert full_y.shape == (99992,)

    sampled_X, sampled_y = demos.load_fraud(1000)
    assert sampled_X.shape == (1000, 12)
    assert sampled_y.shape == (1000,)

    pandas_X, pandas_y = demos.load_fraud(1000, return_pandas=True)
    assert isinstance(pandas_X, pd.DataFrame)
    assert isinstance(pandas_y, pd.Series)
    assert pandas_X.shape == (1000, 12)
    assert pandas_y.shape == (1000,)
def test_partial_dependence_more_categories_than_grid_resolution(
        logistic_regression_binary_pipeline_class):
    """Partial dependence on a categorical column must ignore grid_resolution
    and emit exactly one value per category, for any resolution setting."""
    def round_dict_keys(dictionary, places=6):
        """Return *dictionary* with each float key rounded to *places* decimals."""
        rounded = {}
        for key, value in dictionary.items():
            rounded[round(key, places)] = value
        return rounded

    X, y = load_fraud(1000)
    X = X.drop(columns=[
        'datetime', 'expiration_date', 'country', 'region', 'provider'
    ])
    pipeline = logistic_regression_binary_pipeline_class({})
    pipeline.fit(X, y)
    num_cat_features = len(set(X["currency"].to_series()))
    assert num_cat_features == 164

    expected = round_dict_keys({
        0.1424060057413758: 154,
        0.006837318701999957: 1,
        0.24445532203317386: 1,
        0.15637574440029903: 1,
        0.11676042311300606: 1,
        0.13434069071819482: 1,
        0.1502609021969637: 1,
        0.14486201259150977: 1,
        0.16687406140200164: 1,
        0.06815227785761911: 1,
        0.0791821060634158: 1
    })

    def observed_counts(resolution):
        """Run partial dependence at *resolution* and tally the values."""
        part_dep = partial_dependence(pipeline,
                                      X,
                                      'currency',
                                      grid_resolution=resolution)
        return dict(part_dep["partial_dependence"].value_counts())

    # grid_resolution < number of categorical features
    assert expected == round_dict_keys(observed_counts(round(num_cat_features / 2)))

    # grid_resolution == number of categorical features
    assert expected == round_dict_keys(observed_counts(round(num_cat_features)))

    # grid_resolution > number of categorical features
    assert expected == round_dict_keys(observed_counts(round(num_cat_features * 2)))
# Example 5
def test_partial_dependence_respect_grid_resolution():
    """Numeric features honor grid_resolution exactly; categorical features
    get one row per unique category instead."""
    X, y = load_fraud(1000)

    pipeline = BinaryClassificationPipeline(component_graph=["DateTime Featurization Component", "One Hot Encoder", "Random Forest Classifier"])
    pipeline.fit(X, y)

    # Largest category cardinality among the categorical columns; neither
    # result below should accidentally match it (+1).
    max_categories = max(X.select('categorical').describe().loc["nunique"])

    numeric_dep = partial_dependence(pipeline, X, features="amount", grid_resolution=20)
    assert numeric_dep.shape[0] == 20
    assert numeric_dep.shape[0] != max_categories + 1

    categorical_dep = partial_dependence(pipeline, X, features="provider", grid_resolution=20)
    assert categorical_dep.shape[0] == X['provider'].to_series().nunique()
    assert categorical_dep.shape[0] != max_categories + 1
# Example 6
def fraud_100():
    """Return the first 100 rows of the fraud demo dataset as (X, y)."""
    sample = load_fraud(n_rows=100)
    return sample