def test_fast_permutation_importance_matches_sklearn_output(
        mock_supports_fast_importance, pipeline_class, parameters, has_minimal_dependencies):
    """Check that the fast and slow permutation-importance code paths agree.

    The pipeline is fit once on a 100-row fraud sample. Importance is then
    computed twice with the 'Log Loss Binary' objective: first with the
    mocked "supports fast importance" check returning True, then with it
    returning False. The two resulting frames must be equal.

    Arguments:
        mock_supports_fast_importance: mock whose ``return_value`` toggles
            the fast/slow path (presumably injected by a patch decorator
            that is not visible here).
        pipeline_class: pipeline class under test.
        parameters (dict): pipeline parameters; note that a
            'Random Forest Classifier' entry is written into this dict.
        has_minimal_dependencies (bool): whether optional dependencies
            are missing in the current environment.
    """
    if has_minimal_dependencies:
        if pipeline_class == LinearPipelineWithTargetEncoderAndOHE:
            pytest.skip(
                "Skipping test_fast_permutation_importance_matches_sklearn_output for target encoder cause "
                "dependency not installed.")

    X, y = load_fraud(100)
    if pipeline_class == LinearPipelineWithTextFeatures:
        X = X.set_types(logical_types={'provider': 'NaturalLanguage'})

    # The fast path is seeded with the first int32 drawn from RandomState(0)
    # so that it uses the same integer sklearn derives under the hood when
    # the slow path is given seed 0.
    int32_upper_bound = np.iinfo(np.int32).max + 1
    seed_generator = np.random.RandomState(0)
    fast_path_seed = seed_generator.randint(int32_upper_bound)

    mock_supports_fast_importance.return_value = True
    parameters['Random Forest Classifier'] = {'n_jobs': 1}
    fitted_pipeline = pipeline_class(parameters=parameters)
    fitted_pipeline.fit(X, y)
    fast_scores = calculate_permutation_importance(
        fitted_pipeline, X, y, objective='Log Loss Binary', random_seed=fast_path_seed)

    mock_supports_fast_importance.return_value = False
    slow_scores = calculate_permutation_importance(
        fitted_pipeline, X, y, objective='Log Loss Binary', random_seed=0)

    pd.testing.assert_frame_equal(fast_scores, slow_scores)
def test_get_permutation_importance_invalid_objective(
        X_y_regression, linear_regression_pipeline_class):
    """A multiclass objective passed with a regression pipeline raises ValueError.

    The objective is given in lower case ("mcc multiclass") while the
    expected error message names it as 'MCC Multiclass'.
    """
    features, target = X_y_regression
    regression_pipeline = linear_regression_pipeline_class(parameters={}, random_seed=42)
    expected_error = f"Given objective 'MCC Multiclass' cannot be used with '{regression_pipeline.name}'"

    with pytest.raises(ValueError, match=expected_error):
        calculate_permutation_importance(regression_pipeline, features, target, "mcc multiclass")
def test_permutation_importance_raises_deprecated_random_state_warning(
        mock_fast_permutation_importance, X_y_binary, logistic_regression_binary_pipeline_class):
    """Passing ``random_state`` warns and the value is forwarded as ``random_seed``.

    Arguments:
        mock_fast_permutation_importance: mock standing in for the fast
            permutation-importance routine (presumably injected by a patch
            decorator that is not visible here); its first recorded call
            is inspected.
        X_y_binary: (X, y) binary classification data.
        logistic_regression_binary_pipeline_class: pipeline class to build.
    """
    features, target = X_y_binary
    binary_pipeline = logistic_regression_binary_pipeline_class(parameters={}, random_seed=2)
    expected_prefix = "Argument 'random_state' has been deprecated in favor of 'random_seed'"

    with warnings.catch_warnings(record=True) as recorded_warnings:
        # Record every warning, including ones that would normally be
        # shown only once per location.
        warnings.simplefilter("always")
        calculate_permutation_importance(
            binary_pipeline, features, target, objective="Log Loss Binary", random_state=15)

        # The deprecated value must reach the mocked routine under the new name.
        _positional, forwarded_kwargs = mock_fast_permutation_importance.call_args_list[0]
        assert forwarded_kwargs['random_seed'] == 15

        first_warning_text = str(recorded_warnings[0].message)
        assert first_warning_text.startswith(expected_prefix)
def test_get_permutation_importance_multiclass(X_y_multi, logistic_regression_multiclass_pipeline_class,
                                               multiclass_core_objectives):
    """Permutation importance is computed for every core multiclass objective.

    For each objective the result must have exactly the columns
    ["feature", "importance"] and must not be entirely null.

    Arguments:
        X_y_multi: (X, y) multiclass classification data.
        logistic_regression_multiclass_pipeline_class: pipeline class to build.
        multiclass_core_objectives: iterable of objectives to evaluate.
    """
    X, y = X_y_multi
    # Use ``random_seed`` (not the deprecated ``random_state``) so the test
    # matches the other tests in this file and does not rely on a
    # deprecated argument.
    pipeline = logistic_regression_multiclass_pipeline_class(
        parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, random_seed=42)
    pipeline.fit(X, y)

    for objective in multiclass_core_objectives:
        permutation_importance = calculate_permutation_importance(pipeline, X, y, objective)
        assert list(permutation_importance.columns) == ["feature", "importance"]
        # NOTE(review): this only fails when *every* cell is null; confirm
        # whether a stricter "no nulls at all" check was intended.
        assert not permutation_importance.isnull().all().all()
def test_get_permutation_importance_regression(linear_regression_pipeline_class, regression_core_objectives):
    """Permutation importance is computed for every core regression objective.

    Uses a tiny single-feature dataset in which the target equals the
    feature. For each objective the result must have exactly the columns
    ["feature", "importance"] and must not be entirely null.

    Arguments:
        linear_regression_pipeline_class: pipeline class to build.
        regression_core_objectives: iterable of objectives to evaluate.
    """
    alternating_values = [1, 2, 1, 2, 1, 2, 1, 2, 1, 2]
    X = pd.DataFrame(alternating_values)
    y = pd.Series(alternating_values)

    # Use ``random_seed`` (not the deprecated ``random_state``), matching how
    # this same pipeline class is constructed elsewhere in this file.
    pipeline = linear_regression_pipeline_class(
        parameters={"Linear Regressor": {"n_jobs": 1}}, random_seed=42)
    pipeline.fit(X, y)

    for objective in regression_core_objectives:
        permutation_importance = calculate_permutation_importance(pipeline, X, y, objective)
        assert list(permutation_importance.columns) == ["feature", "importance"]
        # NOTE(review): this only fails when *every* cell is null; confirm
        # whether a stricter "no nulls at all" check was intended.
        assert not permutation_importance.isnull().all().all()
def test_get_permutation_importance_binary(X_y_binary, data_type, logistic_regression_binary_pipeline_class,
                                           binary_core_objectives, make_data_type):
    """Permutation importance is computed for every core binary objective.

    X and y are first converted with ``make_data_type(data_type, ...)`` so
    the test runs against each supported input container. For each
    objective the result must have exactly the columns
    ["feature", "importance"] and must not be entirely null.

    Arguments:
        X_y_binary: (X, y) binary classification data.
        data_type: identifier of the container type to convert the data to.
        logistic_regression_binary_pipeline_class: pipeline class to build.
        binary_core_objectives: iterable of objectives to evaluate.
        make_data_type: callable ``(data_type, data) -> converted data``.
    """
    X, y = X_y_binary
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    # Use ``random_seed`` (not the deprecated ``random_state``), matching how
    # this same pipeline class is constructed elsewhere in this file.
    pipeline = logistic_regression_binary_pipeline_class(
        parameters={"Logistic Regression Classifier": {"n_jobs": 1}}, random_seed=42)
    pipeline.fit(X, y)

    for objective in binary_core_objectives:
        permutation_importance = calculate_permutation_importance(pipeline, X, y, objective)
        assert list(permutation_importance.columns) == ["feature", "importance"]
        # NOTE(review): this only fails when *every* cell is null; confirm
        # whether a stricter "no nulls at all" check was intended.
        assert not permutation_importance.isnull().all().all()
def test_get_permutation_importance_correlated_features(logistic_regression_binary_pipeline_class):
    """A feature derived from the target outranks an unrelated feature.

    The "correlated" column is ``y * 2``; the "not correlated" column is
    ``[-1, -1, -1, 0]``. After fitting, the permutation importance of
    "correlated" must be strictly greater than that of "not correlated".

    Arguments:
        logistic_regression_binary_pipeline_class: pipeline class to build.
    """
    y = pd.Series([1, 0, 1, 1])
    X = pd.DataFrame()
    X["correlated"] = y * 2
    X["not correlated"] = [-1, -1, -1, 0]
    y = y.astype(bool)

    # ``random_state`` is deprecated in favor of ``random_seed`` (see the
    # deprecation test in this file), so use the new name for both the
    # pipeline and the importance calculation.
    pipeline = logistic_regression_binary_pipeline_class(parameters={}, random_seed=42)
    pipeline.fit(X, y)
    importance = calculate_permutation_importance(
        pipeline, X, y, objective="Log Loss Binary", random_seed=0)

    assert list(importance.columns) == ["feature", "importance"]
    # NOTE(review): this only fails when *every* cell is null; confirm
    # whether a stricter "no nulls at all" check was intended.
    assert not importance.isnull().all().all()

    def _importance_of(feature_name):
        # Importance value of the first row whose "feature" matches.
        matching_rows = importance.loc[importance["feature"] == feature_name, "importance"]
        return matching_rows.iloc[0]

    correlated_importance_val = _importance_of("correlated")
    not_correlated_importance_val = _importance_of("not correlated")
    assert correlated_importance_val > not_correlated_importance_val