def test_make_pipeline_no_nulls(input_type, problem_type):
    """make_pipeline builds the expected component graph for null-free data.

    For every estimator compatible with ``problem_type``, checks that the
    generated pipeline has the right base class, no custom hyperparameters,
    and the expected preprocessing components for the estimator's family.
    """
    X = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "categorical": ["a", "b", "a", "c", "c"],
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W')
    })
    y = pd.Series([0, 1, 1, 0, 0])
    # BUG FIX: choose the multiclass target *before* the optional ww
    # conversion (the original overwrote the ww.DataColumn with a plain
    # pd.Series, defeating input_type == 'ww' for multiclass) and give it
    # 5 labels to match X's 5 rows (the original had only 4).
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue
        parameters = {}
        if is_time_series(problem_type):
            # Time-series pipelines require date/gap/delay configuration.
            parameters = {
                "pipeline": {
                    "date_index": "some dates",
                    "gap": 1,
                    "max_delay": 1
                },
                "Time Series Baseline Estimator": {
                    "date_index": "some dates",
                    "gap": 1,
                    "max_delay": 1
                }
            }
        pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters)
        assert isinstance(pipeline, pipeline_class)
        assert pipeline.custom_hyperparameters is None
        delayed_features = []
        if is_time_series(problem_type):
            delayed_features = [DelayedFeatureTransformer]
        if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
            # Linear models get their numeric features standardized.
            estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
        elif estimator_class.model_family == ModelFamily.CATBOOST:
            # CatBoost handles categoricals natively — no one-hot encoding.
            estimator_components = [estimator_class]
        else:
            estimator_components = [OneHotEncoder, estimator_class]
        if estimator_class.model_family == ModelFamily.ARIMA:
            assert pipeline.component_graph == [Imputer] + estimator_components
        else:
            assert pipeline.component_graph == [
                Imputer, DateTimeFeaturizer
            ] + delayed_features + estimator_components
def test_shap(estimator, problem_type, n_points_to_explain, X_y_binary, X_y_multi, X_y_regression, helper_functions):
    """calculate_shap_for_test returns correctly-shaped SHAP values.

    Classification yields a list with one dict per class, each mapping a
    feature to a list of per-point SHAP values; regression yields a single
    such dict.
    """
    if problem_type not in estimator.supported_problem_types:
        pytest.skip("Skipping because estimator and pipeline are not compatible.")
    if problem_type == ProblemTypes.BINARY:
        training_data, y = X_y_binary
        is_binary = True
    elif problem_type == ProblemTypes.MULTICLASS:
        training_data, y = X_y_multi
        is_binary = False
    else:
        training_data, y = X_y_regression
    # Some estimators do not accept n_jobs; fall back to default parameters.
    try:
        pipeline = make_pipeline(training_data, y, estimator, problem_type, parameters={estimator.name: {'n_jobs': 1}})
    except ValueError:
        pipeline = make_pipeline(training_data, y, estimator, problem_type)
    shap_values = calculate_shap_for_test(training_data, y, pipeline, n_points_to_explain)
    if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        # BUG FIX: the messages below previously claimed "binary" for any
        # classification problem and "lists" where dicts are checked for.
        assert isinstance(shap_values, list), "For classification, returned values must be a list!"
        assert all(isinstance(class_values, dict) for class_values in shap_values), "Not all list elements are dicts!"
        expected_classes = N_CLASSES_BINARY if is_binary else N_CLASSES_MULTICLASS
        assert len(shap_values) == expected_classes, "A dictionary should be returned for each class!"
        assert all(
            len(values) == N_FEATURES for values in shap_values), "A SHAP value must be computed for every feature!"
        for class_values in shap_values:
            assert all(isinstance(feature, list) for feature in class_values.values()), "Every value in the dict must be a list!"
            assert all(len(v) == n_points_to_explain for v in class_values.values()), "A SHAP value must be computed for every data point to explain!"
    elif problem_type == ProblemTypes.REGRESSION:
        assert isinstance(shap_values, dict), "For regression, returned values must be a dictionary!"
        assert len(shap_values) == N_FEATURES, "A SHAP value should be computed for every feature!"
        assert all(isinstance(feature, list) for feature in shap_values.values()), "Every value in the dict must be a list!"
        assert all(len(v) == n_points_to_explain for v in shap_values.values()), "A SHAP value must be computed for every data point to explain!"
def test_automl_pickle_generated_pipeline(mock_regression_fit, mock_regression_score, X_y_regression):
    """Every ranked pipeline from AutoML should be picklable, be the generated
    regression pipeline class, and — for the custom pipeline — keep its name,
    hyperparameters, and component graph."""
    mock_regression_score.return_value = {"R2": 1.0}

    class RegressionPipelineCustom(RegressionPipeline):
        custom_name = "Custom Regression Name"
        component_graph = ["Imputer", "Linear Regressor"]
        custom_hyperparameters = {
            "Imputer": {
                "numeric_impute_strategy": "most_frequent"
            }
        }

    X, y = X_y_regression
    expected_class = GeneratedPipelineRegression
    allowed_pipelines = [
        make_pipeline(X, y, est, problem_type='regression')
        for est in get_estimators('regression')
    ]
    allowed_pipelines.append(RegressionPipelineCustom)

    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='regression',
                          allowed_pipelines=allowed_pipelines)
    automl.search()
    automl.add_to_rankings(RegressionPipelineCustom({}))

    seen_name = False
    for _, row in automl.rankings.iterrows():
        fetched = automl.get_pipeline(row['id'])
        assert fetched.__class__ == expected_class
        # Round-tripping through pickle must succeed and yield a truthy object.
        assert pickle.loads(pickle.dumps(fetched))
        if fetched.custom_name == RegressionPipelineCustom.custom_name:
            seen_name = True
            assert fetched.custom_hyperparameters == RegressionPipelineCustom.custom_hyperparameters
            assert fetched.component_graph == RegressionPipelineCustom.component_graph
    assert seen_name