def test_submit_scoring_job_single(self):
    """Test that scoring a single pipeline using the parallel engine produces the
    same results as simply running the score_pipeline function."""
    X, y = self.X_y_binary
    pipeline = TestLRCPipeline({"Logistic Regression Classifier": {"n_jobs": 1}})
    engine = DaskEngine(client=self.client)
    objectives = [automl_data.objective]

    pipeline_future = engine.submit_training_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                 automl_config=automl_data, pipeline=pipeline)
    pipeline = pipeline_future.get_result()
    pipeline_score_future = engine.submit_scoring_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                      automl_config=automl_data, pipeline=pipeline,
                                                      objectives=objectives)
    assert isinstance(pipeline_score_future, DaskComputation)
    pipeline_score = pipeline_score_future.get_result()

    original_pipeline_score = pipeline.score(X=X, y=y, objectives=objectives)
    assert not np.isnan(pipeline_score["Log Loss Binary"])
    assert pipeline_score == original_pipeline_score
def test_infer_feature_types():
    X_dt = ww.DataTable(pd.DataFrame([[1, 2], [3, 4]]))
    pd.testing.assert_frame_equal(X_dt.to_dataframe(), infer_feature_types(X_dt).to_dataframe())

    X_dc = ww.DataColumn(pd.Series([1, 2, 3, 4]))
    pd.testing.assert_series_equal(X_dc.to_series(), infer_feature_types(X_dc).to_series())

    X_pd = pd.DataFrame({0: pd.Series([1, 2], dtype="Int64"),
                         1: pd.Series([3, 4], dtype="Int64")})
    pd.testing.assert_frame_equal(X_pd, infer_feature_types(X_pd).to_dataframe())

    X_pd = pd.Series([1, 2, 3, 4], dtype="Int64")
    pd.testing.assert_series_equal(X_pd, infer_feature_types(X_pd).to_series())

    X_list = [1, 2, 3, 4]
    X_expected = ww.DataColumn(pd.Series(X_list))
    pd.testing.assert_series_equal(X_expected.to_series(), infer_feature_types(X_list).to_series())
    assert X_list == [1, 2, 3, 4]

    X_np = np.array([1, 2, 3, 4])
    X_expected = ww.DataColumn(pd.Series(X_np))
    pd.testing.assert_series_equal(X_expected.to_series(), infer_feature_types(X_np).to_series())
    assert np.array_equal(X_np, np.array([1, 2, 3, 4]))

    X_np = np.array([[1, 2], [3, 4]])
    X_expected = ww.DataTable(pd.DataFrame(X_np))
    pd.testing.assert_frame_equal(X_expected.to_dataframe(), infer_feature_types(X_np).to_dataframe())
    assert np.array_equal(X_np, np.array([[1, 2], [3, 4]]))
def test_class_imbalance_nonnumeric_balanced(input_type):
    X = pd.DataFrame()
    y_bools_balanced = pd.Series([True, True, True, False, False])
    y_binary_balanced = pd.Series(["No", "Yes", "No", "Yes"])
    y_multiclass_balanced = pd.Series(["red", "green", "red", "red", "blue",
                                       "green", "red", "blue", "green", "red"])
    if input_type == "ww":
        X = ww.DataTable(X)
        y_bools_balanced = ww.DataColumn(y_bools_balanced)
        y_binary_balanced = ww.DataColumn(y_binary_balanced)
        y_multiclass_balanced = ww.DataColumn(y_multiclass_balanced)

    class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1)
    assert class_imbalance_check.validate(X, y_bools_balanced) == {"warnings": [], "errors": [], "actions": []}
    assert class_imbalance_check.validate(X, y_binary_balanced) == {"warnings": [], "errors": [], "actions": []}
    assert class_imbalance_check.validate(X, y_multiclass_balanced) == {"warnings": [], "errors": [], "actions": []}
def test_default_data_checks_regression(input_type):
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    X['nan_dt_col'][0] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]
    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                    metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}).to_dict()
    expected_actions_with_drop_and_impute = expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:]
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }

    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}
def score_pipelines(pipelines, engine):
    # X, y, and automl_data come from the surrounding test scope.
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_training_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                  automl_config=automl_data, pipeline=pipeline))
    pipelines = [f.get_result() for f in futures]
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_scoring_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                 automl_config=automl_data, pipeline=pipeline,
                                                 objectives=[automl_data.objective]))
    results = [f.get_result() for f in futures]
    return results
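# A minimal sketch of how a helper like score_pipelines might be driven
# (hypothetical driver code, not part of the test suite; assumes a Dask
# `client` and a list of instantiated `pipelines` built elsewhere):
#
#     engine = DaskEngine(client=client)
#     scores = score_pipelines(pipelines, engine)
#     assert len(scores) == len(pipelines)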
def test_compute_final_component_features_nonlinear(mock_en_predict, mock_rf_predict, mock_ohe,
                                                    mock_imputer, example_graph, X_y_binary):
    X, y = X_y_binary
    mock_imputer.return_value = ww.DataTable(pd.DataFrame(X))
    mock_ohe.return_value = ww.DataTable(pd.DataFrame(X))
    mock_en_predict.return_value = ww.DataColumn(pd.Series(np.ones(X.shape[0])))
    mock_rf_predict.return_value = ww.DataColumn(pd.Series(np.zeros(X.shape[0])))
    X_expected = pd.DataFrame({'Random Forest': np.zeros(X.shape[0]),
                               'Elastic Net': np.ones(X.shape[0])})
    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)

    X_t = component_graph.compute_final_component_features(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())
    assert mock_imputer.call_count == 2
    assert mock_ohe.call_count == 4
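# For reference: the `example_graph` fixture used here and in the other
# ComponentGraph tests below is assumed, based on the mocked components and the
# call counts asserted (imputer 2x, one-hot encoder 4x, predict 5x), to be a
# two-branch stacked graph along these lines. The actual fixture lives in the
# test conftest; this is an illustrative reconstruction only:
#
#     example_graph = {'Imputer': [Imputer],
#                      'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'],
#                      'OneHot_ElasticNet': [OneHotEncoder, 'Imputer.x'],
#                      'Random Forest': [RandomForestClassifier, 'OneHot_RandomForest.x'],
#                      'Elastic Net': [ElasticNetClassifier, 'OneHot_ElasticNet.x'],
#                      'Logistic Regression': [LogisticRegressionClassifier,
#                                              'Random Forest', 'Elastic Net']}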
def test_make_pipeline_text_columns(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "text": ["string one", "another",
                               "text for a column, this should be a text column!!",
                               "text string", "hello world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [Imputer, TextFeaturizer] + delayed_features + estimator_components
def __getitem__(self, key):
    selection = self.underlying_data.iloc[key]
    if isinstance(selection, pd.Series) or (ks and isinstance(selection, ks.Series)):
        col_name = selection.name
        if isinstance(self.ww_data, ww.DataTable) and set(selection.index.values) == set(self.ww_data.columns):
            # A single row of a DataTable: return the selection as a plain series.
            return selection
        if isinstance(self.ww_data, ww.DataTable):
            logical_type = self.ww_data.logical_types.get(col_name, None)
            semantic_tags = self.ww_data.semantic_tags.get(col_name, None)
        else:
            logical_type = self.ww_data.logical_type or None
            semantic_tags = self.ww_data.semantic_tags or None
        if semantic_tags is not None:
            semantic_tags = semantic_tags - {'index'} - {'time_index'}
        name = self.ww_data.name
        return ww.DataColumn(selection,
                             logical_type=logical_type,
                             semantic_tags=semantic_tags,
                             use_standard_tags=self.ww_data.use_standard_tags,
                             name=name)
    elif isinstance(selection, pd.DataFrame) or (ks and isinstance(selection, ks.DataFrame)):
        return _new_dt_including(self.ww_data, selection)
    else:
        # Scalar selection: return the singular value as-is.
        return selection
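# Illustrative behavior of the indexer above (a sketch against the legacy
# woodwork DataTable/DataColumn API this accessor targets; `dt` is a
# hypothetical example, not part of the implementation):
#
#     dt = ww.DataTable(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}))
#     dt.iloc[:, 0]   # column selection -> ww.DataColumn, keeping the logical
#                     # type and semantic tags (minus 'index'/'time_index')
#     dt.iloc[0]      # row selection -> plain pd.Series
#     dt.iloc[0:1]    # frame selection -> ww.DataTable via _new_dt_including
#     dt.iloc[0, 0]   # scalar selection -> the raw value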
def test_make_pipeline_only_text_columns(input_type, problem_type):
    X = pd.DataFrame({"text": ["string one", "the evalml team is full of wonderful people",
                               "text for a column, this should be a text column!!",
                               "text string", "hello world"],
                      "another text": ["ladidididididida", "cats are great",
                                       "text for a column, this should be a text column!!",
                                       "text string", "goodbye world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            standard_scaler = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                standard_scaler = [StandardScaler]
            assert pipeline.component_graph == [TextFeaturizer] + delayed_features + standard_scaler + [estimator_class]
def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
    # testing that the all_null column is not considered categorical
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "num": [1, 2, 3, 4, 5]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
def _imbalanced_data_X_y(problem_type, categorical_columns, size):
    """Generates a dummy classification dataset with particular amounts of class imbalance
    and categorical input columns. For the targets, we maintain a 1:5 (i.e. 0.2) minority:majority
    class ratio. We only generate the minimum amount of data needed for X to set the logical_types,
    so the lengths of X and y will differ.

    Arguments:
        problem_type (str): Either 'binary' or 'multiclass'.
        categorical_columns (str): How many categorical columns to use. Either 'all', 'some', or 'none'.
        size (str): Either 'large' or 'small'. 'large' returns a dataset of size 21,000,
            while 'small' returns a size of 4,200.
    """
    multiplier = 5 if size == 'large' else 1
    col_names = [f"col_{i}" for i in range(100)]
    # generate X to be all int values
    X_dict = {col_name: [i % (j + 1) for i in range(1, 100)]
              for j, col_name in enumerate(col_names)}
    X = pd.DataFrame(X_dict)
    if categorical_columns == 'all':
        X_ww = ww.DataTable(X, logical_types={col_name: "Categorical" for col_name in col_names})
    elif categorical_columns == 'some':
        X_ww = ww.DataTable(X, logical_types={col_name: "Categorical"
                                              for col_name in col_names[:len(col_names) // 2]})
    else:
        X_ww = ww.DataTable(X)
    if problem_type == 'binary':
        targets = [0] * 3500 + [1] * 700
    else:
        targets = [0] * 3000 + [1] * 600 + [2] * 600
    targets *= multiplier
    y_ww = ww.DataColumn(pd.Series(targets))
    return X_ww, y_ww
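# Hypothetical usage of the fixture helper above: a small binary dataset where
# class 1 is the 1:5 minority and every input column is typed as Categorical.
# (These argument values are illustrative; the real tests parametrize over them.)
#
#     X_ww, y_ww = _imbalanced_data_X_y('binary', 'all', 'small')
#     assert len(y_ww.to_series()) == 4200  # 3500 majority + 700 minority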
def test_delay_feature_transformer_supports_custom_index(encode_X_as_str, encode_y_as_str,
                                                         use_woodwork, delayed_features_data):
    X, y = delayed_features_data
    X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str)
    X.index = pd.RangeIndex(50, 81)
    X_answer.index = pd.RangeIndex(50, 81)
    y.index = pd.RangeIndex(50, 81)
    y_answer.index = pd.RangeIndex(50, 81)
    answer = pd.DataFrame({"feature": X.feature,
                           "feature_delay_1": X_answer.feature.shift(1),
                           "feature_delay_2": X_answer.feature.shift(2),
                           "feature_delay_3": X_answer.feature.shift(3),
                           "target_delay_0": y_answer,
                           "target_delay_1": y_answer.shift(1),
                           "target_delay_2": y_answer.shift(2),
                           "target_delay_3": y_answer.shift(3)},
                          index=pd.RangeIndex(50, 81))
    if use_woodwork:
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    pd.testing.assert_frame_equal(DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y), answer)

    answer_only_y = pd.DataFrame({"target_delay_0": y_answer,
                                  "target_delay_1": y_answer.shift(1),
                                  "target_delay_2": y_answer.shift(2),
                                  "target_delay_3": y_answer.shift(3)},
                                 index=pd.RangeIndex(50, 81))
    pd.testing.assert_frame_equal(DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y), answer_only_y)
def test_target_imputer_woodwork_custom_overrides_returned_by_components(y_pd, has_nan, impute_strategy):
    y_to_use = y_pd.copy()
    if has_nan:
        y_to_use[len(y_pd) - 1] = np.nan
    override_types = [Integer, Double, Categorical, Boolean]
    for logical_type in override_types:
        try:
            y = ww.DataColumn(y_to_use.copy(), logical_type=logical_type)
        except TypeError:
            continue
        impute_strategy_to_use = impute_strategy
        if logical_type in [Categorical, NaturalLanguage]:
            impute_strategy_to_use = "most_frequent"
        imputer = TargetImputer(impute_strategy=impute_strategy_to_use)
        imputer.fit(None, y)
        _, y_t = imputer.transform(None, y)
        assert isinstance(y_t, ww.DataColumn)
        if impute_strategy_to_use == "most_frequent" or not has_nan:
            assert y_t.logical_type == logical_type
        else:
            assert y_t.logical_type == Double
def test_make_pipeline_no_column_names(input_type, problem_type):
    X = pd.DataFrame([[1, "a", np.nan], [2, "b", np.nan], [5, "b", np.nan]])
    y = pd.Series([0, 0, 1])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
def eval_pipelines(pipelines, engine):
    # X, y, and automl_data come from the surrounding test scope.
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_evaluation_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                    automl_config=automl_data, pipeline=pipeline))
    results = [f.get_result() for f in futures]
    return results
def test_explain_predictions_best_worst_custom_metric(mock_make_table, output_format, answer):
    mock_make_table.return_value = ("table goes here" if output_format == "text"
                                    else {"explanations": ["explanation_dictionary_goes_here"]})
    pipeline = MagicMock()
    pipeline.parameters = "Parameters go here"
    input_features = pd.DataFrame({"a": [5, 6]})
    pipeline.problem_type = ProblemTypes.REGRESSION
    pipeline.name = "Test Pipeline Name"
    pipeline.compute_estimator_features.return_value = ww.DataTable(input_features)
    pipeline.predict.return_value = ww.DataColumn(pd.Series([2, 1]))
    y_true = pd.Series([3, 2])

    def sum(y_true, y_pred):
        return y_pred + y_true

    best_worst_report = explain_predictions_best_worst(pipeline, input_features, y_true=y_true,
                                                       num_to_explain=1, metric=sum,
                                                       output_format=output_format)
    if output_format == "text":
        compare_two_tables(best_worst_report.splitlines(), regression_custom_metric_answer.splitlines())
    else:
        assert best_worst_report == answer
def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
def test_predict_repeat_estimator(mock_predict, mock_fit, X_y_binary):
    X, y = X_y_binary
    mock_predict.return_value = ww.DataColumn(pd.Series(y))

    graph = {'Imputer': [Imputer],
             'OneHot_RandomForest': [OneHotEncoder, 'Imputer.x'],
             'OneHot_Logistic': [OneHotEncoder, 'Imputer.x'],
             'Random Forest': [RandomForestClassifier, 'OneHot_RandomForest.x'],
             'Logistic Regression': [LogisticRegressionClassifier, 'OneHot_Logistic.x'],
             'Final Estimator': [LogisticRegressionClassifier, 'Random Forest', 'Logistic Regression']}
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})
    component_graph.fit(X, y)

    # The repeated estimator class must yield two distinct component instances.
    assert component_graph.get_component('Logistic Regression')._component_obj != component_graph.get_component('Final Estimator')._component_obj
    component_graph.predict(X)
    assert mock_predict.call_count == 5
    assert mock_fit.call_count == 3
def test_partial_dependence_multiclass_categorical(class_label, logistic_regression_multiclass_pipeline_class):
    pytest.importorskip('plotly.graph_objects', reason='Skipping plotting test because plotly not installed')
    X, y = load_wine()
    X['categorical_column'] = ww.DataColumn(pd.Series([i % 3 for i in range(X.shape[0])]).astype(str),
                                            logical_type="Categorical")
    X['categorical_column_2'] = ww.DataColumn(pd.Series([i % 6 for i in range(X.shape[0])]).astype(str),
                                              logical_type="Categorical")

    pipeline = logistic_regression_multiclass_pipeline_class({"Logistic Regression Classifier": {"n_jobs": 1}})
    pipeline.fit(X, y)

    fig = graph_partial_dependence(pipeline, X, features='categorical_column',
                                   class_label=class_label, grid_resolution=5)
    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'bar'
        assert plot_data['x'].tolist() == ['0', '1', '2']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label

    fig = graph_partial_dependence(pipeline, X, features=('alcohol', 'categorical_column'),
                                   class_label=class_label, grid_resolution=5)
    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'contour'
        assert fig.to_dict()['layout']['yaxis']['ticktext'] == ['0', '1', '2']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label

    fig = graph_partial_dependence(pipeline, X, features=('categorical_column_2', 'categorical_column'),
                                   class_label=class_label, grid_resolution=5)
    for i, plot_data in enumerate(fig.to_dict()['data']):
        assert plot_data['type'] == 'contour'
        assert fig.to_dict()['layout']['xaxis']['ticktext'] == ['0', '1', '2']
        assert fig.to_dict()['layout']['yaxis']['ticktext'] == ['0', '1', '2', '3', '4', '5']
        if class_label is None:
            assert plot_data['name'] == f'class_{i}'
        else:
            assert plot_data['name'] == class_label
def test_predict(mock_predict, mock_fit, example_graph, X_y_binary):
    X, y = X_y_binary
    mock_predict.return_value = ww.DataColumn(pd.Series(y))
    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)

    component_graph.predict(X)
    assert mock_predict.call_count == 5  # Called twice when fitting pipeline, thrice when predicting
    assert mock_fit.call_count == 3  # Only called during fit, not predict
def test_classification_pipeline_encodes_targets(mock_encode, mock_decode, mock_score, mock_predict,
                                                 mock_predict_proba, mock_fit, pipeline_class, X_y_binary):
    X, y = X_y_binary
    y_series = pd.Series(y)
    mock_predict.return_value = ww.DataColumn(y_series)
    mock_predict_proba.return_value = ww.DataTable(pd.DataFrame({"negative": y_series,
                                                                 "positive": y_series}))
    X = pd.DataFrame({"feature": range(len(y))})
    y_encoded = y_series.map(lambda label: "positive" if label == 1 else "negative")
    mock_encode.return_value = y_series
    mock_decode.return_value = y_encoded

    class MyTsPipeline(pipeline_class):
        component_graph = ['Delayed Feature Transformer', 'Logistic Regression Classifier']

    pl = MyTsPipeline({"Delayed Feature Transformer": {"gap": 0, "max_delay": 1},
                       "pipeline": {"gap": 0, "max_delay": 1}})

    # Check that fit encodes the target
    pl.fit(X, y_encoded)
    _, target_passed_to_estimator = mock_fit.call_args[0]
    # Check that the target is converted to ints. Use .iloc[1:] because the first feature row has NaNs
    assert_series_equal(target_passed_to_estimator, y_series.iloc[1:])

    # Check that predict encodes the target
    mock_encode.reset_mock()
    pl.predict(X, y_encoded)
    mock_encode.assert_called_once()

    # Check that predict_proba encodes the target
    mock_encode.reset_mock()
    pl.predict_proba(X, y_encoded)
    mock_encode.assert_called_once()

    # Check that score encodes the target
    mock_encode.reset_mock()
    pl.score(X, y_encoded, objectives=['MCC Binary'])
    mock_encode.assert_called_once()
def test_make_pipeline_no_nulls(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])
    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            parameters = {}
            if is_time_series(problem_type):
                parameters = {"pipeline": {"date_index": "some dates", "gap": 1, "max_delay": 1},
                              "Time Series Baseline Estimator": {"date_index": "some dates", "gap": 1, "max_delay": 1}}
            pipeline = make_pipeline(X, y, estimator_class, problem_type, parameters)
            assert isinstance(pipeline, pipeline_class)
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            if estimator_class.model_family == ModelFamily.ARIMA:
                assert pipeline.component_graph == [Imputer] + estimator_components
            else:
                assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
def test_empty_data_checks(input_type, X_y_binary):
    X, y = X_y_binary
    if input_type != "np":
        X = pd.DataFrame(X)
        y = pd.Series(y)
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    data_checks = EmptyDataChecks()
    assert data_checks.validate(X, y) == {"warnings": [], "errors": []}
def test_iLocIndexer_class_error(sample_df_dask, sample_series_dask):
    dt_dask = ww.DataTable(sample_df_dask)
    with pytest.raises(TypeError, match="iloc is not supported for Dask DataTables"):
        _iLocIndexer(dt_dask)

    dc_dask = ww.DataColumn(sample_series_dask)
    with pytest.raises(TypeError, match="iloc is not supported for Dask DataColumns"):
        _iLocIndexer(dc_dask)
def test_class_imbalance_severe(min_samples, input_type):
    X = pd.DataFrame()
    # 0 will be < 10% of the data, but there will be 50 samples of it
    y_values_binary = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] * 50)
    y_values_multiclass = pd.Series([0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2] * 50)
    if input_type == "ww":
        X = ww.DataTable(X)
        y_values_binary = ww.DataColumn(y_values_binary)
        y_values_multiclass = ww.DataColumn(y_values_multiclass)

    class_imbalance_check = ClassImbalanceDataCheck(min_samples=min_samples, num_cv_folds=1)
    warnings = [DataCheckWarning(message="The following labels fall below 10% of the target: [0]",
                                 data_check_name=class_imbalance_data_check_name,
                                 message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD,
                                 details={"target_values": [0]}).to_dict()]
    if min_samples > 50:
        warnings.append(DataCheckWarning(
            message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than {min_samples} samples: [0]",
            data_check_name=class_imbalance_data_check_name,
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE,
            details={"target_values": [0]}).to_dict())
    assert class_imbalance_check.validate(X, y_values_binary) == {"warnings": warnings, "errors": [], "actions": []}
    assert class_imbalance_check.validate(X, y_values_multiclass) == {"warnings": warnings, "errors": [], "actions": []}
def test_fit(mock_predict, mock_fit, mock_fit_transform, example_graph, X_y_binary):
    X, y = X_y_binary
    mock_fit_transform.return_value = ww.DataTable(X)
    mock_predict.return_value = ww.DataColumn(y)
    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)

    assert mock_fit_transform.call_count == 3
    assert mock_fit.call_count == 3
    assert mock_predict.call_count == 2
def test_target_leakage_multi():
    leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.8)

    # test empty pd.DataFrame, empty pd.Series
    assert leakage_check.validate(pd.DataFrame(), pd.Series()) == {"warnings": [], "errors": [], "actions": []}

    y = pd.Series([1, 0, 2, 1, 2, 0])
    X = pd.DataFrame()
    X["a"] = y * 3
    X["b"] = y - 1
    X["c"] = y / 10
    X["d"] = [0, 0, 0, 0, 0, 0]
    X["e"] = ["a", "b", "c", "a", "b", "c"]

    expected_messages = {
        "warnings": [DataCheckWarning(message="Column 'a' is 80.0% or more correlated with the target",
                                      data_check_name=target_leakage_data_check_name,
                                      message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                      details={"column": "a"}).to_dict(),
                     DataCheckWarning(message="Column 'b' is 80.0% or more correlated with the target",
                                      data_check_name=target_leakage_data_check_name,
                                      message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                      details={"column": "b"}).to_dict(),
                     DataCheckWarning(message="Column 'c' is 80.0% or more correlated with the target",
                                      data_check_name=target_leakage_data_check_name,
                                      message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                      details={"column": "c"}).to_dict()],
        "errors": [],
        "actions": []
    }

    # test X as ww.DataTable, y as ww.DataColumn
    assert leakage_check.validate(ww.DataTable(X), ww.DataColumn(y)) == expected_messages

    # test y as np.ndarray
    assert leakage_check.validate(X, y.values) == expected_messages
def test_component_graph_evaluation_plumbing(mock_transa, mock_transb, mock_transc,
                                             mock_preda, mock_predb, mock_predc, dummy_components):
    TransformerA, TransformerB, TransformerC, EstimatorA, EstimatorB, EstimatorC = dummy_components
    mock_transa.return_value = ww.DataTable(pd.DataFrame({'feature trans': [1, 0, 0, 0, 0, 0],
                                                          'feature a': np.ones(6)}))
    mock_transb.return_value = ww.DataTable(pd.DataFrame({'feature b': np.ones(6) * 2}))
    mock_transc.return_value = ww.DataTable(pd.DataFrame({'feature c': np.ones(6) * 3}))
    mock_preda.return_value = ww.DataColumn(pd.Series([0, 0, 0, 1, 0, 0]))
    mock_predb.return_value = ww.DataColumn(pd.Series([0, 0, 0, 0, 1, 0]))
    mock_predc.return_value = ww.DataColumn(pd.Series([0, 0, 0, 0, 0, 1]))
    graph = {
        'transformer a': [TransformerA],
        'transformer b': [TransformerB, 'transformer a'],
        'transformer c': [TransformerC, 'transformer a', 'transformer b'],
        'estimator a': [EstimatorA],
        'estimator b': [EstimatorB, 'transformer a'],
        'estimator c': [EstimatorC, 'transformer a', 'estimator a', 'transformer b', 'estimator b', 'transformer c']
    }
    component_graph = ComponentGraph(graph)
    component_graph.instantiate({})
    X = pd.DataFrame({'feature1': np.zeros(6), 'feature2': np.zeros(6)})
    y = pd.Series(np.zeros(6))
    component_graph.fit(X, y)
    predict_out = component_graph.predict(X)

    assert_frame_equal(mock_transa.call_args[0][0].to_dataframe(), X)
    assert_frame_equal(mock_transb.call_args[0][0].to_dataframe(),
                       pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="Int64"),
                                     'feature a': np.ones(6)},
                                    columns=['feature trans', 'feature a']))
    assert_frame_equal(mock_transc.call_args[0][0].to_dataframe(),
                       pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="Int64"),
                                     'feature a': np.ones(6),
                                     'feature b': np.ones(6) * 2},
                                    columns=['feature trans', 'feature a', 'feature b']))
    assert_frame_equal(mock_preda.call_args[0][0].to_dataframe(), X)
    assert_frame_equal(mock_predb.call_args[0][0].to_dataframe(),
                       pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="Int64"),
                                     'feature a': np.ones(6)},
                                    columns=['feature trans', 'feature a']))
    assert_frame_equal(mock_predc.call_args[0][0].to_dataframe(),
                       pd.DataFrame({'feature trans': pd.Series([1, 0, 0, 0, 0, 0], dtype="Int64"),
                                     'feature a': np.ones(6),
                                     'estimator a': pd.Series([0, 0, 0, 1, 0, 0], dtype="Int64"),
                                     'feature b': np.ones(6) * 2,
                                     'estimator b': pd.Series([0, 0, 0, 0, 1, 0], dtype="Int64"),
                                     'feature c': np.ones(6) * 3},
                                    columns=['feature trans', 'feature a', 'estimator a',
                                             'feature b', 'estimator b', 'feature c']))
    assert_series_equal(pd.Series([0, 0, 0, 0, 0, 1], dtype="Int64"), predict_out.to_series())
def test_imputer_all_bool_return_original(data_type):
    X = pd.DataFrame([True, True, False, True, True], dtype=bool)
    X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype=bool)
    y = pd.Series([1, 0, 0, 1, 0])
    if data_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    imputer = Imputer()
    imputer.fit(X, y)
    X_t = imputer.transform(X)
    assert_frame_equal(X_expected_arr, X_t.to_dataframe())