def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
    """Check the component graph built for numeric + datetime features.

    Args:
        input_type: when equal to ``'ww'`` the data is wrapped in Woodwork
            containers; any other value keeps the pandas objects.
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    features = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
    })
    target = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): this replaces the target with a 4-element pandas
        # Series (X has 5 rows) even when input_type is 'ww' -- confirm intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        # type(pipeline_class) is the metaclass of pipeline_class, so this
        # checks the result against that metaclass, not pipeline_class itself.
        # NOTE(review): other tests in this file assert
        # isinstance(pipeline, pipeline_class) -- confirm which is intended.
        assert isinstance(pipeline, type(pipeline_class))
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        # Only linear models are expected to get a scaler; every other
        # family (CatBoost included) is expected to be just the estimator.
        if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
            estimator_components = [StandardScaler, estimator_class]
        else:
            estimator_components = [estimator_class]

        expected = [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
        assert pipeline.component_graph == expected
def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
    """Check that an all-null column is not considered categorical.

    The expected component graph contains ``DropNullColumns`` and ``Imputer``
    but no ``OneHotEncoder``.

    Args:
        input_type: when equal to ``'ww'`` the data is wrapped in Woodwork
            containers; any other value keeps the pandas objects.
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    features = pd.DataFrame({
        "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
        "num": [1, 2, 3, 4, 5],
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): 4-element pandas target vs. 5-row X, and it overrides
        # the Woodwork target when input_type is 'ww' -- confirm intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        # type(pipeline_class) is the metaclass of pipeline_class.
        assert isinstance(pipeline, type(pipeline_class))
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        # No encoder is expected for any family; linear models add a scaler.
        if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
            estimator_components = [StandardScaler, estimator_class]
        else:
            estimator_components = [estimator_class]

        expected = [DropNullColumns, Imputer] + delayed_features + estimator_components
        assert pipeline.component_graph == expected
def test_make_pipeline_no_column_names(input_type, problem_type):
    """Check the component graph for a DataFrame built without column names.

    The frame has an integer column, a string column and an all-NaN column.

    Args:
        input_type: when equal to ``'ww'`` the data is wrapped in Woodwork
            containers; any other value keeps the pandas objects.
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    rows = [[1, "a", np.nan], [2, "b", np.nan], [5, "b", np.nan]]
    features = pd.DataFrame(rows)
    target = pd.Series([0, 0, 1])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): 4-element pandas target vs. 3-row X, and it overrides
        # the Woodwork target when input_type is 'ww' -- confirm intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        # type(pipeline_class) is the metaclass of pipeline_class.
        assert isinstance(pipeline, type(pipeline_class))
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []

        # CatBoost is expected to get no encoder; every other family gets a
        # OneHotEncoder, and linear models additionally get a StandardScaler.
        family = estimator_class.model_family
        estimator_components = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        if family == ModelFamily.LINEAR_MODEL:
            estimator_components.append(StandardScaler)
        estimator_components.append(estimator_class)

        expected = [DropNullColumns, Imputer] + delayed_features + estimator_components
        assert pipeline.component_graph == expected
def test_make_pipeline_text_columns(input_type, problem_type):
    """Check the component graph for numeric, categorical and text features.

    Args:
        input_type: when equal to ``'ww'`` the data is wrapped in Woodwork
            containers; any other value keeps the pandas objects.
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    features = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "categorical": ["a", "b", "a", "c", "c"],
        "text": [
            "string one",
            "another",
            "text for a column, this should be a text column!!",
            "text string",
            "hello world",
        ],
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): 4-element pandas target vs. 5-row X, and it overrides
        # the Woodwork target when input_type is 'ww' -- confirm intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        # type(pipeline_class) is the metaclass of pipeline_class.
        assert isinstance(pipeline, type(pipeline_class))
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []

        # CatBoost is expected to get no encoder; every other family gets a
        # OneHotEncoder, and linear models additionally get a StandardScaler.
        family = estimator_class.model_family
        estimator_components = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        if family == ModelFamily.LINEAR_MODEL:
            estimator_components.append(StandardScaler)
        estimator_components.append(estimator_class)

        expected = [Imputer, TextFeaturizer] + delayed_features + estimator_components
        assert pipeline.component_graph == expected
def test_make_pipeline_only_text_columns(input_type, problem_type):
    """Check the component graph when every feature is a text column.

    The expected graph starts with ``TextFeaturizer`` and contains neither an
    ``Imputer`` nor a ``OneHotEncoder``.

    Args:
        input_type: when equal to ``'ww'`` the data is wrapped in Woodwork
            containers; any other value keeps the pandas objects.
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    features = pd.DataFrame({
        "text": [
            "string one",
            "the evalml team is full of wonderful people",
            "text for a column, this should be a text column!!",
            "text string",
            "hello world",
        ],
        "another text": [
            "ladidididididida",
            "cats are great",
            "text for a column, this should be a text column!!",
            "text string",
            "goodbye world",
        ],
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): 4-element pandas target vs. 5-row X, and it overrides
        # the Woodwork target when input_type is 'ww' -- confirm intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        # type(pipeline_class) is the metaclass of pipeline_class.
        assert isinstance(pipeline, type(pipeline_class))
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        is_linear = estimator_class.model_family == ModelFamily.LINEAR_MODEL
        standard_scaler = [StandardScaler] if is_linear else []

        expected = [TextFeaturizer] + delayed_features + standard_scaler + [estimator_class]
        assert pipeline.component_graph == expected
def test_make_pipeline_numpy_input(problem_type):
    """Check the component graph built from plain numpy inputs.

    NOTE(review): a second function with this exact name is defined later in
    this file. The later ``def`` rebinds the module-level name, so this
    definition is replaced at import time -- confirm whether this copy should
    be renamed or removed.

    Args:
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    features = np.array([
        [1, 2, 0, np.nan],
        [2, 2, 1, np.nan],
        [5, 1, np.nan, np.nan],
    ])
    target = np.array([0, 0, 1, 0])

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        # type(pipeline_class) is the metaclass of pipeline_class.
        assert isinstance(pipeline, type(pipeline_class))

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
            estimator_components = [StandardScaler, estimator_class]
        else:
            estimator_components = [estimator_class]

        expected = [DropNullColumns, Imputer] + delayed_features + estimator_components
        assert pipeline.component_graph == expected
def test_make_pipeline_error():
    """Check that a non-dict ``custom_hyperparameters`` raises ValueError.

    A list of dictionaries is passed where a single dictionary is required,
    and the error message is matched for every binary estimator.
    """
    features = pd.DataFrame([[0, 1], [1, 0]])
    target = pd.Series([1, 0])

    # Deliberately a list (not a dict) to trigger the validation error.
    bad_hyperparameters = [
        {"Imputer": {"numeric_imput_strategy": ["median"]}},
        {"One Hot Encoder": {"value1": ["value2"]}},
    ]
    expected_message = "if custom_hyperparameters provided, must be dictionary"

    for estimator in get_estimators(problem_type="binary"):
        with pytest.raises(ValueError, match=expected_message):
            make_pipeline(features, target, estimator, "binary", bad_hyperparameters)
def test_make_pipeline_no_nulls(input_type, problem_type):
    """Check the component graph for null-free numeric/categorical/date data.

    Args:
        input_type: when equal to ``'ww'`` the data is wrapped in Woodwork
            containers; any other value keeps the pandas objects.
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    features = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "categorical": ["a", "b", "a", "c", "c"],
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
    })
    target = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): 4-element pandas target vs. 5-row X, and it overrides
        # the Woodwork target when input_type is 'ww' -- confirm intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        # Time series problems receive the same settings under both keys;
        # each key gets its own dict object.
        parameters = {}
        if is_time_series(problem_type):
            parameters = {
                component: {"date_index": "some dates", "gap": 1, "max_delay": 1}
                for component in ("pipeline", "Time Series Baseline Estimator")
            }

        pipeline = make_pipeline(features, target, estimator_class, problem_type, parameters)
        assert isinstance(pipeline, pipeline_class)
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []

        # CatBoost is expected to get no encoder; every other family gets a
        # OneHotEncoder, and linear models additionally get a StandardScaler.
        family = estimator_class.model_family
        estimator_components = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        if family == ModelFamily.LINEAR_MODEL:
            estimator_components.append(StandardScaler)
        estimator_components.append(estimator_class)

        # ARIMA pipelines are expected to omit the datetime featurizer and
        # the delayed-feature transformer.
        if family == ModelFamily.ARIMA:
            expected = [Imputer] + estimator_components
        else:
            expected = [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
        assert pipeline.component_graph == expected
def test_make_pipeline_only_datetime_columns(input_type, problem_type):
    """Check the component graph when every feature is a datetime column.

    Args:
        input_type: when equal to ``'ww'`` the data is wrapped in Woodwork
            containers; any other value keeps the pandas objects.
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    features = pd.DataFrame({
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
        "some other dates": pd.date_range('2000-05-19', periods=5, freq='W'),
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): 4-element pandas target vs. 5-row X, and it overrides
        # the Woodwork target when input_type is 'ww' -- confirm intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        # Time series problems receive the same settings under both keys;
        # each key gets its own dict object.
        parameters = {}
        if is_time_series(problem_type):
            parameters = {
                component: {"date_index": "some dates", "gap": 1, "max_delay": 1}
                for component in ("pipeline", "Time Series Baseline Estimator")
            }

        pipeline = make_pipeline(features, target, estimator_class, problem_type, parameters)
        assert isinstance(pipeline, pipeline_class)
        assert pipeline.custom_hyperparameters is None

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        family = estimator_class.model_family
        standard_scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []

        # ARIMA pipelines are expected to omit the datetime featurizer and
        # the delayed-feature transformer.
        if family == ModelFamily.ARIMA:
            expected = standard_scaler + [estimator_class]
        else:
            expected = [DateTimeFeaturizer] + delayed_features + standard_scaler + [estimator_class]
        assert pipeline.component_graph == expected
def test_make_pipeline_custom_hyperparameters(problem_type):
    """Check that custom hyperparameters are stored on the built pipeline.

    For every estimator returned for ``problem_type``, a pipeline is built for
    each problem type that estimator supports: once with custom
    hyperparameters (which must be kept as given) and once without (which
    must leave ``custom_hyperparameters`` falsy).

    Args:
        problem_type: problem type used to select the estimators (presumably
            supplied by a pytest parametrization that is not visible here).
    """
    features = pd.DataFrame({
        "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
        "categorical": ["a", "b", "a", "c", "c"],
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
    })
    custom_hyperparameters = {'Imputer': {'numeric_impute_strategy': ['median']}}
    target = pd.Series([0, 0, 1, 0, 0])

    for estimator_class in get_estimators(problem_type=problem_type):
        # Iterate with a dedicated name instead of rebinding the
        # ``problem_type`` argument; the estimator list is already computed.
        for supported_type in estimator_class.supported_problem_types:
            with_custom = make_pipeline(features, target, estimator_class,
                                        supported_type, custom_hyperparameters)
            assert with_custom.custom_hyperparameters == custom_hyperparameters

            without_custom = make_pipeline(features, target, estimator_class, supported_type)
            assert not without_custom.custom_hyperparameters
def test_make_pipeline_numpy_input(problem_type):
    """Check the component graph built from plain numpy inputs.

    NOTE(review): an earlier function in this file has the same name; this
    later definition is the one bound to the name after import.

    Args:
        problem_type: problem type under test (presumably supplied by a
            pytest parametrization that is not visible here).
    """
    features = np.array([
        [1, 2, 0, np.nan],
        [2, 2, 1, np.nan],
        [5, 1, np.nan, np.nan],
    ])
    target = np.array([0, 0, 1, 0])

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        # Time series problems receive the same settings under both keys;
        # each key gets its own dict object. There is no date column here,
        # so date_index is None.
        parameters = {}
        if is_time_series(problem_type):
            parameters = {
                component: {"date_index": None, "gap": 1, "max_delay": 1}
                for component in ("pipeline", "Time Series Baseline Estimator")
            }

        pipeline = make_pipeline(features, target, estimator_class, problem_type, parameters)
        assert isinstance(pipeline, pipeline_class)

        delayed_features = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        family = estimator_class.model_family
        if family == ModelFamily.LINEAR_MODEL:
            estimator_components = [StandardScaler, estimator_class]
        else:
            estimator_components = [estimator_class]

        # ARIMA pipelines are expected to omit the delayed-feature transformer.
        if family == ModelFamily.ARIMA:
            expected = [DropNullColumns, Imputer] + estimator_components
        else:
            expected = [DropNullColumns, Imputer] + delayed_features + estimator_components
        assert pipeline.component_graph == expected
def test_make_pipeline_samplers(problem_type, samplers, X_y_binary, X_y_multi, X_y_regression, has_minimal_dependencies):
    """Check where ``make_pipeline`` places the requested sampler component.

    Regression problems must reject any sampler with a ValueError. For other
    problem types the sampler is expected directly before the estimator, or
    directly before the 'Standard Scaler' when one precedes the estimator.
    When no sampler is requested, no component name may contain 'sampler'.

    Args:
        problem_type: 'binary', 'multiclass', or anything else (treated as
            regression when choosing the dataset).
        samplers: sampler name passed as ``sampler_name``, or None.
        X_y_binary, X_y_multi, X_y_regression: ``(X, y)`` pairs
            (presumably pytest fixtures defined outside this chunk).
        has_minimal_dependencies: when truthy and a sampler is requested,
            the pipeline is expected to contain 'Undersampler' instead.
    """
    if problem_type == 'binary':
        X, y = X_y_binary
    elif problem_type == 'multiclass':
        X, y = X_y_multi
    else:
        X, y = X_y_regression
    estimators = get_estimators(problem_type=problem_type)

    # Track the expected component name separately from the requested one.
    # Overwriting ``samplers`` inside the loop would make every estimator
    # after the first be built with 'Undersampler' instead of the sampler
    # this test case actually requested.
    expected_sampler = samplers
    if has_minimal_dependencies and samplers is not None:
        expected_sampler = 'Undersampler'

    for estimator in estimators:
        if problem_type == 'regression' and samplers is not None:
            with pytest.raises(ValueError, match='Sampling is unsupported for'):
                make_pipeline(X, y, estimator, problem_type, sampler_name=samplers)
            continue

        pipeline = make_pipeline(X, y, estimator, problem_type, sampler_name=samplers)
        # check that we do add the sampler properly
        if expected_sampler is not None and problem_type != 'regression':
            # we add the sampler before the scaler if it exists
            if pipeline.component_graph[-2].name == 'Standard Scaler':
                assert pipeline.component_graph[-3].name == expected_sampler
            else:
                assert pipeline.component_graph[-2].name == expected_sampler
        else:
            assert not any('sampler' in comp.name for comp in pipeline.component_graph)
def test_get_estimators(has_minimal_dependencies):
    """Check the counts and error handling of ``get_estimators``.

    Args:
        has_minimal_dependencies: when truthy, 5 estimators are expected per
            problem type; otherwise 8. The linear-model count for binary
            problems is 2 in both cases.
    """
    expected_count = 5 if has_minimal_dependencies else 8

    assert len(get_estimators(problem_type=ProblemTypes.BINARY)) == expected_count
    linear_binary = get_estimators(problem_type=ProblemTypes.BINARY,
                                   model_families=[ModelFamily.LINEAR_MODEL])
    assert len(linear_binary) == 2
    assert len(get_estimators(problem_type=ProblemTypes.MULTICLASS)) == expected_count
    assert len(get_estimators(problem_type=ProblemTypes.REGRESSION)) == expected_count

    # An empty family list must yield no estimators for any problem type.
    for current_type in (ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION):
        assert len(get_estimators(problem_type=current_type, model_families=[])) == 0

    # Invalid arguments: unknown family, non-list families, unknown problem type.
    with pytest.raises(RuntimeError, match="Unrecognized model type for problem type"):
        get_estimators(problem_type=ProblemTypes.REGRESSION,
                       model_families=["random_forest", "none"])
    with pytest.raises(TypeError, match="model_families parameter is not a list."):
        get_estimators(problem_type=ProblemTypes.REGRESSION,
                       model_families='random_forest')
    with pytest.raises(KeyError):
        get_estimators(problem_type="Not A Valid Problem Type")