def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
    """Check the component graph built for numeric + datetime features.

    The feature table has one integer column and one weekly datetime
    column, and no categorical column.  For every estimator that lists
    ``problem_type`` as supported, the graph must equal ``Imputer``,
    ``DateTimeFeaturizer``, then ``DelayedFeatureTransformer`` when
    ``is_time_series(problem_type)`` is true, then ``StandardScaler`` for
    linear-model estimators, and finally the estimator class.

    Args:
        input_type: When equal to ``'ww'`` the data is wrapped in
            ``ww.DataTable`` / ``ww.DataColumn``; otherwise it stays pandas.
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    features = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
    })
    target = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): this replacement has 4 entries while X has 5 rows and
        # is not re-wrapped for input_type 'ww' -- confirm this is intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        built = make_pipeline(features, target, estimator_cls, problem_type)
        # The check is against type(pipeline base class), not the base class
        # itself.
        assert isinstance(built, type(expected_base))
        assert built.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        # Without categorical features only linear models get an extra
        # component (the scaler); every other family is just the estimator.
        if estimator_cls.model_family == ModelFamily.LINEAR_MODEL:
            tail = [StandardScaler, estimator_cls]
        else:
            tail = [estimator_cls]

        expected_graph = [Imputer, DateTimeFeaturizer] + delayed + tail
        assert built.component_graph == expected_graph
def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
    """Check that an all-NaN column is not considered categorical.

    The feature table has one column that is entirely NaN and one integer
    column.  For every estimator that lists ``problem_type`` as supported,
    the graph must equal ``DropNullColumns``, ``Imputer``, then
    ``DelayedFeatureTransformer`` when ``is_time_series(problem_type)`` is
    true, then ``StandardScaler`` for linear-model estimators, and finally
    the estimator class -- i.e. no ``OneHotEncoder`` appears.

    Args:
        input_type: When equal to ``'ww'`` the data is wrapped in
            ``ww.DataTable`` / ``ww.DataColumn``; otherwise it stays pandas.
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    features = pd.DataFrame({
        "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
        "num": [1, 2, 3, 4, 5],
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): this replacement has 4 entries while X has 5 rows and
        # is not re-wrapped for input_type 'ww' -- confirm this is intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        built = make_pipeline(features, target, estimator_cls, problem_type)
        # The check is against type(pipeline base class), not the base class
        # itself.
        assert isinstance(built, type(expected_base))
        assert built.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        # Only linear models get an extra component (the scaler); every other
        # family is expected to contribute just the estimator.
        if estimator_cls.model_family == ModelFamily.LINEAR_MODEL:
            tail = [StandardScaler, estimator_cls]
        else:
            tail = [estimator_cls]

        expected_graph = [DropNullColumns, Imputer] + delayed + tail
        assert built.component_graph == expected_graph
def test_make_pipeline_no_column_names(input_type, problem_type):
    """Check the component graph for a DataFrame built without column names.

    The three-row table holds an integer column, a string column and an
    all-NaN column.  For every estimator that lists ``problem_type`` as
    supported, the graph must equal ``DropNullColumns``, ``Imputer``, then
    ``DelayedFeatureTransformer`` when ``is_time_series(problem_type)`` is
    true, then ``OneHotEncoder`` (all families except CatBoost), then
    ``StandardScaler`` (linear models only), and finally the estimator
    class.

    Args:
        input_type: When equal to ``'ww'`` the data is wrapped in
            ``ww.DataTable`` / ``ww.DataColumn``; otherwise it stays pandas.
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    rows = [
        [1, "a", np.nan],
        [2, "b", np.nan],
        [5, "b", np.nan],
    ]
    features = pd.DataFrame(rows)
    target = pd.Series([0, 0, 1])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): this replacement has 4 entries while X has 3 rows and
        # is not re-wrapped for input_type 'ww' -- confirm this is intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        built = make_pipeline(features, target, estimator_cls, problem_type)
        # The check is against type(pipeline base class), not the base class
        # itself.
        assert isinstance(built, type(expected_base))
        assert built.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []

        family = estimator_cls.model_family
        # CatBoost is the one family expected without an encoder; linear
        # models are the one family expected with a scaler.
        encoder = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []
        tail = encoder + scaler + [estimator_cls]

        expected_graph = [DropNullColumns, Imputer] + delayed + tail
        assert built.component_graph == expected_graph
def test_make_pipeline_text_columns(input_type, problem_type):
    """Check the component graph for numeric, categorical and text features.

    For every estimator that lists ``problem_type`` as supported, the graph
    must equal ``Imputer``, ``TextFeaturizer``, then
    ``DelayedFeatureTransformer`` when ``is_time_series(problem_type)`` is
    true, then ``OneHotEncoder`` (all families except CatBoost), then
    ``StandardScaler`` (linear models only), and finally the estimator
    class.

    Args:
        input_type: When equal to ``'ww'`` the data is wrapped in
            ``ww.DataTable`` / ``ww.DataColumn``; otherwise it stays pandas.
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    features = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "categorical": ["a", "b", "a", "c", "c"],
        "text": [
            "string one",
            "another",
            "text for a column, this should be a text column!!",
            "text string",
            "hello world",
        ],
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): this replacement has 4 entries while X has 5 rows and
        # is not re-wrapped for input_type 'ww' -- confirm this is intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        built = make_pipeline(features, target, estimator_cls, problem_type)
        # The check is against type(pipeline base class), not the base class
        # itself.
        assert isinstance(built, type(expected_base))
        assert built.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []

        family = estimator_cls.model_family
        # CatBoost is the one family expected without an encoder; linear
        # models are the one family expected with a scaler.
        encoder = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []
        tail = encoder + scaler + [estimator_cls]

        expected_graph = [Imputer, TextFeaturizer] + delayed + tail
        assert built.component_graph == expected_graph
def test_make_pipeline_numpy_input(problem_type):
    """Check the component graph when features and target are numpy arrays.

    The feature array has three rows and four columns, the last of which is
    entirely NaN.  For every estimator that lists ``problem_type`` as
    supported, the graph must equal ``DropNullColumns``, ``Imputer``, then
    ``DelayedFeatureTransformer`` when ``is_time_series(problem_type)`` is
    true, then ``StandardScaler`` for linear-model estimators, and finally
    the estimator class.

    NOTE(review): another function with this exact name is defined later in
    this file; in Python the later definition rebinds the module-level name,
    so confirm which of the two versions is meant to run.

    Args:
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    features = np.array([
        [1, 2, 0, np.nan],
        [2, 2, 1, np.nan],
        [5, 1, np.nan, np.nan],
    ])
    # NOTE(review): the target has 4 entries while X has 3 rows -- confirm.
    target = np.array([0, 0, 1, 0])

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        built = make_pipeline(features, target, estimator_cls, problem_type)
        # The check is against type(pipeline base class), not the base class
        # itself.
        assert isinstance(built, type(expected_base))

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        scaler = (
            [StandardScaler]
            if estimator_cls.model_family == ModelFamily.LINEAR_MODEL
            else []
        )

        expected_graph = [DropNullColumns, Imputer] + delayed + scaler + [estimator_cls]
        assert built.component_graph == expected_graph
def test_make_pipeline_only_text_columns(input_type, problem_type):
    """Check the component graph when every feature column is free text.

    For every estimator that lists ``problem_type`` as supported, the graph
    must equal ``TextFeaturizer``, then ``DelayedFeatureTransformer`` when
    ``is_time_series(problem_type)`` is true, then ``StandardScaler`` for
    linear-model estimators, and finally the estimator class.

    Args:
        input_type: When equal to ``'ww'`` the data is wrapped in
            ``ww.DataTable`` / ``ww.DataColumn``; otherwise it stays pandas.
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    features = pd.DataFrame({
        "text": [
            "string one",
            "the evalml team is full of wonderful people",
            "text for a column, this should be a text column!!",
            "text string",
            "hello world",
        ],
        "another text": [
            "ladidididididida",
            "cats are great",
            "text for a column, this should be a text column!!",
            "text string",
            "goodbye world",
        ],
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): this replacement has 4 entries while X has 5 rows and
        # is not re-wrapped for input_type 'ww' -- confirm this is intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        built = make_pipeline(features, target, estimator_cls, problem_type)
        # The check is against type(pipeline base class), not the base class
        # itself.
        assert isinstance(built, type(expected_base))
        assert built.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        scaler = (
            [StandardScaler]
            if estimator_cls.model_family == ModelFamily.LINEAR_MODEL
            else []
        )

        expected_graph = [TextFeaturizer] + delayed + scaler + [estimator_cls]
        assert built.component_graph == expected_graph
def test_make_pipeline_no_nulls(input_type, problem_type):
    """Check the component graph for data that contains no missing values.

    The feature table has an integer, a string and a weekly datetime
    column.  For time-series problem types, ``make_pipeline`` also receives
    ``"pipeline"`` and ``"Time Series Baseline Estimator"`` parameters with
    ``date_index="some dates"``, ``gap=1`` and ``max_delay=1``; otherwise it
    receives an empty dict.

    Expected graph per supported estimator:

    * ARIMA family: ``Imputer`` followed by the estimator components.
    * Otherwise: ``Imputer``, ``DateTimeFeaturizer``, then
      ``DelayedFeatureTransformer`` for time-series problems, followed by
      the estimator components.

    The estimator components are ``OneHotEncoder`` (all families except
    CatBoost), ``StandardScaler`` (linear models only) and the estimator
    class.

    Args:
        input_type: When equal to ``'ww'`` the data is wrapped in
            ``ww.DataTable`` / ``ww.DataColumn``; otherwise it stays pandas.
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    features = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "categorical": ["a", "b", "a", "c", "c"],
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
    })
    target = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): this replacement has 4 entries while X has 5 rows and
        # is not re-wrapped for input_type 'ww' -- confirm this is intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        time_series = is_time_series(problem_type)
        if time_series:
            # A fresh, separate settings dict is created for each entry on
            # every iteration.
            parameters = {
                component: {"date_index": "some dates", "gap": 1, "max_delay": 1}
                for component in ("pipeline", "Time Series Baseline Estimator")
            }
        else:
            parameters = {}

        built = make_pipeline(features, target, estimator_cls, problem_type,
                              parameters)
        assert isinstance(built, expected_base)
        assert built.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if time_series else []

        family = estimator_cls.model_family
        # CatBoost is the one family expected without an encoder; linear
        # models are the one family expected with a scaler.
        encoder = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []
        tail = encoder + scaler + [estimator_cls]

        # ARIMA pipelines are expected without the datetime featurizer and
        # without delayed features.
        if family == ModelFamily.ARIMA:
            leading = [Imputer]
        else:
            leading = [Imputer, DateTimeFeaturizer] + delayed

        assert built.component_graph == leading + tail
def test_make_pipeline_only_datetime_columns(input_type, problem_type):
    """Check the component graph when every feature column is a datetime.

    For time-series problem types, ``make_pipeline`` also receives
    ``"pipeline"`` and ``"Time Series Baseline Estimator"`` parameters with
    ``date_index="some dates"``, ``gap=1`` and ``max_delay=1``; otherwise it
    receives an empty dict.

    Expected graph per supported estimator:

    * ARIMA family: ``StandardScaler`` (linear models only) and the
      estimator class.
    * Otherwise: ``DateTimeFeaturizer``, then ``DelayedFeatureTransformer``
      for time-series problems, then ``StandardScaler`` (linear models
      only), and finally the estimator class.

    Args:
        input_type: When equal to ``'ww'`` the data is wrapped in
            ``ww.DataTable`` / ``ww.DataColumn``; otherwise it stays pandas.
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    features = pd.DataFrame({
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
        "some other dates": pd.date_range('2000-05-19', periods=5, freq='W'),
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # NOTE(review): this replacement has 4 entries while X has 5 rows and
        # is not re-wrapped for input_type 'ww' -- confirm this is intended.
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        time_series = is_time_series(problem_type)
        if time_series:
            # A fresh, separate settings dict is created for each entry on
            # every iteration.
            parameters = {
                component: {"date_index": "some dates", "gap": 1, "max_delay": 1}
                for component in ("pipeline", "Time Series Baseline Estimator")
            }
        else:
            parameters = {}

        built = make_pipeline(features, target, estimator_cls, problem_type,
                              parameters)
        assert isinstance(built, expected_base)
        assert built.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if time_series else []

        family = estimator_cls.model_family
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []

        # ARIMA pipelines are expected without the datetime featurizer and
        # without delayed features.
        if family == ModelFamily.ARIMA:
            leading = []
        else:
            leading = [DateTimeFeaturizer] + delayed

        assert built.component_graph == leading + scaler + [estimator_cls]
def test_make_pipeline_numpy_input(problem_type):
    """Check the component graph when features and target are numpy arrays.

    The feature array has three rows and four columns, the last of which is
    entirely NaN.  For time-series problem types, ``make_pipeline`` also
    receives ``"pipeline"`` and ``"Time Series Baseline Estimator"``
    parameters with ``date_index=None``, ``gap=1`` and ``max_delay=1``;
    otherwise it receives an empty dict.

    Expected graph per supported estimator: ``DropNullColumns``,
    ``Imputer``, then ``DelayedFeatureTransformer`` for time-series problems
    (omitted for the ARIMA family), then ``StandardScaler`` for linear-model
    estimators, and finally the estimator class.

    NOTE(review): an earlier function with this exact name is defined in
    this file; in Python this later definition rebinds the module-level
    name, so confirm the earlier version is meant to be superseded.

    Args:
        problem_type: Forwarded to ``get_estimators``,
            ``_get_pipeline_base_class`` and ``make_pipeline``.
    """
    features = np.array([
        [1, 2, 0, np.nan],
        [2, 2, 1, np.nan],
        [5, 1, np.nan, np.nan],
    ])
    # NOTE(review): the target has 4 entries while X has 3 rows -- confirm.
    target = np.array([0, 0, 1, 0])

    candidate_estimators = get_estimators(problem_type=problem_type)
    expected_base = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        target = pd.Series([0, 2, 1, 2])

    for estimator_cls in candidate_estimators:
        if problem_type not in estimator_cls.supported_problem_types:
            continue

        time_series = is_time_series(problem_type)
        if time_series:
            # A fresh, separate settings dict is created for each entry on
            # every iteration.
            parameters = {
                component: {"date_index": None, "gap": 1, "max_delay": 1}
                for component in ("pipeline", "Time Series Baseline Estimator")
            }
        else:
            parameters = {}

        built = make_pipeline(features, target, estimator_cls, problem_type,
                              parameters)
        assert isinstance(built, expected_base)

        delayed = [DelayedFeatureTransformer] if time_series else []

        family = estimator_cls.model_family
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []
        tail = scaler + [estimator_cls]

        # ARIMA pipelines are expected without the delayed-feature step.
        middle = [] if family == ModelFamily.ARIMA else delayed

        assert built.component_graph == [DropNullColumns, Imputer] + middle + tail