Beispiel #1
0
def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
    """Check the pipeline built for numeric + datetime features without categoricals.

    For each estimator supporting ``problem_type`` the expected component graph
    is ``Imputer``, ``DateTimeFeaturizer``, then ``DelayedFeatureTransformer``
    for time series problems, then ``StandardScaler`` for linear models, and
    finally the estimator itself.
    """
    features = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                             "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    target = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Swap in a three-label target; this replaces any woodwork wrapper made above.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        assert isinstance(pipeline, type(base_class))
        assert pipeline.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        # Only linear models get a scaler in front of the estimator.
        scaler = [StandardScaler] if estimator_class.model_family == ModelFamily.LINEAR_MODEL else []
        expected_graph = [Imputer, DateTimeFeaturizer] + delayed + scaler + [estimator_class]
        assert pipeline.component_graph == expected_graph
Beispiel #2
0
def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
    """Check that an all-null column is not treated as categorical.

    The expected component graph is ``DropNullColumns``, ``Imputer``, then
    ``DelayedFeatureTransformer`` for time series problems, then
    ``StandardScaler`` for linear models, and finally the estimator. No
    one-hot encoder is expected.
    """
    features = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                             "num": [1, 2, 3, 4, 5]})
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Swap in a three-label target; this replaces any woodwork wrapper made above.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        assert isinstance(pipeline, type(base_class))
        assert pipeline.custom_hyperparameters is None

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        scaler = [StandardScaler] if estimator_class.model_family == ModelFamily.LINEAR_MODEL else []
        expected_graph = [DropNullColumns, Imputer] + delayed + scaler + [estimator_class]
        assert pipeline.component_graph == expected_graph
Beispiel #3
0
def test_make_pipeline_no_column_names(input_type, problem_type):
    """Check the pipeline built from a DataFrame that has no explicit column names.

    The data holds a numeric column, a string column and an all-NaN column.
    The expected graph is ``DropNullColumns``, ``Imputer``, then
    ``DelayedFeatureTransformer`` for time series problems, then
    ``OneHotEncoder`` for every family except CatBoost, then
    ``StandardScaler`` for linear models, and finally the estimator.
    """
    features = pd.DataFrame([[1, "a", np.nan], [2, "b", np.nan], [5, "b", np.nan]])
    target = pd.Series([0, 0, 1])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)
    candidate_estimators = get_estimators(problem_type=problem_type)
    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Swap in a three-label target; this replaces any woodwork wrapper made above.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        assert isinstance(pipeline, type(base_class))
        assert pipeline.custom_hyperparameters is None

        family = estimator_class.model_family
        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []
        # CatBoost is the one family expected to run without a one-hot encoder.
        encoder = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        expected_graph = [DropNullColumns, Imputer] + delayed + encoder + scaler + [estimator_class]
        assert pipeline.component_graph == expected_graph
Beispiel #4
0
def test_make_pipeline_text_columns(input_type, problem_type):
    """Check the pipeline built for numeric, categorical and free-text features.

    The expected graph is ``Imputer``, ``TextFeaturizer``, then
    ``DelayedFeatureTransformer`` for time series problems, then
    ``OneHotEncoder`` for every family except CatBoost, then
    ``StandardScaler`` for linear models, and finally the estimator.
    """
    features = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                             "categorical": ["a", "b", "a", "c", "c"],
                             "text": ["string one", "another", "text for a column, this should be a text column!!", "text string", "hello world"]})
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)
    candidate_estimators = get_estimators(problem_type=problem_type)

    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Swap in a three-label target; this replaces any woodwork wrapper made above.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        assert isinstance(pipeline, type(base_class))
        assert pipeline.custom_hyperparameters is None

        family = estimator_class.model_family
        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []
        # CatBoost is the one family expected to run without a one-hot encoder.
        encoder = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        expected_graph = [Imputer, TextFeaturizer] + delayed + encoder + scaler + [estimator_class]
        assert pipeline.component_graph == expected_graph
Beispiel #5
0
def test_make_pipeline_only_text_columns(input_type, problem_type):
    """Check the pipeline built when every feature column is free text.

    The expected graph is ``TextFeaturizer``, then
    ``DelayedFeatureTransformer`` for time series problems, then
    ``StandardScaler`` for linear models, and finally the estimator.
    """
    features = pd.DataFrame({"text": ["string one", "the evalml team is full of wonderful people", "text for a column, this should be a text column!!", "text string", "hello world"],
                             "another text": ["ladidididididida", "cats are great", "text for a column, this should be a text column!!", "text string", "goodbye world"]})
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)
    candidate_estimators = get_estimators(problem_type=problem_type)

    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Swap in a three-label target; this replaces any woodwork wrapper made above.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        assert isinstance(pipeline, type(base_class))
        assert pipeline.custom_hyperparameters is None

        if is_time_series(problem_type):
            delayed = [DelayedFeatureTransformer]
        else:
            delayed = []
        if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
            estimator_tail = [StandardScaler, estimator_class]
        else:
            estimator_tail = [estimator_class]
        assert pipeline.component_graph == [TextFeaturizer] + delayed + estimator_tail
Beispiel #6
0
def test_make_pipeline_numpy_input(problem_type):
    """Check the pipeline built from plain numpy inputs.

    The feature array contains an all-NaN column, so the expected graph is
    ``DropNullColumns``, ``Imputer``, then ``DelayedFeatureTransformer`` for
    time series problems, then ``StandardScaler`` for linear models, and
    finally the estimator.
    """
    features = np.array([
        [1, 2, 0, np.nan],
        [2, 2, 1, np.nan],
        [5, 1, np.nan, np.nan],
    ])
    target = np.array([0, 0, 1, 0])

    candidate_estimators = get_estimators(problem_type=problem_type)
    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Multiclass runs use a three-label pandas target instead of the numpy one.
        target = pd.Series([0, 2, 1, 2])

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        pipeline = make_pipeline(features, target, estimator_class, problem_type)
        assert isinstance(pipeline, type(base_class))

        delayed = [DelayedFeatureTransformer] if is_time_series(problem_type) else []
        scaler = [StandardScaler] if estimator_class.model_family == ModelFamily.LINEAR_MODEL else []
        expected_graph = [DropNullColumns, Imputer] + delayed + scaler + [estimator_class]
        assert pipeline.component_graph == expected_graph
Beispiel #7
0
def test_make_pipeline_error():
    """Check that ``make_pipeline`` rejects custom hyperparameters that are not a dict.

    A list of dicts is passed for every binary estimator and a ``ValueError``
    with the documented message is expected each time.
    """
    features = pd.DataFrame([[0, 1], [1, 0]])
    target = pd.Series([1, 0])
    # Deliberately the wrong container type: a list wrapping the per-component dicts.
    bad_hyperparameters = [
        {"Imputer": {"numeric_imput_strategy": ["median"]}},
        {"One Hot Encoder": {"value1": ["value2"]}},
    ]
    expected_message = "if custom_hyperparameters provided, must be dictionary"

    for estimator in get_estimators(problem_type="binary"):
        with pytest.raises(ValueError, match=expected_message):
            make_pipeline(features, target, estimator, "binary", bad_hyperparameters)
def test_make_pipeline_no_nulls(input_type, problem_type):
    """Check the pipeline built for null-free numeric, categorical and datetime data.

    For ARIMA estimators the expected preprocessing is just ``Imputer``.
    Otherwise it is ``Imputer``, ``DateTimeFeaturizer`` and, for time series
    problems, ``DelayedFeatureTransformer``. The preprocessing is followed by
    ``OneHotEncoder`` (all families except CatBoost), ``StandardScaler``
    (linear models only) and the estimator.
    """
    features = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "categorical": ["a", "b", "a", "c", "c"],
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
    })
    target = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)

    candidate_estimators = get_estimators(problem_type=problem_type)
    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Swap in a three-label target; this replaces any woodwork wrapper made above.
        target = pd.Series([0, 2, 1, 2])

    # The same settings are supplied to both the pipeline and the baseline estimator.
    time_series_settings = {"date_index": "some dates", "gap": 1, "max_delay": 1}

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        time_series = is_time_series(problem_type)
        if time_series:
            parameters = {
                "pipeline": dict(time_series_settings),
                "Time Series Baseline Estimator": dict(time_series_settings),
            }
        else:
            parameters = {}

        pipeline = make_pipeline(features, target, estimator_class, problem_type, parameters)
        assert isinstance(pipeline, base_class)
        assert pipeline.custom_hyperparameters is None

        family = estimator_class.model_family
        delayed = [DelayedFeatureTransformer] if time_series else []
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []
        # CatBoost is the one family expected to run without a one-hot encoder.
        encoder = [] if family == ModelFamily.CATBOOST else [OneHotEncoder]
        estimator_tail = encoder + scaler + [estimator_class]

        if family == ModelFamily.ARIMA:
            preprocessing = [Imputer]
        else:
            preprocessing = [Imputer, DateTimeFeaturizer] + delayed
        assert pipeline.component_graph == preprocessing + estimator_tail
def test_make_pipeline_only_datetime_columns(input_type, problem_type):
    """Check the pipeline built when every feature column is a datetime.

    For ARIMA estimators no preprocessing is expected ahead of the optional
    scaler. Otherwise the graph starts with ``DateTimeFeaturizer`` and, for
    time series problems, ``DelayedFeatureTransformer``. ``StandardScaler``
    (linear models only) and the estimator follow.
    """
    features = pd.DataFrame({
        "some dates": pd.date_range('2000-02-03', periods=5, freq='W'),
        "some other dates": pd.date_range('2000-05-19', periods=5, freq='W'),
    })
    target = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        features = ww.DataTable(features)
        target = ww.DataColumn(target)
    candidate_estimators = get_estimators(problem_type=problem_type)

    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Swap in a three-label target; this replaces any woodwork wrapper made above.
        target = pd.Series([0, 2, 1, 2])

    # The same settings are supplied to both the pipeline and the baseline estimator.
    time_series_settings = {"date_index": "some dates", "gap": 1, "max_delay": 1}

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        time_series = is_time_series(problem_type)
        if time_series:
            parameters = {
                "pipeline": dict(time_series_settings),
                "Time Series Baseline Estimator": dict(time_series_settings),
            }
        else:
            parameters = {}

        pipeline = make_pipeline(features, target, estimator_class, problem_type, parameters)
        assert isinstance(pipeline, base_class)
        assert pipeline.custom_hyperparameters is None

        family = estimator_class.model_family
        delayed = [DelayedFeatureTransformer] if time_series else []
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []

        if family == ModelFamily.ARIMA:
            preprocessing = []
        else:
            preprocessing = [DateTimeFeaturizer] + delayed
        assert pipeline.component_graph == preprocessing + scaler + [estimator_class]
Beispiel #10
0
def test_make_pipeline_custom_hyperparameters(problem_type):
    """Check that custom hyperparameters are stored on the generated pipeline.

    Estimators are selected for ``problem_type``, but each one is then
    exercised against every problem type it supports. When hyperparameters are
    supplied the pipeline must expose them unchanged; when omitted the
    pipeline's ``custom_hyperparameters`` must be falsy.
    """
    features = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                             "categorical": ["a", "b", "a", "c", "c"],
                             "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    imputer_overrides = {'Imputer': {'numeric_impute_strategy': ['median']}}
    target = pd.Series([0, 0, 1, 0, 0])

    for estimator_class in get_estimators(problem_type=problem_type):
        # A distinct loop name keeps the ``problem_type`` argument from being shadowed.
        for supported_type in estimator_class.supported_problem_types:
            with_overrides = make_pipeline(features, target, estimator_class, supported_type, imputer_overrides)
            assert with_overrides.custom_hyperparameters == imputer_overrides

            without_overrides = make_pipeline(features, target, estimator_class, supported_type)
            assert not without_overrides.custom_hyperparameters
def test_make_pipeline_numpy_input(problem_type):
    """Check the pipeline built from plain numpy inputs, including time series settings.

    The feature array contains an all-NaN column, so every expected graph
    starts with ``DropNullColumns`` and ``Imputer``. ``DelayedFeatureTransformer``
    follows for time series problems unless the estimator is ARIMA, then
    ``StandardScaler`` for linear models, and finally the estimator.
    """
    features = np.array([
        [1, 2, 0, np.nan],
        [2, 2, 1, np.nan],
        [5, 1, np.nan, np.nan],
    ])
    target = np.array([0, 0, 1, 0])

    candidate_estimators = get_estimators(problem_type=problem_type)
    base_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        # Multiclass runs use a three-label pandas target instead of the numpy one.
        target = pd.Series([0, 2, 1, 2])

    # No date index is named here; the same settings go to pipeline and baseline estimator.
    time_series_settings = {"date_index": None, "gap": 1, "max_delay": 1}

    for estimator_class in candidate_estimators:
        if problem_type not in estimator_class.supported_problem_types:
            continue

        time_series = is_time_series(problem_type)
        if time_series:
            parameters = {
                "pipeline": dict(time_series_settings),
                "Time Series Baseline Estimator": dict(time_series_settings),
            }
        else:
            parameters = {}

        pipeline = make_pipeline(features, target, estimator_class, problem_type, parameters)
        assert isinstance(pipeline, base_class)

        family = estimator_class.model_family
        scaler = [StandardScaler] if family == ModelFamily.LINEAR_MODEL else []
        # ARIMA pipelines are expected to skip the delayed-feature step.
        if time_series and family != ModelFamily.ARIMA:
            delayed = [DelayedFeatureTransformer]
        else:
            delayed = []
        expected_graph = [DropNullColumns, Imputer] + delayed + scaler + [estimator_class]
        assert pipeline.component_graph == expected_graph
def test_make_pipeline_samplers(problem_type, samplers, X_y_binary, X_y_multi,
                                X_y_regression, has_minimal_dependencies):
    """Check that ``make_pipeline`` places the requested sampler correctly.

    For regression problems any sampler request must raise ``ValueError``.
    Otherwise, when a sampler is requested, it must sit directly before the
    estimator, or before the ``Standard Scaler`` when one precedes the
    estimator. With minimal dependencies the component found there is expected
    to be named ``'Undersampler'`` whatever was requested. When no sampler is
    requested, no component name may contain ``'sampler'``.

    The expected name is held in its own variable so that every estimator is
    built with the originally requested ``samplers`` value. Previously the
    argument was overwritten inside the loop, so under minimal dependencies
    only the first estimator was called with the requested name.
    """
    if problem_type == 'binary':
        X, y = X_y_binary
    elif problem_type == 'multiclass':
        X, y = X_y_multi
    else:
        X, y = X_y_regression
    estimators = get_estimators(problem_type=problem_type)

    # Name expected in the component graph; differs from the request only
    # when running with minimal dependencies.
    expected_sampler = samplers
    if has_minimal_dependencies and samplers is not None:
        expected_sampler = 'Undersampler'

    for estimator in estimators:
        if problem_type == 'regression' and samplers is not None:
            with pytest.raises(ValueError,
                               match='Sampling is unsupported for'):
                make_pipeline(X,
                              y,
                              estimator,
                              problem_type,
                              sampler_name=samplers)
            continue

        pipeline = make_pipeline(X,
                                 y,
                                 estimator,
                                 problem_type,
                                 sampler_name=samplers)
        # Reaching this point with a sampler set implies a non-regression problem.
        if samplers is not None:
            # the sampler is added before the scaler if one exists
            if pipeline.component_graph[-2].name == 'Standard Scaler':
                assert pipeline.component_graph[-3].name == expected_sampler
            else:
                assert pipeline.component_graph[-2].name == expected_sampler
        else:
            assert not any('sampler' in comp.name
                           for comp in pipeline.component_graph)
def test_get_estimators(has_minimal_dependencies):
    """Check the estimator counts and error handling of ``get_estimators``.

    Five estimators per problem type are expected with minimal dependencies
    and eight otherwise. The linear-model filter yields two binary estimators
    in both cases, and an empty ``model_families`` list yields none. Invalid
    model families, a non-list ``model_families`` and an unknown problem type
    must raise ``RuntimeError``, ``TypeError`` and ``KeyError`` respectively.
    """
    expected_total = 5 if has_minimal_dependencies else 8

    assert len(get_estimators(problem_type=ProblemTypes.BINARY)) == expected_total
    linear_binary = get_estimators(problem_type=ProblemTypes.BINARY,
                                   model_families=[ModelFamily.LINEAR_MODEL])
    assert len(linear_binary) == 2
    for kind in (ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION):
        assert len(get_estimators(problem_type=kind)) == expected_total

    # An explicitly empty family list selects nothing for any problem type.
    for kind in (ProblemTypes.BINARY, ProblemTypes.MULTICLASS, ProblemTypes.REGRESSION):
        assert len(get_estimators(problem_type=kind, model_families=[])) == 0

    unknown_family_message = "Unrecognized model type for problem type"
    with pytest.raises(RuntimeError, match=unknown_family_message):
        get_estimators(problem_type=ProblemTypes.REGRESSION,
                       model_families=["random_forest", "none"])

    not_a_list_message = "model_families parameter is not a list."
    with pytest.raises(TypeError, match=not_a_list_message):
        get_estimators(problem_type=ProblemTypes.REGRESSION,
                       model_families='random_forest')

    with pytest.raises(KeyError):
        get_estimators(problem_type="Not A Valid Problem Type")