Example #1
0
def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
Example #2
0
def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
    # testing that all_null column is not considered categorical
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "num": [1, 2, 3, 4, 5]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
Example #3
0
def test_make_pipeline_no_column_names(input_type, problem_type):
    X = pd.DataFrame([[1, "a", np.nan], [2, "b", np.nan], [5, "b", np.nan]])
    y = pd.Series([0, 0, 1])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
Example #4
0
def test_make_pipeline_text_columns(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "text": ["string one", "another", "text for a column, this should be a text column!!", "text string", "hello world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)

    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [Imputer, TextFeaturizer] + delayed_features + estimator_components
Example #5
0
def test_make_pipeline_numpy_input(problem_type):
    X = np.array([[1, 2, 0, np.nan], [2, 2, 1, np.nan], [5, 1, np.nan,
                                                         np.nan]])
    y = np.array([0, 0, 1, 0])

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [
                DropNullColumns, Imputer
            ] + delayed_features + estimator_components
Example #6
0
def test_make_pipeline_only_text_columns(input_type, problem_type):
    X = pd.DataFrame({"text": ["string one", "the evalml team is full of wonderful people", "text for a column, this should be a text column!!", "text string", "hello world"],
                      "another text": ["ladidididididida", "cats are great", "text for a column, this should be a text column!!", "text string", "goodbye world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)

    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            standard_scaler = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                standard_scaler = [StandardScaler]
            assert pipeline.component_graph == [TextFeaturizer] + delayed_features + standard_scaler + [estimator_class]
def test_make_pipeline_no_nulls(input_type, problem_type):
    X = pd.DataFrame({
        "numerical": [1, 2, 3, 1, 2],
        "categorical": ["a", "b", "a", "c", "c"],
        "some dates":
        pd.date_range('2000-02-03', periods=5, freq='W')
    })
    y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            parameters = {}
            if is_time_series(problem_type):
                parameters = {
                    "pipeline": {
                        "date_index": "some dates",
                        "gap": 1,
                        "max_delay": 1
                    },
                    "Time Series Baseline Estimator": {
                        "date_index": "some dates",
                        "gap": 1,
                        "max_delay": 1
                    }
                }

            pipeline = make_pipeline(X, y, estimator_class, problem_type,
                                     parameters)
            assert isinstance(pipeline, pipeline_class)
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [
                    OneHotEncoder, StandardScaler, estimator_class
                ]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            if estimator_class.model_family == ModelFamily.ARIMA:
                assert pipeline.component_graph == [Imputer
                                                    ] + estimator_components
            else:
                assert pipeline.component_graph == [
                    Imputer, DateTimeFeaturizer
                ] + delayed_features + estimator_components
def test_make_pipeline_only_datetime_columns(input_type, problem_type):
    X = pd.DataFrame({
        "some dates":
        pd.date_range('2000-02-03', periods=5, freq='W'),
        "some other dates":
        pd.date_range('2000-05-19', periods=5, freq='W')
    })
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)

    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            parameters = {}
            if is_time_series(problem_type):
                parameters = {
                    "pipeline": {
                        "date_index": "some dates",
                        "gap": 1,
                        "max_delay": 1
                    },
                    "Time Series Baseline Estimator": {
                        "date_index": "some dates",
                        "gap": 1,
                        "max_delay": 1
                    }
                }

            pipeline = make_pipeline(X, y, estimator_class, problem_type,
                                     parameters)
            assert isinstance(pipeline, pipeline_class)
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            standard_scaler = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                standard_scaler = [StandardScaler]
            if estimator_class.model_family == ModelFamily.ARIMA:
                assert pipeline.component_graph == standard_scaler + [
                    estimator_class
                ]
            else:
                assert pipeline.component_graph == [
                    DateTimeFeaturizer
                ] + delayed_features + standard_scaler + [estimator_class]
def test_make_pipeline_numpy_input(problem_type):
    X = np.array([[1, 2, 0, np.nan], [2, 2, 1, np.nan], [5, 1, np.nan,
                                                         np.nan]])
    y = np.array([0, 0, 1, 0])

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            parameters = {}
            if is_time_series(problem_type):
                parameters = {
                    "pipeline": {
                        "date_index": None,
                        "gap": 1,
                        "max_delay": 1
                    },
                    "Time Series Baseline Estimator": {
                        "date_index": None,
                        "gap": 1,
                        "max_delay": 1
                    }
                }

            pipeline = make_pipeline(X, y, estimator_class, problem_type,
                                     parameters)
            assert isinstance(pipeline, pipeline_class)
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            else:
                estimator_components = [estimator_class]
            if estimator_class.model_family == ModelFamily.ARIMA:
                assert pipeline.component_graph == [DropNullColumns, Imputer
                                                    ] + estimator_components
            else:
                assert pipeline.component_graph == [
                    DropNullColumns, Imputer
                ] + delayed_features + estimator_components