Beispiel #1
0
def test_stacked_different_input_pipelines_classification():
    """Input pipelines with differing problem types must raise ValueError."""
    multiclass_pipeline = make_pipeline_from_components(
        [RandomForestClassifier()], ProblemTypes.MULTICLASS)
    binary_pipeline = make_pipeline_from_components(
        [RandomForestClassifier()], ProblemTypes.BINARY)
    mixed_pipelines = [multiclass_pipeline, binary_pipeline]

    expected_message = "All pipelines must have the same problem type."
    with pytest.raises(ValueError, match=expected_message):
        StackedEnsembleClassifier(input_pipelines=mixed_pipelines)
def test_stacked_fit_predict_classification(X_y_binary, X_y_multi, stackable_classifiers, problem_type):
    """Fit a stacked ensemble classifier and sanity-check its outputs.

    The same checks run twice: first on an ensemble built without a
    ``final_estimator`` argument, then on one that is given a
    ``RandomForestClassifier`` as ``final_estimator``.
    """
    if problem_type == ProblemTypes.BINARY:
        (X, y), num_classes = X_y_binary, 2
    elif problem_type == ProblemTypes.MULTICLASS:
        (X, y), num_classes = X_y_multi, 3

    input_pipelines = [
        make_pipeline_from_components([estimator], problem_type)
        for estimator in stackable_classifiers
    ]

    def check_fit_and_predict(ensemble, features, target, expected_classes):
        # Fit, then verify container types, lengths/shapes and that the
        # outputs are not entirely NaN.
        ensemble.fit(features, target)

        predictions = ensemble.predict(features)
        assert len(predictions) == len(target)
        assert isinstance(predictions, ww.DataColumn)
        assert not np.isnan(predictions.to_series()).all()

        probabilities = ensemble.predict_proba(features)
        assert isinstance(probabilities, ww.DataTable)
        assert probabilities.shape == (len(target), expected_classes)
        assert not np.isnan(probabilities.to_dataframe()).all().all()

    check_fit_and_predict(
        StackedEnsembleClassifier(input_pipelines=input_pipelines, n_jobs=1),
        X, y, num_classes)
    check_fit_and_predict(
        StackedEnsembleClassifier(input_pipelines=input_pipelines,
                                  final_estimator=RandomForestClassifier(),
                                  n_jobs=1),
        X, y, num_classes)
def test_get_component(example_graph):
    """Look components up by name before and after the graph is instantiated."""
    component_graph = ComponentGraph(example_graph)
    lookup = component_graph.get_component

    # Before instantiate(): results are compared against component classes.
    assert lookup('OneHot_ElasticNet') == OneHotEncoder
    assert lookup('Logistic Regression') == LogisticRegressionClassifier

    with pytest.raises(ValueError, match='not in the graph'):
        lookup('Fake Component')

    parameters = {
        'OneHot_RandomForest': {'top_n': 3},
        'Random Forest': {'max_depth': 4, 'n_estimators': 50},
    }
    component_graph.instantiate(parameters)

    # After instantiate(): results are compared against component instances
    # built with the parameters passed above (or defaults where none given).
    assert lookup('OneHot_ElasticNet') == OneHotEncoder()
    assert lookup('OneHot_RandomForest') == OneHotEncoder(top_n=3)
    assert lookup('Random Forest') == RandomForestClassifier(
        n_estimators=50, max_depth=4)
Beispiel #4
0
def test_scikit_learn_wrapper_invalid_problem_type():
    """Wrapping a pipeline whose problem_type is None must raise ValueError."""
    pipeline = make_pipeline_from_components(
        [RandomForestClassifier()], ProblemTypes.MULTICLASS)
    # Overwrite the problem type; the wrapper call below is expected to
    # reject the pipeline.
    pipeline.problem_type = None

    expected_message = "Could not wrap EvalML object in scikit-learn wrapper."
    with pytest.raises(ValueError, match=expected_message):
        scikit_learn_wrapped_estimator(pipeline)
Beispiel #5
0
def test_get_estimators(example_graph):
    """get_estimators() raises before instantiation and lists estimators after."""
    graph = ComponentGraph(example_graph)
    with pytest.raises(ValueError, match='Cannot get estimators until'):
        graph.get_estimators()

    graph.instantiate({})
    estimators = graph.get_estimators()
    assert estimators == [
        RandomForestClassifier(),
        ElasticNetClassifier(),
        LogisticRegressionClassifier(),
    ]

    # A graph built only from 'Imputer' and 'One Hot Encoder' is expected
    # to report no estimators.
    transformer_graph = ComponentGraph.from_list(['Imputer', 'One Hot Encoder'])
    transformer_graph.instantiate({})
    assert transformer_graph.get_estimators() == []
Beispiel #6
0
def test_generate_code_errors():
    """generate_component_code must raise ValueError for a pipeline or a class."""
    expected_message = "Element must be a component instance"

    # A pipeline object is passed instead of a component instance.
    with pytest.raises(ValueError, match=expected_message):
        generate_component_code(
            make_pipeline_from_components([RandomForestClassifier()],
                                          ProblemTypes.BINARY))

    # Component classes (as opposed to instances) are passed.
    for component_class in (LinearRegressor, Imputer, ComponentBase):
        with pytest.raises(ValueError, match=expected_message):
            generate_component_code(component_class)
def test_iteration(example_graph):
    """Iterating a ComponentGraph yields classes, then instances once instantiated."""
    graph = ComponentGraph(example_graph)

    expected_classes = [
        Imputer,
        OneHotEncoder,
        ElasticNetClassifier,
        OneHotEncoder,
        RandomForestClassifier,
        LogisticRegressionClassifier,
    ]
    assert list(graph) == expected_classes

    graph.instantiate({'OneHot_RandomForest': {'top_n': 32}})
    expected_instances = [
        Imputer(),
        OneHotEncoder(),
        ElasticNetClassifier(),
        OneHotEncoder(top_n=32),
        RandomForestClassifier(),
        LogisticRegressionClassifier(),
    ]
    assert list(graph) == expected_instances
Beispiel #8
0
def test_serialization(X_y_binary, tmpdir, helper_functions):
    """Round-trip every component through save/load for each pickle protocol.

    For each class returned by ``all_components()`` the component is built,
    fitted on the binary fixture data, saved to a temporary file and loaded
    back. The loaded copy must match the original in ``parameters`` and
    ``describe(return_dict=True)``; for estimators other than the stacked
    ensembles, ``feature_importance`` must match as well.

    Args:
        X_y_binary: fixture unpacked into ``X, y`` training data.
        tmpdir: pytest temporary directory used for the pickle file.
        helper_functions: fixture providing
            ``safe_init_component_with_njobs_1``.
    """
    X, y = X_y_binary
    path = os.path.join(str(tmpdir), 'component.pkl')
    stacked_ensembles = (StackedEnsembleClassifier, StackedEnsembleRegressor)

    for component_class in all_components():
        print('Testing serialization of component {}'.format(
            component_class.name))
        try:
            component = helper_functions.safe_init_component_with_njobs_1(
                component_class)
        except EnsembleMissingPipelinesError:
            # The stacked ensembles need explicit input pipelines.
            if component_class == StackedEnsembleClassifier:
                input_pipelines = [
                    make_pipeline_from_components([RandomForestClassifier()],
                                                  ProblemTypes.BINARY)
                ]
            elif component_class == StackedEnsembleRegressor:
                input_pipelines = [
                    make_pipeline_from_components([RandomForestRegressor()],
                                                  ProblemTypes.REGRESSION)
                ]
            else:
                # Any other class raising this error would otherwise leave
                # `component` unbound (first iteration) or bound to the
                # previous iteration's component, silently testing the
                # wrong object. Surface the error instead.
                raise
            component = component_class(input_pipelines=input_pipelines,
                                        n_jobs=1)
        component.fit(X, y)

        for pickle_protocol in range(cloudpickle.DEFAULT_PROTOCOL + 1):
            component.save(path, pickle_protocol=pickle_protocol)
            loaded_component = ComponentBase.load(path)
            assert component.parameters == loaded_component.parameters
            assert (component.describe(return_dict=True) ==
                    loaded_component.describe(return_dict=True))
            # Feature importance is not compared for the stacked ensembles.
            if (issubclass(component_class, Estimator) and
                    not isinstance(component, stacked_ensembles)):
                assert (component.feature_importance ==
                        loaded_component.feature_importance).all()
Beispiel #9
0
def test_describe_component():
    """Check ``describe(return_dict=True)`` for transformers and estimators.

    Each component is compared against a dict holding its expected display
    name and parameters. The XGBoost, CatBoost and LightGBM sections are
    each wrapped in ``try/except ImportError`` and are passed over when
    that error is raised.
    """
    def expect_description(component, name, parameters):
        assert component.describe(return_dict=True) == {'name': name, 'parameters': parameters}

    # --- transformers ---
    one_hot = OneHotEncoder()
    imputer = Imputer()
    simple_imputer = SimpleImputer("mean")
    per_column_imputer = PerColumnImputer({"a": "mean", "b": ("constant", 100)})
    scaler = StandardScaler()
    select_clf = RFClassifierSelectFromModel(n_estimators=10, number_features=5,
                                             percent_features=0.3, threshold=-np.inf)
    select_reg = RFRegressorSelectFromModel(n_estimators=10, number_features=5,
                                            percent_features=0.3, threshold=-np.inf)
    drop_columns = DropColumns(columns=['col_one', 'col_two'])
    drop_null_columns = DropNullColumns()
    datetime_featurizer = DateTimeFeaturizer()
    text_featurizer = TextFeaturizer()
    lsa = LSA()
    pca = PCA()
    lda = LinearDiscriminantAnalysis()
    dfs = DFSTransformer()
    undersampler = Undersampler()

    expect_description(one_hot, 'One Hot Encoder', {
        'top_n': 10,
        'features_to_encode': None,
        'categories': None,
        'drop': 'if_binary',
        'handle_unknown': 'ignore',
        'handle_missing': 'error',
    })
    expect_description(imputer, 'Imputer', {
        'categorical_impute_strategy': "most_frequent",
        'categorical_fill_value': None,
        'numeric_impute_strategy': "mean",
        'numeric_fill_value': None,
    })
    expect_description(simple_imputer, 'Simple Imputer', {
        'impute_strategy': 'mean',
        'fill_value': None,
    })
    expect_description(per_column_imputer, 'Per Column Imputer', {
        'impute_strategies': {'a': 'mean', 'b': ('constant', 100)},
        'default_impute_strategy': 'most_frequent',
    })
    expect_description(scaler, 'Standard Scaler', {})
    expect_description(select_clf, 'RF Classifier Select From Model', {
        'number_features': 5,
        'n_estimators': 10,
        'max_depth': None,
        'percent_features': 0.3,
        'threshold': -np.inf,
        'n_jobs': -1,
    })
    expect_description(select_reg, 'RF Regressor Select From Model', {
        'number_features': 5,
        'n_estimators': 10,
        'max_depth': None,
        'percent_features': 0.3,
        'threshold': -np.inf,
        'n_jobs': -1,
    })
    expect_description(drop_columns, 'Drop Columns Transformer', {
        'columns': ['col_one', 'col_two'],
    })
    expect_description(drop_null_columns, 'Drop Null Columns Transformer', {
        'pct_null_threshold': 1.0,
    })
    expect_description(datetime_featurizer, 'DateTime Featurization Component', {
        'features_to_extract': ['year', 'month', 'day_of_week', 'hour'],
        'encode_as_categories': False,
    })
    expect_description(text_featurizer, 'Text Featurization Component', {})
    expect_description(lsa, 'LSA Transformer', {})
    expect_description(pca, 'PCA Transformer', {'n_components': None, 'variance': 0.95})
    expect_description(lda, 'Linear Discriminant Analysis Transformer', {'n_components': None})
    expect_description(dfs, 'DFS Transformer', {"index": "index"})
    expect_description(undersampler, 'Undersampler', {
        "balanced_ratio": 4,
        "min_samples": 100,
        "min_percentage": 0.1,
    })

    # --- estimators ---
    baseline_clf = BaselineClassifier()
    baseline_reg = BaselineRegressor()
    logistic_clf = LogisticRegressionClassifier()
    elastic_net_clf = ElasticNetClassifier()
    elastic_net_reg = ElasticNetRegressor()
    extra_trees_clf = ExtraTreesClassifier(n_estimators=10, max_features="auto")
    extra_trees_reg = ExtraTreesRegressor(n_estimators=10, max_features="auto")
    random_forest_clf = RandomForestClassifier(n_estimators=10, max_depth=3)
    random_forest_reg = RandomForestRegressor(n_estimators=10, max_depth=3)
    linear_reg = LinearRegressor()
    svm_clf = SVMClassifier()
    svm_reg = SVMRegressor()

    expect_description(baseline_clf, 'Baseline Classifier', {'strategy': 'mode'})
    expect_description(baseline_reg, 'Baseline Regressor', {'strategy': 'mean'})
    expect_description(logistic_clf, 'Logistic Regression Classifier', {
        'penalty': 'l2',
        'C': 1.0,
        'n_jobs': -1,
        'multi_class': 'auto',
        'solver': 'lbfgs',
    })
    expect_description(elastic_net_clf, 'Elastic Net Classifier', {
        'alpha': 0.5,
        'l1_ratio': 0.5,
        'n_jobs': -1,
        'max_iter': 1000,
        "loss": 'log',
        'penalty': 'elasticnet',
    })
    expect_description(elastic_net_reg, 'Elastic Net Regressor', {
        'alpha': 0.5,
        'l1_ratio': 0.5,
        'max_iter': 1000,
        'normalize': False,
    })
    expect_description(extra_trees_clf, 'Extra Trees Classifier', {
        'n_estimators': 10,
        'max_features': 'auto',
        'max_depth': 6,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'n_jobs': -1,
    })
    expect_description(extra_trees_reg, 'Extra Trees Regressor', {
        'n_estimators': 10,
        'max_features': 'auto',
        'max_depth': 6,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'n_jobs': -1,
    })
    expect_description(random_forest_clf, 'Random Forest Classifier', {
        'n_estimators': 10,
        'max_depth': 3,
        'n_jobs': -1,
    })
    expect_description(random_forest_reg, 'Random Forest Regressor', {
        'n_estimators': 10,
        'max_depth': 3,
        'n_jobs': -1,
    })
    expect_description(linear_reg, 'Linear Regressor', {
        'fit_intercept': True,
        'normalize': False,
        'n_jobs': -1,
    })
    expect_description(svm_clf, 'SVM Classifier', {
        'C': 1.0,
        'kernel': 'rbf',
        'gamma': 'scale',
        'probability': True,
    })
    expect_description(svm_reg, 'SVM Regressor', {
        'C': 1.0,
        'kernel': 'rbf',
        'gamma': 'scale',
    })

    # --- estimators guarded by ImportError ---
    try:
        xgb_clf = XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        xgb_reg = XGBoostRegressor(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75)
        expect_description(xgb_clf, 'XGBoost Classifier', {
            'eta': 0.1,
            'max_depth': 3,
            'min_child_weight': 1,
            'n_estimators': 75,
        })
        expect_description(xgb_reg, 'XGBoost Regressor', {
            'eta': 0.1,
            'max_depth': 3,
            'min_child_weight': 1,
            'n_estimators': 75,
        })
    except ImportError:
        pass

    try:
        catboost_clf = CatBoostClassifier()
        catboost_reg = CatBoostRegressor()
        expect_description(catboost_clf, 'CatBoost Classifier', {
            'allow_writing_files': False,
            'n_estimators': 10,
            'eta': 0.03,
            'max_depth': 6,
            'bootstrap_type': None,
            'silent': True,
        })
        expect_description(catboost_reg, 'CatBoost Regressor', {
            'allow_writing_files': False,
            'n_estimators': 10,
            'eta': 0.03,
            'max_depth': 6,
            'bootstrap_type': None,
            'silent': False,
        })
    except ImportError:
        pass

    try:
        lightgbm_clf = LightGBMClassifier()
        lightgbm_reg = LightGBMRegressor()
        expect_description(lightgbm_clf, 'LightGBM Classifier', {
            'boosting_type': 'gbdt',
            'learning_rate': 0.1,
            'n_estimators': 100,
            'max_depth': 0,
            'num_leaves': 31,
            'min_child_samples': 20,
            'n_jobs': -1,
            'bagging_fraction': 0.9,
            'bagging_freq': 0,
        })
        expect_description(lightgbm_reg, 'LightGBM Regressor', {
            'boosting_type': 'gbdt',
            'learning_rate': 0.1,
            'n_estimators': 20,
            'max_depth': 0,
            'num_leaves': 31,
            'min_child_samples': 20,
            'n_jobs': -1,
            'bagging_fraction': 0.9,
            'bagging_freq': 0,
        })
    except ImportError:
        pass
Beispiel #10
0
def test_make_pipeline_from_components(X_y_binary, logistic_regression_binary_pipeline_class):
    """Exercise make_pipeline_from_components: validation errors and results.

    Covers, in order:
      * argument validation (missing final estimator, unknown problem type,
        non-string custom name, non-instance components);
      * a two-component pipeline built with a custom name and random seed;
      * a pipeline built from a locally defined estimator class;
      * rebuilding an existing pipeline from its component instances and
        comparing predictions, feature importance, parameters and
        description with the original.

    Args:
        X_y_binary: fixture unpacked into ``X, y`` training data.
        logistic_regression_binary_pipeline_class: fixture providing a
            pipeline class that is instantiated with parameters and a seed.
    """
    with pytest.raises(ValueError, match="Pipeline needs to have an estimator at the last position of the component list"):
        make_pipeline_from_components([Imputer()], problem_type='binary')

    with pytest.raises(KeyError, match="Problem type 'invalid_type' does not exist"):
        make_pipeline_from_components([RandomForestClassifier()], problem_type='invalid_type')

    with pytest.raises(TypeError, match="Custom pipeline name must be a string"):
        make_pipeline_from_components([RandomForestClassifier()], problem_type='binary', custom_name=True)

    with pytest.raises(TypeError, match="Every element of `component_instances` must be an instance of ComponentBase"):
        make_pipeline_from_components([RandomForestClassifier], problem_type='binary')

    with pytest.raises(TypeError, match="Every element of `component_instances` must be an instance of ComponentBase"):
        make_pipeline_from_components(['RandomForestClassifier'], problem_type='binary')

    imp = Imputer(numeric_impute_strategy='median', random_seed=5)
    est = RandomForestClassifier(random_seed=7)
    pipeline = make_pipeline_from_components([imp, est], ProblemTypes.BINARY, custom_name='My Pipeline',
                                             random_seed=15)
    assert [c.__class__ for c in pipeline] == [Imputer, RandomForestClassifier]
    # Asserting on the bare list would always pass (a non-empty list is
    # truthy); all() actually checks every component's seed.
    assert all(c.random_seed == 15 for c in pipeline)
    assert pipeline.problem_type == ProblemTypes.BINARY
    assert pipeline.custom_name == 'My Pipeline'
    expected_parameters = {
        'Imputer': {
            'categorical_impute_strategy': 'most_frequent',
            'numeric_impute_strategy': 'median',
            'categorical_fill_value': None,
            'numeric_fill_value': None},
        'Random Forest Classifier': {
            'n_estimators': 100,
            'max_depth': 6,
            'n_jobs': -1}
    }
    assert pipeline.parameters == expected_parameters
    assert pipeline.random_seed == 15

    # Minimal estimator defined locally for this test.
    class DummyEstimator(Estimator):
        name = "Dummy!"
        model_family = "foo"
        supported_problem_types = [ProblemTypes.BINARY]
        parameters = {'bar': 'baz'}
    random_seed = 42
    pipeline = make_pipeline_from_components([DummyEstimator(random_seed=3)], ProblemTypes.BINARY,
                                             random_seed=random_seed)
    components_list = list(pipeline)
    assert len(components_list) == 1
    assert isinstance(components_list[0], DummyEstimator)
    # The component is expected to carry the pipeline-level seed (42),
    # not the seed it was constructed with (3).
    assert components_list[0].random_seed == random_seed
    expected_parameters = {'Dummy!': {'bar': 'baz'}}
    assert pipeline.parameters == expected_parameters
    assert pipeline.random_seed == random_seed

    X, y = X_y_binary
    pipeline = logistic_regression_binary_pipeline_class(parameters={"Logistic Regression Classifier": {"n_jobs": 1}},
                                                         random_seed=42)
    component_instances = list(pipeline)
    new_pipeline = make_pipeline_from_components(component_instances, ProblemTypes.BINARY)
    pipeline.fit(X, y)
    predictions = pipeline.predict(X)
    new_pipeline.fit(X, y)
    new_predictions = new_pipeline.predict(X)
    assert np.array_equal(predictions, new_predictions)
    assert np.array_equal(pipeline.feature_importance, new_pipeline.feature_importance)
    assert new_pipeline.name == 'Templated Pipeline'
    assert pipeline.parameters == new_pipeline.parameters
    for component, new_component in zip(pipeline._component_graph, new_pipeline._component_graph):
        assert isinstance(new_component, type(component))
    assert pipeline.describe() == new_pipeline.describe()