def test_pipeline_methods_preprocessing_svm():
    # Test the various methods of the pipeline (preprocessing + svm).
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = PCA(n_components=2, svd_solver='randomized', whiten=True)
    clf = SVC(probability=True, random_state=0, decision_function_shape='ovr')

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert predict.shape == (n_samples, )

        proba = pipe.predict_proba(X)
        assert proba.shape == (n_samples, n_classes)

        log_proba = pipe.predict_log_proba(X)
        assert log_proba.shape == (n_samples, n_classes)

        decision_function = pipe.decision_function(X)
        assert decision_function.shape == (n_samples, n_classes)

        pipe.score(X, y)
def test_noise_sim(sim):
    p = Pipeline([('s', sim), ('c', DecisionTreeClassifier())])
    p.fit(X, y)
    p.predict(X)

    p = make_pipeline(sim, DecisionTreeClassifier())
    p.fit(X, y)
    p.predict(X)
def test_fit_predict_with_intermediate_fit_params():
    # tests that Pipeline passes fit_params to intermediate steps
    # when fit_predict is invoked
    pipe = Pipeline([('transf', TransfFitParams()), ('clf', FitParamT())])
    pipe.fit_predict(X=None,
                     y=None,
                     transf__should_get_this=True,
                     clf__should_succeed=True)
    assert pipe.named_steps['transf'].fit_params['should_get_this']
    assert pipe.named_steps['clf'].successful
    assert 'should_succeed' not in pipe.named_steps['transf'].fit_params
def test_pipeline_fit_transform():
    # Test whether pipeline works with a transformer missing fit_transform
    X = iris.data
    y = iris.target
    transf = Transf()
    pipeline = Pipeline([('mock', transf)])

    # test fit_transform:
    X_trans = pipeline.fit_transform(X, y)
    X_trans2 = transf.fit(X, y).transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
def test_pipeline_correctly_adjusts_steps(passthrough):
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    pipeline = Pipeline([('m2', mult2), ('bad', passthrough), ('m3', mult3),
                         ('m5', mult5)])

    pipeline.fit(X, y)
    expected_names = ['m2', 'bad', 'm3', 'm5']
    actual_names = [name for name, _ in pipeline.steps]
    assert expected_names == actual_names
def test_pipeline_named_steps():
    transf = Transf()
    mult2 = Mult(mult=2)
    pipeline = Pipeline([('mock', transf), ("mult", mult2)])

    # Test access via named_steps bunch object
    assert 'mock' in pipeline.named_steps
    assert 'mock2' not in pipeline.named_steps
    assert pipeline.named_steps.mock is transf
    assert pipeline.named_steps.mult is mult2

    # Test bunch with conflict attribute of dict
    pipeline = Pipeline([('values', transf), ("mult", mult2)])
    assert pipeline.named_steps.values is not transf
    assert pipeline.named_steps.mult is mult2
def test_pipeline_fit_params():
    # Test that the pipeline can take fit parameters
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)
    # classifier should return True
    assert pipe.predict(None)
    # and transformer params should not be changed
    assert pipe.named_steps['transf'].a is None
    assert pipe.named_steps['transf'].b is None
    # invalid parameters should raise an error message
    assert_raise_message(TypeError,
                         "fit() got an unexpected keyword argument 'bad'",
                         pipe.fit,
                         None,
                         None,
                         clf__bad=True)
def test_pipeline_sample_weight_supported():
    # Pipeline should pass sample_weight
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, y=None) == 3
    assert pipe.score(X, y=None, sample_weight=None) == 3
    assert pipe.score(X, sample_weight=np.array([2, 3])) == 8
def test_pipeline_transform():
    # Test whether pipeline works with a transformer at the end.
    # Also test pipeline.transform and pipeline.inverse_transform
    X = iris.data
    pca = PCA(n_components=2, svd_solver='full')
    pipeline = Pipeline([('pca', pca)])

    # test transform and fit_transform:
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)

    X_back = pipeline.inverse_transform(X_trans)
    X_back2 = pca.inverse_transform(X_trans)
    assert_array_almost_equal(X_back, X_back2)
def test_pipeline_slice():
    pipe = Pipeline([('transf1', Transf()), ('transf2', Transf()),
                     ('clf', FitParamT())])
    pipe2 = pipe[:-1]
    assert isinstance(pipe2, Pipeline)
    assert pipe2.steps == pipe.steps[:-1]
    assert 2 == len(pipe2.named_steps)
    assert_raises(ValueError, lambda: pipe[::-1])
def test_fit_predict_on_pipeline_without_fit_predict():
    # tests that a pipeline does not have fit_predict method when final
    # step of pipeline does not have fit_predict defined
    scaler = StandardScaler()
    pca = PCA(svd_solver='full')
    pipe = Pipeline([('scaler', scaler), ('pca', pca)])
    assert_raises_regex(AttributeError,
                        "'PCA' object has no attribute 'fit_predict'", getattr,
                        pipe, 'fit_predict')
def test_pipeline_index():
    transf = Transf()
    clf = FitParamT()
    pipe = Pipeline([('transf', transf), ('clf', clf)])
    assert pipe[0] == transf
    assert pipe['transf'] == transf
    assert pipe[-1] == clf
    assert pipe['clf'] == clf
    assert_raises(IndexError, lambda: pipe[3])
    assert_raises(KeyError, lambda: pipe['foobar'])
def test_fit_predict_on_pipeline():
    # test that the fit_predict method is implemented on a pipeline
    # test that the fit_predict on pipeline yields same results as applying
    # transform and clustering steps separately
    scaler = StandardScaler()
    km = KMeans(random_state=0)
    # As pipeline doesn't clone estimators on construction,
    # it must have its own estimators
    scaler_for_pipeline = StandardScaler()
    km_for_pipeline = KMeans(random_state=0)

    # first compute the transform and clustering step separately
    scaled = scaler.fit_transform(iris.data)
    separate_pred = km.fit_predict(scaled)

    # use a pipeline to do the transform and clustering in one step
    pipe = Pipeline([('scaler', scaler_for_pipeline),
                     ('Kmeans', km_for_pipeline)])
    pipeline_pred = pipe.fit_predict(iris.data)

    assert_array_almost_equal(pipeline_pred, separate_pred)
def test_pipe_grid():
    params = {
        'p__n_components': [2, 4],
        'd__n_neighbors': [2, 5],
        'c__threshold': [.3, .5],
        'c__classifier__max_depth': [2, 4]
    }
    p = Pipeline([('a', CCNoise(lcm)), ('s', StandardScaler()), ('p', PCA()),
                  ('d', KDN()), ('c', Filter(DecisionTreeClassifier()))])
    g = GridSearchCV(p, params, cv=CV1, error_score='raise')
    g.fit(X, y)
    g.predict(X)
def test_pipeline_wrong_memory():
    # Test that an error is raised when memory is not a string or a Memory
    # instance
    X = iris.data
    y = iris.target
    # Define memory as an integer
    memory = 1
    cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())],
                           memory=memory)
    assert_raises_regex(
        ValueError, "'memory' should be None, a string or"
        " have the same interface as joblib.Memory."
        " Got memory='1' instead.", cached_pipe.fit, X, y)
def test_stack_sim():
    p = Pipeline([
        ('a', UniformNoise(.3)),
        ('b', CCNoise(lcm)),
        ('c', DecisionTreeClassifier()),
    ])
    p.fit(X, y)
    p.predict(X)
def test_predict_with_predict_params():
    # tests that Pipeline passes predict_params to the final estimator
    # when predict is invoked
    pipe = Pipeline([('transf', Transf()), ('clf', DummyEstimatorParams())])
    pipe.fit(None, None)
    pipe.predict(X=None, got_attribute=True)

    assert pipe.named_steps['clf'].got_attribute
def test_pipeline_with_cache_attribute():
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())],
                    memory=DummyMemory())
    pipe.fit(X, y=None)
    dummy = WrongDummyMemory()
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())], memory=dummy)
    assert_raises_regex(
        ValueError, "'memory' should be None, a string or"
        " have the same interface as joblib.Memory."
        " Got memory='{}' instead.".format(dummy), pipe.fit, X)
def test_pipeline_sample_weight_unsupported():
    # When sample_weight is None it shouldn't be passed
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())])
    pipe.fit(X, y=None)
    assert pipe.score(X) == 3
    assert pipe.score(X, sample_weight=None) == 3
    assert_raise_message(
        TypeError,
        "score() got an unexpected keyword argument 'sample_weight'",
        pipe.score,
        X,
        sample_weight=np.array([2, 3]))
def test_set_pipeline_steps():
    transf1 = Transf()
    transf2 = Transf()
    pipeline = Pipeline([('mock', transf1)])
    assert pipeline.named_steps['mock'] is transf1

    # Directly setting attr
    pipeline.steps = [('mock2', transf2)]
    assert 'mock' not in pipeline.named_steps
    assert pipeline.named_steps['mock2'] is transf2
    assert [('mock2', transf2)] == pipeline.steps

    # Using set_params
    pipeline.set_params(steps=[('mock', transf1)])
    assert [('mock', transf1)] == pipeline.steps

    # Using set_params to replace single step
    pipeline.set_params(mock=transf2)
    assert [('mock', transf2)] == pipeline.steps

    # With invalid data
    pipeline.set_params(steps=[('junk', ())])
    assert_raises(TypeError, pipeline.fit, [[1]], [1])
    assert_raises(TypeError, pipeline.fit_transform, [[1]], [1])
def test_pipeline_raise_set_params_error():
    # Test pipeline raises set params error message for nested models.
    pipe = Pipeline([('cls', LinearRegression())])

    # expected error message
    error_msg = ('Invalid parameter %s for estimator %s. '
                 'Check the list of available parameters '
                 'with `estimator.get_params().keys()`.')

    assert_raise_message(ValueError,
                         error_msg % ('fake', pipe),
                         pipe.set_params,
                         fake='nope')

    # nested model check
    assert_raise_message(ValueError,
                         error_msg % ("fake", pipe),
                         pipe.set_params,
                         fake__estimator='nope')
def test_pipeline_score_samples_pca_lof():
    X = iris.data
    # Test that the score_samples method is implemented on a pipeline.
    # Test that the score_samples method on pipeline yields same results as
    # applying transform and score_samples steps separately.
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    lof = LocalOutlierFactor(novelty=True)
    pipe = Pipeline([('pca', pca), ('lof', lof)])
    pipe.fit(X)
    # Check the shapes
    assert pipe.score_samples(X).shape == (X.shape[0], )
    # Check the values
    lof.fit(pca.fit_transform(X))
    assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
Ejemplo n.º 23
0
def test_example_two():
    from skclean.simulate_noise import UniformNoise
    from skclean.detectors import KDN
    from skclean.handlers import Filter
    from skclean.pipeline import Pipeline, make_pipeline  # Importing from skclean, not sklearn
    from skclean.utils import load_data

    X, y = load_data('breast_cancer')

    clf = Pipeline([
        ('scale', StandardScaler()),  # Scale features
        ('feat_sel', VarianceThreshold(.2)),  # Feature selection
        ('detector', KDN()),  # Detect mislabeled samples
        ('handler', Filter(SVC())
         ),  # Filter out likely mislabeled samples and then train a SVM
    ])

    clf_g = GridSearchCV(clf, {'detector__n_neighbors': [2, 5, 10]})
    n_clf_g = make_pipeline(UniformNoise(.3),
                            clf_g)  # Create label noise at the very first step

    print(cross_val_score(n_clf_g, X, y,
                          cv=5).mean())  # 5-fold cross validation
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises_regex(
        TypeError, 'Last step of Pipeline should implement fit '
        'or be the string \'passthrough\''
        '.*NoFit.*', Pipeline, [('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    assert (pipe.get_params(deep=True) == dict(svc__a=None,
                                               svc__b=None,
                                               svc=clf,
                                               **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that estimators are not cloned on pipeline construction
    assert pipe.named_steps['anova'] is filter1
    assert pipe.named_steps['svc'] is clf

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    assert_raises_regex(
        TypeError, 'All intermediate steps should be transformers'
        '.*\\bNoTrans\\b.*', Pipeline, [('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = assert_no_warnings(clone, pipe)
    assert not pipe.named_steps['svc'] is pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that where copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2
def test_pipeline_methods_pca_svm():
    # Test the various methods of the pipeline (pca + svm).
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True, random_state=0)
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
def test_set_params_nested_pipeline():
    estimator = Pipeline([('a', Pipeline([('b', DummyRegressor())]))])
    estimator.set_params(a__b__alpha=0.001, a__b=Lasso())
    estimator.set_params(a__steps=[('b', LogisticRegression())], a__b__C=5)
 def make():
     return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])
def test_set_pipeline_step_passthrough(passthrough):
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=passthrough)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert (pipeline.get_params(deep=True) == {
        'steps': pipeline.steps,
        'm2': mult2,
        'm3': passthrough,
        'last': mult5,
        'memory': None,
        'm2__mult': 2,
        'last__mult': 5,
        'verbose': False
    })

    pipeline.set_params(m2=passthrough)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = [
        'predict_proba', 'predict_log_proba', 'decision_function', 'transform',
        'score'
    ]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=passthrough)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError,
                         "'str' object has no attribute 'predict'", getattr,
                         pipeline, 'predict')

    # Check 'passthrough' step at construction time
    exp = 2 * 5
    pipeline = Pipeline([('m2', mult2), ('m3', passthrough), ('last', mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
def test_pipeline_init_tuple():
    # Pipeline accepts steps as tuple
    X = np.array([[1, 2]])
    pipe = Pipeline((('transf', Transf()), ('clf', FitParamT())))
    pipe.fit(X, y=None)
    pipe.score(X)

    pipe.set_params(transf='passthrough')
    pipe.fit(X, y=None)
    pipe.score(X)