def test_pipeline_methods_preprocessing_svm():
    """Check output shapes of a (preprocessing + SVC) pipeline on iris.

    StandardScaler and PCA are each used in turn as the preprocessing step
    in front of one shared SVC instance.
    """
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    preprocessors = [
        StandardScaler(),
        PCA(n_components=2, svd_solver='randomized', whiten=True),
    ]
    svc = SVC(gamma='scale', probability=True, random_state=0,
              decision_function_shape='ovr')

    # Methods whose output has one column per class.
    per_class_methods = ('predict_proba', 'predict_log_proba',
                         'decision_function')

    for preprocessor in preprocessors:
        pipe = Pipeline([('preprocess', preprocessor), ('svc', svc)])
        pipe.fit(X, y)

        # predict yields one label per sample
        assert_equal(pipe.predict(X).shape, (n_samples, ))

        for method_name in per_class_methods:
            output = getattr(pipe, method_name)(X)
            assert_equal(output.shape, (n_samples, n_classes))

        pipe.score(X, y)
def test_predict_with_predict_params():
    """Pipeline.predict must hand extra keyword arguments to the last step."""
    steps = [('transf', Transf()), ('clf', DummyEstimatorParams())]
    pipe = Pipeline(steps)
    pipe.fit(None, None)

    pipe.predict(X=None, got_attribute=True)

    final_estimator = pipe.named_steps['clf']
    assert final_estimator.got_attribute
def test_pipeline_sample_weight_supported():
    """Pipeline.score should pass sample_weight through to the last step."""
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X, y=None)

    # (keyword arguments given to score, expected score)
    cases = [
        ({}, 3),
        ({'y': None}, 3),
        ({'y': None, 'sample_weight': None}, 3),
        ({'sample_weight': np.array([2, 3])}, 8),
    ]
    for score_kwargs, expected in cases:
        assert_equal(pipe.score(X, **score_kwargs), expected)
def test_nbytes_in_logs_when_log_callback_is_custom(caplog, steps):
    """With the custom callback, "nbytes=" is logged once per non-final step."""
    pipe = DebugPipeline(steps, log_callback=custom_log_callback)
    caplog.clear()
    with caplog.at_level(logging.INFO):
        pipe.fit(IRIS.data, IRIS.target)

    log_text = caplog.text
    # Every step except the last one is expected to produce one entry.
    expected_count = len(pipe.steps) - 1

    assert log_text, f'Log should be none empty: {log_text}'
    assert 'nbytes=' in log_text, f'"nbytes=" should be in: {log_text}'
    assert log_text.count('nbytes=') == expected_count, \
        f'"nbytes=" should be {expected_count} times in {log_text}'
def test_time_in_logs_when_log_callback_is_default(caplog, steps):
    """With the default callback, "time=" is logged once per non-final step.

    Parameters
    ----------
    caplog : pytest fixture capturing log records.
    steps : fixture providing the pipeline steps.
    """
    pipe = DebugPipeline(steps, log_callback='default')
    caplog.clear()
    with caplog.at_level(logging.INFO):
        pipe.fit(IRIS.data, IRIS.target)

    log_text = caplog.text
    # Every step except the last one is expected to produce one entry.
    expected_count = len(pipe.steps) - 1

    assert log_text, f'Log should be none empty: {log_text}'
    assert 'time=' in log_text, f'"time=" should be in: {log_text}'
    # Count the full "time=" token (the same one checked above) so that an
    # incidental "time" elsewhere in the captured text cannot skew the count.
    assert log_text.count('time=') == expected_count, \
        f'"time=" should be {expected_count} times in {log_text}'
def test_pipeline_init_tuple():
    """Pipeline accepts its steps given as a tuple instead of a list."""
    X = np.array([[1, 2]])
    steps = (('transf', Transf()), ('clf', FitParamT()))
    pipe = Pipeline(steps)

    def fit_and_score():
        pipe.fit(X, y=None)
        pipe.score(X)

    fit_and_score()

    # Same again after replacing the transformer step with None.
    pipe.set_params(transf=None)
    fit_and_score()
def test_output_shape_in_logs_when_log_callback_is_default(caplog, steps):
    """With the default callback, the data shape is logged per non-final step."""
    pipe = DebugPipeline(steps, log_callback='default')
    caplog.clear()
    with caplog.at_level(logging.INFO):
        pipe.fit(IRIS.data, IRIS.target)

    log_text = caplog.text
    expected_count = len(pipe.steps) - 1
    shape_str = f'shape={IRIS.data.shape}'

    assert log_text, f'Log should be none empty: {log_text}'
    assert shape_str in log_text, f'"{shape_str}" should be in {log_text}'
    assert log_text.count(shape_str) == expected_count, \
        f'"{shape_str}" should be {expected_count} times in {log_text}'
def test_step_name_in_logs_when_log_callback_is_default(caplog, steps):
    """With the default callback, each non-final step appears exactly once."""
    pipe = DebugPipeline(steps, log_callback='default')
    caplog.clear()
    with caplog.at_level(logging.INFO):
        pipe.fit(IRIS.data, IRIS.target)

    log_text = caplog.text
    assert log_text, f'Log should be none empty: {log_text}'

    # The final step is skipped: only the steps before it are checked.
    leading_steps = pipe.steps[:-1]
    for _, step in leading_steps:
        step_repr = str(step)
        assert step_repr in log_text, f'{step} should be in: {log_text}'
        assert log_text.count(step_repr) == 1, \
            f'{step} should be once in {log_text}'
def test_pipeline_with_cache_attribute():
    """Fitting works with DummyMemory and raises with WrongDummyMemory."""
    X = np.array([[1, 2]])

    def build(memory):
        return Pipeline([('transf', Transf()), ('clf', Mult())],
                        memory=memory)

    # Fitting with DummyMemory as the memory object must not raise.
    build(DummyMemory()).fit(X, y=None)

    # Fitting with WrongDummyMemory must raise a ValueError naming it.
    dummy = WrongDummyMemory()
    pipe = build(dummy)
    expected_message = ("'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='{}' instead.".format(dummy))
    assert_raises_regex(ValueError, expected_message, pipe.fit, X)
def test_pipeline_sample_weight_unsupported():
    """A None sample_weight is not forwarded; a real one raises TypeError."""
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())])
    pipe.fit(X, y=None)

    # Omitting sample_weight and passing None must give the same score.
    for score_kwargs in ({}, {'sample_weight': None}):
        assert_equal(pipe.score(X, **score_kwargs), 3)

    # An actual weight array reaches the final step, which rejects it.
    expected_message = (
        "score() got an unexpected keyword argument 'sample_weight'"
    )
    assert_raise_message(
        TypeError, expected_message,
        pipe.score, X, sample_weight=np.array([2, 3]))
def test_pipeline_methods_anova():
    """Smoke-test the pipeline methods with SelectKBest + LogisticRegression."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    # Anova feature selection followed by logistic regression
    logistic = LogisticRegression()
    anova_filter = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', anova_filter), ('logistic', logistic)])

    pipe.fit(X, y)
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_methods_pca_svm():
    """Smoke-test the pipeline methods with PCA + SVC."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    # PCA followed by a probabilistic SVC
    svc = SVC(gamma='scale', probability=True, random_state=0)
    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
    pipe = Pipeline([('pca', pca), ('svc', svc)])

    pipe.fit(X, y)
    for method_name in ('predict', 'predict_proba', 'predict_log_proba'):
        getattr(pipe, method_name)(X)
    pipe.score(X, y)
def test_pipeline_fit_params():
    """Fit parameters prefixed with a step name are routed to that step."""
    pipe = Pipeline([('transf', Transf()), ('clf', FitParamT())])
    pipe.fit(X=None, y=None, clf__should_succeed=True)

    # the classifier should return True
    assert pipe.predict(None)

    # the transformer's params should be left untouched
    transformer = pipe.named_steps['transf']
    assert transformer.a is None
    assert transformer.b is None

    # an invalid fit parameter should raise an error message
    expected_message = "fit() got an unexpected keyword argument 'bad'"
    assert_raise_message(TypeError, expected_message,
                         pipe.fit, None, None, clf__bad=True)
def test_pipeline_transform():
    """A pipeline ending in a transformer supports (inverse_)transform.

    The single-step PCA pipeline is compared against the bare PCA estimator.
    """
    X = load_iris().data
    pca = PCA(n_components=2, svd_solver='full')
    pipeline = Pipeline([('pca', pca)])

    # transform after fit, fit_transform, and the bare estimator must agree
    via_fit_then_transform = pipeline.fit(X).transform(X)
    via_fit_transform = pipeline.fit_transform(X)
    via_estimator = pca.fit_transform(X)
    assert_array_almost_equal(via_fit_then_transform, via_fit_transform)
    assert_array_almost_equal(via_fit_then_transform, via_estimator)

    # inverse_transform through the pipeline must match the bare estimator
    restored_by_pipeline = pipeline.inverse_transform(via_fit_then_transform)
    restored_by_estimator = pca.inverse_transform(via_fit_then_transform)
    assert_array_almost_equal(restored_by_pipeline, restored_by_estimator)
def test_no_logs_when_log_callback_is_None(caplog, steps):
    """Fitting with log_callback=None must leave the captured log empty."""
    pipe = DebugPipeline(steps, log_callback=None)
    caplog.clear()
    with caplog.at_level(logging.INFO):
        pipe.fit(IRIS.data, IRIS.target)

    log_text = caplog.text
    assert not log_text, f'Log should be empty: {log_text}'
def test_pipeline_memory():
    """Compare a pipeline using a joblib Memory against an uncached one.

    The transformer's ``timestamp_`` is recorded after the first cached fit
    and asserted unchanged after a refit and in a second pipeline that uses
    a different step name with the same Memory.
    """
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    cachedir = mkdtemp()

    def assert_same_outputs(reference, candidate):
        # Predictions, probabilities and score must be identical.
        assert_array_equal(reference.predict(X), candidate.predict(X))
        assert_array_equal(reference.predict_proba(X),
                           candidate.predict_proba(X))
        assert_array_equal(reference.predict_log_proba(X),
                           candidate.predict_log_proba(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))

    try:
        # Deal with change of API in joblib: before 0.12 the keyword for
        # the cache directory is ``cachedir``, afterwards ``location``.
        old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
        location_keyword = 'cachedir' if old_joblib else 'location'
        memory = Memory(verbose=10, **{location_keyword: cachedir})

        # Test with Transformer + SVC
        clf = SVC(gamma='scale', probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        ts = cached_pipe.named_steps['transf'].timestamp_

        # Check that cached_pipe and pipe yield identical results
        assert_same_outputs(pipe, cached_pipe)
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert_false(hasattr(transf, 'means_'))

        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_same_outputs(pipe, cached_pipe)
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_)

        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma='scale', probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe_2 and pipe yield identical results
        assert_same_outputs(pipe, cached_pipe_2)
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert_equal(ts, cached_pipe_2.named_steps['transf_2'].timestamp_)
    finally:
        shutil.rmtree(cachedir)
def test_set_pipeline_step_none():
    """Pipeline steps set to None are skipped by the pipeline methods.

    Three Mult steps (2, 3 and 5) are combined; individual steps are then
    replaced by None and the expected product is re-checked each time.
    """
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def build_pipeline():
        return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])

    def check_round_trip(pipe, expected):
        # fit_transform / predict give the product; inverse_transform
        # maps the product back to X.
        assert_array_equal([[expected]], pipe.fit_transform(X, y))
        assert_array_equal([expected], pipe.fit(X).predict(X))
        assert_array_equal(X, pipe.inverse_transform([[expected]]))

    pipeline = build_pipeline()
    check_round_trip(pipeline, 2 * 3 * 5)

    # Drop the middle step.
    pipeline.set_params(m3=None)
    check_round_trip(pipeline, 2 * 5)
    expected_params = {
        'steps': pipeline.steps,
        'm2': mult2,
        'm3': None,
        'last': mult5,
        'memory': None,
        'm2__mult': 2,
        'last__mult': 5,
        'log_callback': None,
    }
    assert_dict_equal(pipeline.get_params(deep=True), expected_params)

    # Drop the first step as well.
    pipeline.set_params(m2=None)
    check_round_trip(pipeline, 5)

    # for other methods, ensure no AttributeErrors on None:
    other_methods = [
        'predict_proba', 'predict_log_proba', 'decision_function',
        'transform', 'score'
    ]
    for method_name in other_methods:
        getattr(pipeline, method_name)(X)

    # Restore the first step.
    pipeline.set_params(m2=mult2)
    check_round_trip(pipeline, 2 * 5)

    # Final step set to None: mult2 and mult3 are active
    pipeline = build_pipeline()
    pipeline.set_params(last=None)
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError,
                         "'NoneType' object has no attribute 'predict'",
                         getattr, pipeline, 'predict')

    # Check None step at construction time
    pipeline = Pipeline([('m2', mult2), ('m3', None), ('last', mult5)])
    check_round_trip(pipeline, 2 * 5)