def test_pipeline_sub_estimators():
    """Check dcv.GridSearchCV against scikit-learn when whole steps are searched.

    The grid replaces entire pipeline steps (``setup``, ``scaling``, ``svc``)
    as well as tuning nested parameters.  Both searches are fitted on iris
    with the same pipeline and grid, and their best parameters,
    ``cv_results_`` (timing columns excluded) and best SVC coefficients must
    agree.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    scaling = Pipeline([('transform', ScalingTransformer())])

    pipe = Pipeline([
        ('setup', None),
        ('missing', None),
        ('scaling', scaling),
        ('svc', SVC(kernel='linear', random_state=0)),
    ])

    param_grid = [
        # Duplicates to test culling
        {'svc__C': [0.1, 0.1]},
        {
            'setup': [None],
            'svc__C': [0.1, 1, 10],
            'scaling': [ScalingTransformer(), None],
        },
        {
            'setup': [SelectKBest()],
            'setup__k': [1, 2],
            'svc': [
                SVC(kernel='linear', random_state=0, C=0.1),
                SVC(kernel='linear', random_state=0, C=1),
                SVC(kernel='linear', random_state=0, C=10),
            ],
        },
    ]

    gs = GridSearchCV(pipe, param_grid=param_grid, return_train_score=True)
    gs.fit(X, y)
    dgs = dcv.GridSearchCV(pipe, param_grid=param_grid, scheduler='sync',
                           return_train_score=True)
    dgs.fit(X, y)

    # Check best params match
    assert gs.best_params_ == dgs.best_params_

    # Check cv results match
    res = pd.DataFrame(dgs.cv_results_)
    sol = pd.DataFrame(gs.cv_results_)

    # TODO: Failures on Py36 / sklearn dev with order here.
    res = res.reindex(columns=sol.columns)
    # `pd.testing` is the public home of the assertion helpers;
    # `pd.util.testing` is a deprecated alias.
    pd.testing.assert_index_equal(res.columns, sol.columns)

    # Timings legitimately differ between the two implementations.
    skip = [
        'mean_fit_time', 'std_fit_time',
        'mean_score_time', 'std_score_time',
    ]
    res = res.drop(skip, axis=1)
    sol = sol.drop(skip, axis=1)
    # Compare with a floating-point tolerance and get a readable diff on
    # failure, instead of the all-or-nothing exact `DataFrame.equals`.
    pd.testing.assert_frame_equal(res, sol, check_exact=False)

    # Check SVC coefs match
    np.testing.assert_allclose(
        gs.best_estimator_.named_steps['svc'].coef_,
        dgs.best_estimator_.named_steps['svc'].coef_,
    )
def test_pipeline_feature_union():
    """Compare dcv.GridSearchCV with scikit-learn on a pipeline holding a FeatureUnion.

    The pipeline mixes real steps with ``None`` placeholders and with an
    empty nested Pipeline / FeatureUnion.  Both searches run on iris with
    ``cv=3`` and the same grid; the best parameters and the fitted
    PCA / SelectKBest / SVC attributes of the two best estimators must agree.
    """
    dataset = load_iris()
    X = dataset.data
    y = dataset.target

    union = FeatureUnion(
        [
            ("pca", PCA(random_state=0)),
            ("missing", None),
            ("kbest", SelectKBest()),
            ("empty_union", FeatureUnion([("first", None), ("second", None)])),
        ],
        transformer_weights={"pca": 0.5},
    )
    steps = [
        ("empty_pipeline", Pipeline([("first", None), ("second", None)])),
        ("scaling", Pipeline([("transform", ScalingTransformer())])),
        ("missing", None),
        ("union", union),
        ("svc", SVC(kernel="linear", random_state=0)),
    ]
    pipe = Pipeline(steps)

    param_grid = {
        "scaling__transform__factor": [1, 2],
        "union__pca__n_components": [1, 2, 3],
        "union__kbest__k": [1, 2],
        "svc__C": [0.1, 1, 10],
    }

    # NOTE(review): `iid` is passed to the scikit-learn search only; confirm
    # the scikit-learn versions under test still accept this keyword.
    sk_search = GridSearchCV(pipe, param_grid=param_grid, cv=3, iid=True)
    sk_search.fit(X, y)
    dask_search = dcv.GridSearchCV(
        pipe, param_grid=param_grid, scheduler="sync", cv=3
    )
    dask_search.fit(X, y)

    # Check best params match
    assert sk_search.best_params_ == dask_search.best_params_

    sk_steps = sk_search.best_estimator_.named_steps
    dask_steps = dask_search.best_estimator_.named_steps

    # Check PCA components (position 0) and SelectKBest scores (position 2)
    # match; transformer_list holds (name, transformer) pairs.
    for position, fitted_attr in ((0, "components_"), (2, "scores_")):
        sk_transformer = sk_steps["union"].transformer_list[position][1]
        dask_transformer = dask_steps["union"].transformer_list[position][1]
        np.testing.assert_allclose(
            getattr(sk_transformer, fitted_attr),
            getattr(dask_transformer, fitted_attr),
        )

    # Check SVC coefs match
    np.testing.assert_allclose(sk_steps["svc"].coef_, dask_steps["svc"].coef_)
def test_pipeline_feature_union():
    """Check dcv.GridSearchCV agrees with scikit-learn on a FeatureUnion pipeline.

    Both searches use their default cross-validation on iris with the same
    pipeline and parameter grid.  The test then compares the best parameters
    and the fitted attributes of the PCA, SelectKBest and SVC steps of the
    two best estimators.
    """
    dataset = load_iris()
    X, y = dataset.data, dataset.target

    def union_member(search, position):
        # transformer_list holds (name, transformer) pairs; return the
        # transformer at `position` inside the best estimator's union.
        fitted_union = search.best_estimator_.named_steps['union']
        return fitted_union.transformer_list[position][1]

    feature_union = FeatureUnion(
        [
            ('pca', PCA(random_state=0)),
            ('missing', None),
            ('kbest', SelectKBest()),
            ('empty_union', FeatureUnion([('first', None), ('second', None)])),
        ],
        transformer_weights={'pca': 0.5},
    )
    pipe = Pipeline([
        ('empty_pipeline', Pipeline([('first', None), ('second', None)])),
        ('scaling', Pipeline([('transform', ScalingTransformer())])),
        ('missing', None),
        ('union', feature_union),
        ('svc', SVC(kernel='linear', random_state=0)),
    ])

    param_grid = {
        'scaling__transform__factor': [1, 2],
        'union__pca__n_components': [1, 2, 3],
        'union__kbest__k': [1, 2],
        'svc__C': [0.1, 1, 10],
    }

    sk_search = GridSearchCV(pipe, param_grid=param_grid)
    sk_search.fit(X, y)
    dask_search = dcv.GridSearchCV(pipe, param_grid=param_grid,
                                   scheduler='sync')
    dask_search.fit(X, y)

    # Check best params match
    assert sk_search.best_params_ == dask_search.best_params_

    # Check PCA components match
    np.testing.assert_allclose(
        union_member(sk_search, 0).components_,
        union_member(dask_search, 0).components_,
    )

    # Check SelectKBest scores match
    np.testing.assert_allclose(
        union_member(sk_search, 2).scores_,
        union_member(dask_search, 2).scores_,
    )

    # Check SVC coefs match
    sk_svc = sk_search.best_estimator_.named_steps['svc']
    dask_svc = dask_search.best_estimator_.named_steps['svc']
    np.testing.assert_allclose(sk_svc.coef_, dask_svc.coef_)
def test_feature_union(weights):
    """Fit dcv.GridSearchCV over a grid of FeatureUnion configurations.

    For every combination of scaling factors and transformer weights, the
    expected output is computed by configuring a plain FeatureUnion and
    transforming ``X``.  The first expected row is handed to a
    ``CheckXClassifier`` placed in the same grid entry (presumably that
    classifier validates the ``X`` it receives -- confirm against its
    definition), and the whole grid is then fitted through the search.

    Parameters
    ----------
    weights : sequence or None
        Candidate ``transformer_weights`` values; a falsy value means the
        union is only exercised without weights.
    """
    X = np.ones((10, 5))
    y = np.zeros(10)

    def make_union():
        # Factory for a fresh, fully populated union of three transformers.
        return FeatureUnion(
            [
                ("tr0", ScalingTransformer()),
                ("tr1", ScalingTransformer()),
                ("tr2", ScalingTransformer()),
            ]
        )

    union = make_union()

    factors = [(2, 3, 5), (2, 4, 5), (2, 4, 6), (2, 4, None), (None, None, None)]
    grid = []
    for constants, w in product(factors, weights or [None]):
        p = {}
        for n, c in enumerate(constants):
            if c is None:
                p["tr%d" % n] = None
            elif n == 2:
                # 3rd is always an estimator.  `enumerate` is zero-based, so
                # the third transformer is index 2; the previous `n == 3`
                # check could never be true for these 3-tuples.
                p["tr%d" % n] = ScalingTransformer(c)
            else:
                p["tr%d__factor" % n] = c
        # Expected transform for this configuration, from a plain union.
        sol = union.set_params(transformer_weights=w, **p).transform(X)
        if w is not None:
            p["transformer_weights"] = w
        # Re-key the parameters for the pipeline and wrap each value in a
        # single-candidate list.
        p2 = {"union__" + k: [v] for k, v in p.items()}
        p2["est"] = [CheckXClassifier(sol[0])]
        grid.append(p2)

    # Need to recreate the union after setting estimators to `None` above
    pipe = Pipeline([("union", make_union()), ("est", CheckXClassifier())])
    gs = dcv.GridSearchCV(pipe, grid, refit=False, cv=2)

    # Record warnings raised during fitting rather than letting them surface.
    with warnings.catch_warnings(record=True):
        gs.fit(X, y)
def test_pipeline_sub_estimators():
    """Check dcv.GridSearchCV against scikit-learn when whole steps are searched.

    The grid replaces entire pipeline steps (``setup``, ``scaling``, ``svc``)
    as well as tuning nested parameters.  Both searches are fitted on iris
    with ``cv=3`` and the same grid; their best parameters, ``cv_results_``
    (timing columns excluded, compared with a tolerance) and best SVC
    coefficients must agree.
    """
    iris = load_iris()
    X, y = iris.data, iris.target

    scaling = Pipeline([("transform", ScalingTransformer())])

    pipe = Pipeline([
        ("setup", None),
        ("missing", None),
        ("scaling", scaling),
        ("svc", SVC(kernel="linear", random_state=0)),
    ])

    param_grid = [
        # Duplicates to test culling
        {"svc__C": [0.1, 0.1]},
        {
            "setup": [None],
            "svc__C": [0.1, 1, 10],
            "scaling": [ScalingTransformer(), None],
        },
        {
            "setup": [SelectKBest()],
            "setup__k": [1, 2],
            "svc": [
                SVC(kernel="linear", random_state=0, C=0.1),
                SVC(kernel="linear", random_state=0, C=1),
                SVC(kernel="linear", random_state=0, C=10),
            ],
        },
    ]

    # `iid` is a mapping of extra keyword arguments defined outside this
    # function; it is applied to the scikit-learn search only.
    gs = GridSearchCV(pipe, param_grid=param_grid, return_train_score=True,
                      cv=3, **iid)
    gs.fit(X, y)
    dgs = dcv.GridSearchCV(pipe, param_grid=param_grid, scheduler="sync",
                           return_train_score=True, cv=3)
    dgs.fit(X, y)

    # Check best params match
    assert gs.best_params_ == dgs.best_params_

    # Check cv results match
    res = pd.DataFrame(dgs.cv_results_)
    sol = pd.DataFrame(gs.cv_results_)

    # TODO: Failures on Py36 / sklearn dev with order here.
    res = res.reindex(columns=sol.columns)
    # `pd.testing` is the public home of the assertion helpers;
    # `pd.util.testing` is a deprecated alias.
    pd.testing.assert_index_equal(res.columns, sol.columns)

    # Timings legitimately differ between the two implementations.
    skip = [
        "mean_fit_time", "std_fit_time",
        "mean_score_time", "std_score_time",
    ]
    res = res.drop(skip, axis=1)
    sol = sol.drop(skip, axis=1)
    # NOTE(review): `check_less_precise` is deprecated in newer pandas in
    # favour of `rtol`/`atol`.  It is kept so the tolerance and compatibility
    # with older pandas are unchanged -- confirm the minimum supported pandas
    # before migrating.
    pd.testing.assert_frame_equal(res, sol, check_exact=False,
                                  check_less_precise=1)

    # Check SVC coefs match
    np.testing.assert_allclose(
        gs.best_estimator_.named_steps["svc"].coef_,
        dgs.best_estimator_.named_steps["svc"].coef_,
    )