def test_fit_predict_pipeline(teardown):
    """A PCA -> logistic regression Model must predict like the bare estimators."""
    features, labels = iris.data, iris.target
    seed = 123
    n_dims = 2
    logreg_kwargs = dict(multi_class="multinomial", solver="lbfgs", random_state=seed)

    # baikal way: wire the steps into a graph and go through the Model API
    x = Input()
    y_t = Input()
    x_reduced = PCA(n_components=n_dims, random_state=seed, name="pca")(x)
    y = LogisticRegression(name="logreg", **logreg_kwargs)(x_reduced, y_t)
    model = Model(x, y, y_t)
    fitted_model = model.fit(features, labels)
    y_pred_baikal = fitted_model.predict(features)

    # traditional way: chain the estimators by hand
    pca = PCA(n_components=n_dims, random_state=seed)
    logreg = LogisticRegression(**logreg_kwargs)
    reduced = pca.fit_transform(features)
    fitted_logreg = logreg.fit(reduced, labels)
    y_pred_traditional = fitted_logreg.predict(reduced)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_fit_predict_naive_stack(teardown):
    """Stacking first-level class predictions in a Model must match doing it by hand."""
    data, labels = iris.data, iris.target
    seed = 123
    logreg_kwargs = dict(random_state=seed, solver="liblinear")

    # baikal way
    x = Input()
    y_t = Input()
    logreg_out = LogisticRegression(**logreg_kwargs)(x, y_t)
    forest_out = RandomForestClassifier(random_state=seed)(x, y_t)
    stacked_out = Stack(axis=1)([logreg_out, forest_out])
    y = LogisticRegression(**logreg_kwargs)(stacked_out, y_t)

    model = Model(x, y, y_t)
    model.fit(data, labels)
    y_pred_baikal = model.predict(data)

    # traditional way
    logreg = LogisticRegression(**logreg_kwargs)
    logreg.fit(data, labels)
    logreg_pred = logreg.predict(data)

    random_forest = RandomForestClassifier(random_state=seed)
    random_forest.fit(data, labels)
    random_forest_pred = random_forest.predict(data)

    second_level = np.stack([logreg_pred, random_forest_pred], axis=1)
    stacked = LogisticRegression(**logreg_kwargs)
    stacked.fit(second_level, labels)
    y_pred_traditional = stacked.predict(second_level)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_fit_predict_ensemble(teardown):
    """An ensemble built as a Model must predict like the hand-built equivalent.

    The iris data is restricted to the samples whose target is not 2, so both
    the baikal and the traditional pipeline are trained on a binary problem.
    """
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    # The mask must actually be applied to both data and target; otherwise the
    # full three-class problem is used and the mask is dead code.
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    # Second-level features: one column per first-level estimator's predictions.
    features = np.stack([logreg_pred, random_forest_pred], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_fit_params(teardown):
    """Fit params keyed as "logreg__sample_weight" must act as they do in a Pipeline."""
    data, labels = iris.data, iris.target
    seed = 123
    n_dims = 2
    weights = labels + 1  # Just weigh the classes differently
    routed_fit_params = {"logreg__sample_weight": weights}
    logreg_kwargs = dict(multi_class="multinomial", solver="lbfgs", random_state=seed)

    # baikal way
    x = Input()
    y_t = Input()
    x_reduced = PCA(n_components=n_dims, random_state=seed, name="pca")(x)
    y = LogisticRegression(name="logreg", **logreg_kwargs)(x_reduced, y_t)
    model = Model(x, y, y_t)
    model.fit(data, labels, **routed_fit_params)

    # traditional way
    pca = PCA(n_components=n_dims, random_state=seed)
    logreg = LogisticRegression(**logreg_kwargs)
    pipe = Pipeline([("pca", pca), ("logreg", logreg)])
    pipe.fit(data, labels, **routed_fit_params)

    # Use assert_allclose instead of all equal due to small numerical differences
    # between fit_transform(...) and fit(...).transform(...)
    baikal_coef = model.get_step("logreg").coef_
    pipeline_coef = pipe.named_steps["logreg"].coef_
    assert_allclose(baikal_coef, pipeline_coef)
def test_grid_search_cv_with_tunable_step():
    """Grid-search that swaps the whole "classifier" step, compared with a Pipeline.

    ``random_state``, ``cv``, ``verbose``, ``x_data`` and ``y_t_data`` are not
    defined in this function; they are resolved from an enclosing scope.
    """
    param_grid = {
        "classifier": [
            LogisticRegression(random_state=random_state),
            RandomForestClassifier(random_state=random_state),
        ],
        "pca__n_components": [2, 4],
    }
    search_kwargs = dict(
        cv=cv, scoring="accuracy", return_train_score=True, verbose=verbose
    )

    # baikal way
    def build_fn():
        x = Input()
        y_t = Input()
        h = PCA(random_state=random_state, name="pca")(x)
        y = LogisticRegression(random_state=random_state, name="classifier")(h, y_t)
        return Model(x, y, y_t)

    sk_model = SKLearnWrapper(build_fn)
    gscv_baikal = GridSearchCV(sk_model, param_grid, **search_kwargs)
    gscv_baikal.fit(x_data, y_t_data)

    # traditional way
    pca = PCA(random_state=random_state)
    classifier = LogisticRegression(random_state=random_state)
    pipe = Pipeline([("pca", pca), ("classifier", classifier)])
    gscv_traditional = GridSearchCV(pipe, param_grid, **search_kwargs)
    gscv_traditional.fit(x_data, y_t_data)

    assert gscv_baikal.best_params_ == gscv_traditional.best_params_
    # Same check for train scores first, then test scores.
    for score_key in ("mean_train_score", "mean_test_score"):
        assert_array_equal(
            gscv_traditional.cv_results_[score_key],
            gscv_baikal.cv_results_[score_key],
        )
def test_fit_predict_standard_stack(teardown):
    """A "standard" stacking Model must predict like sklearn's StackingClassifier."""
    # This uses the "standard" protocol where the 2nd level features
    # are the out-of-fold predictions of the 1st. It also appends the
    # original data to the 2nd level features.
    # See for example: https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html
    all_x, all_y = breast_cancer.data, breast_cancer.target
    X_train, X_test, y_t_train, _y_t_test = train_test_split(
        all_x, all_y, test_size=0.2, random_state=0
    )
    seed = 42

    # baikal way
    x = Input()
    y_t = Input()

    forest_proba = RandomForestClassifierOOF(n_estimators=10, random_state=seed)(
        x, y_t, compute_func="predict_proba"
    )
    # remove collinear feature
    forest_proba = Lambda(lambda array: array[:, 1:])(forest_proba)

    x_scaled = StandardScaler()(x)
    svc_scores = LinearSVCOOF(random_state=seed)(
        x_scaled, y_t, compute_func="decision_function"
    )

    stacked_features = ColumnStack()([x, forest_proba, svc_scores])
    y_p = LogisticRegression(solver="liblinear", random_state=seed)(
        stacked_features, y_t
    )

    model = Model(x, y_p, y_t)
    model.fit(X_train, y_t_train)
    y_pred_baikal = model.predict(X_test)

    # traditional way
    forest = RandomForestClassifier(n_estimators=10, random_state=seed)
    scaled_svc = make_pipeline(StandardScaler(), LinearSVC(random_state=seed))
    estimators = [("rf", forest), ("svr", scaled_svc)]
    clf = sklearn.ensemble.StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(solver="liblinear", random_state=seed),
        passthrough=True,
    )
    fitted_clf = clf.fit(X_train, y_t_train)
    y_pred_traditional = fitted_clf.predict(X_test)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def make_naive_stacked_model(n_components, random_state, x_data, y_t_data):
    """Build and fit an unnecessarily complex Model made of nested sub-models.

    Sub-model 1 is PCA followed by a logistic regression. Sub-model 2 stacks
    the outputs of a random forest and an extra-trees classifier and feeds them
    to a logistic regression. The outer Model stacks both sub-model outputs and
    trains a final logistic regression on them.

    Args:
        n_components: forwarded to the PCA step of sub-model 1.
        random_state: forwarded to every estimator step.
        x_data: data given to ``fit`` for the outer input.
        y_t_data: data given to ``fit`` for the outer target.

    Returns:
        The outer Model (named "stacked") after ``fit`` has been called on it.
    """
    multinomial_kwargs = dict(
        multi_class="multinomial", solver="lbfgs", random_state=random_state
    )

    # Sub-model 1
    sub1_x = Input(name="x1")
    sub1_y_t = Input(name="y1_t")
    sub1_h = PCA(
        n_components=n_components, random_state=random_state, name="pca_sub1"
    )(sub1_x)
    sub1_y = LogisticRegression(name="logreg_sub1", **multinomial_kwargs)(
        sub1_h, sub1_y_t
    )
    submodel1 = Model(sub1_x, sub1_y, sub1_y_t, name="submodel1")

    # Sub-model 2 (a nested stacked model)
    sub2_x = Input(name="x2")
    sub2_y_t = Input(name="y2_t")
    forest_out = RandomForestClassifier(
        random_state=random_state, name="rforest_sub2"
    )(sub2_x, sub2_y_t)
    extra_trees_out = ExtraTreesClassifier(
        random_state=random_state, name="extrees_sub2"
    )(sub2_x, sub2_y_t)
    sub2_features = Stack(axis=1, name="stack_sub2")([forest_out, extra_trees_out])
    sub2_y = LogisticRegression(name="logreg_sub2", **multinomial_kwargs)(
        sub2_features, sub2_y_t
    )
    submodel2 = Model(sub2_x, sub2_y, sub2_y_t, name="submodel2")

    # Stack of submodels
    x = Input(name="x")
    y_t = Input(name="y_t")
    out1 = submodel1(x, y_t)
    out2 = submodel2(x, y_t)
    top_features = Stack(axis=1, name="stack")([out1, out2])
    y = LogisticRegression(name="logreg_stacked", **multinomial_kwargs)(
        top_features, y_t
    )
    stacked_model_baikal = Model(x, y, y_t, name="stacked")

    stacked_model_baikal.fit(x_data, y_t_data)
    return stacked_model_baikal
def test_nested_model_stack(teardown):
    """The nested stacked Model must predict like the same stack built by hand."""
    data, labels = iris.data, iris.target
    seed = 123
    n_dims = 2
    multinomial_kwargs = dict(
        multi_class="multinomial", solver="lbfgs", random_state=seed
    )

    # ----------- baikal way
    stacked_model_baikal = make_naive_stacked_model(n_dims, seed, data, labels)
    y_pred_baikal = stacked_model_baikal.predict(data)

    # ----------- traditional way
    # Submodel 1
    submodel1 = LogisticRegression(**multinomial_kwargs)
    pca = PCA(n_components=n_dims, random_state=seed)
    pca.fit(data)
    reduced = pca.transform(data)
    submodel1.fit(reduced, labels)
    submodel1_pred = submodel1.predict(reduced)

    # Submodel 2 (a nested stacked model)
    random_forest = RandomForestClassifier(random_state=seed)
    random_forest.fit(data, labels)
    random_forest_pred = random_forest.predict(data)

    extra_trees = ExtraTreesClassifier(random_state=seed)
    extra_trees.fit(data, labels)
    extra_trees_pred = extra_trees.predict(data)

    tree_features = np.stack([random_forest_pred, extra_trees_pred], axis=1)
    submodel2 = LogisticRegression(**multinomial_kwargs)
    submodel2.fit(tree_features, labels)
    submodel2_pred = submodel2.predict(tree_features)

    # Stacked model
    top_features = np.stack([submodel1_pred, submodel2_pred], axis=1)
    stacked_model_traditional = LogisticRegression(**multinomial_kwargs)
    stacked_model_traditional.fit(top_features, labels)
    y_pred_traditional = stacked_model_traditional.predict(top_features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_set_params(self, teardown):
    """set_params must reject unknown names and apply known ones."""
    step = LogisticRegression()

    # An unknown parameter name is expected to be refused.
    with pytest.raises(ValueError):
        step.set_params(non_existent_param=42)

    step.set_params(C=100.0, fit_intercept=False, penalty="l1")
    params = step.get_params()

    # Full parameter set expected back: the three overrides above plus the
    # values this test assumes for every other parameter.
    expected = dict(
        C=100.0,
        class_weight=None,
        dual=False,
        fit_intercept=False,
        intercept_scaling=1,
        max_iter=100,
        multi_class="warn",
        n_jobs=None,
        penalty="l1",
        random_state=None,
        solver="warn",
        tol=0.0001,
        verbose=0,
        warm_start=False,
        l1_ratio=None,
    )

    assert expected == params
def test_with_steps_with_duplicated_names(self, teardown):
    """Building a Model from two steps that share a name must raise RuntimeError."""
    shared_name = "duplicated-name"
    x = Input()
    h = PCA(name=shared_name)(x)
    y = LogisticRegression(name=shared_name)(h)

    with pytest.raises(RuntimeError):
        Model(x, y)
def dataplaceholders(self):
    """Create two inputs, one target, and the outputs of a classifier and a PCA step.

    Returns:
        ``(x1, x2, y1, y2, y1_t)`` where ``y1`` is the LogisticRegression output
        for ``(x1, y1_t)`` and ``y2`` is the PCA output for ``x2``.
    """
    x1, x2 = Input(name="x1"), Input(name="x2")
    y1_t = Input(name="y1_t")
    logreg_out = LogisticRegression()(x1, y1_t)
    pca_out = PCA()(x2)
    return x1, x2, logreg_out, pca_out, y1_t
def test_nested_model(teardown):
    """Fitting an outer Model must leave the wrapped sub-model able to predict."""
    data, labels = iris.data, iris.target

    # Sub-model
    sub_x = Input()
    sub_y_t = Input()
    sub_h = PCA(n_components=2)(sub_x)
    sub_y = LogisticRegression()(sub_h, sub_y_t)
    submodel = Model(sub_x, sub_y, sub_y_t)

    # Model
    x = Input()
    y_t = Input()
    y = submodel(x, y_t)
    model = Model(x, y, y_t)

    # Nothing has been fitted yet, so predicting through the sub-model must fail.
    with raises_with_cause(RuntimeError, NotFittedError):
        submodel.predict(data)

    model.fit(data, labels)

    # After fitting the outer model, both must give the same predictions.
    assert_array_equal(model.predict(data), submodel.predict(data))
def dataplaceholders(self):
    """Create inputs, a target, a rescaled input, and classifier/PCA outputs.

    Returns:
        ``(x1, x2, x1_rescaled, y1, y2, y1_t)`` where ``x1_rescaled`` is the
        StandardScaler output for ``x1``, ``y1`` is the LogisticRegression
        output for ``(x1_rescaled, y1_t)`` and ``y2`` is the PCA output for ``x2``.
    """
    x1, x2 = Input(name="x1"), Input(name="x2")
    y1_t = Input(name="y1_t")
    scaled = StandardScaler()(x1)
    logreg_out = LogisticRegression()(scaled, y1_t)
    pca_out = PCA()(x2)
    return x1, x2, scaled, logreg_out, pca_out, y1_t
def test_with_undefined_target(self, teardown):
    """Fitting a trainable classifier step that was given no target must fail."""
    x = Input()
    logreg_out = LogisticRegression()(x, trainable=True)
    model = Model(inputs=x, outputs=logreg_out)

    # LogisticRegression.fit will be called with not enough arguments,
    # hence the TypeError
    with raises_with_cause(RuntimeError, TypeError):
        model.fit(iris.data)
def test_plot_independent_submodels(teardown, tmp_path, expand_nested):
    """Plot a Model made of three independent input -> step -> output branches.

    The first branch uses a bare LogisticRegression step; the other two use
    whatever ``build_submodel`` returns for a LogisticRegression and the branch
    index. The test has no assertions: it passes when ``plot_model`` completes
    without raising.
    """
    # The original also created a `steps` list that was never used; dropped.
    xs, y_ts, y_ps = [], [], []
    for i in range(3):
        if i == 0:
            step = LogisticRegression()
        else:
            step = build_submodel(LogisticRegression(), i)

        x = Input(name="x{}".format(i))
        y_t = Input(name="y_t{}".format(i))
        y_p = step(x, y_t)

        xs.append(x)
        y_ts.append(y_t)
        y_ps.append(y_p)

    model = Model(xs, y_ps, y_ts)
    filename = str(tmp_path / "test_plot_model.png")
    plot_model(model, filename, show=False, expand_nested=expand_nested)
def build_fn():
    """Return a new PCA ("pca") -> LogisticRegression ("classifier") Model.

    ``random_state`` is not defined here; it is resolved from an enclosing scope.
    """
    model_input = Input()
    model_target = Input()
    reduced = PCA(random_state=random_state, name="pca")(model_input)
    prediction = LogisticRegression(random_state=random_state, name="classifier")(
        reduced, model_target
    )
    return Model(model_input, prediction, model_target)
def test_compute_func(self, simple_step, shared_step, dataplaceholders, teardown):
    """compute_func is readable/writable on simple_step and raises elsewhere."""
    # On the simple step it starts as `predict` and can be reassigned.
    assert simple_step.compute_func == simple_step.predict
    simple_step.compute_func = simple_step.predict_proba
    assert simple_step.compute_func == simple_step.predict_proba

    # On the shared step both reading and writing must fail.
    with pytest.raises(AttributeError):
        getattr(shared_step, "compute_func")

    with pytest.raises(AttributeError):
        shared_step.compute_func = shared_step.predict_proba

    # Same for a step that hasn't been called.
    with pytest.raises(AttributeError):
        getattr(LogisticRegression(), "compute_func")

    with pytest.raises(AttributeError):
        LogisticRegression().compute_func = lambda x: x
def test_trainable(self, simple_step, shared_step, dataplaceholders, teardown):
    """trainable is readable/writable on simple_step and raises elsewhere."""
    # On the simple step the flag starts truthy and can be switched off.
    assert simple_step.trainable
    simple_step.trainable = False
    assert not simple_step.trainable

    # On the shared step both reading and writing must fail.
    with pytest.raises(AttributeError):
        getattr(shared_step, "trainable")

    with pytest.raises(AttributeError):
        shared_step.trainable = True

    # Same for a step that hasn't been called.
    with pytest.raises(AttributeError):
        getattr(LogisticRegression(), "trainable")

    with pytest.raises(AttributeError):
        LogisticRegression().trainable = False
def build_fn():
    """Return a new PCA ("pca") -> liblinear LogisticRegression ("logreg") Model.

    ``random_state`` is not defined here; it is resolved from an enclosing scope.
    """
    model_input = Input()
    model_target = Input()
    reduced = PCA(random_state=random_state, name="pca")(model_input)
    logreg_step = LogisticRegression(
        random_state=random_state, solver="liblinear", name="logreg"
    )
    prediction = logreg_step(reduced, model_target)
    return Model(model_input, prediction, model_target)
def test_fit_predict_naive_stack_with_proba_features(teardown):
    """Stacking probabilities and leaf indices in a Model must match the manual way."""
    # Reduce to binary problem to avoid ConvergenceWarning
    keep = iris.target != 2
    data = iris.data[keep]
    labels = iris.target[keep]
    seed = 123
    n_trees = 5

    def _drop_first_column(array):
        return array[:, 1:]

    # baikal way
    x = Input()
    y_t = Input()
    proba_out = LogisticRegression(random_state=seed)(
        x, y_t, compute_func="predict_proba"
    )
    leaves_out = RandomForestClassifier(n_estimators=n_trees, random_state=seed)(
        x, y_t, compute_func="apply"
    )
    proba_out = Lambda(compute_func=_drop_first_column)(proba_out)
    leaves_out = Lambda(compute_func=_drop_first_column)(leaves_out)
    joined = Concatenate(axis=1)([proba_out, leaves_out])
    y_p = LogisticRegression(random_state=seed)(joined, y_t)

    model = Model(x, y_p, y_t)
    model.fit(data, labels)
    y_pred_baikal = model.predict(data)

    # traditional way
    logreg = LogisticRegression(random_state=seed)
    logreg.fit(data, labels)
    logreg_proba = logreg.predict_proba(data)

    random_forest = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
    random_forest.fit(data, labels)
    random_forest_leafidx = random_forest.apply(data)

    second_level = np.concatenate(
        [logreg_proba[:, 1:], random_forest_leafidx[:, 1:]], axis=1
    )
    stacked = LogisticRegression(random_state=seed)
    stacked.fit(second_level, labels)
    y_pred_traditional = stacked.predict(second_level)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_outputs(self, simple_step, shared_step, dataplaceholders, teardown):
    """outputs is readable on simple_step and raises AttributeError elsewhere."""
    # Only the third-from-last placeholder is needed; the last two are unused.
    *_, y_simple, _shared_out_1, _shared_out_2 = dataplaceholders
    assert simple_step.outputs == [y_simple]

    with pytest.raises(AttributeError):
        getattr(shared_step, "outputs")

    # because the step hasn't been called
    with pytest.raises(AttributeError):
        getattr(LogisticRegression(), "outputs")
def test_targets(self, simple_step, shared_step, dataplaceholders, teardown):
    """targets is readable on simple_step and raises AttributeError elsewhere."""
    expected_target = dataplaceholders[2]
    assert simple_step.targets == [expected_target]

    with pytest.raises(AttributeError):
        getattr(shared_step, "targets")

    # because the step hasn't been called
    with pytest.raises(AttributeError):
        getattr(LogisticRegression(), "targets")
def test_predict_with_not_fitted_steps(self, teardown):
    """Predicting with a Model whose steps were never fitted must fail."""
    x = Input(name="x")
    reduced = PCA(n_components=2)(x)
    prediction = LogisticRegression(multi_class="multinomial", solver="lbfgs")(
        reduced
    )
    model = Model(x, prediction)

    with raises_with_cause(RuntimeError, NotFittedError):
        model.predict(iris.data)
def test_inputs(self, simple_step, shared_step, dataplaceholders, teardown):
    """inputs is readable on simple_step and raises AttributeError elsewhere."""
    expected_input = dataplaceholders[0]
    assert simple_step.inputs == [expected_input]

    with pytest.raises(AttributeError):
        getattr(shared_step, "inputs")

    # because the step hasn't been called
    with pytest.raises(AttributeError):
        getattr(LogisticRegression(), "inputs")
def test_grid_search_cv():
    """Grid-search over a wrapped Model must score like the equivalent Pipeline.

    ``random_state``, ``cv``, ``verbose``, ``x_data`` and ``y_t_data`` are not
    defined in this function; they are resolved from an enclosing scope.
    """
    param_grid = {
        "pca__n_components": [2, 4],
        "logreg__C": [0.1, 1.0, 10],
        "logreg__penalty": ["l1", "l2"],
    }
    search_kwargs = dict(
        cv=cv, scoring="accuracy", return_train_score=True, verbose=verbose
    )

    # baikal way
    def build_fn():
        x = Input()
        y_t = Input()
        h = PCA(random_state=random_state, name="pca")(x)
        logreg_step = LogisticRegression(
            random_state=random_state, solver="liblinear", name="logreg"
        )
        y = logreg_step(h, y_t)
        return Model(x, y, y_t)

    sk_model = SKLearnWrapper(build_fn)
    assert isinstance(sk_model.model, Model)

    gscv_baikal = GridSearchCV(sk_model, param_grid, **search_kwargs)
    gscv_baikal.fit(x_data, y_t_data)

    # traditional way
    pca = PCA(random_state=random_state)
    logreg = LogisticRegression(random_state=random_state, solver="liblinear")
    pipe = Pipeline([("pca", pca), ("logreg", logreg)])
    gscv_traditional = GridSearchCV(pipe, param_grid, **search_kwargs)
    gscv_traditional.fit(x_data, y_t_data)

    assert gscv_baikal.best_params_ == gscv_traditional.best_params_
    # Same check for train scores first, then test scores.
    for score_key in ("mean_train_score", "mean_test_score"):
        assert_array_equal(
            gscv_traditional.cv_results_[score_key],
            gscv_baikal.cv_results_[score_key],
        )
def test_with_non_fitted_non_trainable_step(self, teardown):
    """Fitting must fail when a step marked trainable=False was never fitted."""
    x = Input()
    y_t = Input()
    frozen_pca_out = PCA()(x, trainable=False)
    prediction = LogisticRegression()(frozen_pca_out, y_t)
    model = Model(x, prediction, y_t)

    # this will raise an error when calling compute
    # on PCA which was flagged as trainable=False but
    # hasn't been fitted
    with raises_with_cause(RuntimeError, NotFittedError):
        model.fit(iris.data, iris.target)
def test_with_unnecessary_inputs(self, teardown):
    """Model must reject input lists that contain an unnecessary input."""
    x1 = Input()
    x2 = Input()
    y_t = Input()
    h = PCA()(x1)
    y = LogisticRegression()(h, y_t)

    rejected_input_lists = (
        [x1, x2],
        [x1, h],  # x1 is an unnecessary input upstream of h
    )
    for model_inputs in rejected_input_lists:
        with pytest.raises(ValueError):
            Model(model_inputs, y, y_t)
def test_with_unnecessary_target(self, teardown):
    """Passing target data to fit must be accepted even for a non-trainable step."""
    data, labels = iris.data, iris.target

    x = Input()
    y_t = Input()
    logreg = LogisticRegression()
    y_p = logreg(x, y_t)
    model = Model(x, y_p, y_t)

    model.fit(data, labels)

    # won't require the target if trainable was set to False,
    # but won't complain if it was passed to fit
    logreg.trainable = False
    model.fit(data, labels)
def test_fit_predict_ensemble_with_proba_features(teardown):
    """An ensemble fed probabilities and leaf indices must match the manual way."""
    # Reduce to binary problem to avoid ConvergenceWarning
    keep = iris.target != 2
    data = iris.data[keep]
    labels = iris.target[keep]
    seed = 123
    n_trees = 5

    # baikal way
    x = Input()
    y_t = Input()
    proba_out = LogisticRegression(random_state=seed, function="predict_proba")(
        x, y_t
    )
    leaves_out = RandomForestClassifier(
        n_estimators=n_trees, random_state=seed, function="apply"
    )(x, y_t)
    joined = Concatenate(axis=1)([proba_out, leaves_out])
    y = LogisticRegression(random_state=seed)(joined, y_t)

    model = Model(x, y, y_t)
    model.fit(data, labels)
    y_pred_baikal = model.predict(data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=seed)
    logreg.fit(data, labels)
    logreg_proba = logreg.predict_proba(data)

    random_forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_trees, random_state=seed
    )
    random_forest.fit(data, labels)
    random_forest_leafidx = random_forest.apply(data)

    second_level = np.concatenate([logreg_proba, random_forest_leafidx], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=seed)
    ensemble.fit(second_level, labels)
    y_pred_traditional = ensemble.predict(second_level)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_with_wrong_type(self, teardown):
    """Model must reject a numpy array as inputs, outputs, or targets."""
    x = Input()
    y_t = Input()
    y = LogisticRegression()(x, y_t)

    wrong = np.zeros((10,))

    # Put the array in each positional slot in turn: inputs, outputs, targets.
    bad_argument_sets = (
        (wrong, y, y_t),
        (x, wrong, y_t),
        (x, y, wrong),
    )
    for model_args in bad_argument_sets:
        with pytest.raises(ValueError):
            Model(*model_args)