def test_with_improperly_defined_step(self, teardown):
    x = Input()
    y = DummyImproperlyDefined()(x)
    model = Model(x, y)

    with pytest.raises(RuntimeError):
        model.predict(iris.data)

def test_fit_predict_pipeline(teardown):
    x_data = iris.data
    y_t_data = iris.target
    random_state = 123
    n_components = 2

    # baikal way
    x = Input()
    y_t = Input()
    x_pca = PCA(n_components=n_components, random_state=random_state, name="pca")(x)
    y = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        random_state=random_state,
        name="logreg",
    )(x_pca, y_t)

    model = Model(x, y, y_t)
    y_pred_baikal = model.fit(x_data, y_t_data).predict(x_data)

    # traditional way
    pca = PCA(n_components=n_components, random_state=random_state)
    logreg = LogisticRegression(
        multi_class="multinomial", solver="lbfgs", random_state=random_state
    )
    x_data_transformed = pca.fit_transform(x_data)
    y_pred_traditional = logreg.fit(x_data_transformed, y_t_data).predict(
        x_data_transformed
    )

    assert_array_equal(y_pred_baikal, y_pred_traditional)

def test_fit_predict_ensemble(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    features = np.stack([logreg_pred, random_forest_pred], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)

def test_fit_predict_naive_stack(teardown):
    x_data = iris.data
    y_t_data = iris.target
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state, solver="liblinear")(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state, solver="liblinear")(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = LogisticRegression(random_state=random_state, solver="liblinear")
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    features = np.stack([logreg_pred, random_forest_pred], axis=1)
    stacked = LogisticRegression(random_state=random_state, solver="liblinear")
    stacked.fit(features, y_t_data)
    y_pred_traditional = stacked.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)

def test_nested_model(teardown):
    x_data = iris.data
    y_t_data = iris.target

    # Sub-model
    x = Input()
    y_t = Input()
    h = PCA(n_components=2)(x)
    y = LogisticRegression()(h, y_t)
    submodel = Model(x, y, y_t)

    # Model
    x = Input()
    y_t = Input()
    y = submodel(x, y_t)
    model = Model(x, y, y_t)

    with raises_with_cause(RuntimeError, NotFittedError):
        submodel.predict(x_data)

    model.fit(x_data, y_t_data)
    y_pred = model.predict(x_data)
    y_pred_sub = submodel.predict(x_data)

    assert_array_equal(y_pred, y_pred_sub)

def test_fit_params(teardown):
    x_data = iris.data
    y_t_data = iris.target
    random_state = 123
    n_components = 2

    sample_weight = y_t_data + 1  # Just weigh the classes differently
    fit_params = {"logreg__sample_weight": sample_weight}

    # baikal way
    x = Input()
    y_t = Input()
    x_pca = PCA(n_components=n_components, random_state=random_state, name="pca")(x)
    y = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        random_state=random_state,
        name="logreg",
    )(x_pca, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data, **fit_params)

    # traditional way
    pca = PCA(n_components=n_components, random_state=random_state)
    logreg = LogisticRegression(
        multi_class="multinomial", solver="lbfgs", random_state=random_state
    )
    pipe = Pipeline([("pca", pca), ("logreg", logreg)])
    pipe.fit(x_data, y_t_data, **fit_params)

    # Use assert_allclose instead of all equal due to small numerical differences
    # between fit_transform(...) and fit(...).transform(...)
    assert_allclose(model.get_step("logreg").coef_, pipe.named_steps["logreg"].coef_)

def test_get_params(teardown):
    dummy1 = DummyEstimator(name="dummy1")
    dummy2 = DummyEstimator(x=456, y="def", name="dummy2")
    concat = Concatenate(name="concat")  # a step without get_params/set_params

    # a meaningless pipeline that contains shared steps
    x1 = Input()
    x2 = Input()
    h = dummy1(x1)
    c = concat([x1, h])
    y1 = dummy2(c)
    y2 = dummy2(x2, compute_func=lambda X: X * 2, trainable=False)
    model = Model([x1, x2], [y1, y2])

    # dummy1 was built with DummyEstimator's defaults (x=123, y="abc");
    # nested params follow the "<step>__<param>" convention
    expected = {
        "dummy1": dummy1,
        "dummy2": dummy2,
        "concat": concat,
        "dummy1__x": 123,
        "dummy1__y": "abc",
        "dummy2__x": 456,
        "dummy2__y": "def",
    }

    params = model.get_params()
    assert params == expected

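# The "<step>__<param>" keys asserted above follow scikit-learn's
# nested-parameter convention, the same one used by Pipeline and
# GridSearchCV. For comparison, a plain sklearn pipeline exposes its
# parameters the same way (aliased imports to avoid shadowing the baikal
# step classes used elsewhere in this module):
from sklearn.decomposition import PCA as SkPCA
from sklearn.pipeline import Pipeline as SkPipeline

_pipe = SkPipeline([("pca", SkPCA(n_components=2))])
assert _pipe.get_params()["pca__n_components"] == 2
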
def test_with_undefined_target(self, teardown):
    x = Input()
    y = LogisticRegression()(x, trainable=True)
    model = Model(inputs=x, outputs=y)
    with raises_with_cause(RuntimeError, TypeError):
        # LogisticRegression.fit will be called with not enough arguments,
        # hence the TypeError
        model.fit(iris.data)

def test_fit_with_shared_step(self, teardown):
    x = Input()
    scaler = StandardScaler()
    z = scaler(x, compute_func="transform", trainable=True)
    y = scaler(z, compute_func="inverse_transform", trainable=False)
    model = Model(x, y)

    model.fit(np.array([1, 3, 1, 3]).reshape(-1, 1))
    # For the data [1, 3, 1, 3]: mean = 2.0 and (population) variance = 1.0
    assert (scaler.mean_, scaler.var_) == (2.0, 1.0)

def test_predict_with_shared_step(self, teardown):
    x1 = Input()
    x2 = Input()
    doubler = Lambda(lambda x: x * 2)
    y1 = doubler(x1)
    y2 = doubler(x2)
    model = Model([x1, x2], [y1, y2])
    assert model.predict([2, 3]) == [4, 6]

def test_lazy_model(teardown):
    x_data = np.array([[1, 2], [3, 4]])

    x = Input()
    model = Model(x, x)
    model.fit(x_data)  # nothing to fit
    x_pred = model.predict(x_data)

    assert_array_equal(x_pred, x_data)

def test_with_unnecessarily_defined_but_missing_target(self, teardown):
    x = Input()
    y_t = Input()
    pca = PCA()
    # The target passed to PCA is unnecessary (see notes in Step.__call__)
    y = pca(x, y_t, trainable=True)
    model = Model(inputs=x, outputs=y, targets=y_t)
    with pytest.raises(ValueError):
        # fails because of the model target specification and trainable=True
        model.fit(iris.data)

def test_single_input(step_class, teardown):
    x = Input()
    y = step_class()(x)
    model = Model(x, y)

    x_data = np.array([[1, 2], [3, 4]])

    if step_class is Stack:
        # stacking a single 2x2 array adds a new trailing axis
        assert_array_equal(x_data.reshape((2, 2, 1)), model.predict(x_data))
    else:
        assert_array_equal(x_data, model.predict(x_data))

def test_multiedge(teardown):
    x = Input()
    z1, z2 = DummySIMO()(x)
    y = DummyMISO()([z1, z2])
    model = Model(x, y)

    x_data = np.array([[1], [2]])
    y_out = model.predict(x_data)

    assert_array_equal(y_out, np.array([[2], [4]]))

def test_predict_with_not_fitted_steps(self, teardown):
    x_data = iris.data

    x = Input(name="x")
    xt = PCA(n_components=2)(x)
    y = LogisticRegression(multi_class="multinomial", solver="lbfgs")(xt)

    model = Model(x, y)
    with raises_with_cause(RuntimeError, NotFittedError):
        model.predict(x_data)

def test_fit_predict_with_shared_step(teardown):
    x = Input()
    scaler = StandardScaler()
    z = scaler(x, compute_func="transform", trainable=True)
    y = scaler(z, compute_func="inverse_transform", trainable=False)
    model = Model(x, y)

    X_data = np.array([1, 3, 1, 3]).reshape(-1, 1)
    model.fit(X_data)
    # transform followed by inverse_transform must round-trip the data
    assert_array_equal(model.predict(X_data), X_data)

def test_split(x, indices_or_sections, teardown):
    x1 = Input()
    ys = Split(indices_or_sections, axis=0)(x1)
    model = Model(x1, ys)

    y_expected = np.split(x, indices_or_sections, axis=0)
    y_pred = model.predict(x)
    y_pred = listify(y_pred)

    for actual, expected in safezip2(y_pred, y_expected):
        assert_array_equal(actual, expected)

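# `listify` and `safezip2` above are small test-suite utilities whose
# definitions are not part of this excerpt. A minimal sketch of the
# behavior this test relies on (the actual helpers may differ): listify
# wraps a non-list value in a list, and safezip2 zips two sequences after
# checking that their lengths match.
def listify_sketch(x):
    """Hypothetical stand-in for listify: ensure a list of outputs."""
    return x if isinstance(x, list) else [x]


def safezip2_sketch(seq1, seq2):
    """Hypothetical stand-in for safezip2: zip two same-length sequences."""
    if len(seq1) != len(seq2):
        raise ValueError("sequences have different lengths")
    return zip(seq1, seq2)
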
def test_with_non_fitted_non_trainable_step(self, teardown):
    x = Input()
    y_t = Input()
    z = PCA()(x, trainable=False)
    y = LogisticRegression()(z, y_t)
    model = Model(x, y, y_t)
    with raises_with_cause(RuntimeError, NotFittedError):
        # this will raise an error when calling compute
        # on PCA, which was flagged as trainable=False but
        # hasn't been fitted
        model.fit(iris.data, iris.target)

def test_stack(teardown):
    x1 = Input()
    x2 = Input()
    y = Stack(axis=1)([x1, x2])
    model = Model([x1, x2], y)

    x1_data = np.array([[1, 2], [10, 20]])
    x2_data = np.array([[3, 4], [30, 40]])
    y_expected = np.stack([x1_data, x2_data], axis=1)

    y_pred = model.predict([x1_data, x2_data])

    assert_array_equal(y_pred, y_expected)

def test_concatenate(teardown):
    x1 = Input()
    x2 = Input()
    y = Concatenate(axis=1)([x1, x2])
    model = Model([x1, x2], y)

    x1_data = np.array([[1, 2], [10, 20]])
    x2_data = np.array([[3, 4, 5], [30, 40, 50]])
    y_expected = np.concatenate([x1_data, x2_data], axis=1)

    y_pred = model.predict([x1_data, x2_data])

    assert_array_equal(y_pred, y_expected)

def test_with_unnecessary_inputs(self, teardown):
    x1 = Input()
    x2 = Input()
    y_t = Input()
    h = PCA()(x1)
    y = LogisticRegression()(h, y_t)

    with pytest.raises(ValueError):
        Model([x1, x2], y, y_t)

    with pytest.raises(ValueError):
        Model([x1, h], y, y_t)  # x1 is an unnecessary input upstream of h

def test_fit_and_predict_model_with_no_fittable_steps(teardown):
    X_data = np.array([[1, 2], [3, 4]])
    y_expected = np.array([[2, 4], [6, 8]])

    x = Input()
    y = DummySISO()(x)

    model = Model(x, y)
    model.fit(X_data)  # nothing to fit
    y_pred = model.predict(X_data)

    assert_array_equal(y_pred, y_expected)

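# DummySISO (like the other Dummy* helpers in these tests) is defined
# elsewhere in the suite. A minimal sketch consistent with the assertion
# above (output is 2 * input) can be built with baikal's make_step; the
# class and names below are illustrative, not the suite's actual helpers.
from baikal import make_step
from sklearn.base import BaseEstimator


class _DoublerSketch(BaseEstimator):
    """Hypothetical single-input/single-output estimator that doubles X."""

    def fit(self, X, y=None):
        return self  # stateless: nothing to fit

    def predict(self, X):
        return 2 * X


DummySISOSketch = make_step(_DoublerSketch)
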
def test_transformed_target(teardown):
    x = Input()
    y_t = Input()
    # fit the regression on log-transformed targets, predict back in the
    # original space through exp
    y_t_mod = Lambda(lambda y: np.log(y))(y_t)
    y_p_mod = LinearRegression()(x, y_t_mod)
    y_p = Lambda(lambda y: np.exp(y))(y_p_mod)
    model = Model(x, y_p, y_t)

    x_data = np.arange(4).reshape(-1, 1)
    y_t_data = np.exp(2 * x_data).ravel()
    model.fit(x_data, y_t_data)

    # y_t = exp(2 * x), so the regression sees log(y_t) = 2 * x and should
    # recover a coefficient of exactly 2
    assert_array_equal(model.get_step("LinearRegression_0").coef_, np.array([2.0]))

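# The pattern above (fit on log-targets, map predictions back through exp)
# is the same idea sklearn packages as TransformedTargetRegressor. A
# minimal equivalent for comparison, not part of the test suite (aliased
# import to avoid shadowing the baikal LinearRegression step):
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression as SkLinearRegression

_ttr = TransformedTargetRegressor(
    regressor=SkLinearRegression(), func=np.log, inverse_func=np.exp
)
# _ttr.fit(x_data, y_t_data) would learn on log(y) and predict exp(...) back
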
def test_fit_predict_standard_stack(teardown):
    # This uses the "standard" protocol where the 2nd level features
    # are the out-of-fold predictions of the 1st. It also appends the
    # original data to the 2nd level features.
    # See for example: https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html
    X_data, y_t_data = breast_cancer.data, breast_cancer.target
    X_train, X_test, y_t_train, y_t_test = train_test_split(
        X_data, y_t_data, test_size=0.2, random_state=0
    )
    random_state = 42

    # baikal way
    x = Input()
    y_t = Input()

    y_p1 = RandomForestClassifierOOF(n_estimators=10, random_state=random_state)(
        x, y_t, compute_func="predict_proba"
    )
    y_p1 = Lambda(lambda array: array[:, 1:])(y_p1)  # remove collinear feature

    x_scaled = StandardScaler()(x)
    y_p2 = LinearSVCOOF(random_state=random_state)(
        x_scaled, y_t, compute_func="decision_function"
    )

    stacked_features = ColumnStack()([x, y_p1, y_p2])
    y_p = LogisticRegression(solver="liblinear", random_state=random_state)(
        stacked_features, y_t
    )

    model = Model(x, y_p, y_t)
    model.fit(X_train, y_t_train)
    y_pred_baikal = model.predict(X_test)

    # traditional way
    estimators = [
        ("rf", RandomForestClassifier(n_estimators=10, random_state=random_state)),
        ("svr", make_pipeline(StandardScaler(), LinearSVC(random_state=random_state))),
    ]
    clf = sklearn.ensemble.StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(
            solver="liblinear", random_state=random_state
        ),
        passthrough=True,
    )
    y_pred_traditional = clf.fit(X_train, y_t_train).predict(X_test)

    assert_array_equal(y_pred_baikal, y_pred_traditional)

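# RandomForestClassifierOOF and LinearSVCOOF are test helpers (defined
# elsewhere in the suite) whose fit-time outputs are out-of-fold
# predictions, mirroring what sklearn's StackingClassifier computes
# internally via cross-validation. A rough sketch of the idea using
# sklearn's cross_val_predict (the helper name is hypothetical, and the
# import is aliased to avoid shadowing the baikal step class):
from sklearn.ensemble import RandomForestClassifier as SkRandomForestClassifier
from sklearn.model_selection import cross_val_predict


def oof_predict_proba_sketch(X, y, random_state=42):
    """Out-of-fold class probabilities for a random forest."""
    clf = SkRandomForestClassifier(n_estimators=10, random_state=random_state)
    return cross_val_predict(clf, X, y, method="predict_proba")
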
def test_columnstack(teardown):
    x1 = Input()
    x2 = Input()
    y = ColumnStack()([x1, x2])
    model = Model([x1, x2], y)

    x1_data = np.array([1, 10, 100])
    x2_data = np.array([2, 20, 200])
    y_expected = np.column_stack([x1_data, x2_data])

    y_pred = model.predict([x1_data, x2_data])

    assert_array_equal(y_pred, y_expected)

def test_fit_params_unhashable_step():
    class UnhashableStep(Step, sklearn.linear_model.LogisticRegression):
        def __eq__(self, other):
            pass

    x = Input()
    y_t = Input()
    y = UnhashableStep()(x, y_t)
    model = Model(x, y, y_t)

    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]

    model.fit(x_data, y_t_data)

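# Why UnhashableStep is unhashable: in Python 3, a class that defines
# __eq__ without also defining __hash__ gets __hash__ set to None, so its
# instances cannot be used as dict keys or set members. The test above
# checks that Model.fit copes with such steps. A standalone illustration:
class _WithEqOnly:
    def __eq__(self, other):
        return True


# hash(_WithEqOnly())  # raises TypeError: unhashable type: '_WithEqOnly'
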
def test_with_wrong_type(self, teardown):
    x = Input()
    y_t = Input()
    y = LogisticRegression()(x, y_t)

    wrong = np.zeros((10,))
    with pytest.raises(ValueError):
        Model(wrong, y, y_t)

    with pytest.raises(ValueError):
        Model(x, wrong, y_t)

    with pytest.raises(ValueError):
        Model(x, y, wrong)

def test_fit_compute(self, teardown):
    dummy_estimator_1 = DummyEstimator()
    dummy_estimator_2 = DummyEstimator()

    x = Input()
    y_t = Input()
    # fit_compute_func=None disables the fused fit+compute path for this
    # step, so fit and compute are invoked separately
    y_p1 = dummy_estimator_1(x, y_t, fit_compute_func=None)
    y_p2 = dummy_estimator_2(x, y_t)
    model = Model(x, [y_p1, y_p2], y_t)

    model.fit(iris.data, iris.target)

    assert dummy_estimator_1.fit_calls == 1
    assert dummy_estimator_1.fit_predict_calls == 0
    assert dummy_estimator_2.fit_calls == 0
    assert dummy_estimator_2.fit_predict_calls == 1

def test_fit_and_predict_model_with_no_fittable_steps_and_multiple_inputs(teardown):
    X1_data = np.array([[1, 2], [3, 4]])
    X2_data = np.array([[5, 6], [7, 8]])
    y_expected = np.array([[12, 16], [20, 24]])

    x1 = Input()
    x2 = Input()
    z = DummyMISO()([x1, x2])
    y = DummySISO()(z)

    model = Model([x1, x2], y)
    model.fit([X1_data, X2_data])  # nothing to fit
    y_pred = model.predict([X1_data, X2_data])

    assert_array_equal(y_pred, y_expected)

def build_model(step):
    x1 = Input()
    x2 = Input()
    y_t1 = Input()
    y_t2 = Input()
    y_p = step([x1, x2], [y_t1, y_t2])
    return Model([x1, x2], y_p, [y_t1, y_t2])