def test_with_improperly_defined_step(self, teardown):
    x = Input()
    y = DummyImproperlyDefined()(x)
    model = Model(x, y)

    with pytest.raises(RuntimeError):
        model.predict(iris.data)
def test_nested_model(teardown):
    x_data = iris.data
    y_t_data = iris.target

    # Sub-model
    x = Input()
    y_t = Input()
    h = PCA(n_components=2)(x)
    y = LogisticRegression()(h, y_t)
    submodel = Model(x, y, y_t)

    # Model
    x = Input()
    y_t = Input()
    y = submodel(x, y_t)
    model = Model(x, y, y_t)

    with raises_with_cause(RuntimeError, NotFittedError):
        submodel.predict(x_data)

    model.fit(x_data, y_t_data)
    y_pred = model.predict(x_data)
    y_pred_sub = submodel.predict(x_data)

    assert_array_equal(y_pred, y_pred_sub)
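# For reference, the sub-model above corresponds to a plain scikit-learn
# Pipeline of PCA followed by LogisticRegression. A minimal sketch, not part
# of the test suite, assuming the usual scikit-learn imports:
def _nested_model_sklearn_analogue(x_data, y_t_data):
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline

    pipe = make_pipeline(PCA(n_components=2), LogisticRegression())
    return pipe.fit(x_data, y_t_data).predict(x_data)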
def test_single_input(step_class, teardown):
    x = Input()
    y = step_class()(x)
    model = Model(x, y)

    x_data = np.array([[1, 2], [3, 4]])

    if step_class is Stack:
        assert_array_equal(x_data.reshape((2, 2, 1)), model.predict(x_data))
    else:
        assert_array_equal(x_data, model.predict(x_data))
def test_predict_with_not_fitted_steps(self, teardown):
    x_data = iris.data

    x = Input(name="x")
    xt = PCA(n_components=2)(x)
    y = LogisticRegression(multi_class="multinomial", solver="lbfgs")(xt)
    model = Model(x, y)

    with raises_with_cause(RuntimeError, NotFittedError):
        model.predict(x_data)
def test_fit_predict_ensemble(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    features = np.stack([logreg_pred, random_forest_pred], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_fit_predict_naive_stack(teardown):
    x_data = iris.data
    y_t_data = iris.target
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state, solver="liblinear")(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state, solver="liblinear")(features, y_t)
    model = Model(x, y, y_t)

    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = LogisticRegression(random_state=random_state, solver="liblinear")
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    features = np.stack([logreg_pred, random_forest_pred], axis=1)

    stacked = LogisticRegression(random_state=random_state, solver="liblinear")
    stacked.fit(features, y_t_data)
    y_pred_traditional = stacked.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_predict_with_shared_step(self, teardown):
    x1 = Input()
    x2 = Input()
    doubler = Lambda(lambda x: x * 2)
    y1 = doubler(x1)
    y2 = doubler(x2)
    model = Model([x1, x2], [y1, y2])
    assert model.predict([2, 3]) == [4, 6]
def test_lazy_model(teardown):
    x_data = np.array([[1, 2], [3, 4]])

    x = Input()
    model = Model(x, x)
    model.fit(x_data)  # nothing to fit
    x_pred = model.predict(x_data)

    assert_array_equal(x_pred, x_data)
def test_try_and_raise_with_cause(teardown):
    x = Input()
    y = DummyStepWithFaultyPredict(name="faultystep")(x)
    model = Model(x, y)

    with raises_with_cause(RuntimeError, KeyError):
        model.predict(123)

    x = Input()
    y = DummyStepWithFaultyFit(name="faultystep")(x)
    model = Model(x, y)

    with raises_with_cause(RuntimeError, ValueError):
        model.fit(123)

    x = Input()
    y = DummyStepWithFaultyFitPredict(name="faultystep")(x)
    model = Model(x, y)

    with raises_with_cause(RuntimeError, ValueError):
        model.fit(123)
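# raises_with_cause asserts both the type of the raised exception and the type
# of its __cause__ (the exception chained with "raise ... from ..."). A
# hypothetical minimal implementation, for reference; the suite's actual
# helper may differ:
import contextlib

@contextlib.contextmanager
def _raises_with_cause_sketch(expected_exception, expected_cause):
    try:
        yield
    except expected_exception as exc:
        assert isinstance(exc.__cause__, expected_cause)
    else:
        raise AssertionError("{} was not raised".format(expected_exception.__name__))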
def test_multiedge(teardown):
    x = Input()
    z1, z2 = DummySIMO()(x)
    y = DummyMISO()([z1, z2])
    model = Model(x, y)

    x_data = np.array([[1], [2]])
    y_out = model.predict(x_data)

    assert_array_equal(y_out, np.array([[2], [4]]))
def test_fit_predict_with_shared_step(teardown):
    x = Input()
    scaler = StandardScaler()
    z = scaler(x, compute_func="transform", trainable=True)
    y = scaler(z, compute_func="inverse_transform", trainable=False)
    model = Model(x, y)

    X_data = np.array([1, 3, 1, 3]).reshape(-1, 1)
    model.fit(X_data)
    assert_array_equal(model.predict(X_data), X_data)
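# The shared-step round trip above mirrors plain scikit-learn usage, where a
# single fitted scaler performs both the forward and the inverse
# transformation. A sketch for reference, not part of the test suite:
def _shared_scaler_sklearn_analogue(X_data):
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    z = scaler.fit_transform(X_data)
    return scaler.inverse_transform(z)  # recovers X_data (up to float precision)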
def test_split(x, indices_or_sections, teardown):
    x1 = Input()
    ys = Split(indices_or_sections, axis=0)(x1)
    model = Model(x1, ys)

    y_expected = np.split(x, indices_or_sections, axis=0)
    y_pred = model.predict(x)
    y_pred = listify(y_pred)

    for actual, expected in safezip2(y_pred, y_expected):
        assert_array_equal(actual, expected)
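# listify and safezip2 are small test utilities: the first normalizes a single
# output to a list, the second zips two sequences while checking they have the
# same length. Hypothetical minimal versions, for reference (the library's
# actual helpers may differ):
def _listify_sketch(x):
    return x if isinstance(x, list) else [x]

def _safezip2_sketch(seq1, seq2):
    if len(seq1) != len(seq2):
        raise ValueError("sequences must have the same length")
    return zip(seq1, seq2)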
def test_concatenate(teardown):
    x1 = Input()
    x2 = Input()
    y = Concatenate(axis=1)([x1, x2])
    model = Model([x1, x2], y)

    x1_data = np.array([[1, 2], [10, 20]])
    x2_data = np.array([[3, 4, 5], [30, 40, 50]])
    y_expected = np.concatenate([x1_data, x2_data], axis=1)
    y_pred = model.predict([x1_data, x2_data])

    assert_array_equal(y_pred, y_expected)
def test_fit_and_predict_model_with_no_fittable_steps(teardown):
    X_data = np.array([[1, 2], [3, 4]])
    y_expected = np.array([[2, 4], [6, 8]])

    x = Input()
    y = DummySISO()(x)

    model = Model(x, y)
    model.fit(X_data)  # nothing to fit
    y_pred = model.predict(X_data)

    assert_array_equal(y_pred, y_expected)
def test_stack(teardown):
    x1 = Input()
    x2 = Input()
    y = Stack(axis=1)([x1, x2])
    model = Model([x1, x2], y)

    x1_data = np.array([[1, 2], [10, 20]])
    x2_data = np.array([[3, 4], [30, 40]])
    y_expected = np.stack([x1_data, x2_data], axis=1)
    y_pred = model.predict([x1_data, x2_data])

    assert_array_equal(y_pred, y_expected)
def test_fit_predict_standard_stack(teardown):
    # This uses the "standard" protocol where the 2nd level features
    # are the out-of-fold predictions of the 1st. It also appends the
    # original data to the 2nd level features.
    # See for example: https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html
    X_data, y_t_data = breast_cancer.data, breast_cancer.target
    X_train, X_test, y_t_train, y_t_test = train_test_split(
        X_data, y_t_data, test_size=0.2, random_state=0
    )
    random_state = 42

    # baikal way
    x = Input()
    y_t = Input()

    y_p1 = RandomForestClassifierOOF(n_estimators=10, random_state=random_state)(
        x, y_t, compute_func="predict_proba"
    )
    y_p1 = Lambda(lambda array: array[:, 1:])(y_p1)  # remove collinear feature

    x_scaled = StandardScaler()(x)
    y_p2 = LinearSVCOOF(random_state=random_state)(
        x_scaled, y_t, compute_func="decision_function"
    )

    stacked_features = ColumnStack()([x, y_p1, y_p2])
    y_p = LogisticRegression(solver="liblinear", random_state=random_state)(
        stacked_features, y_t
    )

    model = Model(x, y_p, y_t)
    model.fit(X_train, y_t_train)
    y_pred_baikal = model.predict(X_test)

    # traditional way
    estimators = [
        ("rf", RandomForestClassifier(n_estimators=10, random_state=random_state)),
        ("svr", make_pipeline(StandardScaler(), LinearSVC(random_state=random_state))),
    ]
    clf = sklearn.ensemble.StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(
            solver="liblinear", random_state=random_state
        ),
        passthrough=True,
    )
    y_pred_traditional = clf.fit(X_train, y_t_train).predict(X_test)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
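# The OOF steps above feed the 2nd level with out-of-fold predictions, which
# in plain scikit-learn is what sklearn.model_selection.cross_val_predict
# computes (StackingClassifier does the same internally). An illustrative
# sketch of obtaining such 2nd-level training features; this is not the OOF
# steps' actual implementation:
def _oof_features_sketch(X_train, y_t_train, random_state=42):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_predict

    rf = RandomForestClassifier(n_estimators=10, random_state=random_state)
    # Each training sample gets a probability predicted by a model that did
    # not see that sample during fitting.
    return cross_val_predict(rf, X_train, y_t_train, method="predict_proba")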
def test_columnstack(teardown):
    x1 = Input()
    x2 = Input()
    y = ColumnStack()([x1, x2])
    model = Model([x1, x2], y)

    x1_data = np.array([1, 10, 100])
    x2_data = np.array([2, 20, 200])
    y_expected = np.column_stack([x1_data, x2_data])
    y_pred = model.predict([x1_data, x2_data])

    assert_array_equal(y_pred, y_expected)
def test_fit_and_predict_model_with_no_fittable_steps_multi_input(teardown):
    X1_data = np.array([[1, 2], [3, 4]])
    X2_data = np.array([[5, 6], [7, 8]])
    y_expected = np.array([[12, 16], [20, 24]])

    x1 = Input()
    x2 = Input()
    z = DummyMISO()([x1, x2])
    y = DummySISO()(z)

    model = Model([x1, x2], y)
    model.fit([X1_data, X2_data])  # nothing to fit
    y_pred = model.predict([X1_data, X2_data])

    assert_array_equal(y_pred, y_expected)
def test_lambda(teardown):
    def function(x1, x2, p1, p2=1):
        return p1 * x1, x2 / p2

    x = Input()
    y1, y2 = Lambda(function, n_outputs=2, p1=2, p2=2)([x, x])
    model = Model(x, [y1, y2])

    x_data = np.array([[1.0, 2.0], [3.0, 4.0]])
    y1_expected = np.array([[2.0, 4.0], [6.0, 8.0]])
    y2_expected = np.array([[0.5, 1.0], [1.5, 2.0]])

    y1_pred, y2_pred = model.predict(x_data)

    assert_array_equal(y1_pred, y1_expected)
    assert_array_equal(y2_pred, y2_expected)
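# For reference, the two model outputs above are exactly what the wrapped
# function returns when called directly with the bound parameters. A sketch,
# not part of the test suite:
def _lambda_direct_call_sketch(function, x_data):
    y1, y2 = function(x_data, x_data, p1=2, p2=2)
    return y1, y2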
def test_fit_predict_naive_stack_with_proba_features(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123
    n_estimators = 5

    # baikal way
    x = Input()
    y_t = Input()
    y_p1 = LogisticRegression(random_state=random_state)(
        x, y_t, compute_func="predict_proba"
    )
    y_p2 = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)(
        x, y_t, compute_func="apply"
    )
    y_p1 = Lambda(compute_func=lambda array: array[:, 1:])(y_p1)
    y_p2 = Lambda(compute_func=lambda array: array[:, 1:])(y_p2)
    features = Concatenate(axis=1)([y_p1, y_p2])
    y_p = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y_p, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    features = np.concatenate(
        [logreg_proba[:, 1:], random_forest_leafidx[:, 1:]], axis=1
    )
    stacked = LogisticRegression(random_state=random_state)
    stacked.fit(features, y_t_data)
    y_pred_traditional = stacked.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
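# For reference, RandomForestClassifier.apply returns the index of the leaf
# each sample lands in, one column per tree, so its output has shape
# (n_samples, n_estimators). A sketch verifying that shape, not part of the
# test suite:
def _apply_shape_sketch(x_data, y_t_data, n_estimators=5):
    from sklearn.ensemble import RandomForestClassifier

    rf = RandomForestClassifier(n_estimators=n_estimators).fit(x_data, y_t_data)
    assert rf.apply(x_data).shape == (x_data.shape[0], n_estimators)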
def test_fit_predict_ensemble_with_proba_features(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123
    n_estimators = 5

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(
        x, y_t, compute_func="predict_proba"
    )
    y2 = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)(
        x, y_t, compute_func="apply"
    )
    features = Concatenate(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    features = np.concatenate([logreg_proba, random_forest_leafidx], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
ys_t = Split(n_targets, axis=1)(y_t)
ys_p = []
for j, k in enumerate(order):
    x_stacked = ColumnStack()(inputs=[x, *ys_p[:j]])
    ys_t[k] = squeeze(ys_t[k])
    ys_p.append(LogisticRegression(solver="lbfgs")(x_stacked, ys_t[k]))

ys_p = [ys_p[order.index(j)] for j in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(inputs=x, outputs=y_p, targets=y_t)

# This might take a few seconds
plot_model(model, filename="classifier_chain.png", dpi=96)

# ------- Train model
model.fit(X_train, Y_train)

# ------- Evaluate model
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

print(
    "Jaccard score on train data:",
    jaccard_score(Y_train, Y_train_pred, average="samples"),
)
print(
    "Jaccard score on test data:",
    jaccard_score(Y_test, Y_test_pred, average="samples"),
)
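# The chain built above is analogous to scikit-learn's ClassifierChain, which
# likewise feeds each classifier's prediction as an extra feature to the next
# one in the chain. A sketch for comparison, assuming the same order, X_train
# and Y_train as in the example:
def _classifier_chain_sklearn_analogue(X_train, Y_train, order):
    from sklearn.linear_model import LogisticRegression
    from sklearn.multioutput import ClassifierChain

    chain = ClassifierChain(LogisticRegression(solver="lbfgs"), order=order)
    return chain.fit(X_train, Y_train).predict(X_train)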
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model

# 1. Define a step
SVC = make_step(sklearn.svm.SVC)

# 2. Build the model
x = Input()
y_t = Input()
y_p = SVC(C=1.0, kernel="rbf", gamma=0.5)(x, y_t)

model = Model(x, y_p, y_t)
plot_model(model, filename="readme_quick_example.png")

# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=0
)

model.fit(X_train, y_train)

# 4. Use the model
y_test_pred = model.predict(X_test)
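# A natural follow-up, not part of the original example: score the prediction
# with scikit-learn's metrics.
from sklearn.metrics import accuracy_score

print("Test accuracy:", accuracy_score(y_test, y_test_pred))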
y3 = LogisticRegression()(z, y_t)

stacked_features = Stack()([y1, y2, y3])
y_p = SVC()(stacked_features, y_t)

model = Model([x1, x2], y_p, y_t)
plot_model(model, filename="multiple_input_nonlinear_pipeline_example_plot.png")

# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=0
)

# Let's suppose the dataset is originally split in two
X1_train, X2_train = X_train[:, :15], X_train[:, 15:]
X1_test, X2_test = X_test[:, :15], X_test[:, 15:]

model.fit([X1_train, X2_train], y_train)

# 4. Use the model
y_test_pred = model.predict([X1_test, X2_test])

# This also works:
# y_test_pred = model.predict({x1: X1_test, x2: X2_test})

# We can also query any intermediate outputs:
outs = model.predict(
    [X1_test, X2_test], output_names=["ExtraTreesClassifier_0:0/0", "PCA_0:0/0"]
)
def test_steps_cache(teardown):
    x1_data = iris.data[:, :2]
    x2_data = iris.data[:, 2:]
    y1_t_data = iris.target

    x1 = Input(name="x1")
    x2 = Input(name="x2")
    y1_t = Input(name="y1_t")
    y1 = LogisticRegression(name="LogReg")(x1, y1_t)
    y2 = PCA(name="PCA")(x2)

    hits, misses = 0, 0

    # 1) instantiation always misses
    misses += 1
    model = Model([x1, x2], [y1, y2], y1_t)
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 2) calling fit for the first time, hence a miss
    misses += 1
    model.fit([x1_data, x2_data], y1_t_data)
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 3) same as above, just different format, hence a hit
    hits += 1
    model.fit({x1: x1_data, x2: x2_data}, {y1_t: y1_t_data})
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 4) trainable flags are considered in cache keys, hence a miss
    misses += 1
    model.get_step("LogReg").trainable = False
    model.fit(
        [x1_data, x2_data], y1_t_data
    )  # NOTE: target is superfluous, but it affects caching
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 5) same as above, just different format, hence a hit
    hits += 1
    model.fit({x1: x1_data, x2: x2_data}, y1_t_data)
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 6) we drop the (superfluous) target, hence a miss
    misses += 1
    model.fit({x1: x1_data, x2: x2_data})
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 7) same as above, hence a hit
    hits += 1
    model.fit({x1: x1_data, x2: x2_data})
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 8) we restore the flag, becoming the same as 2) and 3), hence a hit
    hits += 1
    model.get_step("LogReg").trainable = True
    model.fit({x1: x1_data, x2: x2_data}, y1_t_data)
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 9) new inputs/targets/outputs signature, hence a miss
    misses += 1
    model.predict([x1_data, x2_data])
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 10) same inputs/outputs signature as 9), hence a hit
    hits += 1
    model.predict({"x1": x1_data, "x2": x2_data}, ["PCA:0/0", "LogReg:0/0"])
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 11) new inputs/outputs signature, hence a miss
    misses += 1
    model.predict({x1: x1_data}, "LogReg:0/0")
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses

    # 12) same as above, hence a hit
    hits += 1
    model.predict({x1: x1_data}, "LogReg:0/0")
    assert model._nodes_cache.hits == hits and model._nodes_cache.misses == misses