def test_fit_predict_ensemble(teardown):
    """Check that a baikal stacked ensemble predicts like the hand-built one.

    A logistic regression and a random forest are trained on the same data,
    their predictions are stacked column-wise, and a second logistic
    regression is trained on the stacked predictions. The same pipeline is
    then reproduced with plain scikit-learn estimators and both sets of
    predictions must be equal.

    Args:
        teardown: Fixture requested by the test; it is defined outside this
            block.
    """
    # Reduce to binary problem to avoid ConvergenceWarning.
    # The mask must actually be applied to both the samples and the labels,
    # otherwise the test silently runs on the full three-class problem.
    mask = iris.target != 2
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    # Second-level features: one column per first-level classifier.
    features = np.stack([logreg_pred, random_forest_pred], axis=1)

    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_with_undefined_target(self, teardown):
    """Fitting a trainable step that was given no target must fail."""
    features = Input()
    prediction = LogisticRegression()(features, trainable=True)
    model = Model(inputs=features, outputs=prediction)

    # No target was connected to the step, so LogisticRegression.fit will be
    # called with not enough arguments, hence the TypeError that is expected
    # alongside the RuntimeError.
    with raises_with_cause(RuntimeError, TypeError):
        model.fit(iris.data)
def test_with_unnecessary_target(self, teardown):
    """Passing a target to fit is accepted even when the step is frozen."""
    features = Input()
    target = Input()
    classifier = LogisticRegression()
    prediction = classifier(features, target)
    model = Model(features, prediction, target)

    # First fit with the step still trainable.
    model.fit(iris.data, iris.target)

    # With trainable set to False the target is not required any more,
    # but fit must not complain if it is passed anyway.
    classifier.trainable = False
    model.fit(iris.data, iris.target)
def make_naive_stacked_model(n_components, random_state, x_data, y_t_data):
    """Build and fit an unnecessarily complex stacked model.

    The model stacks the outputs of two sub-models (the second of which is
    itself a stacked model) and feeds them to a final logistic regression.

    Args:
        n_components: Forwarded to the PCA step of the first sub-model.
        random_state: Forwarded to every step that accepts a random state.
        x_data: Input data the returned model is fitted on.
        y_t_data: Target data the returned model is fitted on.

    Returns:
        The fitted top-level model (named "stacked").
    """
    # Keyword arguments shared by every LogisticRegression step below.
    logreg_params = dict(
        multi_class="multinomial",
        solver="lbfgs",
        random_state=random_state,
    )

    # ------- Sub-model 1: PCA followed by a logistic regression
    sub1_in = Input(name="x1")
    sub1_target = Input(name="y1_t")
    sub1_reduced = PCA(
        n_components=n_components, random_state=random_state, name="pca_sub1"
    )(sub1_in)
    sub1_out = LogisticRegression(**logreg_params, name="logreg_sub1")(
        sub1_reduced, sub1_target
    )
    submodel1 = Model(sub1_in, sub1_out, sub1_target, name="submodel1")

    # ------- Sub-model 2 (a nested stacked model)
    sub2_in = Input(name="x2")
    sub2_target = Input(name="y2_t")
    forest_out = RandomForestClassifier(
        random_state=random_state, name="rforest_sub2"
    )(sub2_in, sub2_target)
    extra_trees_out = ExtraTreesClassifier(
        random_state=random_state, name="extrees_sub2"
    )(sub2_in, sub2_target)
    sub2_features = Stack(axis=1, name="stack_sub2")([forest_out, extra_trees_out])
    sub2_out = LogisticRegression(**logreg_params, name="logreg_sub2")(
        sub2_features, sub2_target
    )
    submodel2 = Model(sub2_in, sub2_out, sub2_target, name="submodel2")

    # ------- Stack of submodels
    top_in = Input(name="x")
    top_target = Input(name="y_t")
    branch1 = submodel1(top_in, top_target)
    branch2 = submodel2(top_in, top_target)
    top_features = Stack(axis=1, name="stack")([branch1, branch2])
    top_out = LogisticRegression(**logreg_params, name="logreg_stacked")(
        top_features, top_target
    )

    stacked_model_baikal = Model(top_in, top_out, top_target, name="stacked")
    stacked_model_baikal.fit(x_data, y_t_data)
    return stacked_model_baikal
def test_fit_predict_standard_stack(teardown):
    """Compare a baikal stack against sklearn's StackingClassifier.

    This uses the "standard" protocol where the 2nd level features
    are the out-of-fold predictions of the 1st. It also appends the
    original data to the 2nd level features.
    See for example: https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html
    """
    seed = 42
    X_train, X_test, y_t_train, _y_t_test = train_test_split(
        breast_cancer.data, breast_cancer.target, test_size=0.2, random_state=0
    )

    # ------- baikal way
    x = Input()
    y_t = Input()

    forest_proba = RandomForestClassifierOOF(n_estimators=10, random_state=seed)(
        x, y_t, compute_func="predict_proba"
    )
    # remove collinear feature (drop the first probability column)
    forest_feature = Lambda(lambda array: array[:, 1:])(forest_proba)

    x_scaled = StandardScaler()(x)
    svc_feature = LinearSVCOOF(random_state=seed)(
        x_scaled, y_t, compute_func="decision_function"
    )

    # Original data plus both first-level outputs feed the final classifier.
    stacked_features = ColumnStack()([x, forest_feature, svc_feature])
    y_p = LogisticRegression(solver="liblinear", random_state=seed)(
        stacked_features, y_t
    )

    model = Model(x, y_p, y_t)
    model.fit(X_train, y_t_train)
    y_pred_baikal = model.predict(X_test)

    # ------- traditional way
    forest = RandomForestClassifier(n_estimators=10, random_state=seed)
    scaled_svc = make_pipeline(StandardScaler(), LinearSVC(random_state=seed))
    final_estimator = LogisticRegression(solver="liblinear", random_state=seed)
    clf = sklearn.ensemble.StackingClassifier(
        estimators=[("rf", forest), ("svr", scaled_svc)],
        final_estimator=final_estimator,
        passthrough=True,
    )
    clf.fit(X_train, y_t_train)
    y_pred_traditional = clf.predict(X_test)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_fit_params_unhashable_step():
    """A model must be fittable even when one of its steps is unhashable."""

    class UnhashableStep(Step, sklearn.linear_model.LogisticRegression):
        # Defining __eq__ without __hash__ makes instances unhashable,
        # which is the only purpose of this override.
        def __eq__(self, other):
            return None

    # Reduce to binary problem to avoid ConvergenceWarning
    binary_rows = iris.target != 2
    samples = iris.data[binary_rows]
    labels = iris.target[binary_rows]

    features = Input()
    target = Input()
    prediction = UnhashableStep()(features, target)
    model = Model(features, prediction, target)

    model.fit(samples, labels)
def test_fit_and_predict_model_with_no_fittable_steps(teardown):
    """fit and predict must work on a model that has nothing to train."""
    first_data = np.array([[1, 2], [3, 4]])
    second_data = np.array([[5, 6], [7, 8]])
    expected = np.array([[12, 16], [20, 24]])

    # Two inputs merged by a multi-input step, then passed through a
    # single-input step (both dummy steps are defined outside this block).
    first_in = Input()
    second_in = Input()
    merged = DummyMISO()([first_in, second_in])
    output = DummySISO()(merged)
    model = Model([first_in, second_in], output)

    batch = [first_data, second_data]
    model.fit(batch)  # nothing to fit
    predicted = model.predict(batch)

    assert_array_equal(predicted, expected)
def test_fit_predict_ensemble_with_proba_features(teardown):
    """Ensemble on probability and leaf-index features matches plain sklearn.

    The first level uses the logistic regression's ``predict_proba`` output
    and the random forest's ``apply`` output; both are concatenated and fed
    to a second logistic regression.
    """
    seed = 123
    n_trees = 5

    # Reduce to binary problem to avoid ConvergenceWarning
    binary_rows = iris.target != 2
    samples = iris.data[binary_rows]
    labels = iris.target[binary_rows]

    # ------- baikal way
    x = Input()
    y_t = Input()
    proba_node = LogisticRegression(random_state=seed, function="predict_proba")(
        x, y_t
    )
    leaf_node = RandomForestClassifier(
        n_estimators=n_trees, random_state=seed, function="apply"
    )(x, y_t)
    joined = Concatenate(axis=1)([proba_node, leaf_node])
    y = LogisticRegression(random_state=seed)(joined, y_t)

    model = Model(x, y, y_t)
    model.fit(samples, labels)
    y_pred_baikal = model.predict(samples)

    # ------- traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=seed)
    logreg.fit(samples, labels)
    logreg_proba = logreg.predict_proba(samples)

    forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_trees, random_state=seed
    )
    forest.fit(samples, labels)
    forest_leaf_idx = forest.apply(samples)

    second_level = np.concatenate([logreg_proba, forest_leaf_idx], axis=1)

    ensemble = sklearn.linear_model.LogisticRegression(random_state=seed)
    ensemble.fit(second_level, labels)
    y_pred_traditional = ensemble.predict(second_level)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
# Split the target node into one node per target column.
ys_t = Split(n_targets, axis=1)(y_t)

# Build the chain: the classifier at position j in `order` receives the
# inputs plus the predictions of the classifiers that precede it.
ys_p = []
for j, k in enumerate(order):
    x_stacked = ColumnStack()(inputs=[x, *ys_p[:j]])
    ys_t[k] = squeeze(ys_t[k])
    ys_p.append(LogisticRegression(solver="lbfgs")(x_stacked, ys_t[k]))

# Put the predictions back in target-column order before stacking them.
ys_p = [ys_p[order.index(target_idx)] for target_idx in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(inputs=x, outputs=y_p, targets=y_t)

# This might take a few seconds
plot_model(model, filename="classifier_chain.png", dpi=96)

# ------- Train model
model.fit(X_train, Y_train)

# ------- Evaluate model
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

print(
    "Jaccard score on train data:",
    jaccard_score(Y_train, Y_train_pred, average="samples"),
)
print(
    "Jaccard score on test data:",
    jaccard_score(Y_test, Y_test_pred, average="samples"),
)
y3 = LogisticRegression()(z, y_t)

# The three first-level outputs are stacked and fed to the final classifier.
stacked_features = Stack()([y1, y2, y3])
y_p = SVC()(stacked_features, y_t)

model = Model([x1, x2], y_p, y_t)
plot_model(
    model,
    filename="multiple_input_nonlinear_pipeline_example_plot.png",
)

# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=0
)

# Let's suppose the dataset is originally split in two
X1_train = X_train[:, :15]
X2_train = X_train[:, 15:]
X1_test = X_test[:, :15]
X2_test = X_test[:, 15:]

model.fit([X1_train, X2_train], y_train)

# 4. Use the model
y_test_pred = model.predict([X1_test, X2_test])

# This also works:
# y_test_pred = model.predict({x1: X1_test, x2: X2_test})

# We can also query any intermediate outputs:
outs = model.predict(
    [X1_test, X2_test],
    output_names=["ExtraTreesClassifier_0:0/0", "PCA_0:0/0"],
)
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model

# 1. Define a step
# make_step wraps the scikit-learn class so it can be used as a baikal step.
SVC = make_step(sklearn.svm.SVC)

# 2. Build the model
x = Input()
y_t = Input()
y_p = SVC(C=1.0, kernel="rbf", gamma=0.5)(x, y_t)
model = Model(x, y_p, y_t)
plot_model(model, filename="readme_quick_example.png")

# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    random_state=0,
)
model.fit(X_train, y_train)

# 4. Use the model
y_test_pred = model.predict(X_test)