def test_transformed_target(teardown):
    x = Input()
    y_t = Input()
    y_t_mod = Lambda(lambda y: np.log(y))(y_t)
    y_p_mod = LinearRegression()(x, y_t_mod)
    y_p = Lambda(lambda y: np.exp(y))(y_p_mod)
    model = Model(x, y_p, y_t)

    x_data = np.arange(4).reshape(-1, 1)
    y_t_data = np.exp(2 * x_data).ravel()
    model.fit(x_data, y_t_data)

    assert_array_equal(
        model.get_step("LinearRegression_0").coef_, np.array([2.0])
    )
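# Note on the test above: since y = exp(2x), fitting in log space reduces to an
# ordinary linear regression on log(y) = 2x, which is why the recovered
# coefficient is exactly 2.0; the exp Lambda then maps predictions back to the
# original target space.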
def test_predict_with_shared_step(teardown):
    x1 = Input()
    x2 = Input()
    doubler = Lambda(lambda x: x * 2)
    y1 = doubler(x1)
    y2 = doubler(x2)
    model = Model([x1, x2], [y1, y2])
    assert model.predict([2, 3]) == [4, 6]
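# Contrast sketch (not from the test suite): without sharing, each Lambda call
# creates an independent step, so the graph would contain two separate doubler
# steps even though they compute the same thing. The auto-generated step names
# are assumed here, mirroring the "LinearRegression_0" naming seen above.
def _sketch_unshared_steps():
    x1, x2 = Input(), Input()
    y1 = Lambda(lambda x: x * 2)(x1)  # first, independent step (e.g. "Lambda_0")
    y2 = Lambda(lambda x: x * 2)(x2)  # second, independent step (e.g. "Lambda_1")
    return Model([x1, x2], [y1, y2])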
def test_fit_predict_naive_stack_with_proba_features(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123
    n_estimators = 5

    # baikal way
    x = Input()
    y_t = Input()
    y_p1 = LogisticRegression(random_state=random_state)(
        x, y_t, compute_func="predict_proba"
    )
    y_p2 = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)(
        x, y_t, compute_func="apply"  # apply returns the index of the leaf each sample lands in
    )
    # drop the first column of each feature block (predict_proba columns sum to one)
    y_p1 = Lambda(compute_func=lambda array: array[:, 1:])(y_p1)
    y_p2 = Lambda(compute_func=lambda array: array[:, 1:])(y_p2)
    features = Concatenate(axis=1)([y_p1, y_p2])
    y_p = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y_p, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    features = np.concatenate(
        [logreg_proba[:, 1:], random_forest_leafidx[:, 1:]], axis=1
    )
    stacked = LogisticRegression(random_state=random_state)
    stacked.fit(features, y_t_data)
    y_pred_traditional = stacked.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
def test_fit_predict_standard_stack(teardown):
    # This uses the "standard" protocol where the 2nd level features
    # are the out-of-fold predictions of the 1st level. It also appends the
    # original data to the 2nd level features.
    # See for example: https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html
    X_data, y_t_data = breast_cancer.data, breast_cancer.target
    X_train, X_test, y_t_train, y_t_test = train_test_split(
        X_data, y_t_data, test_size=0.2, random_state=0
    )
    random_state = 42

    # baikal way
    x = Input()
    y_t = Input()

    y_p1 = RandomForestClassifierOOF(n_estimators=10, random_state=random_state)(
        x, y_t, compute_func="predict_proba"
    )
    y_p1 = Lambda(lambda array: array[:, 1:])(y_p1)  # remove collinear feature

    x_scaled = StandardScaler()(x)
    y_p2 = LinearSVCOOF(random_state=random_state)(
        x_scaled, y_t, compute_func="decision_function"
    )

    stacked_features = ColumnStack()([x, y_p1, y_p2])
    y_p = LogisticRegression(solver="liblinear", random_state=random_state)(
        stacked_features, y_t
    )

    model = Model(x, y_p, y_t)
    model.fit(X_train, y_t_train)
    y_pred_baikal = model.predict(X_test)

    # traditional way
    estimators = [
        ("rf", RandomForestClassifier(n_estimators=10, random_state=random_state)),
        ("svr", make_pipeline(StandardScaler(), LinearSVC(random_state=random_state))),
    ]
    clf = sklearn.ensemble.StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(
            solver="liblinear", random_state=random_state
        ),
        passthrough=True,
    )
    y_pred_traditional = clf.fit(X_train, y_t_train).predict(X_test)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
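# RandomForestClassifierOOF and LinearSVCOOF are defined elsewhere in the suite.
# A minimal sketch of what such a step could look like, assuming baikal picks up
# a fit_predict method to produce out-of-fold features (see the comment in the
# stacking example below); the real definitions may differ:
import sklearn.ensemble
from baikal import make_step
from sklearn.model_selection import cross_val_predict

class _RandomForestOOF(sklearn.ensemble.RandomForestClassifier):
    def fit_predict(self, X, y):
        # Out-of-fold probabilities become the 2nd-level training features,
        # while the step itself is still fit on the full training data.
        fresh = sklearn.ensemble.RandomForestClassifier(**self.get_params())
        oof = cross_val_predict(fresh, X, y, method="predict_proba")
        self.fit(X, y)
        return oof

RandomForestClassifierOOFSketch = make_step(_RandomForestOOF)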
def test_lambda(teardown):
    def function(x1, x2, p1, p2=1):
        return p1 * x1, x2 / p2

    x = Input()
    # extra keyword arguments (p1=2, p2=2) are forwarded to `function`
    y1, y2 = Lambda(function, n_outputs=2, p1=2, p2=2)([x, x])
    model = Model(x, [y1, y2])

    x_data = np.array([[1.0, 2.0], [3.0, 4.0]])
    y1_expected = np.array([[2.0, 4.0], [6.0, 8.0]])
    y2_expected = np.array([[0.5, 1.0], [1.5, 2.0]])

    y1_pred, y2_pred = model.predict(x_data)
    assert_array_equal(y1_pred, y1_expected)
    assert_array_equal(y2_pred, y2_expected)
Y = Y == "TRUE"
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

n_targets = Y.shape[1]
random.seed(87)
order = list(range(n_targets))
random.shuffle(order)

# ------- Build model
x = Input()
y_t = Input()

squeeze = Lambda(np.squeeze, axis=1)

ys_t = Split(n_targets, axis=1)(y_t)
ys_p = []
for j, k in enumerate(order):
    x_stacked = ColumnStack()(inputs=[x, *ys_p[:j]])
    ys_t[k] = squeeze(ys_t[k])
    ys_p.append(LogisticRegression(solver="lbfgs")(x_stacked, ys_t[k]))

# restore the original target order before stacking the outputs
ys_p = [ys_p[order.index(j)] for j in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(inputs=x, outputs=y_p, targets=y_t)

# This might take a few seconds
plot_model(model, filename="classifier_chain.png", dpi=96)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_p, test_size=0.2, random_state=0
)

# ------- Build model
# The model is built similarly to the naive case. The difference is that during
# fit, baikal will detect and use the fit_predict method defined above.
x = Input()
y_t = Input()

y_p1 = LogisticRegression(solver="liblinear", random_state=0)(
    x, y_t, compute_func="predict_proba"
)
y_p2 = RandomForestClassifier(random_state=0)(x, y_t, compute_func="predict_proba")

# predict_proba returns arrays whose columns sum to one, so we drop one column
drop_first_col = Lambda(lambda array: array[:, 1:])
y_p1 = drop_first_col(y_p1)
y_p2 = drop_first_col(y_p2)

stacked_features = ColumnStack()([y_p1, y_p2])
y_p = ExtraTreesClassifier(random_state=0)(stacked_features, y_t)

model = Model(x, y_p, y_t)
plot_model(model, filename="stacked_classifiers_standard.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
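# Reporting sketch (assumed; the original excerpt stops at the predictions):
from sklearn.metrics import accuracy_score
print("Train accuracy:", accuracy_score(y_train, y_train_pred))
print("Test accuracy:", accuracy_score(y_test, y_test_pred))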
# ------- Load dataset
dataset = load_boston()
target = np.array(dataset.feature_names) == "DIS"
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# ------- Build model
transformer = QuantileTransformer(n_quantiles=300, output_distribution="normal")

x = Input()
y_t = Input()
# QuantileTransformer requires an explicit feature dimension, hence the Lambda step
y_t_trans = Lambda(np.reshape, newshape=(-1, 1))(y_t)
y_t_trans = transformer(y_t_trans)
y_p_trans = RidgeCV()(x, y_t_trans)
y_p = transformer(y_p_trans, compute_func="inverse_transform", trainable=False)

model = Model(x, y_p, y_t)
plot_model(model, filename="transformed_target.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = median_absolute_error(y_test, y_pred)
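# Reporting sketch (assumed output format; the excerpt ends at the metrics):
print("R^2 score on test data: {:.3f}".format(r2))
print("Median absolute error on test data: {:.3f}".format(mae))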
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

n_targets = Y.shape[1]
random.seed(87)
order = list(range(n_targets))
random.shuffle(order)

# ------- Build model
x = Input()
y_t = Input()

ys_t = Split(n_targets, axis=1)(y_t)
ys_p = []
for j, k in enumerate(order):
    x_stacked = ColumnStack()(inputs=[x, *ys_p[:j]])
    ys_t[k] = Lambda(np.squeeze, axis=1)(ys_t[k])
    ys_p.append(LogisticRegression(solver="lbfgs")(x_stacked, ys_t[k]))

# restore the original target order before stacking the outputs
ys_p = [ys_p[order.index(j)] for j in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(inputs=x, outputs=y_p, targets=y_t)
plot_model(model, filename="classifier_chain.png", dpi=96)  # This might take a few seconds

# ------- Train model
model.fit(X_train, Y_train)

# ------- Evaluate model
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
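# Evaluation sketch (assumed; not in the original script): sample-wise Jaccard
# similarity is one common score for multi-label predictions like these.
from sklearn.metrics import jaccard_score
print("Train Jaccard score:", jaccard_score(Y_train, Y_train_pred, average="samples"))
print("Test Jaccard score:", jaccard_score(Y_test, Y_test_pred, average="samples"))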