Example #1
0
def test_transformed_target(teardown):
    """Fit a regressor on a log-transformed target and check the learned slope.

    The target data is ``exp(2 * x)``. It goes through a ``np.log`` Lambda
    step before reaching the ``LinearRegression`` step, so that step is
    fitted on (approximately) ``2 * x`` and its single coefficient should
    be 2. The model output is mapped back with a ``np.exp`` Lambda step.

    ``teardown`` is a fixture supplied by the test suite (defined outside
    this function).
    """
    x = Input()
    y_t = Input()
    # Transform the target before it is handed to the regressor ...
    y_t_mod = Lambda(lambda y: np.log(y))(y_t)
    y_p_mod = LinearRegression()(x, y_t_mod)
    # ... and undo the transformation on the regressor's output.
    y_p = Lambda(lambda y: np.exp(y))(y_p_mod)
    model = Model(x, y_p, y_t)

    x_data = np.arange(4).reshape(-1, 1)
    y_t_data = np.exp(2 * x_data).ravel()
    model.fit(x_data, y_t_data)

    # The coefficient comes out of a log(exp(.)) round trip followed by a
    # least-squares solve, so compare with a floating-point tolerance rather
    # than bit-for-bit equality.
    np.testing.assert_allclose(
        model.get_step("LinearRegression_0").coef_, np.array([2.0])
    )
Example #2
0
 def test_predict_with_shared_step(self, teardown):
     """Check predictions of a model whose two branches call the same step.

     One ``Lambda`` instance that doubles its argument is called on two
     separate inputs; predicting on ``[2, 3]`` must give ``[4, 6]``.
     """
     inputs = [Input(), Input()]
     double = Lambda(lambda value: value * 2)
     # The same step instance is called once per input.
     outputs = [double(node) for node in inputs]
     model = Model(inputs, outputs)
     predictions = model.predict([2, 3])
     assert predictions == [4, 6]
Example #3
0
def test_fit_predict_naive_stack_with_proba_features(teardown):
    """Check that a naive stack built as a graph matches the hand-rolled one.

    Two first-level estimators (a logistic regression read through
    ``predict_proba`` and a random forest read through ``apply``) feed a
    second-level logistic regression after the first column of each output
    is dropped. The same computation is then repeated with plain estimator
    calls, and both prediction arrays must be equal.

    ``teardown`` is a fixture supplied by the test suite (defined outside
    this function).
    """
    seed = 123
    n_trees = 5
    # Reduce to binary problem to avoid ConvergenceWarning
    binary_rows = iris.target != 2
    samples = iris.data[binary_rows]
    labels = iris.target[binary_rows]

    def without_first_column(array):
        return array[:, 1:]

    # --- graph-based version
    x = Input()
    y_t = Input()
    proba_step = LogisticRegression(random_state=seed)
    proba = proba_step(x, y_t, compute_func="predict_proba")
    forest_step = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
    leaves = forest_step(x, y_t, compute_func="apply")
    proba = Lambda(compute_func=without_first_column)(proba)
    leaves = Lambda(compute_func=without_first_column)(leaves)
    stacked_inputs = Concatenate(axis=1)([proba, leaves])
    y_p = LogisticRegression(random_state=seed)(stacked_inputs, y_t)

    model = Model(x, y_p, y_t)
    model.fit(samples, labels)
    graph_predictions = model.predict(samples)

    # --- same computation with plain estimator calls
    first_level_logreg = LogisticRegression(random_state=seed)
    first_level_logreg.fit(samples, labels)
    first_level_proba = first_level_logreg.predict_proba(samples)

    first_level_forest = RandomForestClassifier(
        n_estimators=n_trees, random_state=seed
    )
    first_level_forest.fit(samples, labels)
    first_level_leaves = first_level_forest.apply(samples)

    second_level_inputs = np.concatenate(
        (
            without_first_column(first_level_proba),
            without_first_column(first_level_leaves),
        ),
        axis=1,
    )
    second_level = LogisticRegression(random_state=seed)
    second_level.fit(second_level_inputs, labels)
    reference_predictions = second_level.predict(second_level_inputs)

    assert_array_equal(graph_predictions, reference_predictions)
Example #4
0
def test_fit_predict_standard_stack(teardown):
    """Check a "standard" stack against ``sklearn.ensemble.StackingClassifier``.

    In the "standard" protocol the 2nd level features are the out-of-fold
    predictions of the 1st level. The original data is also appended to the
    2nd level features.
    See for example: https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html

    ``teardown`` is a fixture supplied by the test suite (defined outside
    this function).
    """
    seed = 42
    X_train, X_test, y_t_train, _unused_test_labels = train_test_split(
        breast_cancer.data, breast_cancer.target, test_size=0.2, random_state=0
    )

    # --- graph-based version
    x = Input()
    y_t = Input()

    forest_oof = RandomForestClassifierOOF(n_estimators=10, random_state=seed)
    forest_proba = forest_oof(x, y_t, compute_func="predict_proba")
    # remove collinear feature
    forest_proba = Lambda(lambda array: array[:, 1:])(forest_proba)

    scaled = StandardScaler()(x)
    svc_oof = LinearSVCOOF(random_state=seed)
    svc_scores = svc_oof(scaled, y_t, compute_func="decision_function")

    second_level_inputs = ColumnStack()([x, forest_proba, svc_scores])
    meta_step = LogisticRegression(solver="liblinear", random_state=seed)
    y_p = meta_step(second_level_inputs, y_t)

    model = Model(x, y_p, y_t)
    model.fit(X_train, y_t_train)
    graph_predictions = model.predict(X_test)

    # --- reference version
    forest = RandomForestClassifier(n_estimators=10, random_state=seed)
    svc_pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=seed))
    reference = sklearn.ensemble.StackingClassifier(
        estimators=[("rf", forest), ("svr", svc_pipeline)],
        final_estimator=LogisticRegression(solver="liblinear", random_state=seed),
        passthrough=True,
    )
    reference.fit(X_train, y_t_train)
    reference_predictions = reference.predict(X_test)

    assert_array_equal(graph_predictions, reference_predictions)
Example #5
0
def test_lambda(teardown):
    """Check a two-output ``Lambda`` step built from a function with keyword params.

    The wrapped function receives the same input twice plus ``p1=2`` and
    ``p2=2``; its first output is the input times ``p1`` and its second is
    the input divided by ``p2``.

    ``teardown`` is a fixture supplied by the test suite (defined outside
    this function).
    """

    def scale_and_divide(x1, x2, p1, p2=1):
        scaled = p1 * x1
        divided = x2 / p2
        return scaled, divided

    x = Input()
    step = Lambda(scale_and_divide, n_outputs=2, p1=2, p2=2)
    doubled, halved = step([x, x])
    model = Model(x, [doubled, halved])

    samples = np.array([[1.0, 2.0], [3.0, 4.0]])
    doubled_pred, halved_pred = model.predict(samples)

    assert_array_equal(doubled_pred, np.array([[2.0, 4.0], [6.0, 8.0]]))
    assert_array_equal(halved_pred, np.array([[0.5, 1.0], [1.5, 2.0]]))
Example #6
0
# Compare the labels against the string "TRUE" to obtain boolean targets
# (assumes Y is an array of label strings loaded earlier in the script — TODO confirm).
Y = Y == "TRUE"
# Hold out 20% of the samples; random_state is fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# One classifier is built per column of Y.
n_targets = Y.shape[1]
# Seed the shuffle so the chain order is the same on every run.
random.seed(87)
order = list(range(n_targets))
random.shuffle(order)

# ------- Build model
x = Input()
y_t = Input()

# A single squeeze step instance, called once per target column in the loop below.
squeeze = Lambda(np.squeeze, axis=1)

ys_t = Split(n_targets, axis=1)(y_t)
ys_p = []
# j is the position in the chain, k the target column handled at that position.
for j, k in enumerate(order):
    # ys_p holds exactly j outputs at this point, so the j-th classifier sees
    # x stacked with the outputs of every classifier created before it.
    x_stacked = ColumnStack()(inputs=[x, *ys_p[:j]])
    ys_t[k] = squeeze(ys_t[k])
    ys_p.append(LogisticRegression(solver="lbfgs")(x_stacked, ys_t[k]))

# ys_p is in chain order; order.index(j) is the chain position that handled
# target j, so this puts the outputs back in the original column order.
ys_p = [ys_p[order.index(j)] for j in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(inputs=x, outputs=y_p, targets=y_t)
# This might take a few seconds
plot_model(model, filename="classifier_chain.png", dpi=96)
                                                    y_p,
                                                    test_size=0.2,
                                                    random_state=0)

# ------- Build model
# The model is built similarly to the naive case. The difference is that during fit
# baikal will detect and use the fit_predict method above.
# NOTE(review): the fit_predict definition referred to is outside this excerpt —
# confirm that the estimator classes used below are the ones that define it.
x = Input()
y_t = Input()
# Both first-level classifiers expose predict_proba as their compute function.
y_p1 = LogisticRegression(solver="liblinear",
                          random_state=0)(x, y_t, compute_func="predict_proba")
y_p2 = RandomForestClassifier(random_state=0)(x,
                                              y_t,
                                              compute_func="predict_proba")
# predict_proba returns arrays whose rows sum to one, so one column is redundant
# and we drop the first one. The same Lambda step instance is called on both outputs.
drop_first_col = Lambda(lambda array: array[:, 1:])
y_p1 = drop_first_col(y_p1)
y_p2 = drop_first_col(y_p2)
# The second-level classifier is fed only the stacked first-level outputs.
stacked_features = ColumnStack()([y_p1, y_p2])
y_p = ExtraTreesClassifier(random_state=0)(stacked_features, y_t)

model = Model(x, y_p, y_t)
plot_model(model, filename="stacked_classifiers_standard.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
# Predictions on both splits, for comparing train and test performance.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
Example #8
0
# ------- Load dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm which scikit-learn version this example is meant to run against.
dataset = load_boston()
# Boolean mask selecting the "DIS" feature column, which is used as the regression target.
target = np.array(dataset.feature_names) == "DIS"
# Every other feature column is a predictor.
X = dataset.data[:, np.logical_not(target)]
y = dataset.data[:, target].squeeze()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# ------- Build model
# This one transformer instance is called twice below: once on the target and
# once, through inverse_transform, on the regressor's output.
transformer = QuantileTransformer(n_quantiles=300,
                                  output_distribution="normal")

x = Input()
y_t = Input()
# QuantileTransformer requires an explicit feature dimension, hence the Lambda step
# NOTE(review): recent NumPy releases deprecate np.reshape's `newshape` keyword in
# favour of `shape` — confirm against the NumPy version in use.
y_t_trans = Lambda(np.reshape, newshape=(-1, 1))(y_t)
y_t_trans = transformer(y_t_trans)
# The regressor is given the transformed target.
y_p_trans = RidgeCV()(x, y_t_trans)
# Map the output back with the same transformer; trainable=False presumably keeps
# this second call from refitting it — confirm against baikal's Step documentation.
y_p = transformer(y_p_trans, compute_func="inverse_transform", trainable=False)

model = Model(x, y_p, y_t)
plot_model(model, filename="transformed_target.png", dpi=96)

# ------- Train model
model.fit(X_train, y_train)

# ------- Evaluate model
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
# Note: despite the short name, this is the *median* absolute error.
mae = median_absolute_error(y_test, y_pred)
Example #9
0
                                                    random_state=0)

# One classifier is built per column of Y.
n_targets = Y.shape[1]
# Seed the shuffle so the chain order is the same on every run.
random.seed(87)
order = list(range(n_targets))
random.shuffle(order)

# ------- Build model
x = Input()
y_t = Input()

ys_t = Split(n_targets, axis=1)(y_t)
ys_p = []
# j is the position in the chain, k the target column handled at that position.
for j, k in enumerate(order):
    # ys_p holds exactly j outputs at this point, so the j-th classifier sees
    # x stacked with the outputs of every classifier created before it.
    x_stacked = ColumnStack()(inputs=[x, *ys_p[:j]])
    # A new squeeze step is created for each target column.
    ys_t[k] = Lambda(np.squeeze, axis=1)(ys_t[k])
    ys_p.append(LogisticRegression(solver="lbfgs")(x_stacked, ys_t[k]))

# ys_p is in chain order; order.index(j) is the chain position that handled
# target j, so this puts the outputs back in the original column order.
ys_p = [ys_p[order.index(j)] for j in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(inputs=x, outputs=y_p, targets=y_t)
plot_model(model, filename="classifier_chain.png",
           dpi=96)  # This might take a few seconds

# ------- Train model
model.fit(X_train, Y_train)

# ------- Evaluate model
# Predictions on both splits, for comparing train and test performance.
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)