コード例 #1
0
    def test_with_improperly_defined_step(self, teardown):
        """Predicting through a ``DummyImproperlyDefined`` step must raise RuntimeError."""
        inp = Input()
        out = DummyImproperlyDefined()(inp)
        broken_model = Model(inp, out)

        with pytest.raises(RuntimeError):
            broken_model.predict(iris.data)
コード例 #2
0
def test_nested_model(teardown):
    """Check that a model used as a step predicts the same as the model wrapping it.

    The inner model must fail to predict before any fitting, and must agree
    with the outer model once the outer model has been fitted.
    """
    features, labels = iris.data, iris.target

    # Inner model: PCA followed by a logistic regression
    inner_x = Input()
    inner_y_t = Input()
    reduced = PCA(n_components=2)(inner_x)
    inner_y = LogisticRegression()(reduced, inner_y_t)
    submodel = Model(inner_x, inner_y, inner_y_t)

    # Outer model: its only step is the inner model
    outer_x = Input()
    outer_y_t = Input()
    outer_y = submodel(outer_x, outer_y_t)
    model = Model(outer_x, outer_y, outer_y_t)

    # Nothing has been fitted yet, so the inner model cannot predict
    with raises_with_cause(RuntimeError, NotFittedError):
        submodel.predict(features)

    model.fit(features, labels)
    outer_pred = model.predict(features)
    inner_pred = submodel.predict(features)

    assert_array_equal(outer_pred, inner_pred)
コード例 #3
0
def test_single_input(step_class, teardown):
    """Check the output of ``step_class`` when it is applied to a single input.

    For ``Stack`` the expected output is the input reshaped to ``(2, 2, 1)``;
    for any other step class the input is expected back unchanged.
    """
    inp = Input()
    out = step_class()(inp)
    model = Model(inp, out)

    x_data = np.array([[1, 2], [3, 4]])
    expected = x_data.reshape((2, 2, 1)) if step_class is Stack else x_data
    assert_array_equal(expected, model.predict(x_data))
コード例 #4
0
    def test_predict_with_not_fitted_steps(self, teardown):
        """Predicting with unfitted steps raises RuntimeError caused by NotFittedError."""
        features = iris.data

        inp = Input(name="x")
        reduced = PCA(n_components=2)(inp)
        out = LogisticRegression(multi_class="multinomial", solver="lbfgs")(reduced)

        unfitted_model = Model(inp, out)
        with raises_with_cause(RuntimeError, NotFittedError):
            unfitted_model.predict(features)
コード例 #5
0
ファイル: test_model.py プロジェクト: vitalyvels/baikal
def test_fit_predict_ensemble(teardown):
    """Check that a baikal ensemble predicts the same as its hand-built equivalent.

    A logistic regression and a random forest are trained on the same data,
    their predictions are stacked along axis 1, and a second logistic
    regression is trained on the stacked predictions. The same pipeline is
    built once with baikal steps and once with plain scikit-learn estimators,
    and both must yield identical predictions.
    """
    # Reduce to binary problem to avoid ConvergenceWarning.
    # The mask must be applied to both the data and the targets; previously it
    # was computed but never used, so the full 3-class problem was fitted.
    mask = iris.target != 2
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    features = np.stack([logreg_pred, random_forest_pred], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
コード例 #6
0
def test_fit_predict_naive_stack(teardown):
    """Check that a naive stack built with baikal matches the step-by-step version.

    First-level predictions of a logistic regression and a random forest are
    stacked along axis 1 and fed to a second logistic regression. The model
    graph and the manual fit/predict sequence must give equal predictions.
    """
    x_data, y_t_data = iris.data, iris.target
    seed = 123
    logreg_params = {"random_state": seed, "solver": "liblinear"}

    # baikal way
    x = Input()
    y_t = Input()
    logreg_out = LogisticRegression(**logreg_params)(x, y_t)
    forest_out = RandomForestClassifier(random_state=seed)(x, y_t)
    stacked_out = Stack(axis=1)([logreg_out, forest_out])
    y = LogisticRegression(**logreg_params)(stacked_out, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = LogisticRegression(**logreg_params)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = RandomForestClassifier(random_state=seed)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    second_level = np.stack([logreg_pred, random_forest_pred], axis=1)
    stacked = LogisticRegression(**logreg_params)
    stacked.fit(second_level, y_t_data)
    y_pred_traditional = stacked.predict(second_level)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
コード例 #7
0
 def test_predict_with_shared_step(self, teardown):
     """A single doubling ``Lambda`` step applied to two inputs doubles each of them."""
     first_in = Input()
     second_in = Input()
     doubler = Lambda(lambda x: x * 2)
     first_out = doubler(first_in)
     second_out = doubler(second_in)
     model = Model([first_in, second_in], [first_out, second_out])
     predictions = model.predict([2, 3])
     assert predictions == [4, 6]
コード例 #8
0
def test_lazy_model(teardown):
    """A model whose output is its own input returns the data it was given."""
    samples = np.array([[1, 2], [3, 4]])

    passthrough = Input()
    model = Model(passthrough, passthrough)
    model.fit(samples)  # nothing to fit

    assert_array_equal(model.predict(samples), samples)
コード例 #9
0
def test_try_and_raise_with_cause(teardown):
    """Errors raised inside a step surface as RuntimeError with the original cause.

    Each case builds a one-step model around a faulty dummy step, calls the
    listed model method with ``123`` and expects a RuntimeError whose cause is
    the listed exception type.
    """
    cases = [
        (DummyStepWithFaultyPredict, "predict", KeyError),
        (DummyStepWithFaultyFit, "fit", ValueError),
        (DummyStepWithFaultyFitPredict, "fit", ValueError),
    ]

    for step_class, method_name, expected_cause in cases:
        x = Input()
        y = step_class(name="faultystep")(x)
        model = Model(x, y)
        with raises_with_cause(RuntimeError, expected_cause):
            getattr(model, method_name)(123)
コード例 #10
0
def test_multiedge(teardown):
    """Both outputs of a ``DummySIMO`` step feed one ``DummyMISO`` step.

    For the column ``[[1], [2]]`` the model is expected to predict
    ``[[2], [4]]``.
    """
    inp = Input()
    branch_a, branch_b = DummySIMO()(inp)
    merged = DummyMISO()([branch_a, branch_b])
    model = Model(inp, merged)

    column = np.array([[1], [2]])
    predicted = model.predict(column)

    expected = np.array([[2], [4]])
    assert_array_equal(predicted, expected)
コード例 #11
0
def test_fit_predict_with_shared_step(teardown):
    """Applying ``transform`` then ``inverse_transform`` of one scaler restores the data.

    The same ``StandardScaler`` step is called twice: trainable with
    ``transform`` and non-trainable with ``inverse_transform``.
    """
    inp = Input()
    shared_scaler = StandardScaler()
    scaled = shared_scaler(inp, compute_func="transform", trainable=True)
    restored = shared_scaler(scaled, compute_func="inverse_transform", trainable=False)
    model = Model(inp, restored)

    column = np.array([1, 3, 1, 3]).reshape(-1, 1)
    model.fit(column)
    round_trip = model.predict(column)
    assert_array_equal(round_trip, column)
コード例 #12
0
def test_split(x, indices_or_sections, teardown):
    """The ``Split`` step must produce the same pieces as ``np.split`` along axis 0."""
    inp = Input()
    pieces = Split(indices_or_sections, axis=0)(inp)
    model = Model(inp, pieces)

    expected_pieces = np.split(x, indices_or_sections, axis=0)
    predicted_pieces = listify(model.predict(x))

    # Compare piece by piece
    for actual, expected in safezip2(predicted_pieces, expected_pieces):
        assert_array_equal(actual, expected)
コード例 #13
0
def test_concatenate(teardown):
    """The ``Concatenate`` step must match ``np.concatenate`` along axis 1."""
    left_in = Input()
    right_in = Input()
    joined = Concatenate(axis=1)([left_in, right_in])
    model = Model([left_in, right_in], joined)

    left = np.array([[1, 2], [10, 20]])
    right = np.array([[3, 4, 5], [30, 40, 50]])
    arrays = [left, right]

    expected = np.concatenate(arrays, axis=1)
    predicted = model.predict(arrays)

    assert_array_equal(predicted, expected)
コード例 #14
0
def test_fit_and_predict_model_with_no_fittable_steps(teardown):
    """Fitting a model made only of a ``DummySISO`` step still allows prediction.

    For ``[[1, 2], [3, 4]]`` the model is expected to predict
    ``[[2, 4], [6, 8]]``.
    """
    samples = np.array([[1, 2], [3, 4]])
    expected = np.array([[2, 4], [6, 8]])

    inp = Input()
    out = DummySISO()(inp)
    model = Model(inp, out)

    model.fit(samples)  # nothing to fit
    predicted = model.predict(samples)

    assert_array_equal(predicted, expected)
コード例 #15
0
def test_stack(teardown):
    """The ``Stack`` step must match ``np.stack`` along axis 1."""
    left_in = Input()
    right_in = Input()
    stacked = Stack(axis=1)([left_in, right_in])
    model = Model([left_in, right_in], stacked)

    left = np.array([[1, 2], [10, 20]])
    right = np.array([[3, 4], [30, 40]])
    arrays = [left, right]

    expected = np.stack(arrays, axis=1)
    predicted = model.predict(arrays)

    assert_array_equal(predicted, expected)
コード例 #16
0
def test_fit_predict_standard_stack(teardown):
    """Compare a baikal stacking model against ``sklearn.ensemble.StackingClassifier``.

    This uses the "standard" protocol where the 2nd level features
    are the out-of-fold predictions of the 1st. It also appends the
    original data to the 2nd level features.
    See for example: https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html
    """
    X_train, X_test, y_t_train, y_t_test = train_test_split(
        breast_cancer.data,
        breast_cancer.target,
        test_size=0.2,
        random_state=0,
    )
    seed = 42

    # baikal way
    x = Input()
    y_t = Input()

    forest_step = RandomForestClassifierOOF(n_estimators=10, random_state=seed)
    forest_proba = forest_step(x, y_t, compute_func="predict_proba")
    # remove collinear feature
    forest_proba = Lambda(lambda array: array[:, 1:])(forest_proba)

    x_scaled = StandardScaler()(x)
    svc_step = LinearSVCOOF(random_state=seed)
    svc_scores = svc_step(x_scaled, y_t, compute_func="decision_function")

    # Original data plus both first-level outputs feed the final estimator
    stacked_features = ColumnStack()([x, forest_proba, svc_scores])
    final_step = LogisticRegression(solver="liblinear", random_state=seed)
    y_p = final_step(stacked_features, y_t)

    model = Model(x, y_p, y_t)
    model.fit(X_train, y_t_train)
    y_pred_baikal = model.predict(X_test)

    # traditional way
    forest_estimator = RandomForestClassifier(n_estimators=10, random_state=seed)
    svc_pipeline = make_pipeline(StandardScaler(), LinearSVC(random_state=seed))
    estimators = [("rf", forest_estimator), ("svr", svc_pipeline)]
    final_estimator = LogisticRegression(solver="liblinear", random_state=seed)
    clf = sklearn.ensemble.StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        passthrough=True,
    )
    clf.fit(X_train, y_t_train)
    y_pred_traditional = clf.predict(X_test)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
コード例 #17
0
def test_columnstack(teardown):
    """The ``ColumnStack`` step must match ``np.column_stack`` on two 1-D arrays."""
    left_in = Input()
    right_in = Input()
    stacked = ColumnStack()([left_in, right_in])
    model = Model([left_in, right_in], stacked)

    columns = [np.array([1, 10, 100]), np.array([2, 20, 200])]

    expected = np.column_stack(columns)
    predicted = model.predict(columns)

    assert_array_equal(predicted, expected)
コード例 #18
0
ファイル: test_model.py プロジェクト: vitalyvels/baikal
def test_fit_and_predict_model_with_no_fittable_steps(teardown):
    """Fit and predict a two-input model made of ``DummyMISO`` and ``DummySISO`` steps.

    For inputs ``[[1, 2], [3, 4]]`` and ``[[5, 6], [7, 8]]`` the model is
    expected to predict ``[[12, 16], [20, 24]]``.
    """
    first = np.array([[1, 2], [3, 4]])
    second = np.array([[5, 6], [7, 8]])
    expected = np.array([[12, 16], [20, 24]])

    first_in = Input()
    second_in = Input()
    merged = DummyMISO()([first_in, second_in])
    out = DummySISO()(merged)
    model = Model([first_in, second_in], out)

    model.fit([first, second])  # nothing to fit
    predicted = model.predict([first, second])

    assert_array_equal(predicted, expected)
コード例 #19
0
ファイル: test_expression.py プロジェクト: vitalyvels/baikal
def test_lambda(teardown):
    """A two-output ``Lambda`` step forwards the extra keyword arguments to its function.

    With ``p1=2`` and ``p2=2`` the first output is the input doubled and the
    second output is the input halved.
    """

    def function(x1, x2, p1, p2=1):
        scaled = p1 * x1
        divided = x2 / p2
        return scaled, divided

    inp = Input()
    lambda_step = Lambda(function, n_outputs=2, p1=2, p2=2)
    doubled_out, halved_out = lambda_step([inp, inp])
    model = Model(inp, [doubled_out, halved_out])

    samples = np.array([[1.0, 2.0], [3.0, 4.0]])
    expected_doubled = np.array([[2.0, 4.0], [6.0, 8.0]])
    expected_halved = np.array([[0.5, 1.0], [1.5, 2.0]])

    predicted_doubled, predicted_halved = model.predict(samples)

    assert_array_equal(predicted_doubled, expected_doubled)
    assert_array_equal(predicted_halved, expected_halved)
コード例 #20
0
def test_fit_predict_naive_stack_with_proba_features(teardown):
    """Compare a baikal naive stack on probability/leaf features with the manual version.

    The first level emits ``predict_proba`` (logistic regression) and ``apply``
    (random forest) outputs; the first column of each is dropped before they
    are concatenated along axis 1 and passed to the final logistic regression.
    """
    # Reduce to binary problem to avoid ConvergenceWarning
    keep = iris.target != 2
    x_data, y_t_data = iris.data[keep], iris.target[keep]
    seed = 123
    n_trees = 5

    # baikal way
    x = Input()
    y_t = Input()
    logreg_step = LogisticRegression(random_state=seed)
    proba_out = logreg_step(x, y_t, compute_func="predict_proba")
    forest_step = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
    leaves_out = forest_step(x, y_t, compute_func="apply")
    proba_out = Lambda(compute_func=lambda array: array[:, 1:])(proba_out)
    leaves_out = Lambda(compute_func=lambda array: array[:, 1:])(leaves_out)
    joined = Concatenate(axis=1)([proba_out, leaves_out])
    y_p = LogisticRegression(random_state=seed)(joined, y_t)

    model = Model(x, y_p, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = LogisticRegression(random_state=seed)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = RandomForestClassifier(n_estimators=n_trees, random_state=seed)
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    second_level = np.concatenate(
        [logreg_proba[:, 1:], random_forest_leafidx[:, 1:]], axis=1
    )
    stacked = LogisticRegression(random_state=seed)
    stacked.fit(second_level, y_t_data)
    y_pred_traditional = stacked.predict(second_level)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
コード例 #21
0
ファイル: test_model.py プロジェクト: vitalyvels/baikal
def test_fit_predict_ensemble_with_proba_features(teardown):
    """Compare a baikal ensemble on probability/leaf features with the manual version.

    The first-level steps are created with ``function="predict_proba"`` and
    ``function="apply"``; their outputs are concatenated along axis 1 and fed
    to a final logistic regression. The scikit-learn equivalent must predict
    the same labels.
    """
    # Reduce to binary problem to avoid ConvergenceWarning
    keep = iris.target != 2
    x_data, y_t_data = iris.data[keep], iris.target[keep]
    seed = 123
    n_trees = 5

    # baikal way
    x = Input()
    y_t = Input()
    proba_out = LogisticRegression(random_state=seed, function="predict_proba")(x, y_t)
    forest_step = RandomForestClassifier(
        n_estimators=n_trees, random_state=seed, function="apply"
    )
    leaves_out = forest_step(x, y_t)
    joined = Concatenate(axis=1)([proba_out, leaves_out])
    y = LogisticRegression(random_state=seed)(joined, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=seed)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_trees, random_state=seed
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    second_level = np.concatenate([logreg_proba, random_forest_leafidx], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=seed)
    ensemble.fit(second_level, y_t_data)
    y_pred_traditional = ensemble.predict(second_level)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
コード例 #22
0
# Build a chain of classifiers: `x`, `y_t`, `n_targets` and `order` are
# defined earlier in the script (not shown in this excerpt).
ys_t = Split(n_targets, axis=1)(y_t)
ys_p = []
for j, k in enumerate(order):
    # The j-th classifier in the chain receives x together with the
    # predictions of all classifiers created before it (ys_p[:j]).
    x_stacked = ColumnStack()(inputs=[x, *ys_p[:j]])
    # NOTE(review): `squeeze` is defined outside this excerpt; presumably it
    # drops the singleton axis left by Split -- confirm.
    ys_t[k] = squeeze(ys_t[k])
    ys_p.append(LogisticRegression(solver="lbfgs")(x_stacked, ys_t[k]))

# ys_p is in chain order (position i predicts target order[i]); reorder it so
# that position j holds the prediction for target j.
ys_p = [ys_p[order.index(j)] for j in range(n_targets)]
y_p = ColumnStack()(ys_p)

model = Model(inputs=x, outputs=y_p, targets=y_t)
# This might take a few seconds
plot_model(model, filename="classifier_chain.png", dpi=96)

# ------- Train model
model.fit(X_train, Y_train)

# ------- Evaluate model
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

# Report the sample-averaged Jaccard score on both splits
print(
    "Jaccard score on train data:",
    jaccard_score(Y_train, Y_train_pred, average="samples"),
)
print(
    "Jaccard score on test data:",
    jaccard_score(Y_test, Y_test_pred, average="samples"),
)
コード例 #23
0
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model

# 1. Define a step
# make_step wraps the scikit-learn estimator class into a baikal step class
SVC = make_step(sklearn.svm.SVC)

# 2. Build the model
# x is the data input and y_t the target input; calling the step on them
# yields the model output y_p
x = Input()
y_t = Input()
y_p = SVC(C=1.0, kernel="rbf", gamma=0.5)(x, y_t)

model = Model(x, y_p, y_t)
# Writes a picture of the model graph to the given file
plot_model(model, filename="readme_quick_example.png")

# 3. Train the model
dataset = load_breast_cancer()
# random_state=0 keeps the train/test split reproducible
X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                    dataset.target,
                                                    random_state=0)

model.fit(X_train, y_train)

# 4. Use the model
y_test_pred = model.predict(X_test)
コード例 #24
0
# NOTE: x1, x2, y_t, z, y1 and y2 are defined earlier in the script
# (not shown in this excerpt).
y3 = LogisticRegression()(z, y_t)

# The three first-level outputs are stacked and fed to the final classifier
stacked_features = Stack()([y1, y2, y3])
y_p = SVC()(stacked_features, y_t)

model = Model([x1, x2], y_p, y_t)
plot_model(model,
           filename="multiple_input_nonlinear_pipeline_example_plot.png")

# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(dataset.data,
                                                    dataset.target,
                                                    random_state=0)

# Let's suppose the dataset is originally split in two
# (the first 15 columns go to the first input, the remaining ones to the second)
X1_train, X2_train = X_train[:, :15], X_train[:, 15:]
X1_test, X2_test = X_test[:, :15], X_test[:, 15:]

model.fit([X1_train, X2_train], y_train)

# 4. Use the model
y_test_pred = model.predict([X1_test, X2_test])

# This also works:
# y_test_pred = model.predict({x1: X1_test, x2: X2_test})

# We can also query any intermediate outputs:
# (the names below refer to steps created earlier in the script)
outs = model.predict([X1_test, X2_test],
                     output_names=["ExtraTreesClassifier_0:0/0", "PCA_0:0/0"])
コード例 #25
0
def test_steps_cache(teardown):
    """Track hits and misses of ``model._nodes_cache`` across fit/predict calls.

    After every call the cache counters are compared against the running
    totals of expected hits and misses.
    """

    def check_counters(current_model, expected_hits, expected_misses):
        # Compare both cache counters against the running expectation.
        assert (
            current_model._nodes_cache.hits == expected_hits
            and current_model._nodes_cache.misses == expected_misses
        )

    x1_data = iris.data[:, :2]
    x2_data = iris.data[:, 2:]
    y1_t_data = iris.target

    x1 = Input(name="x1")
    x2 = Input(name="x2")
    y1_t = Input(name="y1_t")
    y1 = LogisticRegression(name="LogReg")(x1, y1_t)
    y2 = PCA(name="PCA")(x2)

    n_hits = 0
    n_misses = 0

    # 1) instantiation always misses
    n_misses += 1
    model = Model([x1, x2], [y1, y2], y1_t)
    check_counters(model, n_hits, n_misses)

    # 2) calling fit for the first time, hence a miss
    n_misses += 1
    model.fit([x1_data, x2_data], y1_t_data)
    check_counters(model, n_hits, n_misses)

    # 3) same as above, just different format, hence a hit
    n_hits += 1
    model.fit({x1: x1_data, x2: x2_data}, {y1_t: y1_t_data})
    check_counters(model, n_hits, n_misses)

    # 4) trainable flags are considered in cache keys, hence a miss
    n_misses += 1
    model.get_step("LogReg").trainable = False
    # NOTE: target is superfluous, but it affects caching
    model.fit([x1_data, x2_data], y1_t_data)
    check_counters(model, n_hits, n_misses)

    # 5) same as above, just different format, hence a hit
    n_hits += 1
    model.fit({x1: x1_data, x2: x2_data}, y1_t_data)
    check_counters(model, n_hits, n_misses)

    # 6) we drop the (superfluous) target, hence a miss
    n_misses += 1
    model.fit({x1: x1_data, x2: x2_data})
    check_counters(model, n_hits, n_misses)

    # 7) same as above, hence a hit
    n_hits += 1
    model.fit({x1: x1_data, x2: x2_data})
    check_counters(model, n_hits, n_misses)

    # 8) we restore the flag, becoming the same as 2) and 3), hence a hit
    n_hits += 1
    model.get_step("LogReg").trainable = True
    model.fit({x1: x1_data, x2: x2_data}, y1_t_data)
    check_counters(model, n_hits, n_misses)

    # 9) new inputs/targets/outputs signature, hence a miss
    n_misses += 1
    model.predict([x1_data, x2_data])
    check_counters(model, n_hits, n_misses)

    # 10) same inputs/outputs signature as 9), hence a hit
    n_hits += 1
    model.predict({"x1": x1_data, "x2": x2_data}, ["PCA:0/0", "LogReg:0/0"])
    check_counters(model, n_hits, n_misses)

    # 11) new inputs/outputs signature, hence a miss
    n_misses += 1
    model.predict({x1: x1_data}, "LogReg:0/0")
    check_counters(model, n_hits, n_misses)

    # 12) same as above, hence a hit
    n_hits += 1
    model.predict({x1: x1_data}, "LogReg:0/0")
    check_counters(model, n_hits, n_misses)