Example #1
def test_fit_predict_pipeline(teardown):
    x_data = iris.data
    y_t_data = iris.target
    random_state = 123
    n_components = 2

    # baikal way
    x = Input()
    y_t = Input()
    x_pca = PCA(n_components=n_components, random_state=random_state, name="pca")(x)
    y = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        random_state=random_state,
        name="logreg",
    )(x_pca, y_t)

    model = Model(x, y, y_t)
    y_pred_baikal = model.fit(x_data, y_t_data).predict(x_data)

    # traditional way
    pca = PCA(n_components=n_components, random_state=random_state)
    logreg = LogisticRegression(
        multi_class="multinomial", solver="lbfgs", random_state=random_state
    )
    x_data_transformed = pca.fit_transform(x_data)
    y_pred_traditional = logreg.fit(x_data_transformed, y_t_data).predict(
        x_data_transformed
    )

    assert_array_equal(y_pred_baikal, y_pred_traditional)
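Throughout these tests, names like PCA and LogisticRegression refer to baikal step classes that shadow the scikit-learn estimators of the same name (the plain estimators appear fully qualified, e.g. sklearn.linear_model.LogisticRegression). A minimal sketch of how such steps are presumably defined, using baikal's make_step:

import sklearn.decomposition
import sklearn.linear_model
from baikal import make_step

# Step classes wrapping the scikit-learn estimators; the names
# deliberately shadow the originals, as in the tests here.
PCA = make_step(sklearn.decomposition.PCA)
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)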
Example #2
def test_fit_predict_ensemble(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state)(x, y_t)
    y2 = RandomForestClassifier(random_state=random_state)(x, y_t)
    features = Stack(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_pred = logreg.predict(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(random_state=random_state)
    random_forest.fit(x_data, y_t_data)
    random_forest_pred = random_forest.predict(x_data)

    features = np.stack([logreg_pred, random_forest_pred], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
Example #3
def test_fit_params(teardown):
    x_data = iris.data
    y_t_data = iris.target
    random_state = 123
    n_components = 2

    sample_weight = y_t_data + 1  # Just weigh the classes differently
    fit_params = {"logreg__sample_weight": sample_weight}

    # baikal way
    x = Input()
    y_t = Input()
    x_pca = PCA(n_components=n_components, random_state=random_state, name="pca")(x)
    y = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        random_state=random_state,
        name="logreg",
    )(x_pca, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data, **fit_params)

    # traditional way
    pca = PCA(n_components=n_components, random_state=random_state)
    logreg = LogisticRegression(
        multi_class="multinomial", solver="lbfgs", random_state=random_state
    )
    pipe = Pipeline([("pca", pca), ("logreg", logreg)])
    pipe.fit(x_data, y_t_data, **fit_params)

    # Use assert_allclose instead of all equal due to small numerical differences
    # between fit_transform(...) and fit(...).transform(...)
    assert_allclose(model.get_step("logreg").coef_, pipe.named_steps["logreg"].coef_)
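The numerical caveat in the comment above can be reproduced with scikit-learn alone (a standalone sketch: fit_transform and fit(...).transform(...) take slightly different floating-point paths):

import numpy as np
from sklearn.decomposition import PCA as SkPCA

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
a = SkPCA(n_components=2, random_state=0).fit_transform(X)
b = SkPCA(n_components=2, random_state=0).fit(X).transform(X)
print(np.abs(a - b).max())  # tiny, but often nonzero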
Example #4
def test_grid_search_cv_with_tunable_step():
    param_grid = {
        "classifier": [
            LogisticRegression(random_state=random_state),
            RandomForestClassifier(random_state=random_state),
        ],
        "pca__n_components": [2, 4],
    }

    # baikal way
    def build_fn():
        x = Input()
        y_t = Input()
        h = PCA(random_state=random_state, name="pca")(x)
        y = LogisticRegression(random_state=random_state,
                               name="classifier")(h, y_t)
        model = Model(x, y, y_t)
        return model

    sk_model = SKLearnWrapper(build_fn)
    gscv_baikal = GridSearchCV(
        sk_model,
        param_grid,
        cv=cv,
        scoring="accuracy",
        return_train_score=True,
        verbose=verbose,
    )
    gscv_baikal.fit(x_data, y_t_data)

    # traditional way
    pca = PCA(random_state=random_state)
    classifier = LogisticRegression(random_state=random_state)
    pipe = Pipeline([("pca", pca), ("classifier", classifier)])

    gscv_traditional = GridSearchCV(
        pipe,
        param_grid,
        cv=cv,
        scoring="accuracy",
        return_train_score=True,
        verbose=verbose,
    )
    gscv_traditional.fit(x_data, y_t_data)

    assert gscv_baikal.best_params_ == gscv_traditional.best_params_
    assert_array_equal(
        gscv_traditional.cv_results_["mean_train_score"],
        gscv_baikal.cv_results_["mean_train_score"],
    )
    assert_array_equal(
        gscv_traditional.cv_results_["mean_test_score"],
        gscv_baikal.cv_results_["mean_test_score"],
    )
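This test (like test_grid_search_cv in Example #23) reads x_data, y_t_data, random_state, cv, and verbose from module scope. A sketch of the assumed module-level setup (the exact values in the original suite may differ):

from sklearn.datasets import load_iris

iris = load_iris()
x_data = iris.data
y_t_data = iris.target
random_state = 123
cv = 3        # number of cross-validation folds
verbose = 0   # GridSearchCV verbosity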
Example #5
    def test_with_unnecessary_target(self, teardown):
        x = Input()
        y_t = Input()
        logreg = LogisticRegression()
        y_p = logreg(x, y_t)
        model = Model(x, y_p, y_t)

        model.fit(iris.data, iris.target)

        # won't require the target if trainable was set to False,
        # but won't complain if it was passed to fit
        logreg.trainable = False
        model.fit(iris.data, iris.target)
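Per the comment above, once trainable is False the target becomes optional, so the assumed behavior is that calling fit without the target would also succeed here:

        model.fit(iris.data)  # target no longer required once trainable is False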
Example #6
def make_naive_stacked_model(n_components, random_state, x_data, y_t_data):
    # An unnecessarily complex Model

    # Sub-model 1
    x1 = Input(name="x1")
    y1_t = Input(name="y1_t")
    h1 = PCA(n_components=n_components, random_state=random_state, name="pca_sub1")(x1)
    y1 = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        random_state=random_state,
        name="logreg_sub1",
    )(h1, y1_t)
    submodel1 = Model(x1, y1, y1_t, name="submodel1")

    # Sub-model 2 (a nested stacked model)
    x2 = Input(name="x2")
    y2_t = Input(name="y2_t")
    y2_1 = RandomForestClassifier(random_state=random_state, name="rforest_sub2")(
        x2, y2_t
    )
    y2_2 = ExtraTreesClassifier(random_state=random_state, name="extrees_sub2")(
        x2, y2_t
    )
    features = Stack(axis=1, name="stack_sub2")([y2_1, y2_2])
    y2 = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        random_state=random_state,
        name="logreg_sub2",
    )(features, y2_t)
    submodel2 = Model(x2, y2, y2_t, name="submodel2")

    # Stack of submodels
    x = Input(name="x")
    y_t = Input(name="y_t")
    y1 = submodel1(x, y_t)
    y2 = submodel2(x, y_t)
    features = Stack(axis=1, name="stack")([y1, y2])
    y = LogisticRegression(
        multi_class="multinomial",
        solver="lbfgs",
        random_state=random_state,
        name="logreg_stacked",
    )(features, y_t)
    stacked_model_baikal = Model(x, y, y_t, name="stacked")

    stacked_model_baikal.fit(x_data, y_t_data)

    return stacked_model_baikal
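Example usage of the factory above (a sketch, assuming the iris data used elsewhere in these tests):

stacked = make_naive_stacked_model(
    n_components=2, random_state=123, x_data=iris.data, y_t_data=iris.target
)
y_pred = stacked.predict(iris.data)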
Example #7
def test_fit_predict_standard_stack(teardown):
    # This uses the "standard" stacking protocol, where the 2nd-level
    # features are the out-of-fold predictions of the 1st-level estimators.
    # It also appends the original data to the 2nd-level features.
    # See for example: https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html
    X_data, y_t_data = breast_cancer.data, breast_cancer.target
    X_train, X_test, y_t_train, y_t_test = train_test_split(X_data,
                                                            y_t_data,
                                                            test_size=0.2,
                                                            random_state=0)
    random_state = 42

    # baikal way
    x = Input()
    y_t = Input()

    y_p1 = RandomForestClassifierOOF(n_estimators=10,
                                     random_state=random_state)(
                                         x, y_t, compute_func="predict_proba")
    y_p1 = Lambda(lambda array: array[:, 1:])(y_p1)  # remove collinear feature

    x_scaled = StandardScaler()(x)
    y_p2 = LinearSVCOOF(random_state=random_state)(
        x_scaled, y_t, compute_func="decision_function")

    stacked_features = ColumnStack()([x, y_p1, y_p2])
    y_p = LogisticRegression(solver="liblinear",
                             random_state=random_state)(stacked_features, y_t)

    model = Model(x, y_p, y_t)
    model.fit(X_train, y_t_train)
    y_pred_baikal = model.predict(X_test)

    # traditional way
    estimators = [
        ("rf",
         RandomForestClassifier(n_estimators=10, random_state=random_state)),
        ("svr",
         make_pipeline(StandardScaler(),
                       LinearSVC(random_state=random_state))),
    ]
    clf = sklearn.ensemble.StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(solver="liblinear",
                                           random_state=random_state),
        passthrough=True,
    )
    y_pred_traditional = clf.fit(X_train, y_t_train).predict(X_test)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
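RandomForestClassifierOOF and LinearSVCOOF are assumed to be step variants whose fit-time outputs are out-of-fold predictions. What that means, sketched with plain scikit-learn (reusing X_train and y_t_train from the test above):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

# Each training row is predicted by a model that never saw it, which is
# what the 2nd-level estimator should be trained on.
oof_proba = cross_val_predict(
    RandomForestClassifier(n_estimators=10, random_state=42),
    X_train, y_t_train, cv=5, method="predict_proba",
)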
Example #8
    def test_with_steps_with_duplicated_names(self, teardown):
        x = Input()
        h = PCA(name="duplicated-name")(x)
        y = LogisticRegression(name="duplicated-name")(h)

        with pytest.raises(RuntimeError):
            Model(x, y)
Example #9
    @pytest.fixture
    def dataplaceholders(self):
        # pytest fixture providing the placeholders used by the
        # step-property tests (test_inputs, test_outputs, etc.)
        x1 = Input(name="x1")
        x2 = Input(name="x2")
        y1_t = Input(name="y1_t")
        y1 = LogisticRegression()(x1, y1_t)
        y2 = PCA()(x2)
        return x1, x2, y1, y2, y1_t
Example #10
def test_nested_model(teardown):
    x_data = iris.data
    y_t_data = iris.target

    # Sub-model
    x = Input()
    y_t = Input()
    h = PCA(n_components=2)(x)
    y = LogisticRegression()(h, y_t)
    submodel = Model(x, y, y_t)

    # Model
    x = Input()
    y_t = Input()
    y = submodel(x, y_t)
    model = Model(x, y, y_t)

    with raises_with_cause(RuntimeError, NotFittedError):
        submodel.predict(x_data)

    model.fit(x_data, y_t_data)
    y_pred = model.predict(x_data)
    y_pred_sub = submodel.predict(x_data)

    assert_array_equal(y_pred, y_pred_sub)
Example #11
def test_plot_independent_submodels(teardown, tmp_path, expand_nested):
    xs, y_ts, y_ps, steps = [], [], [], []

    for i in range(3):
        step = (LogisticRegression() if i == 0 else build_submodel(
            LogisticRegression(), i))
        x = Input(name="x{}".format(i))
        y_t = Input(name="y_t{}".format(i))
        y_p = step(x, y_t)
        xs.append(x)
        y_ts.append(y_t)
        y_ps.append(y_p)

    model = Model(xs, y_ps, y_ts)

    filename = str(tmp_path / "test_plot_model.png")
    plot_model(model, filename, show=False, expand_nested=expand_nested)
Example #12
def build_fn():
    x = Input()
    y_t = Input()
    h = PCA(random_state=random_state, name="pca")(x)
    y = LogisticRegression(random_state=random_state,
                           name="classifier")(h, y_t)
    model = Model(x, y, y_t)
    return model
Example #13
    def test_with_undefined_target(self, teardown):
        x = Input()
        y = LogisticRegression()(x, trainable=True)
        model = Model(inputs=x, outputs=y)
        with raises_with_cause(RuntimeError, TypeError):
            # LogisticRegression.fit will be called with not enough arguments
            # hence the TypeError
            model.fit(iris.data)
Example #14
    @pytest.fixture
    def dataplaceholders(self):
        # Variant fixture with a rescaled input between x1 and the classifier.
        x1 = Input(name="x1")
        x2 = Input(name="x2")
        y1_t = Input(name="y1_t")
        x1_rescaled = StandardScaler()(x1)
        y1 = LogisticRegression()(x1_rescaled, y1_t)
        y2 = PCA()(x2)
        return x1, x2, x1_rescaled, y1, y2, y1_t
Example #15
    def test_set_params(self, teardown):
        step = LogisticRegression()

        new_params_wrong = {"non_existent_param": 42}
        with pytest.raises(ValueError):
            step.set_params(**new_params_wrong)

        new_params = {"C": 100.0, "fit_intercept": False, "penalty": "l1"}

        step.set_params(**new_params)
        params = step.get_params()

        expected = {
            "C": 100.0,
            "class_weight": None,
            "dual": False,
            "fit_intercept": False,
            "intercept_scaling": 1,
            "max_iter": 100,
            "multi_class": "warn",
            "n_jobs": None,
            "penalty": "l1",
            "random_state": None,
            "solver": "warn",
            "tol": 0.0001,
            "verbose": 0,
            "warm_start": False,
            "l1_ratio": None,
        }

        assert expected == params
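Note: the "warn" values for multi_class and solver (together with the presence of l1_ratio) pin these expected defaults to scikit-learn 0.21; later releases changed the defaults (e.g. solver became "lbfgs").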
Example #16
    def test_compute_func(self, simple_step, shared_step, dataplaceholders, teardown):
        assert simple_step.compute_func == simple_step.predict
        simple_step.compute_func = simple_step.predict_proba
        assert simple_step.compute_func == simple_step.predict_proba

        with pytest.raises(AttributeError):
            shared_step.compute_func

        with pytest.raises(AttributeError):
            shared_step.compute_func = shared_step.predict_proba

        with pytest.raises(AttributeError):
            # because the step hasn't been called
            LogisticRegression().compute_func

        with pytest.raises(AttributeError):
            # because the step hasn't been called
            LogisticRegression().compute_func = lambda x: x
Example #17
    def test_trainable(self, simple_step, shared_step, dataplaceholders, teardown):
        assert simple_step.trainable
        simple_step.trainable = False
        assert not simple_step.trainable

        with pytest.raises(AttributeError):
            shared_step.trainable

        with pytest.raises(AttributeError):
            shared_step.trainable = True

        with pytest.raises(AttributeError):
            # because the step hasn't been called
            LogisticRegression().trainable

        with pytest.raises(AttributeError):
            # because the step hasn't been called
            LogisticRegression().trainable = False
Example #18
def build_fn():
    x = Input()
    y_t = Input()
    h = PCA(random_state=random_state, name="pca")(x)
    y = LogisticRegression(random_state=random_state,
                           solver="liblinear",
                           name="logreg")(h, y_t)
    model = Model(x, y, y_t)
    return model
Example #19
    def test_inputs(self, simple_step, shared_step, dataplaceholders, teardown):
        x1 = dataplaceholders[0]
        assert simple_step.inputs == [x1]

        with pytest.raises(AttributeError):
            shared_step.inputs

        with pytest.raises(AttributeError):
            # because the step hasn't been called
            LogisticRegression().inputs
Example #20
    def test_predict_with_not_fitted_steps(self, teardown):
        x_data = iris.data

        x = Input(name="x")
        xt = PCA(n_components=2)(x)
        y = LogisticRegression(multi_class="multinomial", solver="lbfgs")(xt)

        model = Model(x, y)
        with raises_with_cause(RuntimeError, NotFittedError):
            model.predict(x_data)
Example #21
    def test_outputs(self, simple_step, shared_step, dataplaceholders, teardown):
        *_, y_simple, y_shared_1, y_shared_2 = dataplaceholders
        assert simple_step.outputs == [y_simple]

        with pytest.raises(AttributeError):
            shared_step.outputs

        with pytest.raises(AttributeError):
            # because the step hasn't been called
            LogisticRegression().outputs
Example #22
    def test_targets(self, simple_step, shared_step, dataplaceholders, teardown):
        y_t = dataplaceholders[2]
        assert simple_step.targets == [y_t]

        with pytest.raises(AttributeError):
            shared_step.targets

        with pytest.raises(AttributeError):
            # because the step hasn't been called
            LogisticRegression().targets
Example #23
def test_grid_search_cv():
    param_grid = {
        "pca__n_components": [2, 4],
        "logreg__C": [0.1, 1.0, 10],
        "logreg__penalty": ["l1", "l2"],
    }

    # baikal way
    def build_fn():
        x = Input()
        y_t = Input()
        h = PCA(random_state=random_state, name="pca")(x)
        y = LogisticRegression(random_state=random_state,
                               solver="liblinear",
                               name="logreg")(h, y_t)
        model = Model(x, y, y_t)
        return model

    sk_model = SKLearnWrapper(build_fn)
    assert isinstance(sk_model.model, Model)

    gscv_baikal = GridSearchCV(
        sk_model,
        param_grid,
        cv=cv,
        scoring="accuracy",
        return_train_score=True,
        verbose=verbose,
    )
    gscv_baikal.fit(x_data, y_t_data)

    # traditional way
    pca = PCA(random_state=random_state)
    logreg = LogisticRegression(random_state=random_state, solver="liblinear")
    pipe = Pipeline([("pca", pca), ("logreg", logreg)])

    gscv_traditional = GridSearchCV(
        pipe,
        param_grid,
        cv=cv,
        scoring="accuracy",
        return_train_score=True,
        verbose=verbose,
    )
    gscv_traditional.fit(x_data, y_t_data)

    assert gscv_baikal.best_params_ == gscv_traditional.best_params_
    assert_array_equal(
        gscv_traditional.cv_results_["mean_train_score"],
        gscv_baikal.cv_results_["mean_train_score"],
    )
    assert_array_equal(
        gscv_traditional.cv_results_["mean_test_score"],
        gscv_baikal.cv_results_["mean_test_score"],
    )
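For GridSearchCV to route grid entries such as "pca__n_components" and "logreg__C", SKLearnWrapper presumably forwards get_params/set_params to the wrapped Model's named steps, mirroring scikit-learn's Pipeline convention; the matching cv_results_ asserted above confirm the two routes behave identically.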
Example #24
    def test_with_non_fitted_non_trainable_step(self, teardown):
        x = Input()
        y_t = Input()
        z = PCA()(x, trainable=False)
        y = LogisticRegression()(z, y_t)
        model = Model(x, y, y_t)
        with raises_with_cause(RuntimeError, NotFittedError):
            # this will raise an error when calling compute
            # on PCA which was flagged as trainable=False but
            # hasn't been fitted
            model.fit(iris.data, iris.target)
Example #25
    def test_with_unnecessary_inputs(self, teardown):
        x1 = Input()
        x2 = Input()
        y_t = Input()
        h = PCA()(x1)
        y = LogisticRegression()(h, y_t)

        with pytest.raises(ValueError):
            Model([x1, x2], y, y_t)

        with pytest.raises(ValueError):
            Model([x1, h], y, y_t)  # x1 is an unnecessary input upstream of h
Example #26
def test_fit_predict_ensemble_with_proba_features(teardown):
    mask = iris.target != 2  # Reduce to binary problem to avoid ConvergenceWarning
    x_data = iris.data[mask]
    y_t_data = iris.target[mask]
    random_state = 123
    n_estimators = 5

    # baikal way
    x = Input()
    y_t = Input()
    y1 = LogisticRegression(random_state=random_state, function="predict_proba")(x, y_t)
    y2 = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state, function="apply"
    )(x, y_t)
    features = Concatenate(axis=1)([y1, y2])
    y = LogisticRegression(random_state=random_state)(features, y_t)

    model = Model(x, y, y_t)
    model.fit(x_data, y_t_data)
    y_pred_baikal = model.predict(x_data)

    # traditional way
    logreg = sklearn.linear_model.LogisticRegression(random_state=random_state)
    logreg.fit(x_data, y_t_data)
    logreg_proba = logreg.predict_proba(x_data)

    random_forest = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )
    random_forest.fit(x_data, y_t_data)
    random_forest_leafidx = random_forest.apply(x_data)

    features = np.concatenate([logreg_proba, random_forest_leafidx], axis=1)
    ensemble = sklearn.linear_model.LogisticRegression(random_state=random_state)
    ensemble.fit(features, y_t_data)
    y_pred_traditional = ensemble.predict(features)

    assert_array_equal(y_pred_baikal, y_pred_traditional)
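Note: function="predict_proba" and function="apply" here appear to be the older spelling of the compute_func argument used in Example #7; either way, the argument selects which method of the underlying estimator produces the step's output.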
Example #27
    def test_get_params(self, teardown):
        step = LogisticRegression()
        params = step.get_params()

        expected = {
            "C": 1.0,
            "class_weight": None,
            "dual": False,
            "fit_intercept": True,
            "intercept_scaling": 1,
            "max_iter": 100,
            "multi_class": "warn",
            "n_jobs": None,
            "penalty": "l2",
            "random_state": None,
            "solver": "warn",
            "tol": 0.0001,
            "verbose": 0,
            "warm_start": False,
            "l1_ratio": None,
        }

        assert expected == params
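As in Example #15, the "warn" entries pin these expected defaults to scikit-learn 0.21.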
Example #28
def test_plot_nested_submodels(teardown, tmp_path, levels, expand_nested):

    submodels = [LogisticRegression()]
    for level in range(levels):
        sub_model = build_submodel(submodels[level], level + 1)
        submodels.append(sub_model)

    x = Input(name="x")
    y_t = Input(name="y_t")
    y_p = submodels[-1](x, y_t)
    model = Model(x, y_p, y_t)

    filename = str(tmp_path / "test_plot_model.png")
    plot_model(model, filename, show=False, expand_nested=expand_nested)
Example #29
    def test_with_wrong_type(self, teardown):
        x = Input()
        y_t = Input()
        y = LogisticRegression()(x, y_t)

        wrong = np.zeros((10,))
        with pytest.raises(ValueError):
            Model(wrong, y, y_t)

        with pytest.raises(ValueError):
            Model(x, wrong, y_t)

        with pytest.raises(ValueError):
            Model(x, y, wrong)
Example #30
def test_plot_shared_submodel(teardown, tmp_path, expand_nested):
    xs, y_ts, y_ps = [], [], []
    submodel = build_submodel(LogisticRegression(), 0)

    for i in range(2):
        x = Input(name="x{}".format(i))
        y_t = Input(name="y_t{}".format(i))
        y_p = submodel(x, y_t)
        xs.append(x)
        y_ts.append(y_t)
        y_ps.append(y_p)

    model = Model(xs, y_ps, y_ts)

    filename = str(tmp_path / PNG_FILENAME)
    plot_model(model, filename, show=False, expand_nested=expand_nested)