Ejemplo n.º 1
0
def _yield_datasets(model: Estimator):
    """Yield the datasets that can be used to exercise ``model``.

    The kind of estimator, as reported by ``utils.inspect``, decides what is produced:

    - recommenders: a small user/item ratings dataset, and only when the model is a regressor;
    - multi-output regressors: scikit-learn's Linnerud data and a one-hot encoded SolarFlare;
    - regressors: TrumpApproval;
    - multi-output classifiers: Music;
    - classifiers: Phishing (twice, the second time with ``np.bool_`` targets), plus
      ImageSegments for multi-class models that are not tagged as needing positive inputs.

    Every River dataset is truncated to its first 200 samples.
    """

    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    n_samples = 200

    # Recommenders may be regressors or classifiers, but they put requirements on the structure
    # of the data, so they get a dedicated dataset and nothing else.
    if isinstance(utils.inspect.extract_relevant(model), Recommender):
        if utils.inspect.isregressor(model):
            ratings = (
                ("******", "Superman", 8),
                ("******", "Terminator", 9),
                ("******", "Star Wars", 8),
                ("******", "Notting Hill", 2),
                ("******", "Harry Potter", 5),
                ("******", "Superman", 8),
                ("******", "Terminator", 9),
                ("******", "Star Wars", 8),
                ("******", "Notting Hill", 2),
            )
            yield _DummyDataset(
                *(({"user": user, "item": item}, rating) for user, item, rating in ratings)
            )
        return

    # --- Regression side ---
    if utils.inspect.ismoregressor(model):

        # First multi-output dataset: taken straight from scikit-learn
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # Second multi-output dataset: SolarFlare with its string features one-hot encoded
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                categorical = compose.SelectType(str) | preprocessing.OneHotEncoder()
                encoder = categorical + compose.SelectType(int)
                for features, targets in datasets.SolarFlare().take(n_samples):
                    yield encoder.transform_one(features), targets

        yield SolarFlare()

    elif utils.inspect.isregressor(model):
        # Single-output regression
        yield datasets.TrumpApproval().take(n_samples)

    # --- Classification side (checked independently of the regression side) ---
    if utils.inspect.ismoclassifier(model):
        # Multi-output classification
        yield datasets.Music().take(n_samples)

    elif utils.inspect.isclassifier(model):

        yield datasets.Phishing().take(n_samples)

        # Same stream again, with each target wrapped in np.bool_
        yield (
            (features, np.bool_(target))
            for features, target in datasets.Phishing().take(n_samples)
        )

        # Multi-class classification
        if model._multiclass:
            if base.tags.POSITIVE_INPUT not in model._tags:
                yield datasets.ImageSegments().take(n_samples)
Ejemplo n.º 2
0
def test_clone_idempotent():
    """Check that a clone, fed the same stream, reproduces the original's predictions.

    A scaler + logistic regression pipeline is run over the Phishing dataset, recording each
    prediction made just before the corresponding learning step. A clone of that pipeline is then
    run over the same dataset and must output the recorded predictions, step by step.
    """

    scaler = preprocessing.StandardScaler()
    classifier = linear_model.LogisticRegression(optimizer=optim.Adam(), l2=0.1)
    model = scaler | classifier

    # Reference run: predict first, then learn, keeping every prediction
    recorded = []
    for features, label in datasets.Phishing():
        recorded.append(model.predict_proba_one(features))
        model.learn_one(features, label)

    # Replay run: the clone goes through the exact same predict-then-learn sequence
    replica = model.clone()
    for step, (features, label) in enumerate(datasets.Phishing()):
        actual = replica.predict_proba_one(features)
        assert actual == recorded[step]
        replica.learn_one(features, label)
Ejemplo n.º 3
0
def yield_datasets(model):
    """Yield the datasets that can be used to exercise ``model``.

    The kind of estimator, as reported by ``utils.inspect``, decides what is produced:

    - multi-output regressors: scikit-learn's Linnerud data and a one-hot encoded SolarFlare;
    - regressors: TrumpApproval;
    - multi-output classifiers: Music;
    - classifiers: Phishing (twice, the second time with ``np.bool_`` targets), plus
      ImageSegments for multi-class models that are not tagged as needing positive inputs.

    Every River dataset is truncated to its first 200 samples.
    """

    from river import base, compose, datasets, preprocessing, stream, utils
    from sklearn import datasets as sk_datasets

    n_samples = 200

    # --- Regression side ---
    if utils.inspect.ismoregressor(model):

        # First multi-output dataset: taken straight from scikit-learn
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # Second multi-output dataset: SolarFlare with its string features one-hot encoded
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                categorical = compose.SelectType(str) | preprocessing.OneHotEncoder()
                encoder = categorical + compose.SelectType(int)
                for features, targets in datasets.SolarFlare().take(n_samples):
                    yield encoder.transform_one(features), targets

        yield SolarFlare()

    elif utils.inspect.isregressor(model):
        # Single-output regression
        yield datasets.TrumpApproval().take(n_samples)

    # --- Classification side (checked independently of the regression side) ---
    if utils.inspect.ismoclassifier(model):
        # Multi-output classification
        yield datasets.Music().take(n_samples)

    elif utils.inspect.isclassifier(model):

        yield datasets.Phishing().take(n_samples)

        # Same stream again, with each target wrapped in np.bool_
        yield (
            (features, np.bool_(target))
            for features, target in datasets.Phishing().take(n_samples)
        )

        # Multi-class classification
        if model._multiclass:
            if base.tags.POSITIVE_INPUT not in model._tags:
                yield datasets.ImageSegments().take(n_samples)
Ejemplo n.º 4
0
def test_sklearn_coherence(river_params, sklearn_params):
    """Check that the River and scikit-learn one-class SVMs end up with the same weights.

    Both models are built from their respective keyword arguments, fed the same first 100
    Phishing samples one at a time, and then compared weight by weight.
    """

    river_model = anomaly.OneClassSVM(**river_params)
    sklearn_model = sklm.SGDOneClassSVM(**sklearn_params)

    for features, _ in datasets.Phishing().take(100):
        river_model.learn_one(features)
        # scikit-learn expects a 2D input, hence the single-row batch
        row = list(features.values())
        sklearn_model.partial_fit([row])

    for position, weight in enumerate(river_model.weights.values()):
        assert math.isclose(weight, sklearn_model.coef_[position])
Ejemplo n.º 5
0
def load_binary_clf_tracks() -> typing.List[Track]:
    """Build the binary classification tracks.

    Returns a list with one track for the Phishing dataset and one for the Bananas dataset, in
    that order. Each track gets its own freshly built ``Accuracy + F1`` metric.
    """

    # (track name, dataset class) pairs; the dataset is instantiated when its track is built
    track_specs = (
        ("Phishing", datasets.Phishing),
        ("Bananas", datasets.Bananas),
    )

    return [
        Track(
            name=track_name,
            dataset=make_dataset(),
            metric=metrics.Accuracy() + metrics.F1(),
        )
        for track_name, make_dataset in track_specs
    ]
Ejemplo n.º 6
0
from river import compose
from river import preprocessing
from river import linear_model
from river import metrics
from river import datasets
from river import optim

# Pipeline: standardize the features, then fit a logistic regression with an SGD optimizer.
optimizer = optim.SGD(0.1)
model = compose.Pipeline(preprocessing.StandardScaler(),
                         linear_model.LogisticRegression(optimizer))

# ROCAUC is fed the predicted probabilities, Precision is fed a predicted label.
metric = metrics.ROCAUC()
precision = metrics.Precision()

for x, y in datasets.Phishing():
    # Predict before learning, so each sample is scored by a model that has not seen it yet.
    y_pred = model.predict_proba_one(x)
    model.learn_one(x, y)
    metric.update(y, y_pred)

    # Precision compares labels: passing it the probability dict would never match the
    # positive class. Derive the label from the probabilities already computed instead of
    # calling the pipeline a second time.
    y_label = max(y_pred, key=y_pred.get) if y_pred else None
    precision.update(y, y_label)

print(metric)
print(precision)
Ejemplo n.º 7
0
from river import datasets

# Stream of (features, label) pairs that the evaluation loop below iterates over.
dataset = datasets.Phishing()

from river import compose
from river import linear_model
from river import metrics
from river import preprocessing

# Pipeline: standardize the features, then fit a logistic regression.
model = compose.Pipeline(preprocessing.StandardScaler(),
                         linear_model.LogisticRegression())
metric = metrics.Accuracy()

for x, y in dataset:
    # Predict before learning, so each sample is scored by a model that has not seen it yet.
    y_pred = model.predict_one(x)
    # Both calls update their object in place. Their return value is deliberately not
    # assigned back: rebinding `metric` / `model` to it breaks the loop whenever the
    # method returns None instead of the object itself.
    metric.update(y, y_pred)
    model.learn_one(x, y)

print(metric)
Ejemplo n.º 8
0
import pytest

from river import datasets, synth, tree


def get_regression_data():
    """Return an iterator over the first 200 samples of a Friedman stream built with seed 42."""
    generator = synth.Friedman(seed=42)
    return iter(generator.take(200))


@pytest.mark.parametrize(
    "dataset, splitter",
    [
        # One case per splitter, each paired with its own Phishing dataset instance
        (datasets.Phishing(), make_splitter())
        for make_splitter in (
            tree.splitter.ExhaustiveSplitter,
            tree.splitter.HistogramSplitter,
            tree.splitter.GaussianSplitter,
        )
    ],
)
def test_class_splitter(dataset, splitter):
    """Check that a Hoeffding tree classifier using ``splitter`` has a positive height.

    The classifier is trained on every sample of ``dataset`` before its height is checked.
    """
    tree_params = {
        "grace_period": 10,
        "leaf_prediction": "mc",
        "split_confidence": 0.1,
    }
    classifier = tree.HoeffdingTreeClassifier(splitter=splitter, **tree_params)

    for features, label in dataset:
        classifier.learn_one(features, label)

    assert classifier.height > 0


@pytest.mark.parametrize(
    "dataset, splitter",