コード例 #1
0
ファイル: __init__.py プロジェクト: online-ml/river
def _yield_datasets(model: Estimator):
    """Generates datasets for a given model."""

    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    # Recommendation models can be regressors or classifiers, but they have requirements as to the
    # structure of the data
    if isinstance(utils.inspect.extract_relevant(model), Recommender):
        if utils.inspect.isregressor(model):
            yield _DummyDataset(
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
                ({"user": "******", "item": "Harry Potter"}, 5),
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
            )
        return

    # Multi-output regression
    elif utils.inspect.ismoregressor(model):

        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare"""

            def __iter__(self):
                oh = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for x, y in datasets.SolarFlare().take(200):
                    yield oh.transform_one(x), y

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):

        yield datasets.Phishing().take(200)
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
コード例 #2
0
ファイル: test_scale.py プロジェクト: maseDevelop/river
def test_standard_scaler_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    ss = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        # Pick half of the columns at random
        cols = np.random.choice(X.columns, len(X.columns) // 2, replace=False)
        ss.learn_many(xb[cols])
コード例 #3
0
def test_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""

    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    lin_reg = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        # Pick half of the columns at random
        cols = np.random.choice(X.columns, len(X.columns) // 2, replace=False)
        lin_reg.learn_many(xb[cols], yb)
コード例 #4
0
def test_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""

    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    one = lm.LinearRegression()
    for x, y in stream.iter_pandas(X, Y):
        one.learn_one(x, y)

    many = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, len(X)), np.array_split(Y, len(Y))):
        many.learn_many(xb, yb)

    for i in X:
        assert math.isclose(one.weights[i], many.weights[i])
コード例 #5
0
def test_lin_reg_sklearn_coherence(river_params, sklearn_params):
    """Checks that the sklearn and river implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    rv = lm.LinearRegression(**river_params)
    sk = sklm.SGDRegressor(**sklearn_params)

    for x, y in datasets.TrumpApproval().take(100):
        x = ss.learn_one(x).transform_one(x)
        rv.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y])

    for i, w in enumerate(rv.weights.values()):
        assert math.isclose(w, sk.coef_[i])

    assert math.isclose(rv.intercept, sk.intercept_[0])
コード例 #6
0
def yield_datasets(model):

    from river import base
    from river import compose
    from river import datasets
    from river import preprocessing
    from river import stream
    from river import utils
    from sklearn import datasets as sk_datasets

    # Multi-output regression
    if utils.inspect.ismoregressor(model):

        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare"""

            def __iter__(self):
                oh = (compose.SelectType(str) | preprocessing.OneHotEncoder()) + compose.SelectType(
                    int
                )
                for x, y in datasets.SolarFlare().take(200):
                    yield oh.transform_one(x), y

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):

        yield datasets.Phishing().take(200)
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
コード例 #7
0
def test_shuffle_columns():
    """Checks that learn_many works identically whether columns are shuffled or not."""

    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    normal = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        normal.learn_many(xb, yb)

    shuffled = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        cols = np.random.permutation(X.columns)
        shuffled.learn_many(xb[cols], yb)

    for i in X:
        assert math.isclose(normal.weights[i], shuffled.weights[i])
コード例 #8
0
ファイル: test_scale.py プロジェクト: maseDevelop/river
def test_standard_scaler_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    one = preprocessing.StandardScaler()
    for x, _ in stream.iter_pandas(X):
        one.learn_one(x)

    many = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        many.learn_many(xb)

    for i in X:
        assert math.isclose(one.counts[i], many.counts[i])
        assert math.isclose(one.means[i], many.means[i])
        assert math.isclose(one.vars[i], many.vars[i])
コード例 #9
0
ファイル: test_scale.py プロジェクト: maseDevelop/river
def test_standard_scaler_shuffle_columns():
    """Checks that learn_many works identically whether columns are shuffled or not."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    normal = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        normal.learn_many(xb)

    shuffled = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        cols = np.random.permutation(X.columns)
        shuffled.learn_many(xb[cols])

    for i in X:
        assert math.isclose(shuffled.counts[i], shuffled.counts[i])
        assert math.isclose(shuffled.means[i], shuffled.means[i])
        assert math.isclose(shuffled.vars[i], shuffled.vars[i])
コード例 #10
0
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""
    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""
        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    ss = preprocessing.StandardScaler()
    cr = lm.LinearRegression(optimizer=optim.SGD(.01), loss=SquaredLoss())
    sk = sklm.SGDRegressor(learning_rate='constant', eta0=.01, alpha=.0)

    for x, y in datasets.TrumpApproval():
        x = ss.learn_one(x).transform_one(x)
        cr.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
コード例 #11
0
    for _ in range(n):
        p = {j: random.gauss(0, 1) for j in keys}
        norm = utils.math.norm(p, order=2)
        for j in p:
            p[j] /= norm
        yield p


@pytest.mark.parametrize(
    'lm, dataset',
    [
        pytest.param(lm(
            optimizer=copy.deepcopy(optimizer), initializer=initializer, l2=0),
                     dataset,
                     id=f'{lm.__name__} - {optimizer} - {initializer}')
        for lm, dataset in [(lm.LinearRegression, datasets.TrumpApproval()
                             ), (lm.LogisticRegression, datasets.Bananas())]
        for optimizer, initializer in itertools.product(
            [
                optim.AdaBound(),
                optim.AdaDelta(),
                optim.AdaGrad(),
                optim.AdaMax(),
                optim.Adam(),
                optim.AMSGrad(),
                # TODO: check momentum optimizers
                # optim.Momentum(),
                # optim.NesterovMomentum(),
                optim.RMSProp(),
                optim.SGD()
            ],
コード例 #12
0
        for j in p:
            p[j] /= norm
        yield p


@pytest.mark.parametrize(
    "lm, dataset",
    [
        pytest.param(
            lm(optimizer=copy.deepcopy(optimizer),
               initializer=initializer,
               l2=0),
            dataset,
            id=f"{lm.__name__} - {optimizer} - {initializer}",
        ) for lm, dataset in [
            (lm.LinearRegression, datasets.TrumpApproval()),
            (lm.LogisticRegression, datasets.Bananas()),
        ] for optimizer, initializer in itertools.product(
            [
                optim.AdaBound(),
                optim.AdaDelta(),
                optim.AdaGrad(),
                optim.AdaMax(),
                optim.Adam(),
                optim.AMSGrad(),
                # TODO: check momentum optimizers
                # optim.Momentum(),
                # optim.NesterovMomentum(),
                optim.RMSProp(),
                optim.SGD(),
            ],
コード例 #13
0
from river import datasets
from river import metrics
from river.evaluate import Track


def trump_mse_track(n_samples=10_000, seed=42):
    dataset = datasets.TrumpApproval().take(n_samples)
    track = Track("TRUMP Approval + R2", dataset, metrics.R2(), n_samples)
    return track


def chickweights_mse_track(n_samples=10_000, seed=42):
    dataset = datasets.ChickWeights().take(n_samples)
    track = Track("ChickWeights + R2", dataset, metrics.R2(), n_samples)
    return track