Example #1
 def _default_params(cls):
     return {
         'regressors': [
             pp.StandardScaler() | lm.LinearRegression(intercept_lr=.1),
             pp.StandardScaler() | lm.PARegressor(),
         ]
     }
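In these snippets, pp and lm are shorthand aliases for river's preprocessing and linear_model modules, and the | operator chains estimators into a compose.Pipeline. A minimal sketch of that equivalence, assuming river's public API:

from river import compose, linear_model, preprocessing

# Piping two estimators builds the same pipeline as the explicit constructor
piped = preprocessing.StandardScaler() | linear_model.LinearRegression()
explicit = compose.Pipeline(preprocessing.StandardScaler(), linear_model.LinearRegression())
assert str(piped) == str(explicit)  # both render as "StandardScaler | LinearRegression"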
Example #2
File: base.py Project: Leo-VK/creme
 def _unit_test_params(cls):
     yield {
         "models": [
             compose.Pipeline(
                 preprocessing.StandardScaler(),
                 linear_model.LinearRegression(optimizer=optim.SGD(
                     lr=1e-2)),
             ),
             compose.Pipeline(
                 preprocessing.StandardScaler(),
                 linear_model.LinearRegression(optimizer=optim.SGD(
                     lr=1e-1)),
             ),
         ],
         "metric":
         metrics.MAE(),
     }
     yield {
         "models": [
             compose.Pipeline(
                 preprocessing.StandardScaler(),
                 linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)),
             ) for lr in [1e-4, 1e-3, 1e-2, 1e-1]
         ],
         "metric":
         metrics.MAE(),
     }
Example #3
 def _unit_test_params(cls):
     return {
         "models": [
             pp.StandardScaler() | lm.LinearRegression(intercept_lr=0.1),
             pp.StandardScaler() | lm.PARegressor(),
         ]
     }
Example #4
 def _unit_test_params(cls):
     return {
         "models": [
             compose.Pipeline(
                 preprocessing.StandardScaler(),
                 linear_model.LinearRegression(optimizer=optim.SGD(
                     lr=0.01)),
             ),
             compose.Pipeline(
                 preprocessing.StandardScaler(),
                 linear_model.LinearRegression(optimizer=optim.SGD(lr=0.1)),
             ),
         ],
         "metric":
         metrics.MAE(),
     }
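Examples #2 through #4 implement _unit_test_params, the hook river's generic estimator test suite uses to build instances with sensible defaults: returning a dict supplies a single configuration, while a generator (as in Example #2) yields several configurations to be checked in turn. Example #1's _default_params appears to be the older creme-era name for the same hook.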
Example #5
    def __init__(
        self,
        p: int,
        d: int,
        q: int,
        m: int = 1,
        sp: int = 0,
        sd: int = 0,
        sq: int = 0,
        regressor: base.Regressor = None,
    ):

        self.p = p
        self.d = d
        self.q = q
        self.m = m
        self.sp = sp
        self.sd = sd
        self.sq = sq
        self.regressor = (
            regressor
            if regressor is not None
            else preprocessing.StandardScaler() | linear_model.LinearRegression()
        )
        self.differencer = Differencer(d=d, m=1) + Differencer(d=sd, m=1)
        self.y_trues = collections.deque(maxlen=max(p, m * sp))
        self.errors = collections.deque(maxlen=max(p, m * sq))
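Example #5 is SNARIMAX's constructor, which falls back to a scaled linear regression when no regressor is given. A minimal usage sketch, assuming river's time_series.SNARIMAX and a made-up toy series:

from river import time_series

model = time_series.SNARIMAX(p=2, d=1, q=2)  # regressor defaults to StandardScaler | LinearRegression
for y in [5.0, 5.2, 5.1, 5.4, 5.3]:  # toy values for illustration
    model.learn_one(y)
print(model.forecast(horizon=3))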
Example #6
def test_memory_usage():

    model = preprocessing.StandardScaler() | linear_model.LogisticRegression()

    # We can't test the exact value because it depends on the platform and the Python version
    # TODO: we could create a table of expected values for each platform and Python version
    assert isinstance(model._memory_usage, str)
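As the test's comment explains, _memory_usage is a human-readable string whose exact value depends on the platform and Python version. A quick sketch of what inspecting it looks like:

from river import linear_model, preprocessing

model = preprocessing.StandardScaler() | linear_model.LogisticRegression()
print(model._memory_usage)  # e.g. something like "4.2 KB"; the figure varies by platform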
Example #7
class RiverML:
    # fraud detection model
    model = compose.Pipeline(preprocessing.StandardScaler(),
                             linear_model.LogisticRegression())

    # ROCAUC metric to score the model as it trains
    metric = metrics.ROCAUC()
Example #8
def test_pipeline_add_at_start():
    def a(x):
        pass

    pipeline = preprocessing.StandardScaler() | linear_model.LinearRegression()
    pipeline = a | pipeline
    assert str(pipeline) == "a | StandardScaler | LinearRegression"
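Example #8 works because piping a plain function into a pipeline wraps it in compose.FuncTransformer, whose step name is the function's name. A short sketch of that wrapper, using a hypothetical function:

from river import compose

def double(x):
    return {k: 2 * v for k, v in x.items()}

step = compose.FuncTransformer(double)  # what piping the bare function creates implicitly
print(step.transform_one({"x": 3}))  # {'x': 6}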
Example #9
def get_model():
    extract_features = compose.TransformerUnion(
        get_ordinal_date,
        get_day_distances,
    )

    model = extract_features | time_series.SNARIMAX(
        p=0,
        d=0,
        q=0,
        m=7,
        sp=3,
        sq=0,
        regressor=(
            preprocessing.StandardScaler()
            | linear_model.LinearRegression(
                intercept_init=0,
                intercept_lr=0.3,
                optimizer=optim.SGD(0.01),
            )
        ),
    )
    return model
Example #10
 def __init__(self):
     optimizer = optim.SGD(0.1)
     self.model = compose.Pipeline(
         preprocessing.StandardScaler(),
         linear_model.LogisticRegression(optimizer))
     self.metric = metrics.Accuracy()
     self.count = 0
Example #11
def test_standard_scaler_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    one = preprocessing.StandardScaler()
    for x, _ in stream.iter_pandas(X):
        one.learn_one(x)

    many = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        many.learn_many(xb)

    for i in X:
        assert math.isclose(one.counts[i], many.counts[i])
        assert math.isclose(one.means[i], many.means[i])
        assert math.isclose(one.vars[i], many.vars[i])
Example #12
def test_standard_scaler_shuffle_columns():
    """Checks that learn_many works identically whether columns are shuffled or not."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    normal = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        normal.learn_many(xb)

    shuffled = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        cols = np.random.permutation(X.columns)
        shuffled.learn_many(xb[cols])

    for i in X:
        assert math.isclose(normal.counts[i], shuffled.counts[i])
        assert math.isclose(normal.means[i], shuffled.means[i])
        assert math.isclose(normal.vars[i], shuffled.vars[i])
Example #13
def test_online_batch_consistent():

    # Batch

    batch = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
        linear_model.LogisticRegression())

    dataset = datasets.ImageSegments()

    batch_metric = metrics.MacroF1()

    for i, x in enumerate(pd.read_csv(dataset.path, chunksize=1)):
        y = x.pop("category")
        y_pred = batch.predict_many(x)
        batch.learn_many(x, y)

        for yt, yp in zip(y, y_pred):
            if yp is not None:
                batch_metric.update(yt, yp)

        if i == 30:
            break

    # Online

    online = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
        linear_model.LogisticRegression())

    online_metric = metrics.MacroF1()

    X = pd.read_csv(dataset.path)
    Y = X.pop("category")

    for i, (x, y) in enumerate(stream.iter_pandas(X, Y)):
        y_pred = online.predict_one(x)
        online.learn_one(x, y)

        if y_pred is not None:
            online_metric.update(y, y_pred)

        if i == 30:
            break

    assert online_metric.get() == batch_metric.get()
Example #14
def test_set_params_pipeline():

    obj = preprocessing.StandardScaler() | linear_model.LinearRegression(l2=42)
    obj.learn_one({'x': 3}, 6)

    new = obj._set_params({'LinearRegression': {'l2': 21}})
    assert new['LinearRegression'].l2 == 21
    assert obj['LinearRegression'].l2 == 42
    assert new['LinearRegression'].weights == {}
    assert new['LinearRegression'].weights != obj['LinearRegression'].weights
Example #15
def test_standard_scaler_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    ss = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        # Pick half of the columns at random
        cols = np.random.choice(X.columns, len(X.columns) // 2, replace=False)
        ss.learn_many(xb[cols])
Example #16
def test_set_params_pipeline():

    obj = preprocessing.StandardScaler() | linear_model.LinearRegression(l2=42)
    obj.learn_one({"x": 3}, 6)

    params = {"LinearRegression": {"l2": 21}}
    new = obj._set_params(params)
    assert new["LinearRegression"].l2 == 21
    assert obj["LinearRegression"].l2 == 42
    assert new["LinearRegression"].weights == {}
    assert new["LinearRegression"].weights != obj["LinearRegression"].weights
Example #17
def test_finite_differences(lm, dataset):
    """Checks the gradient of a linear model via finite differences.

    References
    ----------
    [^1]: [How to test gradient implementations](https://timvieira.github.io/blog/post/2017/04/21/how-to-test-gradient-implementations/)
    [^2]: [Stochastic Gradient Descent Tricks](https://cilvr.cs.nyu.edu/diglib/lsml/bottou-sgd-tricks-2012.pdf)

    """

    scaler = preprocessing.StandardScaler()
    eps = 1e-6

    for x, y in dataset:

        x = scaler.learn_one(x).transform_one(x)

        # Store the current gradient and weights
        gradient, _ = lm._eval_gradient_one(x, y, 1)
        weights = copy.deepcopy(lm._weights)

        # d is a set of weight perturbations
        for d in iter_perturbations(weights.keys()):

            # Perturb the weights and obtain the loss with the new weights
            lm._weights = utils.VectorDict(
                {i: weights[i] + eps * di
                 for i, di in d.items()})
            forward = lm.loss(y_true=y, y_pred=lm._raw_dot_one(x))
            lm._weights = utils.VectorDict(
                {i: weights[i] - eps * di
                 for i, di in d.items()})
            backward = lm.loss(y_true=y, y_pred=lm._raw_dot_one(x))

            # We expect g and h to be equal
            g = utils.math.dot(d, gradient)
            h = (forward - backward) / (2 * eps)

            # Compare signs
            # TODO: reactivate this check
            #assert np.sign(g) == np.sign(h)

            # Check absolute difference
            # TODO: decrease the tolerance
            assert abs(g - h) < 1e-5

        # Reset the weights to their original values in order not to influence
        # the training loop, even though it doesn't really matter.
        lm._weights = weights
        lm.learn_one(x, y)
Example #18
def test_clone_idempotent():

    model = preprocessing.StandardScaler() | linear_model.LogisticRegression(
        optimizer=optim.Adam(), l2=0.1)

    trace = []
    for x, y in datasets.Phishing():
        trace.append(model.predict_proba_one(x))
        model.learn_one(x, y)

    clone = model.clone()
    for i, (x, y) in enumerate(datasets.Phishing()):
        assert clone.predict_proba_one(x) == trace[i]
        clone.learn_one(x, y)
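Example #18 holds because clone() returns a fresh, unfitted copy with the same hyperparameters, so replaying the stream makes the clone retrace the original predictions. A minimal sketch of that contract:

from river import linear_model

model = linear_model.LogisticRegression(l2=0.1)
fresh = model.clone()
assert fresh.l2 == 0.1      # hyperparameters survive cloning
assert fresh.weights == {}  # learned weights do not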
Example #19
def test_perceptron_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    cr = lm.Perceptron()
    sk = sklm.Perceptron()

    for x, y in datasets.Bananas():
        x = ss.learn_one(x).transform_one(x)
        cr.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y], classes=[False, True])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[0][i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
Example #20
def test_log_reg_sklearn_coherence(river_params, sklearn_params):
    """Checks that the sklearn and river implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    rv = lm.LogisticRegression(**river_params)
    sk = sklm.SGDClassifier(**sklearn_params)

    for x, y in datasets.Bananas().take(100):
        x = ss.learn_one(x).transform_one(x)
        rv.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y], classes=[False, True])

    for i, w in enumerate(rv.weights.values()):
        assert math.isclose(w, sk.coef_[0][i])

    assert math.isclose(rv.intercept, sk.intercept_[0])
Example #21
def test_no_learn_unsupervised_score_one():
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("anomaly", anomaly.HalfSpaceTrees()),
    )

    dataset = [(dict(a=x, b=x), x) for x in range(100)]

    for x, y in dataset:
        counts_pre = dict(pipeline.steps["scale"].counts)
        pipeline.score_one(x, learn_unsupervised=True)
        counts_post = dict(pipeline.steps["scale"].counts)
        pipeline.score_one(x, learn_unsupervised=False)
        counts_no_learn = dict(pipeline.steps["scale"].counts)

        assert counts_pre != counts_post
        assert counts_post == counts_no_learn
Example #22
def test_no_learn_unsupervised_one(func):
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    dataset = [(dict(a=x, b=x), x) for x in range(100)]

    for x, y in dataset:
        counts_pre = dict(pipeline.steps["scale"].counts)
        func(pipeline, x, learn_unsupervised=True)
        counts_post = dict(pipeline.steps["scale"].counts)
        func(pipeline, x, learn_unsupervised=False)
        counts_no_learn = dict(pipeline.steps["scale"].counts)

        assert counts_pre != counts_post
        assert counts_post == counts_no_learn
Example #23
def test_learn_one_warm_up_mode():
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    dataset = [(dict(a=x, b=x), bool(x % 2)) for x in range(100)]

    for x, y in dataset:
        counts_pre = dict(pipeline["scale"].counts)
        with utils.warm_up_mode():
            pipeline.learn_one(x, y)
        counts_post = dict(pipeline["scale"].counts)
        pipeline.learn_one(x, y)
        counts_no_learn = dict(pipeline["scale"].counts)

        assert counts_pre != counts_post
        assert counts_post == counts_no_learn
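Examples #21 through #23 (and #27 below) all pin down the same contract: the unsupervised steps of a pipeline, such as StandardScaler, only update during a scoring or learning call when explicitly allowed, either by passing learn_unsupervised=True or by entering utils.warm_up_mode(); otherwise the scaler's counters stay frozen, which is what the count comparisons assert.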
Example #24
 def __init__(self, step, name):
     self.name = name
     self.optimizer = SynchronousSGD(0.01, name, None)
     self.model = compose.Pipeline(
         preprocessing.StandardScaler(),
         linear_model.LogisticRegression(self.optimizer))
     self.metrics = [
         metrics.Accuracy(),
         metrics.MAE(),
         metrics.RMSE(),
         metrics.Precision(),
         metrics.Recall()
     ]
     self.count = 0
     if step is None:
         self.step = 50
     else:
         self.step = int(step)
Example #25
def test_log_reg_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    cr = lm.LogisticRegression(optimizer=optim.SGD(.01))
    sk = sklm.SGDClassifier(learning_rate='constant',
                            eta0=.01,
                            alpha=.0,
                            loss='log')

    for x, y in datasets.Bananas():
        x = ss.learn_one(x).transform_one(x)
        cr.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y], classes=[False, True])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[0][i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
Example #26
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""
    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""
        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    ss = preprocessing.StandardScaler()
    cr = lm.LinearRegression(optimizer=optim.SGD(.01), loss=SquaredLoss())
    sk = sklm.SGDRegressor(learning_rate='constant', eta0=.01, alpha=.0)

    for x, y in datasets.TrumpApproval():
        x = ss.learn_one(x).transform_one(x)
        cr.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
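The custom SquaredLoss above exists because river's built-in squared loss uses the gradient 2 * (y_pred - y_true) while sklearn's SGDRegressor drops the leading 2; without aligning the gradients (or halving the learning rate on one side), the two sets of weights would diverge.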
Example #27
def test_learn_many_warm_up_mode():
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    dataset = [(dict(a=x, b=x), x) for x in range(100)]

    for i in range(0, len(dataset), 5):
        X = pd.DataFrame([x for x, _ in dataset][i:i + 5])
        y = pd.Series([bool(y % 2) for _, y in dataset][i:i + 5])

        counts_pre = dict(pipeline["scale"].counts)
        with utils.warm_up_mode():
            pipeline.learn_many(X, y)
        counts_post = dict(pipeline["scale"].counts)
        pipeline.learn_many(X, y)
        counts_no_learn = dict(pipeline["scale"].counts)

        assert counts_pre != counts_post
        assert counts_post == counts_no_learn
Example #28
    def __init__(
        self,
        models: typing.List[base.Estimator],
        metric: metrics.Metric,
        explore_each_arm: int,
        start_after: int,
        seed: int = None,
    ):

        if len(models) <= 1:
            raise ValueError(
                f"You supplied {len(models)} models. At least 2 models should be supplied."
            )

        # Check that the model and the metric are in accordance
        for model in models:
            if not metric.works_with(model):
                raise ValueError(
                    f"{metric.__class__.__name__} metric can't be used to evaluate a "
                    + f"{model.__class__.__name__}")
        super().__init__(models)
        self.metric = copy.deepcopy(metric)
        self._y_scaler = copy.deepcopy(preprocessing.StandardScaler())

        # Initializing bandits internals
        self._n_arms = len(models)
        self._n_iter = 0  # number of times learn_one is called
        self._N = [0] * self._n_arms
        self.explore_each_arm = explore_each_arm
        self.average_reward = [0.0] * self._n_arms

        # Warm up
        self.start_after = start_after
        self.warm_up = True

        # Randomization
        self.seed = seed
        self._rng = random.Random(seed)
Example #29
from river import compose
from river import preprocessing
from river import linear_model
from river import metrics
from river import datasets
from river import optim

optimizer = optim.SGD(0.1)
model = compose.Pipeline(preprocessing.StandardScaler(),
                         linear_model.LogisticRegression(optimizer))

metric = metrics.ROCAUC()
precision = metrics.Precision()

for x, y in datasets.Phishing():
    y_pred = model.predict_proba_one(x)
    model.learn_one(x, y)
    metric.update(y, y_pred)
    # Precision expects a predicted label, not probabilities
    precision.update(y, max(y_pred, key=y_pred.get))

print(metric)
print(precision)
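The predict/learn/update loop above is the pattern river packages as progressive validation. A sketch of the equivalent one-liner, run on a fresh copy of the model so no prior training leaks in:

from river import evaluate

print(evaluate.progressive_val_score(datasets.Phishing(), model.clone(), metrics.ROCAUC()))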
Example #30
from river import preprocessing
from river import evaluate
from river import metrics
from river import tree
# Assumed import path for the synthetic generators used below
from river.datasets import synth

X_y = synth.PredictionInfluenceStream(stream=[
    synth.RandomRBF(seed_model=42,
                    seed_sample=42,
                    n_classes=2,
                    n_features=4,
                    n_centroids=20),
    synth.RandomRBF(seed_model=41,
                    seed_sample=49,
                    n_classes=2,
                    n_features=4,
                    n_centroids=20)
])

model = preprocessing.StandardScaler()
model |= tree.HoeffdingAdaptiveTreeClassifier(grace_period=100,
                                              split_confidence=1e-5,
                                              leaf_prediction='nb',
                                              nb_threshold=10,
                                              seed=0)

metric = metrics.Accuracy()

evaluate.evaluate_influential(X_y, model, metric, print_every=100)