def _default_params(cls):
    return {
        'regressors': [
            pp.StandardScaler() | lm.LinearRegression(intercept_lr=0.1),
            pp.StandardScaler() | lm.PARegressor(),
        ]
    }
def _unit_test_params(cls):
    yield {
        "models": [
            compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LinearRegression(optimizer=optim.SGD(lr=1e-2)),
            ),
            compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LinearRegression(optimizer=optim.SGD(lr=1e-1)),
            ),
        ],
        "metric": metrics.MAE(),
    }
    yield {
        "models": [
            compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)),
            )
            for lr in [1e-4, 1e-3, 1e-2, 1e-1]
        ],
        "metric": metrics.MAE(),
    }
def _unit_test_params(cls): return { "models": [ pp.StandardScaler() | lm.LinearRegression(intercept_lr=0.1), pp.StandardScaler() | lm.PARegressor(), ] }
def _unit_test_params(cls): return { "models": [ compose.Pipeline( preprocessing.StandardScaler(), linear_model.LinearRegression(optimizer=optim.SGD( lr=0.01)), ), compose.Pipeline( preprocessing.StandardScaler(), linear_model.LinearRegression(optimizer=optim.SGD(lr=0.1)), ), ], "metric": metrics.MAE(), }
def __init__(
    self,
    p: int,
    d: int,
    q: int,
    m: int = 1,
    sp: int = 0,
    sd: int = 0,
    sq: int = 0,
    regressor: base.Regressor = None,
):
    self.p = p
    self.d = d
    self.q = q
    self.m = m
    self.sp = sp
    self.sd = sd
    self.sq = sq
    self.regressor = (
        regressor
        if regressor is not None
        else preprocessing.StandardScaler() | linear_model.LinearRegression()
    )
    # Ordinary differencing of order d, plus seasonal differencing of order sd
    # with period m
    self.differencer = Differencer(d=d, m=1) + Differencer(d=sd, m=m)
    # Buffers sized for the AR terms (p, sp) and the MA terms (q, sq)
    self.y_trues = collections.deque(maxlen=max(p, m * sp))
    self.errors = collections.deque(maxlen=max(q, m * sq))
def test_memory_usage():
    model = preprocessing.StandardScaler() | linear_model.LogisticRegression()

    # We can't test the exact value because it depends on the platform and the Python version
    # TODO: we could create a table of expected values for each platform and Python version
    assert isinstance(model._memory_usage, str)
class RiverML:
    # Fraud detection model
    model = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LogisticRegression(),
    )

    # ROCAUC metric to score the model as it trains
    metric = metrics.ROCAUC()
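# Hedged usage sketch, not part of the original snippet: one way the RiverML
# attributes above could be driven with progressive (test-then-train)
# validation. The Phishing dataset is an assumption; any stream of
# (feature dict, bool label) pairs would work.
from river import datasets

def run_river_ml():
    app = RiverML()
    for x, y in datasets.Phishing():
        y_pred = app.model.predict_proba_one(x)  # score the sample first
        app.metric.update(y, y_pred)             # ROCAUC accepts probability dicts
        app.model.learn_one(x, y)                # then learn from the labelled sample
    return app.metric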
def test_pipeline_add_at_start():
    def a(x):
        pass

    pipeline = preprocessing.StandardScaler() | linear_model.LinearRegression()
    pipeline = a | pipeline
    assert str(pipeline) == "a | StandardScaler | LinearRegression"
def get_model():
    extract_features = compose.TransformerUnion(get_ordinal_date, get_day_distances)

    model = extract_features | time_series.SNARIMAX(
        p=0,
        d=0,
        q=0,
        m=7,
        sp=3,
        sq=0,
        regressor=(
            preprocessing.StandardScaler()
            | linear_model.LinearRegression(
                intercept_init=0,
                intercept_lr=0.3,
                optimizer=optim.SGD(0.01),
            )
        ),
    )

    return model
def __init__(self):
    optimizer = optim.SGD(0.1)
    self.model = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LogisticRegression(optimizer),
    )
    self.metric = metrics.Accuracy()
    self.count = 0
def test_standard_scaler_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    one = preprocessing.StandardScaler()
    for x, _ in stream.iter_pandas(X):
        one.learn_one(x)

    many = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        many.learn_many(xb)

    for i in X:
        assert math.isclose(one.counts[i], many.counts[i])
        assert math.isclose(one.means[i], many.means[i])
        assert math.isclose(one.vars[i], many.vars[i])
def test_standard_scaler_shuffle_columns():
    """Checks that learn_many works identically whether columns are shuffled or not."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    normal = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        normal.learn_many(xb)

    shuffled = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        cols = np.random.permutation(X.columns)
        shuffled.learn_many(xb[cols])

    for i in X:
        assert math.isclose(shuffled.counts[i], normal.counts[i])
        assert math.isclose(shuffled.means[i], normal.means[i])
        assert math.isclose(shuffled.vars[i], normal.vars[i])
def test_online_batch_consistent():
    # Batch
    batch = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
        linear_model.LogisticRegression()
    )

    dataset = datasets.ImageSegments()

    batch_metric = metrics.MacroF1()

    for i, x in enumerate(pd.read_csv(dataset.path, chunksize=1)):
        y = x.pop("category")
        y_pred = batch.predict_many(x)
        batch.learn_many(x, y)

        for yt, yp in zip(y, y_pred):
            if yp is not None:
                batch_metric.update(yt, yp)

        if i == 30:
            break

    # Online
    online = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
        linear_model.LogisticRegression()
    )

    online_metric = metrics.MacroF1()

    X = pd.read_csv(dataset.path)
    Y = X.pop("category")

    for i, (x, y) in enumerate(stream.iter_pandas(X, Y)):
        y_pred = online.predict_one(x)
        online.learn_one(x, y)

        if y_pred is not None:
            online_metric.update(y, y_pred)

        if i == 30:
            break

    assert online_metric.get() == batch_metric.get()
def test_set_params_pipeline():
    obj = preprocessing.StandardScaler() | linear_model.LinearRegression(l2=42)
    obj.learn_one({'x': 3}, 6)

    new = obj._set_params({'LinearRegression': {'l2': 21}})
    assert new['LinearRegression'].l2 == 21
    assert obj['LinearRegression'].l2 == 42
    assert new['LinearRegression'].weights == {}
    assert new['LinearRegression'].weights != obj['LinearRegression'].weights
def test_standard_scaler_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    ss = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        # Pick half of the columns at random
        cols = np.random.choice(X.columns, len(X.columns) // 2, replace=False)
        ss.learn_many(xb[cols])
def test_set_params_pipeline(): obj = preprocessing.StandardScaler() | linear_model.LinearRegression(l2=42) obj.learn_one({"x": 3}, 6) params = {"LinearRegression": {"l2": 21}} new = obj._set_params(params) assert new["LinearRegression"].l2 == 21 assert obj["LinearRegression"].l2 == 42 assert new["LinearRegression"].weights == {} assert new["LinearRegression"].weights != obj["LinearRegression"].weights
def test_finite_differences(lm, dataset):
    """Checks the gradient of a linear model via finite differences.

    References
    ----------
    [^1]: [How to test gradient implementations](https://timvieira.github.io/blog/post/2017/04/21/how-to-test-gradient-implementations/)
    [^2]: [Stochastic Gradient Descent Tricks](https://cilvr.cs.nyu.edu/diglib/lsml/bottou-sgd-tricks-2012.pdf)

    """

    scaler = preprocessing.StandardScaler()
    eps = 1e-6

    for x, y in dataset:
        x = scaler.learn_one(x).transform_one(x)

        # Store the current gradient and weights
        gradient, _ = lm._eval_gradient_one(x, y, 1)
        weights = copy.deepcopy(lm._weights)

        # d is a set of weight perturbations
        for d in iter_perturbations(weights.keys()):
            # Perturb the weights and obtain the loss with the new weights
            lm._weights = utils.VectorDict({i: weights[i] + eps * di for i, di in d.items()})
            forward = lm.loss(y_true=y, y_pred=lm._raw_dot_one(x))
            lm._weights = utils.VectorDict({i: weights[i] - eps * di for i, di in d.items()})
            backward = lm.loss(y_true=y, y_pred=lm._raw_dot_one(x))

            # We expect g and h to be equal
            g = utils.math.dot(d, gradient)
            h = (forward - backward) / (2 * eps)

            # Compare signs
            # TODO: reactivate this check
            # assert np.sign(g) == np.sign(h)

            # Check absolute difference
            # TODO: decrease the tolerance
            assert abs(g - h) < 1e-5

        # Reset the weights to their original values in order not to influence
        # the training loop, even though it doesn't really matter.
        lm._weights = weights
        lm.learn_one(x, y)
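# Hedged sketch, an assumption rather than the helper used above:
# `iter_perturbations` is referenced but not shown. A minimal version can
# yield one unit perturbation per weight, which is enough for the
# finite-difference check to exercise every coordinate of the gradient.
def iter_perturbations(keys):
    keys = list(keys)
    for i in keys:
        yield {j: 1.0 if j == i else 0.0 for j in keys}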
def test_clone_idempotent():
    model = preprocessing.StandardScaler() | linear_model.LogisticRegression(
        optimizer=optim.Adam(), l2=0.1
    )

    trace = []
    for x, y in datasets.Phishing():
        trace.append(model.predict_proba_one(x))
        model.learn_one(x, y)

    clone = model.clone()
    for i, (x, y) in enumerate(datasets.Phishing()):
        assert clone.predict_proba_one(x) == trace[i]
        clone.learn_one(x, y)
def test_perceptron_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    cr = lm.Perceptron()
    sk = sklm.Perceptron()

    for x, y in datasets.Bananas():
        x = ss.learn_one(x).transform_one(x)

        cr.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y], classes=[False, True])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[0][i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
def test_log_reg_sklearn_coherence(river_params, sklearn_params):
    """Checks that the sklearn and river implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    rv = lm.LogisticRegression(**river_params)
    sk = sklm.SGDClassifier(**sklearn_params)

    for x, y in datasets.Bananas().take(100):
        x = ss.learn_one(x).transform_one(x)

        rv.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y], classes=[False, True])

    for i, w in enumerate(rv.weights.values()):
        assert math.isclose(w, sk.coef_[0][i])

    assert math.isclose(rv.intercept, sk.intercept_[0])
def test_no_learn_unsupervised_score_one():
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("anomaly", anomaly.HalfSpaceTrees()),
    )

    dataset = [(dict(a=x, b=x), x) for x in range(100)]

    for x, y in dataset:
        counts_pre = dict(pipeline.steps["scale"].counts)
        pipeline.score_one(x, learn_unsupervised=True)
        counts_post = dict(pipeline.steps["scale"].counts)
        pipeline.score_one(x, learn_unsupervised=False)
        counts_no_learn = dict(pipeline.steps["scale"].counts)

        assert counts_pre != counts_post
        assert counts_post == counts_no_learn
def test_no_learn_unsupervised_one(func):
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    dataset = [(dict(a=x, b=x), x) for x in range(100)]

    for x, y in dataset:
        counts_pre = dict(pipeline.steps["scale"].counts)
        func(pipeline, x, learn_unsupervised=True)
        counts_post = dict(pipeline.steps["scale"].counts)
        func(pipeline, x, learn_unsupervised=False)
        counts_no_learn = dict(pipeline.steps["scale"].counts)

        assert counts_pre != counts_post
        assert counts_post == counts_no_learn
def test_learn_one_warm_up_mode():
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    dataset = [(dict(a=x, b=x), bool(x % 2)) for x in range(100)]

    for x, y in dataset:
        counts_pre = dict(pipeline["scale"].counts)
        with utils.warm_up_mode():
            pipeline.learn_one(x, y)
        counts_post = dict(pipeline["scale"].counts)
        pipeline.learn_one(x, y)
        counts_no_learn = dict(pipeline["scale"].counts)

        assert counts_pre != counts_post
        assert counts_post == counts_no_learn
def __init__(self, step, name):
    self.name = name
    self.optimizer = SynchronousSGD(0.01, name, None)
    self.model = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LogisticRegression(self.optimizer),
    )
    self.metrics = [
        metrics.Accuracy(),
        metrics.MAE(),
        metrics.RMSE(),
        metrics.Precision(),
        metrics.Recall(),
    ]
    self.count = 0
    if step is None:
        self.step = 50
    else:
        self.step = int(step)
def test_log_reg_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""

    ss = preprocessing.StandardScaler()
    cr = lm.LogisticRegression(optimizer=optim.SGD(0.01))
    sk = sklm.SGDClassifier(learning_rate='constant', eta0=0.01, alpha=0.0, loss='log')

    for x, y in datasets.Bananas():
        x = ss.learn_one(x).transform_one(x)

        cr.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y], classes=[False, True])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[0][i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""

    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""

        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    ss = preprocessing.StandardScaler()
    cr = lm.LinearRegression(optimizer=optim.SGD(0.01), loss=SquaredLoss())
    sk = sklm.SGDRegressor(learning_rate='constant', eta0=0.01, alpha=0.0)

    for x, y in datasets.TrumpApproval():
        x = ss.learn_one(x).transform_one(x)

        cr.learn_one(x, y)
        sk.partial_fit([list(x.values())], [y])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
def test_learn_many_warm_up_mode():
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    dataset = [(dict(a=x, b=x), x) for x in range(100)]

    for i in range(0, len(dataset), 5):
        X = pd.DataFrame([x for x, _ in dataset][i:i + 5])
        y = pd.Series([bool(y % 2) for _, y in dataset][i:i + 5])

        counts_pre = dict(pipeline["scale"].counts)
        with utils.warm_up_mode():
            pipeline.learn_many(X, y)
        counts_post = dict(pipeline["scale"].counts)
        pipeline.learn_many(X, y)
        counts_no_learn = dict(pipeline["scale"].counts)

        assert counts_pre != counts_post
        assert counts_post == counts_no_learn
def __init__(
    self,
    models: typing.List[base.Estimator],
    metric: metrics.Metric,
    explore_each_arm: int,
    start_after: int,
    seed: int = None,
):
    if len(models) <= 1:
        raise ValueError(
            f"You supplied {len(models)} models. At least 2 models should be supplied."
        )

    # Check that the model and the metric are in accordance
    for model in models:
        if not metric.works_with(model):
            raise ValueError(
                f"{metric.__class__.__name__} metric can't be used to evaluate a "
                + f"{model.__class__.__name__}"
            )

    super().__init__(models)
    self.metric = copy.deepcopy(metric)
    self._y_scaler = copy.deepcopy(preprocessing.StandardScaler())

    # Initializing bandits internals
    self._n_arms = len(models)
    self._n_iter = 0  # number of times learn_one is called
    self._N = [0] * self._n_arms
    self.explore_each_arm = explore_each_arm
    self.average_reward = [0.0] * self._n_arms

    # Warm up
    self.start_after = start_after
    self.warm_up = True

    # Randomization
    self.seed = seed
    self._rng = random.Random(seed)
from river import compose
from river import preprocessing
from river import linear_model
from river import metrics
from river import datasets
from river import optim

optimizer = optim.SGD(0.1)
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(optimizer),
)
metric = metrics.ROCAUC()
precision = metrics.Precision()

for x, y in datasets.Phishing():
    y_proba = model.predict_proba_one(x)  # probabilities, for ROCAUC
    y_pred = model.predict_one(x)         # hard label, for Precision
    model.learn_one(x, y)
    metric.update(y, y_proba)
    precision.update(y, y_pred)

print(metric)
print(precision)
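# The same test-then-train loop can be expressed with river's progressive
# validation helper; a minimal sketch, assuming an unlearned copy of the
# pipeline above is wanted:
from river import evaluate

evaluate.progressive_val_score(
    datasets.Phishing(),
    model.clone(),   # fresh copy, so the hand-rolled loop above is unaffected
    metrics.ROCAUC(),
    print_every=500,
)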
from river import preprocessing
from river import evaluate
from river import metrics
from river import datasets
from river import tree
from river import compose
from river import optim
from river import synth

X_y = synth.PredictionInfluenceStream(
    stream=[
        synth.RandomRBF(
            seed_model=42, seed_sample=42, n_classes=2, n_features=4, n_centroids=20
        ),
        synth.RandomRBF(
            seed_model=41, seed_sample=49, n_classes=2, n_features=4, n_centroids=20
        ),
    ]
)

model = preprocessing.StandardScaler()
model |= tree.HoeffdingAdaptiveTreeClassifier(
    grace_period=100,
    split_confidence=1e-5,
    leaf_prediction='nb',
    nb_threshold=10,
    seed=0,
)

metric = metrics.Accuracy()

evaluate.evaluate_influential(X_y, model, metric, print_every=100)