def _yield_datasets(model: Estimator):
    """Generates datasets for a given model."""
    from river import base, compose, datasets, preprocessing, stream, utils
    from sklearn import datasets as sk_datasets

    # Recommendation models can be regressors or classifiers, but they expect
    # inputs shaped as {"user": ..., "item": ...} pairs, so they get a
    # dedicated dummy dataset and nothing else.
    if isinstance(utils.inspect.extract_relevant(model), Recommender):
        if utils.inspect.isregressor(model):
            yield _DummyDataset(
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
                ({"user": "******", "item": "Harry Potter"}, 5),
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
            )
        return

    # Multi-output regression
    elif utils.inspect.ismoregressor(model):
        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                encode = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for x, y in datasets.SolarFlare().take(200):
                    yield encode.transform_one(x), y

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):
        yield datasets.Phishing().take(200)
        # Same data, but with numpy booleans as targets.
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
def test_standard_scaler_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    half = len(X.columns) // 2

    scaler = preprocessing.StandardScaler()
    for batch in np.array_split(X, 10):
        # Feed a random half of the columns each round, so that every
        # mini-batch both drops columns the scaler has seen and introduces
        # columns it hasn't.
        subset = np.random.choice(X.columns, half, replace=False)
        scaler.learn_many(batch[subset])
def test_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')
    half = len(X.columns) // 2

    model = lm.LinearRegression()
    for batch_x, batch_y in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        # Train on a random half of the columns each round, so columns keep
        # appearing and disappearing from one mini-batch to the next.
        subset = np.random.choice(X.columns, half, replace=False)
        model.learn_many(batch_x[subset], batch_y)
def test_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    # Train one model sample by sample...
    single = lm.LinearRegression()
    for x, y in stream.iter_pandas(X, Y):
        single.learn_one(x, y)

    # ... and a second one with mini-batches of size 1.
    batched = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, len(X)), np.array_split(Y, len(Y))):
        batched.learn_many(xb, yb)

    # Both training regimes must converge to the same weights.
    for col in X:
        assert math.isclose(single.weights[col], batched.weights[col])
def test_lin_reg_sklearn_coherence(river_params, sklearn_params):
    """Checks that the sklearn and river implementations produce the same results."""
    scaler = preprocessing.StandardScaler()
    river_model = lm.LinearRegression(**river_params)
    sk_model = sklm.SGDRegressor(**sklearn_params)

    # Train both models in lockstep on the same scaled stream.
    for x, y in datasets.TrumpApproval().take(100):
        x = scaler.learn_one(x).transform_one(x)
        river_model.learn_one(x, y)
        sk_model.partial_fit([list(x.values())], [y])

    # Weights and intercept should coincide up to floating point error.
    for i, w in enumerate(river_model.weights.values()):
        assert math.isclose(w, sk_model.coef_[i])
    assert math.isclose(river_model.intercept, sk_model.intercept_[0])
def yield_datasets(model):
    """Yield check datasets that match the given model's learning task."""
    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    # Multi-output regression
    if utils.inspect.ismoregressor(model):
        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                to_dummies = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for x, y in datasets.SolarFlare().take(200):
                    yield to_dummies.transform_one(x), y

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):
        yield datasets.Phishing().take(200)
        # Same data, but with numpy booleans as targets.
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
def test_shuffle_columns():
    """Checks that learn_many works identically whether columns are shuffled or not."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    # Reference model: columns in their natural order.
    baseline = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        baseline.learn_many(xb, yb)

    # Same data, but every mini-batch gets its columns permuted at random.
    permuted = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        order = np.random.permutation(X.columns)
        permuted.learn_many(xb[order], yb)

    # Column order must not influence the learned weights.
    for col in X:
        assert math.isclose(baseline.weights[col], permuted.weights[col])
def test_standard_scaler_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""
    X = pd.read_csv(datasets.TrumpApproval().path)

    # Fit one scaler sample by sample...
    single = preprocessing.StandardScaler()
    for x, _ in stream.iter_pandas(X):
        single.learn_one(x)

    # ... and another with ten mini-batches.
    batched = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        batched.learn_many(xb)

    # All running statistics must agree between the two regimes.
    for col in X:
        assert math.isclose(single.counts[col], batched.counts[col])
        assert math.isclose(single.means[col], batched.means[col])
        assert math.isclose(single.vars[col], batched.vars[col])
def test_standard_scaler_shuffle_columns():
    """Checks that learn_many works identically whether columns are shuffled or not.

    Both scalers see the exact same data; only the column order within each
    mini-batch differs, so every learned statistic must match.
    """
    X = pd.read_csv(datasets.TrumpApproval().path)

    normal = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        normal.learn_many(xb)

    shuffled = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        cols = np.random.permutation(X.columns)
        shuffled.learn_many(xb[cols])

    for i in X:
        # Bug fix: the original asserted `shuffled` against itself
        # (e.g. `shuffled.counts[i] == shuffled.counts[i]`), which is
        # vacuously true and never exercised column-order invariance.
        assert math.isclose(normal.counts[i], shuffled.counts[i])
        assert math.isclose(normal.means[i], shuffled.means[i])
        assert math.isclose(normal.vars[i], shuffled.vars[i])
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and river implementations produce the same results."""

    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""

        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    scaler = preprocessing.StandardScaler()
    river_model = lm.LinearRegression(optimizer=optim.SGD(0.01), loss=SquaredLoss())
    sk_model = sklm.SGDRegressor(learning_rate='constant', eta0=0.01, alpha=0.0)

    # Train both models in lockstep on the same scaled stream.
    for x, y in datasets.TrumpApproval():
        x = scaler.learn_one(x).transform_one(x)
        river_model.learn_one(x, y)
        sk_model.partial_fit([list(x.values())], [y])

    # Weights and intercept should coincide up to floating point error.
    for i, w in enumerate(river_model.weights.values()):
        assert math.isclose(w, sk_model.coef_[i])
    assert math.isclose(river_model.intercept, sk_model.intercept_[0])
for _ in range(n): p = {j: random.gauss(0, 1) for j in keys} norm = utils.math.norm(p, order=2) for j in p: p[j] /= norm yield p @pytest.mark.parametrize( 'lm, dataset', [ pytest.param(lm( optimizer=copy.deepcopy(optimizer), initializer=initializer, l2=0), dataset, id=f'{lm.__name__} - {optimizer} - {initializer}') for lm, dataset in [(lm.LinearRegression, datasets.TrumpApproval() ), (lm.LogisticRegression, datasets.Bananas())] for optimizer, initializer in itertools.product( [ optim.AdaBound(), optim.AdaDelta(), optim.AdaGrad(), optim.AdaMax(), optim.Adam(), optim.AMSGrad(), # TODO: check momentum optimizers # optim.Momentum(), # optim.NesterovMomentum(), optim.RMSProp(), optim.SGD() ],
for j in p: p[j] /= norm yield p @pytest.mark.parametrize( "lm, dataset", [ pytest.param( lm(optimizer=copy.deepcopy(optimizer), initializer=initializer, l2=0), dataset, id=f"{lm.__name__} - {optimizer} - {initializer}", ) for lm, dataset in [ (lm.LinearRegression, datasets.TrumpApproval()), (lm.LogisticRegression, datasets.Bananas()), ] for optimizer, initializer in itertools.product( [ optim.AdaBound(), optim.AdaDelta(), optim.AdaGrad(), optim.AdaMax(), optim.Adam(), optim.AMSGrad(), # TODO: check momentum optimizers # optim.Momentum(), # optim.NesterovMomentum(), optim.RMSProp(), optim.SGD(), ],
from river import datasets
from river import metrics
from river.evaluate import Track


def trump_mse_track(n_samples=10_000, seed=42):
    """Return a Track that evaluates a model on the Trump Approval dataset.

    NOTE(review): despite the `mse` in the function name, the track is built
    with the R2 metric and an R2-labelled title — confirm which was intended.
    The `seed` parameter is accepted but never used in this body.
    """
    dataset = datasets.TrumpApproval().take(n_samples)
    track = Track("TRUMP Approval + R2", dataset, metrics.R2(), n_samples)
    return track


def chickweights_mse_track(n_samples=10_000, seed=42):
    """Return a Track that evaluates a model on the ChickWeights dataset.

    NOTE(review): despite the `mse` in the function name, the track is built
    with the R2 metric and an R2-labelled title — confirm which was intended.
    The `seed` parameter is accepted but never used in this body.
    """
    dataset = datasets.ChickWeights().take(n_samples)
    track = Track("ChickWeights + R2", dataset, metrics.R2(), n_samples)
    return track