def _yield_datasets(model: Estimator):
    """Yield the datasets that ``model`` should be checked against.

    The selection is driven by what ``utils.inspect`` reports about the model:

    * recommenders get a single hand-written user/item ratings dataset, and
      only when they are regressors; nothing else is yielded for them;
    * multi-output regressors get scikit-learn's Linnerud data and a one-hot
      encoded view of ``datasets.SolarFlare``;
    * plain regressors get ``datasets.TrumpApproval``;
    * multi-output classifiers get ``datasets.Music``;
    * plain classifiers get ``datasets.Phishing`` twice (raw labels, then
      labels wrapped in ``np.bool_``), plus ``datasets.ImageSegments`` when the
      model is multiclass and is not tagged ``POSITIVE_INPUT``.

    Every river dataset is truncated with ``.take(200)``.

    Parameters
    ----------
    model
        The estimator for which datasets are requested.

    Yields
    ------
    Dataset objects or iterables, one per applicable dataset.
    """
    # Imported locally, as in the rest of this function's history
    # (NOTE(review): presumably to avoid import cycles -- confirm).
    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    # Recommendation models can be regressors or classifiers, but they have
    # requirements as to the structure of the data, so they are handled apart
    # and never fall through to the generic datasets below.
    if isinstance(utils.inspect.extract_relevant(model), Recommender):
        if utils.inspect.isregressor(model):
            ratings = (
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
                ({"user": "******", "item": "Harry Potter"}, 5),
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
            )
            yield _DummyDataset(*ratings)
        return

    # --- Regression family -------------------------------------------------
    if utils.inspect.ismoregressor(model):
        # First multi-output regression dataset: Linnerud from scikit-learn.
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # Second multi-output regression dataset.
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                # String features go through the one-hot encoder, integer
                # features are passed along unchanged.
                encoder = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for features, targets in datasets.SolarFlare().take(200):
                    yield encoder.transform_one(features), targets

        yield SolarFlare()

    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # --- Classification family ---------------------------------------------
    # Evaluated independently of the regression chain above.
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    elif utils.inspect.isclassifier(model):
        yield datasets.Phishing().take(200)
        # Same data again, this time with labels wrapped in np.bool_.
        yield (
            (features, np.bool_(label))
            for features, label in datasets.Phishing().take(200)
        )

        # Multi-class classification
        handles_multiclass = model._multiclass
        if handles_multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
def yield_datasets(model):
    """Yield the datasets that ``model`` should be checked against.

    Two independent decisions are made from what ``utils.inspect`` reports:

    * regression: multi-output regressors get scikit-learn's Linnerud data
      and a one-hot encoded view of ``datasets.SolarFlare``; otherwise plain
      regressors get ``datasets.TrumpApproval``;
    * classification: multi-output classifiers get ``datasets.Music``;
      otherwise plain classifiers get ``datasets.Phishing`` twice (raw labels,
      then labels wrapped in ``np.bool_``), plus ``datasets.ImageSegments``
      when the model is multiclass and is not tagged ``POSITIVE_INPUT``.

    Every river dataset is truncated with ``.take(200)``.
    """
    from river import base, compose, datasets, preprocessing, stream, utils
    from sklearn import datasets as sk_datasets

    inspect = utils.inspect

    # --- Regression family -------------------------------------------------
    if inspect.ismoregressor(model):
        # Dataset 1: Linnerud, streamed from scikit-learn.
        linnerud = sk_datasets.load_linnerud()
        yield stream.iter_sklearn_dataset(linnerud)

        # Dataset 2: SolarFlare with its string features one-hot encoded.
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                categorical = compose.SelectType(str) | preprocessing.OneHotEncoder()
                numeric = compose.SelectType(int)
                encoder = categorical + numeric
                for raw_x, y in datasets.SolarFlare().take(200):
                    yield encoder.transform_one(raw_x), y

        yield SolarFlare()

    elif inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # --- Classification family ---------------------------------------------
    # Checked separately from the regression chain above.
    if inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    elif inspect.isclassifier(model):
        yield datasets.Phishing().take(200)
        # Second pass over Phishing with labels wrapped in np.bool_.
        yield ((sample, np.bool_(label)) for sample, label in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
def test_online_batch_consistent():
    """Check that mini-batch and online learning produce the same score.

    The same pipeline (standard scaling piped into a one-vs-rest logistic
    regression) is trained twice on the ``ImageSegments`` CSV, stopping after
    the row at index 30: once with ``predict_many``/``learn_many`` on
    single-row chunks, and once with ``predict_one``/``learn_one``. Both runs
    must end with exactly the same macro F1.
    """
    last_index = 30  # both loops stop right after handling this row index

    def build_pipeline():
        # A fresh, untrained pipeline so the two runs share no state.
        return preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
            linear_model.LogisticRegression()
        )

    # Batch: feed the CSV one single-row DataFrame at a time.
    batch_model = build_pipeline()
    dataset = datasets.ImageSegments()
    batch_metric = metrics.MacroF1()

    chunks = pd.read_csv(dataset.path, chunksize=1)
    for index, chunk in enumerate(chunks):
        labels = chunk.pop("category")
        # Predict before learning, so each row is scored before it is seen.
        predictions = batch_model.predict_many(chunk)
        batch_model.learn_many(chunk, labels)

        for truth, guess in zip(labels, predictions):
            if guess is None:
                continue
            batch_metric.update(truth, guess)

        if index == last_index:
            break

    # Online: same rows, one sample at a time.
    online_model = build_pipeline()
    online_metric = metrics.MacroF1()

    features = pd.read_csv(dataset.path)
    targets = features.pop("category")

    for index, (sample, truth) in enumerate(stream.iter_pandas(features, targets)):
        guess = online_model.predict_one(sample)
        online_model.learn_one(sample, truth)

        if guess is not None:
            online_metric.update(truth, guess)

        if index == last_index:
            break

    assert online_metric.get() == batch_metric.get()