def _yield_datasets(model: Estimator):
    """Yield the datasets that suit the kind of estimator `model` is.

    The kind of estimator is determined with the `utils.inspect` helpers:

    - Recommenders get a single hand-written user/item ratings dataset, and only if
      they are regressors. Nothing else is yielded for a recommender.
    - Multi-output regressors get scikit-learn's Linnerud dataset, then a one-hot
      encoded view of `datasets.SolarFlare`.
    - Other regressors get `datasets.TrumpApproval`.
    - Multi-output classifiers get `datasets.Music`.
    - Other classifiers get `datasets.Phishing` twice (once as is, once with every
      label wrapped in `np.bool_`), followed by `datasets.ImageSegments` when
      `model._multiclass` is truthy and `base.tags.POSITIVE_INPUT` is not among
      `model._tags`.

    Every river dataset is consumed through `.take(200)`.
    """

    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    inspector = utils.inspect

    # Recommendation models can be regressors or classifiers, but they have requirements as to the
    # structure of the data, so they never reach the generic datasets below.
    if isinstance(inspector.extract_relevant(model), Recommender):
        if inspector.isregressor(model):
            ratings = (
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
                ({"user": "******", "item": "Harry Potter"}, 5),
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
            )
            yield _DummyDataset(*ratings)
        return

    # Regression datasets
    if inspector.ismoregressor(model):
        # First multi-output dataset: scikit-learn's Linnerud
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # Second multi-output dataset: SolarFlare with its string features one-hot encoded
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                categorical = compose.SelectType(str) | preprocessing.OneHotEncoder()
                encoder = categorical + compose.SelectType(int)
                for features, targets in datasets.SolarFlare().take(200):
                    yield encoder.transform_one(features), targets

        yield SolarFlare()
    elif inspector.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Classification datasets
    if inspector.ismoclassifier(model):
        yield datasets.Music().take(200)
        return

    if not inspector.isclassifier(model):
        return

    # Binary classification, with the native labels and then with `np.bool_` labels
    yield datasets.Phishing().take(200)
    yield (
        (features, np.bool_(label))
        for features, label in datasets.Phishing().take(200)
    )

    # Multi-class classification
    wants_multiclass = model._multiclass and base.tags.POSITIVE_INPUT not in model._tags
    if wants_multiclass:
        yield datasets.ImageSegments().take(200)
def test_memory_usage_multilabel():
    """Check the memory footprint of a size-capped label-combination Hoeffding tree.

    The tree is trained on the samples of `datasets.Music().take(500)` and the test
    then asserts that `model._raw_memory_usage` divided by 2 ** 20 is below 1.
    """
    # NOTE(review): presumably `_raw_memory_usage` is in bytes and `max_size` in MiB,
    # which would make the assertion mirror the `max_size=1` cap -- confirm.
    bytes_per_mib = 2 ** 20

    model = tree.LabelCombinationHoeffdingTreeClassifier(
        leaf_prediction="mc",
        splitter=tree.splitter.ExhaustiveSplitter(),
        max_size=1,
        memory_estimate_period=100,
    )

    for features, labels in datasets.Music().take(500):
        model.learn_one(features, labels)

    usage_in_mib = model._raw_memory_usage / bytes_per_mib
    assert usage_in_mib < 1
def yield_datasets(model):
    """Yield the datasets that suit the kind of estimator `model` is.

    The kind of estimator is determined with the `utils.inspect` helpers:

    - Multi-output regressors get scikit-learn's Linnerud dataset, then a one-hot
      encoded view of `datasets.SolarFlare`.
    - Other regressors get `datasets.TrumpApproval`.
    - Multi-output classifiers get `datasets.Music`.
    - Other classifiers get `datasets.Phishing` twice (once as is, once with every
      label wrapped in `np.bool_`), followed by `datasets.ImageSegments` when
      `model._multiclass` is truthy and `base.tags.POSITIVE_INPUT` is not among
      `model._tags`.

    Every river dataset is consumed through `.take(200)`.
    """

    from river import base, compose, datasets, preprocessing, stream, utils
    from sklearn import datasets as sk_datasets

    # --- Regression ---
    if utils.inspect.ismoregressor(model):
        # Multi-output dataset 1: scikit-learn's Linnerud
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # Multi-output dataset 2: SolarFlare with its string features one-hot encoded
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                string_part = compose.SelectType(str) | preprocessing.OneHotEncoder()
                transformer = string_part + compose.SelectType(int)
                for sample, target in datasets.SolarFlare().take(200):
                    yield transformer.transform_one(sample), target

        yield SolarFlare()
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # --- Classification ---
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)
        return

    if not utils.inspect.isclassifier(model):
        return

    # Binary classification, with the native labels and then with `np.bool_` labels
    yield datasets.Phishing().take(200)
    yield (
        (sample, np.bool_(label))
        for sample, label in datasets.Phishing().take(200)
    )

    # Multi-class classification
    is_multiclass = model._multiclass
    if is_multiclass and base.tags.POSITIVE_INPUT not in model._tags:
        yield datasets.ImageSegments().take(200)