def _yield_datasets(model: Estimator):
    """Generates datasets for a given model.

    Datasets are picked to match the model's learning task (recommendation,
    multi-output regression, regression, multi-output classification,
    classification, multi-class classification). `Estimator`, `Recommender`
    and `_DummyDataset` are defined elsewhere in this module; `np` is assumed
    to be a module-level numpy import.
    """
    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    # Recommendation models can be regressors or classifiers, but they have requirements as to the
    # structure of the data
    if isinstance(utils.inspect.extract_relevant(model), Recommender):
        if utils.inspect.isregressor(model):
            # Hand-crafted user/item interactions with rating targets.
            # NOTE(review): the user values look redacted ("******") — confirm
            # against the original fixture data.
            yield _DummyDataset(
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
                ({"user": "******", "item": "Harry Potter"}, 5),
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
            )
        # Recommenders only get the dummy dataset above (or nothing); never
        # fall through to the generic datasets below.
        return

    # Multi-output regression
    elif utils.inspect.ismoregressor(model):
        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                # One-hot encode the string features; int features pass through.
                oh = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for x, y in datasets.SolarFlare().take(200):
                    yield oh.transform_one(x), y

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):
        yield datasets.Phishing().take(200)
        # Same samples, but with the labels cast to numpy booleans.
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
def yield_datasets(model):
    """Yield benchmark datasets appropriate for the given model's task."""
    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    # Multi-output regression
    if utils.inspect.ismoregressor(model):
        # 1. A ready-made multi-output regression dataset from scikit-learn.
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2. SolarFlare with its string features one-hot encoded.
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""

            def __iter__(self):
                encoder = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for features, target in datasets.SolarFlare().take(200):
                    yield encoder.transform_one(features), target

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):
        yield datasets.Phishing().take(200)
        # The same samples, with labels cast to numpy booleans.
        yield ((features, np.bool_(target)) for features, target in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
def evo_led_accuracy_track(n_samples=10_000, seed=42):
    """Accuracy track over the LEDDrift synthetic stream."""
    data = synth.LEDDrift(seed=seed, noise_percentage=0.1, n_drift_features=4)
    return EvoTrack("LEDDrift()", data.take(n_samples), metrics.Accuracy(), n_samples)


def evo_hyperplane_accuracy_001_track(n_samples=10_000, seed=42):
    """Accuracy track over a Hyperplane stream with magnitude of change 0.001."""
    data = synth.Hyperplane(seed=seed, n_features=50, n_drift_features=25, mag_change=0.001)
    return EvoTrack("Hyperplane(50,0.001)", data.take(n_samples), metrics.Accuracy(), n_samples)


def evo_hyperplane_accuracy_0001_track(n_samples=10_000, seed=42):
    """Accuracy track over a Hyperplane stream with magnitude of change 0.0001."""
    data = synth.Hyperplane(seed=seed, n_features=50, n_drift_features=25, mag_change=0.0001)
    return EvoTrack("Hyperplane(50, 0.0001)", data.take(n_samples), metrics.Accuracy(), n_samples)


def evo_sine_accuracy_track(n_samples=10_000, seed=42):
    """Accuracy track over the Sine synthetic stream."""
    data = synth.Sine(seed=seed)
    return EvoTrack("SINE()", data.take(n_samples), metrics.Accuracy(), n_samples)


def evo_elec2_accuracy_track(n_samples=10_000, seed=42):
    """Accuracy track over the Elec2 dataset."""
    return EvoTrack("Elec", Elec2().take(n_samples), metrics.Accuracy(), n_samples)


def evo_covtype_accuracy_track(n_samples=10_000, seed=42):
    """Accuracy track over the scikit-learn covertype dataset."""
    # NOTE(review): unlike the other tracks, `seed` is unused here and the
    # stream is not truncated with `.take(n_samples)` — confirm this is intentional.
    data = stream.iter_sklearn_dataset(sk_datasets.fetch_covtype())
    return EvoTrack("Covtype", data, metrics.Accuracy(), n_samples)
from river import linear_model, optim, preprocessing, stream
from sklearn import datasets, metrics

# Online pipeline: standardize features, then fit a logistic regression with SGD.
scaler = preprocessing.StandardScaler()
log_reg = linear_model.LogisticRegression(optim.SGD(lr=0.01))

y_true, y_pred = [], []

for x, y in stream.iter_sklearn_dataset(datasets.load_breast_cancer(), shuffle=True, seed=42):
    # Update the scaler and standardize the incoming features.
    x_scaled = scaler.learn_one(x).transform_one(x)

    # Test-then-train: predict before the model has seen this sample.
    proba = log_reg.predict_proba_one(x_scaled)

    # Only now let the model learn from the sample.
    log_reg.learn_one(x_scaled, y)

    # Record the ground truth and the predicted probability of the positive class.
    y_true.append(y)
    y_pred.append(proba[True])

print(f'ROC AUC: {metrics.roc_auc_score(y_true, y_pred):.3f}')
# Assemble the model: standard scaling followed by logistic regression.
model = pipeline.Pipeline(
    steps=[
        ("scale", preprocessing.StandardScaler()),
        ("lin_reg", linear_model.LogisticRegression(solver="lbfgs")),
    ]
)

# Define a deterministic cross-validation procedure.
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

# Compute the ROC AUC score on each fold.
scorer = metrics.make_scorer(metrics.roc_auc_score)
scores = model_selection.cross_val_score(model, X, y, scoring=scorer, cv=cv)

# Display the average score and its standard deviation.
print(f'ROC AUC: {scores.mean():.3f} (+/- {scores.std():.3f})')

# Walk through the samples one by one, naming the features of each.
for features, label in zip(X, y):
    features = dict(zip(dataset.feature_names, features))
    print(features['mean area'])

from river import stream

# river's stream module does the dict conversion above for us.
for features, label in stream.iter_sklearn_dataset(datasets.load_breast_cancer()):
    pass