Example #1
def _yield_datasets(model: Estimator):
    """Generates datasets for a given model."""

    from sklearn import datasets as sk_datasets

    from river import base, compose, datasets, preprocessing, stream, utils

    # Recommendation models can be regressors or classifiers, but they have specific
    # requirements regarding the structure of the data they consume
    if isinstance(utils.inspect.extract_relevant(model), Recommender):
        if utils.inspect.isregressor(model):
            yield _DummyDataset(
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
                ({"user": "******", "item": "Harry Potter"}, 5),
                ({"user": "******", "item": "Superman"}, 8),
                ({"user": "******", "item": "Terminator"}, 9),
                ({"user": "******", "item": "Star Wars"}, 8),
                ({"user": "******", "item": "Notting Hill"}, 2),
            )
        return

    # Multi-output regression
    elif utils.inspect.ismoregressor(model):

        # 1. Multi-output regression on scikit-learn's Linnerud dataset
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2. river's SolarFlare dataset, with its string features one-hot encoded
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare"""

            def __iter__(self):
                oh = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for x, y in datasets.SolarFlare().take(200):
                    yield oh.transform_one(x), y

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):

        yield datasets.Phishing().take(200)
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)
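
As a rough illustration of how a generator like this could be consumed, here is a
minimal sketch that feeds every yielded dataset through a simple regression
pipeline. The pipeline, metric and evaluation loop are arbitrary choices of mine,
and the sketch assumes the surrounding test module (which defines `Estimator`,
`Recommender` and `_DummyDataset`) is available so that `_yield_datasets` can run.

from river import linear_model, metrics, preprocessing

# Illustrative consumer: with a plain regression pipeline, _yield_datasets takes
# the regression branch and yields TrumpApproval().take(200)
model = preprocessing.StandardScaler() | linear_model.LinearRegression()

for dataset in _yield_datasets(model):
    metric = metrics.MAE()
    for x, y in dataset:
        # Predict before learning (progressive validation)
        metric.update(y, model.predict_one(x))
        model.learn_one(x, y)
    print(metric)
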
Example #2
def yield_datasets(model):
    """Generates datasets for a given model."""

    import numpy as np

    from river import base
    from river import compose
    from river import datasets
    from river import preprocessing
    from river import stream
    from river import utils
    from sklearn import datasets as sk_datasets

    # Multi-output regression
    if utils.inspect.ismoregressor(model):

        # 1. Multi-output regression on scikit-learn's Linnerud dataset
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2. river's SolarFlare dataset, with its string features one-hot encoded
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare"""

            def __iter__(self):
                oh = (
                    compose.SelectType(str) | preprocessing.OneHotEncoder()
                ) + compose.SelectType(int)
                for x, y in datasets.SolarFlare().take(200):
                    yield oh.transform_one(x), y

        yield SolarFlare()

    # Regression
    elif utils.inspect.isregressor(model):
        yield datasets.TrumpApproval().take(200)

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music().take(200)

    # Classification
    elif utils.inspect.isclassifier(model):

        yield datasets.Phishing().take(200)
        yield ((x, np.bool_(y)) for x, y in datasets.Phishing().take(200))

        # Multi-class classification
        if model._multiclass and base.tags.POSITIVE_INPUT not in model._tags:
            yield datasets.ImageSegments().take(200)

Example #3

from river import metrics, stream
from river.datasets import Elec2, synth
from sklearn import datasets as sk_datasets

# `EvoTrack` is assumed to be defined elsewhere in the module (presumably a
# subclass of `river.evaluate.Track`).


def evo_led_accuracy_track(n_samples=10_000, seed=42):
    dataset = synth.LEDDrift(seed=seed, noise_percentage=0.1, n_drift_features=4).take(n_samples)
    track = EvoTrack("LEDDrift()", dataset, metrics.Accuracy(), n_samples)
    return track

def evo_hyperplane_accuracy_001_track(n_samples=10_000, seed=42):
    dataset = synth.Hyperplane(
        seed=seed, n_features=50, n_drift_features=25, mag_change=0.001
    ).take(n_samples)
    track = EvoTrack("Hyperplane(50,0.001)", dataset, metrics.Accuracy(), n_samples)
    return track

def evo_hyperplane_accuracy_0001_track(n_samples=10_000, seed=42):
    dataset = synth.Hyperplane(
        seed=seed, n_features=50, n_drift_features=25, mag_change=0.0001
    ).take(n_samples)
    track = EvoTrack("Hyperplane(50, 0.0001)", dataset, metrics.Accuracy(), n_samples)
    return track

def evo_sine_accuracy_track(n_samples=10_000, seed=42):
    dataset = synth.Sine(seed=seed).take(n_samples)
    track = EvoTrack("SINE()", dataset, metrics.Accuracy(), n_samples)
    return track

def evo_elec2_accuracy_track(n_samples=10_000, seed=42):
    dataset = Elec2().take(n_samples)
    track = EvoTrack("Elec", dataset, metrics.Accuracy(), n_samples)
    return track

def evo_covtype_accuracy_track(n_samples=10_000, seed=42):
    dataset = stream.iter_sklearn_dataset(sk_datasets.fetch_covtype())
    track = EvoTrack("Covtype", dataset, metrics.Accuracy(), n_samples)
    return track
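
For a sense of what these tracks measure, here is a minimal, self-contained sketch
that runs river's progressive validation over the same LEDDrift stream used by
evo_led_accuracy_track. The choice of Gaussian Naive Bayes, the sample count and
the print interval are arbitrary, and the EvoTrack class itself is not exercised.

from river import evaluate, metrics, naive_bayes
from river.datasets import synth

# Same synthetic stream as evo_led_accuracy_track, truncated for brevity
dataset = synth.LEDDrift(seed=42, noise_percentage=0.1, n_drift_features=4).take(1_000)

# Progressive validation: each sample is predicted on before it is learned from
evaluate.progressive_val_score(
    dataset=dataset,
    model=naive_bayes.GaussianNB(),
    metric=metrics.Accuracy(),
    print_every=250,
)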

Example #4
from river import linear_model
from river import optim
from river import preprocessing
from river import stream
from sklearn import datasets
from sklearn import metrics

scaler = preprocessing.StandardScaler()
optimizer = optim.SGD(lr=0.01)
log_reg = linear_model.LogisticRegression(optimizer)

y_true = []
y_pred = []

for xi, yi in stream.iter_sklearn_dataset(datasets.load_breast_cancer(), shuffle=True, seed=42):

    # Scale the features (update the scaler's running statistics, then transform)
    scaler.learn_one(xi)
    xi_scaled = scaler.transform_one(xi)

    # Test the current model on the new "unobserved" sample
    yi_pred = log_reg.predict_proba_one(xi_scaled)
    # Train the model with the new sample
    log_reg.learn_one(xi_scaled, yi)

    # Store the truth and the prediction
    y_true.append(yi)
    y_pred.append(yi_pred[True])

print(f'ROC AUC: {metrics.roc_auc_score(y_true, y_pred):.3f}')

# For comparison, the equivalent batch workflow with scikit-learn. Aliased
# imports avoid clashing with the river modules imported above.
from sklearn import linear_model as sk_linear_model
from sklearn import model_selection
from sklearn import pipeline
from sklearn import preprocessing as sk_preprocessing

# Load the data in batch form
dataset = datasets.load_breast_cancer()
X, y = dataset.data, dataset.target

# Define the steps of the model
model = pipeline.Pipeline(
    [
        ('scale', sk_preprocessing.StandardScaler()),
        ('lin_reg', sk_linear_model.LogisticRegression(solver='lbfgs'))
    ]
)

# Define a deterministic cross-validation procedure
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

# Compute the ROC AUC scores
scorer = metrics.make_scorer(metrics.roc_auc_score)
scores = model_selection.cross_val_score(model, X, y, scoring=scorer, cv=cv)

# Display the average score and its standard deviation
print(f'ROC AUC: {scores.mean():.3f} (+/- {scores.std():.3f})')

for xi, yi in zip(X, y):
    # Convert the row from an array into a dictionary keyed by feature name
    xi = dict(zip(dataset.feature_names, xi))
    print(xi['mean area'])


# river can also stream a scikit-learn dataset directly as (dict, label) pairs
from river import stream

for xi, yi in stream.iter_sklearn_dataset(datasets.load_breast_cancer()):
    pass
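
Finally, the manual predict-then-learn loop from the river part of this example can
be written more compactly with river's evaluate module. A minimal sketch with the
same model and data (the print interval is an arbitrary choice):

from river import evaluate, linear_model, metrics, optim, preprocessing, stream
from sklearn import datasets

# Same scaler + logistic regression as above, expressed as a pipeline
model = preprocessing.StandardScaler() | linear_model.LogisticRegression(optim.SGD(lr=0.01))

# Predict on each sample before learning from it, reporting ROC AUC along the way
evaluate.progressive_val_score(
    dataset=stream.iter_sklearn_dataset(datasets.load_breast_cancer(), shuffle=True, seed=42),
    model=model,
    metric=metrics.ROCAUC(),
    print_every=100,
)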