Esempio n. 1
0
def iter_sklearn_dataset(dataset: "sklearn.utils.Bunch",
                         **kwargs) -> base.typing.Stream:
    """Iterates rows from one of the datasets provided by scikit-learn.

    This allows you to use any dataset from [scikit-learn's `datasets` module](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets). For instance, you can use the `fetch_openml` function to get access to all of the
    datasets from the OpenML website.

    Parameters
    ----------
    dataset
        A scikit-learn dataset.
    kwargs
        Extra keyword arguments are passed to the underlying call to `stream.iter_array`.

    Examples
    --------

    >>> import pprint
    >>> from sklearn import datasets
    >>> from river import stream

    >>> dataset = datasets.load_boston()

    >>> for xi, yi in stream.iter_sklearn_dataset(dataset):
    ...     pprint.pprint(xi)
    ...     print(yi)
    ...     break
    {'AGE': 65.2,
        'B': 396.9,
        'CHAS': 0.0,
        'CRIM': 0.00632,
        'DIS': 4.09,
        'INDUS': 2.31,
        'LSTAT': 4.98,
        'NOX': 0.538,
        'PTRATIO': 15.3,
        'RAD': 1.0,
        'RM': 6.575,
        'TAX': 296.0,
        'ZN': 18.0}
    24.0

    """
    kwargs["X"] = dataset.data
    kwargs["y"] = dataset.target
    try:
        kwargs["feature_names"] = dataset.feature_names
    except AttributeError:
        pass

    if isinstance(kwargs["X"], pd.DataFrame):
        yield from stream.iter_pandas(**kwargs)
    else:
        yield from stream.iter_array(**kwargs)
Esempio n. 2
0
def iter_sklearn_dataset(dataset: "sklearn.utils.Bunch",
                         **kwargs) -> base.typing.Stream:
    """Iterates rows from one of the datasets provided by scikit-learn.

    This allows you to use any dataset from [scikit-learn's `datasets` module](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets). For instance, you can use the `fetch_openml` function to get access to all of the
    datasets from the OpenML website.

    Parameters
    ----------
    dataset
        A scikit-learn dataset.
    kwargs
        Extra keyword arguments are passed to the underlying call to `stream.iter_array`.

    Examples
    --------

    >>> import pprint
    >>> from sklearn import datasets
    >>> from river import stream

    >>> dataset = datasets.load_diabetes()

    >>> for xi, yi in stream.iter_sklearn_dataset(dataset):
    ...     pprint.pprint(xi)
    ...     print(yi)
    ...     break
    {'age': 0.0380759064334241,
     'bmi': 0.0616962065186885,
     'bp': 0.0218723549949558,
     's1': -0.0442234984244464,
     's2': -0.0348207628376986,
     's3': -0.0434008456520269,
     's4': -0.00259226199818282,
     's5': 0.0199084208763183,
     's6': -0.0176461251598052,
     'sex': 0.0506801187398187}
    151.0

    """
    kwargs["X"] = dataset.data
    kwargs["y"] = dataset.target
    try:
        kwargs["feature_names"] = dataset.feature_names
    except AttributeError:
        pass

    if isinstance(kwargs["X"], pd.DataFrame):
        yield from stream.iter_pandas(**kwargs)
    else:
        yield from stream.iter_array(**kwargs)
Esempio n. 3
0
def test_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""

    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    one = lm.LinearRegression()
    for x, y in stream.iter_pandas(X, Y):
        one.learn_one(x, y)

    many = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, len(X)), np.array_split(Y, len(Y))):
        many.learn_many(xb, yb)

    for i in X:
        assert math.isclose(one.weights[i], many.weights[i])
Esempio n. 4
0
def test_standard_scaler_one_many_consistent():
    """Checks that using learn_one or learn_many produces the same result."""

    X = pd.read_csv(datasets.TrumpApproval().path)

    one = preprocessing.StandardScaler()
    for x, _ in stream.iter_pandas(X):
        one.learn_one(x)

    many = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        many.learn_many(xb)

    for i in X:
        assert math.isclose(one.counts[i], many.counts[i])
        assert math.isclose(one.means[i], many.means[i])
        assert math.isclose(one.vars[i], many.vars[i])
Esempio n. 5
0
def test_online_batch_consistent():

    # Batch

    batch = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
        linear_model.LogisticRegression())

    dataset = datasets.ImageSegments()

    batch_metric = metrics.MacroF1()

    for i, x in enumerate(pd.read_csv(dataset.path, chunksize=1)):
        y = x.pop("category")
        y_pred = batch.predict_many(x)
        batch.learn_many(x, y)

        for yt, yp in zip(y, y_pred):
            if yp is not None:
                batch_metric.update(yt, yp)

        if i == 30:
            break

    # Online

    online = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
        linear_model.LogisticRegression())

    online_metric = metrics.MacroF1()

    X = pd.read_csv(dataset.path)
    Y = X.pop("category")

    for i, (x, y) in enumerate(stream.iter_pandas(X, Y)):
        y_pred = online.predict_one(x)
        online.learn_one(x, y)

        if y_pred is not None:
            online_metric.update(y, y_pred)

        if i == 30:
            break

    assert online_metric.get() == batch_metric.get()