def iter_sklearn_dataset(dataset: "sklearn.utils.Bunch", **kwargs) -> base.typing.Stream:
    """Iterates rows from one of the datasets provided by scikit-learn.

    This allows you to use any dataset from [scikit-learn's `datasets`
    module](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets). For
    instance, you can use the `fetch_openml` function to get access to all of the datasets from
    the OpenML website.

    Parameters
    ----------
    dataset
        A scikit-learn dataset.
    kwargs
        Extra keyword arguments are passed to the underlying call to `stream.iter_array`.

    Examples
    --------

    >>> import pprint
    >>> from sklearn import datasets
    >>> from river import stream

    >>> dataset = datasets.load_diabetes()

    >>> for xi, yi in stream.iter_sklearn_dataset(dataset):
    ...     pprint.pprint(xi)
    ...     print(yi)
    ...     break
    {'age': 0.0380759064334241,
     'bmi': 0.0616962065186885,
     'bp': 0.0218723549949558,
     's1': -0.0442234984244464,
     's2': -0.0348207628376986,
     's3': -0.0434008456520269,
     's4': -0.00259226199818282,
     's5': 0.0199084208763183,
     's6': -0.0176461251598052,
     'sex': 0.0506801187398187}
    151.0

    """
    # NOTE: the previous doctest used `datasets.load_boston()`, which was removed
    # from scikit-learn in version 1.2; `load_diabetes` is a stable replacement.
    # `**kwargs` is a fresh dict per call, so stashing X/y in it is safe.
    kwargs["X"] = dataset.data
    kwargs["y"] = dataset.target
    # Not every Bunch exposes feature names; fall back to the iterator's default
    # feature naming when the attribute is missing.
    try:
        kwargs["feature_names"] = dataset.feature_names
    except AttributeError:
        pass

    # Datasets fetched with as_frame=True hold pandas objects, which iter_pandas
    # handles natively; everything else goes through iter_array.
    if isinstance(kwargs["X"], pd.DataFrame):
        yield from stream.iter_pandas(**kwargs)
    else:
        yield from stream.iter_array(**kwargs)
def iter_sklearn_dataset(dataset: "sklearn.utils.Bunch", **kwargs) -> base.typing.Stream:
    """Stream (features, target) pairs out of a scikit-learn dataset.

    Any dataset exposed by [scikit-learn's `datasets`
    module](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets) can be
    consumed this way, including everything reachable through `fetch_openml`.

    Parameters
    ----------
    dataset
        A scikit-learn dataset.
    kwargs
        Extra keyword arguments are passed to the underlying call to `stream.iter_array`.

    Examples
    --------

    >>> import pprint
    >>> from sklearn import datasets
    >>> from river import stream

    >>> dataset = datasets.load_diabetes()

    >>> for xi, yi in stream.iter_sklearn_dataset(dataset):
    ...     pprint.pprint(xi)
    ...     print(yi)
    ...     break
    {'age': 0.0380759064334241,
     'bmi': 0.0616962065186885,
     'bp': 0.0218723549949558,
     's1': -0.0442234984244464,
     's2': -0.0348207628376986,
     's3': -0.0434008456520269,
     's4': -0.00259226199818282,
     's5': 0.0199084208763183,
     's6': -0.0176461251598052,
     'sex': 0.0506801187398187}
    151.0

    """
    # `**kwargs` is already a private dict, so it doubles as the argument
    # bundle for the downstream iterator.
    kwargs["X"] = dataset.data
    kwargs["y"] = dataset.target
    # Some Bunch objects carry no feature names; only forward them when present.
    if hasattr(dataset, "feature_names"):
        kwargs["feature_names"] = dataset.feature_names

    # pandas payloads (e.g. fetch_openml(as_frame=True)) take the DataFrame path.
    iterate = stream.iter_pandas if isinstance(kwargs["X"], pd.DataFrame) else stream.iter_array
    yield from iterate(**kwargs)
def test_one_many_consistent():
    """learn_one and learn_many must converge to the same linear-model weights."""
    features = pd.read_csv(datasets.TrumpApproval().path)
    targets = features.pop('five_thirty_eight')

    # Train one model a single row at a time.
    single = lm.LinearRegression()
    for xi, yi in stream.iter_pandas(features, targets):
        single.learn_one(xi, yi)

    # Train a second model on one-row mini-batches.
    batched = lm.LinearRegression()
    chunks = zip(
        np.array_split(features, len(features)),
        np.array_split(targets, len(targets)),
    )
    for x_chunk, y_chunk in chunks:
        batched.learn_many(x_chunk, y_chunk)

    # Both training regimes should yield numerically identical weights.
    for col in features:
        assert math.isclose(single.weights[col], batched.weights[col])
def test_standard_scaler_one_many_consistent():
    """Incremental and mini-batch scaler statistics must agree."""
    frame = pd.read_csv(datasets.TrumpApproval().path)

    # Feed the scaler one row at a time.
    incremental = preprocessing.StandardScaler()
    for row, _ in stream.iter_pandas(frame):
        incremental.learn_one(row)

    # Feed a second scaler the same data in ten chunks.
    minibatch = preprocessing.StandardScaler()
    for chunk in np.array_split(frame, 10):
        minibatch.learn_many(chunk)

    # Counts, means, and variances must match per column.
    for col in frame:
        for attr in ('counts', 'means', 'vars'):
            assert math.isclose(
                getattr(incremental, attr)[col],
                getattr(minibatch, attr)[col],
            )
def test_online_batch_consistent():
    """A pipeline trained on one-row chunks must match one trained row by row."""

    def make_pipeline():
        # Same architecture for both regimes so only the API differs.
        return preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
            linear_model.LogisticRegression())

    dataset = datasets.ImageSegments()

    # Batch regime: predict/learn on single-row DataFrames.
    batch = make_pipeline()
    batch_metric = metrics.MacroF1()
    for i, chunk in enumerate(pd.read_csv(dataset.path, chunksize=1)):
        labels = chunk.pop("category")
        predictions = batch.predict_many(chunk)
        batch.learn_many(chunk, labels)
        for truth, guess in zip(labels, predictions):
            # The very first predictions may be None before any learning happened.
            if guess is not None:
                batch_metric.update(truth, guess)
        if i == 30:
            break

    # Online regime: predict/learn one dict at a time on the same 31 rows.
    online = make_pipeline()
    online_metric = metrics.MacroF1()
    features = pd.read_csv(dataset.path)
    labels = features.pop("category")
    for i, (row, label) in enumerate(stream.iter_pandas(features, labels)):
        guess = online.predict_one(row)
        online.learn_one(row, label)
        if guess is not None:
            online_metric.update(label, guess)
        if i == 30:
            break

    assert online_metric.get() == batch_metric.get()