def main():

    import datetime as dt

    from creme import compose
    from creme import datasets
    from creme import feature_extraction
    from creme import linear_model
    from creme import metrics
    from creme import preprocessing
    from creme import stats
    from creme import stream

    X_y = datasets.Bikes()
    X_y = stream.simulate_qa(X_y, moment='moment', delay=dt.timedelta(minutes=30))

    def add_time_features(x):
        return {**x, 'hour': x['moment'].hour, 'day': x['moment'].weekday()}

    model = add_time_features
    model |= (
        compose.Select('clouds', 'humidity', 'pressure', 'temperature', 'wind') +
        feature_extraction.TargetAgg(by=['station', 'hour'], how=stats.Mean()) +
        feature_extraction.TargetAgg(by='station', how=stats.EWMean())
    )
    model |= preprocessing.StandardScaler()
    model |= linear_model.LinearRegression()

    metric = metrics.MAE()

    questions = {}

    for i, x, y in X_y:

        # Question
        is_question = y is None
        if is_question:
            y_pred = model.predict_one(x)
            questions[i] = y_pred

        # Answer
        else:
            metric.update(y, questions[i])
            model = model.fit_one(x, y)

            if i >= 30000 and i % 30000 == 0:
                print(i, metric)
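# For reference, the hand-rolled question/answer loop in main() is exactly what
# the progressive_val_score helper below automates. A minimal sketch of the
# equivalent call, assuming a creme version where the helper lives in the
# `evaluate` module (older releases expose it under `model_selection`) and a
# hypothetical build_model() factory that assembles the same pipeline as main():

def main_with_helper():

    import datetime as dt

    from creme import datasets
    from creme import evaluate
    from creme import metrics

    evaluate.progressive_val_score(
        dataset=datasets.Bikes(),
        model=build_model(),  # hypothetical factory for the pipeline built in main()
        metric=metrics.MAE(),
        moment='moment',                 # timestamp field of each observation
        delay=dt.timedelta(minutes=30),  # targets are revealed 30 minutes later
        print_every=30000
    )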
def progressive_val_score(dataset: base.typing.Stream, model: base.Predictor,
                          metric: metrics.Metric,
                          moment: typing.Union[str, typing.Callable] = None,
                          delay: typing.Union[str, int, dt.timedelta, typing.Callable] = None,
                          print_every=0, show_time=False, show_memory=False,
                          **print_kwargs) -> metrics.Metric:
    """Evaluates the performance of a model on a streaming dataset.

    This method is the canonical way to evaluate a model's performance. When used correctly, it
    allows you to exactly assess how a model would have performed in a production scenario.

    `dataset` is converted into a stream of questions and answers. At each step the model is
    either asked to predict an observation, or is updated. The target is only revealed to the
    model after a certain amount of time, which is determined by the `delay` parameter. Note
    that under the hood this uses the `stream.simulate_qa` function to go through the data in
    arrival order.

    By default, there is no delay, which means that the samples are processed one after the
    other. When there is no delay, this function essentially performs progressive validation.
    When there is a delay, we refer to this as delayed progressive validation.

    It is recommended to use this method when you want to determine a model's performance on a
    dataset. In particular, it is advised to use the `delay` parameter in order to get a
    reliable assessment. Indeed, in a production scenario, it is often the case that ground
    truths are made available after a certain amount of time. By using this method, you can
    reproduce this scenario and therefore truthfully assess what the performance of a model
    would have been on a given dataset.

    Parameters:
        dataset: The stream of observations against which the model will be evaluated.
        model: The model to evaluate.
        metric: The metric used to evaluate the model's predictions.
        moment: The attribute used for measuring time. If a callable is passed, then it is
            expected to take as input a `dict` of features. If `None`, then the observations
            are implicitly timestamped in the order in which they arrive.
        delay: The amount to wait before revealing the target associated with each observation
            to the model. This value is expected to be able to sum with the `moment` value. For
            instance, if `moment` is a `datetime.date`, then `delay` is expected to be a
            `datetime.timedelta`. If a callable is passed, then it is expected to take as input
            a `dict` of features and the target. If a `str` is passed, then it will be used to
            access the relevant field from the features. If `None` is passed, then no delay
            will be used, which leads to doing standard online validation.
        print_every: Iteration number at which to print the current metric. This only takes
            into account the predictions, and not the training steps.
        show_time: Whether or not to display the elapsed time.
        show_memory: Whether or not to display the memory usage of the model.
        print_kwargs: Extra keyword arguments are passed to the `print` function. For instance,
            this allows providing a `file` argument, which indicates where to output progress.

    Example:

        Take the following model:

        >>> from creme import linear_model
        >>> from creme import preprocessing

        >>> model = (
        ...     preprocessing.StandardScaler() |
        ...     linear_model.LogisticRegression()
        ... )

        We can evaluate it on the `Phishing` dataset like so:

        >>> from creme import datasets
        >>> from creme import evaluate
        >>> from creme import metrics

        >>> evaluate.progressive_val_score(
        ...     model=model,
        ...     dataset=datasets.Phishing(),
        ...     metric=metrics.ROCAUC(),
        ...     print_every=200
        ... )
        [200] ROCAUC: 0.897995
        [400] ROCAUC: 0.920896
        [600] ROCAUC: 0.931339
        [800] ROCAUC: 0.939909
        [1,000] ROCAUC: 0.947417
        [1,200] ROCAUC: 0.950304
        ROCAUC: 0.950363

        We haven't specified a delay, therefore this is strictly equivalent to the following
        piece of code:

        >>> model = (
        ...     preprocessing.StandardScaler() |
        ...     linear_model.LogisticRegression()
        ... )

        >>> metric = metrics.ROCAUC()

        >>> for x, y in datasets.Phishing():
        ...     y_pred = model.predict_proba_one(x)
        ...     metric = metric.update(y, y_pred)
        ...     model = model.fit_one(x, y)

        >>> metric
        ROCAUC: 0.950363

        When `print_every` is specified, the current state is printed at regular intervals.
        Under the hood, Python's `print` method is being used. You can pass extra keyword
        arguments to modify its behavior. For instance, you may use the `file` argument if you
        want to log the progress to a file of your choice.

        >>> with open('progress.log', 'w') as f:
        ...     metric = evaluate.progressive_val_score(
        ...         model=model,
        ...         dataset=datasets.Phishing(),
        ...         metric=metrics.ROCAUC(),
        ...         print_every=200,
        ...         file=f
        ...     )

        >>> with open('progress.log') as f:
        ...     for line in f.read().splitlines():
        ...         print(line)
        [200] ROCAUC: 0.94
        [400] ROCAUC: 0.946969
        [600] ROCAUC: 0.9517
        [800] ROCAUC: 0.954238
        [1,000] ROCAUC: 0.958207
        [1,200] ROCAUC: 0.96002

        Note that the performance is slightly better than above because we haven't used a
        fresh copy of the model. Instead, we've reused the existing model, which has already
        done a full pass over the data.

        >>> import os; os.remove('progress.log')

    References:
        1. [Beating the Hold-Out: Bounds for K-fold and Progressive Cross-Validation](http://hunch.net/~jl/projects/prediction_bounds/progressive_validation/coltfinal.pdf)
        2. [Grzenda, M., Gomes, H.M. and Bifet, A., 2019. Delayed labelling evaluation for data streams. Data Mining and Knowledge Discovery, pp.1-30](https://link.springer.com/content/pdf/10.1007%2Fs10618-019-00654-y.pdf)

    """

    # Check that the model and the metric are in accordance
    if not metric.works_with(model):
        raise ValueError(
            f'{metric.__class__.__name__} metric is not compatible with {model}'
        )

    # Determine if predict_one or predict_proba_one should be used in case of a classifier
    pred_func = model.predict_one
    if utils.inspect.isclassifier(model) and not metric.requires_labels:
        pred_func = model.predict_proba_one

    preds = {}
    n_total_answers = 0
    if show_time:
        start = time.perf_counter()

    for i, x, y in stream.simulate_qa(dataset, moment, delay, copy=True):

        # Question
        if y is None:
            preds[i] = pred_func(x=x)
            continue

        # Answer
        y_pred = preds.pop(i)
        if y_pred != {} and y_pred is not None:
            metric.update(y_true=y, y_pred=y_pred)
        model.fit_one(x=x, y=y)

        # Update the answer counter
        n_total_answers += 1
        if print_every and not n_total_answers % print_every:
            msg = f'[{n_total_answers:,d}] {metric}'
            if show_time:
                now = time.perf_counter()
                msg += f' – {dt.timedelta(seconds=int(now - start))}'
            if show_memory:
                msg += f' – {model._memory_usage}'
            print(msg, **print_kwargs)

    return metric
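# The docstring above recommends the `delay` parameter but only demonstrates
# the undelayed case. A minimal sketch of delayed progressive validation,
# assuming the same setup as the manual loop in main(): the bike count targets
# of the Bikes dataset become available 30 minutes after each observation's
# 'moment' timestamp.

def delayed_validation_sketch():

    import datetime as dt

    from creme import datasets
    from creme import evaluate
    from creme import linear_model
    from creme import metrics
    from creme import preprocessing

    model = preprocessing.StandardScaler() | linear_model.LinearRegression()

    return evaluate.progressive_val_score(
        dataset=datasets.Bikes(),
        model=model,
        metric=metrics.MAE(),
        moment='moment',                 # field holding each observation's timestamp
        delay=dt.timedelta(minutes=30),  # targets revealed 30 minutes later
        print_every=30000
    )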
def progressive_val_score(X_y: base.typing.Stream, model: base.Predictor,
                          metric: metrics.Metric,
                          moment: typing.Union[str, typing.Callable] = None,
                          delay: typing.Union[str, int, dt.timedelta, typing.Callable] = None,
                          print_every=0, show_time=False, show_memory=False) -> metrics.Metric:
    """Evaluates the performance of a model on a streaming dataset.

    This method is the canonical way to evaluate a model's performance. When used correctly, it
    allows you to exactly assess how a model would have performed in a production scenario.

    `X_y` is converted into a stream of questions and answers. At each step the model is either
    asked to predict an observation, or is updated. The target is only revealed to the model
    after a certain amount of time, which is determined by the `delay` parameter. Note that
    under the hood this uses the `stream.simulate_qa` function to go through the data in
    arrival order.

    By default, there is no delay, which means that the samples are processed one after the
    other. When there is no delay, this function essentially performs progressive validation.
    When there is a delay, we refer to this as delayed progressive validation.

    It is recommended to use this method when you want to determine a model's performance on a
    dataset. In particular, it is advised to use the `delay` parameter in order to get a
    reliable assessment. Indeed, in a production scenario, it is often the case that ground
    truths are made available after a certain amount of time. By using this method, you can
    reproduce this scenario and therefore truthfully assess what the performance of a model
    would have been on a given dataset.

    Parameters:
        X_y: The stream of observations against which the model will be evaluated.
        model: The model to evaluate.
        metric: The metric used to evaluate the model's predictions.
        moment (callable or str): The attribute used for measuring time. If a callable is
            passed, then it is expected to take as input a `dict` of features. If `None`, then
            the observations are implicitly timestamped in the order in which they arrive.
        delay: The amount to wait before revealing the target associated with each observation
            to the model. This value is expected to be able to sum with the `moment` value. For
            instance, if `moment` is a `datetime.date`, then `delay` is expected to be a
            `datetime.timedelta`. If a callable is passed, then it is expected to take as input
            a `dict` of features and the target. If a `str` is passed, then it will be used to
            access the relevant field from the features. If `None` is passed, then no delay
            will be used, which leads to doing standard online validation.
        print_every (int): Iteration number at which to print the current metric. This only
            takes into account the predictions, and not the training steps.
        show_time (bool): Whether or not to display the elapsed time.
        show_memory (bool): Whether or not to display the memory usage of the model.

    Example:

        Take the following model:

        >>> from creme import linear_model
        >>> from creme import preprocessing

        >>> model = (
        ...     preprocessing.StandardScaler() |
        ...     linear_model.LogisticRegression()
        ... )

        We can evaluate it on the `Phishing` dataset like so:

        >>> from creme import datasets
        >>> from creme import metrics
        >>> from creme import model_selection

        >>> model_selection.progressive_val_score(
        ...     model=model,
        ...     X_y=datasets.Phishing(),
        ...     metric=metrics.ROCAUC()
        ... )
        ROCAUC: 0.950224

        We haven't specified a delay, therefore this is strictly equivalent to the following
        piece of code:

        >>> model = (
        ...     preprocessing.StandardScaler() |
        ...     linear_model.LogisticRegression()
        ... )

        >>> metric = metrics.ROCAUC()

        >>> for x, y in datasets.Phishing():
        ...     y_pred = model.predict_proba_one(x)
        ...     metric = metric.update(y, y_pred)
        ...     model = model.fit_one(x, y)

        >>> metric
        ROCAUC: 0.950224

    References:
        1. [Beating the Hold-Out: Bounds for K-fold and Progressive Cross-Validation](http://hunch.net/~jl/projects/prediction_bounds/progressive_validation/coltfinal.pdf)
        2. [Grzenda, M., Gomes, H.M. and Bifet, A., 2019. Delayed labelling evaluation for data streams. Data Mining and Knowledge Discovery, pp.1-30](https://link.springer.com/content/pdf/10.1007%2Fs10618-019-00654-y.pdf)

    """

    # Check that the model and the metric are in accordance
    if not metric.works_with(model):
        raise ValueError(f'{metric.__class__.__name__} metric is not compatible with {model}')

    # Determine if predict_one or predict_proba_one should be used in case of a classifier
    pred_func = model.predict_one
    is_classifier = isinstance(utils.estimator_checks.guess_model(model), base.Classifier)
    if is_classifier and not metric.requires_labels:
        pred_func = model.predict_proba_one

    preds = {}
    n_total_answers = 0
    if show_time:
        start = time.perf_counter()

    for i, x, y in stream.simulate_qa(X_y, moment, delay, copy=True):

        # Question
        if y is None:
            preds[i] = pred_func(x=x)
            continue

        # Answer
        y_pred = preds.pop(i)
        if y_pred != {} and y_pred is not None:
            metric.update(y_true=y, y_pred=y_pred)
        model.fit_one(x=x, y=y)

        # Update the answer counter
        n_total_answers += 1
        if print_every and not n_total_answers % print_every:
            msg = f'[{n_total_answers:,d}] {metric}'
            if show_time:
                now = time.perf_counter()
                msg += f' – {dt.timedelta(seconds=int(now - start))}'
            if show_memory:
                msg += f' – {model._memory_usage}'
            print(msg)

    return metric
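# print_every, show_time, and show_memory are documented above but never
# exercised together in an example. A minimal sketch reusing the pipeline from
# the docstring; the shape of each progress line follows from the msg
# construction in the function body.

def verbose_validation_sketch():

    from creme import datasets
    from creme import linear_model
    from creme import metrics
    from creme import model_selection
    from creme import preprocessing

    model = preprocessing.StandardScaler() | linear_model.LogisticRegression()

    return model_selection.progressive_val_score(
        X_y=datasets.Phishing(),
        model=model,
        metric=metrics.ROCAUC(),
        print_every=200,  # report the running metric every 200 answered questions
        show_time=True,   # append the elapsed time to each report
        show_memory=True  # append the model's memory footprint to each report
    )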
args = parser.parse_args()

def sleep(td: dt.timedelta):
    if td.seconds >= 0:
        time.sleep(td.seconds / args.speed_up)

# Use the first trip's departure time as a reference time
taxis = datasets.Taxis()
now = next(iter(taxis))[0]['pickup_datetime']
mae = metrics.MAE()
host = 'http://localhost:5000'
predictions = {}

for trip_no, trip, duration in stream.simulate_qa(
    taxis,
    moment='pickup_datetime',
    delay=lambda _, duration: dt.timedelta(seconds=duration)
):

    trip_no = str(trip_no).zfill(len(str(taxis.n_samples)))

    # Taxi trip starts

    if duration is None:

        # Wait
        sleep(trip['pickup_datetime'] - now)
        now = trip['pickup_datetime']

        # Ask chantilly to make a prediction
        r = requests.post(host + '/api/predict', json={
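# To make the question/answer interleaving above concrete, here is a toy
# sketch of stream.simulate_qa with a callable delay, as in the taxi script.
# The two-trip dataset is an illustrative stand-in, not part of the original
# script. Each trip appears twice: as a question (duration is None) at pickup
# time, and as an answer once pickup + duration has elapsed, so the shorter
# second trip is answered before the first one.

def simulate_qa_toy():

    import datetime as dt

    from creme import stream

    X_y = [
        ({'pickup_datetime': dt.datetime(2020, 1, 1, 0, 0)}, 120),  # 2 minute trip
        ({'pickup_datetime': dt.datetime(2020, 1, 1, 0, 1)}, 30),   # 30 second trip
    ]

    for i, x, duration in stream.simulate_qa(
        X_y,
        moment='pickup_datetime',
        delay=lambda _, duration: dt.timedelta(seconds=duration)
    ):
        kind = 'question' if duration is None else 'answer'
        print(i, kind, x['pickup_datetime'], duration)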
def progressive_val_score(X_y: base.typing.Stream, model: base.Predictor,
                          metric: metrics.Metric,
                          moment: typing.Union[str, typing.Callable] = None,
                          delay: typing.Union[str, int, dt.timedelta, typing.Callable] = None,
                          print_every=0, show_time=False, show_memory=False) -> metrics.Metric:
    """A variant of online scoring where the targets are revealed with a delay.

    `X_y` is converted into a stream of questions and answers. At each step the model is either
    asked to predict an observation, or is updated. The target is only revealed to the model
    after a certain amount of time, which is determined by the `delay` parameter.

    Parameters:
        X_y: The stream of observations against which the model will be evaluated.
        model: The model to evaluate.
        metric: The metric used to evaluate the model's predictions.
        moment (callable or str): The attribute used for measuring time. If a callable is
            passed, then it is expected to take as input a `dict` of features. If `None`, then
            the observations are implicitly timestamped in the order in which they arrive.
        delay: The amount to wait before revealing the target associated with each observation
            to the model. This value is expected to be able to sum with the `moment` value. For
            instance, if `moment` is a `datetime.date`, then `delay` is expected to be a
            `datetime.timedelta`. If a callable is passed, then it is expected to take as input
            a `dict` of features and the target. If a `str` is passed, then it will be used to
            access the relevant field from the features. If `None` is passed, then no delay
            will be used, which leads to doing standard online validation.
        print_every (int): Iteration number at which to print the current metric. This only
            takes into account the predictions, and not the training steps.
        show_time (bool): Whether or not to display the elapsed time.
        show_memory (bool): Whether or not to display the memory usage of the model.

    References:
        1. [Beating the Hold-Out: Bounds for K-fold and Progressive Cross-Validation](http://hunch.net/~jl/projects/prediction_bounds/progressive_validation/coltfinal.pdf)
        2. [Grzenda, M., Gomes, H.M. and Bifet, A., 2019. Delayed labelling evaluation for data streams. Data Mining and Knowledge Discovery, pp.1-30](https://link.springer.com/content/pdf/10.1007%2Fs10618-019-00654-y.pdf)

    """

    # Check that the model and the metric are in accordance
    if not metric.works_with(model):
        raise ValueError(
            f'{metric.__class__.__name__} metric is not compatible with {model}'
        )

    # Determine if predict_one or predict_proba_one should be used in case of a classifier
    pred_func = model.predict_one
    is_classifier = isinstance(utils.estimator_checks.guess_model(model), base.Classifier)
    if is_classifier and not metric.requires_labels:
        pred_func = model.predict_proba_one

    preds = {}
    n_total_answers = 0
    if show_time:
        start = time.perf_counter()

    for i, x, y in stream.simulate_qa(X_y, moment, delay, copy=True):

        # Question
        if y is None:
            preds[i] = pred_func(x=x)

        # Answer
        else:
            y_pred = preds.pop(i)
            if y_pred != {} and y_pred is not None:
                metric.update(y_true=y, y_pred=y_pred)
            model.fit_one(x=x, y=y)

            # Update the answer counter
            n_total_answers += 1
            if print_every and not n_total_answers % print_every:
                msg = f'[{n_total_answers:,d}] {metric}'
                if show_time:
                    now = time.perf_counter()
                    msg += f' – {dt.timedelta(seconds=int(now - start))}'
                if show_memory:
                    msg += f' – {model._memory_usage}'
                print(msg)

    return metric
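# None of the variants above exercise the documented option of passing `delay`
# as a string. A minimal sketch calling the function defined above, assuming
# each observation carries a 'reveal_after' field whose value can be summed
# with the moment (both field names are illustrative, not part of creme):

def string_delay_sketch(X_y, model):

    from creme import metrics

    return progressive_val_score(
        X_y=X_y,
        model=model,
        metric=metrics.MAE(),
        moment='moment',       # illustrative features field holding each timestamp
        delay='reveal_after'   # illustrative per-observation delay, read from the features
    )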