def _reduce(ds: ray.data.Dataset):
    """Reducer for ``ray.data.Dataset`` objects.

    When the current call stack does not mention ``serialize_lineage``, the
    dataset is reduced to ``_deserialize_and_fully_execute_if_needed`` applied
    to its serialized lineage. When it does, the dataset's own
    ``__reduce__()`` result is returned instead.

    Args:
        ds: The dataset being reduced.

    Returns:
        Either a ``(callable, args)`` tuple built from the serialized lineage,
        or whatever ``ds.__reduce__()`` returns.
    """
    # TODO(xwjiang): Let's make this less hacky.
    # The check is a substring match over the *formatted* stack entries (file
    # name, function name and source text), not only over function names.
    # NOTE(review): presumably this guards against re-entering this reducer
    # while ``serialize_lineage`` itself pickles the dataset - confirm.
    stack_lines = traceback.format_list(traceback.extract_stack())
    inside_lineage_serialization = any(
        "serialize_lineage" in line for line in stack_lines
    )

    if inside_lineage_serialization:
        return ds.__reduce__()
    return _deserialize_and_fully_execute_if_needed, (ds.serialize_lineage(),)
def predict(
    self,
    data: ray.data.Dataset,
    *,
    batch_size: int = 4096,
    min_scoring_workers: int = 1,
    max_scoring_workers: Optional[int] = None,
    num_cpus_per_worker: int = 1,
    num_gpus_per_worker: int = 0,
    ray_remote_args: Optional[Dict[str, Any]] = None,
    **predict_kwargs,
) -> ray.data.Dataset:
    """Run batch scoring on dataset.

    Args:
        data: Ray dataset to run batch prediction on.
        batch_size: Split dataset into batches of this size for prediction.
        min_scoring_workers: Minimum number of scoring actors.
        max_scoring_workers: If set, specify the maximum number of scoring
            actors.
        num_cpus_per_worker: Number of CPUs to allocate per scoring worker.
        num_gpus_per_worker: Number of GPUs to allocate per scoring worker.
        ray_remote_args: Additional resource requirements to request from
            ray. The mapping is copied and never modified. Any ``num_cpus``
            or ``num_gpus`` entries in it are overridden by
            ``num_cpus_per_worker`` and ``num_gpus_per_worker``.
        predict_kwargs: Keyword arguments passed to the predictor's
            ``predict()`` method.

    Returns:
        Dataset containing scoring results.
    """
    # Bind to locals so the nested class closes over plain values rather
    # than over ``self``.
    predictor_cls = self.predictor_cls
    checkpoint_ref = self.checkpoint_ref
    predictor_kwargs = self.predictor_kwargs

    class ScoringWrapper:
        """Callable class: builds the predictor once, then scores batches."""

        def __init__(self):
            checkpoint = Checkpoint.from_object_ref(checkpoint_ref)
            self.predictor = predictor_cls.from_checkpoint(
                checkpoint, **predictor_kwargs
            )

        def __call__(self, batch):
            prediction_output = self.predictor.predict(batch, **predict_kwargs)
            return convert_batch_type_to_pandas(prediction_output)

    compute = ray.data.ActorPoolStrategy(
        min_size=min_scoring_workers, max_size=max_scoring_workers
    )

    # Work on a copy: writing num_cpus/num_gpus into the caller's dict would
    # leak this call's resource settings into whatever else reuses that dict.
    remote_args = dict(ray_remote_args or {})
    remote_args["num_cpus"] = num_cpus_per_worker
    remote_args["num_gpus"] = num_gpus_per_worker

    return data.map_batches(
        ScoringWrapper,
        compute=compute,
        batch_format="pandas",
        batch_size=batch_size,
        **remote_args,
    )
def _create_async_parallel_reader(self, dataset: ray.data.Dataset, num_threads: int):
    """Return a generator that reads ``dataset`` with ``num_threads`` threads.

    The dataset is split into ``num_threads`` parts. One producer thread per
    part converts its split with the function returned by
    ``self._to_tensors_fn()``, iterates it in batches of ``self.batch_size``,
    passes each batch through ``self._prepare_batch`` and puts the result on
    a shared bounded queue. The returned generator starts the threads on
    first iteration and yields prepared batches in arrival order, so batches
    from different splits are interleaved.

    Args:
        dataset: Dataset to read.
        num_threads: Number of splits and producer threads.

    Returns:
        A generator of prepared batches. ``None`` is reserved as the internal
        end-of-stream marker and is never yielded.

    Raises:
        Exception: The first exception raised inside a producer thread. It is
            re-raised from the generator after every producer has finished
            and all remaining batches have been yielded.
    """
    q = queue.Queue(maxsize=100)
    batch_size = self.batch_size
    to_tensors = self._to_tensors_fn()
    splits = dataset.split(n=num_threads)
    # Failures captured in producer threads; surfaced to the consumer below.
    errors = []

    def producer(i):
        try:
            batches = (
                splits[i]
                .map_batches(to_tensors, batch_format="pandas")
                .iter_batches(
                    prefetch_blocks=0,
                    batch_size=batch_size,
                    batch_format="pandas",
                )
            )
            for batch in batches:
                q.put(self._prepare_batch(batch))
        except Exception as exc:
            errors.append(exc)
        finally:
            # Always signal completion. Without this a failing producer
            # would leave the consumer blocked on q.get() forever.
            q.put(None)

    def async_parallel_read():
        threads = [
            threading.Thread(target=producer, args=(i,))
            for i in range(num_threads)
        ]
        for t in threads:
            t.start()

        active_threads = num_threads
        while active_threads > 0:
            batch = q.get(block=True)
            if batch is None:
                # One producer is done; the sentinel itself is not data and
                # must not be handed to the consumer.
                active_threads -= 1
                continue
            yield batch

        for t in threads:
            t.join()

        if errors:
            raise errors[0]

    return async_parallel_read()
def _preprocess(
    self, ds: ray.data.Dataset, inferencing: bool
) -> Tuple[ray.data.Dataset, ray.data.Dataset]:
    """Feature-engineer ``ds`` and standard-scale its feature columns.

    Steps, each announced with a ``print``:

    1. Drop rows with a null ``nullable_feature``, add ``new_col`` and
       rescale ``feature_1``.
    2. Compute the mean of ``feature_1`` per ``fruit`` group.
    3. Add ``mean_by_fruit``, one-hot encode ``fruit`` and drop the
       ``fruit`` column.
    4. Standard-scale every column except ``label``.

    In training mode (``inferencing`` false) ``self.fruits`` and
    ``self.standard_stats`` must both be ``None`` on entry. They are filled
    in from the data, with the scaling stats computed on the first 90% of
    rows. In inference mode both must already be set and are reused.

    Args:
        ds: Dataset to preprocess.
        inferencing: Whether ``ds`` is an inference dataset.

    Returns:
        ``(train_ds, test_ds)`` in training mode, or ``(inference_ds, None)``
        in inference mode.
    """
    print("\nStep 1: Dropping nulls, creating new_col, updating feature_1\n")

    def batch_transformer(batch: pd.DataFrame):
        # Columns are assigned on the result of dropna() below, so the
        # chained-assignment warning is switched off.
        pd.options.mode.chained_assignment = None
        batch = batch.dropna(subset=["nullable_feature"])
        combined = batch["feature_1"] - 2 * batch["feature_2"] + batch["feature_3"]
        batch["new_col"] = combined / 3.0
        batch["feature_1"] = 2.0 * batch["feature_1"] + 0.1
        return batch

    ds = ds.map_batches(batch_transformer, batch_format="pandas")

    print(
        "\nStep 2: Precalculating fruit-grouped mean for new column and "
        "for one-hot encoding (latter only uses fruit groups)\n"
    )
    fruit_means = {}
    for row in ds.groupby("fruit").mean("feature_1").take_all():
        fruit_means[row["fruit"]] = row["mean(feature_1)"]

    print(
        "\nStep 3: Create mean_by_fruit as mean of feature_1 groupby "
        "fruit; one-hot encode fruit column\n"
    )
    if not inferencing:
        # Training defines the fruit vocabulary.
        assert self.fruits is None
        self.fruits = list(fruit_means)
    else:
        # Inference reuses the vocabulary learned during training.
        assert self.fruits is not None

    # One lookup table per known fruit: 1 for that fruit, 0 (the defaultdict
    # default) for any other value.
    fruit_one_hots = {}
    for known_fruit in self.fruits:
        fruit_one_hots[known_fruit] = collections.defaultdict(
            int, **{known_fruit: 1}
        )

    def batch_transformer(batch: pd.DataFrame):
        fruit_column = batch["fruit"]
        batch["mean_by_fruit"] = fruit_column.map(fruit_means)
        for fruit, one_hot in fruit_one_hots.items():
            batch[f"fruit_{fruit}"] = batch["fruit"].map(one_hot)
        # The raw fruit column is no longer needed once encoded.
        batch.drop(columns="fruit", inplace=True)
        return batch

    ds = ds.map_batches(batch_transformer, batch_format="pandas")

    train_ds = test_ds = None
    if inferencing:
        print("\nStep 4: Standardize inference dataset\n")
        assert self.standard_stats is not None
    else:
        assert self.standard_stats is None

        print("\nStep 4a: Split training dataset into train-test split\n")
        # First 90% of rows go to training, the remaining 10% to test.
        split_index = int(0.9 * ds.count())
        train_ds, test_ds = ds.split_at_indices([split_index])

        print(
            "\nStep 4b: Precalculate training dataset stats for "
            "standard scaling\n"
        )
        # Mean and Std for every column except the label, training split only.
        standard_aggs = []
        for col in train_ds.schema().names:
            if col == "label":
                continue
            standard_aggs.extend((Mean(on=col), Std(on=col)))
        self.standard_stats = train_ds.aggregate(*standard_aggs)

        print("\nStep 4c: Standardize training dataset\n")

    standard_stats = self.standard_stats

    def batch_standard_scaler(batch: pd.DataFrame):
        def column_standard_scaler(column: pd.Series):
            name = column.name
            if name != "label":
                centered = column - standard_stats[f"mean({name})"]
                return centered / standard_stats[f"std({name})"]
            # The label column is passed through unscaled.
            return column

        return batch.transform(column_standard_scaler)

    if inferencing:
        scaled = ds.map_batches(batch_standard_scaler, batch_format="pandas")
        return scaled, None

    # Training and test splits are both scaled with the training stats.
    scaled_train = train_ds.map_batches(batch_standard_scaler, batch_format="pandas")
    scaled_test = test_ds.map_batches(batch_standard_scaler, batch_format="pandas")
    return scaled_train, scaled_test
def predict_pipelined(
    self,
    data: ray.data.Dataset,
    *,
    blocks_per_window: Optional[int] = None,
    bytes_per_window: Optional[int] = None,
    **kwargs,
) -> ray.data.DatasetPipeline:
    """Setup a prediction pipeline for batch scoring.

    Unlike `predict()`, this generates a DatasetPipeline object and does not
    perform execution. Execution can be triggered by pulling from the
    pipeline.

    This is a convenience wrapper around calling `.window()` on the Dataset
    prior to passing it to `BatchPredictor.predict()`.

    Examples:
        >>> import pandas as pd
        >>> import ray
        >>> from ray.air import Checkpoint
        >>> from ray.train.predictor import Predictor
        >>> from ray.train.batch_predictor import BatchPredictor
        >>> # Create a dummy predictor that always returns `42` for each input.
        >>> class DummyPredictor(Predictor):
        ...     @classmethod
        ...     def from_checkpoint(cls, checkpoint, **kwargs):
        ...         return DummyPredictor()
        ...     def predict(self, data, **kwargs):
        ...         return pd.DataFrame({"a": [42] * len(data)})
        >>> # Create a batch predictor for this dummy predictor.
        >>> batch_pred = BatchPredictor( # doctest: +SKIP
        ...     Checkpoint.from_dict({"x": 0}), DummyPredictor)
        >>> # Create a dummy dataset.
        >>> ds = ray.data.range_tensor(1000, parallelism=4) # doctest: +SKIP
        >>> # Setup a prediction pipeline.
        >>> print(batch_pred.predict_pipelined( # doctest: +SKIP
        ...     ds, blocks_per_window=1))
        DatasetPipeline(num_windows=4, num_stages=3)

    Args:
        data: Ray dataset to run batch prediction on.
        blocks_per_window: The window size (parallelism) in blocks.
            Increasing window size increases pipeline throughput, but also
            increases the latency to initial output, since it decreases the
            length of the pipeline. Setting this to infinity effectively
            disables pipelining.
        bytes_per_window: Specify the window size in bytes instead of blocks.
            This will be treated as an upper bound for the window size, but
            each window will still include at least one block. This is
            mutually exclusive with ``blocks_per_window``.
        kwargs: Keyword arguments passed to BatchPredictor.predict().

    Returns:
        DatasetPipeline that generates scoring results.

    Raises:
        ValueError: If neither ``blocks_per_window`` nor ``bytes_per_window``
            is given.
    """
    if blocks_per_window is None and bytes_per_window is None:
        raise ValueError(
            "It is required to specify one of `blocks_per_window` or "
            "`bytes_per_window`."
        )

    pipe = data.window(
        blocks_per_window=blocks_per_window, bytes_per_window=bytes_per_window
    )

    # Forward the caller's options (batch_size, worker counts, predictor
    # kwargs, ...). Dropping them here would silently run with predict()'s
    # defaults.
    return self.predict(pipe, **kwargs)