def _reduce(ds: ray.data.Dataset):
    # Pickle reducer for Datasets: serialize the dataset via its lineage
    # unless this call is itself happening inside `serialize_lineage`
    # (detected by scanning the stack), which would otherwise recurse.
    tb_list = traceback.format_list(traceback.extract_stack())
    _already_in_out_of_band_serialization = False
    for tb in tb_list:
        # TODO(xwjiang): Let's make this less hacky.
        if "serialize_lineage" in tb:
            _already_in_out_of_band_serialization = True
            break
    if not _already_in_out_of_band_serialization:
        return _deserialize_and_fully_execute_if_needed, (ds.serialize_lineage(),)
    else:
        return ds.__reduce__()
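
The function above follows the standard pickle reducer protocol: it returns a `(callable, args)` pair that pickle uses to reconstruct the object. Ray registers it through its own serialization context; purely to illustrate that protocol, the stdlib `copyreg` hook below registers a reducer the same way (this registration call is an assumption for illustration, not Ray's actual wiring):

import copyreg

import ray

# Illustrative only: make pickle use _reduce for Datasets, so pickling a
# Dataset captures its lineage rather than its materialized blocks.
copyreg.pickle(ray.data.Dataset, _reduce)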
Example #2
    def predict(
        self,
        data: ray.data.Dataset,
        *,
        batch_size: int = 4096,
        min_scoring_workers: int = 1,
        max_scoring_workers: Optional[int] = None,
        num_cpus_per_worker: int = 1,
        num_gpus_per_worker: int = 0,
        ray_remote_args: Optional[Dict[str, Any]] = None,
        **predict_kwargs,
    ) -> ray.data.Dataset:
        """Run batch scoring on dataset.

        Args:
            data: Ray dataset to run batch prediction on.
            batch_size: Split the dataset into batches of this size for prediction.
            min_scoring_workers: Minimum number of scoring actors.
            max_scoring_workers: If set, the maximum number of scoring actors.
            num_cpus_per_worker: Number of CPUs to allocate per scoring worker.
            num_gpus_per_worker: Number of GPUs to allocate per scoring worker.
            ray_remote_args: Additional resource requirements to request from
                ray.
            predict_kwargs: Keyword arguments passed to the predictor's
                ``predict()`` method.

        Returns:
            Dataset containing scoring results.

        """
        # Capture as locals so the ScoringWrapper below does not capture
        # (and serialize) `self`.
        predictor_cls = self.predictor_cls
        checkpoint_ref = self.checkpoint_ref
        predictor_kwargs = self.predictor_kwargs

        class ScoringWrapper:
            def __init__(self):
                # Each scoring actor restores the predictor from the shared
                # checkpoint once, then reuses it for every batch.
                checkpoint = Checkpoint.from_object_ref(checkpoint_ref)
                self.predictor = predictor_cls.from_checkpoint(
                    checkpoint, **predictor_kwargs
                )

            def __call__(self, batch):
                prediction_output = self.predictor.predict(batch, **predict_kwargs)
                return convert_batch_type_to_pandas(prediction_output)

        compute = ray.data.ActorPoolStrategy(
            min_size=min_scoring_workers, max_size=max_scoring_workers
        )

        ray_remote_args = ray_remote_args or {}
        ray_remote_args["num_cpus"] = num_cpus_per_worker
        ray_remote_args["num_gpus"] = num_gpus_per_worker

        return data.map_batches(
            ScoringWrapper,
            compute=compute,
            batch_format="pandas",
            batch_size=batch_size,
            **ray_remote_args,
        )
Example #3
    def _create_async_parallel_reader(self, dataset: ray.data.Dataset,
                                      num_threads: int):
        q = queue.Queue(maxsize=100)  # Bounded to apply backpressure to producers.

        batch_size = self.batch_size

        to_tensors = self._to_tensors_fn()
        splits = dataset.split(n=num_threads)

        def producer(i):
            for batch in (splits[i].map_batches(
                    to_tensors, batch_format="pandas").iter_batches(
                        prefetch_blocks=0,
                        batch_size=batch_size,
                        batch_format="pandas")):
                res = self._prepare_batch(batch)
                q.put(res)
            q.put(None)  # Sentinel: this producer is exhausted.

        def async_parallel_read():
            threads = [
                threading.Thread(target=producer, args=(i, ))
                for i in range(num_threads)
            ]
            for t in threads:
                t.start()

            active_threads = num_threads
            while True:
                batch = q.get(block=True)
                if batch is None:
                    # A producer finished; consume its sentinel rather than
                    # yielding it downstream.
                    active_threads -= 1
                    if active_threads == 0:
                        break
                    continue
                yield batch

            for t in threads:
                t.join()

        return async_parallel_read()
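
The heart of this reader is a bounded-queue fan-in with one `None` sentinel per producer. Below is a distilled, framework-free sketch of the same pattern (`parallel_iter` is a made-up name for illustration):

import queue
import threading

def parallel_iter(iterables):
    # Fan-in: merge several iterables into one stream using a bounded
    # queue and one producer thread per source, mirroring the reader above.
    q = queue.Queue(maxsize=100)

    def producer(it):
        for item in it:
            q.put(item)
        q.put(None)  # Sentinel: this producer is done.

    threads = [threading.Thread(target=producer, args=(it,)) for it in iterables]
    for t in threads:
        t.start()

    active = len(threads)
    while active:
        item = q.get()
        if item is None:
            active -= 1
            continue  # Skip sentinels instead of yielding them.
        yield item

    for t in threads:
        t.join()

# Example: merge three ranges; arrival order across sources is nondeterministic.
merged = sorted(parallel_iter([range(3), range(3, 6), range(6, 9)]))
assert merged == list(range(9))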
Example #4
    def _preprocess(
            self, ds: ray.data.Dataset, inferencing: bool
    ) -> Tuple[ray.data.Dataset, Optional[ray.data.Dataset]]:
        print(
            "\nStep 1: Dropping nulls, creating new_col, updating feature_1\n")

        def batch_transformer(df: pd.DataFrame):
            # Disable chained assignment warning.
            pd.options.mode.chained_assignment = None

            # Drop nulls.
            df = df.dropna(subset=["nullable_feature"])

            # Add new column.
            df["new_col"] = (df["feature_1"] - 2 * df["feature_2"] +
                             df["feature_3"]) / 3.0

            # Transform column.
            df["feature_1"] = 2.0 * df["feature_1"] + 0.1

            return df

        ds = ds.map_batches(batch_transformer, batch_format="pandas")

        print("\nStep 2: Precalculating fruit-grouped mean for new column and "
              "for one-hot encoding (latter only uses fruit groups)\n")
        fruit_means = {
            r["fruit"]: r["mean(feature_1)"]
            for r in ds.groupby("fruit").mean("feature_1").take_all()
        }

        print("\nStep 3: Create mean_by_fruit as mean of feature_1 groupby "
              "fruit; one-hot encode fruit column\n")

        if inferencing:
            assert self.fruits is not None
        else:
            assert self.fruits is None
            self.fruits = list(fruit_means.keys())

        fruit_one_hots = {
            fruit: collections.defaultdict(int, **{fruit: 1})
            for fruit in self.fruits
        }

        def batch_transformer(df: pd.DataFrame):
            # Add column containing the feature_1-mean of the fruit groups.
            df["mean_by_fruit"] = df["fruit"].map(fruit_means)

            # One-hot encode the fruit column.
            for fruit, one_hot in fruit_one_hots.items():
                df[f"fruit_{fruit}"] = df["fruit"].map(one_hot)

            # Drop the fruit column, which is no longer needed.
            df.drop(columns="fruit", inplace=True)

            return df

        ds = ds.map_batches(batch_transformer, batch_format="pandas")

        if inferencing:
            print("\nStep 4: Standardize inference dataset\n")
            assert self.standard_stats is not None
        else:
            assert self.standard_stats is None

            print("\nStep 4a: Split training dataset into train-test split\n")

            # Split into train/test datasets.
            split_index = int(0.9 * ds.count())
            # Split into 90% training set, 10% test set.
            train_ds, test_ds = ds.split_at_indices([split_index])

            print("\nStep 4b: Precalculate training dataset stats for "
                  "standard scaling\n")
            # Calculate stats needed for standard scaling feature columns.
            feature_columns = [
                col for col in train_ds.schema().names if col != "label"
            ]
            standard_aggs = [
                agg(on=col) for col in feature_columns for agg in (Mean, Std)
            ]
            self.standard_stats = train_ds.aggregate(*standard_aggs)
            print("\nStep 4c: Standardize training dataset\n")

        # Standard scaling of feature columns. Alias to a local so the UDF
        # below does not capture `self`.
        standard_stats = self.standard_stats

        def batch_standard_scaler(df: pd.DataFrame):
            def column_standard_scaler(s: pd.Series):
                if s.name == "label":
                    # Don't scale the label column.
                    return s
                s_mean = standard_stats[f"mean({s.name})"]
                s_std = standard_stats[f"std({s.name})"]
                return (s - s_mean) / s_std

            return df.transform(column_standard_scaler)

        if inferencing:
            # Apply standard scaling to inference dataset.
            inference_ds = ds.map_batches(batch_standard_scaler,
                                          batch_format="pandas")
            return inference_ds, None
        else:
            # Apply standard scaling to both training dataset and test dataset.
            train_ds = train_ds.map_batches(batch_standard_scaler,
                                            batch_format="pandas")
            test_ds = test_ds.map_batches(batch_standard_scaler,
                                          batch_format="pandas")
            return train_ds, test_ds
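
The `defaultdict` one-hot trick in Step 3 is easy to miss; here is that step in isolation as a standalone pandas sketch (the data values are made up):

import collections

import pandas as pd

# Mapping a column through defaultdict(int, {fruit: 1}) yields 1 for the
# matching fruit and 0 for every other value, i.e. one one-hot column per
# fruit. pandas uses a dict subclass's __missing__ for unmapped values.
df = pd.DataFrame({"fruit": ["apple", "pear", "apple", "banana"]})
fruit_one_hots = {
    fruit: collections.defaultdict(int, **{fruit: 1})
    for fruit in ["apple", "banana", "pear"]
}
for fruit, one_hot in fruit_one_hots.items():
    df[f"fruit_{fruit}"] = df["fruit"].map(one_hot)
print(df)  # columns: fruit, fruit_apple, fruit_banana, fruit_pear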
Example #5
    def predict_pipelined(
        self,
        data: ray.data.Dataset,
        *,
        blocks_per_window: Optional[int] = None,
        bytes_per_window: Optional[int] = None,
        **kwargs,
    ) -> ray.data.DatasetPipeline:
        """Setup a prediction pipeline for batch scoring.

        Unlike `predict()`, this generates a DatasetPipeline object and does not
        perform execution. Execution can be triggered by pulling from the pipeline.

        This is a convenience wrapper around calling `.window()` on the Dataset
        prior to passing it to `BatchPredictor.predict()`.

        Examples:
            >>> import pandas as pd
            >>> import ray
            >>> from ray.air import Checkpoint
            >>> from ray.train.predictor import Predictor
            >>> from ray.train.batch_predictor import BatchPredictor
            >>> # Create a dummy predictor that always returns `42` for each input.
            >>> class DummyPredictor(Predictor):
            ...     @classmethod
            ...     def from_checkpoint(cls, checkpoint, **kwargs):
            ...         return DummyPredictor()
            ...     def predict(self, data, **kwargs):
            ...         return pd.DataFrame({"a": [42] * len(data)})
            >>> # Create a batch predictor for this dummy predictor.
            >>> batch_pred = BatchPredictor( # doctest: +SKIP
            ...     Checkpoint.from_dict({"x": 0}), DummyPredictor)
            >>> # Create a dummy dataset.
            >>> ds = ray.data.range_tensor(1000, parallelism=4) # doctest: +SKIP
            >>> # Setup a prediction pipeline.
            >>> print(batch_pred.predict_pipelined( # doctest: +SKIP
            ...     ds, blocks_per_window=1))
            DatasetPipeline(num_windows=4, num_stages=3)

        Args:
            data: Ray dataset to run batch prediction on.
            blocks_per_window: The window size (parallelism) in blocks.
                Increasing window size increases pipeline throughput, but also
                increases the latency to initial output, since it decreases the
                length of the pipeline. Setting this to infinity effectively
                disables pipelining.
            bytes_per_window: Specify the window size in bytes instead of blocks.
                This will be treated as an upper bound for the window size, but each
                window will still include at least one block. This is mutually
                exclusive with ``blocks_per_window``.
            kwargs: Keyword arguments passed to BatchPredictor.predict().

        Returns:
            DatasetPipeline that generates scoring results.
        """

        if blocks_per_window is None and bytes_per_window is None:
            raise ValueError(
                "It is required to specify one of `blocks_per_window` or "
                "`bytes_per_window`.")

        pipe = data.window(blocks_per_window=blocks_per_window,
                           bytes_per_window=bytes_per_window)

        return self.predict(pipe, **kwargs)
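
A sketch of consuming the resulting pipeline, reusing `batch_pred` and `ds` from the doctest above; the `iter_batches` arguments are illustrative:

# Hypothetical continuation of the doctest: iterating the pipeline is what
# triggers execution, window by window.
pipe = batch_pred.predict_pipelined(ds, blocks_per_window=1)
for batch in pipe.iter_batches(batch_size=256):
    print(batch)  # Each batch streams out as its window finishes scoring.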