Code example #1
def delete_at_uri(uri: str):
    _assert_pyarrow_installed()

    fs, bucket_path = get_fs_and_path(uri)
    if not fs:
        raise ValueError(
            f"Could not clear URI contents: "
            f"URI `{uri}` is not a valid or supported cloud target. "
            f"Hint: {fs_hint(uri)}")

    try:
        fs.delete_dir(bucket_path)
    except Exception as e:
        logger.warning(f"Caught exception when clearing URI `{uri}`: {e}")
Code example #2
def _predict(model: xgb.Booster,
             data: RayDMatrix,
             num_actors: int = 4,
             cpus_per_actor: int = 0,
             gpus_per_actor: int = 0,
             resources_per_actor: Optional[Dict] = None,
             **kwargs):
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor) for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for actor in actors:
        wait_load.extend(_trigger_data_load(actor, data, []))

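    # Block until every actor has finished loading its data shard.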
    try:
        ray.get(wait_load)
    except Exception as exc:
        logger.warning(f"Caught an error during prediction: {str(exc)}")
        _shutdown(actors, force=True)
        raise

    # Put model into object store
    model_ref = ray.put(model)

    logger.info("[RayXGBoost] Starting XGBoost prediction.")

    # Predict on each actor's data shard
    fut = [actor.predict.remote(model_ref, data, **kwargs) for actor in actors]

    try:
        actor_results = ray.get(fut)
    except Exception as exc:
        logger.warning(f"Caught an error during prediction: {str(exc)}")
        _shutdown(remote_workers=actors, force=True)
        raise

    _shutdown(remote_workers=actors, force=False)

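    # Stitch the per-actor predictions back together according to the
    # data's sharding scheme.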
    return combine_data(data.sharding, actor_results)
Code example #3
    def get_data_source(self) -> Type[DataSource]:
        if self.data_source:
            return self.data_source

        data_source = None
        for source in data_sources:
            if not source.supports_central_loading:
                continue

            try:
                if source.is_data_type(self.data, self.filetype):
                    data_source = source
                    break
            except Exception as exc:
                # If checking the data throws an exception, the data source
                # is not available.
                logger.warning(
                    f"Checking data source {source.__name__} failed "
                    f"with exception: {exc}")
                continue

        if not data_source:
            raise ValueError(
                "Unknown data source type: {} with FileType: {}."
                "\nFIX THIS by passing a supported data type. Supported "
                "data types include pandas.DataFrame, pandas.Series, "
                "np.ndarray, and CSV/Parquet file paths. If you specify a "
                "file, path, consider passing the `filetype` argument to "
                "specify the type of the source. Use the `RayFileType` "
                "enum for that. If using Modin, Dask, or Petastorm, "
                "make sure the library is installed.".format(
                    type(self.data), self.filetype))

        if self.label is not None and not isinstance(self.label, str) and \
                type(self.data) != type(self.label):  # noqa: E721
            # Label is an object of a different type than the main data.
            # We have to make sure they are compatible
            if not data_source.is_data_type(self.label):
                raise ValueError(
                    "The passed `data` and `label` types are not compatible."
                    "\nFIX THIS by passing the same types to the "
                    "`RayDMatrix` - e.g. a `pandas.DataFrame` as `data` "
                    "and `label`. The `label` can always be a string. Got "
                    "{} for the main data and {} for the label.".format(
                        type(self.data), type(self.label)))

        self.data_source = data_source
        self._cached_n = data_source.get_n(self.data)
        return self.data_source
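For context, a minimal sketch of the central loading path this method serves (file and column names are hypothetical):

import pandas as pd

from xgboost_ray import RayDMatrix

# In-memory data is loaded centrally and sharded across actors later.
df = pd.read_csv("train.csv")  # hypothetical file
dtrain = RayDMatrix(df, label="target")  # label passed as a column name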
Code example #4
def clear_bucket(bucket: str):
    if not is_cloud_target(bucket):
        raise ValueError(
            f"Could not clear bucket contents: "
            f"Bucket `{bucket}` is not a valid or supported cloud target.")

    try:
        if bucket.startswith(S3_PREFIX):
            subprocess.check_call(
                ["aws", "s3", "rm", "--recursive", "--quiet", bucket])
        elif bucket.startswith(GS_PREFIX):
            subprocess.check_call(["gsutil", "-m", "rm", "-f", "-r", bucket])
        elif bucket.startswith(HDFS_PREFIX):
            subprocess.check_call(["hdfs", "dfs", "-rm", "-r", bucket])

    except Exception as e:
        logger.warning(
            f"Caught exception when clearing bucket `{bucket}`: {e}")
Code example #5
def _shutdown(remote_workers: List[ActorHandle],
              queue: Optional[Queue] = None,
              force: bool = False):
    if force:
        logger.debug(f"Killing {len(remote_workers)} workers.")
        for worker in remote_workers:
            ray.kill(worker)
        if queue is not None:
            logger.debug("Killing Queue.")
            ray.kill(queue.actor)
    else:
        try:
            for worker in remote_workers:
                worker.__ray_terminate__.remote()
            if queue is not None:
                queue.actor.__ray_terminate__.remote()
        except RayActorError:
            logger.warning("Failed to shutdown gracefully, forcing a "
                           "shutdown.")
            _shutdown(remote_workers, queue=queue, force=True)
Code example #6
    def run(self):
        while True:
            # This thread won't be broken by exceptions.
            try:
                self.write()
            except Exception as e:
                logger.warning(
                    "Writing a service discovery file, {}, failed.".format(
                        self.writer.get_target_file_name()))
                logger.warning(traceback.format_exc())
                logger.warning(f"Error message: {e}")
            time.sleep(self.default_service_discovery_flush_period)
Code example #7
File: matrix.py Project: ijrsvt/xgboost_ray
    def get_data_source(self) -> Type[DataSource]:
        if self.data_source:
            return self.data_source

        invalid_data = False
        if isinstance(self.data, str):
            if self.filetype == RayFileType.PETASTORM:
                self.data = [self.data]
            elif os.path.isdir(self.data):
                if self.filetype == RayFileType.PARQUET:
                    self.data = sorted(
                        glob.glob(f"{self.data}/**/*.parquet", recursive=True))
                elif self.filetype == RayFileType.CSV:
                    self.data = sorted(
                        glob.glob(f"{self.data}/**/*.csv", recursive=True))
                else:
                    invalid_data = True
            elif os.path.exists(self.data):
                self.data = [self.data]
            else:
                invalid_data = True

        # Todo (krfricke): It would be good to have a more general way to
        # check for compatibility here. Combine with test below?
        if not isinstance(self.data, (Iterable, MLDataset)) or invalid_data:
            raise ValueError(
                f"Distributed data loading only works with already "
                f"distributed datasets. These should be specified through a "
                f"list of locations (or a single string). "
                f"Got: {type(self.data)}."
                f"\nFIX THIS by passing a list of files (e.g. on S3) to the "
                f"RayDMatrix.")

        if self.label is not None and not isinstance(self.label, str):
            raise ValueError(
                f"Invalid `label` value for distributed datasets: "
                f"{self.label}. Only strings are supported. "
                f"\nFIX THIS by passing a string indicating the label "
                f"column of the dataset as the `label` argument.")

        data_source = None
        for source in data_sources:
            if not source.supports_distributed_loading:
                continue

            try:
                if source.is_data_type(self.data, self.filetype):
                    data_source = source
                    break
            except Exception as exc:
                # If checking the data throws an exception, the data source
                # is not available.
                logger.warning(
                    f"Checking data source {source.__name__} failed "
                    f"with exception: {exc}")
                continue

        if not data_source:
            raise ValueError(
                f"Invalid data source type: {type(self.data)} "
                f"with FileType: {self.filetype} for a distributed dataset."
                "\nFIX THIS by passing a supported data type. Supported "
                "data types for distributed datasets are a list of "
                "CSV or Parquet sources as well as Ray MLDatasets.")

        self.data_source = data_source
        self._cached_n = data_source.get_n(self.data)
        return self.data_source
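A hedged usage sketch of the distributed loading path (paths are hypothetical; distributed loading expects a list of shard locations and a string label column):

from xgboost_ray import RayDMatrix, RayFileType

# Hypothetical shard list; each actor loads its own subset of these files.
dtrain = RayDMatrix(
    ["s3://my-bucket/data/part-00.parquet",
     "s3://my-bucket/data/part-01.parquet"],
    label="target",
    filetype=RayFileType.PARQUET,
    distributed=True)  # usually inferred for lists of remote paths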
Code example #8
try:
    # Optional dependencies: fall back with a warning if unavailable.
    from opencensus.stats import aggregation
    from opencensus.stats import measure as measure_module
    from opencensus.stats import stats as stats_module
    from opencensus.stats.view import View
    from opencensus.stats.view_data import ViewData
    from opencensus.stats.aggregation_data import (CountAggregationData,
                                                   DistributionAggregationData,
                                                   LastValueAggregationData)
    from opencensus.metrics.export.value import ValueDouble
    from opencensus.tags import tag_key as tag_key_module
    from opencensus.tags import tag_map as tag_map_module
    from opencensus.tags import tag_value as tag_value_module
except (ModuleNotFoundError, ImportError):
    gpustat = None
    logger.warning("`gpustat` package is not installed. GPU monitoring is "
                   "not available. In Ray 1.4+, the Ray CLI, autoscaler, and "
                   "dashboard will only be usable via `pip install 'ray["
                   "default]'`. Please update your install command")

import ray
from ray._private import services

import ray._private.prometheus_exporter as prometheus_exporter
from ray.core.generated.metrics_pb2 import Metric

logger = logging.getLogger(__name__)


class Gauge(View):
    """Gauge representation of opencensus view.

    This class is used to collect process metrics from the reporter agent.
    """
Code example #9
File: main.py Project: amogkam/xgboost_ray
def predict(model: xgb.Booster,
            data: RayDMatrix,
            num_actors: int = 4,
            cpus_per_actor: int = 0,
            gpus_per_actor: int = 0,
            resources_per_actor: Optional[Dict] = None,
            max_actor_restarts: int = 0,
            **kwargs) -> Optional[np.ndarray]:
    """Distributed XGBoost predict via Ray.

    This function will connect to a Ray cluster, create ``num_actors``
    remote actors, send data shards to them, and have them predict labels
    using an XGBoost booster model. The results are then combined and
    returned.

    Args:
        model (xgb.Booster): Booster object to call for prediction.
        data (RayDMatrix): Data object containing the prediction data.
        num_actors (int): Number of parallel Ray actors.
        cpus_per_actor (int): Number of CPUs to be used per Ray actor.
        gpus_per_actor (int): Number of GPUs to be used per Ray actor.
        resources_per_actor (Optional[Dict]): Dict of additional resources
            required per Ray actor.
        max_actor_restarts (int): Number of retries when Ray actors fail.
            Defaults to 0 (no retries). Set to -1 for unlimited retries.

    Returns: ``np.ndarray`` containing the predicted labels.

    """
    max_actor_restarts = max_actor_restarts \
        if max_actor_restarts >= 0 else float("inf")
    _assert_ray_support()

    if not isinstance(data, RayDMatrix):
        raise ValueError(
            "The `data` argument passed to `train()` is not a RayDMatrix, "
            "but of type {}. "
            "\nFIX THIS by instantiating a RayDMatrix first: "
            "`data = RayDMatrix(data=data)`.".format(type(data)))

    tries = 0
    while tries <= max_actor_restarts:
        try:
            return _predict(model,
                            data,
                            num_actors=num_actors,
                            cpus_per_actor=cpus_per_actor,
                            gpus_per_actor=gpus_per_actor,
                            resources_per_actor=resources_per_actor,
                            **kwargs)
        except RayActorError:
            if tries + 1 <= max_actor_restarts:
                logger.warning(
                    "A Ray actor died during prediction. Trying to restart "
                    "prediction from scratch.")
            else:
                raise RuntimeError(
                    "A Ray actor died during prediction and the maximum "
                    "number of retries ({}) is exhausted.".format(
                        max_actor_restarts))
            tries += 1
    return None
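A minimal end-to-end sketch (the model file and data are hypothetical):

import pandas as pd
import xgboost as xgb

from xgboost_ray import RayDMatrix, predict

bst = xgb.Booster(model_file="model.xgb")    # hypothetical model file
dpred = RayDMatrix(pd.read_csv("test.csv"))  # no label needed for inference
preds = predict(bst, dpred, num_actors=4)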
Code example #10
File: main.py Project: amogkam/xgboost_ray
def train(params: Dict,
          dtrain: RayDMatrix,
          *args,
          evals=(),
          evals_result: Optional[Dict] = None,
          num_actors: int = 4,
          cpus_per_actor: int = 0,
          gpus_per_actor: int = -1,
          resources_per_actor: Optional[Dict] = None,
          max_actor_restarts: int = 0,
          **kwargs):
    """Distributed XGBoost training via Ray.

    This function will connect to a Ray cluster, create ``num_actors``
    remote actors, send data shards to them, and have them train an
    XGBoost model. Gradient histograms are aggregated across actors
    via Rabit's all-reduce protocol.

    Args:
        params (Dict): parameter dict passed to ``xgboost.train()``
        dtrain (RayDMatrix): Data object containing the training data.
        evals (Union[List[Tuple], Tuple]): ``evals`` tuple passed to
            ``xgboost.train()``.
        evals_result (Optional[Dict]): Dict to store evaluation results in.
        num_actors (int): Number of parallel Ray actors.
        cpus_per_actor (int): Number of CPUs to be used per Ray actor.
        gpus_per_actor (int): Number of GPUs to be used per Ray actor.
        resources_per_actor (Optional[Dict]): Dict of additional resources
            required per Ray actor.
        max_actor_restarts (int): Number of retries when Ray actors fail.
            Defaults to 0 (no retries). Set to -1 for unlimited retries.

    Keyword Args:
        checkpoint_prefix (str): Prefix for the checkpoint filenames.
            Defaults to ``.xgb_ray_{time.time()}``.
        checkpoint_path (str): Path to store checkpoints at. Defaults to
            ``/tmp``.
        checkpoint_frequency (int): How often to save checkpoints. Defaults
            to ``5``.

    Returns: An ``xgboost.Booster`` object.
    """
    max_actor_restarts = max_actor_restarts \
        if max_actor_restarts >= 0 else float("inf")
    _assert_ray_support()

    if not isinstance(dtrain, RayDMatrix):
        raise ValueError(
            "The `dtrain` argument passed to `train()` is not a RayDMatrix, "
            "but of type {}. "
            "\nFIX THIS by instantiating a RayDMatrix first: "
            "`dtrain = RayDMatrix(data=data, label=label)`.".format(
                type(dtrain)))

    if not dtrain.loaded and not dtrain.distributed:
        dtrain.load_data(num_actors)
    for (deval, name) in evals:
        if not deval.loaded and not deval.distributed:
            deval.load_data(num_actors)

    checkpoint_prefix = kwargs.pop("checkpoint_prefix",
                                   f".xgb_ray_{time.time()}")
    checkpoint_path = kwargs.pop("checkpoint_path", "/tmp")
    checkpoint_frequency = kwargs.pop("checkpoint_frequency", 5)

    bst = None
    train_evals_result = {}

    tries = 0
    while tries <= max_actor_restarts:
        try:
            bst, train_evals_result = _train(
                params,
                dtrain,
                *args,
                evals=evals,
                num_actors=num_actors,
                cpus_per_actor=cpus_per_actor,
                gpus_per_actor=gpus_per_actor,
                resources_per_actor=resources_per_actor,
                checkpoint_prefix=checkpoint_prefix,
                checkpoint_path=checkpoint_path,
                checkpoint_frequency=checkpoint_frequency,
                **kwargs)
            break
        except RayActorError:
            if tries + 1 <= max_actor_restarts:
                logger.warning(
                    "A Ray actor died during training. Trying to restart "
                    "and continue training from last checkpoint.")
            else:
                raise RuntimeError(
                    "A Ray actor died during training and the maximum number "
                    "of retries ({}) is exhausted. Checkpoints have been "
                    "stored at `{}` with prefix `{}` - you can pass these "
                    "parameters as `checkpoint_path` and `checkpoint_prefix` "
                    "to the `train()` function to try to continue "
                    "the training.".format(max_actor_restarts, checkpoint_path,
                                           checkpoint_prefix))
            tries += 1
    if isinstance(evals_result, dict):
        evals_result.update(train_evals_result)
    return bst
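A minimal usage sketch (file and column names are hypothetical):

import pandas as pd

from xgboost_ray import RayDMatrix, train

df = pd.read_csv("train.csv")  # hypothetical file
dtrain = RayDMatrix(df, label="target")

evals_result = {}
bst = train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    evals=[(dtrain, "train")],
    evals_result=evals_result,
    num_actors=4,
    cpus_per_actor=1,
    max_actor_restarts=1)
bst.save_model("model.xgb")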