def delete_at_uri(uri: str):
    _assert_pyarrow_installed()

    fs, bucket_path = get_fs_and_path(uri)
    if not fs:
        raise ValueError(
            f"Could not clear URI contents: "
            f"URI `{uri}` is not a valid or supported cloud target. "
            f"Hint: {fs_hint(uri)}")

    try:
        fs.delete_dir(bucket_path)
    except Exception as e:
        logger.warning(f"Caught exception when clearing URI `{uri}`: {e}")
def _predict(model: xgb.Booster,
             data: RayDMatrix,
             num_actors: int = 4,
             cpus_per_actor: int = 0,
             gpus_per_actor: int = 0,
             resources_per_actor: Optional[Dict] = None,
             **kwargs):
    _assert_ray_support()

    if not ray.is_initialized():
        ray.init()

    # Create remote actors
    actors = [
        _create_actor(i, num_actors, cpus_per_actor, gpus_per_actor,
                      resources_per_actor) for i in range(num_actors)
    ]
    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

    # Split data across workers
    wait_load = []
    for actor in actors:
        wait_load.extend(_trigger_data_load(actor, data, []))

    try:
        ray.get(wait_load)
    except Exception as exc:
        logger.warning(f"Caught an error during prediction: {str(exc)}")
        _shutdown(actors, force=True)
        raise

    # Put model into object store
    model_ref = ray.put(model)

    logger.info("[RayXGBoost] Starting XGBoost prediction.")

    # Predict
    fut = [
        actor.predict.remote(model_ref, data, **kwargs) for actor in actors
    ]

    try:
        actor_results = ray.get(fut)
    except Exception as exc:
        logger.warning(f"Caught an error during prediction: {str(exc)}")
        _shutdown(remote_workers=actors, force=True)
        raise

    _shutdown(remote_workers=actors, force=False)

    return combine_data(data.sharding, actor_results)
def get_data_source(self) -> Type[DataSource]:
    if self.data_source:
        return self.data_source

    data_source = None
    for source in data_sources:
        if not source.supports_central_loading:
            continue

        try:
            if source.is_data_type(self.data, self.filetype):
                data_source = source
                break
        except Exception as exc:
            # If checking the data throws an exception, the data source
            # is not available.
            logger.warning(
                f"Checking data source {source.__name__} failed "
                f"with exception: {exc}")
            continue

    if not data_source:
        raise ValueError(
            "Unknown data source type: {} with FileType: {}."
            "\nFIX THIS by passing a supported data type. Supported "
            "data types include pandas.DataFrame, pandas.Series, "
            "np.ndarray, and CSV/Parquet file paths. If you specify a "
            "file path, consider passing the `filetype` argument to "
            "specify the type of the source. Use the `RayFileType` "
            "enum for that. If using Modin, Dask, or Petastorm, "
            "make sure the library is installed.".format(
                type(self.data), self.filetype))

    if self.label is not None and not isinstance(self.label, str) and \
            type(self.data) != type(self.label):  # noqa: E721
        # The label is an object of a different type than the main data.
        # We have to make sure they are compatible.
        if not data_source.is_data_type(self.label):
            raise ValueError(
                "The passed `data` and `label` types are not compatible."
                "\nFIX THIS by passing the same types to the "
                "`RayDMatrix` - e.g. a `pandas.DataFrame` as `data` "
                "and `label`. The `label` can always be a string. Got "
                "{} for the main data and {} for the label.".format(
                    type(self.data), type(self.label)))

    self.data_source = data_source
    self._cached_n = data_source.get_n(self.data)

    return self.data_source
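# A minimal sketch of the central (non-distributed) loading path described by
# the error messages above. Assumptions: `RayDMatrix` and `RayFileType` are
# importable from the package's top level, the constructor accepts an optional
# `label` column name and a `filetype` hint, and the CSV path is hypothetical.
import pandas as pd
from xgboost_ray import RayDMatrix, RayFileType

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "target": [0, 1, 0]})

# In-memory data: a DataFrame with the label referenced by column name.
dmatrix = RayDMatrix(df, label="target")

# A single file path (hypothetical): pass `filetype` explicitly when the
# extension is ambiguous, using the `RayFileType` enum mentioned above.
dmatrix_csv = RayDMatrix(
    "data/train.csv", label="target", filetype=RayFileType.CSV)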
def clear_bucket(bucket: str):
    if not is_cloud_target(bucket):
        raise ValueError(
            f"Could not clear bucket contents: "
            f"Bucket `{bucket}` is not a valid or supported cloud target.")

    try:
        if bucket.startswith(S3_PREFIX):
            subprocess.check_call(
                ["aws", "s3", "rm", "--recursive", "--quiet", bucket])
        elif bucket.startswith(GS_PREFIX):
            subprocess.check_call(["gsutil", "-m", "rm", "-f", "-r", bucket])
        elif bucket.startswith(HDFS_PREFIX):
            subprocess.check_call(["hdfs", "dfs", "-rm", "-r", bucket])
    except Exception as e:
        logger.warning(
            f"Caught exception when clearing bucket `{bucket}`: {e}")
def _shutdown(remote_workers: List[ActorHandle],
              queue: Optional[Queue] = None,
              force: bool = False):
    if force:
        logger.debug(f"Killing {len(remote_workers)} workers.")
        for worker in remote_workers:
            ray.kill(worker)
        if queue is not None:
            logger.debug("Killing Queue.")
            ray.kill(queue.actor)
    else:
        try:
            [worker.__ray_terminate__.remote() for worker in remote_workers]
            if queue is not None:
                queue.actor.__ray_terminate__.remote()
        except RayActorError:
            logger.warning("Failed to shutdown gracefully, forcing a "
                           "shutdown.")
            _shutdown(remote_workers, force=True)
def run(self):
    while True:
        # This thread won't be broken by exceptions.
        try:
            self.write()
        except Exception as e:
            logger.warning("Writing the service discovery file {} "
                           "failed.".format(
                               self.writer.get_target_file_name()))
            logger.warning(traceback.format_exc())
            logger.warning(f"Error message: {e}")
        time.sleep(self.default_service_discovery_flush_period)
def get_data_source(self) -> Type[DataSource]:
    if self.data_source:
        return self.data_source

    invalid_data = False
    if isinstance(self.data, str):
        if self.filetype == RayFileType.PETASTORM:
            self.data = [self.data]
        elif os.path.isdir(self.data):
            if self.filetype == RayFileType.PARQUET:
                self.data = sorted(glob.glob(f"{self.data}/**/*.parquet"))
            elif self.filetype == RayFileType.CSV:
                self.data = sorted(glob.glob(f"{self.data}/**/*.csv"))
            else:
                invalid_data = True
        elif os.path.exists(self.data):
            self.data = [self.data]
        else:
            invalid_data = True

    # Todo (krfricke): It would be good to have a more general way to
    # check for compatibility here. Combine with test below?
    if not isinstance(self.data, (Iterable, MLDataset)) or invalid_data:
        raise ValueError(
            f"Distributed data loading only works with already "
            f"distributed datasets. These should be specified through a "
            f"list of locations (or a single string). "
            f"Got: {type(self.data)}."
            f"\nFIX THIS by passing a list of files (e.g. on S3) to the "
            f"RayDMatrix.")

    if self.label is not None and not isinstance(self.label, str):
        raise ValueError(
            f"Invalid `label` value for distributed datasets: "
            f"{self.label}. Only strings are supported. "
            f"\nFIX THIS by passing a string indicating the label "
            f"column of the dataset as the `label` argument.")

    data_source = None
    for source in data_sources:
        if not source.supports_distributed_loading:
            continue

        try:
            if source.is_data_type(self.data, self.filetype):
                data_source = source
                break
        except Exception as exc:
            # If checking the data throws an exception, the data source
            # is not available.
            logger.warning(
                f"Checking data source {source.__name__} failed "
                f"with exception: {exc}")
            continue

    if not data_source:
        raise ValueError(
            f"Invalid data source type: {type(self.data)} "
            f"with FileType: {self.filetype} for a distributed dataset."
            "\nFIX THIS by passing a supported data type. Supported "
            "data types for distributed datasets are a list of "
            "CSV or Parquet sources as well as Ray MLDatasets.")

    self.data_source = data_source
    self._cached_n = data_source.get_n(self.data)

    return self.data_source
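# A hedged sketch of the distributed loading path checked above: a list of
# shard locations plus a string label column. The bucket, file names, and the
# `filetype` keyword are assumptions for illustration, not values prescribed
# by the source.
from xgboost_ray import RayDMatrix, RayFileType

dtrain = RayDMatrix(
    ["s3://my-bucket/part-000.parquet", "s3://my-bucket/part-001.parquet"],
    label="target",
    filetype=RayFileType.PARQUET)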
from opencensus.stats import aggregation
from opencensus.stats import measure as measure_module
from opencensus.stats import stats as stats_module
from opencensus.stats.view import View
from opencensus.stats.view_data import ViewData
from opencensus.stats.aggregation_data import (CountAggregationData,
                                               DistributionAggregationData,
                                               LastValueAggregationData)
from opencensus.metrics.export.value import ValueDouble
from opencensus.tags import tag_key as tag_key_module
from opencensus.tags import tag_map as tag_map_module
from opencensus.tags import tag_value as tag_value_module

try:
    # `gpustat` is an optional dependency used for GPU monitoring.
    import gpustat
except (ModuleNotFoundError, ImportError):
    gpustat = None
    logger.warning("`gpustat` package is not installed. GPU monitoring is "
                   "not available. In Ray 1.4+, the Ray CLI, autoscaler, and "
                   "dashboard will only be usable via "
                   "`pip install 'ray[default]'`. Please update your "
                   "install command.")

import ray
from ray._private import services
import ray._private.prometheus_exporter as prometheus_exporter
from ray.core.generated.metrics_pb2 import Metric

logger = logging.getLogger(__name__)


class Gauge(View):
    """Gauge representation of opencensus view.

    This class is used to collect process metrics from the reporter agent.
    """
def predict(model: xgb.Booster,
            data: RayDMatrix,
            num_actors: int = 4,
            cpus_per_actor: int = 0,
            gpus_per_actor: int = 0,
            resources_per_actor: Optional[Dict] = None,
            max_actor_restarts: int = 0,
            **kwargs) -> Optional[np.ndarray]:
    """Distributed XGBoost predict via Ray.

    This function will connect to a Ray cluster, create ``num_actors``
    remote actors, send data shards to them, and have them predict labels
    using an XGBoost booster model. The results are then combined and
    returned.

    Args:
        model (xgb.Booster): Booster object to call for prediction.
        data (RayDMatrix): Data object containing the prediction data.
        num_actors (int): Number of parallel Ray actors.
        cpus_per_actor (int): Number of CPUs to be used per Ray actor.
        gpus_per_actor (int): Number of GPUs to be used per Ray actor.
        resources_per_actor (Optional[Dict]): Dict of additional resources
            required per Ray actor.
        max_actor_restarts (int): Number of retries when Ray actors fail.
            Defaults to 0 (no retries). Set to -1 for unlimited retries.

    Returns:
        ``np.ndarray`` containing the predicted labels.
    """
    max_actor_restarts = max_actor_restarts \
        if max_actor_restarts >= 0 else float("inf")
    _assert_ray_support()

    if not isinstance(data, RayDMatrix):
        raise ValueError(
            "The `data` argument passed to `predict()` is not a RayDMatrix, "
            "but of type {}. "
            "\nFIX THIS by instantiating a RayDMatrix first: "
            "`data = RayDMatrix(data=data)`.".format(type(data)))

    tries = 0
    while tries <= max_actor_restarts:
        try:
            return _predict(
                model,
                data,
                num_actors=num_actors,
                cpus_per_actor=cpus_per_actor,
                gpus_per_actor=gpus_per_actor,
                resources_per_actor=resources_per_actor,
                **kwargs)
        except RayActorError:
            if tries + 1 <= max_actor_restarts:
                logger.warning(
                    "A Ray actor died during prediction. Trying to restart "
                    "prediction from scratch.")
            else:
                raise RuntimeError(
                    "A Ray actor died during prediction and the maximum "
                    "number of retries ({}) is exhausted.".format(
                        max_actor_restarts))
            tries += 1
    return None
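# A minimal usage sketch for the prediction API documented above. The model
# file and data path are hypothetical, and the booster is assumed to have been
# trained and saved earlier; argument values are illustrative, not defaults
# from the source.
import xgboost as xgb
from xgboost_ray import RayDMatrix, predict

bst = xgb.Booster(model_file="model.xgb")   # previously saved booster
dpred = RayDMatrix("data/inference.parquet")  # features only, no label
preds = predict(bst, dpred, num_actors=2, cpus_per_actor=1)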
def train(params: Dict,
          dtrain: RayDMatrix,
          *args,
          evals=(),
          evals_result: Optional[Dict] = None,
          num_actors: int = 4,
          cpus_per_actor: int = 0,
          gpus_per_actor: int = -1,
          resources_per_actor: Optional[Dict] = None,
          max_actor_restarts: int = 0,
          **kwargs):
    """Distributed XGBoost training via Ray.

    This function will connect to a Ray cluster, create ``num_actors``
    remote actors, send data shards to them, and have them train an
    XGBoost classifier. The XGBoost parameters will be shared and combined
    via Rabit's all-reduce protocol.

    Args:
        params (Dict): parameter dict passed to ``xgboost.train()``
        dtrain (RayDMatrix): Data object containing the training data.
        evals (Union[List[Tuple], Tuple]): ``evals`` tuple passed to
            ``xgboost.train()``.
        evals_result (Optional[Dict]): Dict to store evaluation results in.
        num_actors (int): Number of parallel Ray actors.
        cpus_per_actor (int): Number of CPUs to be used per Ray actor.
        gpus_per_actor (int): Number of GPUs to be used per Ray actor.
        resources_per_actor (Optional[Dict]): Dict of additional resources
            required per Ray actor.
        max_actor_restarts (int): Number of retries when Ray actors fail.
            Defaults to 0 (no retries). Set to -1 for unlimited retries.

    Keyword Args:
        checkpoint_prefix (str): Prefix for the checkpoint filenames.
            Defaults to ``.xgb_ray_{time.time()}``.
        checkpoint_path (str): Path to store checkpoints at. Defaults to
            ``/tmp``.
        checkpoint_frequency (int): How often to save checkpoints. Defaults
            to ``5``.

    Returns:
        An ``xgboost.Booster`` object.
    """
    max_actor_restarts = max_actor_restarts \
        if max_actor_restarts >= 0 else float("inf")
    _assert_ray_support()

    if not isinstance(dtrain, RayDMatrix):
        raise ValueError(
            "The `dtrain` argument passed to `train()` is not a RayDMatrix, "
            "but of type {}. "
            "\nFIX THIS by instantiating a RayDMatrix first: "
            "`dtrain = RayDMatrix(data=data, label=label)`.".format(
                type(dtrain)))

    if not dtrain.loaded and not dtrain.distributed:
        dtrain.load_data(num_actors)

    for (deval, name) in evals:
        if not deval.loaded and not deval.distributed:
            deval.load_data(num_actors)

    checkpoint_prefix = kwargs.pop("checkpoint_prefix",
                                   f".xgb_ray_{time.time()}")
    checkpoint_path = kwargs.pop("checkpoint_path", "/tmp")
    checkpoint_frequency = kwargs.pop("checkpoint_frequency", 5)

    bst = None
    train_evals_result = {}

    tries = 0
    while tries <= max_actor_restarts:
        try:
            bst, train_evals_result = _train(
                params,
                dtrain,
                *args,
                evals=evals,
                num_actors=num_actors,
                cpus_per_actor=cpus_per_actor,
                gpus_per_actor=gpus_per_actor,
                resources_per_actor=resources_per_actor,
                checkpoint_prefix=checkpoint_prefix,
                checkpoint_path=checkpoint_path,
                checkpoint_frequency=checkpoint_frequency,
                **kwargs)
            break
        except RayActorError:
            if tries + 1 <= max_actor_restarts:
                logger.warning(
                    "A Ray actor died during training. Trying to restart "
                    "and continue training from last checkpoint.")
            else:
                raise RuntimeError(
                    "A Ray actor died during training and the maximum number "
                    "of retries ({}) is exhausted. Checkpoints have been "
                    "stored at `{}` with prefix `{}` - you can pass these "
                    "parameters as `checkpoint_path` and `checkpoint_prefix` "
                    "to the `train()` function to try to continue "
                    "the training.".format(max_actor_restarts,
                                           checkpoint_path,
                                           checkpoint_prefix))
            tries += 1

    if isinstance(evals_result, dict):
        evals_result.update(train_evals_result)

    return bst
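# A compact end-to-end sketch of the training API documented above. The data,
# parameters, and file name are hypothetical; `train` and `RayDMatrix` are
# assumed to be exported from the package's top level. The layout of
# `evals_result` follows xgboost's usual {eval_name: {metric: [values]}}
# convention.
import numpy as np
import pandas as pd
from xgboost_ray import RayDMatrix, train

df = pd.DataFrame(np.random.rand(1000, 4), columns=["a", "b", "c", "d"])
df["target"] = (df["a"] > 0.5).astype(int)

dtrain = RayDMatrix(df, label="target")
evals_result = {}

bst = train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    evals=[(dtrain, "train")],
    evals_result=evals_result,
    num_actors=2,
    cpus_per_actor=1)

bst.save_model("model.xgb")
print(evals_result["train"]["logloss"][-1])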