def _create_placement_group(self):
    """Creates a placement group if it does not exist.

    If a placement group is already detected (Tune) this will be a no-op.

    By default the placement group will be created with PACK strategy.
    This is optimized for colocating GPUs on a minimal number of nodes.
    This behavior can be overridden to use the SPREAD strategy by defining
    ``TRAIN_ENABLE_WORKER_SPREAD_ENV``

    If a placement group is created it will be stored as
    self._placement_group.
    """
    current_placement_group = get_current_placement_group()
    should_capture_child_tasks_in_placement_group = (
        ray.worker.global_worker.should_capture_child_tasks_in_placement_group
    )
    should_create_placement_group = (
        current_placement_group is None
        or not should_capture_child_tasks_in_placement_group
    )

    if should_create_placement_group:
        additional_resources_per_worker = (
            self._additional_resources_per_worker or {}
        )
        bundle = {
            "CPU": self._num_cpus_per_worker,
            "GPU": self._num_gpus_per_worker,
            **additional_resources_per_worker,
        }
        bundles = [bundle.copy() for _ in range(self._num_workers)]

        use_spread = bool(env_integer(TRAIN_ENABLE_WORKER_SPREAD_ENV, 0))
        strategy = "SPREAD" if use_spread else "PACK"

        placement_group = ray.util.placement_group(bundles, strategy=strategy)
        logger.debug("Waiting for placement group to start.")
        timeout = env_integer(TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, 100)
        ready, _ = ray.wait([placement_group.ready()], timeout=timeout)
        if ready:
            logger.debug("Placement group has started.")
        else:
            raise TimeoutError(
                "Placement group creation timed out. Make sure your "
                "cluster either has enough resources or use an "
                "autoscaling cluster. If you are running on a cluster, "
                "make sure you specify an address in `ray.init()`, for example, "
                '`ray.init("auto")`. You can also increase the timeout by setting '
                "the TRAIN_PLACEMENT_GROUP_TIMEOUT_S environment variable. "
                "Current resources available: {}, resources requested by the "
                "placement group: {}".format(
                    ray.available_resources(), placement_group.bundle_specs
                )
            )
        self._placement_group = placement_group

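# Hedged sketch (not Ray source): the bundle construction and strategy
# selection above, in isolation. The literal key "TRAIN_ENABLE_WORKER_SPREAD"
# is an assumption; the real key is whatever the TRAIN_ENABLE_WORKER_SPREAD_ENV
# constant resolves to in the surrounding module.
import os

from ray.ray_constants import env_integer


def pick_bundles_and_strategy(num_workers=2, cpus=1, gpus=0, extra=None):
    # One bundle per worker, each requesting the per-worker CPU/GPU/custom
    # resources; the env flag flips PACK (colocate) to SPREAD (one per node).
    bundle = {"CPU": cpus, "GPU": gpus, **(extra or {})}
    bundles = [bundle.copy() for _ in range(num_workers)]
    use_spread = bool(env_integer("TRAIN_ENABLE_WORKER_SPREAD", 0))
    strategy = "SPREAD" if use_spread else "PACK"
    return bundles, strategy


if __name__ == "__main__":
    os.environ["TRAIN_ENABLE_WORKER_SPREAD"] = "1"  # opt into SPREAD placement
    print(pick_bundles_and_strategy(num_workers=2, cpus=2, gpus=1))
    # ([{'CPU': 2, 'GPU': 1}, {'CPU': 2, 'GPU': 1}], 'SPREAD')
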
def start(self,
          initialization_hook: Optional[Callable[[], None]] = None,
          train_cls: Optional[Type] = None,
          train_cls_args: Optional[Tuple] = None,
          train_cls_kwargs: Optional[Dict] = None):
    """Starts the worker group."""
    self.worker_group = WorkerGroup(
        num_workers=self._num_workers,
        num_cpus_per_worker=self._num_cpus_per_worker,
        num_gpus_per_worker=self._num_gpus_per_worker,
        additional_resources_per_worker=self._additional_resources_per_worker,
        actor_cls=train_cls,
        actor_cls_args=train_cls_args,
        actor_cls_kwargs=train_cls_kwargs)
    try:
        if initialization_hook:
            self._initialization_hook = initialization_hook
            self.worker_group.execute(initialization_hook)

        share_cuda_visible_devices_enabled = bool(
            env_integer(ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
                        self._backend.share_cuda_visible_devices))

        if (self._num_gpus_per_worker > 0
                and share_cuda_visible_devices_enabled):
            self._share_cuda_visible_devices()
        self._backend.on_start(self.worker_group, self._backend_config)
    except RayActorError as exc:
        logger.exception(str(exc))
        self._increment_failures()
        self._restart()

def set_sync_periods(sync_config):
    """Sets sync periods from config."""
    global CLOUD_SYNC_PERIOD
    global NODE_SYNC_PERIOD

    if os.environ.get("TUNE_CLOUD_SYNC_S"):
        logger.warning("'TUNE_CLOUD_SYNC_S' is deprecated. Set "
                       "`cloud_sync_period` via tune.SyncConfig instead.")
        CLOUD_SYNC_PERIOD = env_integer(key="TUNE_CLOUD_SYNC_S", default=300)
    NODE_SYNC_PERIOD = int(sync_config.node_sync_period)
    CLOUD_SYNC_PERIOD = int(sync_config.cloud_sync_period)

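# Hedged usage sketch for set_sync_periods(), run in the module context above:
# a stand-in config object carrying the two attributes the function reads. In
# real Tune code this would be a tune.SyncConfig; the namedtuple here is only
# for illustration.
from collections import namedtuple

FakeSyncConfig = namedtuple("FakeSyncConfig",
                            ["node_sync_period", "cloud_sync_period"])

if __name__ == "__main__":
    set_sync_periods(FakeSyncConfig(node_sync_period=60, cloud_sync_period=120))
    # NODE_SYNC_PERIOD is now 60 and CLOUD_SYNC_PERIOD is 120. If
    # TUNE_CLOUD_SYNC_S is set, a deprecation warning is logged first, but the
    # config value still wins because it is assigned last.
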
def __init__(self,
             local_dir: str,
             remote_dir: str,
             sync_client: Optional[SyncClient] = None):
    configure_logging(
        log_style="record",
        verbosity=env_integer("TUNE_SYNCER_VERBOSITY", 0))
    self.local_ip = services.get_node_ip_address()
    self.worker_ip = None

    sync_client = sync_client or DockerSyncClient()
    sync_client.configure(self._cluster_config_file)

    super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)

def start_training(self,
                   train_func: Callable[[], T],
                   checkpoint: Optional[Dict] = None) -> None:
    """Executes a training function on all workers in a separate thread.

    ``finish_training`` should be called after this.

    Args:
        train_func (Callable): The training function to run on each worker.
        checkpoint (Optional[Dict]): The checkpoint data that should be
            loaded onto each worker and accessed by the training function
            via ``sgd.load_checkpoint()``.
    """
    use_detailed_autofilled_metrics = env_integer(
        ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

    # First initialize the session.
    def initialize_session(world_rank, train_func, checkpoint):
        try:
            init_session(
                training_func=train_func,
                world_rank=world_rank,
                checkpoint=checkpoint,
                detailed_autofilled_metrics=use_detailed_autofilled_metrics
            )
        except ValueError:
            raise SGDBackendError(
                "Attempting to start training but a "
                "previous training run is still ongoing. "
                "You must call `finish_training` before "
                "calling `start_training` again.")

    futures = []
    for world_rank in range(len(self.worker_group)):
        futures.append(
            self.worker_group.execute_single_async(
                world_rank,
                initialize_session,
                world_rank=world_rank,
                train_func=train_func,
                checkpoint=checkpoint))

    ray.get(futures)

    # Run the training function asynchronously in its own thread.
    def train_async():
        session = get_session()
        session.start()

    self.worker_group.execute_async(train_async)

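# Hedged sketch (not Ray source): the control flow above in miniature. A
# thread pool stands in for the Ray worker group; per-rank initialization is
# fanned out and awaited, then the training function is launched
# fire-and-forget, mirroring execute_single_async + ray.get followed by
# execute_async.
from concurrent.futures import ThreadPoolExecutor, wait


def demo_start_training(train_func, num_workers=4):
    pool = ThreadPoolExecutor(max_workers=num_workers)
    sessions = {}

    def initialize_session(world_rank):
        sessions[world_rank] = {"train_func": train_func, "rank": world_rank}

    # Phase 1: initialize every rank and block until all are ready.
    wait([pool.submit(initialize_session, rank) for rank in range(num_workers)])

    # Phase 2: start training asynchronously; results are fetched later by the
    # equivalent of finish_training().
    def train_async(world_rank):
        return sessions[world_rank]["train_func"]()

    return [pool.submit(train_async, rank) for rank in range(num_workers)]


if __name__ == "__main__":
    futures = demo_start_training(lambda: "done", num_workers=2)
    print([f.result() for f in futures])  # ['done', 'done']
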
def start(
    self,
    initialization_hook: Optional[Callable[[], None]] = None,
    train_cls: Optional[Type] = None,
    train_cls_args: Optional[Tuple] = None,
    train_cls_kwargs: Optional[Dict] = None,
):
    """Starts the worker group."""
    self._create_placement_group()
    placement_group = self._placement_group or "default"
    self.worker_group = WorkerGroup(
        num_workers=self._num_workers,
        num_cpus_per_worker=self._num_cpus_per_worker,
        num_gpus_per_worker=self._num_gpus_per_worker,
        additional_resources_per_worker=self._additional_resources_per_worker,
        actor_cls=train_cls,
        actor_cls_args=train_cls_args,
        actor_cls_kwargs=train_cls_kwargs,
        placement_group=placement_group,
    )
    try:
        if initialization_hook:
            self._initialization_hook = initialization_hook
            self.worker_group.execute(initialization_hook)

        share_cuda_visible_devices_enabled = bool(
            env_integer(
                ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
                self._backend.share_cuda_visible_devices,
            )
        )

        if self._num_gpus_per_worker > 0 and share_cuda_visible_devices_enabled:
            self._share_cuda_visible_devices()
        self._backend.on_start(self.worker_group, self._backend_config)
    except RayActorError as exc:
        logger.exception(str(exc))
        logger.warning(
            "Failure occurred during startup. Restarting all workers and "
            "attempting to startup again."
        )
        self._increment_failures()
        self._restart()

from ray.ray_constants import env_integer

USE_FP16 = "__use_fp16__"
NUM_STEPS = "__num_steps__"
SCHEDULER_STEP = "scheduler_step"
SCHEDULER_STEP_BATCH = "batch"
SCHEDULER_STEP_EPOCH = "epoch"
SCHEDULER_STEP_MANUAL = "manual"
NCCL_TIMEOUT_S = env_integer("NCCL_TIMEOUT_S", 1800)
SGD_PLACEMENT_GROUP_TIMEOUT_S = env_integer("SGD_PLACEMENT_GROUP_TIMEOUT_S", 100)

VALID_SCHEDULER_STEP = {
    SCHEDULER_STEP_BATCH, SCHEDULER_STEP_EPOCH, SCHEDULER_STEP_MANUAL
}

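# Hedged usage note: NCCL_TIMEOUT_S and SGD_PLACEMENT_GROUP_TIMEOUT_S are
# evaluated once, at import time, so the environment variables have to be set
# before the constants module above is first imported. A minimal sketch of the
# override mechanism, calling env_integer directly:
import os

from ray.ray_constants import env_integer

os.environ["NCCL_TIMEOUT_S"] = "3600"
print(env_integer("NCCL_TIMEOUT_S", 1800))  # 3600: the env var wins over the default
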
import logging
import os
import time
from shlex import quote

from ray import ray_constants
from ray import services
from ray.tune.cluster_info import get_ssh_key, get_ssh_user
from ray.tune.sync_client import (CommandBasedClient, get_sync_client,
                                  get_cloud_sync_client, NOOP)

logger = logging.getLogger(__name__)

# Syncing period for syncing local checkpoints to cloud.
# If the env variable is not set, sync happens every 300 seconds.
CLOUD_SYNC_PERIOD = ray_constants.env_integer(
    key="TUNE_CLOUD_SYNC_S", default=300)

# Syncing period for syncing worker logs to driver.
NODE_SYNC_PERIOD = 300

_log_sync_warned = False
_syncers = {}


def wait_for_sync():
    for syncer in _syncers.values():
        syncer.wait()


def log_sync_template(options=""):
    """Template enabling syncs between driver and worker when possible.

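# Hedged sketch of what ray_constants.env_integer does, for readers unfamiliar
# with it: an integer read from the environment with a default fallback. This
# mirrors the call signature used above, env_integer(key, default); the actual
# implementation in ray.ray_constants may handle additional edge cases.
import os


def env_integer_sketch(key, default):
    if key in os.environ:
        return int(os.environ[key])
    return default


if __name__ == "__main__":
    os.environ["TUNE_CLOUD_SYNC_S"] = "60"
    print(env_integer_sketch("TUNE_CLOUD_SYNC_S", 300))  # 60 (env override)
    print(env_integer_sketch("SOME_UNSET_KEY", 300))     # 300 (default)
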
from ray.ray_constants import env_integer

DASHBOARD_LOG_FILENAME = "dashboard.log"
DASHBOARD_AGENT_PORT_PREFIX = "DASHBOARD_AGENT_PORT_PREFIX:"
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS = 2
RETRY_REDIS_CONNECTION_TIMES = 10
CONNECT_REDIS_INTERNAL_SECONDS = 2
PURGE_DATA_INTERVAL_SECONDS = 60 * 10
ORGANIZE_DATA_INTERVAL_SECONDS = 2
DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
GCS_SERVER_ADDRESS = "GcsServerAddress"
# GCS check alive
GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR = env_integer(
    "GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR", 10
)
GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer("GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
GCS_CHECK_ALIVE_RPC_TIMEOUT = env_integer("GCS_CHECK_ALIVE_RPC_TIMEOUT", 10)
GCS_RETRY_CONNECT_INTERVAL_SECONDS = env_integer(
    "GCS_RETRY_CONNECT_INTERVAL_SECONDS", 2
)
# aiohttp_cache
AIOHTTP_CACHE_TTL_SECONDS = 2
AIOHTTP_CACHE_MAX_SIZE = 128
AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"
# Named signals
SIGNAL_NODE_INFO_FETCHED = "node_info_fetched"
SIGNAL_NODE_SUMMARY_FETCHED = "node_summary_fetched"
SIGNAL_JOB_INFO_FETCHED = "job_info_fetched"
SIGNAL_WORKER_INFO_FETCHED = "worker_info_fetched"
# Default value for datacenter (the default value in protobuf)

def start_training(
    self,
    train_func: Callable[[], T],
    dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
    checkpoint: Optional[Dict] = None,
) -> None:
    """Executes a training function on all workers in a separate thread.

    ``finish_training`` should be called after this.

    Args:
        train_func (Callable): The training function to run on each worker.
        dataset (Optional[Union[Dataset, DatasetPipeline]]): Distributed
            Ray Dataset or DatasetPipeline to pass into the workers, which
            can be accessed from the training function via
            ``train.get_dataset_shard()``. Sharding will automatically be
            handled by the Trainer. Multiple Datasets can be passed in as
            a ``Dict`` that maps each name key to a Dataset value, and
            each Dataset can be accessed from the training function by
            passing in a `dataset_name` argument to
            ``train.get_dataset_shard()``.
        checkpoint (Optional[Dict]): The checkpoint data that should be
            loaded onto each worker and accessed by the training function
            via ``train.load_checkpoint()``. If this is ``None`` then no
            checkpoint will be loaded.
    """
    use_detailed_autofilled_metrics = env_integer(
        ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

    # First initialize the session.
    def initialize_session(
        train_func,
        world_rank,
        local_rank,
        world_size,
        checkpoint,
        dataset_shard,
        encode_data_fn,
    ):
        try:
            init_session(
                training_func=train_func,
                world_rank=world_rank,
                local_rank=local_rank,
                world_size=world_size,
                dataset_shard=dataset_shard,
                checkpoint=checkpoint,
                encode_data_fn=encode_data_fn,
                detailed_autofilled_metrics=use_detailed_autofilled_metrics,
            )
        except ValueError:
            raise TrainBackendError(
                "Attempting to start training but a "
                "previous training run is still ongoing. "
                "You must call `finish_training` before "
                "calling `start_training` again.")

    if self.dataset_shards is None:
        self.dataset_shards = self._get_dataset_shards(dataset)

    local_rank_map = self._create_local_rank_map()

    futures = []
    for index in range(len(self.worker_group)):
        futures.append(
            self.worker_group.execute_single_async(
                index,
                initialize_session,
                world_rank=index,
                local_rank=local_rank_map[index],
                world_size=len(self.worker_group),
                train_func=train_func,
                dataset_shard=self.dataset_shards[index],
                checkpoint=checkpoint,
                encode_data_fn=self._backend.encode_data,
            ))

    self.get_with_failure_handling(futures)

    # Run the training function asynchronously in its own thread.
    def train_async():
        session = get_session()
        session.start()

    self.worker_group.execute_async(train_async)

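# Hedged usage sketch for the multi-Dataset form described in the docstring:
# datasets are passed as a name -> Dataset dict and looked up by name inside
# the training function. Trainer/backend names follow the Ray Train API of
# this era and assume torch is installed; treat the exact call sites as
# assumptions rather than a verbatim recipe.
import ray
from ray import train
from ray.train import Trainer


def train_func():
    train_shard = train.get_dataset_shard("train")
    val_shard = train.get_dataset_shard("val")
    return (train_shard.count(), val_shard.count())


if __name__ == "__main__":
    ray.init()
    trainer = Trainer(backend="torch", num_workers=2)
    trainer.start()
    results = trainer.run(
        train_func,
        dataset={
            "train": ray.data.range(1000),
            "val": ray.data.range(100),
        },
    )
    trainer.shutdown()
    print(results)  # per-worker (train_count, val_count) tuples
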
import ray.ray_constants as ray_constants

RUNTIME_ENV_RETRY_TIMES = ray_constants.env_integer("RUNTIME_ENV_RETRY_TIMES", 3)
RUNTIME_ENV_RETRY_INTERVAL_MS = ray_constants.env_integer(
    "RUNTIME_ENV_RETRY_INTERVAL_MS", 1000)

import functools
import logging

from ray.util.client.server.proxier import serve_proxier
from ray.util.client.server.server_pickler import dumps_from_server
from ray.util.client.server.server_pickler import loads_from_client
from ray.util.client.server.dataservicer import DataServicer
from ray.util.client.server.logservicer import LogstreamServicer
from ray.util.client.server.server_stubs import current_server
from ray.ray_constants import env_integer
from ray._private.client_mode_hook import disable_client_hook
from ray._private.ray_logging import setup_logger
from ray._private.services import canonicalize_bootstrap_address
from ray._private.tls_utils import add_port_to_grpc_server
from ray._private.gcs_utils import GcsClient

logger = logging.getLogger(__name__)

TIMEOUT_FOR_SPECIFIC_SERVER_S = env_integer("TIMEOUT_FOR_SPECIFIC_SERVER_S", 30)


def _use_response_cache(func):
    """Decorator for gRPC stubs.

    Before calling the real stub, checks if there's an existing entry in the
    cache. If there is, returns the cached entry. Otherwise, calls the real
    function and populates the cache with its result.
    """

    @functools.wraps(func)
    def wrapper(self, request, context):
        metadata = {k: v for k, v in context.invocation_metadata()}
        expected_ids = ("client_id", "thread_id", "req_id")
        if any(i not in metadata for i in expected_ids):
            # Missing IDs, skip caching and call the underlying stub directly.
            return func(self, request, context)

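# Hedged sketch (not the real proxier code): the caching idea the decorator
# docstring describes, reduced to a plain dict keyed by the three IDs pulled
# from the request metadata. The real server-side cache also has to handle
# ordering, invalidation, and eviction, all of which are omitted here.
_response_cache = {}


def cached_call(metadata, func, *args):
    key = (metadata["client_id"], metadata["thread_id"], metadata["req_id"])
    if key in _response_cache:
        # A retry of an already-answered request: return the stored response.
        return _response_cache[key]
    response = func(*args)
    _response_cache[key] = response
    return response
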
def start_training(
    self,
    train_func: Callable[[], T],
    dataset_spec: RayDatasetSpec,
    checkpoint: Optional[Dict] = None,
) -> None:
    """Executes a training function on all workers in a separate thread.

    ``finish_training`` should be called after this.

    Args:
        train_func: The training function to run on each worker.
        dataset_spec: A specification for the Ray Dataset to be
            passed to the training workers, and the logic on how to shard
            the Ray Dataset.
        checkpoint: The checkpoint data that should be loaded onto each
            worker and accessed by the training function via
            ``train.load_checkpoint()``. If this is ``None`` then no
            checkpoint will be loaded.
    """
    use_detailed_autofilled_metrics = env_integer(
        ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

    # First initialize the session.
    def initialize_session(
        train_func,
        world_rank,
        local_rank,
        world_size,
        checkpoint,
        dataset_shard,
        encode_data_fn,
    ):
        try:
            init_session(
                training_func=train_func,
                world_rank=world_rank,
                local_rank=local_rank,
                world_size=world_size,
                dataset_shard=dataset_shard,
                checkpoint=checkpoint,
                encode_data_fn=encode_data_fn,
                detailed_autofilled_metrics=use_detailed_autofilled_metrics,
            )
        except ValueError:
            raise TrainBackendError(
                "Attempting to start training but a "
                "previous training run is still ongoing. "
                "You must call `finish_training` before "
                "calling `start_training` again.")

    if self.dataset_shards is None:
        actors = [worker.actor for worker in self.worker_group.workers]
        self.dataset_shards = dataset_spec.get_dataset_shards(actors)

    local_rank_map = self._create_local_rank_map()

    futures = []
    for index in range(len(self.worker_group)):
        futures.append(
            self.worker_group.execute_single_async(
                index,
                initialize_session,
                world_rank=index,
                local_rank=local_rank_map[index],
                world_size=len(self.worker_group),
                train_func=train_func,
                dataset_shard=self.dataset_shards[index],
                checkpoint=checkpoint,
                encode_data_fn=self._backend.encode_data,
            ))

    self.get_with_failure_handling(futures)

    # Run the training function asynchronously in its own thread.
    def train_async():
        session = get_session()
        session.start()

    self.worker_group.execute_async(train_async)

import ray.ray_constants as ray_constants

REPORTER_PREFIX = "RAY_REPORTER:"
# The reporter will report its statistics this often (milliseconds).
REPORTER_UPDATE_INTERVAL_MS = ray_constants.env_integer(
    "REPORTER_UPDATE_INTERVAL_MS", 2500)

import logging
import os
from typing import Dict

from ray.ray_constants import env_integer
from ray.tune.result import RESULT_DUPLICATE
from ray.tune.logger import NoopLogger
from ray.tune.function_runner import wrap_function
from ray.tune.trainable import DistributedTrainable
from ray.tune.utils.placement_groups import PlacementGroupFactory
from ray.tune.utils.trainable import PlacementGroupUtil, TrainableUtil
from ray.tune.utils import detect_checkpoint_function
from ray.util.ml_utils.util import find_free_port
from ray.util.placement_group import remove_placement_group

logger = logging.getLogger(__name__)

_distributed_enabled = False

NCCL_TIMEOUT_S = env_integer("NCCL_TIMEOUT_S", 1800)


def is_distributed_trainable():
    """Returns True if executing within a DistributedTrainable."""
    return _distributed_enabled


def enable_distributed_trainable():
    global _distributed_enabled
    _distributed_enabled = True


def logger_creator(log_config: Dict, logdir: str, rank: int) -> NoopLogger:
    worker_dir = os.path.join(logdir, "worker_{}".format(rank))
    os.makedirs(worker_dir, exist_ok=True)

from ray.ray_constants import env_integer
from ray.core.generated import event_pb2

EVENT_MODULE_ENVIRONMENT_KEY = "RAY_DASHBOARD_MODULE_EVENT"
LOG_ERROR_EVENT_STRING_LENGTH_LIMIT = 1000
RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS = 2

# Monitor events
SCAN_EVENT_DIR_INTERVAL_SECONDS = env_integer(
    "SCAN_EVENT_DIR_INTERVAL_SECONDS", 2)
SCAN_EVENT_START_OFFSET_SECONDS = -30 * 60
CONCURRENT_READ_LIMIT = 50
EVENT_READ_LINE_COUNT_LIMIT = 200
EVENT_READ_LINE_LENGTH_LIMIT = env_integer(
    "EVENT_READ_LINE_LENGTH_LIMIT", 2 * 1024 * 1024)  # 2MB

# Report events
EVENT_AGENT_REPORT_INTERVAL_SECONDS = 0.1
EVENT_AGENT_RETRY_TIMES = 10
EVENT_AGENT_CACHE_SIZE = 10240

# Event sources
EVENT_HEAD_MONITOR_SOURCE_TYPES = [
    event_pb2.Event.SourceType.Name(event_pb2.Event.GCS)
]
EVENT_AGENT_MONITOR_SOURCE_TYPES = list(
    set(event_pb2.Event.SourceType.keys()) -
    set(EVENT_HEAD_MONITOR_SOURCE_TYPES))
EVENT_SOURCE_ALL = event_pb2.Event.SourceType.keys()

import threading

import ray
from ray.ray_constants import env_integer
from ray.types import ObjectRef
from ray.util.annotations import PublicAPI

try:
    import tqdm

    needs_warning = False
except ImportError:
    tqdm = None
    needs_warning = True

# Whether progress bars are enabled in this thread.
_enabled = not bool(env_integer("RAY_DATA_DISABLE_PROGRESS_BARS", 0))

# Used as a signal to cancel execution.
_canceled_threads = set()
_canceled_threads_lock = threading.Lock()


@PublicAPI
def set_progress_bars(enabled: bool) -> bool:
    """Set whether progress bars are enabled.

    The default behavior is controlled by the
    ``RAY_DATA_DISABLE_PROGRESS_BARS`` environment variable. By default,
    it is set to "0". Setting it to "1" will disable progress bars, unless
    they are reenabled by this method.

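# Hedged usage sketch: progress bars can be disabled either through the
# environment variable read above (set before ray.data is imported, since
# _enabled is evaluated at import time) or at runtime via set_progress_bars().
# The top-level ray.data export of set_progress_bars and its "returns the
# previous value" behavior are assumptions about this Ray version.
import os

os.environ["RAY_DATA_DISABLE_PROGRESS_BARS"] = "1"  # must precede the import

import ray.data
from ray.data import set_progress_bars

previous = set_progress_bars(True)   # re-enable at runtime
print(previous)                      # the prior setting (False here)
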
def start_training(
    self,
    train_func: Callable[[], T],
    run_dir: Path,
    dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
    checkpoint: Optional[Union[Dict, str, Path]] = None,
    checkpoint_strategy: Optional[CheckpointStrategy] = None,
    latest_checkpoint_id: Optional[int] = None,
) -> None:
    """Executes a training function on all workers in a separate thread.

    ``finish_training`` should be called after this.

    Args:
        train_func (Callable): The training function to run on each worker.
        run_dir (Path): The directory to use for this run.
        dataset (Optional[Union[Dataset, DatasetPipeline]]): Distributed
            Ray Dataset or DatasetPipeline to pass into the workers, which
            can be accessed from the training function via
            ``train.get_dataset_shard()``. Sharding will automatically be
            handled by the Trainer. Multiple Datasets can be passed in as
            a ``Dict`` that maps each name key to a Dataset value, and
            each Dataset can be accessed from the training function by
            passing in a `dataset_name` argument to
            ``train.get_dataset_shard()``.
        checkpoint (Optional[Dict|str|Path]): The checkpoint data that
            should be loaded onto each worker and accessed by the training
            function via ``train.load_checkpoint()``. If this is a ``str``
            or ``Path`` then the value is expected to be a path to a file
            that contains a serialized checkpoint dict. If this is
            ``None`` then no checkpoint will be loaded.
        checkpoint_strategy (Optional[CheckpointStrategy]): The
            configurations for saving checkpoints.
        latest_checkpoint_id (Optional[int]): The checkpoint id of the
            most recently saved checkpoint.
    """
    self.checkpoint_manager.on_start_training(
        checkpoint_strategy=checkpoint_strategy,
        run_dir=run_dir,
        latest_checkpoint_id=latest_checkpoint_id)

    use_detailed_autofilled_metrics = env_integer(
        ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

    # First initialize the session.
    def initialize_session(train_func, world_rank, local_rank, checkpoint,
                           dataset_shard):
        try:
            init_session(
                training_func=train_func,
                world_rank=world_rank,
                local_rank=local_rank,
                dataset_shard=dataset_shard,
                checkpoint=checkpoint,
                detailed_autofilled_metrics=use_detailed_autofilled_metrics
            )
        except ValueError:
            raise TrainBackendError(
                "Attempting to start training but a "
                "previous training run is still ongoing. "
                "You must call `finish_training` before "
                "calling `start_training` again.")

    if self.dataset_shards is None:
        self.dataset_shards = self._get_dataset_shards(dataset)

    checkpoint_dict = self.checkpoint_manager._load_checkpoint(checkpoint)

    local_rank_map = self._create_local_rank_map()

    futures = []
    for index in range(len(self.worker_group)):
        futures.append(
            self.worker_group.execute_single_async(
                index,
                initialize_session,
                world_rank=index,
                local_rank=local_rank_map[index],
                train_func=train_func,
                dataset_shard=self.dataset_shards[index],
                checkpoint=checkpoint_dict))

    self.get_with_failure_handling(futures)

    # Run the training function asynchronously in its own thread.
    def train_async():
        session = get_session()
        session.start()

    self.worker_group.execute_async(train_async)

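# Hedged sketch of the three checkpoint forms the signature above accepts: an
# in-memory dict, or a str/Path pointing at a file that contains a serialized
# checkpoint dict. The on-disk format is whatever
# checkpoint_manager._load_checkpoint expects and is not shown in this excerpt,
# so only the call shapes are illustrated; the paths and the backend_executor
# name are hypothetical.
from pathlib import Path

in_memory_checkpoint = {"epoch": 3, "model_weights": []}      # dict form
checkpoint_as_str = "/tmp/run_001/checkpoint_000003"          # str form (hypothetical path)
checkpoint_as_path = Path("/tmp/run_001/checkpoint_000003")   # Path form

# backend_executor.start_training(train_func, run_dir, checkpoint=in_memory_checkpoint)
# backend_executor.start_training(train_func, run_dir, checkpoint=checkpoint_as_path)
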
def start_training(
    self,
    train_func: Callable[[], T],
    run_dir: Path,
    checkpoint: Optional[Union[Dict, str, Path]] = None,
    checkpoint_strategy: Optional[CheckpointStrategy] = None,
    latest_checkpoint_id: Optional[int] = None,
) -> None:
    """Executes a training function on all workers in a separate thread.

    ``finish_training`` should be called after this.

    Args:
        train_func (Callable): The training function to run on each worker.
        run_dir (Path): The directory to use for this run.
        checkpoint (Optional[Dict|str|Path]): The checkpoint data that
            should be loaded onto each worker and accessed by the training
            function via ``sgd.load_checkpoint()``. If this is a ``str``
            or ``Path`` then the value is expected to be a path to a file
            that contains a serialized checkpoint dict. If this is
            ``None`` then no checkpoint will be loaded.
        checkpoint_strategy (Optional[CheckpointStrategy]): The
            configurations for saving checkpoints.
        latest_checkpoint_id (Optional[int]): The checkpoint id of the
            most recently saved checkpoint.
    """
    self.checkpoint_manager.on_start_training(
        checkpoint_strategy=checkpoint_strategy,
        run_dir=run_dir,
        latest_checkpoint_id=latest_checkpoint_id)

    use_detailed_autofilled_metrics = env_integer(
        ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

    # First initialize the session.
    def initialize_session(world_rank, local_rank, train_func, checkpoint):
        try:
            init_session(
                training_func=train_func,
                world_rank=world_rank,
                local_rank=local_rank,
                checkpoint=checkpoint,
                detailed_autofilled_metrics=use_detailed_autofilled_metrics
            )
        except ValueError:
            raise SGDBackendError(
                "Attempting to start training but a "
                "previous training run is still ongoing. "
                "You must call `finish_training` before "
                "calling `start_training` again.")

    checkpoint_dict = self.checkpoint_manager._load_checkpoint(checkpoint)

    local_rank_map = self._create_local_rank_map()

    futures = []
    for world_rank in range(len(self.worker_group)):
        futures.append(
            self.worker_group.execute_single_async(
                world_rank,
                initialize_session,
                world_rank=world_rank,
                local_rank=local_rank_map[world_rank],
                train_func=train_func,
                checkpoint=checkpoint_dict))

    self.get_with_failure_handling(futures)

    # Run the training function asynchronously in its own thread.
    def train_async():
        session = get_session()
        session.start()

    self.worker_group.execute_async(train_async)

from ray.ray_constants import env_integer

DASHBOARD_LOG_FILENAME = "dashboard.log"
DASHBOARD_AGENT_PORT_PREFIX = "DASHBOARD_AGENT_PORT_PREFIX:"
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS = 2
RETRY_REDIS_CONNECTION_TIMES = 10
CONNECT_REDIS_INTERNAL_SECONDS = 2
PURGE_DATA_INTERVAL_SECONDS = 60 * 10
ORGANIZE_DATA_INTERVAL_SECONDS = 2
DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
GCS_SERVER_ADDRESS = "GcsServerAddress"
# GCS check alive
GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR = env_integer(
    "GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR", 10)
GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer(
    "GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
GCS_CHECK_ALIVE_RPC_TIMEOUT = env_integer("GCS_CHECK_ALIVE_RPC_TIMEOUT", 10)
GCS_RETRY_CONNECT_INTERVAL_SECONDS = env_integer(
    "GCS_RETRY_CONNECT_INTERVAL_SECONDS", 2)
# aiohttp_cache
AIOHTTP_CACHE_TTL_SECONDS = 2
AIOHTTP_CACHE_MAX_SIZE = 128
AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"
# Named signals
SIGNAL_NODE_INFO_FETCHED = "node_info_fetched"
SIGNAL_NODE_SUMMARY_FETCHED = "node_summary_fetched"
SIGNAL_JOB_INFO_FETCHED = "job_info_fetched"
SIGNAL_WORKER_INFO_FETCHED = "worker_info_fetched"
# Default value for datacenter (the default value in protobuf)
DEFAULT_LANGUAGE = "PYTHON"