Ejemplo n.º 1
0
    def _create_placement_group(self):
        """Creates a placement group if it does not exist.

        If a placement group is already detected (Tune) this will be a no-op.

        By default the placement group will be created with PACK strategy.
        This is optimized for colocating GPUs on a minimal number of nodes.
        This behavior can be overridden to use the SPREAD strategy by defining
        ``TRAIN_ENABLE_WORKER_SPREAD_ENV``

        If a placement group is created it will be stored as
        self._placement_group.
        """
        current_placement_group = get_current_placement_group()
        should_capture_child_tasks_in_placement_group = (
            ray.worker.global_worker.should_capture_child_tasks_in_placement_group
        )
        should_create_placement_group = (
            current_placement_group is None
            or not should_capture_child_tasks_in_placement_group
        )

        if should_create_placement_group:
            additional_resources_per_worker = (
                self._additional_resources_per_worker or {}
            )
            bundle = {
                "CPU": self._num_cpus_per_worker,
                "GPU": self._num_gpus_per_worker,
                **additional_resources_per_worker,
            }
            bundles = [bundle.copy() for _ in range(self._num_workers)]

            use_spread = bool(env_integer(TRAIN_ENABLE_WORKER_SPREAD_ENV, 0))
            strategy = "SPREAD" if use_spread else "PACK"

            placement_group = ray.util.placement_group(bundles, strategy=strategy)
            logger.debug("Waiting for placement group to start.")
            timeout = env_integer(TRAIN_PLACEMENT_GROUP_TIMEOUT_S_ENV, 100)
            ready, _ = ray.wait([placement_group.ready()], timeout=timeout)
            if ready:
                logger.debug("Placement group has started.")
            else:
                raise TimeoutError(
                    "Placement group creation timed out. Make sure your "
                    "cluster either has enough resources or use an "
                    "autoscaling cluster. If you are running on a cluster, "
                    "make sure you specify an address in `ray.init()`, for example, "
                    '`ray.init("auto")`. You can also increase the timeout by setting '
                    "the TRAIN_PLACEMENT_GROUP_TIMEOUT_S environment variable. "
                    "Current resources available: {}, resources requested by the "
                    "placement group: {}".format(
                        ray.available_resources(), placement_group.bundle_specs
                    )
                )
            self._placement_group = placement_group
Ejemplo n.º 2
0
Archivo: backend.py Proyecto: rlan/ray
    def start(self,
              initialization_hook: Optional[Callable[[], None]] = None,
              train_cls: Optional[Type] = None,
              train_cls_args: Optional[Tuple] = None,
              train_cls_kwargs: Optional[Dict] = None):
        """Starts the worker group."""
        self.worker_group = WorkerGroup(
            num_workers=self._num_workers,
            num_cpus_per_worker=self._num_cpus_per_worker,
            num_gpus_per_worker=self._num_gpus_per_worker,
            additional_resources_per_worker=self.
            _additional_resources_per_worker,
            actor_cls=train_cls,
            actor_cls_args=train_cls_args,
            actor_cls_kwargs=train_cls_kwargs)
        try:
            if initialization_hook:
                self._initialization_hook = initialization_hook
                self.worker_group.execute(initialization_hook)

            share_cuda_visible_devices_enabled = bool(
                env_integer(ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
                            self._backend.share_cuda_visible_devices))

            if (self._num_gpus_per_worker > 0
                    and share_cuda_visible_devices_enabled):
                self._share_cuda_visible_devices()
            self._backend.on_start(self.worker_group, self._backend_config)
        except RayActorError as exc:
            logger.exception(str(exc))
            self._increment_failures()
            self._restart()
Ejemplo n.º 3
0
Archivo: syncer.py Proyecto: hngenc/ray
def set_sync_periods(sync_config):
    """Sets sync periods from config."""
    global CLOUD_SYNC_PERIOD
    global NODE_SYNC_PERIOD
    if os.environ.get("TUNE_CLOUD_SYNC_S"):
        logger.warning("'TUNE_CLOUD_SYNC_S' is deprecated. Set "
                       "`cloud_sync_period` via tune.SyncConfig instead.")
        CLOUD_SYNC_PERIOD = env_integer(key="TUNE_CLOUD_SYNC_S", default=300)
    NODE_SYNC_PERIOD = int(sync_config.node_sync_period)
    CLOUD_SYNC_PERIOD = int(sync_config.cloud_sync_period)
Ejemplo n.º 4
0
Archivo: docker.py Proyecto: rlan/ray
    def __init__(self,
                 local_dir: str,
                 remote_dir: str,
                 sync_client: Optional[SyncClient] = None):
        configure_logging(log_style="record",
                          verbosity=env_integer("TUNE_SYNCER_VERBOSITY", 0))
        self.local_ip = services.get_node_ip_address()
        self.worker_ip = None

        sync_client = sync_client or DockerSyncClient()
        sync_client.configure(self._cluster_config_file)

        super(NodeSyncer, self).__init__(local_dir, remote_dir, sync_client)
Ejemplo n.º 5
0
    def start_training(self,
                       train_func: Callable[[], T],
                       checkpoint: Optional[Dict] = None) -> None:
        """Executes a training function on all workers in a separate thread.

        ``finish_training`` should be called after this.

        Args:
            train_func (Callable): The training function to run on each worker.
            checkpoint (Optional[Dict]): The checkpoint data that should be
                loaded onto each worker and accessed by the training function
                via ``sgd.load_checkpoint()``.
        """

        use_detailed_autofilled_metrics = env_integer(
            ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

        # First initialize the session.
        def initialize_session(world_rank, train_func, checkpoint):
            try:
                init_session(
                    training_func=train_func,
                    world_rank=world_rank,
                    checkpoint=checkpoint,
                    detailed_autofilled_metrics=use_detailed_autofilled_metrics
                )
            except ValueError:
                raise SGDBackendError(
                    "Attempting to start training but a "
                    "previous training run is still ongoing. "
                    "You must call `finish_training` before "
                    "calling `start_training` again.")

        futures = []
        for world_rank in range(len(self.worker_group)):
            futures.append(
                self.worker_group.execute_single_async(world_rank,
                                                       initialize_session,
                                                       world_rank=world_rank,
                                                       train_func=train_func,
                                                       checkpoint=checkpoint))

        ray.get(futures)

        # Run the training function asynchronously in its own thread.
        def train_async():
            session = get_session()
            session.start()

        self.worker_group.execute_async(train_async)
Ejemplo n.º 6
0
    def start(
        self,
        initialization_hook: Optional[Callable[[], None]] = None,
        train_cls: Optional[Type] = None,
        train_cls_args: Optional[Tuple] = None,
        train_cls_kwargs: Optional[Dict] = None,
    ):
        """Starts the worker group."""
        self._create_placement_group()
        placement_group = self._placement_group or "default"
        self.worker_group = WorkerGroup(
            num_workers=self._num_workers,
            num_cpus_per_worker=self._num_cpus_per_worker,
            num_gpus_per_worker=self._num_gpus_per_worker,
            additional_resources_per_worker=self._additional_resources_per_worker,
            actor_cls=train_cls,
            actor_cls_args=train_cls_args,
            actor_cls_kwargs=train_cls_kwargs,
            placement_group=placement_group,
        )
        try:
            if initialization_hook:
                self._initialization_hook = initialization_hook
                self.worker_group.execute(initialization_hook)

            share_cuda_visible_devices_enabled = bool(
                env_integer(
                    ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV,
                    self._backend.share_cuda_visible_devices,
                )
            )

            if self._num_gpus_per_worker > 0 and share_cuda_visible_devices_enabled:
                self._share_cuda_visible_devices()
            self._backend.on_start(self.worker_group, self._backend_config)
        except RayActorError as exc:
            logger.exception(str(exc))
            logger.warning(
                "Failure occurred during startup. Restarting all workers and "
                "attempting to startup again."
            )
            self._increment_failures()
            self._restart()
Ejemplo n.º 7
0
from ray.ray_constants import env_integer

USE_FP16 = "__use_fp16__"
NUM_STEPS = "__num_steps__"
SCHEDULER_STEP = "scheduler_step"
SCHEDULER_STEP_BATCH = "batch"
SCHEDULER_STEP_EPOCH = "epoch"
SCHEDULER_STEP_MANUAL = "manual"
NCCL_TIMEOUT_S = env_integer("NCCL_TIMEOUT_S", 1800)
SGD_PLACEMENT_GROUP_TIMEOUT_S = env_integer("SGD_PLACEMENT_GROUP_TIMEOUT_S",
                                            100)

VALID_SCHEDULER_STEP = {
    SCHEDULER_STEP_BATCH, SCHEDULER_STEP_EPOCH, SCHEDULER_STEP_MANUAL
}
Ejemplo n.º 8
0
import os
import time

from shlex import quote

from ray import ray_constants
from ray import services
from ray.tune.cluster_info import get_ssh_key, get_ssh_user
from ray.tune.sync_client import (CommandBasedClient, get_sync_client,
                                  get_cloud_sync_client, NOOP)

logger = logging.getLogger(__name__)

# Syncing period for syncing local checkpoints to cloud.
# In env variable is not set, sync happens every 300 seconds.
CLOUD_SYNC_PERIOD = ray_constants.env_integer(key="TUNE_CLOUD_SYNC_S",
                                              default=300)

# Syncing period for syncing worker logs to driver.
NODE_SYNC_PERIOD = 300

_log_sync_warned = False
_syncers = {}


def wait_for_sync():
    for syncer in _syncers.values():
        syncer.wait()


def log_sync_template(options=""):
    """Template enabling syncs between driver and worker when possible.
Ejemplo n.º 9
0
from ray.ray_constants import env_integer

DASHBOARD_LOG_FILENAME = "dashboard.log"
DASHBOARD_AGENT_PORT_PREFIX = "DASHBOARD_AGENT_PORT_PREFIX:"
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS = 2
RETRY_REDIS_CONNECTION_TIMES = 10
CONNECT_REDIS_INTERNAL_SECONDS = 2
PURGE_DATA_INTERVAL_SECONDS = 60 * 10
ORGANIZE_DATA_INTERVAL_SECONDS = 2
DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
GCS_SERVER_ADDRESS = "GcsServerAddress"
# GCS check alive
GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR = env_integer(
    "GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR", 10
)
GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer("GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
GCS_CHECK_ALIVE_RPC_TIMEOUT = env_integer("GCS_CHECK_ALIVE_RPC_TIMEOUT", 10)
GCS_RETRY_CONNECT_INTERVAL_SECONDS = env_integer(
    "GCS_RETRY_CONNECT_INTERVAL_SECONDS", 2
)
# aiohttp_cache
AIOHTTP_CACHE_TTL_SECONDS = 2
AIOHTTP_CACHE_MAX_SIZE = 128
AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"
# Named signals
SIGNAL_NODE_INFO_FETCHED = "node_info_fetched"
SIGNAL_NODE_SUMMARY_FETCHED = "node_summary_fetched"
SIGNAL_JOB_INFO_FETCHED = "job_info_fetched"
SIGNAL_WORKER_INFO_FETCHED = "worker_info_fetched"
# Default value for datacenter (the default value in protobuf)
Ejemplo n.º 10
0
    def start_training(
        self,
        train_func: Callable[[], T],
        dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
        checkpoint: Optional[Dict] = None,
    ) -> None:
        """Executes a training function on all workers in a separate thread.

        ``finish_training`` should be called after this.

        Args:
            train_func (Callable): The training function to run on each worker.
            dataset (Optional[Union[Dataset, DatasetPipeline]])
                Distributed Ray Dataset or DatasetPipeline to pass into
                worker, which can be accessed from the training function via
                ``train.get_dataset_shard()``. Sharding will automatically be
                handled by the Trainer. Multiple Datasets can be passed in as
                a ``Dict`` that maps each name key to a Dataset value,
                and each Dataset can be accessed from the training function
                by passing in a `dataset_name` argument to
                ``train.get_dataset_shard()``.
            checkpoint (Optional[Dict]): The checkpoint data that
                should be loaded onto each worker and accessed by the
                training function via ``train.load_checkpoint()``. If this
                is ``None`` then no checkpoint will be loaded.
        """
        use_detailed_autofilled_metrics = env_integer(
            ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

        # First initialize the session.
        def initialize_session(
            train_func,
            world_rank,
            local_rank,
            world_size,
            checkpoint,
            dataset_shard,
            encode_data_fn,
        ):
            try:
                init_session(
                    training_func=train_func,
                    world_rank=world_rank,
                    local_rank=local_rank,
                    world_size=world_size,
                    dataset_shard=dataset_shard,
                    checkpoint=checkpoint,
                    encode_data_fn=encode_data_fn,
                    detailed_autofilled_metrics=use_detailed_autofilled_metrics,
                )
            except ValueError:
                raise TrainBackendError(
                    "Attempting to start training but a "
                    "previous training run is still ongoing. "
                    "You must call `finish_training` before "
                    "calling `start_training` again.")

        if self.dataset_shards is None:
            self.dataset_shards = self._get_dataset_shards(dataset)

        local_rank_map = self._create_local_rank_map()

        futures = []
        for index in range(len(self.worker_group)):
            futures.append(
                self.worker_group.execute_single_async(
                    index,
                    initialize_session,
                    world_rank=index,
                    local_rank=local_rank_map[index],
                    world_size=len(self.worker_group),
                    train_func=train_func,
                    dataset_shard=self.dataset_shards[index],
                    checkpoint=checkpoint,
                    encode_data_fn=self._backend.encode_data,
                ))

        self.get_with_failure_handling(futures)

        # Run the training function asynchronously in its own thread.
        def train_async():
            session = get_session()
            session.start()

        self.worker_group.execute_async(train_async)
Ejemplo n.º 11
0
import ray.ray_constants as ray_constants

RUNTIME_ENV_RETRY_TIMES = ray_constants.env_integer("RUNTIME_ENV_RETRY_TIMES",
                                                    3)

RUNTIME_ENV_RETRY_INTERVAL_MS = ray_constants.env_integer(
    "RUNTIME_ENV_RETRY_INTERVAL_MS", 1000)
Ejemplo n.º 12
0
from ray.util.client.server.proxier import serve_proxier
from ray.util.client.server.server_pickler import dumps_from_server
from ray.util.client.server.server_pickler import loads_from_client
from ray.util.client.server.dataservicer import DataServicer
from ray.util.client.server.logservicer import LogstreamServicer
from ray.util.client.server.server_stubs import current_server
from ray.ray_constants import env_integer
from ray._private.client_mode_hook import disable_client_hook
from ray._private.ray_logging import setup_logger
from ray._private.services import canonicalize_bootstrap_address
from ray._private.tls_utils import add_port_to_grpc_server
from ray._private.gcs_utils import GcsClient

logger = logging.getLogger(__name__)

TIMEOUT_FOR_SPECIFIC_SERVER_S = env_integer("TIMEOUT_FOR_SPECIFIC_SERVER_S",
                                            30)


def _use_response_cache(func):
    """
    Decorator for gRPC stubs. Before calling the real stubs, checks if there's
    an existing entry in the caches. If there is, then return the cached
    entry. Otherwise, call the real function and use the real cache
    """
    @functools.wraps(func)
    def wrapper(self, request, context):
        metadata = {k: v for k, v in context.invocation_metadata()}
        expected_ids = ("client_id", "thread_id", "req_id")
        if any(i not in metadata for i in expected_ids):
            # Missing IDs, skip caching and call underlying stub directly
            return func(self, request, context)
Ejemplo n.º 13
0
    def start_training(
        self,
        train_func: Callable[[], T],
        dataset_spec: RayDatasetSpec,
        checkpoint: Optional[Dict] = None,
    ) -> None:
        """Executes a training function on all workers in a separate thread.

        ``finish_training`` should be called after this.

        Args:
            train_func: The training function to run on each worker.
            dataset_spec: A specification for the Ray Dataset to be
                passed to the training workers, and the logic on how to shard the Ray
                Dataset.
            checkpoint: The checkpoint data that
                should be loaded onto each worker and accessed by the
                training function via ``train.load_checkpoint()``. If this
                is ``None`` then no checkpoint will be loaded.
        """
        use_detailed_autofilled_metrics = env_integer(
            ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

        # First initialize the session.
        def initialize_session(
            train_func,
            world_rank,
            local_rank,
            world_size,
            checkpoint,
            dataset_shard,
            encode_data_fn,
        ):
            try:
                init_session(
                    training_func=train_func,
                    world_rank=world_rank,
                    local_rank=local_rank,
                    world_size=world_size,
                    dataset_shard=dataset_shard,
                    checkpoint=checkpoint,
                    encode_data_fn=encode_data_fn,
                    detailed_autofilled_metrics=use_detailed_autofilled_metrics,
                )
            except ValueError:
                raise TrainBackendError(
                    "Attempting to start training but a "
                    "previous training run is still ongoing. "
                    "You must call `finish_training` before "
                    "calling `start_training` again.")

        if self.dataset_shards is None:
            actors = [worker.actor for worker in self.worker_group.workers]
            self.dataset_shards = dataset_spec.get_dataset_shards(actors)

        local_rank_map = self._create_local_rank_map()

        futures = []
        for index in range(len(self.worker_group)):
            futures.append(
                self.worker_group.execute_single_async(
                    index,
                    initialize_session,
                    world_rank=index,
                    local_rank=local_rank_map[index],
                    world_size=len(self.worker_group),
                    train_func=train_func,
                    dataset_shard=self.dataset_shards[index],
                    checkpoint=checkpoint,
                    encode_data_fn=self._backend.encode_data,
                ))

        self.get_with_failure_handling(futures)

        # Run the training function asynchronously in its own thread.
        def train_async():
            session = get_session()
            session.start()

        self.worker_group.execute_async(train_async)
import ray.ray_constants as ray_constants

REPORTER_PREFIX = "RAY_REPORTER:"
# The reporter will report its statistics this often (milliseconds).
REPORTER_UPDATE_INTERVAL_MS = ray_constants.env_integer(
    "REPORTER_UPDATE_INTERVAL_MS", 2500)
Ejemplo n.º 15
0
from ray.ray_constants import env_integer
from ray.tune.result import RESULT_DUPLICATE
from ray.tune.logger import NoopLogger
from ray.tune.function_runner import wrap_function
from ray.tune.trainable import DistributedTrainable
from ray.tune.utils.placement_groups import PlacementGroupFactory
from ray.tune.utils.trainable import PlacementGroupUtil, TrainableUtil
from ray.tune.utils import detect_checkpoint_function
from ray.util.ml_utils.util import find_free_port
from ray.util.placement_group import remove_placement_group

logger = logging.getLogger(__name__)

_distributed_enabled = False

NCCL_TIMEOUT_S = env_integer("NCCL_TIMEOUT_S", 1800)


def is_distributed_trainable():
    """Returns True if executing within a DistributedTrainable."""
    return _distributed_enabled


def enable_distributed_trainable():
    global _distributed_enabled
    _distributed_enabled = True


def logger_creator(log_config: Dict, logdir: str, rank: int) -> NoopLogger:
    worker_dir = os.path.join(logdir, "worker_{}".format(rank))
    os.makedirs(worker_dir, exist_ok=True)
Ejemplo n.º 16
0
from ray.ray_constants import env_integer
from ray.core.generated import event_pb2

EVENT_MODULE_ENVIRONMENT_KEY = "RAY_DASHBOARD_MODULE_EVENT"
LOG_ERROR_EVENT_STRING_LENGTH_LIMIT = 1000
RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS = 2
# Monitor events
SCAN_EVENT_DIR_INTERVAL_SECONDS = env_integer(
    "SCAN_EVENT_DIR_INTERVAL_SECONDS", 2)
SCAN_EVENT_START_OFFSET_SECONDS = -30 * 60
CONCURRENT_READ_LIMIT = 50
EVENT_READ_LINE_COUNT_LIMIT = 200
EVENT_READ_LINE_LENGTH_LIMIT = env_integer("EVENT_READ_LINE_LENGTH_LIMIT",
                                           2 * 1024 * 1024)  # 2MB
# Report events
EVENT_AGENT_REPORT_INTERVAL_SECONDS = 0.1
EVENT_AGENT_RETRY_TIMES = 10
EVENT_AGENT_CACHE_SIZE = 10240
# Event sources
EVENT_HEAD_MONITOR_SOURCE_TYPES = [
    event_pb2.Event.SourceType.Name(event_pb2.Event.GCS)
]
EVENT_AGENT_MONITOR_SOURCE_TYPES = list(
    set(event_pb2.Event.SourceType.keys()) -
    set(EVENT_HEAD_MONITOR_SOURCE_TYPES))
EVENT_SOURCE_ALL = event_pb2.Event.SourceType.keys()
Ejemplo n.º 17
0
import ray
from ray.ray_constants import env_integer
from ray.types import ObjectRef
from ray.util.annotations import PublicAPI

try:
    import tqdm

    needs_warning = False
except ImportError:
    tqdm = None
    needs_warning = True

# Whether progress bars are enabled in this thread.
_enabled = not bool(env_integer("RAY_DATA_DISABLE_PROGRESS_BARS", 0))

# Used a signal to cancel execution.
_canceled_threads = set()
_canceled_threads_lock = threading.Lock()


@PublicAPI
def set_progress_bars(enabled: bool) -> bool:
    """Set whether progress bars are enabled.

    The default behavior is controlled by the
    ``RAY_DATA_DISABLE_PROGRESS_BARS`` environment variable. By default,
    it is set to "0". Setting it to "1" will disable progress bars, unless
    they are reenabled by this method.
Ejemplo n.º 18
0
    def start_training(
        self,
        train_func: Callable[[], T],
        run_dir: Path,
        dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]] = None,
        checkpoint: Optional[Union[Dict, str, Path]] = None,
        checkpoint_strategy: Optional[CheckpointStrategy] = None,
        latest_checkpoint_id: Optional[int] = None,
    ) -> None:
        """Executes a training function on all workers in a separate thread.

        ``finish_training`` should be called after this.

        Args:
            train_func (Callable): The training function to run on each worker.
            run_dir (Path): The directory to use for this run.
            dataset (Optional[Union[Dataset, DatasetPipeline]])
                Distributed Ray Dataset or DatasetPipeline to pass into
                worker, which can be accessed from the training function via
                ``train.get_dataset_shard()``. Sharding will automatically be
                handled by the Trainer. Multiple Datasets can be passed in as
                a ``Dict`` that maps each name key to a Dataset value,
                and each Dataset can be accessed from the training function
                by passing in a `dataset_name` argument to
                ``train.get_dataset_shard()``.
            checkpoint (Optional[Dict|str|Path]): The checkpoint data that
                should be loaded onto each worker and accessed by the
                training function via ``train.load_checkpoint()``. If this is a
                ``str`` or ``Path`` then the value is expected to be a path
                to a file that contains a serialized checkpoint dict. If this
                is ``None`` then no checkpoint will be loaded.
            checkpoint_strategy (Optional[CheckpointStrategy]): The
                configurations for saving checkpoints.
            latest_checkpoint_id (Optional[int]): The checkpoint id of the
                most recently saved checkpoint.
        """
        self.checkpoint_manager.on_start_training(
            checkpoint_strategy=checkpoint_strategy,
            run_dir=run_dir,
            latest_checkpoint_id=latest_checkpoint_id)

        use_detailed_autofilled_metrics = env_integer(
            ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

        # First initialize the session.
        def initialize_session(train_func, world_rank, local_rank, checkpoint,
                               dataset_shard):
            try:
                init_session(
                    training_func=train_func,
                    world_rank=world_rank,
                    local_rank=local_rank,
                    dataset_shard=dataset_shard,
                    checkpoint=checkpoint,
                    detailed_autofilled_metrics=use_detailed_autofilled_metrics
                )
            except ValueError:
                raise TrainBackendError(
                    "Attempting to start training but a "
                    "previous training run is still ongoing. "
                    "You must call `finish_training` before "
                    "calling `start_training` again.")

        if self.dataset_shards is None:
            self.dataset_shards = self._get_dataset_shards(dataset)

        checkpoint_dict = self.checkpoint_manager._load_checkpoint(checkpoint)

        local_rank_map = self._create_local_rank_map()

        futures = []
        for index in range(len(self.worker_group)):
            futures.append(
                self.worker_group.execute_single_async(
                    index,
                    initialize_session,
                    world_rank=index,
                    local_rank=local_rank_map[index],
                    train_func=train_func,
                    dataset_shard=self.dataset_shards[index],
                    checkpoint=checkpoint_dict))

        self.get_with_failure_handling(futures)

        # Run the training function asynchronously in its own thread.
        def train_async():
            session = get_session()
            session.start()

        self.worker_group.execute_async(train_async)
Ejemplo n.º 19
0
    def start_training(
        self,
        train_func: Callable[[], T],
        run_dir: Path,
        checkpoint: Optional[Union[Dict, str, Path]] = None,
        checkpoint_strategy: Optional[CheckpointStrategy] = None,
        latest_checkpoint_id: Optional[int] = None,
    ) -> None:
        """Executes a training function on all workers in a separate thread.

        ``finish_training`` should be called after this.

        Args:
            train_func (Callable): The training function to run on each worker.
            run_dir (Path): The directory to use for this run.
            checkpoint (Optional[Dict|str|Path]): The checkpoint data that
                should be loaded onto each worker and accessed by the
                training function via ``sgd.load_checkpoint()``. If this is a
                ``str`` or ``Path`` then the value is expected to be a path
                to a file that contains a serialized checkpoint dict. If this
                is ``None`` then no checkpoint will be loaded.
            checkpoint_strategy (Optional[CheckpointStrategy]): The
                configurations for saving checkpoints.
            latest_checkpoint_id (Optional[int]): The checkpoint id of the
                most recently saved checkpoint.
        """
        self.checkpoint_manager.on_start_training(
            checkpoint_strategy=checkpoint_strategy,
            run_dir=run_dir,
            latest_checkpoint_id=latest_checkpoint_id)

        use_detailed_autofilled_metrics = env_integer(
            ENABLE_DETAILED_AUTOFILLED_METRICS_ENV, 0)

        # First initialize the session.
        def initialize_session(world_rank, local_rank, train_func, checkpoint):
            try:
                init_session(
                    training_func=train_func,
                    world_rank=world_rank,
                    local_rank=local_rank,
                    checkpoint=checkpoint,
                    detailed_autofilled_metrics=use_detailed_autofilled_metrics
                )
            except ValueError:
                raise SGDBackendError(
                    "Attempting to start training but a "
                    "previous training run is still ongoing. "
                    "You must call `finish_training` before "
                    "calling `start_training` again.")

        checkpoint_dict = self.checkpoint_manager._load_checkpoint(checkpoint)

        local_rank_map = self._create_local_rank_map()

        futures = []
        for world_rank in range(len(self.worker_group)):
            futures.append(
                self.worker_group.execute_single_async(
                    world_rank,
                    initialize_session,
                    world_rank=world_rank,
                    local_rank=local_rank_map[world_rank],
                    train_func=train_func,
                    checkpoint=checkpoint_dict))

        self.get_with_failure_handling(futures)

        # Run the training function asynchronously in its own thread.
        def train_async():
            session = get_session()
            session.start()

        self.worker_group.execute_async(train_async)
Ejemplo n.º 20
0
from ray.ray_constants import env_integer

DASHBOARD_LOG_FILENAME = "dashboard.log"
DASHBOARD_AGENT_PORT_PREFIX = "DASHBOARD_AGENT_PORT_PREFIX:"
DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS = 2
RETRY_REDIS_CONNECTION_TIMES = 10
CONNECT_REDIS_INTERNAL_SECONDS = 2
PURGE_DATA_INTERVAL_SECONDS = 60 * 10
ORGANIZE_DATA_INTERVAL_SECONDS = 2
DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
GCS_SERVER_ADDRESS = "GcsServerAddress"
# GCS check alive
GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR = env_integer(
    "GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR", 10)
GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer(
    "GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
GCS_CHECK_ALIVE_RPC_TIMEOUT = env_integer("GCS_CHECK_ALIVE_RPC_TIMEOUT", 10)
GCS_RETRY_CONNECT_INTERVAL_SECONDS = env_integer(
    "GCS_RETRY_CONNECT_INTERVAL_SECONDS", 2)
# aiohttp_cache
AIOHTTP_CACHE_TTL_SECONDS = 2
AIOHTTP_CACHE_MAX_SIZE = 128
AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"
# Named signals
SIGNAL_NODE_INFO_FETCHED = "node_info_fetched"
SIGNAL_NODE_SUMMARY_FETCHED = "node_summary_fetched"
SIGNAL_JOB_INFO_FETCHED = "job_info_fetched"
SIGNAL_WORKER_INFO_FETCHED = "worker_info_fetched"
# Default value for datacenter (the default value in protobuf)
DEFAULT_LANGUAGE = "PYTHON"