Example #1
0
    def __init__(
        self,
        backend_executor: Union[BackendExecutor, ActorWrapper],
        backend_config: BackendConfig,
        train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]],
        dataset_spec: RayDatasetSpec,
        checkpoint_manager: CheckpointManager,
        checkpoint: Optional[Union[Dict, str, Path, Checkpoint]],
        checkpoint_strategy: Optional[CheckpointConfig],
        run_dir: Optional[Path] = None,
    ):
        """Record the collaborators for a training run and launch it.

        Every argument except ``backend_config`` and ``checkpoint`` is kept on
        the instance. ``self._start_training`` is called before the
        constructor returns, so an exception raised there propagates out of
        ``__init__``.

        Args:
            backend_executor: Executor, or an actor wrapper around one;
                stored as ``_backend_executor``.
            backend_config: Used only to build the backend instance through
                ``backend_config.backend_cls()``; the config is not retained.
            train_func: Callable taking either no arguments or one config
                dict (per its annotation); stored and passed to
                ``_start_training``.
            dataset_spec: Stored as ``_dataset_spec`` and passed to
                ``_start_training``.
            checkpoint_manager: Stored as ``_checkpoint_manager``.
            checkpoint: Passed to ``_start_training`` only; presumably the
                checkpoint to resume from -- confirm against
                ``_start_training``.
            checkpoint_strategy: Stored as ``_checkpoint_strategy`` and
                passed to ``_start_training``.
            run_dir: Optional directory stored as ``_run_dir`` and passed to
                ``_start_training``.
        """
        # Execution machinery.
        self._backend_executor = backend_executor
        self._backend = backend_config.backend_cls()

        # Checkpoint handling.
        self._checkpoint_manager = checkpoint_manager
        self._checkpoint_strategy = checkpoint_strategy

        # Inputs describing what to run and where.
        self._train_func = train_func
        self._dataset_spec = dataset_spec
        self._run_dir = run_dir

        # All attributes above are set before the launch call, in case
        # ``_start_training`` reads them.
        launch_kwargs = dict(
            train_func=train_func,
            run_dir=run_dir,
            dataset_spec=self._dataset_spec,
            checkpoint=checkpoint,
            checkpoint_strategy=checkpoint_strategy,
        )
        self._start_training(**launch_kwargs)

        # Result bookkeeping is initialised only after the launch call returns.
        self._final_results, self._finished_training = None, False
Example #2
0
    def __init__(
        self,
        backend_config: BackendConfig,
        # TODO(xwjiang): Legacy Ray Train trainer clean up!
        trial_info: Optional[TrialInfo] = None,
        num_workers: int = 1,
        num_cpus_per_worker: float = 1,
        num_gpus_per_worker: float = 0,
        additional_resources_per_worker: Optional[Dict[str, float]] = None,
        max_retries: int = 3,
    ):
        """Set up executor state without starting any workers.

        Args:
            backend_config: Stored as ``_backend_config``; its
                ``backend_cls()`` is also called to build ``_backend``.
            trial_info: Optional trial metadata, stored as ``_trial_info``.
            num_workers: Stored as ``_num_workers``.
            num_cpus_per_worker: Stored as ``_num_cpus_per_worker``.
            num_gpus_per_worker: Stored as ``_num_gpus_per_worker``.
            additional_resources_per_worker: Optional mapping of resource
                name to amount, stored unchanged.
            max_retries: Failure budget stored as ``_max_failures``. Any
                negative value is replaced by ``float("inf")``.
        """
        self._backend_config = backend_config
        self._backend = backend_config.backend_cls()

        # Requested worker count and per-worker resources.
        self._num_workers = num_workers
        self._num_cpus_per_worker = num_cpus_per_worker
        self._num_gpus_per_worker = num_gpus_per_worker
        self._additional_resources_per_worker = additional_resources_per_worker

        # Failure accounting: a negative budget is normalised to infinity.
        self._max_failures = float("inf") if max_retries < 0 else max_retries
        self._num_failures = 0

        self._trial_info = trial_info

        # Nothing is running yet: no hook, no placement group, no shards,
        # and a placeholder worker group.
        self._initialization_hook = None
        self._placement_group = None
        self.worker_group = InactiveWorkerGroup()
        self.dataset_shards = None
Example #3
0
    def __init__(
        self,
        backend_executor_actor: ActorHandle,
        backend_config: BackendConfig,
        train_func: Union[Callable[[], T], Callable[[Dict[str, Any]], T]],
        run_dir: Path,
        dataset: Optional[Union[RayDataset, Dict[str, RayDataset]]],
        checkpoint_manager: CheckpointManager,
        checkpoint: Optional[Union[Dict, str, Path]],
        checkpoint_strategy: Optional[CheckpointStrategy],
    ):
        """Record the collaborators for a training run and launch it.

        Every argument except ``backend_config`` and ``checkpoint`` is kept on
        the instance. ``self._start_training`` is called before the
        constructor returns, so an exception raised there propagates out of
        ``__init__``.

        Args:
            backend_executor_actor: Actor handle stored as
                ``_backend_executor_actor``.
            backend_config: Used only to build the backend instance through
                ``backend_config.backend_cls()``; the config is not retained.
            train_func: Callable taking either no arguments or one config
                dict (per its annotation); stored and passed to
                ``_start_training``.
            run_dir: Directory stored as ``_run_dir`` and passed to
                ``_start_training``.
            dataset: Optional single dataset or name-to-dataset mapping;
                stored as ``_dataset`` and passed to ``_start_training``.
            checkpoint_manager: Stored as ``_checkpoint_manager``.
            checkpoint: Passed to ``_start_training`` only; presumably the
                checkpoint to resume from -- confirm against
                ``_start_training``.
            checkpoint_strategy: Stored as ``_checkpoint_strategy`` and
                passed to ``_start_training``.
        """
        # Execution machinery.
        self._backend_executor_actor = backend_executor_actor
        self._backend = backend_config.backend_cls()

        # Checkpoint handling.
        self._checkpoint_manager = checkpoint_manager
        self._checkpoint_strategy = checkpoint_strategy

        # Inputs describing what to run and where.
        self._train_func = train_func
        self._dataset = dataset
        self._run_dir = run_dir

        # All attributes above are set before the launch call, in case
        # ``_start_training`` reads them.
        launch_kwargs = dict(
            train_func=train_func,
            run_dir=run_dir,
            dataset=dataset,
            checkpoint=checkpoint,
            checkpoint_strategy=checkpoint_strategy,
        )
        self._start_training(**launch_kwargs)

        # Result bookkeeping is initialised only after the launch call returns.
        self._final_results, self._finished_training = None, False