Ejemplo n.º 1
0
    def training_loop(self) -> None:
        scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
            self.scaling_config
        )

        train_loop_per_worker = construct_train_func(
            self.train_loop_per_worker,
            self.train_loop_config,
            fn_arg_name="train_loop_per_worker",
        )

        additional_resources_per_worker = (
            scaling_config_dataclass.additional_resources_per_worker
        )

        backend_executor = BackendExecutor(
            backend_config=self.backend_config,
            num_workers=scaling_config_dataclass.num_workers,
            num_cpus_per_worker=scaling_config_dataclass.num_cpus_per_worker,
            num_gpus_per_worker=scaling_config_dataclass.num_gpus_per_worker,
            additional_resources_per_worker=additional_resources_per_worker,
            max_retries=0,
        )

        checkpoint_manager = self._checkpoint_manager_cls()
        checkpoint_manager.on_init(preprocessor=self.preprocessor)

        # Start the remote actors.
        backend_executor.start(initialization_hook=None)

        if self.resume_from_checkpoint:
            resume_checkpoint_dict = self.resume_from_checkpoint.to_dict()
        else:
            resume_checkpoint_dict = None

        dataset_spec = _RayDatasetSpec(
            dataset_or_dict=self.datasets, dataset_split_fn=_default_dataset_split_fn
        )

        # TODO(amog): Have TrainingIterator also accept a checkpoint ObjectRef instead
        #  of just a Dict.
        training_iterator = TrainingIterator(
            backend_executor=backend_executor,
            backend_config=self.backend_config,
            train_func=train_loop_per_worker,
            dataset_spec=dataset_spec,
            checkpoint_manager=checkpoint_manager,
            checkpoint=resume_checkpoint_dict,
            checkpoint_strategy=None,
        )

        for results in training_iterator:
            # TODO(ml-team): add ability to report results from multiple workers.
            first_worker_results = results[0]

            tune.report(**first_worker_results)

        # Shutdown workers.
        backend_executor.shutdown()
Ejemplo n.º 2
0
def test_shutdown(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    assert len(e.worker_group) == 2
    e.shutdown()
    with pytest.raises(InactiveWorkerGroupError):
        e.start_training(lambda: 1)
Ejemplo n.º 3
0
    def training_loop(self) -> None:
        scaling_config_dataclass = ScalingConfigDataClass(
            **self.scaling_config)

        train_loop_per_worker = construct_train_func(
            self.train_loop_per_worker,
            self.train_loop_config,
            fn_arg_name="train_loop_per_worker",
        )

        additional_resources_per_worker = (
            scaling_config_dataclass.additional_resources_per_worker)

        backend_executor = BackendExecutor(
            backend_config=self.backend_config,
            num_workers=scaling_config_dataclass.num_workers,
            num_cpus_per_worker=scaling_config_dataclass.num_cpus_per_worker,
            num_gpus_per_worker=scaling_config_dataclass.num_gpus_per_worker,
            additional_resources_per_worker=additional_resources_per_worker,
            max_retries=0,
        )

        checkpoint_manager = _DataParallelCheckpointManager()
        checkpoint_manager.on_init(preprocessor=self.preprocessor)

        # Start the remote actors.
        backend_executor.start(initialization_hook=None)

        if self.resume_from_checkpoint:
            resume_checkpoint_dict = self.resume_from_checkpoint.to_dict()
        else:
            resume_checkpoint_dict = None

        # Tell Ray Train to only shard the train dataset and not the other datasets.
        # This is purely an implementation detail and users do not need to know about
        # this.
        # TODO(amog): Refactor this to remove hack and make this more modular.
        #  TrainingIterator should accept a generic custom_ingest_func that contains
        #  the logic for how to split the Datasets.
        updated_dataset_dict = {}
        for key, value in self.datasets.items():
            if key == TRAIN_DATASET_KEY:
                updated_dataset_dict[key] = value
            else:
                # Ray Train will strip out the added string before exposing to users.
                updated_dataset_dict[key + "_NO-SHARD"] = value

        # TODO(amog): Have TrainingIterator also accept a checkpoint ObjectRef instead
        #  of just a Dict.
        training_iterator = TrainingIterator(
            backend_executor=backend_executor,
            backend_config=self.backend_config,
            train_func=train_loop_per_worker,
            dataset=updated_dataset_dict
            if len(updated_dataset_dict) > 0 else None,
            checkpoint_manager=checkpoint_manager,
            checkpoint=resume_checkpoint_dict,
            checkpoint_strategy=None,
        )

        for results in training_iterator:
            # TODO(ml-team): add ability to report results from multiple workers.
            first_worker_results = results[0]

            tune.report(**first_worker_results)

        # Shutdown workers.
        backend_executor.shutdown()