Example 1
def test_placement_group_pack(ray_4_node_4_cpu, num_workers):
    """Tests that workers are packed on nodes."""
    config = TestConfig()
    e = BackendExecutor(config, num_workers=num_workers)
    e.start()
    node_id_set = get_node_id_set()
    assert len(node_id_set) == math.ceil(num_workers / 4)
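The ray_4_node_4_cpu fixture and the get_node_id_set helper are defined elsewhere in the test module and are not shown in this excerpt. A minimal sketch of what they might look like, assuming Ray's Cluster test utility and the ray.state actor table (names and details are assumptions, not the original definitions):

import pytest
import ray
from ray.cluster_utils import Cluster

@pytest.fixture
def ray_4_node_4_cpu():
    # Simulate a 4-node cluster with 4 CPUs per node, torn down after the test.
    cluster = Cluster()
    for _ in range(4):
        cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)
    yield
    ray.shutdown()
    cluster.shutdown()

def get_node_id_set():
    # Collect the IDs of the nodes hosting the training worker actors.
    node_id_set = set()
    for actor_info in ray.state.actors().values():
        node_id_set.add(actor_info["Address"]["NodeID"])
    return node_id_set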
Example 2
def test_placement_group_spread(ray_4_node_4_cpu, num_workers):
    """Tests that workers are spread across nodes."""
    os.environ[TRAIN_ENABLE_WORKER_SPREAD_ENV] = "1"
    config = TestConfig()
    e = BackendExecutor(config, num_workers=num_workers)
    e.start()
    node_id_set = get_node_id_set()
    assert len(node_id_set) == min(num_workers, 4)
Example 3
def test_shutdown(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    assert len(e.worker_group) == 2
    e.shutdown()
    with pytest.raises(InactiveWorkerGroupError):
        e.start_training(lambda: 1)
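Most of the remaining tests rely on a ray_start_2_cpus fixture and a TestConfig backend configuration defined elsewhere in the module. A minimal sketch, assuming the standard Ray test-fixture pattern and the Backend/BackendConfig interfaces from ray.train.backend (the class bodies are assumptions):

import pytest
import ray
from ray.train.backend import Backend, BackendConfig

@pytest.fixture
def ray_start_2_cpus():
    # Local Ray instance with 2 CPUs, shut down after each test.
    ray.init(num_cpus=2)
    yield
    ray.shutdown()

class TestBackend(Backend):
    # No-op backend: the executor's lifecycle is what is under test.
    def on_start(self, worker_group, backend_config):
        pass

    def on_shutdown(self, worker_group, backend_config):
        pass

class TestConfig(BackendConfig):
    @property
    def backend_cls(self):
        return TestBackend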
Example 4
def test_torch_start_shutdown(ray_start_2_cpus, init_method):
    torch_config = TorchConfig(backend="gloo", init_method=init_method)
    e = BackendExecutor(torch_config, num_workers=2)
    e.start()

    def check_process_group():
        import torch
        return (torch.distributed.is_initialized()
                and torch.distributed.get_world_size() == 2)

    e.start_training(check_process_group)
    assert all(e.finish_training())

    e._backend.on_shutdown(e.worker_group, e._backend_config)

    e.start_training(check_process_group)
    assert not any(e.finish_training())
Example 5
def test_train(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    e.start_training(lambda: 1)
    assert e.finish_training() == [1, 1]
Example 6
def test_train(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    e.start_training(lambda: 1, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    assert e.finish_training() == [1, 1]
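EMPTY_RAY_DATASET_SPEC is not defined in this excerpt; judging from the _RayDatasetSpec constructor used later in Example 13, it is presumably a spec with no datasets attached, e.g.:

# Assumed definition; _RayDatasetSpec would be imported from the same module
# as in Example 13 (the exact import path is not shown in the excerpt).
EMPTY_RAY_DATASET_SPEC = _RayDatasetSpec(dataset_or_dict=None)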
Example 7
def test_mismatch_checkpoint_report(ray_start_2_cpus):
    def train_func():
        if train.world_rank() == 0:
            train.save_checkpoint(epoch=0)
        else:
            train.report(iter=0)

    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    e.start_training(train_func)
    with pytest.raises(RuntimeError):
        e.get_next_results()
Example 8
def test_worker_failure(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_fail():
        ray.actor.exit_actor()

    new_execute_func = gen_execute_special(train_fail)
    with patch.object(WorkerGroup, "execute_async", new_execute_func):
        with pytest.raises(TrainingWorkerError):
            e.start_training(lambda: 1)
            e.finish_training()
Example 9
def test_local_ranks(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_func():
        return train.local_rank()

    e.start_training(train_func, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    assert set(e.finish_training()) == {0, 1}
Example 10
def test_local_ranks(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    def train_func():
        return train.local_rank()

    e.start_training(train_func)
    assert set(e.finish_training()) == {0, 1}
Example 11
def test_start(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    with pytest.raises(InactiveWorkerGroupError):
        e.start_training(lambda: 1, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    e.start()
    assert len(e.worker_group) == 2
Example 12
def test_start(ray_start_2_cpus, tmp_path):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    with pytest.raises(InactiveWorkerGroupError):
        e.start_training(lambda: 1, run_dir=tmp_path)
    e.start()
    assert len(e.worker_group) == 2
Example 13
    def training_loop(self) -> None:
        scaling_config_dataclass = self._validate_and_get_scaling_config_data_class(
            self.scaling_config
        )

        train_loop_per_worker = construct_train_func(
            self.train_loop_per_worker,
            self.train_loop_config,
            fn_arg_name="train_loop_per_worker",
        )

        additional_resources_per_worker = (
            scaling_config_dataclass.additional_resources_per_worker
        )

        backend_executor = BackendExecutor(
            backend_config=self.backend_config,
            num_workers=scaling_config_dataclass.num_workers,
            num_cpus_per_worker=scaling_config_dataclass.num_cpus_per_worker,
            num_gpus_per_worker=scaling_config_dataclass.num_gpus_per_worker,
            additional_resources_per_worker=additional_resources_per_worker,
            max_retries=0,
        )

        checkpoint_manager = self._checkpoint_manager_cls()
        checkpoint_manager.on_init(preprocessor=self.preprocessor)

        # Start the remote actors.
        backend_executor.start(initialization_hook=None)

        if self.resume_from_checkpoint:
            resume_checkpoint_dict = self.resume_from_checkpoint.to_dict()
        else:
            resume_checkpoint_dict = None

        dataset_spec = _RayDatasetSpec(
            dataset_or_dict=self.datasets, dataset_split_fn=_default_dataset_split_fn
        )

        # TODO(amog): Have TrainingIterator also accept a checkpoint ObjectRef instead
        #  of just a Dict.
        training_iterator = TrainingIterator(
            backend_executor=backend_executor,
            backend_config=self.backend_config,
            train_func=train_loop_per_worker,
            dataset_spec=dataset_spec,
            checkpoint_manager=checkpoint_manager,
            checkpoint=resume_checkpoint_dict,
            checkpoint_strategy=None,
        )

        for results in training_iterator:
            # TODO(ml-team): add ability to report results from multiple workers.
            first_worker_results = results[0]

            tune.report(**first_worker_results)

        # Shutdown workers.
        backend_executor.shutdown()
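construct_train_func is imported from Ray Train's utilities; conceptually it validates the signature of the user-supplied training loop and binds the config so the workers can invoke it with no arguments. A rough, simplified sketch of that behavior (not the actual implementation):

import inspect

def construct_train_func(train_func, config, fn_arg_name="train_func"):
    # Reject functions that take more than one positional argument.
    num_params = len(inspect.signature(train_func).parameters)
    if num_params > 1:
        raise ValueError(
            f"{fn_arg_name} should take 0 or 1 arguments, got {num_params}.")
    # Bind the config dict so the executor can call the function with no args.
    if num_params == 1:
        return lambda: train_func(config if config is not None else {})
    return train_func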
Example 14
def test_initialization_hook(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)

    def init_hook():
        import os
        os.environ["TEST"] = "1"

    e.start(initialization_hook=init_hook)

    def check():
        import os
        return os.getenv("TEST", "0")

    e.start_training(check)
    assert e.finish_training() == ["1", "1"]
Example 15
def test_cuda_visible_devices_multiple(ray_2_node_4_gpu, worker_results):
    config = TestConfig()

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    num_workers, expected_results = worker_results

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"
    e = BackendExecutor(
        config, num_workers=num_workers, num_cpus_per_worker=0, num_gpus_per_worker=2
    )
    e.start()
    e.start_training(get_resources, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    results = e.finish_training()
    results.sort()
    assert results == expected_results
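The worker_results argument here (and in the next example) is presumably supplied by a pytest parametrization pairing a worker count with the expected sorted CUDA_VISIBLE_DEVICES strings. A structural sketch with placeholder values (not the real expectations):

import pytest

@pytest.mark.parametrize(
    "worker_results",
    [
        # (num_workers, expected sorted CUDA_VISIBLE_DEVICES, one per worker);
        # the value below is illustrative only, further pairs would follow.
        (1, ["0,1"]),
    ],
)
def test_cuda_visible_devices_multiple(ray_2_node_4_gpu, worker_results):
    ...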
Example 16
def test_cuda_visible_devices_fractional(ray_2_node_2_gpu, worker_results):
    config = TestConfig()

    def get_resources():
        return os.environ["CUDA_VISIBLE_DEVICES"]

    num_workers, expected_results = worker_results

    os.environ[ENABLE_SHARE_CUDA_VISIBLE_DEVICES_ENV] = "1"
    e = BackendExecutor(config,
                        num_workers=num_workers,
                        num_cpus_per_worker=0,
                        num_gpus_per_worker=0.5)
    e.start()
    e.start_training(get_resources)
    results = e.finish_training()
    results.sort()
    assert results == expected_results
Example 17
def test_tensorflow_start(ray_start_2_cpus):
    num_workers = 2
    tensorflow_config = TensorflowConfig()
    e = BackendExecutor(tensorflow_config, num_workers=num_workers)
    e.start()

    def get_tf_config():
        import json
        import os
        return json.loads(os.environ["TF_CONFIG"])

    e.start_training(get_tf_config)
    results = e.finish_training()
    assert len(results) == num_workers

    workers = [result["cluster"]["worker"] for result in results]
    assert all(worker == workers[0] for worker in workers)

    indexes = [result["task"]["index"] for result in results]
    assert len(set(indexes)) == num_workers
Example 18
    def training_loop(self) -> None:
        scaling_config_dataclass = ScalingConfigDataClass(
            **self.scaling_config)

        train_loop_per_worker = construct_train_func(
            self.train_loop_per_worker,
            self.train_loop_config,
            fn_arg_name="train_loop_per_worker",
        )

        additional_resources_per_worker = (
            scaling_config_dataclass.additional_resources_per_worker)

        backend_executor = BackendExecutor(
            backend_config=self.backend_config,
            num_workers=scaling_config_dataclass.num_workers,
            num_cpus_per_worker=scaling_config_dataclass.num_cpus_per_worker,
            num_gpus_per_worker=scaling_config_dataclass.num_gpus_per_worker,
            additional_resources_per_worker=additional_resources_per_worker,
            max_retries=0,
        )

        checkpoint_manager = _DataParallelCheckpointManager()
        checkpoint_manager.on_init(preprocessor=self.preprocessor)

        # Start the remote actors.
        backend_executor.start(initialization_hook=None)

        if self.resume_from_checkpoint:
            resume_checkpoint_dict = self.resume_from_checkpoint.to_dict()
        else:
            resume_checkpoint_dict = None

        # Tell Ray Train to only shard the train dataset and not the other datasets.
        # This is purely an implementation detail and users do not need to know about
        # this.
        # TODO(amog): Refactor this to remove hack and make this more modular.
        #  TrainingIterator should accept a generic custom_ingest_func that contains
        #  the logic for how to split the Datasets.
        updated_dataset_dict = {}
        for key, value in self.datasets.items():
            if key == TRAIN_DATASET_KEY:
                updated_dataset_dict[key] = value
            else:
                # Ray Train will strip out the added string before exposing to users.
                updated_dataset_dict[key + "_NO-SHARD"] = value

        # TODO(amog): Have TrainingIterator also accept a checkpoint ObjectRef instead
        #  of just a Dict.
        training_iterator = TrainingIterator(
            backend_executor=backend_executor,
            backend_config=self.backend_config,
            train_func=train_loop_per_worker,
            dataset=updated_dataset_dict if updated_dataset_dict else None,
            checkpoint_manager=checkpoint_manager,
            checkpoint=resume_checkpoint_dict,
            checkpoint_strategy=None,
        )

        for results in training_iterator:
            # TODO(ml-team): add ability to report results from multiple workers.
            first_worker_results = results[0]

            tune.report(**first_worker_results)

        # Shutdown workers.
        backend_executor.shutdown()
Example 19
def test():
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    e.start_training(train_func)
    return e.finish_training()
Example 20
def test():
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()
    e.start_training(train_func, dataset_spec=EMPTY_RAY_DATASET_SPEC)
    return e.finish_training()
Example 21
def test_train_failure(ray_start_2_cpus):
    config = TestConfig()
    e = BackendExecutor(config, num_workers=2)
    e.start()

    with pytest.raises(TrainBackendError):
        e.get_next_results()

    with pytest.raises(TrainBackendError):
        e.pause_reporting()

    with pytest.raises(TrainBackendError):
        e.finish_training()

    e.start_training(lambda: 1)

    with pytest.raises(TrainBackendError):
        e.start_training(lambda: 2)

    assert e.finish_training() == [1, 1]