# Import paths below assume the Ray 2.x AIR layout.
import pytest

import ray
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.air.config import ScalingConfig
from ray.air.constants import PREPROCESSOR_KEY
from ray.data.preprocessor import Preprocessor
from ray.train.data_parallel_trainer import DataParallelTrainer


def test_resume_from_checkpoint(ray_start_4_cpus, tmpdir):
    def train_func():
        checkpoint = session.get_checkpoint()
        if checkpoint:
            epoch = checkpoint.to_dict()["epoch"]
        else:
            epoch = 0
        for i in range(epoch, epoch + 2):
            session.report(
                {"epoch": i}, checkpoint=Checkpoint.from_dict({"epoch": i})
            )

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func, scaling_config=scale_config
    )
    result = trainer.fit()
    assert result.checkpoint.to_dict()["epoch"] == 1

    # Move the checkpoint to a different directory.
    checkpoint_dict = result.checkpoint.to_dict()
    checkpoint = Checkpoint.from_dict(checkpoint_dict)
    checkpoint_path = checkpoint.to_directory(tmpdir)
    resume_from = Checkpoint.from_directory(checkpoint_path)

    # Resuming from the epoch-1 checkpoint should train for two more epochs.
    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scale_config,
        resume_from_checkpoint=resume_from,
    )
    result = trainer.fit()
    assert result.checkpoint.to_dict()["epoch"] == 2
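# Shared scaffolding assumed by the tests in this file (a minimal sketch, not
# the upstream definitions): a fixture that stands up a local 4-CPU Ray
# cluster, and the two-worker ScalingConfig referenced as `scale_config`.
@pytest.fixture
def ray_start_4_cpus():
    address_info = ray.init(num_cpus=4)
    yield address_info
    # Shut the cluster down so tests stay isolated from one another.
    ray.shutdown()


scale_config = ScalingConfig(num_workers=2)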
def test_datasets(ray_start_4_cpus):
    num_train_data = 10
    num_val_data = 6

    train_dataset = ray.data.range(num_train_data)
    val_dataset = ray.data.range(num_val_data)

    def get_dataset():
        # The "train" dataset should be sharded evenly across workers.
        train_dataset = session.get_dataset_shard("train")
        assert train_dataset.count() == num_train_data / scale_config.num_workers

        # All other datasets should not be sharded.
        val_dataset = session.get_dataset_shard("val")
        assert val_dataset.count() == num_val_data

    trainer = DataParallelTrainer(
        train_loop_per_worker=get_dataset,
        scaling_config=scale_config,
        datasets={"train": train_dataset, "val": val_dataset},
    )
    trainer.fit()
def test_scaling_config(ray_start_4_cpus):
    def train_func():
        # Each of the two workers reserves one CPU, and the trainer actor
        # typically takes a third, leaving one of the four CPUs free.
        assert ray.available_resources()["CPU"] == 1
        session.report({"loss": 1})

    # Before training starts, all four CPUs are available.
    assert ray.available_resources()["CPU"] == 4
    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        scaling_config=ScalingConfig(num_workers=2),
    )
    trainer.fit()
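# `test_run` below instantiates a `TestConfig` backend configuration. A
# minimal stand-in is sketched here, assuming the no-op backend pattern used
# in Ray's Train test utilities; the upstream definition may differ.
from ray.train.backend import Backend, BackendConfig


class TestBackend(Backend):
    def on_start(self, worker_group, backend_config):
        pass

    def on_shutdown(self, worker_group, backend_config):
        pass


class TestConfig(BackendConfig):
    @property
    def backend_cls(self):
        return TestBackend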
def test_run(ray_start_4_cpus):
    """Tests that Train can be run without any specific backends."""
    num_workers = 2
    key = "value"
    value = 1
    config = TestConfig()

    def train_func():
        checkpoint = session.get_checkpoint()
        session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint)
        return checkpoint.to_dict()[key]

    checkpoint = Checkpoint.from_dict(
        {
            # This would be set during checkpoint saving.
            "_current_checkpoint_id": 1,
            key: value,
        }
    )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=config,
        resume_from_checkpoint=checkpoint,
        scaling_config=ScalingConfig(num_workers=num_workers),
    )
    results = trainer.fit()
    assert results.checkpoint.to_dict()[key] == checkpoint.to_dict()[key]
def test_fit_train(ray_start_4_cpus):
    def train_func():
        session.report({"loss": 1})

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func, scaling_config=scale_config
    )
    assert trainer.fit().metrics["loss"] == 1
def test_bad_return_in_train_loop(ray_start_4_cpus):
    """Tests that values returned from the train loop are discarded."""

    # Simulates what happens with e.g. torch models, which may fail to
    # serialize.
    class FailOnUnpickle:
        def __reduce__(self):
            raise RuntimeError("Failing")

    def train_loop(config):
        session.report({"loss": 1})
        return FailOnUnpickle()

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_loop, scaling_config=scale_config
    )

    # No exception should be raised here.
    trainer.fit()
def test_checkpoint(ray_start_4_cpus):
    def train_func():
        for i in range(3):
            session.report(
                {"epoch": i}, checkpoint=Checkpoint.from_dict({"model": i})
            )

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func, scaling_config=scale_config
    )
    result = trainer.fit()
    # The last reported checkpoint wins.
    assert result.checkpoint.to_dict()["model"] == 2
def test_fit_train_config(ray_start_4_cpus):
    def train_func(config):
        session.report({"loss": config["x"]})

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scale_config,
        train_loop_config={"x": 100},
    )
    assert trainer.fit().metrics["loss"] == 100
def test_preprocessor_in_checkpoint(ray_start_4_cpus):
    class DummyPreprocessor(Preprocessor):
        def __init__(self):
            super().__init__()
            self.is_same = True

    def train_func():
        for i in range(3):
            session.report(
                {"epoch": i}, checkpoint=Checkpoint.from_dict({"model": i})
            )

    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scale_config,
        preprocessor=DummyPreprocessor(),
    )
    result = trainer.fit()
    assert result.checkpoint.to_dict()["model"] == 2
    # The trainer should attach the preprocessor to the final checkpoint.
    assert result.checkpoint.to_dict()[PREPROCESSOR_KEY].is_same
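# Assumed entry point, following the convention used across Ray's test suite.
if __name__ == "__main__":
    import sys

    sys.exit(pytest.main(["-v", "-x", __file__]))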