def test_tuner_with_torch_trainer(self):
    """Test a successful run using torch trainer."""
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
    )
    # The following two should be tunable.
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 10}
    scaling_config = ScalingConfig(num_workers=1, use_gpu=False)
    trainer = TorchTrainer(
        train_loop_per_worker=linear_train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    param_space = {
        "scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2])),
        "train_loop_config": {
            "batch_size": tune.grid_search([4, 8]),
            "epochs": tune.grid_search([5, 10]),
        },
    }
    tuner = Tuner(
        trainable=trainer,
        # Name must match the results directory removed above.
        run_config=RunConfig(name="test_tuner_torch"),
        param_space=param_space,
        tune_config=TuneConfig(mode="min", metric="loss"),
    )
    results = tuner.fit()
    # 2 x 2 x 2 grid search values = 8 trials.
    assert len(results) == 8
def test_arg_override(ray_start_4_cpus):
    def check_override(self):
        assert self.scaling_config.num_workers == 1
        # Should do deep update.
        assert not self.custom_arg["outer"]["inner"]
        assert self.custom_arg["outer"]["fixed"] == 1
        # Should merge with base config.
        assert self.preprocessor.original

        pg = get_current_placement_group()
        assert len(pg.bundle_specs) == 2  # 1 trainer, 1 worker

    preprocessor = DummyPreprocessor()
    preprocessor.original = True
    scale_config = ScalingConfig(num_workers=4)
    trainer = DummyTrainer(
        check_override,
        custom_arg={"outer": {"inner": True, "fixed": 1}},
        preprocessor=preprocessor,
        scaling_config=scale_config,
    )

    new_config = {
        "custom_arg": {"outer": {"inner": False}},
        "scaling_config": ScalingConfig(num_workers=1),
    }

    tune.run(trainer.as_trainable(), config=new_config)
def test_scaling_config_pgf_equivalence(
    trainer_resources, resources_per_worker_and_use_gpu, num_workers, placement_strategy
):
    resources_per_worker, use_gpu = resources_per_worker_and_use_gpu
    scaling_config = ScalingConfig(
        trainer_resources=trainer_resources,
        num_workers=num_workers,
        resources_per_worker=resources_per_worker,
        use_gpu=use_gpu,
        placement_strategy=placement_strategy,
    )
    pgf = scaling_config.as_placement_group_factory()
    scaling_config_from_pgf = ScalingConfig.from_placement_group_factory(pgf)
    # The round trip must be lossless in both directions.
    assert scaling_config == scaling_config_from_pgf
    assert scaling_config_from_pgf.as_placement_group_factory() == pgf
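# A minimal sketch of the round trip exercised above, assuming the default
# trainer bundle of {"CPU": 1}; the exact bundle layout may vary across Ray
# versions, so treat the printed output as illustrative only.
from ray.air.config import ScalingConfig

pgf = ScalingConfig(
    num_workers=2, resources_per_worker={"CPU": 1}
).as_placement_group_factory()
# One bundle for the trainer plus one per worker,
# e.g. [{"CPU": 1}, {"CPU": 1}, {"CPU": 1}].
print(pgf.bundle_specs)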
def test_torch_e2e_state_dict(ray_start_4_cpus):
    def train_func():
        model = torch.nn.Linear(1, 1).state_dict()
        session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model)))

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TorchTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    # If loading from a state dict, a model definition must be passed in.
    with pytest.raises(ValueError):
        TorchPredictor.from_checkpoint(result.checkpoint)

    class TorchScorer:
        def __init__(self):
            self.pred = TorchPredictor.from_checkpoint(
                result.checkpoint, model=torch.nn.Linear(1, 1)
            )

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predict_dataset = ray.data.range(3)
    predictions = predict_dataset.map_batches(
        TorchScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
def test_run(ray_start_4_cpus):
    """Tests that Train can be run without any specific backends."""
    num_workers = 2
    key = "value"
    value = 1
    config = TestConfig()

    def train_func():
        checkpoint = session.get_checkpoint()
        session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint)
        return checkpoint.to_dict()[key]

    checkpoint = Checkpoint.from_dict(
        {
            # This would be set during checkpoint saving.
            "_current_checkpoint_id": 1,
            key: value,
        }
    )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=config,
        resume_from_checkpoint=checkpoint,
        scaling_config=ScalingConfig(num_workers=num_workers),
    )
    results = trainer.fit()

    assert results.checkpoint.to_dict()[key] == checkpoint.to_dict()[key]
def test_reuse_checkpoint(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = session.get_checkpoint()
        if ckpt is not None:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, config["max_iter"]):
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 5}},
    )
    [trial] = tuner.fit()._experiment_analysis.trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4

    # `Tuner.restore` is a classmethod: it rebuilds the Tuner from the
    # experiment directory, so the new `param_space` passed to the
    # constructor here does not take effect.
    tuner = Tuner(
        trainer,
        param_space={"train_loop_config": {"max_iter": 10}},
    ).restore(trial.local_dir)
    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    # The restored trial already finished at 5 iterations, so no further
    # training iterations are reported.
    assert len(trial_dfs[0]["training_iteration"]) == 5
def tune_linear(num_workers, num_samples, use_gpu):
    train_dataset, val_dataset = get_datasets()

    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={"train": train_dataset, "validation": val_dataset},
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "lr": tune.loguniform(1e-4, 1e-1),
                "batch_size": tune.choice([4, 16, 32]),
                "epochs": 3,
            }
        },
        tune_config=TuneConfig(num_samples=num_samples, metric="loss", mode="min"),
    )
    result_grid = tuner.fit()
    best_result = result_grid.get_best_result()
    print(best_result)
    return best_result
def test_report_and_load_using_ml_session(ray_start_4_cpus):
    def train_func():
        if session.get_checkpoint():
            with session.get_checkpoint().as_directory() as checkpoint_dir:
                import tensorflow as tf

                model = tf.keras.models.load_model(checkpoint_dir)
        else:
            model = build_model()

        model.save("my_model", overwrite=True)
        session.report(
            metrics={"iter": 1}, checkpoint=Checkpoint.from_directory("my_model")
        )

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    trainer2 = TensorflowTrainer(
        train_loop_per_worker=train_func,
        scaling_config=scaling_config,
        resume_from_checkpoint=result.checkpoint,
    )
    result = trainer2.fit()
    checkpoint = result.checkpoint
    with checkpoint.as_directory() as ckpt_dir:
        assert os.path.exists(os.path.join(ckpt_dir, "saved_model.pb"))
    assert result.metrics["iter"] == 1
def train_torch_ray_air(
    *,
    config: dict,
    num_workers: int = 4,
    cpus_per_worker: int = 8,
    use_gpu: bool = False,
) -> Tuple[float, float]:
    # This function is kicked off by the main() function and runs a full
    # training run using Ray AIR.
    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    def train_loop(config):
        train_func(use_ray=True, config=config)

    start_time = time.monotonic()
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            trainer_resources={"CPU": 0},
            num_workers=num_workers,
            resources_per_worker={"CPU": cpus_per_worker},
            use_gpu=use_gpu,
        ),
    )
    result = trainer.fit()
    time_taken = time.monotonic() - start_time

    print(f"Last result: {result.metrics}")
    return time_taken, result.metrics["loss"]
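# Hypothetical invocation of the benchmark above; the config keys here are
# illustrative only and depend on what the underlying train_func expects.
if __name__ == "__main__":
    time_taken, final_loss = train_torch_ray_air(
        config={"epochs": 3, "batch_size": 64},
        num_workers=2,
        cpus_per_worker=2,
    )
    print(f"Took {time_taken:.2f}s, final loss: {final_loss:.4f}")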
def test_retry(ray_start_4_cpus):
    def train_func():
        ckpt = session.get_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            ckpt = ckpt.to_dict()
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            session.report(
                dict(test=i, training_iteration=i),
                checkpoint=Checkpoint.from_dict(dict(iter=i)),
            )

    trainer = DataParallelTrainer(
        train_func,
        backend_config=TestConfig(),
        scaling_config=ScalingConfig(num_workers=1),
    )
    tuner = Tuner(
        trainer, run_config=RunConfig(failure_config=FailureConfig(max_failures=3))
    )

    analysis = tuner.fit()._experiment_analysis
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
def __init__(
    self, num_workers: int, expect_ds: bool, expect_sizes: Optional[dict], **kwargs
):
    def train_loop_per_worker():
        data_shard = session.get_dataset_shard("train")
        if expect_ds:
            assert isinstance(data_shard, Dataset), data_shard
        else:
            assert isinstance(data_shard, DatasetPipeline), data_shard
        for k, v in expect_sizes.items():
            shard = session.get_dataset_shard(k)
            if v == -1:
                assert shard is None, shard
            else:
                if isinstance(shard, DatasetPipeline):
                    assert next(shard.iter_epochs()).count() == v, shard
                else:
                    assert shard.count() == v, shard

    kwargs.pop("scaling_config", None)
    super().__init__(
        train_loop_per_worker=train_loop_per_worker,
        scaling_config=ScalingConfig(num_workers=num_workers),
        **kwargs,
    )
def test_data_parallel_trainer(ray_start_8_cpus):
    num_workers = 2
    trainer = AssertingDataParallelTrainer(
        train_fn, scaling_config=ScalingConfig(num_workers=num_workers)
    )
    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "num_epochs": 100,
                "metric": tune.grid_search([1, 2, 3, 4, 5]),
            }
        },
        tune_config=TuneConfig(
            mode="max",
            metric="metric",
            scheduler=ResourceChangingScheduler(
                ASHAScheduler(),
                resources_allocation_function=DistributeResources(
                    add_bundles=True, reserve_resources={"CPU": 1}
                ),
            ),
        ),
        run_config=RunConfig(failure_config=FailureConfig(fail_fast=True)),
    )
    result_grid = tuner.fit()
    assert not any(x.error for x in result_grid)
    # + 1 for the Trainable itself.
    assert result_grid.get_dataframe()["num_cpus"].max() > num_workers + 1
def test_horovod_state_dict(ray_start_4_cpus):
    def train_func(config):
        result = hvd_train_func(config)
        assert len(result) == epochs
        assert result[-1] < result[0]

    num_workers = 2
    epochs = 10
    scaling_config = ScalingConfig(num_workers=num_workers)
    config = {"num_epochs": epochs, "save_model_as_dict": True}
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    result = trainer.fit()
    predictor = TorchPredictor.from_checkpoint(result.checkpoint, model=Net())

    # Find some test data to run on.
    test_set = datasets.MNIST(
        "./data",
        train=False,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )

    test_dataloader = DataLoader(test_set, batch_size=10)
    test_dataloader_iter = iter(test_dataloader)
    # Only running batch inference on 10 images.
    images, labels = next(test_dataloader_iter)
    predicted_labels = run_image_prediction(predictor.model, images)
    assert torch.equal(predicted_labels, labels)
def _reconcile_scaling_config_with_trial_resources(
    self, scaling_config: ScalingConfig
) -> ScalingConfig:
    """
    ResourceChangingScheduler workaround.

    Ensures that the scaling config matches trial resources.

    This should be replaced with RCS returning a ScalingConfig
    in the future.
    """
    trial_resources = self.trial_resources
    # This will be false if the resources are default.
    if not isinstance(trial_resources, PlacementGroupFactory):
        return scaling_config

    if scaling_config:
        scaling_config = trainer_cls._validate_scaling_config(scaling_config)
    scaling_config_from_trial_resources = (
        ScalingConfig.from_placement_group_factory(trial_resources)
    )

    # This check should always pass if ResourceChangingScheduler is not
    # used.
    if scaling_config_from_trial_resources != scaling_config:
        scaling_config = trainer_cls._validate_scaling_config(
            scaling_config_from_trial_resources
        )
    return scaling_config
def __init__(
    self,
    *,
    scaling_config: Optional[ScalingConfig] = None,
    run_config: Optional[RunConfig] = None,
    datasets: Optional[Dict[str, GenDataset]] = None,
    preprocessor: Optional["Preprocessor"] = None,
    resume_from_checkpoint: Optional[Checkpoint] = None,
):
    self.scaling_config = (
        scaling_config if scaling_config is not None else ScalingConfig()
    )
    self.run_config = run_config if run_config is not None else RunConfig()
    self.datasets = datasets if datasets is not None else {}
    self.preprocessor = preprocessor
    self.resume_from_checkpoint = resume_from_checkpoint

    self._validate_attributes()

    if datasets and not self.scaling_config._max_cpu_fraction_per_node:
        logger.warning(
            "When passing `datasets` to a Trainer, it is recommended to "
            "reserve at least 20% of node CPUs for Dataset execution by setting "
            "`_max_cpu_fraction_per_node = 0.8` in the Trainer `scaling_config`. "
            "Not doing so can lead to resource contention or hangs. "
            "See https://docs.ray.io/en/master/data/key-concepts.html"
            "#example-datasets-in-tune for more info."
        )
def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_func,
        train_loop_config=kwargs,
        scaling_config=ScalingConfig(use_gpu=use_gpu, num_workers=num_workers),
    )
    results = trainer.fit()
    print(results.metrics)
def test_resources(ray_start_4_cpus):
    def check_cpus(self):
        # Inside the trainer, 2 of the 4 CPUs are reserved by trainer_resources.
        assert ray.available_resources()["CPU"] == 2

    assert ray.available_resources()["CPU"] == 4
    trainer = DummyTrainer(
        check_cpus, scaling_config=ScalingConfig(trainer_resources={"CPU": 2})
    )
    trainer.fit()
def test_reserved_cpu_warnings(ray_start_4_cpus):
    def train_loop(self):
        pass

    class MockLogger:
        def __init__(self):
            self.warnings = []

        def warning(self, msg):
            self.warnings.append(msg)

        def info(self, msg):
            print(msg)

    try:
        old = base_trainer.logger
        base_trainer.logger = MockLogger()

        # Fraction correctly specified.
        DummyTrainer(
            train_loop,
            scaling_config=ScalingConfig(
                num_workers=1, _max_cpu_fraction_per_node=0.9
            ),
            datasets={"train": ray.data.range(10)},
        )
        assert not base_trainer.logger.warnings

        # No datasets, no fraction.
        DummyTrainer(
            train_loop,
            scaling_config=ScalingConfig(num_workers=1),
        )
        assert not base_trainer.logger.warnings

        # Should warn.
        DummyTrainer(
            train_loop,
            scaling_config=ScalingConfig(num_workers=1),
            datasets={"train": ray.data.range(10)},
        )
        assert len(base_trainer.logger.warnings) == 1, base_trainer.logger.warnings
        assert "_max_cpu_fraction_per_node" in base_trainer.logger.warnings[0]
    finally:
        base_trainer.logger = old
def test_trainable_name_is_overriden_gbdt_trainer(ray_start_4_cpus):
    trainer = DummyGBDTTrainer(
        params={},
        label_column="__values__",
        datasets={"train": ray.data.from_items([1, 2, 3])},
        scaling_config=ScalingConfig(num_workers=1),
    )

    _is_trainable_name_overriden(trainer)
def test_scaling_config_validate_config_bad_allowed_keys():
    # Check for allowed keys that are not present in the dataclass.
    scaling_config = {"num_workers": 2}
    with pytest.raises(ValueError) as exc_info:
        ensure_only_allowed_dataclass_keys_updated(
            ScalingConfig(**scaling_config),
            ["BAD_KEY"],
        )
    assert "BAD_KEY" in str(exc_info.value)
    assert "are not present in" in str(exc_info.value)
def test_scaling_config(ray_start_4_cpus):
    def train_func():
        # 2 workers (1 CPU each) plus 1 trainer CPU leave 1 of the 4 CPUs free.
        assert ray.available_resources()["CPU"] == 1
        session.report({"loss": 1})

    assert ray.available_resources()["CPU"] == 4
    trainer = DataParallelTrainer(
        train_loop_per_worker=train_func, scaling_config=ScalingConfig(num_workers=2)
    )
    trainer.fit()
def test_scaling_config_validate_config_prohibited_class():
    # Check for prohibited keys.
    scaling_config = {"num_workers": 2}
    with pytest.raises(ValueError) as exc_info:
        ensure_only_allowed_dataclass_keys_updated(
            ScalingConfig(**scaling_config),
            ["trainer_resources"],
        )
    assert "num_workers" in str(exc_info.value)
    assert "to be updated" in str(exc_info.value)
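# For contrast, a sketch of the success path, assuming the helper raises only
# when a modified key falls outside the allowed list: updating a key that is
# explicitly allowed should pass without raising.
ensure_only_allowed_dataclass_keys_updated(
    ScalingConfig(num_workers=2),
    ["num_workers"],
)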
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(results.metrics)
    return results
def train_tensorflow_mnist(
    num_workers: int = 2, use_gpu: bool = False, epochs: int = 4
) -> Result:
    config = {"lr": 1e-3, "batch_size": 64, "epochs": epochs}
    trainer = TensorflowTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    return results
def test_reserved_cpus(ray_start_4_cpus):
    def train_loop(self):
        ray.data.range(10).show()

    # Will deadlock without a reserved CPU fraction.
    scale_config = ScalingConfig(num_workers=1, _max_cpu_fraction_per_node=0.9)
    trainer = DummyTrainer(
        train_loop,
        scaling_config=scale_config,
    )
    tune.run(trainer.as_trainable(), num_samples=4)
def train_fashion_mnist(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(f"Last result: {result.metrics}")
def train_tensorflow_mnist(num_workers=2, use_gpu=False, epochs=4):
    trainer = TensorflowTrainer(
        train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": epochs},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(f"Results: {results.metrics}")
def test_validation(ray_start_4_cpus):
    train_dataset = ray.data.from_pandas(train_df)
    valid_dataset = ray.data.from_pandas(test_df)
    with pytest.raises(KeyError, match=TRAIN_DATASET_KEY):
        XGBoostTrainer(
            scaling_config=ScalingConfig(num_workers=2),
            label_column="target",
            params=params,
            datasets={"valid": valid_dataset},
        )
    with pytest.raises(KeyError, match="dmatrix_params"):
        XGBoostTrainer(
            scaling_config=ScalingConfig(num_workers=2),
            label_column="target",
            params=params,
            dmatrix_params={"data": {}},
            datasets={TRAIN_DATASET_KEY: train_dataset, "valid": valid_dataset},
        )
def main(num_workers, use_gpu, kwargs):
    trainer = HorovodTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "num_epochs": kwargs["num_epochs"],
            "log_interval": kwargs["log_interval"],
            "use_cuda": kwargs["use_cuda"],
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(result)
def main(data_size_gb: int, num_epochs=2, num_workers=1):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print(
        "Running PyTorch image model training with "
        f"{data_size_gb}GB data from {data_url}"
    )
    print(f"Training for {num_epochs} epochs with {num_workers} workers.")
    start = time.time()
    # Enable cross-host NCCL for larger scale tests.
    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
    ray.init(runtime_env=runtime_env)

    dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url])

    preprocessor = BatchMapper(preprocess_image_with_label)

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={"batch_size": 64, "num_epochs": num_epochs},
        datasets={"train": dataset},
        preprocessor=preprocessor,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    trainer.fit()

    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling.
    results = {"data_size_gb": data_size_gb, "num_epochs": num_epochs}
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name": "throughput_MB_s",
            "perf_metric_value": round(
                num_epochs * data_size_gb * 1024 / total_time_s, 2
            ),
            "perf_metric_type": "THROUGHPUT",
        },
    ]

    test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)