def test_torch_get_device_dist(ray_2_node_4_gpu, num_gpus_per_worker):
    @patch("torch.cuda.is_available", lambda: True)
    def train_fn():
        return train.torch.get_device().index

    trainer = Trainer(
        TorchConfig(backend="gloo"),
        num_workers=int(8 / num_gpus_per_worker),
        use_gpu=True,
        resources_per_worker={"GPU": num_gpus_per_worker},
    )
    trainer.start()
    devices = trainer.run(train_fn)
    trainer.shutdown()

    count = Counter(devices)
    if num_gpus_per_worker == 0.5:
        for i in range(4):
            assert count[i] == 4
    elif num_gpus_per_worker == 1:
        for i in range(4):
            assert count[i] == 2
    elif num_gpus_per_worker == 2:
        for i in range(2):
            assert count[2 * i] == 2
    else:
        raise RuntimeError(
            "New parameter for this test has been added without checking that the "
            "correct devices have been returned."
        )
def __init__(
    self,
    train_loop_per_worker: Union[Callable[[], None], Callable[[Dict], None]],
    train_loop_config: Optional[Dict] = None,
    torch_config: Optional[TorchConfig] = None,
    scaling_config: Optional[ScalingConfig] = None,
    run_config: Optional[RunConfig] = None,
    datasets: Optional[Dict[str, GenDataset]] = None,
    preprocessor: Optional[Preprocessor] = None,
    resume_from_checkpoint: Optional[Checkpoint] = None,
):
    if not torch_config:
        torch_config = TorchConfig()

    super(TorchTrainer, self).__init__(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config=train_loop_config,
        backend_config=torch_config,
        scaling_config=scaling_config,
        run_config=run_config,
        datasets=datasets,
        preprocessor=preprocessor,
        resume_from_checkpoint=resume_from_checkpoint,
    )
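# Hedged usage sketch (not from this codebase): how the TorchTrainer whose
# constructor is shown above might be instantiated and run. The train loop
# body, config values, worker count, and import paths are assumptions for
# illustration only (import paths vary by Ray version).
from ray.air.config import ScalingConfig
from ray.train.torch import TorchConfig, TorchTrainer


def _example_train_loop_per_worker(config):
    lr = config["lr"]  # comes from train_loop_config
    # ... build a model here, train it, and report metrics each epoch ...


example_trainer = TorchTrainer(
    train_loop_per_worker=_example_train_loop_per_worker,
    train_loop_config={"lr": 1e-3},
    torch_config=TorchConfig(backend="gloo"),
    scaling_config=ScalingConfig(num_workers=2, use_gpu=False),
)
result = example_trainer.fit()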
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    trainer = Trainer(
        backend=TorchConfig(backend="gloo"),
        num_workers=num_workers,
        use_gpu=use_gpu,
    )
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer.start()
    results = trainer.run(
        train_func,
        config,
        callbacks=[JsonLoggerCallback(), TBXLoggerCallback()],
    )
    trainer.shutdown()

    print(results)
    return results
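# Hypothetical sketch of the train_func referenced above (the real one is not
# shown in this snippet). It reads the config keys set in train_linear and
# reports a loss per epoch via the legacy keyword-style train.report used
# elsewhere in these snippets; the model/loss details are placeholders.
from ray import train


def train_func(config):
    epochs = config["epochs"]
    results = []
    for epoch in range(epochs):
        loss = 1.0 / (epoch + 1)  # placeholder for a real training step
        train.report(loss=loss, epoch=epoch)
        results.append(loss)
    return results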
def test_torch_start_shutdown(ray_start_2_cpus, init_method):
    torch_config = TorchConfig(backend="gloo", init_method=init_method)
    e = BackendExecutor(torch_config, num_workers=2)
    e.start()

    def check_process_group():
        import torch

        return (
            torch.distributed.is_initialized()
            and torch.distributed.get_world_size() == 2
        )

    e.start_training(check_process_group)
    assert all(e.finish_training())

    e._backend.on_shutdown(e.worker_group, e._backend_config)

    e.start_training(check_process_group)
    assert not any(e.finish_training())
def test_worker_kill(ray_start_2_cpus, backend):
    if backend == "test":
        test_config = TestConfig()
    elif backend == "torch":
        test_config = TorchConfig()
    elif backend == "tf":
        test_config = TensorflowConfig()
    elif backend == "horovod":
        test_config = HorovodConfig()

    trainer = Trainer(test_config, num_workers=2)

    def train_func():
        for i in range(2):
            train.report(loss=1, iter=i)

    trainer.start()
    kill_callback = KillCallback(fail_on=0, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=1, Unsuccessful, starts training from beginning
    # Run 3: iter=0, counter=2, Successful
    # Run 4: iter=1, counter=3, Successful
    assert kill_callback.counter == 3
    trainer.shutdown()

    trainer.start()
    kill_callback = KillCallback(fail_on=1, trainer=trainer)
    trainer.run(train_func, callbacks=[kill_callback])
    # Run 1: iter=0, counter=1, Successful
    # Run 2: iter=1, counter=2, Successful
    # Run 3: None, counter=2, Unsuccessful, starts training from beginning.
    # Run 4: iter=0, counter=3, Successful
    # Run 5: iter=1, counter=4, Successful
    assert kill_callback.counter == 4

    def train_func():
        return 1

    # Make sure Trainer is usable even after failure handling.
    trainer.run(train_func)
def test_tune_torch_get_device_gpu(ray_2_node_4_gpu, num_gpus_per_worker):
    from ray import tune
    from ray.tune.tuner import Tuner, TuneConfig

    num_samples = 2

    @patch("torch.cuda.is_available", lambda: True)
    def train_func():
        train.report(device_id=train.torch.get_device().index)

    trainer = TorchTrainer(
        train_func,
        torch_config=TorchConfig(backend="gloo"),
        scaling_config=ScalingConfig(
            num_workers=2,
            use_gpu=True,
            resources_per_worker={"GPU": num_gpus_per_worker},
        ),
    )

    tuner = Tuner(
        trainer,
        param_space={
            "train_loop_config": {
                "dummy": tune.choice([32, 64, 128]),
            }
        },
        tune_config=TuneConfig(
            num_samples=num_samples,
        ),
    )

    analysis = tuner.fit()._experiment_analysis
    trial_dfs = list(analysis.trial_dataframes.values())
    device_ids = [trial_df["device_id"].tolist() for trial_df in trial_dfs]

    assert len(device_ids) == num_samples
    for i in range(num_samples):
        assert device_ids[i][0] == 0
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    default=False,
    help="Finish quickly for training.",
)
args = parser.parse_args()

ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)

num_training_workers = 1 if args.smoke_test else 3
trainer = Trainer(
    num_workers=num_training_workers,
    use_gpu=not args.smoke_test,
    backend=TorchConfig(backend="gloo"),
)
TorchTrainable = trainer.to_tune_trainable(train_func=train_func)

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="loss",
    mode="min",
    perturbation_interval=1,
    hyperparam_mutations={
        # distribution for resampling
        "lr": lambda: np.random.uniform(0.001, 1),
        # allow perturbations within this set of categorical values
        "momentum": [0.8, 0.9, 0.99],
    },
)
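# Hedged continuation sketch (not shown in the snippet above): passing the
# TorchTrainable and pbt_scheduler to the legacy tune.run API. The search
# space, num_samples, and stopping criterion are assumptions for illustration.
from ray import tune

analysis = tune.run(
    TorchTrainable,
    scheduler=pbt_scheduler,
    num_samples=4,
    config={"lr": 0.01, "momentum": 0.9},
    stop={"training_iteration": 5 if args.smoke_test else 50},
)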
action="store_true", default=False, help="Finish quickly for training.", ) args = parser.parse_args() ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True) num_training_workers = 1 if args.smoke_test else 3 trainer = TorchTrainer( train_func, scaling_config=ScalingConfig( num_workers=num_training_workers, use_gpu=not args.smoke_test, ), torch_config=TorchConfig(backend="gloo"), ) pbt_scheduler = PopulationBasedTraining( time_attr="training_iteration", perturbation_interval=1, hyperparam_mutations={ "train_loop_config": { # distribution for resampling "lr": lambda: np.random.uniform(0.001, 1), # allow perturbations within this set of categorical values "momentum": [0.8, 0.9, 0.99], } }, )