class RayTrainerV2(BaseTrainer):
    def __init__(self, model, trainer_kwargs, executable_kwargs):
        self.model = model.cpu()
        self.executable_kwargs = executable_kwargs
        self.trainer = Trainer(**{**get_trainer_kwargs(), **trainer_kwargs})
        self.trainer.start()
        self._validation_field = None
        self._validation_metric = None

    def train(
        self,
        training_set: RayDataset,
        validation_set: Optional[RayDataset] = None,
        test_set: Optional[RayDataset] = None,
        **kwargs,
    ):
        executable_kwargs = self.executable_kwargs
        kwargs = {
            "training_set_metadata": training_set.training_set_metadata,
            "features": training_set.features,
            **kwargs,
        }

        # Build one dataset pipeline per split; only the training split is shuffled.
        dataset = {"train": training_set.pipeline()}
        if validation_set is not None:
            dataset["val"] = validation_set.pipeline(shuffle=False)
        if test_set is not None:
            dataset["test"] = test_set.pipeline(shuffle=False)

        # Run the distributed training function on the Ray Train workers and take
        # the first worker's results.
        results, self._validation_field, self._validation_metric = self.trainer.run(
            lambda config: train_fn(**config),
            config={
                "executable_kwargs": executable_kwargs,
                "model_ref": ray.put(self.model),
                **kwargs,
            },
            dataset=dataset,
        )[0]

        # Load the trained state dict back into the local (driver-side) model.
        state_dict, *args = results
        self.model.load_state_dict(state_dict)
        results = (self.model, *args)

        return results

    def train_online(self, *args, **kwargs):
        raise NotImplementedError()

    @property
    def validation_field(self):
        return self._validation_field

    @property
    def validation_metric(self):
        return self._validation_metric

    def shutdown(self):
        self.trainer.shutdown()
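# Minimal usage sketch (not from the original source). `model`, the kwargs dicts, and the
# RayDataset splits `train_ds` / `val_ds` / `test_ds` are placeholder names assumed to be
# built by the surrounding Ray backend; they are here for illustration only.
ray_trainer = RayTrainerV2(model, trainer_kwargs={"num_workers": 2}, executable_kwargs={})
trained_model, *train_stats = ray_trainer.train(train_ds, validation_set=val_ds, test_set=test_ds)
print(ray_trainer.validation_field, ray_trainer.validation_metric)
ray_trainer.shutdown()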
def test_tune_error_legacy(ray_start_4_cpus):
    def train_func(config):
        raise RuntimeError("Error in training function!")

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    with pytest.raises(TuneError):
        tune.run(TestTrainable)
def test_tune_checkpoint_legacy(ray_start_4_cpus):
    def train_func():
        for i in range(10):
            train.report(test=i)
        train.save_checkpoint(hello="world")

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable).trials
    checkpoint_path = trial.checkpoint.dir_or_data
    assert os.path.exists(checkpoint_path)

    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["hello"] == "world"
def test_reuse_checkpoint_legacy(ray_start_4_cpus):
    def train_func(config):
        itr = 0
        ckpt = train.load_checkpoint()
        if ckpt is not None:
            itr = ckpt["iter"] + 1
        for i in range(itr, config["max_iter"]):
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    [trial] = tune.run(TestTrainable, config={"max_iter": 5}).trials
    checkpoint_path = trial.checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 4

    analysis = tune.run(TestTrainable, config={"max_iter": 10}, restore=checkpoint_path)
    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 5
def create_runner(self):
    trainer = Trainer(**{**get_trainer_kwargs(), **self.trainer_kwargs})
    trainer.start()
    try:
        yield trainer
    finally:
        trainer.shutdown()
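# Sketch under an assumption: because create_runner is a generator with a try/finally around
# `yield`, the enclosing class presumably applies @contextlib.contextmanager to it. A minimal
# standalone equivalent, using the same legacy ray.train Trainer API seen above, could look
# like this; the name `ray_runner` is hypothetical.
import contextlib

from ray.train import Trainer


@contextlib.contextmanager
def ray_runner(**trainer_kwargs):
    trainer = Trainer(**trainer_kwargs)
    trainer.start()
    try:
        yield trainer  # caller runs workloads while the workers are up
    finally:
        trainer.shutdown()  # always torn down, even if the caller raises


# Example use:
# with ray_runner(backend="torch", num_workers=1) as runner:
#     runner.run(train_func, config={})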
def test_retry_legacy(ray_start_4_cpus):
    def train_func():
        ckpt = train.load_checkpoint()
        restored = bool(ckpt)  # Does a previous checkpoint exist?
        itr = 0
        if ckpt:
            itr = ckpt["iter"] + 1

        for i in range(itr, 4):
            if i == 2 and not restored:
                raise Exception("try to fail me")
            train.save_checkpoint(iter=i)
            train.report(test=i, training_iteration=i)

    trainer = Trainer(TestConfig(), num_workers=1)
    TestTrainable = trainer.to_tune_trainable(train_func)

    analysis = tune.run(TestTrainable, max_failures=3)
    checkpoint_path = analysis.trials[0].checkpoint.dir_or_data
    checkpoint = Checkpoint.from_directory(checkpoint_path).to_dict()
    assert checkpoint["iter"] == 3

    trial_dfs = list(analysis.trial_dataframes.values())
    assert len(trial_dfs[0]["training_iteration"]) == 4
def train_fashion_mnist(num_workers=2, use_gpu=False):
    trainer = Trainer(backend="torch", num_workers=num_workers, use_gpu=use_gpu)
    trainer.start()
    result = trainer.run(
        train_func=train_func,
        config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        callbacks=[JsonLoggerCallback()],
    )
    trainer.shutdown()
    print(f"Loss results: {result}")
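# Hypothetical entry point (not part of the original snippet): start a local Ray cluster and
# run the function above on two CPU workers. The argument values are illustrative.
if __name__ == "__main__":
    import ray

    ray.init()
    train_fashion_mnist(num_workers=2, use_gpu=False)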
def batch_evaluation(
    self,
    dataset: RayDataset,
    collect_predictions: bool = False,
    collect_logits: bool = False,
    **kwargs,
):
    # We need to be in a Horovod context to collect the aggregated metrics, since it relies on
    # collective communication ops. However, Horovod is not suitable for transforming one big
    # dataset to another. For that we will use Ray Datasets. Therefore, we break this up into
    # two separate steps, and two passes over the dataset. In the future, we can explore ways
    # to combine these into a single step to reduce IO.
    runner = Trainer(**{**get_trainer_kwargs(), **self.trainer_kwargs})
    runner.start()
    try:
        # Collect eval metrics by distributing work across nodes / gpus with Horovod
        datasets = {"eval": dataset.pipeline(shuffle=False, **self.data_loader_kwargs)}
        predictor_kwargs = {
            **self.predictor_kwargs,
            "collect_predictions": False,
        }
        eval_stats, _ = runner.run(
            lambda config: eval_fn(**config),
            config={
                "predictor_kwargs": predictor_kwargs,
                "model_ref": ray.put(self.model),
                "training_set_metadata": dataset.training_set_metadata,
                "features": dataset.features,
                **kwargs,
            },
            dataset=datasets,
        )[0]
    finally:
        runner.shutdown()

    predictions = None
    if collect_predictions:
        # Collect eval predictions by using Ray Datasets to transform partitions of the data in parallel
        predictions = self.batch_predict(dataset, collect_logits=collect_logits)

    return eval_stats, predictions
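# Usage sketch (placeholder names, not from the original source): `predictor` stands for an
# instance of the class defining batch_evaluation above, and `eval_ds` for a RayDataset split.
eval_stats, predictions = predictor.batch_evaluation(
    eval_ds, collect_predictions=True, collect_logits=False
)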