def train_torch_ray_air(
    *,
    config: dict,
    num_workers: int = 4,
    cpus_per_worker: int = 8,
    use_gpu: bool = False,
) -> Tuple[float, float]:
    """Run one full Ray AIR training run and return (seconds taken, final loss).

    Kicked off by main(); wraps the shared ``train_func`` in a TorchTrainer.
    """
    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    def _worker_loop(loop_config):
        # Delegate to the shared training function in Ray mode.
        train_func(use_ray=True, config=loop_config)

    started = time.monotonic()
    result = TorchTrainer(
        train_loop_per_worker=_worker_loop,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            # The trainer actor itself reserves no CPU; workers take them all.
            trainer_resources={"CPU": 0},
            num_workers=num_workers,
            resources_per_worker={"CPU": cpus_per_worker},
            use_gpu=use_gpu,
        ),
    ).fit()
    elapsed = time.monotonic() - started
    print(f"Last result: {result.metrics}")
    return elapsed, result.metrics["loss"]
def test_torch_e2e_state_dict(ray_start_4_cpus):
    """Checkpoint a bare state dict; restoring then requires a model definition."""

    def train_func():
        state = torch.nn.Linear(1, 1).state_dict()
        train.save_checkpoint(model=state)

    trainer = TorchTrainer(
        train_loop_per_worker=train_func, scaling_config={"num_workers": 2}
    )
    result = trainer.fit()

    # If loading from a state dict, a model definition must be passed in.
    with pytest.raises(ValueError):
        TorchPredictor.from_checkpoint(result.checkpoint)

    class TorchScorer:
        def __init__(self):
            # Supplying the architecture makes the state-dict checkpoint loadable.
            self.pred = TorchPredictor.from_checkpoint(
                result.checkpoint, model=torch.nn.Linear(1, 1)
            )

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predictions = ray.data.range(3).map_batches(
        TorchScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    """Run distributed linear training and return the fit Result.

    Args:
        num_workers: Number of distributed training workers.
        use_gpu: Whether each worker should reserve a GPU.
        epochs: Number of passes over the data.
    """
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        # Fix: pass the training loop by keyword, consistent with the other
        # TorchTrainer call sites in this file.
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(results.metrics)
    return results
def train_fashion_mnist(num_workers=2, use_gpu=False):
    """Launch distributed FashionMNIST training and print the final metrics."""
    hyperparams = {"lr": 1e-3, "batch_size": 64, "epochs": 4}
    result = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=hyperparams,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    ).fit()
    print(f"Last result: {result.metrics}")
def main(data_size_gb: int, num_epochs=2, num_workers=1):
    """Run the synthetic-image training release test and dump perf metrics as JSON.

    Args:
        data_size_gb: Size of the S3 synthetic dataset to train on, in GB.
        num_epochs: Number of training epochs.
        num_workers: Number of GPU training workers.
    """
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print("Running Pytorch image model training with "
          f"{data_size_gb}GB data from {data_url}")
    print(f"Training for {num_epochs} epochs with {num_workers} workers.")
    start = time.time()

    # Enable cross host NCCL for larger scale tests
    ray.init(runtime_env={"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}})

    dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url])
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={"batch_size": 64, "num_epochs": num_epochs},
        datasets={"train": dataset},
        preprocessor=BatchMapper(preprocess_image_with_label),
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    trainer.fit()

    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling
    results = {
        "data_size_gb": data_size_gb,
        "num_epochs": num_epochs,
        "perf_metrics": [
            {
                "perf_metric_name": "total_time_s",
                "perf_metric_value": total_time_s,
                "perf_metric_type": "LATENCY",
            },
            {
                # NOTE(review): "throughout" looks like a typo for "throughput",
                # but downstream tooling may key on this exact name — confirm
                # before renaming.
                "perf_metric_name": "throughout_MB_s",
                "perf_metric_value": round(
                    num_epochs * data_size_gb * 1024 / total_time_s, 2
                ),
                "perf_metric_type": "THROUGHPUT",
            },
        ],
    }

    out_path = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_out.json")
    with open(out_path, "wt") as f:
        json.dump(results, f)

    print(results)
def test_checkpoint_freq(ray_start_4_cpus):
    """checkpoint_frequency is unsupported by this trainer, so fit() must raise."""
    run_config = ray.air.RunConfig(
        checkpoint_config=ray.air.CheckpointConfig(checkpoint_frequency=2),
    )
    trainer = TorchTrainer(
        train_loop_per_worker=lambda config: None,
        scaling_config=ray.air.ScalingConfig(num_workers=1),
        run_config=run_config,
    )
    with pytest.raises(TuneError):
        trainer.fit()
def train_linear(num_workers=2, use_gpu=False):
    """Train the linear model on preconfigured datasets; return the fit Result."""
    datasets, dataset_configs = get_datasets_and_configs()
    hyperparams = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=hyperparams,
        datasets=datasets,
        dataset_config=dataset_configs,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    outcome = trainer.fit()
    print(outcome.metrics)
    return outcome
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    """Run linear training with a dict-style scaling config; return final metrics."""
    loop_config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    scaling = {"num_workers": num_workers, "use_gpu": use_gpu}
    result = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=loop_config,
        scaling_config=scaling,
    ).fit()
    print(result.metrics)
    return result.metrics
def tune_linear(num_workers, num_samples, use_gpu):
    """Tune lr/batch_size over the linear trainer and return the best Result."""
    train_dataset, val_dataset = get_datasets()
    base_config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=base_config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={"train": train_dataset, "validation": val_dataset},
    )
    # Only the training-loop hyperparameters are searched over.
    search_space = {
        "train_loop_config": {
            "lr": tune.loguniform(1e-4, 1e-1),
            "batch_size": tune.choice([4, 16, 32]),
            "epochs": 3,
        }
    }
    tuner = Tuner(
        trainer,
        param_space=search_space,
        tune_config=TuneConfig(num_samples=num_samples, metric="loss", mode="min"),
    )
    best_result = tuner.fit().get_best_result()
    print(best_result)
    return best_result
def test_tuner_with_torch_trainer(self):
    """Test a successful run using torch trainer."""
    # Clear any leftover results from a previous run of this test.
    shutil.rmtree(
        os.path.join(DEFAULT_RESULTS_DIR, "test_tuner_torch"), ignore_errors=True
    )
    # The following two should be tunable.
    trainer = TorchTrainer(
        train_loop_per_worker=linear_train_func,
        train_loop_config={
            "lr": 1e-2,
            "hidden_size": 1,
            "batch_size": 4,
            "epochs": 10,
        },
        scaling_config={"num_workers": 1, "use_gpu": False},
    )
    tuner = Tuner(
        trainable=trainer,
        run_config=RunConfig(name="test_tuner"),
        param_space={
            "scaling_config": {"num_workers": tune.grid_search([1, 2])},
            "train_loop_config": {
                "batch_size": tune.grid_search([4, 8]),
                "epochs": tune.grid_search([5, 10]),
            },
        },
        tune_config=TuneConfig(mode="min", metric="loss"),
    )
    # 2 workers x 2 batch sizes x 2 epoch counts = 8 grid points.
    assert len(tuner.fit()) == 8
def main(num_workers=2, use_gpu=False):
    """Train FashionMNIST and log results to MLflow via a run callback."""
    mlflow_callback = MLflowLoggerCallback(experiment_name="train_fashion_mnist")
    trainer = TorchTrainer(
        train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        run_config=RunConfig(callbacks=[mlflow_callback]),
    )
    final_results = trainer.fit()
    print("Final metrics: ", final_results.metrics)
def test_torch_linear(ray_start_4_cpus, num_workers):
    """End-to-end linear training test: the loss must decrease over the run."""
    epochs = 3

    def train_func(config):
        # linear_train_func returns one result dict per epoch.
        result = linear_train_func(config)
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]

    # Fix: removed the no-op self-assignment `num_workers = num_workers`;
    # the fixture-provided parameter is used directly.
    scaling_config = {"num_workers": num_workers}
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    trainer.fit()
def train_linear(num_workers=2, use_gpu=False):
    """Train the linear model on train/validation datasets; return the Result."""
    train_dataset, val_dataset = get_datasets()
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={
            "lr": 1e-2,
            "hidden_size": 1,
            "batch_size": 4,
            "epochs": 3,
        },
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={"train": train_dataset, "validation": val_dataset},
    )
    fit_result = trainer.fit()
    print(fit_result.metrics)
    return fit_result
def train_gnn(
    num_workers=2, use_gpu=False, epochs=3, global_batch_size=32, dataset="reddit"
):
    """Train the GNN example, splitting the global batch evenly across workers."""
    # Integer division: any remainder is dropped, so the effective global batch
    # size may be slightly smaller than requested when it is not divisible.
    per_worker_batch_size = global_batch_size // num_workers
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "num_epochs": epochs,
            "batch_size": per_worker_batch_size,
            # NOTE(review): the reddit branch passes the function itself while
            # the fallback *calls* gen_fake_dataset() — presumably the latter is
            # a factory returning a dataset fn; confirm the asymmetry is intended.
            "dataset_fn": gen_reddit_dataset
            if dataset == "reddit"
            else gen_fake_dataset(),
        },
        scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
    )
    result = trainer.fit()
    print(result.metrics)
def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epochs=3):
    """GPU linear-training run toggling automatic host-to-device data transfer."""
    loop_config = {
        "lr": 1e-2,
        "hidden_size": num_hidden_layers,
        "batch_size": 4096,
        "epochs": epochs,
        "use_auto_transfer": use_auto_transfer,
    }
    trainer = TorchTrainer(
        train_func,
        train_loop_config=loop_config,
        # This example always trains on GPU.
        scaling_config=ScalingConfig(use_gpu=True, num_workers=num_workers),
    )
    outcome = trainer.fit()
    print(outcome.metrics)
    return outcome
def main():
    """Entry point: train via Ray when distribution is requested, else locally."""
    args = parse_args()
    config = {"args": args}

    distributed = (
        args.start_local or args.address or args.num_workers > 1 or args.use_gpu
    )
    if not distributed:
        # Run training locally.
        train_func(config)
        return

    if args.start_local:
        # Start a local Ray runtime.
        ray.init(num_cpus=args.num_workers + 2)
    else:
        # Connect to a Ray cluster for distributed training.
        ray.init(address=args.address)

    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            num_workers=args.num_workers, use_gpu=args.use_gpu
        ),
    )
    results = trainer.fit()
    print(results.metrics)
def test_torch_e2e(ray_start_4_cpus):
    """Train, checkpoint a full model, then batch-predict from the checkpoint."""

    def train_func():
        net = torch.nn.Linear(1, 1)
        session.report({}, checkpoint=Checkpoint.from_dict(dict(model=net)))

    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        scaling_config=ScalingConfig(num_workers=2),
    )
    result = trainer.fit()

    class TorchScorer:
        def __init__(self):
            # The checkpoint holds a full model, so no architecture is needed.
            self.pred = TorchPredictor.from_checkpoint(result.checkpoint)

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predictions = ray.data.range(3).map_batches(
        TorchScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
# NOTE(review): the next four statements are the tail of a training loop whose
# definition precedes this chunk; `optimizer`, `loss`, and `epoch` are bound there.
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(f"epoch: {epoch}, loss: {loss.item()}")
# __torch_distributed_end__

if __name__ == "__main__":
    # __torch_single_run_begin__
    # Run the single-process version defined earlier in this file.
    train_func()
    # __torch_single_run_end__

    # __torch_trainer_begin__
    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    # For GPU Training, set `use_gpu` to True.
    use_gpu = False

    # Distribute the same training function across 4 Ray workers.
    trainer = TorchTrainer(
        train_func_distributed,
        scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu),
    )
    results = trainer.fit()
    # __torch_trainer_end__
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    default=False,
    help="Finish quickly for training.",
)
args = parser.parse_args()

# Smoke tests start a fresh local cluster; full runs attach to an existing one.
ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3
trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(
        num_workers=num_training_workers,
        # GPUs only in the full run; the smoke test stays CPU-only.
        use_gpu=not args.smoke_test,
    ),
    # Use the gloo backend for the torch process group.
    torch_config=TorchConfig(backend="gloo"),
)
pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=1,
    hyperparam_mutations={
        "train_loop_config": {
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        }
    },
# NOTE(review): the PopulationBasedTraining(...) call continues past the end of
# this chunk.
# __config_1__ import ray from ray.train.torch import TorchTrainer from ray.air.config import DatasetConfig train_ds = ray.data.range_tensor(1000) valid_ds = ray.data.range_tensor(100) test_ds = ray.data.range_tensor(100) my_trainer = TorchTrainer( lambda: None, # No-op training loop. scaling_config={"num_workers": 2}, datasets={ "train": train_ds, "valid": valid_ds, "test": test_ds, }, dataset_config={ "valid": DatasetConfig(split=True), "test": DatasetConfig(split=True), }, ) print(my_trainer.get_dataset_config()) # -> {'train': DatasetConfig(fit=True, split=True, ...), # 'valid': DatasetConfig(fit=False, split=True, ...), # 'test': DatasetConfig(fit=False, split=True, ...), ...} # __config_1_end__ # __config_2__ import ray from ray.train.torch import TorchTrainer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# NOTE(review): this is the tail of a training function defined in the preceding
# chunk; `model`, `lr`, `epochs`, and the dataloaders are bound there.
for _ in range(epochs):
    train_epoch(train_dataloader, model, loss_fn, optimizer)
    loss = validate_epoch(test_dataloader, model, loss_fn)
    # Report each epoch's validation loss back to Ray Train.
    train.report(loss=loss)


num_workers = 2
use_gpu = False

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
    scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
)
result = trainer.fit()
print(f"Last result: {result.metrics}")
# __air_pytorch_train_end__

# # __air_pytorch_batchpred_start__
# import random
# from ray.air.batch_predictor import BatchPredictor
# from ray.air.predictors.integrations.torch import TorchPredictor

# batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, TorchPredictor)

# items = [{"x": random.uniform(0, 1) for _ in range(10)}]
# NOTE(review): the first three statements are the tail of a per-worker training
# loop from the preceding chunk; `optimizer`, `train_loss`, and `model` are
# bound there.
optimizer.step()
loss = train_loss.item()
session.report({"loss": loss}, checkpoint=TorchCheckpoint.from_model(model))

# One column (presumably the label) is excluded from the feature count —
# confirm against the dataset schema.
num_features = len(train_dataset.schema().names) - 1

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={
        "batch_size": 128,
        "num_epochs": 20,
        "num_features": num_features,
        "lr": 0.001,
    },
    scaling_config=ScalingConfig(
        num_workers=3,  # Number of workers to use for data parallelism.
        use_gpu=False,
        trainer_resources={"CPU": 0},  # so that the example works on Colab.
    ),
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
)

# Execute training.
result = trainer.fit()
print(f"Last result: {result.metrics}")
# Last result: {'loss': 0.6559339960416158, ...}
# __air_pytorch_train_end__

# __air_pytorch_tuner_start__
from ray import tune
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# NOTE(review): tail of a training function defined in the preceding chunk;
# `model`, `lr`, `epochs`, `loss_fn`, and the dataloaders are bound there.
for _ in range(epochs):
    train_epoch(train_dataloader, model, loss_fn, optimizer)
    loss = validate_epoch(test_dataloader, model, loss_fn)
    # Report each epoch's validation loss back to the AIR session.
    session.report(dict(loss=loss))


num_workers = 2
use_gpu = False

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={
        "lr": 1e-3,
        "batch_size": 64,
        "epochs": 4
    },
    scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
)
result = trainer.fit()
print(f"Last result: {result.metrics}")
# __air_pytorch_train_end__

# # __air_pytorch_batchpred_start__
# import random
# from ray.train.batch_predictor import BatchPredictor
# from ray.train.torch import TorchPredictor

# batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, TorchPredictor)
import ray.data


def train_loop_per_worker():
    """Per-worker loop: pull this worker's shard and iterate it for 10 epochs."""
    # By default, bulk loading is used and returns a Dataset object.
    data_shard = session.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)


trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=1),
    datasets={"train": ray.data.range_tensor(1000)},
)
trainer.fit()
# __config_scaling_1_end__

# __config_scaling_2__
from ray.air import session
from ray.train.torch import TorchTrainer
import ray.data
from ray.air.config import ScalingConfig
from ray import tune
from ray.tune.tuner import Tuner
from ray.tune.tune_config import TuneConfig
from ray.air import ScalingConfig, RunConfig, session
from ray.train.torch import TorchTrainer
from ray.tune.integration.mlflow import MLflowLoggerCallback
from ray.tune.logger import TBXLoggerCallback


def train_func():
    """Report one intermediate result per iteration so the callbacks log them."""
    for i in range(3):
        session.report(dict(epoch=i))


trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=2),
    run_config=RunConfig(
        callbacks=[
            MLflowLoggerCallback(experiment_name="train_experiment"),
            TBXLoggerCallback(),
        ],
    ),
)

# Run the training function, logging all the intermediate results
# to MLflow and Tensorboard.
result = trainer.fit()

# For MLFLow logs:

# MLFlow logs will by default be saved in an `mlflow` directory
# in the current working directory.

# $ cd mlflow
# NOTE(review): continuation of a parser.add_argument(...) call that begins in
# the preceding chunk.
    help="Finish quickly for testing.",
)
parser.add_argument(
    "--use-gpu", action="store_true", default=False, help="Enables GPU training"
)
args, _ = parser.parse_known_args()

# Smoke tests use a small local cluster; full runs attach to the given address.
if args.smoke_test:
    ray.init(num_cpus=4)
else:
    ray.init(address=args.address)

trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=args.num_workers, use_gpu=args.use_gpu),
)

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=1,
    hyperparam_mutations={
        "train_loop_config": {
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        }
    },
)
# __config_1__ import ray from ray.train.torch import TorchTrainer from ray.air.config import ScalingConfig, DatasetConfig train_ds = ray.data.range_tensor(1000) valid_ds = ray.data.range_tensor(100) test_ds = ray.data.range_tensor(100) my_trainer = TorchTrainer( lambda: None, # No-op training loop. scaling_config=ScalingConfig(num_workers=2), datasets={ "train": train_ds, "valid": valid_ds, "test": test_ds, }, dataset_config={ "valid": DatasetConfig(split=True), "test": DatasetConfig(split=True), }, ) print(my_trainer.get_dataset_config()) # -> {'train': DatasetConfig(fit=True, split=True, ...), # 'valid': DatasetConfig(fit=False, split=True, ...), # 'test': DatasetConfig(fit=False, split=True, ...), ...} # __config_1_end__ # __config_2__ import ray from ray.train.torch import TorchTrainer