def main(data_size_gb: int, num_epochs=2, num_workers=1):
    data_url = f"s3://air-example-data-2/{data_size_gb}G-image-data-synthetic-raw"
    print(
        "Running PyTorch image model training with "
        f"{data_size_gb}GB data from {data_url}"
    )
    print(f"Training for {num_epochs} epochs with {num_workers} workers.")
    start = time.time()

    # Enable cross-host NCCL for larger scale tests.
    runtime_env = {"env_vars": {"NCCL_SOCKET_IFNAME": "ens3"}}
    ray.init(runtime_env=runtime_env)

    dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[data_url])

    preprocessor = BatchMapper(preprocess_image_with_label)

    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={"batch_size": 64, "num_epochs": num_epochs},
        datasets={"train": dataset},
        preprocessor=preprocessor,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=True),
    )
    trainer.fit()

    total_time_s = round(time.time() - start, 2)

    # For structured output integration with internal tooling.
    results = {"data_size_gb": data_size_gb, "num_epochs": num_epochs}
    results["perf_metrics"] = [
        {
            "perf_metric_name": "total_time_s",
            "perf_metric_value": total_time_s,
            "perf_metric_type": "LATENCY",
        },
        {
            "perf_metric_name": "throughput_MB_s",
            "perf_metric_value": round(
                num_epochs * data_size_gb * 1024 / total_time_s, 2
            ),
            "perf_metric_type": "THROUGHPUT",
        },
    ]

    test_output_json = os.environ.get("TEST_OUTPUT_JSON", "/tmp/release_test_out.json")
    with open(test_output_json, "wt") as f:
        json.dump(results, f)

    print(results)
def test_checkpoint_freq(ray_start_4_cpus):
    # `checkpoint_frequency` is not supported by TorchTrainer, so fit()
    # should raise an error.
    trainer = TorchTrainer(
        train_loop_per_worker=lambda config: None,
        scaling_config=ray.air.ScalingConfig(num_workers=1),
        run_config=ray.air.RunConfig(
            checkpoint_config=ray.air.CheckpointConfig(
                checkpoint_frequency=2,
            ),
        ),
    )
    with pytest.raises(TuneError):
        trainer.fit()
def test_torch_e2e_state_dict(ray_start_4_cpus):
    def train_func():
        model = torch.nn.Linear(1, 1).state_dict()
        train.save_checkpoint(model=model)

    scaling_config = {"num_workers": 2}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    # If loading from a state dict, a model definition must be passed in.
    with pytest.raises(ValueError):
        TorchPredictor.from_checkpoint(result.checkpoint)

    class TorchScorer:
        def __init__(self):
            self.pred = TorchPredictor.from_checkpoint(
                result.checkpoint, model=torch.nn.Linear(1, 1)
            )

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predict_dataset = ray.data.range(3)
    predictions = predict_dataset.map_batches(
        TorchScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
def test_torch_linear(ray_start_4_cpus, num_workers):
    def train_func(config):
        result = linear_train_func(config)
        assert len(result) == epochs
        assert result[-1]["loss"] < result[0]["loss"]

    epochs = 3
    scaling_config = {"num_workers": num_workers}
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=scaling_config,
    )
    trainer.fit()
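# `linear_train_func` is imported from Ray's example code, which is not part
# of this excerpt. A minimal sketch of the behavior the test above relies on
# (an assumption, not the real implementation): fit y = 2x with a tiny linear
# model and return one {"loss": ...} dict per epoch.
def linear_train_func_sketch(config):
    import torch

    model = torch.nn.Linear(1, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=config["lr"])
    loss_fn = torch.nn.MSELoss()

    # Toy data: y = 2x, so the loss should decrease across epochs.
    x = torch.arange(config["batch_size"], dtype=torch.float).reshape(-1, 1)
    y = 2 * x

    results = []
    for _ in range(config["epochs"]):
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()
        results.append({"loss": loss.item()})
    return results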
def train_torch_ray_air(
    *,
    config: dict,
    num_workers: int = 4,
    cpus_per_worker: int = 8,
    use_gpu: bool = False,
) -> Tuple[float, float]:
    # This function is kicked off by main() and runs a full training run
    # using Ray AIR.
    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    def train_loop(config):
        train_func(use_ray=True, config=config)

    start_time = time.monotonic()
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop,
        train_loop_config=config,
        scaling_config=ScalingConfig(
            trainer_resources={"CPU": 0},
            num_workers=num_workers,
            resources_per_worker={"CPU": cpus_per_worker},
            use_gpu=use_gpu,
        ),
    )
    result = trainer.fit()
    time_taken = time.monotonic() - start_time

    print(f"Last result: {result.metrics}")
    return time_taken, result.metrics["loss"]
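# Hypothetical invocation of `train_torch_ray_air` above. The config keys are
# an assumption, mirroring the linear examples in this section; the real
# `train_func` must report a "loss" metric for the return value to work.
time_taken, final_loss = train_torch_ray_air(
    config={"lr": 1e-2, "batch_size": 64, "epochs": 4},
    num_workers=4,
    cpus_per_worker=8,
    use_gpu=False,
)
print(f"Took {time_taken:.2f}s; final loss: {final_loss:.4f}")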
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(results.metrics)
    return results
def train_fashion_mnist(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    result = trainer.fit()
    print(f"Last result: {result.metrics}")
def train_linear(num_workers=2, use_gpu=False):
    datasets, dataset_configs = get_datasets_and_configs()
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        datasets=datasets,
        dataset_config=dataset_configs,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
    )
    results = trainer.fit()
    print(results.metrics)
    return results
def train_linear(num_workers=2, use_gpu=False, epochs=3):
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": epochs}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
    )
    result = trainer.fit()
    print(result.metrics)
    return result.metrics
def main(num_workers=2, use_gpu=False):
    trainer = TorchTrainer(
        train_func,
        train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        run_config=RunConfig(
            callbacks=[MLflowLoggerCallback(experiment_name="train_fashion_mnist")]
        ),
    )
    final_results = trainer.fit()

    print("Final metrics: ", final_results.metrics)
def train_linear(num_workers=2, use_gpu=False):
    train_dataset, val_dataset = get_datasets()
    config = {"lr": 1e-2, "hidden_size": 1, "batch_size": 4, "epochs": 3}
    trainer = TorchTrainer(
        train_loop_per_worker=train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu),
        datasets={"train": train_dataset, "validation": val_dataset},
    )
    result = trainer.fit()
    print(result.metrics)
    return result
def train_gnn(
    num_workers=2, use_gpu=False, epochs=3, global_batch_size=32, dataset="reddit"
):
    per_worker_batch_size = global_batch_size // num_workers
    trainer = TorchTrainer(
        train_loop_per_worker=train_loop_per_worker,
        train_loop_config={
            "num_epochs": epochs,
            "batch_size": per_worker_batch_size,
            # Note: `gen_fake_dataset()` is presumably a factory that returns
            # a dataset function, matching `gen_reddit_dataset`.
            "dataset_fn": gen_reddit_dataset
            if dataset == "reddit"
            else gen_fake_dataset(),
        },
        scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
    )
    result = trainer.fit()
    print(result.metrics)
def train_linear(num_workers=1, num_hidden_layers=1, use_auto_transfer=True, epochs=3):
    config = {
        "lr": 1e-2,
        "hidden_size": num_hidden_layers,
        "batch_size": 4096,
        "epochs": epochs,
        "use_auto_transfer": use_auto_transfer,
    }
    trainer = TorchTrainer(
        train_func,
        train_loop_config=config,
        scaling_config=ScalingConfig(use_gpu=True, num_workers=num_workers),
    )
    results = trainer.fit()
    print(results.metrics)
    return results
def main():
    args = parse_args()
    config = {"args": args}

    if args.start_local or args.address or args.num_workers > 1 or args.use_gpu:
        if args.start_local:
            # Start a local Ray runtime.
            ray.init(num_cpus=args.num_workers + 2)
        else:
            # Connect to a Ray cluster for distributed training.
            ray.init(address=args.address)
        trainer = TorchTrainer(
            train_func,
            train_loop_config=config,
            scaling_config=ScalingConfig(
                num_workers=args.num_workers, use_gpu=args.use_gpu
            ),
        )
        results = trainer.fit()
        print(results.metrics)
    else:
        # Run training locally.
        train_func(config)
def test_torch_e2e(ray_start_4_cpus):
    def train_func():
        model = torch.nn.Linear(1, 1)
        session.report({}, checkpoint=Checkpoint.from_dict(dict(model=model)))

    scaling_config = ScalingConfig(num_workers=2)
    trainer = TorchTrainer(
        train_loop_per_worker=train_func, scaling_config=scaling_config
    )
    result = trainer.fit()

    predict_dataset = ray.data.range(3)

    class TorchScorer:
        def __init__(self):
            self.pred = TorchPredictor.from_checkpoint(result.checkpoint)

        def __call__(self, x):
            return self.pred.predict(x, dtype=torch.float)

    predictions = predict_dataset.map_batches(
        TorchScorer, batch_format="pandas", compute="actors"
    )
    assert predictions.count() == 3
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)

    # View the stats for performance debugging.
    print(data_shard.stats())


my_trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config={"num_workers": 1},
    datasets={
        "train": ray.data.range_tensor(1000),
    },
)
my_trainer.fit()
# __config_4_end__

# __config_5__
import ray
from ray import train
from ray.data import DatasetPipeline
from ray.train.torch import TorchTrainer
from ray.air.config import DatasetConfig


def train_loop_per_worker():
    # A DatasetPipeline object is returned when `use_stream_api` is set.
    data_shard: DatasetPipeline = train.get_dataset_shard("train")

    # Use iter_epochs(10) to iterate over 10 epochs of data.
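    # (The excerpt is truncated here.) A plausible continuation, based on the
    # comment above: DatasetPipeline.iter_epochs() yields one pipeline slice
    # per epoch, each of which supports iter_batches().
    for epoch in data_shard.iter_epochs(10):
        for batch in epoch.iter_batches():
            print("Do some training on batch", batch)


# Hedged sketch (an assumption, not shown in the excerpt): the stream API is
# enabled per-dataset through the trainer's dataset_config.
my_streaming_trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config={"num_workers": 1},
    datasets={"train": ray.data.range_tensor(1000)},
    dataset_config={"train": DatasetConfig(use_stream_api=True)},
)
my_streaming_trainer.fit()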
    for _ in range(epochs):
        train_epoch(train_dataloader, model, loss_fn, optimizer)
        loss = validate_epoch(test_dataloader, model, loss_fn)
        train.report(loss=loss)


num_workers = 2
use_gpu = False

trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    train_loop_config={"lr": 1e-3, "batch_size": 64, "epochs": 4},
    scaling_config={"num_workers": num_workers, "use_gpu": use_gpu},
)
result = trainer.fit()
print(f"Last result: {result.metrics}")
# __air_pytorch_train_end__

# # __air_pytorch_batchpred_start__
# import random

# from ray.air.batch_predictor import BatchPredictor
# from ray.air.predictors.integrations.torch import TorchPredictor

# batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, TorchPredictor)

# items = [{"x": random.uniform(0, 1)} for _ in range(10)]
# prediction_dataset = ray.data.from_items(items)
# predictions = batch_predictor.predict(prediction_dataset, dtype=torch.float)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"epoch: {epoch}, loss: {loss.item()}")

# __torch_distributed_end__


if __name__ == "__main__":
    # __torch_single_run_begin__

    train_func()

    # __torch_single_run_end__

    # __torch_trainer_begin__

    from ray.train.torch import TorchTrainer
    from ray.air.config import ScalingConfig

    # For GPU training, set `use_gpu` to True.
    use_gpu = False

    trainer = TorchTrainer(
        train_func_distributed,
        scaling_config=ScalingConfig(num_workers=4, use_gpu=use_gpu),
    )

    results = trainer.fit()

    # __torch_trainer_end__
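# A minimal sketch of the `train_func_distributed` referenced above (an
# assumption: the real definition appears earlier in the file). It mirrors a
# single-process training loop, with ray.train.torch.prepare_model added so
# the model is wrapped in DistributedDataParallel and moved to the right
# device. It must run inside a TorchTrainer worker.
def train_func_distributed_sketch():
    import torch
    from ray.train.torch import prepare_model

    # Placeholder data and model; the real ones are defined elsewhere.
    x = torch.randn(32, 1)
    y = 2 * x
    model = prepare_model(torch.nn.Linear(1, 1))
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

    for epoch in range(3):
        loss = loss_fn(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"epoch: {epoch}, loss: {loss.item()}")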
def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard = session.get_dataset_shard("train")

    # Manually iterate over the data 10 times (10 epochs).
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)


trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=1),
    datasets={"train": ray.data.range_tensor(1000)},
)
trainer.fit()
# __config_scaling_1_end__

# __config_scaling_2__
from ray.air import session
from ray.train.torch import TorchTrainer
import ray.data
from ray.air.config import ScalingConfig
from ray import tune
from ray.tune.tuner import Tuner
from ray.tune.tune_config import TuneConfig


def train_loop_per_worker():
    # By default, bulk loading is used and returns a Dataset object.
    data_shard = session.get_dataset_shard("train")
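    # (The excerpt is truncated here; a plausible continuation, assuming it
    # mirrors the bulk-loading loop above.)
    for _ in range(10):
        for batch in data_shard.iter_batches():
            print("Do some training on batch", batch)


# Hedged sketch of how the Tune imports above are typically combined: wrap the
# trainer in a Tuner and grid-search over the scaling config. The parameter
# values are assumptions, not the original continuation.
trainer = TorchTrainer(
    train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=1),
    datasets={"train": ray.data.range_tensor(1000)},
)
param_space = {"scaling_config": ScalingConfig(num_workers=tune.grid_search([1, 2]))}
tuner = Tuner(
    trainer,
    param_space=param_space,
    tune_config=TuneConfig(num_samples=1),
)
tuner.fit()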