def train_example(num_workers=1, use_gpu=False):
    """Train a small regression model with Ray SGD and print its weights.

    Args:
        num_workers: Number of distributed training workers.
        use_gpu: Whether each worker trains on a GPU.
    """
    # Assemble a TrainingOperator from the creator functions defined in
    # this module.
    operator_cls = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        scheduler_creator=scheduler_creator,
        loss_creator=nn.MSELoss)
    trainer1 = TorchTrainer(
        training_operator_cls=operator_cls,
        num_workers=num_workers,
        use_gpu=use_gpu,
        config={
            "lr": 1e-2,  # used in optimizer_creator
            "hidden_size": 1,  # used in model_creator
            "batch_size": 4,  # used in data_creator
        },
        backend="gloo",
        scheduler_step_freq="epoch")

    # Five epochs of training, printing per-epoch stats as we go.
    for _epoch in range(5):
        print(trainer1.train())
    print(trainer1.validate())

    # Pull the trained model back to the driver and report its parameters.
    m = trainer1.get_model()
    print("trained weight: % .2f, bias: % .2f" % (
        m.weight.item(), m.bias.item()))
    trainer1.shutdown()
    print("success!")
def main():
    """Parse CLI args, start Ray, and run training with a tqdm progress bar."""
    setup_default_logging()
    args, args_text = parse_args()

    # Smoke tests spin up a small local Ray; otherwise attach to the
    # cluster at the provided address.
    if args.smoke_test:
        ray.init(num_cpus=int(args.ray_num_workers))
    else:
        ray.init(address=args.ray_address)

    operator_cls = TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=loss_creator)
    trainer = TorchTrainer(
        training_operator_cls=operator_cls,
        use_tqdm=True,
        use_fp16=args.amp,
        apex_args={"opt_level": "O1"},
        config={
            "args": args,
            BATCH_SIZE: args.batch_size,
        },
        num_workers=args.ray_num_workers)

    if args.smoke_test:
        args.epochs = 1
    # In smoke mode run only a single step per train/validate pass.
    num_steps = 1 if args.smoke_test else None
    progress = trange(args.epochs, unit="epoch")
    for _ in progress:
        trainer.train(num_steps=num_steps)
        val_stats = trainer.validate(num_steps=num_steps)
        progress.set_postfix(dict(acc=val_stats["val_accuracy"]))
    trainer.shutdown()
def test_correctness(ray_start_2_cpus, num_workers, use_local):
    """The PTL-derived operator and CorrectnessOperator must produce
    identical training and validation statistics on the same layer."""
    layer = nn.Linear(1, 1)

    # Run once through the operator built from the Lightning module.
    ptl_op = TrainingOperator.from_ptl(PTL_Module)
    trainer1 = TorchTrainer(
        training_operator_cls=ptl_op,
        config={
            "layer": layer,
            "data_size": 3,
            "batch_size": 1
        },
        num_workers=num_workers,
        use_local=use_local)
    train1_stats = trainer1.train()
    val1_stats = trainer1.validate()
    trainer1.shutdown()

    # Run again through the hand-written reference operator.
    trainer2 = TorchTrainer(
        training_operator_cls=CorrectnessOperator,
        scheduler_step_freq="manual",
        config={
            "layer": layer,
            "data_size": 3,
            "batch_size": 1
        },
        num_workers=num_workers,
        use_local=use_local)
    train2_stats = trainer2.train()
    val2_stats = trainer2.validate()
    trainer2.shutdown()

    # Losses must match exactly; note the PTL operator reports accuracy
    # under "val_acc" while the reference uses "val_accuracy".
    assert train1_stats["train_loss"] == train2_stats["train_loss"]
    assert val1_stats["val_loss"] == val2_stats["val_loss"]
    assert val1_stats["val_acc"] == val2_stats["val_accuracy"]
def train_mnist(num_workers=1, use_gpu=False, num_epochs=5):
    """Train the LitMNIST Lightning module via Ray SGD and checkpoint it.

    Args:
        num_workers: Number of distributed training workers.
        use_gpu: Whether each worker trains on a GPU.
        num_epochs: Number of passes over the training data.
    """
    operator_cls = TrainingOperator.from_ptl(LitMNIST)
    trainer = TorchTrainer(
        training_operator_cls=operator_cls,
        num_workers=num_workers,
        config={"lr": 1e-3, "batch_size": 64},
        use_gpu=use_gpu,
        use_tqdm=True,
    )

    # Print the per-epoch training stats, then a final validation pass.
    for _ in range(num_epochs):
        print(trainer.train())
    print(trainer.validate())

    print("Saving model checkpoint to ./model.pt")
    trainer.save("./model.pt")
    print("Model Checkpointed!")
    trainer.shutdown()
    print("success!")
    return train_loader, validation_loader


def optimizer_creator(model, config):
    """Returns optimizer"""
    # SGD with learning rate / momentum taken from config, with defaults.
    return torch.optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.1),
        momentum=config.get("momentum", 0.9))


# Connect to a running cluster unless this is a smoke test (local Ray).
ray.init(address="auto" if not args.smoke_test else None, log_to_driver=True)
num_training_workers = 1 if args.smoke_test else 3

CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=ResNet18,
    optimizer_creator=optimizer_creator,
    data_creator=cifar_creator,
    loss_creator=nn.CrossEntropyLoss)

# Expose the trainer as a Tune Trainable; batch size scales with workers.
TorchTrainable = TorchTrainer.as_trainable(
    training_operator_cls=CustomTrainingOperator,
    initialization_hook=initialization_hook,
    num_workers=num_training_workers,
    config={
        "test_mode": args.smoke_test,
        BATCH_SIZE: 128 * num_training_workers,
    },
    use_gpu=not args.smoke_test,
    backend="gloo",  # This should also work with NCCL
)
def get_custom_training_operator(lr_reduce_on_plateau=False):
    """Build a TrainingOperator from this module's creator functions.

    Args:
        lr_reduce_on_plateau: When True, attach the scheduler creator;
            otherwise no LR scheduler is configured.
    """
    scheduler = scheduler_creator if lr_reduce_on_plateau else None
    return TrainingOperator.from_creators(
        model_creator=model_creator,
        optimizer_creator=optimizer_creator,
        data_creator=data_creator,
        loss_creator=nn.MSELoss,
        scheduler_creator=scheduler)
    return torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)
# __torch_scheduler_end__

# NOTE: the __*_start__/__*_end__ markers delimit snippets included in the
# rendered documentation — keep them in place.
# __torch_ray_start__
import ray

ray.init()
# or ray.init(address="auto") to connect to a running cluster.
# __torch_ray_end__

# __backwards_compat_start__
from ray.util.sgd import TorchTrainer

MyTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    loss_creator=loss_creator,
    scheduler_creator=scheduler_creator,
    data_creator=data_creator)

trainer = TorchTrainer(
    training_operator_cls=MyTrainingOperator,
    scheduler_step_freq="epoch",  # if scheduler_creator is passed in
    config={"lr": 0.001, "batch_size": 64})
# __backwards_compat_end__

trainer.shutdown()

# __torch_trainer_start__
from ray.util.sgd import TorchTrainer

trainer = TorchTrainer(
        self.loss = nn.MSELoss()

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.val_loader

    def on_save_checkpoint(self, checkpoint):
        # Stash rand_int in the checkpoint so restore can round-trip it.
        checkpoint["int"] = self.rand_int

    def on_load_checkpoint(self, checkpoint):
        self.rand_int = checkpoint["int"]


# Wrap the Lightning module above as a Ray SGD TrainingOperator.
Operator = TrainingOperator.from_ptl(PTL_Module)


@pytest.mark.parametrize("use_local", [True, False])
def test_single_step(ray_start_2_cpus, use_local):  # noqa: F811
    # A single train/validate step should report exactly one batch.
    trainer = TorchTrainer(
        training_operator_cls=Operator,
        num_workers=1,
        use_local=use_local,
        use_gpu=False)
    metrics = trainer.train(num_steps=1)
    assert metrics[BATCH_COUNT] == 1
    val_metrics = trainer.validate(num_steps=1)
    assert val_metrics[BATCH_COUNT] == 1
    trainer.shutdown()
parser.add_argument(
    "--use-gpu",
    action="store_true",
    default=False,
    help="Enables GPU training")
parser.add_argument(
    "--lr-reduce-on-plateau",
    action="store_true",
    default=False,
    help="If enabled, use a ReduceLROnPlateau scheduler. If not set, "
    "no scheduler is used."
)
args, _ = parser.parse_known_args()

# Smoke tests use a small local Ray instance; otherwise connect to the
# cluster at --address.
if args.smoke_test:
    ray.init(num_cpus=3)
else:
    ray.init(address=args.address)

CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    data_creator=data_creator,
    loss_creator=nn.MSELoss,
    scheduler_creator=scheduler_creator
    if args.lr_reduce_on_plateau else None)

# NOTE(review): the manual variant presumably steps the scheduler by hand,
# as ReduceLROnPlateau needs a metric passed to step() — confirm in
# tune_example_manual.
if not args.lr_reduce_on_plateau:
    tune_example(
        CustomTrainingOperator,
        num_workers=args.num_workers,
        use_gpu=args.use_gpu)
else:
    tune_example_manual(
        CustomTrainingOperator,
        num_workers=args.num_workers,
        use_gpu=args.use_gpu)
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test", action="store_true", help="Finish quickly for testing")
parser.add_argument(
    "--address", type=str, help="the address to use for Ray")
parser.add_argument(
    "--num-workers",
    "-n",
    type=int,
    default=1,
    help="Sets number of workers for training.")
parser.add_argument(
    "--use-gpu",
    action="store_true",
    default=False,
    help="Enables GPU training")
args, _ = parser.parse_known_args()

# Smoke tests run on a 2-CPU local Ray; otherwise attach to --address.
if args.smoke_test:
    ray.init(num_cpus=2)
else:
    ray.init(address=args.address)

CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=model_creator,
    optimizer_creator=optimizer_creator,
    data_creator=data_creator,
    loss_creator=nn.MSELoss)

tune_example(
    CustomTrainingOperator,
    num_workers=args.num_workers,
    use_gpu=args.use_gpu)
        batch_size=config["batch"])
    validation_loader = DataLoader(
        CIFAR10(root="~/data", download=True, transform=tfms),
        batch_size=config["batch"])
    return train_loader, validation_loader


def optimizer_creator(model, config):
    """Returns an optimizer (or multiple)"""
    return torch.optim.SGD(model.parameters(), lr=config["lr"])


CustomTrainingOperator = TrainingOperator.from_creators(
    model_creator=ResNet18,  # A function that returns a nn.Module
    optimizer_creator=optimizer_creator,  # A function that returns an optimizer
    data_creator=cifar_creator,  # A function that returns dataloaders
    loss_creator=torch.nn.CrossEntropyLoss  # A loss function
)

ray.init()

trainer = TorchTrainer(
    training_operator_cls=CustomTrainingOperator,
    config={
        "lr": 0.01,  # used in optimizer_creator
        "batch": 64  # used in data_creator
    },
    num_workers=2,  # amount of parallelism
    use_gpu=torch.cuda.is_available(),
    use_tqdm=True)