def train_func():
    if session.get_checkpoint():
        with session.get_checkpoint().as_directory() as checkpoint_dir:
            import tensorflow as tf

            model = tf.keras.models.load_model(checkpoint_dir)
    else:
        model = build_model()

    model.save("my_model", overwrite=True)
    session.report(
        metrics={"iter": 1},
        checkpoint=Checkpoint.from_directory("my_model"),
    )
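# A minimal launch sketch (not part of the original example) showing how a
# function like the one above is typically wired up so that session.report()
# and session.get_checkpoint() work. Assumes Ray 2.x with the AIR session API;
# keeping two checkpoints is just an illustrative choice.
from ray import tune
from ray.air import RunConfig, CheckpointConfig

tuner = tune.Tuner(
    train_func,
    run_config=RunConfig(
        checkpoint_config=CheckpointConfig(num_to_keep=2),
    ),
)
results = tuner.fit()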
def train_func(config):
    step = 0
    width, height = config["width"], config["height"]

    if session.get_checkpoint():
        loaded_checkpoint = session.get_checkpoint()
        step = loaded_checkpoint.to_dict()["step"] + 1

    for step in range(step, 100):
        intermediate_score = evaluation_fn(step, width, height)
        checkpoint = Checkpoint.from_dict({"step": step})
        session.report(
            {"iterations": step, "mean_loss": intermediate_score},
            checkpoint=checkpoint,
        )
def train_func():
    checkpoint = session.get_checkpoint()
    if checkpoint:
        epoch = checkpoint.to_dict()["epoch"]
    else:
        epoch = 0

    for i in range(epoch, epoch + 2):
        session.report({"epoch": i}, checkpoint=Checkpoint.from_dict({"epoch": i}))
def train_convnet(config):
    # Create our data loaders, model, and optimizer.
    step = 0
    train_loader, test_loader = get_data_loaders()
    model = ConvNet()
    optimizer = optim.SGD(
        model.parameters(),
        lr=config.get("lr", 0.01),
        momentum=config.get("momentum", 0.9),
    )

    # If `session.get_checkpoint()` is not None, then we are resuming from a checkpoint.
    # Load the model state and iteration step from the checkpoint.
    if session.get_checkpoint():
        print("Loading from checkpoint.")
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            model.load_state_dict(checkpoint["model_state_dict"])
            step = checkpoint["step"]

    while True:
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        checkpoint = None
        if step % 5 == 0:
            # Every 5 steps, checkpoint our current state.
            # We need to create a directory under the current working directory
            # to construct an AIR Checkpoint object from.
            os.makedirs("my_model", exist_ok=True)
            torch.save(
                {
                    "step": step,
                    "model_state_dict": model.state_dict(),
                },
                "my_model/checkpoint.pt",
            )
            checkpoint = Checkpoint.from_directory("my_model")
        step += 1
        session.report({"mean_accuracy": acc}, checkpoint=checkpoint)
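# Hedged sketch (an assumption, not from the original example): because
# train_convnet() loops forever, the run needs an external stopping condition.
# With Ray 2.x Tune this can be expressed via a `stop` criterion on RunConfig;
# the hyperparameter values below are illustrative.
from ray import tune
from ray.air import RunConfig

tuner = tune.Tuner(
    train_convnet,
    param_space={"lr": 0.01, "momentum": 0.9},
    run_config=RunConfig(stop={"training_iteration": 50}),
)
tuner.fit()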
def train_func(config):
    itr = 0
    ckpt = session.get_checkpoint()
    if ckpt is not None:
        ckpt = ckpt.to_dict()
        itr = ckpt["iter"] + 1

    for i in range(itr, config["max_iter"]):
        session.report(
            dict(test=i, training_iteration=i),
            checkpoint=Checkpoint.from_dict(dict(iter=i)),
        )
def train_func():
    ckpt = session.get_checkpoint()
    restored = bool(ckpt)  # Does a previous checkpoint exist?
    itr = 0
    if ckpt:
        ckpt = ckpt.to_dict()
        itr = ckpt["iter"] + 1

    for i in range(itr, 4):
        if i == 2 and not restored:
            raise Exception("try to fail me")
        session.report(
            dict(test=i, training_iteration=i),
            checkpoint=Checkpoint.from_dict(dict(iter=i)),
        )
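# Hedged sketch (not in the original): the function above deliberately fails on
# its first pass, so a run configuration that allows a retry lets Tune restart
# the trial from the last reported checkpoint. Assumes Ray 2.x AIR configs.
from ray import tune
from ray.air import RunConfig, FailureConfig

tuner = tune.Tuner(
    train_func,
    run_config=RunConfig(failure_config=FailureConfig(max_failures=1)),
)
tuner.fit()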
def train_fn(config):
    start_epoch = 0
    print(session.get_trial_resources())

    checkpoint = session.get_checkpoint()
    if checkpoint:
        # Assume that we have run the session.report() example
        # and successfully saved some model weights.
        checkpoint_dict = checkpoint.to_dict()
        start_epoch = checkpoint_dict.get("epoch", -1) + 1

    # (In a real script, you would wrap the model in DDP here.)
    for epoch in range(start_epoch, config["num_epochs"]):
        checkpoint = Checkpoint.from_dict(dict(epoch=epoch))
        session.report(
            {
                "metric": config["metric"] * epoch,
                "epoch": epoch,
                "num_cpus": session.get_trial_resources().required_resources["CPU"],
            },
            checkpoint=checkpoint,
        )
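# Hedged sketch (an assumption, not from the original): since train_fn() reads
# session.get_trial_resources(), it is usually launched with an explicit
# per-trial resource request, e.g. via tune.with_resources in Ray 2.x.
# The param_space values are illustrative.
from ray import tune

tuner = tune.Tuner(
    tune.with_resources(train_fn, {"cpu": 2}),
    param_space={"num_epochs": 2, "metric": 1.0},
)
tuner.fit()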
def train_cifar(config):
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    # Load an existing checkpoint through the `session.get_checkpoint()` API.
    if session.get_checkpoint():
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt"))
            net.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    data_dir = os.path.abspath("./data")
    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # Get the inputs; data is a list of [inputs, labels].
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients.
            optimizer.zero_grad()

            # Forward + backward + optimize.
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Print statistics.
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss.
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and can be accessed through `session.get_checkpoint()`
        # in future iterations.
        # Note: to save a single file like this, you still need to put it
        # under a directory to construct an AIR checkpoint.
        os.makedirs("my_model", exist_ok=True)  # ok to overwrite the previous one
        path = os.path.join("my_model", "checkpoint.pt")
        torch.save((net.state_dict(), optimizer.state_dict()), path)
        checkpoint = Checkpoint.from_directory("my_model")

        session.report(
            {"loss": (val_loss / val_steps), "accuracy": correct / total},
            checkpoint=checkpoint,
        )
    print("Finished Training")
def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to set up DDP.
    os.environ["RANK"] = str(train.world_rank())
    os.environ["WORLD_SIZE"] = str(train.world_size())
    os.environ["LOCAL_RANK"] = str(train.local_rank())

    train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config)

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local environment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`.")

    if (trainer.args.evaluation_strategy == "steps"
            or trainer.args.save_strategy == "steps"
            or trainer.args.logging_strategy == "steps"):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported.")

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added.
    # Aside from duplicating functionality with our callbacks,
    # the Wandb callback causes training to freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to)
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = session.get_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        assert isinstance(checkpoint, Checkpoint)
        checkpoint_dict = checkpoint.to_dict()
        source_ip = checkpoint_dict[NODE_IP_KEY]
        source_path = checkpoint_dict[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            # The checkpoint lives on a different node, so sync it into a
            # temporary directory on this node first.
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name)
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )

    trainer.train(resume_from_checkpoint=checkpoint_path)

    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)
def train_func():
    checkpoint = session.get_checkpoint()
    session.report(metrics=checkpoint.to_dict(), checkpoint=checkpoint)
    # `key` is expected to be defined in the enclosing scope.
    return checkpoint.to_dict()[key]
def dcgan_train(config):
    step = 0
    use_cuda = config.get("use_gpu") and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    netD = Discriminator().to(device)
    netD.apply(weights_init)
    netG = Generator().to(device)
    netG.apply(weights_init)
    criterion = nn.BCELoss()
    optimizerD = optim.Adam(
        netD.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))
    optimizerG = optim.Adam(
        netG.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataloader = get_data_loader()

    if session.get_checkpoint():
        loaded_checkpoint = session.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            checkpoint = torch.load(path)
            netD.load_state_dict(checkpoint["netDmodel"])
            netG.load_state_dict(checkpoint["netGmodel"])
            optimizerD.load_state_dict(checkpoint["optimD"])
            optimizerG.load_state_dict(checkpoint["optimG"])
            step = checkpoint["step"]

        if "netD_lr" in config:
            for param_group in optimizerD.param_groups:
                param_group["lr"] = config["netD_lr"]
        if "netG_lr" in config:
            for param_group in optimizerG.param_groups:
                param_group["lr"] = config["netG_lr"]

    while True:
        lossG, lossD, is_score = train(
            netD,
            netG,
            optimizerG,
            optimizerD,
            criterion,
            dataloader,
            step,
            device,
            config["mnist_model_ref"],
        )
        step += 1

        os.makedirs("my_model", exist_ok=True)
        torch.save(
            {
                "netDmodel": netD.state_dict(),
                "netGmodel": netG.state_dict(),
                "optimD": optimizerD.state_dict(),
                "optimG": optimizerG.state_dict(),
                "step": step,
            },
            "my_model/checkpoint.pt",
        )

        session.report(
            {"lossg": lossG, "lossd": lossD, "is_score": is_score},
            checkpoint=Checkpoint.from_directory("my_model"),
        )
def train_loop_per_worker(config):
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ResNet18(None).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    epoch = 0

    checkpoint = session.get_checkpoint()
    if checkpoint:
        checkpoint_dict = checkpoint.to_dict()
        model_state = checkpoint_dict["model_state"]
        optimizer_state = checkpoint_dict["optimizer_state"]
        epoch = checkpoint_dict["epoch"]

        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    criterion = nn.CrossEntropyLoss()
    optimizer = hvd.DistributedOptimizer(optimizer)
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # To ensure consistent initialization across workers,
    # broadcast the model and optimizer state from rank 0.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    trainset = ray.get(config["data"])
    trainloader = DataLoader(
        trainset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=4)

    trainloader_len = len(trainloader)
    for epoch in range(epoch, 40):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader):
            # Get the inputs; data is a list of [inputs, labels].
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients.
            optimizer.zero_grad()

            # Forward + backward + optimize.
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Print statistics.
            running_loss += loss.item()
            epoch_steps += 1

            # Only attach a checkpoint on the last batch of each epoch.
            if i == trainloader_len - 1:
                checkpoint = Checkpoint.from_dict(
                    dict(
                        model_state=net.state_dict(),
                        optimizer_state=optimizer.state_dict(),
                        epoch=epoch,
                    ))
            else:
                checkpoint = None
            session.report(
                dict(loss=running_loss / epoch_steps),
                checkpoint=checkpoint,
            )

            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / epoch_steps))
def pbt_function(config):
    """Toy PBT problem for benchmarking adaptive learning rate.

    The goal is to optimize this trainable's accuracy. The accuracy increases
    fastest at the optimal lr, which is a function of the current accuracy.

    The optimal lr schedule for this problem is a triangle wave as follows.
    Note that many lr schedules for real models also follow this shape:

     best lr
      ^
      |    /\
      |   /  \
      |  /    \
      | /      \
      ------------> accuracy

    In this problem, using PBT with a population of 2-4 is sufficient to
    roughly approximate this lr schedule. Higher population sizes will yield
    faster convergence. Training will not converge without PBT.
    """
    lr = config["lr"]
    accuracy = 0.0  # end = 1000
    start = 0
    if session.get_checkpoint():
        state = session.get_checkpoint().to_dict()
        accuracy = state["acc"]
        start = state["step"]

    midpoint = 100  # lr starts decreasing after acc > midpoint
    q_tolerance = 3  # penalize exceeding lr by more than this multiple
    noise_level = 2  # add gaussian noise to the acc increase

    # triangle wave:
    #  - start at 0.001 @ t=0,
    #  - peak at 0.01 @ t=midpoint,
    #  - end at 0.001 @ t=midpoint * 2,
    for step in range(start, 100):
        if accuracy < midpoint:
            optimal_lr = 0.01 * accuracy / midpoint
        else:
            optimal_lr = 0.01 - 0.01 * (accuracy - midpoint) / midpoint
        optimal_lr = min(0.01, max(0.001, optimal_lr))

        # Compute the accuracy increase.
        q_err = max(lr, optimal_lr) / min(lr, optimal_lr)
        if q_err < q_tolerance:
            accuracy += (1.0 / q_err) * random.random()
        elif lr > optimal_lr:
            accuracy -= (q_err - q_tolerance) * random.random()
        accuracy += noise_level * np.random.normal()
        accuracy = max(0, accuracy)

        checkpoint = None
        if step % 3 == 0:
            # Save the current progress so PBT can exploit and restore it later.
            checkpoint = Checkpoint.from_dict({"acc": accuracy, "step": step})

        session.report(
            {
                "mean_accuracy": accuracy,
                "cur_lr": lr,
                "optimal_lr": optimal_lr,  # for debugging
                "q_err": q_err,  # for debugging
                "done": accuracy > midpoint * 2,  # this stops the training process
            },
            checkpoint=checkpoint,
        )
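# Hedged sketch (not part of the original function): pbt_function() is meant to
# be driven by a PopulationBasedTraining scheduler that perturbs "lr" between
# checkpoints. The perturbation interval, search range, and sample count below
# are illustrative choices, assuming the Ray 2.x Tuner API.
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining

pbt = PopulationBasedTraining(
    time_attr="training_iteration",
    perturbation_interval=4,
    hyperparam_mutations={"lr": tune.uniform(0.0001, 0.02)},
)
tuner = tune.Tuner(
    pbt_function,
    tune_config=tune.TuneConfig(
        scheduler=pbt,
        metric="mean_accuracy",
        mode="max",
        num_samples=4,
    ),
    param_space={"lr": 0.0001},
)
tuner.fit()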