def train_func_checkpoint(): checkpoint = train.load_checkpoint() assert checkpoint is not None assert checkpoint["epoch"] == 2 for i in range(checkpoint["epoch"], 5): train.save_checkpoint(epoch=i) return 1
def train_func(): checkpoint = train.load_checkpoint() if checkpoint: epoch = checkpoint["epoch"] else: epoch = 0 for i in range(epoch, epoch + 2): train.save_checkpoint(epoch=i)
def train_func_checkpoint(): checkpoint = train.load_checkpoint() assert checkpoint is not None assert checkpoint["epoch"] == 3 result = [] for i in range(checkpoint["epoch"], 5): result.append(i) return result
def train_func(config):
    itr = 0
    ckpt = train.load_checkpoint()
    if ckpt is not None:
        itr = ckpt["iter"] + 1

    for i in range(itr, config["max_iter"]):
        train.save_checkpoint(iter=i)
        train.report(test=i, training_iteration=i)

def train_func(): checkpoint = train.load_checkpoint() if checkpoint: epoch = checkpoint["epoch"] else: epoch = 0 print("Epoch: ", epoch) for i in range(epoch, 2): train.report(loss=1, iter=i) train.save_checkpoint(epoch=i + 1)
def train_func():
    ckpt = train.load_checkpoint()
    restored = bool(ckpt)  # Does a previous checkpoint exist?
    itr = 0
    if ckpt:
        itr = ckpt["iter"] + 1

    for i in range(itr, 4):
        if i == 2 and not restored:
            raise Exception("try to fail me")
        train.save_checkpoint(iter=i)
        train.report(test=i, training_iteration=i)

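This function is built to fail exactly once: on a fresh run there is no checkpoint, so the exception fires at i == 2; on a restarted run the loaded checkpoint marks the worker as restored and training resumes past the failure point. A rough driver sketch follows, assuming the legacy ray.train Trainer API and assuming worker recovery is controlled by a `max_retries` argument; verify both against the version you are running.

from ray.train import Trainer

# Sketch only: with retries enabled, the trainer restarts failed workers and
# hands them the most recently saved checkpoint, so `train.load_checkpoint()`
# is non-empty on the retry and the injected failure at i == 2 is skipped.
trainer = Trainer(backend="torch", num_workers=2, max_retries=3)
trainer.start()
results = trainer.run(train_func)
trainer.shutdown()
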
def train_func():
    checkpoint = train.load_checkpoint()
    train.report(**checkpoint)
    train.save_checkpoint(**checkpoint)
    # `key` is expected to be defined in the enclosing scope.
    return checkpoint[key]

def _huggingface_train_loop_per_worker(config):
    """Per-worker training loop for HuggingFace Transformers."""
    trainer_init_per_worker = config.pop("_trainer_init_per_worker")

    # Env vars necessary for HF to set up DDP
    os.environ["RANK"] = str(train.world_rank())
    os.environ["WORLD_SIZE"] = str(train.world_size())
    os.environ["LOCAL_RANK"] = str(train.local_rank())

    train_dataset = train.get_dataset_shard(TRAIN_DATASET_KEY)
    eval_dataset = train.get_dataset_shard(EVALUATION_DATASET_KEY)

    train_torch_dataset, eval_torch_dataset = process_datasets(
        train_dataset,
        eval_dataset,
    )

    trainer: transformers.trainer.Trainer = trainer_init_per_worker(
        train_torch_dataset, eval_torch_dataset, **config)

    if trainer.args.push_to_hub and not trainer.args.hub_token:
        warnings.warn(
            "You have set `push_to_hub=True` but didn't specify `hub_token`. "
            "Pushing to hub will most likely fail, as the credentials will not "
            "be automatically propagated from the local environment to the Ray Actors. "
            "If that happens, specify `hub_token` in `TrainingArguments`.")

    if (trainer.args.evaluation_strategy == "steps"
            or trainer.args.save_strategy == "steps"
            or trainer.args.logging_strategy == "steps"):
        raise ValueError(
            "'steps' value for `evaluation_strategy`, `logging_strategy` "
            "or `save_strategy` is not yet supported.")

    trainer = wrap_transformers_trainer(trainer)

    # Ensure no HF logging callbacks are added: aside from duplicating the
    # functionality of our callbacks, the WandB callback causes training to freeze.
    integration_callbacks = transformers.trainer.get_reporting_integration_callbacks(
        trainer.args.report_to)
    for callback in integration_callbacks:
        trainer.pop_callback(callback)

    trainer.add_callback(TrainReportCallback)

    checkpoint = train.load_checkpoint()
    checkpoint_path = None
    remove_checkpoint_path = False
    if checkpoint:
        source_ip = checkpoint[NODE_IP_KEY]
        source_path = checkpoint[CHECKPOINT_PATH_ON_NODE_KEY]
        target_ip = get_node_ip_address()
        if source_ip == target_ip:
            checkpoint_path = source_path
        else:
            # Sync the checkpoint directory from the node it was saved on
            # to a temporary directory on this node.
            checkpoint_path = tempfile.mkdtemp(
                suffix=Path(trainer.args.output_dir).name)
            remove_checkpoint_path = True
            sync_dir_between_nodes(
                source_ip=source_ip,
                source_path=source_path,
                target_ip=target_ip,
                target_path=checkpoint_path,
                return_futures=False,
                max_size_bytes=None,
            )

    trainer.train(resume_from_checkpoint=checkpoint_path)

    if remove_checkpoint_path:
        shutil.rmtree(checkpoint_path, ignore_errors=True)

def validate(): checkpoint = train.load_checkpoint() assert checkpoint is not None assert checkpoint["loss"] == 3
def validate():
    checkpoint = train.load_checkpoint()
    assert checkpoint is not None
    # `latest_checkpoint` is expected to be defined in the enclosing scope.
    assert checkpoint == latest_checkpoint

def train_func():
    assert train.load_checkpoint() is None

    for i in range(3):
        train.save_checkpoint(epoch=i)

    return 1

def train_loop_per_worker(config):
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ResNet18(None).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    epoch = 0

    checkpoint = train.load_checkpoint()
    if checkpoint:
        model_state = checkpoint["model_state"]
        optimizer_state = checkpoint["optimizer_state"]
        epoch = checkpoint["epoch"]

        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    criterion = nn.CrossEntropyLoss()
    optimizer = hvd.DistributedOptimizer(optimizer)
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)

    # To ensure consistent initialization across workers,
    # broadcast parameters and optimizer state from the root rank.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    trainset = ray.get(config["data"])
    trainloader = DataLoader(
        trainset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=4)

    for epoch in range(epoch, 40):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            train.report(loss=running_loss / epoch_steps)
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / epoch_steps))

        train.save_checkpoint(
            model_state=net.state_dict(),
            optimizer_state=optimizer.state_dict(),
            epoch=epoch,
        )

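A sketch of how such a loop might be launched, assuming the legacy ray.train Trainer with its Horovod backend. `load_cifar10_trainset` is a hypothetical helper standing in for whatever torch Dataset the loop expects; the dataset is put into the object store so each worker can fetch it via ray.get(config["data"]).

import ray
from ray.train import Trainer

trainset = load_cifar10_trainset()  # hypothetical helper, not part of the snippet above

trainer = Trainer(backend="horovod", num_workers=2, use_gpu=True)
trainer.start()
results = trainer.run(
    train_loop_per_worker,
    config={"lr": 0.001, "batch_size": 64, "data": ray.put(trainset)},
)
trainer.shutdown()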