def fit(
    self,
    train_dataset,
    learning_rate=5e-5,
    per_gpu_batch_size=8,
    num_epochs=1,
    recover_step=-1,
    recover_dir=None,
    save_model_to_dir=None,
    max_steps=-1,
    local_rank=-1,
    num_gpus=None,
    gpu_ids=None,
    gradient_accumulation_steps=1,
    weight_decay=0.01,
    adam_epsilon=1e-8,
    warmup_steps=0,
    fp16=False,
    fp16_opt_level="O1",
    max_grad_norm=1.0,
    verbose=True,
    seed=None,
    random_prob=0.1,
    keep_prob=0.1,
):
    """
    Method for model fine-tuning.

    Args:
        train_dataset (S2SAbsSumDataset): Training dataset.
        learning_rate (float, optional): Learning rate. Defaults to 5e-5.
        per_gpu_batch_size (int, optional): Number of samples in each batch per GPU.
            Defaults to 8.
        num_epochs (int, optional): Number of passes through the entire training
            dataset. Ignored if `max_steps` is set. Defaults to 1.
        recover_step (int, optional): Step number to resume model fine-tuning from,
            assuming the model was saved by `S2SAbstractiveSummarizer.save_model`
            and the name is in the format "model.{recover_step}.bin".
            Defaults to -1, which means start model fine-tuning from scratch.
        recover_dir (str, optional): Directory to load the model from if
            `recover_step` is provided. Defaults to None.
        save_model_to_dir (str, optional): Directory to save the model to.
            Defaults to None and the fine-tuned model is not saved.
        max_steps (int, optional): Maximum number of training steps. Defaults to -1
            and the number of training steps is determined by `num_epochs` and the
            length of `train_dataset`.
        local_rank (int, optional): Rank of the device in distributed training.
            Defaults to -1, which means non-distributed training.
        num_gpus (int, optional): Number of GPUs to use. Ignored if `gpu_ids` is
            provided. Defaults to None and all available GPUs are used.
        gpu_ids (list, optional): List of GPU IDs to use. Defaults to None and the
            GPUs used are determined by `num_gpus`.
        gradient_accumulation_steps (int, optional): Number of steps to accumulate
            gradients before each back propagation and model parameter update.
            Defaults to 1.
        weight_decay (float, optional): Weight decay to apply after each parameter
            update. Defaults to 0.01.
        adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
            Defaults to 1e-8.
        warmup_steps (int, optional): Number of steps taken to increase the learning
            rate from 0 to `learning_rate`. Defaults to 0.
        fp16 (bool, optional): Whether to use 16-bit mixed precision through Apex.
            Defaults to False.
        fp16_opt_level (str, optional): Apex AMP optimization level for fp16.
            One of 'O0', 'O1', 'O2', or 'O3'.
            See https://nvidia.github.io/apex/amp.html. Defaults to "O1".
        max_grad_norm (float, optional): Maximum gradient norm for gradient clipping.
            Defaults to 1.0.
        verbose (bool, optional): Whether to output the training log. Defaults to True.
        seed (int, optional): Random seed for model initialization. Defaults to None.
        random_prob (float, optional): Probability of randomly replacing a masked
            token. Defaults to 0.1.
        keep_prob (float, optional): Probability of keeping a masked token unchanged.
            Defaults to 0.1.
""" global_step = 0 if recover_step > 0: model_recover_checkpoint = os.path.join( recover_dir, "model.{}.bin".format(recover_step)) logger.info(" ** Recover model checkpoint in %s ** ", model_recover_checkpoint) model_state_dict = torch.load(model_recover_checkpoint, map_location="cpu") optimizer_recover_checkpoint = os.path.join( recover_dir, "optim.{}.bin".format(recover_step)) checkpoint_state_dict = torch.load(optimizer_recover_checkpoint, map_location="cpu") checkpoint_state_dict["model"] = model_state_dict global_step = recover_step else: checkpoint_state_dict = None device, num_gpus, amp = self.prepare_model_and_optimizer( num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank, fp16=fp16, fp16_opt_level=fp16_opt_level, weight_decay=weight_decay, learning_rate=learning_rate, adam_epsilon=adam_epsilon, checkpoint_state_dict=checkpoint_state_dict, ) per_node_train_batch_size = (per_gpu_batch_size * max(1, num_gpus) * gradient_accumulation_steps) # actual batch size, i.e. number of samples between each parameter update batch_size = per_node_train_batch_size * ( torch.distributed.get_world_size() if local_rank != -1 else 1) # max_steps is mainly used by the scheduler to determine the learning rate, # together with global_step if max_steps == -1: max_steps = max(num_epochs * len(train_dataset) // batch_size, 1) if max_steps <= global_step: logger.info( "Training is done. Please use a new dir or clean this dir!") return self.scheduler = Transformer.get_default_scheduler( optimizer=self.optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps, ) if recover_step > 0: self.scheduler.load_state_dict( checkpoint_state_dict["lr_scheduler"]) train_dataset = Seq2seqDatasetForBert( features=train_dataset, max_source_len=self.max_source_seq_length, max_target_len=self.max_target_seq_length, vocab_size=self.tokenizer.vocab_size, cls_id=self.tokenizer.cls_token_id, sep_id=self.tokenizer.sep_token_id, pad_id=self.tokenizer.pad_token_id, mask_id=self.tokenizer.mask_token_id, random_prob=random_prob, keep_prob=keep_prob, num_training_instances=batch_size * max_steps, offset=batch_size * global_step, ) # The training features are shuffled train_sampler = (SequentialSampler(train_dataset) if local_rank == -1 else DistributedSampler(train_dataset, shuffle=False)) # batch_size of the dataloader is the number of samples to load each # iteration on each node train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=per_node_train_batch_size // gradient_accumulation_steps, collate_fn=batch_list_to_batch_tensors, ) global_step, _ = super().fine_tune( train_dataloader=train_dataloader, device=device, num_gpus=num_gpus, get_inputs=S2SAbsSumProcessor.get_inputs, max_steps=max_steps, global_step=global_step, gradient_accumulation_steps=gradient_accumulation_steps, optimizer=self.optimizer, scheduler=self.scheduler, local_rank=local_rank, fp16=fp16, amp=amp, max_grad_norm=max_grad_norm, verbose=verbose, seed=seed, ) if save_model_to_dir is not None and local_rank in [-1, 0]: self.save_model(save_model_to_dir, global_step, fp16) # release GPU memories self.model.cpu() torch.cuda.empty_cache()
def fit(
    self,
    train_dataloader,
    num_epochs=1,
    max_steps=-1,
    gradient_accumulation_steps=1,
    num_gpus=None,
    gpu_ids=None,
    local_rank=-1,
    weight_decay=0.0,
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    warmup_steps=0,
    fp16=False,
    fp16_opt_level="O1",
    checkpoint_state_dict=None,
    verbose=True,
    seed=None,
):
    """
    Fine-tunes a pre-trained sequence classification model.

    Args:
        train_dataloader (DataLoader): A PyTorch DataLoader to be used for training.
        num_epochs (int, optional): Number of training epochs. Defaults to 1.
        max_steps (int, optional): Total number of training steps. If set to a
            positive value, it overrides num_epochs. Otherwise, it's determined by
            the dataset length, gradient_accumulation_steps, and num_epochs.
            Defaults to -1.
        gradient_accumulation_steps (int, optional): Number of steps to accumulate
            before performing a backward/update pass. Defaults to 1.
        num_gpus (int, optional): The number of GPUs to use. If None, all available
            GPUs will be used. If set to 0 or GPUs are not available, the CPU device
            will be used. Defaults to None.
        gpu_ids (list): List of GPU IDs to be used. If set to None, the first
            num_gpus GPUs will be used. Defaults to None.
        local_rank (int, optional): Local rank for distributed training on GPUs.
            Defaults to -1, which means non-distributed training.
        weight_decay (float, optional): Weight decay to apply after each parameter
            update. Defaults to 0.0.
        learning_rate (float, optional): Learning rate of the AdamW optimizer.
            Defaults to 5e-5.
        adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
            Defaults to 1e-8.
        warmup_steps (int, optional): Number of steps taken to increase the learning
            rate from 0 to `learning_rate`. Defaults to 0.
        fp16 (bool): Whether to use 16-bit mixed precision through Apex.
            Defaults to False.
        fp16_opt_level (str): Apex AMP optimization level for fp16. One of 'O0',
            'O1', 'O2', or 'O3'. See https://nvidia.github.io/apex/amp.html.
            Defaults to "O1".
        checkpoint_state_dict (dict): Checkpoint states of the model and optimizer.
            If specified, the model's and optimizer's parameters are loaded from
            checkpoint_state_dict["model"] and checkpoint_state_dict["optimizer"].
            Defaults to None.
        verbose (bool, optional): Whether to print out the training log.
            Defaults to True.
        seed (int, optional): Random seed used to improve reproducibility.
            Defaults to None.
    """
    # init device and optimizer
    device, num_gpus, amp = self.prepare_model_and_optimizer(
        num_gpus=num_gpus,
        gpu_ids=gpu_ids,
        local_rank=local_rank,
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        adam_epsilon=adam_epsilon,
        fp16=fp16,
        fp16_opt_level=fp16_opt_level,
        checkpoint_state_dict=checkpoint_state_dict,
    )

    # compute the max number of training steps
    max_steps = compute_training_steps(
        dataloader=train_dataloader,
        num_epochs=num_epochs,
        max_steps=max_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
    )

    # init scheduler
    scheduler = Transformer.get_default_scheduler(
        optimizer=self.optimizer,
        warmup_steps=warmup_steps,
        num_training_steps=max_steps,
    )

    # fine tune
    super().fine_tune(
        train_dataloader=train_dataloader,
        get_inputs=Processor.get_inputs,
        device=device,
        num_gpus=num_gpus,
        max_steps=max_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optimizer=self.optimizer,
        scheduler=scheduler,
        fp16=fp16,
        amp=amp,
        local_rank=local_rank,
        verbose=verbose,
        seed=seed,
    )
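
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): calling the `fit` variant above with mixed
# precision and a recovered checkpoint. The `classifier` instance is assumed
# to belong to the class that defines this method; only the `fit` keyword
# arguments come from the docstring above.
# ---------------------------------------------------------------------------
def _example_classifier_fit_fp16(classifier, train_dataloader, checkpoint_state_dict=None):
    # `train_dataloader` is a torch.utils.data.DataLoader over the training set;
    # `checkpoint_state_dict`, if given, holds previously saved "model" and
    # "optimizer" states to resume from.
    classifier.fit(
        train_dataloader=train_dataloader,
        num_epochs=3,
        gradient_accumulation_steps=2,
        learning_rate=2e-5,
        warmup_steps=100,
        fp16=True,                                    # requires Apex
        fp16_opt_level="O1",
        checkpoint_state_dict=checkpoint_state_dict,
        seed=42,
    )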
def fit(
    self,
    train_dataloader,
    num_epochs=1,
    max_steps=-1,
    gradient_accumulation_steps=1,
    num_gpus=None,
    gpu_ids=None,
    local_rank=-1,
    weight_decay=0.0,
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    warmup_steps=0,
    verbose=True,
    seed=None,
):
    """
    Fine-tunes a pre-trained sequence classification model.

    Args:
        train_dataloader (DataLoader): A PyTorch DataLoader to be used for training.
        num_epochs (int, optional): Number of training epochs. Defaults to 1.
        max_steps (int, optional): Total number of training steps. If set to a
            positive value, it overrides num_epochs. Otherwise, it's determined by
            the dataset length, gradient_accumulation_steps, and num_epochs.
            Defaults to -1.
        gradient_accumulation_steps (int, optional): Number of steps to accumulate
            before performing a backward/update pass. Defaults to 1.
        num_gpus (int, optional): The number of GPUs to use. If None, all available
            GPUs will be used. If set to 0 or GPUs are not available, the CPU device
            will be used. Defaults to None.
        gpu_ids (list): List of GPU IDs to be used. If set to None, the first
            num_gpus GPUs will be used. Defaults to None.
        local_rank (int, optional): Local rank for distributed training on GPUs.
            Defaults to -1, which means non-distributed training.
        weight_decay (float, optional): Weight decay to apply after each parameter
            update. Defaults to 0.0.
        learning_rate (float, optional): Learning rate of the AdamW optimizer.
            Defaults to 5e-5.
        adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
            Defaults to 1e-8.
        warmup_steps (int, optional): Number of steps taken to increase the learning
            rate from 0 to `learning_rate`. Defaults to 0.
        verbose (bool, optional): Whether to print out the training log.
            Defaults to True.
        seed (int, optional): Random seed used to improve reproducibility.
            Defaults to None.
    """
    # init optimizer
    optimizer = Transformer.get_default_optimizer(
        self.model, weight_decay, learning_rate, adam_epsilon
    )

    # compute the max number of training steps
    max_steps = compute_training_steps(
        train_dataloader,
        num_epochs=num_epochs,
        max_steps=max_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
    )

    # init scheduler
    scheduler = Transformer.get_default_scheduler(
        optimizer=optimizer,
        warmup_steps=warmup_steps,
        num_training_steps=max_steps,
    )

    # fine tune
    super().fine_tune(
        train_dataloader=train_dataloader,
        get_inputs=Processor.get_inputs,
        num_gpus=num_gpus,
        gpu_ids=gpu_ids,
        max_steps=max_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optimizer=optimizer,
        scheduler=scheduler,
        local_rank=local_rank,
        verbose=verbose,
        seed=seed,
    )
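
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal call to the simplified `fit`
# above, which builds its own AdamW optimizer and scheduler internally. The
# `classifier` instance and DataLoader construction are assumptions; only the
# `fit` keyword arguments come from the docstring above.
# ---------------------------------------------------------------------------
def _example_classifier_fit(classifier, train_dataloader):
    classifier.fit(
        train_dataloader=train_dataloader,
        num_epochs=1,
        gradient_accumulation_steps=1,
        num_gpus=1,             # use a single GPU; set to 0 to run on CPU
        learning_rate=5e-5,
        weight_decay=0.0,
        verbose=True,
        seed=42,
    )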