Example #1
    def prepare_model_and_optimizer(
        self,
        num_gpus,
        gpu_ids,
        local_rank,
        weight_decay,
        learning_rate,
        adam_epsilon,
        fp16=False,
        fp16_opt_level="O1",
        checkpoint_state_dict=None,
    ):
        """
        This function initializes an optimizer and moves the model to a device.
        It can be used by most child classes before calling fine_tune.
        Child classes that require custom optimizers need to either override
        this function or perform the steps listed below, in the specified
        order, before fine-tuning (a sketch of such an override appears at the
        end of this listing).

        The steps are performed in the following order:
            1. Move model to device
            2. Create optimizer
            3. Initialize amp
            4. Parallelize model
        """

        amp = get_amp(fp16)

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)

        # move model
        self.model = move_model_to_device(model=self.model, device=device)

        # init optimizer
        self.optimizer = Transformer.get_default_optimizer(
            self.model, weight_decay, learning_rate, adam_epsilon)

        # initialize amp before restoring any checkpoint state
        if fp16 and amp:
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, opt_level=fp16_opt_level)

        # restore optimizer, model, and amp state from a checkpoint, if given
        if checkpoint_state_dict:
            self.optimizer.load_state_dict(checkpoint_state_dict["optimizer"])
            self.model.load_state_dict(checkpoint_state_dict["model"])

            if fp16 and amp:
                amp.load_state_dict(checkpoint_state_dict["amp"])

        # parallelize last, per step 4 in the docstring
        self.model = parallelize_model(
            model=self.model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )

        return device, num_gpus, amp
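
    # Note: fit() below does not call prepare_model_and_optimizer; it builds
    # two optimizers (one for the BERT encoder, one for the decoder) and
    # performs the same move / optimize / amp / parallelize sequence inline.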
    def fit(
        self,
        train_dataset,
        num_gpus=None,
        gpu_ids=None,
        batch_size=4,
        local_rank=-1,
        max_steps=5e4,
        warmup_steps_bert=20000,
        warmup_steps_dec=10000,
        learning_rate_bert=0.002,
        learning_rate_dec=0.2,
        optimization_method="adam",
        max_grad_norm=0,
        beta1=0.9,
        beta2=0.999,
        decay_method="noam",
        gradient_accumulation_steps=1,
        report_every=10,
        save_every=1000,
        verbose=True,
        seed=None,
        fp16=False,
        fp16_opt_level="O2",
        world_size=1,
        rank=0,
        validation_function=None,
        checkpoint=None,
        **kwargs,
    ):
        """
        Fine-tune pre-trained transformer models for abstractive summarization.

        Args:
            train_dataset (SummarizationDataset): Training dataset.
            num_gpus (int, optional): The number of GPUs to use. If None, all
                available GPUs will be used. If set to 0 or GPUs are not available,
                CPU device will be used. Defaults to None.
            gpu_ids (list, optional): List of GPU IDs to be used.
                If set to None, the first num_gpus GPUs will be used.
                Defaults to None.
            batch_size (int, optional): Number of examples per batch.
                Defaults to 4.
            local_rank (int, optional): Local rank for distributed training on
                GPUs, i.e. the ranking of the current GPU device on the
                current node. Defaults to -1, which means non-distributed
                training.
            max_steps (int, optional): Maximum number of training steps.
                Defaults to 5e4.
            warmup_steps_bert (int, optional): Number of steps taken to increase
                the learning rate from 0 to `learning_rate_bert` for tuning the
                BERT encoder. Defaults to 2e4.
            warmup_steps_dec (int, optional): Number of steps taken to increase
                the learning rate from 0 to `learning_rate_dec` for tuning the
                decoder. Defaults to 1e4.
            learning_rate_bert (float, optional):  Learning rate of the optimizer
                for the encoder. Defaults to 0.002.
            learning_rate_dec (float, optional):  Learning rate of the optimizer
                for the decoder. Defaults to 0.2.
            optimization_method (string, optional): Optimization method used for
                fine-tuning. Defaults to "adam".
            max_grad_norm (float, optional): Maximum gradient norm for gradient clipping.
                Defaults to 0.
            beta1 (float, optional): The exponential decay rate for the first moment
                estimates. Defaults to 0.9.
            beta2 (float, optional): The exponential decay rate for the
                second-moment estimates. This value should be set close to 1.0
                on problems with a sparse gradient. Defaults to 0.999.
            decay_method (string, optional): Learning rate decay method.
                Defaults to "noam".
            gradient_accumulation_steps (int, optional): Number of batches to accumulate
                gradients on between each model parameter update. Defaults to 1.
            report_every (int, optional): Interval, in steps, at which to print
                the training log. Defaults to 10.
            save_every (int, optional): Interval, in steps, at which to save the
                fine-tuned model. Defaults to 1000.
            verbose (bool, optional): Whether to print out the training log.
                Defaults to True.
            seed (int, optional): Random seed used to improve reproducibility.
                Defaults to None.
            fp16 (bool, optional): Whether to use mixed precision training.
                Defaults to False.
            fp16_opt_level (str, optional): Optimization level; refer to
                https://nvidia.github.io/apex/amp.html#opt-levels for details.
                Value choices are: "O0", "O1", "O2", "O3". Defaults to "O2".
            world_size (int, optional): Total number of GPUs that will be used.
                Defaults to 1.
            rank (int, optional): Global rank of the current GPU in distributed
                training. It's calculated with the rank of the current node in
                the cluster/world and the `local_rank` of the device in the
                current node. See an example in
                :file:`examples/text_summarization/abstractive_summarization_bertsum_cnndm_distributed_train.py`.
                Defaults to 0.
            validation_function (function, optional): Function called during
                fitting to validate model performance. Defaults to None.
            checkpoint (str, optional): File path of a checkpoint from which
                training resumes. Defaults to None.
        """

        # get device
        device, num_gpus = get_device(num_gpus=num_gpus,
                                      gpu_ids=gpu_ids,
                                      local_rank=local_rank)
        print("device is {}".format(device))

        # load checkpoint weights, if given, and move the model to the device
        if checkpoint:
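            # load onto CPU first; the model moves to the target device below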
            checkpoint = torch.load(checkpoint, map_location="cpu")
            self.model.load_checkpoint(checkpoint["model"])
        self.model = move_model_to_device(model=self.model, device=device)

        # init optimizer
        self.optim_bert = model_builder.build_optim_bert(
            self.model,
            optim=optimization_method,
            lr_bert=learning_rate_bert,
            warmup_steps_bert=warmup_steps_bert,
            max_grad_norm=max_grad_norm,
            beta1=beta1,
            beta2=beta2,
        )
        self.optim_dec = model_builder.build_optim_dec(
            self.model,
            optim=optimization_method,
            lr_dec=learning_rate_dec,
            warmup_steps_dec=warmup_steps_dec,
            max_grad_norm=max_grad_norm,
            beta1=beta1,
            beta2=beta2,
        )

        # BertSum fine-tunes the encoder and decoder with separate optimizers
        optimizers = [self.optim_bert, self.optim_dec]

        self.amp = get_amp(fp16)
        if self.amp:
            # assign the amp-wrapped optimizers back to `optimizers` so the
            # wrapped versions are used for checkpoint loading and fine-tuning
            self.model, optimizers = self.amp.initialize(self.model,
                                                         optimizers,
                                                         opt_level=fp16_opt_level)

        global_step = 0
        if checkpoint:
            # resume optimizer, amp, and step state from the checkpoint
            if checkpoint["optimizers"]:
                for i in range(len(optimizers)):
                    model_builder.load_optimizer_checkpoint(
                        optimizers[i], checkpoint["optimizers"][i])
            if self.amp and "amp" in checkpoint and checkpoint["amp"]:
                self.amp.load_state_dict(checkpoint["amp"])
            if "global_step" in checkpoint and checkpoint["global_step"]:
                # integer division keeps global_step an int
                global_step = checkpoint["global_step"] // world_size
                print("global_step is {}".format(global_step))

        # wrap with DataParallel/DistributedDataParallel after amp
        # initialization and checkpoint loading
        self.model = parallelize_model(model=self.model,
                                       device=device,
                                       num_gpus=num_gpus,
                                       gpu_ids=gpu_ids,
                                       local_rank=local_rank,
                                       apex=self.amp)

        # single-process training shuffles with RandomSampler; distributed
        # training shards the dataset across workers with DistributedSampler
        if local_rank == -1:
            sampler = RandomSampler(train_dataset)
        else:
            sampler = DistributedSampler(train_dataset,
                                         num_replicas=world_size,
                                         rank=rank)

        def collate_fn(data):
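            # convert a list of examples into batched model tensors on
            # `device`; block_size caps the sequence length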
            return self.processor.collate(data,
                                          block_size=self.max_pos_length,
                                          device=device)

        train_dataloader = DataLoader(train_dataset,
                                      sampler=sampler,
                                      batch_size=batch_size,
                                      collate_fn=collate_fn)

        # compute the max number of training steps
        max_steps = compute_training_steps(
            train_dataloader,
            max_steps=max_steps,
            gradient_accumulation_steps=gradient_accumulation_steps,
        )

        # delegate the training loop to the parent class's fine_tune
        super().fine_tune(
            train_dataloader=train_dataloader,
            get_inputs=BertSumAbsProcessor.get_inputs,
            device=device,
            num_gpus=num_gpus,
            max_steps=max_steps,
            global_step=global_step,
            max_grad_norm=max_grad_norm,
            gradient_accumulation_steps=gradient_accumulation_steps,
            verbose=verbose,
            seed=seed,
            report_every=report_every,
            save_every=save_every,
            clip_grad_norm=False,
            optimizer=optimizers,
            scheduler=None,
            fp16=fp16,
            amp=self.amp,
            validation_function=validation_function,
        )

        # release GPU memories
        self.model.cpu()
        torch.cuda.empty_cache()

        self.save_model(max_steps)
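

# ---------------------------------------------------------------------------
# Hedged sketch, not part of the original source. The docstring of
# prepare_model_and_optimizer says that child classes with custom optimizers
# must follow four steps: (1) move model to device, (2) create optimizer,
# (3) initialize amp, (4) parallelize model. The override below illustrates
# that order with torch.optim.AdamW as a stand-in custom optimizer. The class
# name MyCustomSummarizer is hypothetical, and the helpers (get_amp,
# get_device, move_model_to_device, parallelize_model, Transformer) are
# assumed to be in scope; their import paths are not shown in this listing.
import torch


class MyCustomSummarizer(Transformer):  # hypothetical child class
    def prepare_model_and_optimizer(
        self,
        num_gpus,
        gpu_ids,
        local_rank,
        weight_decay,
        learning_rate,
        adam_epsilon,
        fp16=False,
        fp16_opt_level="O1",
        checkpoint_state_dict=None,
    ):
        amp = get_amp(fp16)

        # 1. move model to device
        device, num_gpus = get_device(
            num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=local_rank)
        self.model = move_model_to_device(model=self.model, device=device)

        # 2. create a custom optimizer (AdamW is illustrative only)
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=learning_rate,
            eps=adam_epsilon,
            weight_decay=weight_decay,
        )

        # 3. initialize amp before restoring any checkpoint state
        if fp16 and amp:
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, opt_level=fp16_opt_level)

        # 4. parallelize model last
        self.model = parallelize_model(
            model=self.model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=local_rank,
        )
        return device, num_gpus, amp


# Hedged usage sketch for fit: `summarizer` stands for an instance of the
# summarizer class (its constructor is not part of this listing) and
# `train_dataset` for a SummarizationDataset, so the call is shown as a
# commented example rather than runnable code:
#
#     summarizer.fit(
#         train_dataset,
#         num_gpus=1,
#         batch_size=4,
#         max_steps=5e4,
#         warmup_steps_bert=20000,
#         warmup_steps_dec=10000,
#         fp16=False,
#     )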