import logging

import torch
from torch.utils.tensorboard import SummaryWriter
from transformers import (AdamW, get_cosine_schedule_with_warmup,
                          get_cosine_with_hard_restarts_schedule_with_warmup,
                          get_linear_schedule_with_warmup)
# RecAdam is the optimizer from "Recall and Learn" (Chen et al., 2020); the
# module path below is an assumption about where it is vendored in this repo.
from recadam import RecAdam


def fixed_learning_rate_optimizers(self):
    """Prepare optimizer and the selected warmup/decay schedule."""
    # Biases and LayerNorm weights are conventionally exempt from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.named_parameters()
                       if p.requires_grad and not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in self.named_parameters()
                       if p.requires_grad and any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.hparams.learning_rate,
                      eps=self.hparams.adam_epsilon)
    if self.hparams.lr_scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    elif self.hparams.lr_scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    elif self.hparams.lr_scheduler == 'cosine_restart':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    else:
        # `raise` needs an exception instance; raising a bare string is a TypeError.
        raise ValueError('{} is not supported'.format(self.hparams.lr_scheduler))
    scheduler = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}
    return [optimizer], [scheduler]
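# All three schedule branches consume self.total_steps. A minimal sketch of how
# it could be derived up front; `num_batches`, `accumulate_grad_batches`, and
# `max_epochs` are illustrative names, not fields of this repository's hparams.
def estimate_total_steps(num_batches: int, accumulate_grad_batches: int,
                         max_epochs: int) -> int:
    """Optimizer steps per run = ceil(batches / accumulation) * epochs."""
    steps_per_epoch = -(-num_batches // accumulate_grad_batches)  # ceiling division
    return steps_per_epoch * max_epochs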
# Distributed training (should be set up after apex fp16 initialization).
if args.local_rank != -1:
    encoder = torch.nn.parallel.DistributedDataParallel(
        encoder, device_ids=[args.local_rank],
        output_device=args.local_rank, find_unused_parameters=True)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank],
        output_device=args.local_rank, find_unused_parameters=True)

scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

##########################################################################
# launch training
##########################################################################
global_step = 0
loss_name = ["loss_total", "loss_span", "loss_type",
             "loss_sup", "loss_ent", "loss_para"]
tr_loss, logging_loss = [0] * len(loss_name), [0] * len(loss_name)

# Only the main process (rank -1 or 0) writes TensorBoard summaries.
if args.local_rank in [-1, 0]:
    tb_writer = SummaryWriter(args.exp_name)

encoder.zero_grad()
model.zero_grad()
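# A sketch of the periodic TensorBoard logging the tr_loss / logging_loss pair
# supports, following the usual Hugging Face example pattern; this function and
# `logging_steps` are illustrative glue, not this repository's training loop.
def log_interval_losses(tb_writer, loss_name, tr_loss, logging_loss,
                        global_step, logging_steps):
    """Write the mean of each tracked loss over the last interval, then reset."""
    for i, name in enumerate(loss_name):
        tb_writer.add_scalar(name, (tr_loss[i] - logging_loss[i]) / logging_steps,
                             global_step)
        logging_loss[i] = tr_loss[i]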
def rec_adam_learning_optimizer(self):
    """Prepare a RecAdam optimizer that learns the target task while being
    annealed toward the pretrained encoder weights."""
    no_decay = ["bias", "LayerNorm.weight"]
    new_model = self.model
    args = self.hparams
    pretrained_model = self.encoder
    # Four groups: (weight decay vs. none) x (backbone vs. non-backbone).
    # Only parameters whose names contain args.model_type carry a nonzero
    # anneal_w, i.e. are pulled back toward their pretrained values.
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in new_model.named_parameters()
                       if not any(nd in n for nd in no_decay) and args.model_type in n],
            "weight_decay": args.weight_decay,
            "anneal_w": args.recadam_anneal_w,
            "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters()
                                if not any(nd in p_n for nd in no_decay)
                                and args.model_type in p_n],
        },
        {
            "params": [p for n, p in new_model.named_parameters()
                       if not any(nd in n for nd in no_decay) and args.model_type not in n],
            "weight_decay": args.weight_decay,
            "anneal_w": 0.0,
            "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters()
                                if not any(nd in p_n for nd in no_decay)
                                and args.model_type not in p_n],
        },
        {
            "params": [p for n, p in new_model.named_parameters()
                       if any(nd in n for nd in no_decay) and args.model_type in n],
            "weight_decay": 0.0,
            "anneal_w": args.recadam_anneal_w,
            "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters()
                                if any(nd in p_n for nd in no_decay)
                                and args.model_type in p_n],
        },
        {
            "params": [p for n, p in new_model.named_parameters()
                       if any(nd in n for nd in no_decay) and args.model_type not in n],
            "weight_decay": 0.0,
            "anneal_w": 0.0,
            "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters()
                                if any(nd in p_n for nd in no_decay)
                                and args.model_type not in p_n],
        },
    ]
    optimizer = RecAdam(optimizer_grouped_parameters,
                        lr=args.learning_rate,
                        eps=args.adam_epsilon,
                        anneal_fun=args.recadam_anneal_fun,
                        anneal_k=args.recadam_anneal_k,
                        anneal_t0=args.recadam_anneal_t0,
                        pretrain_cof=args.recadam_pretrain_cof)
    if self.hparams.lr_scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    elif self.hparams.lr_scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    elif self.hparams.lr_scheduler == 'cosine_restart':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    else:
        raise ValueError('{} is not supported'.format(self.hparams.lr_scheduler))
    scheduler = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}
    return [optimizer], [scheduler]

# Lightning hook dispatching between the fixed and layer-wise optimizers:
# def configure_optimizers(self):
#     "Prepare optimizer and schedule (linear warmup and decay)"
#     if self.hparams.learning_rate_schema == 'fixed':
#         return self.fixed_learning_rate_optimizers()
#     else:
#         return self.layer_wise_learning_rate_optimizer()
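# For intuition, a sketch of the sigmoid annealing coefficient RecAdam uses to
# shift from recalling the pretrained weights to learning the target task
# (after Chen et al., "Recall and Learn", EMNLP 2020); this restates the paper,
# it is not lifted from this repository's RecAdam implementation.
import math

def recadam_sigmoid_lambda(step: int, k: float, t0: float) -> float:
    """lambda(t) = 1 / (1 + exp(-k * (t - t0))); near 0 early (recall the
    pretrained weights), near 1 late (learn the task)."""
    return 1.0 / (1.0 + math.exp(-k * (step - t0)))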
def layer_wise_learning_rate_optimizer(self):
    """Prepare optimizer with layer-wise learning rates and a warmup schedule."""
    encoder_layer_number_dict = {'roberta-large': 24, 'albert-xxlarge-v2': 1}
    assert self.hparams.encoder_name_or_path in encoder_layer_number_dict

    def achieve_module_groups(encoder, number_of_layer, number_of_groups):
        # Split the transformer layers into contiguous groups; the embeddings
        # join the first group, and a remainder yields one extra group.
        layer_num_each_group = number_of_layer // number_of_groups
        number_of_divided_groups = (number_of_groups + 1
                                    if number_of_layer % number_of_groups > 0
                                    else number_of_groups)
        groups = [[encoder.embeddings, *encoder.encoder.layer[:layer_num_each_group]]]
        for group_id in range(1, number_of_divided_groups):
            groups.append([*encoder.encoder.layer[
                (group_id * layer_num_each_group):((group_id + 1) * layer_num_each_group)]])
        return groups, number_of_divided_groups

    if self.hparams.encoder_name_or_path == 'roberta-large':
        encoder_layer_number = encoder_layer_number_dict[self.hparams.encoder_name_or_path]
        encoder_group_number = encoder_layer_number
        module_groups, encoder_group_number = achieve_module_groups(
            encoder=self.encoder,
            number_of_layer=encoder_layer_number,
            number_of_groups=encoder_group_number)
        module_groups.append([self.model])
        assert len(module_groups) == encoder_group_number + 1
    elif self.hparams.encoder_name_or_path == 'albert-xxlarge-v2':
        # ALBERT shares a single layer stack, so the whole encoder is one group.
        module_groups = [[self.encoder], [self.model]]
        assert len(module_groups) == 2
    else:
        raise ValueError('Not supported {}'.format(self.hparams.encoder_name_or_path))

    def achieve_parameter_groups(module_group, weight_decay, lr):
        named_parameters = []
        no_decay = ["bias", "LayerNorm.weight"]
        for module in module_group:
            named_parameters += module.named_parameters()
        grouped_parameters = [
            {
                "params": [p for n, p in named_parameters
                           if p.requires_grad and not any(nd in n for nd in no_decay)],
                "weight_decay": weight_decay, "lr": lr,
            },
            {
                "params": [p for n, p in named_parameters
                           if p.requires_grad and any(nd in n for nd in no_decay)],
                "weight_decay": 0.0, "lr": lr,
            },
        ]
        return grouped_parameters

    optimizer_grouped_parameters = []
    for idx, module_group in enumerate(module_groups):
        lr = self.hparams.learning_rate * (10.0 ** idx)
        logging.info('group {} lr = {}'.format(idx, lr))
        optimizer_grouped_parameters += achieve_parameter_groups(
            module_group=module_group,
            weight_decay=self.hparams.weight_decay,
            lr=lr)

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.hparams.learning_rate,
                      eps=self.hparams.adam_epsilon)
    if self.hparams.lr_scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    elif self.hparams.lr_scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    elif self.hparams.lr_scheduler == 'cosine_restart':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps)
    else:
        raise ValueError('{} is not supported'.format(self.hparams.lr_scheduler))
    scheduler = {'scheduler': scheduler, 'interval': 'step', 'frequency': 1}
    return [optimizer], [scheduler]
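# The loop above scales each group's learning rate geometrically: group idx
# trains at base_lr * 10**idx, so later (higher) groups move much faster, and
# the factor grows quickly with many groups. A standalone illustration:
base_lr = 1e-5
for idx in range(3):
    print('group {} lr = {:g}'.format(idx, base_lr * (10.0 ** idx)))
# group 0 lr = 1e-05 / group 1 lr = 0.0001 / group 2 lr = 0.001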
def rec_adam_learning_optimizer(self, total_steps):
    """Non-Lightning variant: build RecAdam plus a scheduler and return both."""
    no_decay = ["bias", "LayerNorm.weight"]
    new_model = self.model
    args = self.config
    pretrained_model = self.encoder
    # Same four-way grouping as above: (weight decay vs. none) x (backbone,
    # annealed toward pretrained weights, vs. non-backbone).
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in new_model.named_parameters()
                       if not any(nd in n for nd in no_decay) and args.model_type in n],
            "weight_decay": args.weight_decay,
            "anneal_w": args.recadam_anneal_w,
            "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters()
                                if not any(nd in p_n for nd in no_decay)
                                and args.model_type in p_n],
        },
        {
            "params": [p for n, p in new_model.named_parameters()
                       if not any(nd in n for nd in no_decay) and args.model_type not in n],
            "weight_decay": args.weight_decay,
            "anneal_w": 0.0,
            "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters()
                                if not any(nd in p_n for nd in no_decay)
                                and args.model_type not in p_n],
        },
        {
            "params": [p for n, p in new_model.named_parameters()
                       if any(nd in n for nd in no_decay) and args.model_type in n],
            "weight_decay": 0.0,
            "anneal_w": args.recadam_anneal_w,
            "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters()
                                if any(nd in p_n for nd in no_decay)
                                and args.model_type in p_n],
        },
        {
            "params": [p for n, p in new_model.named_parameters()
                       if any(nd in n for nd in no_decay) and args.model_type not in n],
            "weight_decay": 0.0,
            "anneal_w": 0.0,
            "pretrain_params": [p_p for p_n, p_p in pretrained_model.named_parameters()
                                if any(nd in p_n for nd in no_decay)
                                and args.model_type not in p_n],
        },
    ]
    optimizer = RecAdam(optimizer_grouped_parameters,
                        lr=args.learning_rate,
                        eps=args.adam_epsilon,
                        anneal_fun=args.recadam_anneal_fun,
                        anneal_k=args.recadam_anneal_k,
                        anneal_t0=args.recadam_anneal_t0,
                        pretrain_cof=args.recadam_pretrain_cof)
    if self.config.lr_scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.config.warmup_steps,
            num_training_steps=total_steps)
    elif self.config.lr_scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=self.config.warmup_steps,
            num_training_steps=total_steps)
    elif self.config.lr_scheduler == 'cosine_restart':
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=self.config.warmup_steps,
            num_training_steps=total_steps)
    else:
        raise ValueError('{} is not supported'.format(self.config.lr_scheduler))
    return optimizer, scheduler
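# A minimal sketch of driving this non-Lightning variant from a plain loop;
# `trainer_model` and `train_loader` are illustrative stand-ins for whatever
# owns rec_adam_learning_optimizer and the data in this codebase.
def train_with_recadam(trainer_model, train_loader, total_steps):
    """Step RecAdam and its scheduler once per batch (illustrative glue)."""
    optimizer, scheduler = trainer_model.rec_adam_learning_optimizer(total_steps)
    for batch in train_loader:
        loss = trainer_model(batch)
        loss.backward()
        optimizer.step()
        scheduler.step()  # per-step schedule, matching 'interval': 'step' above
        optimizer.zero_grad()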