def get_bert_optimizer(models, type_optimization, learning_rate):
    """ Optimizes the network with AdamWithDecay """
    if type_optimization not in patterns_optimizer:
        print(
            'Error. Type optimizer must be one of %s'
            % (str(patterns_optimizer.keys()))
        )

    parameters_with_decay = []
    parameters_with_decay_names = []
    parameters_without_decay = []
    parameters_without_decay_names = []
    # bias and LayerNorm-style (gamma/beta) parameters are excluded from weight decay
    no_decay = ['bias', 'gamma', 'beta']
    patterns = patterns_optimizer[type_optimization]

    for model in models:
        for n, p in model.named_parameters():
            # only parameters whose names match the selected patterns are optimized
            if any(t in n for t in patterns):
                if any(t in n for t in no_decay):
                    parameters_without_decay.append(p)
                    parameters_without_decay_names.append(n)
                else:
                    parameters_with_decay.append(p)
                    parameters_with_decay_names.append(n)

    print('The following parameters will be optimized WITH decay:')
    print(_ellipse(parameters_with_decay_names, 5, ' , '))
    print('The following parameters will be optimized WITHOUT decay:')
    print(_ellipse(parameters_without_decay_names, 5, ' , '))

    optimizer_grouped_parameters = [
        {'params': parameters_with_decay, 'weight_decay': 0.01},
        {'params': parameters_without_decay, 'weight_decay': 0.0},
    ]
    optimizer = AdamWithDecay(optimizer_grouped_parameters, lr=learning_rate)
    return optimizer
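# `patterns_optimizer` is referenced above but not defined in this snippet. From its
# usage (a mapping from an optimization mode to substrings matched against parameter
# names), a minimal hypothetical sketch could look like the following; the keys and
# pattern strings here are illustrative assumptions, not values from the source.
patterns_optimizer = {
    'additional_layers': ['additional'],
    'top_layer': ['additional', 'bert_model.encoder.layer.11.'],
    'all_encoder_layers': ['additional', 'bert_model.encoder.layer'],
    'all': ['additional', 'bert_model.encoder.layer', 'bert_model.embeddings'],
}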
def get_bert_optimizer(models, bert_learning_rate, base_learning_rate, weight_decay):
    """ Optimizes the network with AdamWithDecay """
    parameters_with_decay = []
    parameters_with_decay_names = []
    parameters_without_decay = []
    parameters_without_decay_names = []
    base_parameters = []
    base_parameters_names = []
    no_decay = ['bias', 'gamma', 'beta']

    for model in models:
        for n, p in model.named_parameters():
            if p.requires_grad:
                # fine-tune BERT
                if any(t in n for t in ["bert_model", "bert"]):
                    if any(t in n for t in no_decay):
                        parameters_without_decay.append(p)
                        parameters_without_decay_names.append(n)
                    else:
                        parameters_with_decay.append(p)
                        parameters_with_decay_names.append(n)
                else:
                    base_parameters.append(p)
                    base_parameters_names.append(n)

    print('The following parameters will be optimized WITH decay:')
    print(_ellipse(parameters_with_decay_names, 5, ' , '))
    print('The following parameters will be optimized WITHOUT decay:')
    print(_ellipse(parameters_without_decay_names, 5, ' , '))
    print('The following parameters will be optimized NORMALLY:')
    print(_ellipse(base_parameters_names, 5, ' , '))

    optimizer_grouped_parameters = [
        {
            'params': parameters_with_decay,
            'weight_decay': weight_decay,
            'lr': bert_learning_rate,
        },
        {
            'params': parameters_without_decay,
            'weight_decay': 0.0,
            'lr': bert_learning_rate,
        },
        {
            'params': base_parameters,
            'weight_decay': weight_decay,
            'lr': base_learning_rate,
        },
    ]
    optimizer = AdamWithDecay(optimizer_grouped_parameters, lr=base_learning_rate)
    return optimizer
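# A minimal usage sketch for the function above, using an invented toy module; the
# module, its attribute names, and the hyperparameter values are assumptions. The only
# property the function relies on is that pretrained parameters carry "bert" in their
# names (here via the `bert_model` attribute) while task-specific heads do not, so the
# heads fall into the base (NORMALLY optimized) group. It also assumes the surrounding
# module provides `_ellipse` and `AdamWithDecay`, which are not defined in this snippet.
import torch.nn as nn

class ToyBiEncoder(nn.Module):  # hypothetical stand-in for a real BERT-based model
    def __init__(self):
        super().__init__()
        self.bert_model = nn.Linear(768, 768)  # grouped under bert_learning_rate
        self.score_head = nn.Linear(768, 1)    # grouped under base_learning_rate

optimizer = get_bert_optimizer(
    [ToyBiEncoder()],
    bert_learning_rate=5e-5,
    base_learning_rate=1e-3,
    weight_decay=0.01,
)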
def __init__(self, model, opt):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    parameters_with_decay = []
    parameters_with_decay_names = []
    parameters_without_decay = []
    parameters_without_decay_names = []
    base_parameters = []
    base_parameters_names = []

    for n, p in model.named_parameters():
        if p.requires_grad:
            # fine-tune the pretrained transformer (GPT)
            if any(t in n for t in ["transformer"]):
                if any(t in n for t in no_decay):
                    parameters_without_decay.append(p)
                    parameters_without_decay_names.append(n)
                else:
                    parameters_with_decay.append(p)
                    parameters_with_decay_names.append(n)
            else:
                base_parameters.append(p)
                base_parameters_names.append(n)

    weight_decay = opt['weight_decay']
    bert_learning_rate = opt['gpt_lr']
    base_learning_rate = opt['lr']
    optimizer_grouped_parameters = [
        {
            'params': parameters_with_decay,
            'weight_decay': weight_decay,
            'lr': bert_learning_rate,
        },
        {
            'params': parameters_without_decay,
            'weight_decay': 0.0,
            'lr': bert_learning_rate,
        },
        {
            'params': base_parameters,
            'weight_decay': weight_decay,
            'lr': base_learning_rate,
        },
    ]

    print('The following parameters will be optimized WITH decay:')
    print(_ellipse(parameters_with_decay_names, 5, ' , '))
    print('The following parameters will be optimized WITHOUT decay:')
    print(_ellipse(parameters_without_decay_names, 5, ' , '))
    print('The following parameters will be optimized NORMALLY:')
    print(_ellipse(base_parameters_names, 5, ' , '))

    optimizer = OpenAIAdam(
        optimizer_grouped_parameters,
        lr=opt['gpt_lr'],
        warmup=opt['warmup_proportion'],
        max_grad_norm=opt['gradient_clip'],
        t_total=opt.get('optimizer_step', -1),
    )
    self.optimizer = optimizer
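# The constructor above reads all of its hyperparameters from an `opt` dict. A
# hypothetical minimal configuration covering every key it accesses might look like
# the following; the values are illustrative only, and 'optimizer_step' is optional
# since it is read with .get() and defaults to -1.
opt = {
    'weight_decay': 0.01,        # decay applied to non-bias / non-LayerNorm parameters
    'gpt_lr': 6.25e-5,           # learning rate for the pretrained transformer groups
    'lr': 1e-3,                  # learning rate for the remaining (base) parameters
    'warmup_proportion': 0.002,  # warmup fraction passed to OpenAIAdam
    'gradient_clip': 1.0,        # max gradient norm handled inside OpenAIAdam
    'optimizer_step': 10000,     # total optimizer steps; -1 if not provided
}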