def __init__(self, qa_model_path, ca2q_model_path, c2q_model_path, c2a_model_path):
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    self.model = DualNet(qa_model_path, ca2q_model_path, c2q_model_path, c2a_model_path)
    train_dir = os.path.join("./save", "dual")
    self.save_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
    # read the data set and prepare iterators
    self.train_loader = self.get_data_loader("./squad/train-v1.1.json")
    self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json")
    num_train_optimization_steps = len(self.train_loader) * config.num_epochs
    # optimizer
    param_optimizer = list(self.model.qa_model.named_parameters())
    # hack to remove the pooler, which is not used
    # and thus produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]]
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    self.qa_opt = BertAdam(optimizer_grouped_parameters,
                           lr=config.qa_lr,
                           warmup=config.warmup_proportion,
                           t_total=num_train_optimization_steps)
    params = list(self.model.ca2q_model.encoder.parameters()) \
        + list(self.model.ca2q_model.decoder.parameters())
    # self.qg_lr = config.lr
    self.qg_opt = optim.Adam(params, config.qa_lr)
    # assign the model to the device and wrap it with DataParallel
    torch.cuda.set_device(0)
    self.model.cuda()
    self.model = nn.DataParallel(self.model)
def getOptim(model):
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0
        }]
    else:
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{
            "params": [p for n, p in param_optimizer]
        }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=3e-5)
    return optimizer
def __init__(self):
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    self.model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
    train_dir = os.path.join("./save", "qa")
    self.save_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
    if not os.path.exists(self.save_dir):
        os.makedirs(self.save_dir)
    # read the data set and prepare iterators
    self.train_loader = self.get_data_loader("./squad/train-v1.1.json")
    self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json")
    num_train_optimization_steps = len(self.train_loader) * config.num_epochs
    # optimizer
    param_optimizer = list(self.model.named_parameters())
    # hack to remove the pooler, which is not used
    # and thus produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]]
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    self.qa_opt = BertAdam(optimizer_grouped_parameters,
                           lr=config.qa_lr,
                           warmup=config.warmup_proportion,
                           t_total=num_train_optimization_steps)
    # self.qg_lr = config.lr
    # assign the model to the device
    self.model = self.model.to(config.device)
def set_bertadam_optimizer(model, lr, t_total, warmup=0.1, schedule='warmup_linear', weight_decay=0.01):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if p.requires_grad and not any(nd in n for nd in no_decay)
        ],
        'weight_decay': weight_decay
    }, {
        'params': [
            p for n, p in param_optimizer
            if p.requires_grad and any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(grouped_parameters,
                         lr=lr,
                         schedule=schedule,
                         warmup=warmup,
                         t_total=t_total,
                         max_grad_norm=1.0,
                         weight_decay=weight_decay)
    return optimizer
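# Usage sketch (an assumption, not part of the original source): wiring
# set_bertadam_optimizer into a small fine-tuning setup. The pretrained model
# name, learning rate, and step counts below are illustrative placeholders.
from pytorch_pretrained_bert import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
steps_per_epoch, num_epochs = 1000, 3  # assumed values
optimizer = set_bertadam_optimizer(model, lr=2e-5, t_total=steps_per_epoch * num_epochs)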
def __init__(self, opt, batch_num):
    self.opt = opt
    self.model = Extraction(opt)
    self.model.cuda()
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    num_train_optimization_steps = batch_num * (opt['num_epoch'] + 1)
    self.optimizer = BertAdam(optimizer_grouped_parameters,
                              lr=opt['lr'],
                              warmup=0.1,
                              t_total=num_train_optimization_steps)
    self.bce = nn.BCELoss(reduction='none')
    self.ema = layers.EMA(self.model, opt['ema'])
    self.ema.register()
def _init_nn(self, train_dataset_len):
    """Initialize the nn model for training."""
    self.model = MCBertForPretrainingModel(vis_feat_dim=self.vis_feat_dim,
                                           spatial_size=self.spatial_size,
                                           hidden_dim=self.hidden_dim,
                                           cmb_feat_dim=self.cmb_feat_dim,
                                           kernel_size=self.kernel_size)
    # Prepare optimizer
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    self.optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=self.learning_rate,
        warmup=self.warmup_proportion,
        t_total=int(train_dataset_len / 40 / self.batch_size * self.num_epochs))
    if self.USE_CUDA:
        self.model = self.model.cuda()
def run_LR_training(self, config, dev_labels, dev_results, labels, lossfunction, results, total_labels):
    model = SecondaryCls(config).cuda()
    glorot_param_init(model)
    optimizer = BertAdam(filter(lambda p: p.requires_grad, model.parameters()),
                         lr=config["hyperparameters"]["learning_rate"],
                         weight_decay=0.02)
    best_distribution = None
    best_F1 = 0
    for i in range(1000):
        pred_logits = model(results)
        loss = lossfunction(pred_logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        dev_pred_logits = model(dev_results)
        dev_loss = lossfunction(dev_pred_logits, dev_labels)
        maxpreds, argmaxpreds = torch.max(F.softmax(dev_pred_logits, -1), dim=1)
        total_preds = list(argmaxpreds.cpu().numpy())
        correct_vec = argmaxpreds == dev_labels
        total_correct = torch.sum(correct_vec).item()
        loss, acc = dev_loss, total_correct / dev_results.shape[0]
        F1 = metrics.f1_score(total_labels, total_preds, average="macro")
        if F1 > best_F1:
            best_F1 = F1
            best_distribution = F.softmax(model.a, dim=-1)
        # logging.info(
        #     f"Validation loss|acc|F1|BEST: {loss:.6f}|{acc:.6f}|{F1:.6f} || {best_F1} || ")
    return best_F1, best_distribution
def init_optimizer(self):
    optimize_steps = iceil(len(self.sampler) / self.batch_size)
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_params = [
        {
            'params': [
                i for n, i in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        },
        {
            'params': [
                i for n, i in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        },
    ]
    self.optimizer = BertAdam(optimizer_params,
                              lr=self.learning_rate,
                              warmup=self.warmup_prop,
                              t_total=optimize_steps)
    return self
def get_optimizer(model, args):
    if args.model in ["bert", "concatbert", "mmbt"]:
        total_steps = (args.train_data_len / args.batch_sz /
                       args.gradient_accumulation_steps * args.max_epochs)
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.01
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = BertAdam(
            optimizer_grouped_parameters,
            lr=args.lr,
            warmup=args.warmup,
            t_total=total_steps,
        )
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    return optimizer
def train(config, model, train_iter, dev_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    # dev_best_loss = float('inf')
    dev_best_acc = -float('inf')
    last_improve = 0  # batch index of the last improvement on the dev set
    flag = False  # whether training has gone too long without improvement
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, batch in enumerate(train_iter):
            trains = (batch[0], batch[1], batch[2])
            labels = torch.squeeze(batch[3], dim=1)
            outputs = model(trains)
            model.zero_grad()
            class_weight = torch.FloatTensor([1, 1, 1, 1, 1, 1, 1, 0.4, 2, 1]).cuda()
            loss = F.cross_entropy(outputs, labels, weight=class_weight)
            # loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                # every 100 batches, report performance on the train and dev sets
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss, ouputs_all = evaluate(config, model, dev_iter)
                if dev_acc > dev_best_acc:
                    # dev_best_loss = dev_loss
                    dev_best_acc = dev_acc
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # stop if the dev metric has not improved for too many batches
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
def train(opt):
    # basics definition
    opt.experiment = os.path.join(root_dir, opt.experiment)
    if not os.path.exists(opt.experiment):
        os.makedirs(opt.experiment)
    opt.save_model = os.path.join(opt.experiment, opt.save_model)
    opt.log_path = os.path.join(opt.experiment, 'log.train')
    opt.logger = make_logger(opt.log_path)
    bert_tokenizer, bert_model = make_bert()
    # dataIter definition
    class2idx = build_class_vocab(opt.data_root + 'class.all')
    opt.class_size = len(class2idx)
    train_iter = BertIter4STC(opt.data_root + 'train', bert_tokenizer, class2idx,
                              opt.batch_size, opt.cuda, True)
    valid_iter = BertIter4STC(opt.data_root + 'valid', bert_tokenizer, class2idx,
                              opt.batch_size, opt.cuda, False)
    # model definition
    model = make_model(opt, bert_model)
    # criterion definition
    criterion = nn.BCELoss(reduction='sum')
    if opt.cuda:
        criterion = criterion.cuda()
    # optimizer definition
    if opt.fix_bert:
        for (name, parameter) in model.bert.named_parameters():
            parameter.requires_grad = False
    if opt.optim == 'bert':
        params = list(
            filter(lambda x: x[1].requires_grad == True, model.named_parameters()))
        print('Trainable parameter number: {}'.format(len(params)))
        print('Trainer: bert')
        no_decay = ['bias', 'gamma', 'beta']
        grouped_params = [{
            'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.01
        }, {
            'params': [p for n, p in params if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0
        }]
        optimizer = BertAdam(grouped_params,
                             opt.lr,
                             warmup=0.1,
                             t_total=len(train_iter) * opt.epochs)
    else:
        optimizer = Optim(opt.optim, opt.lr, max_grad_norm=opt.max_norm)
        optimizer.set_parameters(model.named_parameters())
        print('Trainable parameter number: {}'.format(len(optimizer.params)))
    # training procedure
    trainer = BertTrainer4STC(model, criterion, optimizer, opt.logger)
    trainer.train(opt.epochs, train_iter, valid_iter, opt.save_model)
def __fit_net(self, train_dataloader):
    self.model.cuda()
    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=self.lr, warmup=.1)
    train_loss_set = []
    for _ in trange(self.n_epochs, desc="Epoch"):
        self.model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(self.device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer.zero_grad()
            loss = self.model(b_input_ids,
                              token_type_ids=None,
                              attention_mask=b_input_mask,
                              labels=b_labels)
            train_loss_set.append(loss.item())
            loss.backward()
            optimizer.step()
            # Update tracking variables
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
        print("Train loss: {}".format(tr_loss / nb_tr_steps))
        # add Validation
        self.model.eval()
def test_bert_sched_init(self):
    m = torch.nn.Linear(50, 50)
    optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule=None)
    self.assertTrue(
        isinstance(optim.param_groups[0]["schedule"], ConstantLR))
    optim = BertAdam(m.parameters(), lr=0.001, warmup=.1, t_total=1000, schedule="none")
    self.assertTrue(
        isinstance(optim.param_groups[0]["schedule"], ConstantLR))
    optim = BertAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
    self.assertTrue(
        isinstance(optim.param_groups[0]["schedule"], WarmupLinearSchedule))
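# Companion sketch (an assumption, not part of the test above): besides the
# string / None forms exercised by the test, BertAdam in pytorch_pretrained_bert
# 0.6.x should also accept an explicit schedule object.
import torch
from pytorch_pretrained_bert.optimization import BertAdam, WarmupCosineSchedule

m = torch.nn.Linear(50, 50)
optim = BertAdam(m.parameters(), lr=1e-3,
                 schedule=WarmupCosineSchedule(warmup=0.1, t_total=1000))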
def train_main(args):
    train_loader, num_train = load_dataset(
        os.path.join(args.input, "train_retrievedsents_leetal.json"),
        args.bs,
        shuffle=True,
    )
    print("loaded train, # samples: %d" % num_train)
    dev_loader, num_dev = load_dataset(
        os.path.join(args.input, "dev_retrievedsents_leetal.json"), args.bs)
    print("loaded dev, # samples: %d" % num_dev)
    pretrained_model = BertForSequenceClassification.from_pretrained(args.checkpoint)
    pretrained_state_dict = pretrained_model.state_dict()
    bert_state_dict = {
        key: pretrained_state_dict[key]
        for key in pretrained_state_dict if not key.startswith("classifier")
    }
    # only load the encoder layers from ROVER/base-cased-base
    model = BertForSequenceClassification.from_pretrained(
        args.checkpoint, state_dict=bert_state_dict, num_labels=3)
    print("loaded pretrained model")
    if torch.cuda.is_available():
        model = model.cuda()
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    num_training_steps = int(len(train_loader) * args.max_epoch)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.lr,
                         warmup=0.1,
                         t_total=num_training_steps)
    train(model, train_loader, dev_loader, optimizer, args.max_epoch, args.eval_steps)
def _create_net_and_optim(self, net_cfg, optim_cfg, num_train_optimization_steps):
    net = BertForSequenceClassification.from_pretrained(net_cfg.bert_pretrain, net_cfg.num_labels)
    net.to(device=self._device)
    param_optimizer = filter(lambda p: p.requires_grad, net.parameters())
    if num_train_optimization_steps is not None:
        optim = BertAdam(param_optimizer,
                         t_total=num_train_optimization_steps,
                         **optim_cfg.kwargs)
    else:
        optim = None
    return net, optim
def train(config, model, train_iter):
    """
    Model training routine.
    :param config:
    :param model:
    :param train_iter:
    :return:
    """
    # enable BatchNorm and dropout (training mode)
    model.train()
    # collect all of the model's named parameters
    param_optimizer = list(model.named_parameters())
    # parameters that should not receive weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        # the iterator yields: (x, seq_len, mask), y ==> (list(list(int)), list(int), list(list(int)), int)
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 10 == 0:
                # every 10 batches, report performance on the training batch
                true = labels.data.cpu()
                predit = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predit)
                print(f"##### {loss.item()}, acc {train_acc}")
                model.train()
            total_batch = total_batch + 1
def get_bert_optimizer(model, lr=2e-5, **kwargs):
    """
    A convenient function to get the BERT-Adam optimizer.

    :param model: the model to apply this optimizer on.
    :param lr: the learning rate; for a complicated model like BERT,
        a tiny learning rate such as 2e-5 is usually desired.
    :return: a BERT-Adam optimizer.
    """
    if lr > 2e-4:
        warnings.warn(
            f"for a complicated model like bert, a tiny learning rate like 2e-5 is usually desired; got `{lr}`"
        )
    return BertAdam(model.parameters(), lr=lr, **kwargs)
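# Usage sketch (an assumption, not part of the original source): get_bert_optimizer
# forwards extra keyword arguments to BertAdam, so a warmup schedule can be attached
# like this. The model and step count below are illustrative placeholders.
from pytorch_pretrained_bert import BertModel

bert = BertModel.from_pretrained("bert-base-uncased")
optimizer = get_bert_optimizer(bert, lr=2e-5, warmup=0.1, t_total=10000)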
def main():
    data = ingest.getTrainData(tokenize=False, lower=False)[:20]
    validData = ingest.getValidationData(tokenize=False, lower=False)[:20]
    transformedDataSet = transform(data)
    transformedValidationDataSet = transform(validData)
    optimizer = BertAdam(model.parameters(), lr=2e-6, warmup=.1)
    epochs = 100
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        print("Training started for epoch {}".format(epoch + 1))
        correct = 0
        total = 0
        tr_loss = 0
        for dataPoint in tqdm(transformedDataSet):
            story = dataPoint["story"]
            segmentMask = dataPoint["segmentMask"]
            label = torch.LongTensor([0 if dataPoint["label"] else 1])
            # Forward pass
            seq_relationship_score = model(story, token_type_ids=segmentMask)
            if (dataPoint["label"]
                    and (seq_relationship_score[0][0] > seq_relationship_score[0][1])):
                correct += 1
            elif (not dataPoint["label"]
                  and (seq_relationship_score[0][0] < seq_relationship_score[0][1])):
                correct += 1
            total += 1
            # Backward pass
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            loss = loss_fct(seq_relationship_score.view(-1, 2), label.view(-1))
            tr_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()  # clear gradients after each per-sample update
        print("Training accuracy for epoch {}: {}".format(epoch + 1, correct / total))
        print("Training loss for epoch {}: {}".format(epoch + 1, tr_loss / total))
        correct = 0
        total = 0
def configure_model():
    model = BertForSequenceClassification.from_pretrained(BERT_MODEL_PATH, cache_dir=None, num_labels=1)
    model.zero_grad()
    model = model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    lr = 3e-5
    epsilon = 1
    lr_d = {}
    weight_d = {}
    for n, p in param_optimizer:
        if any(nd in n for nd in no_decay):
            weight_d[n] = 0.0
        else:
            weight_d[n] = 0.01
    for n, p in param_optimizer[:5]:
        lr_d[n] = lr * (epsilon ** 11)
    for n, p in param_optimizer:
        if 'bert.encoder.layer.' in n:
            for i in range(0, 12):
                if 'bert.encoder.layer.' + str(i) + '.' in n:
                    lr_d[n] = lr * (epsilon ** (11 - i))
                    break
    for n, p in param_optimizer[-4:]:
        lr_d[n] = lr
    comb_dict = {}
    for n, p in param_optimizer:
        para = (weight_d[n], lr_d[n])
        if para in comb_dict:
            comb_dict[para].append(p)
        else:
            comb_dict[para] = [p]
    optimizer_grouped_parameters = []
    for i, j in comb_dict.items():
        optimizer_grouped_parameters.append({'params': j, 'weight_decay': i[0], 'lr': i[1]})
    train = train_dataset
    num_train_optimization_steps = int(EPOCHS * len(train) / batch_size / accumulation_steps)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=lr,
                         warmup=0.05,
                         t_total=num_train_optimization_steps)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    model = model.train()
    return model, optimizer, train
def test_adam(self):
    w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
    target = torch.tensor([0.4, 0.2, -0.5])
    criterion = torch.nn.MSELoss()
    # No warmup, constant schedule, no gradient clipping
    optimizer = BertAdam(params=[w], lr=2e-1, weight_decay=0.0, max_grad_norm=-1)
    for _ in range(100):
        loss = criterion(w, target)
        loss.backward()
        optimizer.step()
        w.grad.detach_()  # No zero_grad() function on simple tensors. we do it ourselves.
        w.grad.zero_()
    self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
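# Companion sketch (an assumption, not part of the test above): the same toy fit,
# but with the warmup schedule and gradient clipping that the test deliberately disables.
import torch
from pytorch_pretrained_bert.optimization import BertAdam

w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
target = torch.tensor([0.4, 0.2, -0.5])
criterion = torch.nn.MSELoss()
optimizer = BertAdam(params=[w], lr=2e-1, warmup=0.1, t_total=100,
                     weight_decay=0.0, max_grad_norm=1.0)
for _ in range(100):
    loss = criterion(w, target)
    loss.backward()
    optimizer.step()
    w.grad.detach_()
    w.grad.zero_()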
def _get_optimizer_(self, model):
    no_decay = ['bias', 'gamma', 'beta']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    # TODO allow diff lr for G & D
    return BertAdam(optimizer_grouped_parameters,
                    lr=self.args.lr,
                    warmup=self.args.warmup)
def train(config):
    train_loader = load_dataset(config.train_path, config)
    dev_loader = load_dataset(config.dev_path, config)
    model = Model(config).to(config.device)
    optimizer = BertAdam(model.parameters(),
                         lr=config.lr,
                         warmup=0.05,
                         t_total=len(train_loader) * config.num_epoches)
    loss_func = torch.nn.CrossEntropyLoss()
    print_loss = 0
    best_acc = 0
    model.train()
    for epoch in range(config.num_epoches):
        for step, (batch_texts, batch_span) in enumerate(train_loader):
            max_len = max([len(i) for i in batch_texts])
            x = config.tokenizer.batch_encode_plus(batch_texts,
                                                   add_special_tokens=True,
                                                   return_tensors="pt",
                                                   max_length=max_len,
                                                   pad_to_max_length=True)
            x["input_ids"] = x["input_ids"].to(config.device)
            x["attention_mask"] = x["attention_mask"].to(config.device)
            x["token_type_ids"] = x["token_type_ids"].to(config.device)
            batch_span = batch_span.to(config.device)
            out = model(input_ids=x["input_ids"],
                        attention_mask=x["attention_mask"],
                        token_type_ids=x["token_type_ids"])
            optimizer.zero_grad()
            loss = loss_func(out, batch_span)
            loss.backward()
            optimizer.step()
            if step % 1 == 0:
                corrects = (torch.max(out, 1)[1].view(batch_span.size()).data == batch_span.data).sum()
                train_acc = 100.0 * corrects / config.batch_size
                # print("epoch:", epoch, "step:", step, "loss:", print_loss.item() / 50)
                sys.stdout.write(
                    '\rBatch[{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
                        step, loss.item(), train_acc, corrects, config.batch_size))
            if step % 50 == 0:
                dev_acc = eval(dev_loader, model, config)
                if dev_acc > best_acc:
                    best_acc = dev_acc
                    print('Saving best model, acc: {:.4f}%\n'.format(best_acc))
                    save(model, config.model_path, 'best', step)
def buildOptimizer(self, neural, epochs, batch_size, accumulation_steps, lr=2e-5, warmup=0.05):
    """ build bert optimizer """
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    param_optimizer = list(neural.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    num_train_optimization_steps = int(epochs * len(self.sentences) / batch_size / accumulation_steps)
    if self.optimizer == 'BertAdam':
        return BertAdam(optimizer_grouped_parameters,
                        lr=lr,
                        warmup=warmup,
                        t_total=num_train_optimization_steps)
    else:
        return OpenAIAdam(optimizer_grouped_parameters,
                          lr=lr,
                          warmup=warmup,
                          t_total=num_train_optimization_steps)
def setup_bert_optimizer_for_model(model, epochs, lrate, lrate_clf, batch_size,
                                   accum_steps, warmup, apex_mixed_precision, train_loader):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if ('classifier' not in n) and not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer
            if ('classifier' not in n) and any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }, {
        'params': [p for n, p in param_optimizer if 'classifier' in n],
        'weight_decay': 0.01,
        'lr': lrate_clf
    }]
    num_train_optimization_steps = math.ceil(
        (epochs + 1) * len(train_loader) / accum_steps)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=lrate,
                         warmup=warmup,
                         t_total=num_train_optimization_steps)
    if apex_mixed_precision:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    return model, optimizer
def init_optimizer(model, config, *args, **params):
    optimizer_type = config.get("train", "optimizer")
    learning_rate = config.getfloat("train", "learning_rate")
    if optimizer_type == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learning_rate,
                               weight_decay=config.getfloat("train", "weight_decay"))
    elif optimizer_type == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=learning_rate,
                              weight_decay=config.getfloat("train", "weight_decay"))
    elif optimizer_type == "bert_adam":
        optimizer = BertAdam(model.parameters(),
                             lr=learning_rate,
                             weight_decay=config.getfloat("train", "weight_decay"))
    elif optimizer_type == "lamb":
        optimizer = Lamb(model.parameters(),
                         lr=learning_rate,
                         weight_decay=config.getfloat("train", "weight_decay"))
    else:
        raise NotImplementedError
    return optimizer
def bert_train(inputs, token_type_ids, masked_lm_labels):
    # Pre-training & fine-tuning BERT for text generation
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    # hack to remove the pooler, which is not used
    # and thus produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=5e-5, warmup=0.1, t_total=300)
    model.train()
    n_steps = 10
    n_batches = len(inputs)
    for epoch in range(0, n_steps):
        eveloss = 0
        for i in range(n_batches):
            loss = model(inputs[i],
                         token_type_ids=token_type_ids[i],
                         masked_lm_labels=masked_lm_labels[i])
            eveloss += loss.mean().item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        print("step " + str(epoch) + " : " + str(eveloss))
    return model
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    # print(torch.cuda.get_device_name(0))
    datadir = '../cola_public/raw/'
    filename = '../cndata/cntext.json'
    classes = [
        'C31-Enviornment', 'C32-Agriculture', 'C34-Economy', 'C38-Politics',
        'C39-Sports'
    ]
    # loadcn(filename, classes)
    MAX_LEN = 128
    batch_size = 32
    lr = 2e-5
    epoch = 4
    train_dataloader, dev_dataloader = loadcn(filename, classes, MAX_LEN, batch_size)
    model = BertForSequenceClassification.from_pretrained(
        "../bert-base-chinese", num_labels=len(classes))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    no_decay = ['bias', 'gamma', 'beta']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }, {
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr, warmup=0.1)
    train(model, device, epoch, train_dataloader, dev_dataloader, optimizer, n_gpu)
    torch.save(model.state_dict(), "../save/bert_cn%d" % epoch)
def build_optimizer(model, num_train_steps, learning_rate, warmup_proportion, weight_decay):
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=num_train_steps)
    return optimizer
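# Usage sketch (an assumption, not part of the original source): typical arguments
# for build_optimizer. The model, step count, and hyperparameters below are
# illustrative placeholders.
from pytorch_pretrained_bert import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = build_optimizer(model,
                            num_train_steps=10000,
                            learning_rate=3e-5,
                            warmup_proportion=0.1,
                            weight_decay=0.01)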
def __init__(self, model: SketchPredictor, num_train_step: int, freeze_bert_for_niter: int, config: Dict):
    self.model = model
    bert_params = list([
        (p_name, p)
        for (p_name, p) in model.encoder_model.bert_model.named_parameters()
        if p.requires_grad
    ])
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    bert_grouped_parameters = [{
        'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    self.other_params = [
        p for n, p in model.named_parameters()
        if 'bert_model' not in n and p.requires_grad
    ]
    self.bert_optimizer = BertAdam(bert_grouped_parameters,
                                   lr=config['bert_learning_rate'],
                                   warmup=0.1,
                                   t_total=num_train_step)
    self.optimizer = torch.optim.Adam(self.other_params, lr=0.001)
    self.freeze_bert_for_niter = freeze_bert_for_niter
def configure_optimizers(self):
    if self.hparams.model in ["bert", "concatbert", "mmbt"]:
        total_steps = (self.hparams.train_data_len / self.hparams.batch_sz /
                       self.hparams.gradient_accumulation_steps * self.hparams.max_epochs)
        param_optimizer = self.exclude_from_wt_decay(
            list(self.named_parameters()),
            ["bias", "LayerNorm.bias", "LayerNorm.weight"])
        optimizer = BertAdam(
            param_optimizer,
            lr=self.hparams.lr,
            warmup=self.hparams.warmup,
            t_total=total_steps,
        )
    else:
        optimizer = optim.Adam(self.parameters(), lr=self.hparams.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        "max",
        patience=self.hparams.lr_patience,
        verbose=True,
        factor=self.hparams.lr_factor,
    )
    scheduler = {
        'scheduler': scheduler,
        'monitor': 'val_checkpoint_on',
        'interval': 'epoch',
        'frequency': self.hparams.lr_patience
    }
    return [optimizer], [scheduler]