# Imports needed by the functions below. BertAdam is assumed to be the optimizer
# from pytorch_pretrained_bert; project-specific symbols (BertNER, SeqModel,
# lr_decay, batchify_with_label, predict_check, evaluate) are assumed to be
# imported from the repo's own modules.
import gc
import logging
import random
import sys
import time

import torch
import torch.optim as optim
from pytorch_pretrained_bert.optimization import BertAdam


def set_optimizer(args, model, train_steps=None):
    if args.warm_up:
        logging.info('using BertAdam')
        # Drop the BERT pooler and exclude biases / LayerNorm parameters
        # from weight decay.
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=train_steps)
        return optimizer
    else:
        logging.info('using optim Adam')
        parameters_trainable = list(
            filter(lambda p: p.requires_grad, model.parameters()))
        optimizer = optim.Adam(parameters_trainable, lr=args.learning_rate)
        return optimizer
def set_optimizer(self, args, model, train_steps=None):
    # Class-method variant (note the `self` parameter): always builds BertAdam
    # with warm-up, using the same parameter grouping as above.
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=train_steps)
    return optimizer
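# Illustrative usage sketch for the module-level set_optimizer (not part of the
# original file): `args` is assumed to expose `warm_up`, `learning_rate`, and
# `warmup_proportion`, and `train_steps` is the total number of optimizer
# updates, e.g.
#
#     train_steps = (len(train_examples) // args.batch_size) * args.num_epochs
#     optimizer = set_optimizer(args, model, train_steps=train_steps)
#
# BertAdam's linear warm-up schedule needs `t_total`, so `train_steps` must be
# supplied whenever `args.warm_up` is set.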
def train(data, save_model_dir, seg=True, debug=False):
    print("Training with {} model.".format(data.model_type))
    # data.show_data_summary()
    # Build the model: either fine-tune BERT end-to-end or use it as a frozen
    # feature extractor feeding the sequence model.
    if data.bert_finetune:
        print('bert_finetune')
        model = BertNER(data)
    else:
        print('bert feature extraction')
        model = SeqModel(data)
    print("finish building model.")

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adamax(parameters, lr=data.HP_lr)
    if data.warm_up:
        print('using warm_up...')
        # With warm-up, replace Adamax by BertAdam with grouped weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        num_train_optimization_steps = int(
            len(data.train_Ids) / data.HP_batch_size) * data.HP_iteration
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=data.HP_lr,
                             warmup=0.1,
                             t_total=num_train_optimization_steps)

    best_dev = -1
    best_dev_p = -1
    best_dev_r = -1
    best_test = -1
    best_test_p = -1
    best_test_r = -1

    ## start training
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        if not data.warm_up:
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_loss = 0
        batch_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        ## set model in train mode
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            words = data.train_texts[start:end]
            if not instance:
                continue
            batch_word, batch_biword, batch_wordlen, batch_label, mask, batch_bert, bert_mask = \
                batchify_with_label(instance, data.HP_gpu)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_biword, batch_wordlen, mask, batch_label,
                batch_bert, bert_mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.data
            total_loss += loss.data
            batch_loss += loss
            # Report running loss and token accuracy every 500 instances.
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                      % (end, temp_cost, sample_loss, right_token, whole_token,
                         (right_token + 0.) / whole_token))
                sys.stdout.flush()
                sample_loss = 0
            # Step the optimizer once per full batch; also flush the final
            # partial batch so its gradients are not discarded.
            if end % data.HP_batch_size == 0 or end == train_num:
                batch_loss.backward()
                optimizer.step()
                model.zero_grad()
                batch_loss = 0
            if debug:
                break
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
              % (end, temp_cost, sample_loss, right_token, whole_token,
                 (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"
              % (idx, epoch_cost, train_num / epoch_cost, total_loss))

        # Evaluate on the dev set; use F1 when segmentation labels are used,
        # otherwise plain accuracy.
        speed, acc, p, r, f, pred_labels = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        if seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                  % (dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (dev_cost, speed, acc))

        if current_score > best_dev:
            if seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = save_model_dir
            torch.save(model.state_dict(), model_name)
            # best_dev itself is updated only after the test evaluation below,
            # so the recorded test score matches the best dev checkpoint.
            # best_dev = current_score
            best_dev_p = p
            best_dev_r = r

        # ## decode test
        speed, acc, p, r, f, pred_labels = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if seg:
            current_test_score = f
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                  % (test_cost, speed, acc, p, r, f))
        else:
            current_test_score = acc
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" % (test_cost, speed, acc))

        if current_score > best_dev:
            best_dev = current_score
            best_test = current_test_score
            best_test_p = p
            best_test_r = r

        print("Best dev score: p:{}, r:{}, f:{}".format(best_dev_p, best_dev_r, best_dev))
        print("Test score: p:{}, r:{}, f:{}".format(best_test_p, best_test_r, best_test))
        gc.collect()

    # Append the final best dev/test scores to the result file. The handle is
    # named result_f to avoid shadowing the f-score variable `f`; the redundant
    # explicit close() is dropped since `with` already closes the file.
    with open(data.result_file, "a") as result_f:
        result_f.write(save_model_dir + '\n')
        result_f.write("Best dev score: p:{}, r:{}, f:{}\n".format(
            best_dev_p, best_dev_r, best_dev))
        result_f.write("Test score: p:{}, r:{}, f:{}\n\n".format(
            best_test_p, best_test_r, best_test))
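# Illustrative usage sketch (not part of the original file): `data` is the repo's
# configuration/data object carrying the HP_* hyperparameters and the pre-built
# train/dev/test id lists; the setup below is hypothetical and repo-specific.
#
#     data = Data()                      # hypothetical: repo's Data/config object
#     data.HP_iteration = 10
#     train(data, save_model_dir="output/best_model.ckpt", seg=True)
#
# The best checkpoint (by dev F1 when seg=True, by accuracy otherwise) is written
# to `save_model_dir`, and the final scores are appended to `data.result_file`.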