def do_train():
    # ========================== subtask 1. predict the subject ==========================
    # This part is handled as a NER task; the original work used a start + end pointer
    # scheme instead, but the underlying idea is the same.
    # (a hedged greedy-decoding sketch follows after this function)
    # =====================================================================================
    bert_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    roberta_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_subject = SubjectModel(bert_name_or_path, 768, out_fea=subject_class_num)
    if args.init_checkpoint is not None:  # load an initial checkpoint
        model_subject.load_state_dict(t.load(args.init_checkpoint))
    model_subject = model_subject.cuda()

    # crf = CRF(num_tags=subject_class_num, batch_first=True)
    # crf = crf.cuda()
    # print(crf.transitions)

    # DistributedBatchSampler (paddle) was replaced with DistributedSampler (torch).
    # Using DistributedSampler would load the data with multiple processes.
    # train_batch_sampler = DistributedSampler(
    #     train_dataset,
    #     shuffle=True,
    #     drop_last=True
    # )

    # Loads dataset.
    train_dataset = TrainSubjectDataset.from_file(
        args.train_data_path,
        tokenizer,
        args.max_seq_length,
        True
    )
    # crf.transitions
    train_data_loader = DataLoader(
        dataset=train_dataset,
        # batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    # Loads dataset.
    # Built here so it is not reloaded on every evaluate() call.
    # TrainSubjectDataset is used for dev as well because we also want to compute the loss.
    dev_dataset = TrainSubjectDataset.from_file(
        args.dev_data_path,
        tokenizer,
        args.max_seq_length,
        True
    )
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    # Why is weight decay applied only to a subset of the parameters, and what does it do?
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model_subject.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    # Merge the parameters of all models into one optimizer.
    optimizer = t.optim.Adam(
        [
            {'params': model_subject.parameters(), 'lr': 2e-5},
            # {'params': crf.parameters(), 'lr': 0.1},
        ],
    )

    # Defines learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='min')

    # Log the configuration of this run.
    logger.info("the parameters in this run are:")
    for k, v in vars(args).items():
        logger.info(f"{k, v}")

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 5000
    max_f1 = 0  # best f1 so far
    for epoch in tqdm(range(args.num_train_epochs)):
        # print(crf.transitions)
        logger.info(f"\n=====start training of {epoch} epochs =====")
        # switch to training mode
        model_subject.train()  # predict the subject
        step = 0
        vis_loss = 0  # loss accumulated for visdom
        for batch in tqdm(train_data_loader):
            step += 1
            input_ids, token_type_ids, attention_mask, batch_origin_info, labels, offset_mappings = batch
            # labels size = [batch_size, max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask)
            # batch_size = logits_1.size(0)
            # max_seq_length = logits_1.size(1)
            # label_num = logits_1.size(2)
            # reshape so that the CRF can process it
            # logits_1 = logits_1.view(max_seq_length, batch_size, label_num)
            # labels = labels.view(max_seq_length, batch_size)
            # attention_mask = attention_mask.view(max_seq_length, batch_size)
            # with a CRF on top:
            # loss = -crf(logits_1, labels, mask=attention_mask.byte(), reduction='mean')

            # logits size [batch_size, max_seq_len, class_num]
            logits_1 = logits_1.view(-1, subject_class_num)
            labels = labels.view(-1)
            loss = criterion(logits_1, labels)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            optimizer.zero_grad()
            loss_item = loss.item()
            vis_loss += loss_item
            logger.info(f"epoch:{epoch}/{args.num_train_epochs}, steps:{step}/{steps_by_epoch}, loss:{loss_item}")
            if loss_item > 0.1:
                logger.info(f"{batch_origin_info}")

            # logging
            if global_step % logging_steps == 0 and global_step:
                vis.line([vis_loss], [global_step], win=win, update="append")
                vis_loss = 0
            global_step += 1

            if global_step % save_steps == 0 and global_step:
                save_model_path = os.path.join(
                    args.output_dir,
                    "model_subject_%d_bert.pdparams" % (global_step + 53530))
                logger.info("saving checkpoint model_subject_%d_bert.pdparams to %s " % (global_step, args.output_dir))
                t.save(model_subject.state_dict(), save_model_path)

                # evaluate the model on the dev set
                pred_file_path = f"/home/lawson/program/DuIE_py/data/predict/dev_data_subject_predict_model_subject_{global_step+53530}_bert.txt"
                evaluate(model_subject, dev_data_loader, criterion, pred_file_path,
                         crf=None, all_known_subjects=None)
                recall, precision, f1 = cal_subject_metric(
                    dev_data_file_path=args.dev_data_path,
                    pred_file_path=pred_file_path)
                if f1 > max_f1:  # keep the checkpoint with the best f1
                    logger.info(f"saving checkpoint model_subject_{global_step}.pdparams to {args.output_dir}")
                    cur_model_subject_name = os.path.join(
                        args.output_dir,
                        "model_subject_%d_bert_f1=%f.pdparams" % (global_step + 53530, f1))
                    # cur_model_crf_name = os.path.join(args.output_dir, "crf_%d_bert.pdparams" % (global_step))
                    t.save(model_subject.state_dict(), cur_model_subject_name)
                    # t.save(crf.state_dict(), cur_model_crf_name)
                    max_f1 = f1
                logger.info(f"recall = {recall}, precision = {precision}, f1 = {f1}")
    logger.info("\n=====training complete=====")
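
# --- Hedged sketch (not part of the original pipeline) ------------------------
# The comments above treat subject extraction as token-level NER and delegate
# decoding to decode_subject(). Purely as an illustration, the helper below
# shows one common way to turn per-token logits into text spans with a greedy
# argmax over a BIO-style tag set. The tag-id convention (0 = "O", odd = "B-*",
# even = "I-*") and the helper name are assumptions, not the repo's real scheme.
def greedy_bio_decode_sketch(logits, offset_mapping, text):
    """logits: [seq_len, num_tags] tensor; offset_mapping: list of (start, end) char offsets."""
    import torch
    tag_ids = torch.argmax(logits, dim=-1).tolist()  # greedy per-token decision
    spans, start = [], None
    for i, tag in enumerate(tag_ids):
        if tag != 0 and tag % 2 == 1:          # assumed "B-*" tag: open a new span
            if start is not None:
                spans.append((start, offset_mapping[i - 1][1]))
            start = offset_mapping[i][0]
        elif tag == 0 and start is not None:   # assumed "O" tag: close the open span
            spans.append((start, offset_mapping[i - 1][1]))
            start = None
        # assumed "I-*" tags (even, nonzero) simply keep the current span open
    if start is not None:
        spans.append((start, offset_mapping[-1][1]))
    return [text[s:e] for s, e in spans if e > s]
# ------------------------------------------------------------------------------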
do_train()

if args.do_eval:
    roberta_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    bert_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_subject = SubjectModel(bert_name_or_path, 768, out_fea=subject_class_num)
    if args.init_checkpoint is not None:  # load an initial checkpoint
        model_subject.load_state_dict(t.load(args.init_checkpoint))
    model_subject = model_subject.cuda()

    collator = TrainSubjectDataCollator()
    # Loads dataset.
    # Built here so it is not reloaded on every evaluate() call.
    # TrainSubjectDataset is used for dev as well because we also want to compute the loss.
    dev_dataset = TrainSubjectDataset.from_file(
        args.dev_data_path,
        tokenizer,
        args.max_seq_length,
        True
    )
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    # Collect all subjects known from the training data.
    all_known_subjects = get_all_subjects(train_data_path=args.dev_data_path)
    temp1 = (args.dev_data_path).split("/")[-1].split(".")[0]
    temp2 = (args.init_checkpoint).split("/")[-1]
    pred_file_path = f"/home/lawson/program/DuIE_py/data/predict/{temp1}_predict_subject_{temp2}_7_3.txt"
    if os.path.exists(pred_file_path):
def do_train():
    if args.init_checkpoint is not None and os.path.exists(args.init_checkpoint):
        logger.info(f"loading model: {args.init_checkpoint}")
        model_object.load_state_dict(t.load(args.init_checkpoint))

    viz_object = Visdom()
    win = "train_object_loss"

    # Loads dataset.
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length,
                                                  True)
    train_data_loader = DataLoader(
        dataset=train_dataset,
        # batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
        shuffle=False)

    dev_dataset = TrainSubjectDataset.from_file(args.dev_data_path,
                                                tokenizer,
                                                args.max_seq_length,
                                                True)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.eval_batch_size,
        collate_fn=collator,
    )

    # Merge the parameters of all models into one optimizer.
    optimizer = t.optim.AdamW([
        {'params': model_object.parameters(), 'lr': 1e-5},
    ], )

    # Starts training.
    global_step = 0
    logging_steps = 50
    logging_loss = 0
    save_steps = 5000
    max_f1 = 0
    max_recall = 0
    for epoch in tqdm(range(args.num_train_epochs)):
        logger.info("\n=====start training of %d epochs=====" % epoch)
        # switch to training mode
        model_object.train()  # predict the object from the subject
        step = 1
        for batch in tqdm(train_data_loader):
            input_ids, token_type_ids, attention_mask, batch_origin_info, labels, batch_offset_mapping = batch

            # ====== build the training data for subtask 2 from origin_info ==========
            # object_input_ids here may be slightly larger than args.batch_size.
            object_input_ids, object_token_type_ids, object_attention_mask, \
                object_labels, object_origin_info, object_offset_mapping = from_dict2object(
                    batch_subjects=None,
                    batch_origin_dict=batch_origin_info,
                    tokenizer=tokenizer,
                    max_length=args.max_seq_length,
                    pad_to_max_length=True)
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()
            object_labels = t.tensor(object_labels).cuda()

            logits_2 = model_object(
                input_ids=object_input_ids,
                token_type_ids=object_token_type_ids,
                attention_mask=object_attention_mask
            )  # size [batch_size, max_seq_len, object_class_num]

            # token-level cross entropy (a masked-loss sketch follows after this function)
            logits_2 = logits_2.view(-1, object_class_num)
            object_labels = object_labels.view(-1)
            loss = criterion(logits_2, object_labels)
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            optimizer.zero_grad()
            loss_item = loss.item()
            logging_loss += loss_item
            step += 1
            global_step += 1

            if global_step % logging_steps == 0 and global_step:
                viz_object.line([logging_loss], [global_step], win=win, update="append")
                logging_loss = 0

        # run prediction on the dev set
        pred_file_path = args.dev_data_path.replace(".json", "") + \
            f"_roberta_{global_step}_object_predict.txt"
        if os.path.exists(pred_file_path):
            os.remove(pred_file_path)
        recall, precision, f1 = evaluate(model_object, dev_data_loader, pred_file_path)
        if f1 > max_f1:  # save the checkpoint with the best f1
            save_path = f"{args.output_dir}/model_object_{global_step}_roberta_f1_{f1}.pdparams"
            t.save(model_object.state_dict(), save_path)
            max_f1 = f1
        elif recall > max_recall:  # otherwise keep the checkpoint with the best recall
            save_path = f"{args.output_dir}/model_object_{global_step}_roberta_recall_{recall}.pdparams"
            t.save(model_object.state_dict(), save_path)
            max_recall = recall
    logger.info("\n=====training complete=====")
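
# --- Hedged sketch (not part of the original pipeline) ------------------------
# The object loss above flattens logits/labels and feeds them straight to
# `criterion`, which also scores padded positions. A minimal sketch of how the
# attention mask could exclude padding, assuming a standard cross-entropy
# criterion; the helper name and signature are illustrative only.
def masked_token_loss_sketch(logits, labels, attention_mask, num_classes):
    import torch
    active = attention_mask.view(-1).bool()              # keep only real (non-pad) tokens
    active_logits = logits.view(-1, num_classes)[active]
    active_labels = labels.view(-1)[active]
    return torch.nn.functional.cross_entropy(active_logits, active_labels)
# ------------------------------------------------------------------------------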
def do_train_3(model_relation_path):
    relation_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_relation = RelationModel(relation_name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    if model_relation_path is not None and os.path.exists(model_relation_path):
        model_relation.load_state_dict(t.load(model_relation_path))

    tokenizer = BertTokenizerFast.from_pretrained(
        "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch")

    # Loads dataset.
    # TrainSubjectDataset is used here because the raw examples are needed; the
    # relation training data is built from those raw examples.
    logger.info(f"Preprocessing data, loaded from {args.train_data_path}")
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length,
                                                  True)
    collator = TrainSubjectDataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        # batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    dev_dataset = TrainSubjectDataset.from_file(args.dev_data_path,
                                                tokenizer,
                                                args.max_seq_length,
                                                True)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    viz = Visdom()
    win = "train_loss_negative_2"
    res = []  # final predictions
    subject_invalid_num = 0  # number of examples where subject prediction failed
    all_known_subjects = get_all_subjects(
        train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")

    optimizer = t.optim.AdamW([
        {'params': model_relation.parameters(), 'lr': 2e-5},
    ], )

    # Starts training.
    global_step = 0
    logging_steps = 50
    logging_loss = 0
    max_f1 = 0
    evaluate_step = 2000
    for epoch in tqdm(range(args.num_train_epochs)):
        total_neg_cnt = 0
        total_pos_cnt = 0
        for batch in tqdm(train_data_loader):
            # origin_info is the raw json-format example
            input_ids, token_type_ids, attention_mask, batch_origin_info, batch_labels, offset_mapping = batch
            relation_input_ids, relation_token_type_ids, relation_attention_mask, relation_labels \
                = get_negative_relation_data_2(batch_origin_info, tokenizer, max_length=args.max_seq_length)
            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            relation_labels = t.tensor(relation_labels).cuda()

            # Random sampling:
            # 01. shrinks the effective batch size; 02. makes training converge faster.
            # Sample up to 8 positive and 8 negative examples per batch.
            # (a compact standalone helper sketch follows after this function)
            non_zero_index = t.nonzero(relation_labels)
            non_zero_index = non_zero_index.squeeze()
            non_zero_index = non_zero_index.tolist()
            random.shuffle(non_zero_index)  # in-place shuffle, no return value
            non_zero_index = t.tensor(non_zero_index)
            if len(non_zero_index) > 8:
                non_zero_index = non_zero_index[0:8]
            non_zero_index = non_zero_index.cuda()
            # pick out the positive examples
            pos_relation_input_ids = t.index_select(relation_input_ids, 0, non_zero_index)
            pos_relation_token_type_ids = t.index_select(relation_token_type_ids, 0, non_zero_index)
            pos_attention_mask = t.index_select(relation_attention_mask, 0, non_zero_index)
            pos_labels = t.index_select(relation_labels, 0, non_zero_index)

            zero_index = [
                i for i in range(len(relation_labels)) if not relation_labels[i]
            ]
            zero_index = t.tensor(zero_index)
            zero_index = zero_index.squeeze()
            zero_index = zero_index.tolist()
            random.shuffle(zero_index)
            zero_index = t.tensor(zero_index)
            if len(zero_index) > 8:
                zero_index = zero_index[0:8]
            zero_index = zero_index.cuda()
            # pick out the negative examples
            neg_relation_input_ids = t.index_select(relation_input_ids, 0, zero_index)
            neg_relation_token_type_ids = t.index_select(relation_token_type_ids, 0, zero_index)
            neg_attention_mask = t.index_select(relation_attention_mask, 0, zero_index)
            neg_labels = t.index_select(relation_labels, 0, zero_index)

            # Concatenate the positive and negative examples.
            relation_input_ids = t.cat(
                (pos_relation_input_ids, neg_relation_input_ids), 0)
            relation_attention_mask = t.cat(
                (pos_attention_mask, neg_attention_mask), 0)
            relation_token_type_ids = t.cat(
                (pos_relation_token_type_ids, neg_relation_token_type_ids), 0)
            relation_labels = t.cat((pos_labels, neg_labels), 0)

            # This model returns the loss directly.
            out = model_relation(input_ids=relation_input_ids,
                                 token_type_ids=relation_token_type_ids,
                                 attention_mask=relation_attention_mask,
                                 labels=relation_labels)
            loss = out.loss
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            optimizer.zero_grad()
            avg_loss = loss.item() / relation_input_ids.size(0)
            logger.info(f"average loss per example: avg_loss = {avg_loss}")
            if avg_loss > 2:  # take a closer look at examples with an unusually large loss
                logger.info(f"{batch_origin_info}")
            logging_loss += avg_loss

            # logging
            if global_step % logging_steps == 0 and global_step:
                viz.line([logging_loss], [global_step], win=win, update="append")
                logging_loss = 0

            # total_neg_cnt += batch_neg_cnt
            # total_pos_cnt += batch_pos_cnt
            # logger.info(f"batch_neg_cnt:{batch_neg_cnt}\n,\
            #     batch_pos_cnt ={batch_pos_cnt}\n,\
            #     total_neg_cnt={total_neg_cnt}\n,\
            #     total_pos_cnt={total_pos_cnt}")
            global_step += 1

            if global_step % evaluate_step == 0 and global_step:
                # run prediction on the dev set
                pred_file_path = args.dev_data_path.replace(".json", "") + \
                    f"_roberta_{global_step}_object_predict.txt"
                if os.path.exists(pred_file_path):
                    os.remove(pred_file_path)
                f1 = evaluate(model_relation, dev_data_loader, tokenizer, pred_file_path)
                if f1 > max_f1:  # save the checkpoint with the best f1
                    save_path = f"{args.output_dir}/model_relation_{global_step}_roberta_f1_{f1}.pdparams"
                    t.save(model_relation.state_dict(), save_path)
                    max_f1 = f1
        # save the model after each epoch
        # t.save(model_relation.state_dict(), os.path.join(args.output_dir,
        #        "model_relation_%d_roberta_epoch.pdparams" % (global_step)))
    logger.info("\n=====training complete=====")
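
# --- Hedged sketch (not part of the original pipeline) ------------------------
# do_train_3 samples at most 8 positive and 8 negative relation examples per
# batch inline. Below is a minimal sketch of the same idea as a reusable helper,
# assuming `labels` is a 1-D LongTensor where 0 marks a negative example; the
# function name and signature are illustrative only.
def sample_balanced_indices_sketch(labels, per_class=8):
    import torch
    pos = torch.nonzero(labels, as_tuple=False).view(-1)
    neg = torch.nonzero(labels == 0, as_tuple=False).view(-1)
    # shuffle each group, then cap it at per_class examples
    pos = pos[torch.randperm(pos.numel(), device=labels.device)][:per_class]
    neg = neg[torch.randperm(neg.numel(), device=labels.device)][:per_class]
    return torch.cat((pos, neg), dim=0)

# Illustrative usage:
#   keep = sample_balanced_indices_sketch(relation_labels)
#   relation_input_ids = relation_input_ids[keep]
#   relation_labels = relation_labels[keep]
# ------------------------------------------------------------------------------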
def do_train_2(model_subject_path, model_object_path, model_relation_path):
    # Does predictions.
    logger.info(
        "\n====================start predicting / evaluating ===================="
    )
    subject_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_subject = SubjectModel(subject_name_or_path, 768, out_fea=subject_class_num)
    model_subject.load_state_dict(t.load(model_subject_path))
    model_subject = model_subject.cuda()

    object_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_object = ObjectModel(object_name_or_path, 768, object_class_num)
    model_object = model_object.cuda()
    model_object.load_state_dict(t.load(model_object_path))

    relation_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_relation = RelationModel(relation_name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    model_relation.load_state_dict(t.load(model_relation_path))

    tokenizer = BertTokenizerFast.from_pretrained(
        "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch")

    # Loads dataset.
    # TrainSubjectDataset is used here because the raw examples are needed; the
    # relation training data is built from those raw examples.
    logger.info(f"Preprocessing data, loaded from {args.train_data_path}")
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length,
                                                  True)
    # DistributedBatchSampler (paddle) was replaced with DistributedSampler (torch).
    # Using DistributedSampler would load the data with multiple processes.
    # train_batch_sampler = DistributedSampler(
    #     train_dataset,
    #     shuffle=True,
    #     drop_last=True
    # )
    collator = TrainSubjectDataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        # batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    model_subject.eval()
    viz = Visdom()
    win = "train_loss_negative_2"
    res = []  # final predictions
    subject_invalid_num = 0  # number of examples where subject prediction failed
    all_known_subjects = get_all_subjects(
        train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")

    # Freeze the subject and object models; only the relation model is trained.
    for param in model_subject.parameters():
        param.requires_grad = False
    for param in model_object.parameters():
        param.requires_grad = False

    optimizer = t.optim.AdamW([
        {'params': model_relation.parameters(), 'lr': 2e-5},
    ], )

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 5000
    step = 1
    logging_loss = 0
    for epoch in tqdm(range(args.num_train_epochs)):
        total_neg_cnt = 0
        total_pos_cnt = 0
        for batch in tqdm(train_data_loader):
            # origin_info is the raw json-format example
            input_ids, token_type_ids, attention_mask, batch_origin_info, batch_labels, offset_mapping = batch
            # labels size = [batch_size, max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask)
            # logits size [batch_size, max_seq_len, class_num]

            # decode the predicted subjects
            # temp = get_rid_of_number_in_str(origin_info[0]['text'])
            # origin_info[0]['text'] = temp
            batch_subjects, batch_subject_labels = decode_subject(
                logits_1, id2subject_map, input_ids, tokenizer,
                batch_origin_info, offset_mapping, all_known_subjects)

            # Deduplicate the predicted subjects.
            # batch_subjects can be empty; ideally this case would be handled the same
            # way as a normal batch of subjects.
            if len(batch_subjects[0]) == 0:
                # print("----- no subject predicted ----------")
                subject_invalid_num += 1
                continue

            # write the subject predictions to file
            object_invalid_num = 0
            # ====== build the training data for subtask 2 from origin_info ==========
            # object_input_ids here may be slightly larger than args.batch_size.
            object_input_ids, object_token_type_ids, object_attention_mask, \
                object_labels, object_origin_info, object_offset_mapping = from_dict2object(
                    batch_subjects=batch_subjects,
                    batch_origin_dict=batch_origin_info,
                    tokenizer=tokenizer,
                    max_length=args.max_seq_length,
                )
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()

            logits_2 = model_object(input_ids=object_input_ids,
                                    token_type_ids=object_token_type_ids,
                                    attention_mask=object_attention_mask)
            batch_objects, batch_object_labels = decode_object(
                logits_2, id2object_map, tokenizer, object_input_ids,
                object_origin_info, object_offset_mapping, logger)

            relation_input_ids, relation_token_type_ids, relation_attention_mask, relation_labels, batch_neg_cnt, batch_pos_cnt \
                = get_negative_relation_data(batch_subjects, batch_objects, batch_origin_info, tokenizer, max_length=128)
            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            relation_labels = t.tensor(relation_labels).cuda()
            if relation_input_ids.size(0) < 1:
                continue
            logger.info(f"relation_input_ids.size(0) = {relation_input_ids.size(0)}")

            # cap oversized relation batches at 32 examples
            # (an alternative chunked sketch follows after do_train_2)
            if relation_input_ids.size(0) > 32:
                out = model_relation(
                    input_ids=relation_input_ids[0:32, :],
                    token_type_ids=relation_token_type_ids[0:32, :],
                    attention_mask=relation_attention_mask[0:32, :],
                    labels=relation_labels[0:32])
                logger.info(f"{batch_origin_info}")
            else:
                # This model returns the loss directly.
                out = model_relation(input_ids=relation_input_ids,
                                     token_type_ids=relation_token_type_ids,
                                     attention_mask=relation_attention_mask,
                                     labels=relation_labels)
            loss = out.loss
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            optimizer.zero_grad()

            if relation_input_ids.size(0) > 32:
                avg_loss = loss.item() / 32
            else:
                avg_loss = loss.item() / relation_input_ids.size(0)
            logger.info(f"average loss per example: avg_loss = {avg_loss}")
            if avg_loss > 2:  # take a closer look at examples with an unusually large loss
                logger.info(f"{batch_origin_info}")
            logging_loss += avg_loss

            # logging
            if global_step % logging_steps == 0 and global_step:
                viz.line([logging_loss], [global_step], win=win, update="append")
                logging_loss = 0

            # save a checkpoint
            if global_step % save_steps == 0 and global_step != 0:
                logger.info(
                    f"saving checkpoint model_relation_{513882+global_step}.pdparams to {args.output_dir}"
                )
                cur_model_name = os.path.join(
                    args.output_dir,
                    "model_relation_%d_roberta.pdparams" % (513882 + global_step))
                t.save(model_relation.state_dict(), cur_model_name)

            total_neg_cnt += batch_neg_cnt
            total_pos_cnt += batch_pos_cnt
            logger.info(f"batch_neg_cnt:{batch_neg_cnt}\n,\
                batch_pos_cnt ={batch_pos_cnt}\n,\
                total_neg_cnt={total_neg_cnt}\n,\
                total_pos_cnt={total_pos_cnt}")
            step += 1
            global_step += 1

        # save the model after each epoch
        t.save(
            model_relation.state_dict(),
            os.path.join(
                args.output_dir,
                "model_relation_%d_roberta_epoch.pdparams" % (513882 + global_step)))
    logger.info("\n=====training complete=====")
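
# --- Hedged sketch (not part of the original pipeline) ------------------------
# do_train_2 truncates the relation batch to its first 32 examples when it grows
# too large. An alternative sketch: split the oversized batch into chunks of 32
# and accumulate gradients over the chunks so that no examples are dropped. The
# chunk size, helper name, and signature are assumptions.
def relation_step_chunked_sketch(model_relation, optimizer, input_ids,
                                 token_type_ids, attention_mask, labels,
                                 chunk_size=32):
    total = input_ids.size(0)
    optimizer.zero_grad()
    total_loss = 0.0
    for s in range(0, total, chunk_size):
        e = min(s + chunk_size, total)
        out = model_relation(input_ids=input_ids[s:e],
                             token_type_ids=token_type_ids[s:e],
                             attention_mask=attention_mask[s:e],
                             labels=labels[s:e])
        # scale each chunk's loss by its share of the batch before accumulating
        loss = out.loss * (e - s) / total
        loss.backward()
        total_loss += loss.item()
    optimizer.step()
    return total_loss
# ------------------------------------------------------------------------------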
def do_train():
    # ========================== subtask 3. predict the relation ==========================
    name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_relation = RelationModel(name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    tokenizer = BertTokenizerFast.from_pretrained(
        "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch")

    # Loads dataset.
    # TrainSubjectDataset is used here because the raw examples are needed; the
    # relation training data is built from those raw examples.
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length,
                                                  True)
    collator = TrainSubjectDataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        # batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    dev_dataset = TrainSubjectDataset.from_file(args.dev_data_path,
                                                tokenizer,
                                                args.max_seq_length,
                                                True)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        # batch_sampler=dev_batch_sampler,
        collate_fn=collator,
    )

    # Merge the parameters of all models into one optimizer.
    optimizer = t.optim.AdamW(
        [{'params': model_relation.parameters(), 'lr': 2e-5}],
        # weight_decay=args.weight_decay,
    )

    # Defines learning rate strategy.
    # (a scheduler usage sketch follows after this function)
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='min')

    # Starts training.
    global_step = 0
    logging_steps = 100
    tic_train = time.time()
    viz = Visdom()
    win = "train_loss"
    for epoch in tqdm(range(args.num_train_epochs)):
        print("\n=====start training of %d epochs=====" % epoch)
        tic_epoch = time.time()
        # switch to training mode
        model_relation.train()  # predict the relation from subject + object
        logger_loss = 0
        step = 1
        for batch in tqdm(train_data_loader):
            batch_input_ids, batch_token_type_ids, batch_attention_mask, batch_origin_info, batch_labels, batch_offset_mapping = batch

            # ====== build the training data for subtask 3 from origin_info ==========
            # predict the relation from subject + object
            relation_input_ids, relation_token_type_ids, relation_attention_mask, relation_labels = from_dict2_relation(
                batch_subjects=None,
                batch_objects=None,
                batch_origin_info=batch_origin_info,
                tokenizer=tokenizer,
                max_length=args.max_seq_length)
            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            relation_labels = t.tensor(relation_labels).cuda()

            # This model returns the loss directly.
            out = model_relation(input_ids=relation_input_ids,
                                 token_type_ids=relation_token_type_ids,
                                 attention_mask=relation_attention_mask,
                                 labels=relation_labels)
            loss = out.loss
            loss.backward()
            optimizer.step()
            # lr_scheduler.step()
            optimizer.zero_grad()
            loss_item = loss.item()
            logger_loss += loss_item
            logger.info(f"loss:{loss_item}")

            # logging
            if global_step % logging_steps == 0 and global_step:
                logger.info(
                    f"epoch:{epoch}/{args.num_train_epochs}, steps:{step}/{steps_by_epoch}, "
                    f"loss:{loss_item}, speed: {logging_steps / (time.time() - tic_train)} step/s"
                )
                tic_train = time.time()
                viz.line([logger_loss], [global_step], win=win, update="append")
                logger_loss = 0
            step += 1
            global_step += 1

        # evaluate the model on the dev set
        evaluate(model_relation, dev_data_loader, tokenizer, pred_file_path)

        # save the model after each epoch
        logger.info(
            f"saving checkpoint model_relation_{global_step}.pdparams to {args.output_dir}"
        )
        cur_model_name = os.path.join(
            args.output_dir,
            "model_relation_%d_roberta.pdparams" % (global_step))
        t.save(model_relation.state_dict(), cur_model_name)
    logger.info("\n=====training complete=====")
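
# --- Hedged sketch (not part of the original pipeline) ------------------------
# do_train() above builds a ReduceLROnPlateau scheduler but never steps it. The
# toy function below only illustrates how that scheduler is normally driven: it
# is stepped once per evaluation with the monitored metric (e.g. the dev loss).
# The dummy model and the loss values are placeholders, not repo code.
def plateau_scheduler_usage_sketch():
    import torch
    model = torch.nn.Linear(4, 2)                         # stand-in model
    opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', patience=2)
    for dev_loss in [0.9, 0.8, 0.8, 0.8, 0.8]:             # stand-in per-epoch dev losses
        opt.step()                                         # normally follows loss.backward()
        sched.step(dev_loss)                               # pass the monitored metric
    return opt.param_groups[0]['lr']                       # lr drops once the metric plateaus
# ------------------------------------------------------------------------------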