Example 1
def do_train():
    # ========================== subtask 1: predict the subject ==========================
    # This part is cast as an NER task; the original task used a start + end scheme, but the idea is the same.
    # ====================================================================================
    bert_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    roberta_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_subject = SubjectModel(bert_name_or_path,768,out_fea=subject_class_num) 
    if args.init_checkpoint is not None: # load an initial checkpoint
        model_subject.load_state_dict(t.load(args.init_checkpoint))
    model_subject = model_subject.cuda()    
    # crf = CRF(num_tags = subject_class_num,batch_first=True)
    # crf = crf.cuda()
    #print(crf.transitions)
    # DistributedBatchSampler (paddle) was replaced with DistributedSampler (torch).
    # With DistributedSampler the data would be loaded by multiple worker processes.
    # train_batch_sampler = DistributedSampler(
    #     train_dataset,
    #     shuffle=True,
    #     drop_last=True 
    #     )
    
    # Loads dataset.
    train_dataset = TrainSubjectDataset.from_file(
        args.train_data_path,
        tokenizer,
        args.max_seq_length,
        True
        )
    # crf.transitions
    train_data_loader = DataLoader(        
        dataset=train_dataset,
        #batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator, # custom collator
        )
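    # Hedged note (assumption, not from the original code): the commented-out
    # DistributedSampler above shuffled and dropped the last batch; this
    # DataLoader keeps the file order. Restoring that behaviour could look like:
    # train_data_loader = DataLoader(dataset=train_dataset,
    #                                batch_size=args.batch_size,
    #                                collate_fn=collator,
    #                                shuffle=True,
    #                                drop_last=True)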

    # Loads dataset.
    # Built outside the loop so it is not reloaded on every evaluate call.
    # The dev set also uses TrainSubjectDataset because we want to compute the loss on it.
    dev_dataset = TrainSubjectDataset.from_file(        
        args.dev_data_path,
        tokenizer,
        args.max_seq_length,
        True
        )

    dev_data_loader = DataLoader(        
        dataset=dev_dataset,        
        batch_size=args.batch_size,
        collate_fn=collator, # custom collator
        )

    # Why is weight decay applied only to a subset of the parameters, and what does it do?
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        n for n, p in model_subject.named_parameters()
        if not any(nd in n for nd in ["bias", "LayerNorm"])
    ]
    
    # The parameters of all the models would need to be merged here.
    optimizer = t.optim.Adam(
        [
        {'params':model_subject.parameters(),'lr':2e-5},
        #{'params':crf.parameters(),'lr':0.1},
        ],
        )
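    # A minimal sketch (assumption, not used above) of how the decay / no-decay
    # split is usually wired into the optimizer: two parameter groups, one with
    # weight_decay and one without. The 0.01 value is an assumed default, not a
    # setting taken from this project.
    # no_decay = [
    #     p for n, p in model_subject.named_parameters()
    #     if any(nd in n for nd in ["bias", "LayerNorm"])
    # ]
    # decay = [
    #     p for n, p in model_subject.named_parameters()
    #     if not any(nd in n for nd in ["bias", "LayerNorm"])
    # ]
    # optimizer = t.optim.AdamW([
    #     {'params': decay, 'weight_decay': 0.01, 'lr': 2e-5},
    #     {'params': no_decay, 'weight_decay': 0.0, 'lr': 2e-5},
    # ])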
    
    # Defines learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs    
    lr_scheduler = ReduceLROnPlateau(optimizer=optimizer,
                                     mode='min')
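    # Note: ReduceLROnPlateau expects a monitored metric once per evaluation,
    # not a per-batch step. A hedged sketch of the usual pattern (dev_loss is
    # an assumed name, not computed in this function):
    # dev_loss = evaluate(...)        # some validation loss / metric
    # lr_scheduler.step(dev_loss)     # reduce the lr when the metric plateaus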
    
    # Log the configuration used for this run.
    logger.info("the parameters for this run are:")
    for k,v in (vars(args).items()):
        logger.info(f"{k,v}")


    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 5000
    max_f1 = 0 # best f1 so far
    for epoch in tqdm(range(args.num_train_epochs)):
        #print(crf.transitions)
        logger.info(f"\n=====start training of {epoch} epochs =====")        
        # switch to training mode
        model_subject.train() # predicts the subject
        step = 0
        vis_loss = 0 # loss accumulated for the visdom plot
        for batch in tqdm(train_data_loader):            
            step += 1
            input_ids,token_type_ids,attention_mask,batch_origin_info, labels,offset_mappings = batch
            # labels size = [batch_size,max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                   token_type_ids=token_type_ids,
                                   attention_mask=attention_mask
                                   )
            
            # batch_size = logits_1.size(0)
            # max_seq_length = logits_1.size(1)
            # label_num = logits_1.size(2)
            # logits_1 = logits_1.view(max_seq_length,batch_size,label_num) # reshape so the CRF can consume it
            
            # labels = labels.view(max_seq_length,batch_size)
            # attention_mask = attention_mask.view(max_seq_length,batch_size)            
            # add a CRF layer
            #loss = -crf(logits_1, labels, mask = attention_mask.byte(), reduction = 'mean')

            #logits size [batch_size,max_seq_len,class_num]
            logits_1 = logits_1.view(-1,subject_class_num)
            labels = labels.view(-1)
            loss = criterion(logits_1, labels)
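            # Hedged sketch (assumption, not part of the original code): if the
            # padding positions should not contribute to the loss, a common
            # pattern is to keep only the positions where attention_mask == 1:
            # active = attention_mask.view(-1) == 1
            # loss = criterion(logits_1[active], labels[active])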
            
            loss.backward()
            optimizer.step()
            #lr_scheduler.step()
            optimizer.zero_grad()
            loss_item = loss.item()
            vis_loss += loss_item
            
            logger.info(f"epoch:{epoch}/{args.num_train_epochs},  steps:{step}/{steps_by_epoch},   loss:{loss_item}")
            if loss_item > 0.1 :
                logger.info(f"{batch_origin_info}")
            # logging
            if global_step % logging_steps == 0 and global_step:                
                vis.line([vis_loss], [global_step], win=win, update="append")
                vis_loss = 0            
            global_step += 1

            if global_step % save_steps == 0 and global_step:
                save_model_path = os.path.join(args.output_dir,"model_subject_%d_bert.pdparams" % (global_step+53530))
                logger.info("saving checkpoint model_subject_%d_bert.pdparams to %s " %
                        (global_step+53530, args.output_dir))
                t.save(model_subject.state_dict(),save_model_path)

        
        # evaluate the model on the dev set
        pred_file_path = f"/home/lawson/program/DuIE_py/data/predict/dev_data_subject_predict_model_subject_{global_step+53530}_bert.txt"
        evaluate(model_subject,dev_data_loader,criterion,pred_file_path,crf=None,all_known_subjects=None)
        recall,precision,f1 = cal_subject_metric(dev_data_file_path = args.dev_data_path, pred_file_path=pred_file_path)
        if f1 > max_f1 :
            # keep the model with the best f1 so far
            logger.info(f"saving checkpoint model_subject_{global_step+53530}.pdparams to {args.output_dir}")
            cur_model_subject_name = os.path.join(args.output_dir,"model_subject_%d_bert_f1=%f.pdparams" % (global_step+53530,f1))
            #cur_model_crf_name = os.path.join(args.output_dir,"crf_%d_bert.pdparams" % (global_step))
            t.save(model_subject.state_dict(),cur_model_subject_name)
            #t.save(crf.state_dict(),cur_model_crf_name)
            max_f1 = f1
    
        logger.info(f"recall = {recall}, precision = {precision}, f1 = {f1}")        
        
    logger.info("\n=====training complete=====")
Example 2
        do_train()
    if args.do_eval:        
        roberta_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
        bert_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
        model_subject = SubjectModel(bert_name_or_path,768,out_fea=subject_class_num)
        if args.init_checkpoint is not None: # load an initial checkpoint
            model_subject.load_state_dict(t.load(args.init_checkpoint))
        model_subject = model_subject.cuda()
        
        collator = TrainSubjectDataCollator()
        # Loads dataset.
        # Built outside the loop so it is not reloaded on every evaluate call.
        # The dev set also uses TrainSubjectDataset because we want to compute the loss on it.
        dev_dataset = TrainSubjectDataset.from_file(        
            args.dev_data_path,
            tokenizer,
            args.max_seq_length,
            True
            )

        dev_data_loader = DataLoader(
            dataset=dev_dataset,
            batch_size=args.batch_size,
            collate_fn=collator, # custom collator
            )
        
        # collect every subject already known from the data (note: the dev data path is passed here)
        all_known_subjects = get_all_subjects(train_data_path=args.dev_data_path)
        temp1 = (args.dev_data_path).split("/")[-1].split(".")[0]
        temp2 = (args.init_checkpoint).split("/")[-1]
        pred_file_path = f"/home/lawson/program/DuIE_py/data/predict/{temp1}_predict_subject_{temp2}_7_3.txt"
        if os.path.exists(pred_file_path):
Example 3
def do_train():
    if args.init_checkpoint is not None and os.path.exists(
            args.init_checkpoint):
        logger.info(f"Loading model: {args.init_checkpoint}")
        model_object.load_state_dict(t.load(args.init_checkpoint))

    viz_object = Visdom()
    win = "train_object_loss"

    # Loads dataset.
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length, True)

    train_data_loader = DataLoader(
        dataset=train_dataset,
        #batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
        shuffle=False)

    dev_dataset = TrainSubjectDataset.from_file(args.dev_data_path, tokenizer,
                                                args.max_seq_length, True)

    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.eval_batch_size,
        collate_fn=collator,
    )

    # The parameters of all the models would need to be merged here.
    optimizer = t.optim.AdamW([
        {
            'params': model_object.parameters(),
            'lr': 1e-5
        },
    ], )
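    # Note (assumption about intent, not from the original code): t.optim.AdamW
    # applies weight_decay=0.01 by default; if no decay is wanted here it would
    # have to be disabled explicitly, e.g.
    # optimizer = t.optim.AdamW(
    #     [{'params': model_object.parameters(), 'lr': 1e-5}],
    #     weight_decay=0.0)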

    # Starts training.
    global_step = 0
    logging_steps = 50
    logging_loss = 0
    save_steps = 5000
    max_f1 = 0
    max_recall = 0
    for epoch in tqdm(range(args.num_train_epochs)):
        logger.info("\n=====start training of %d epochs=====" % epoch)
        # switch to training mode
        model_object.train()  # predict the object given the subject
        step = 1
        for batch in tqdm(train_data_loader):
            input_ids, token_type_ids, attention_mask, batch_origin_info, labels, batch_offset_mapping = batch

            # ====== build the training data for subtask 2 from origin_info ==========
            # object_input_ids here may be slightly larger than args.batch_size

            object_input_ids, object_token_type_ids, object_attention_mask, object_labels, object_origin_info, object_offset_mapping = from_dict2object(
                batch_subjects=None,
                batch_origin_dict=batch_origin_info,
                tokenizer=tokenizer,
                max_length=args.max_seq_length,
                pad_to_max_length=True)

            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()
            object_labels = t.tensor(object_labels).cuda()
            logits_2 = model_object(
                input_ids=object_input_ids,
                token_type_ids=object_token_type_ids,
                attention_mask=object_attention_mask
            )  # size [batch_size,max_seq_len,object_class_num]
            logits_2 = logits_2.view(-1, object_class_num)
            object_labels = object_labels.view(-1)
            loss = criterion(logits_2, object_labels)

            loss.backward()
            optimizer.step()
            #lr_scheduler.step()
            optimizer.zero_grad()
            loss_item = loss.item()
            logging_loss += loss_item
            step += 1
            global_step += 1
            if global_step % logging_steps == 0 and global_step:
                viz_object.line([logging_loss], [global_step],
                                win=win,
                                update="append")
                logging_loss = 0

        # run predictions on the dev set
        pred_file_path = os.path.splitext(args.dev_data_path)[0] \
            + f"_roberta_{global_step}_object_predict.txt"
        if os.path.exists(pred_file_path):
            os.remove(pred_file_path)

        recall, precision, f1 = evaluate(model_object, dev_data_loader,
                                         pred_file_path)
        if f1 > max_f1:  # keep the model with the best f1
            save_path = f"{args.output_dir}/model_object_{global_step}_roberta_f1_{f1}.pdparams"
            t.save(model_object.state_dict(), save_path)
            max_f1 = f1
        elif recall > max_recall:  # otherwise check whether recall reached a new maximum
            save_path = f"{args.output_dir}/model_object_{global_step}_roberta_recall_{recall}.pdparams"
            t.save(model_object.state_dict(), save_path)
            max_recall = recall
    logger.info("\n=====training complete=====")
Example 4
def do_train_3(model_relation_path):
    relation_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_relation = RelationModel(relation_name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    if model_relation_path is not None and os.path.exists(model_relation_path):
        model_relation.load_state_dict(t.load(model_relation_path))
    tokenizer = BertTokenizerFast.from_pretrained(
        "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch")

    # Loads dataset.
    # TrainSubjectDataset is used here because the raw records are needed; the relation training data is built from them.
    logger.info(f"Preprocessing data, loaded from {args.train_data_path}")
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length, True)
    collator = TrainSubjectDataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        #batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    dev_dataset = TrainSubjectDataset.from_file(args.dev_data_path, tokenizer,
                                                args.max_seq_length, True)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    viz = Visdom()
    win = "train_loss_negative_2"
    res = []  # final prediction results
    subject_invalid_num = 0  # number of samples where no subject was predicted
    all_known_subjects = get_all_subjects(
        train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")

    optimizer = t.optim.AdamW([
        {
            'params': model_relation.parameters(),
            'lr': 2e-5
        },
    ], )

    # Starts training.
    global_step = 0
    logging_steps = 50
    logging_loss = 0
    max_f1 = 0
    evaluate_step = 2000
    for epoch in tqdm(range(args.num_train_epochs)):
        total_neg_cnt = 0
        total_pos_cnt = 0
        for batch in tqdm(train_data_loader):
            # origin_info is the raw JSON-formatted record
            input_ids, token_type_ids, attention_mask, batch_origin_info, batch_labels, offset_mapping = batch

            relation_input_ids,relation_token_type_ids,relation_attention_mask,relation_labels \
                = get_negative_relation_data_2(batch_origin_info,tokenizer,max_length=args.max_seq_length)

            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            relation_labels = t.tensor(relation_labels).cuda()

            # Random sampling:
            # 01. shrink the effective batch size; 02. make training converge faster
            # 03. draw 8 positive and 8 negative samples
            non_zero_index = t.nonzero(relation_labels)
            non_zero_index = non_zero_index.squeeze()
            non_zero_index = non_zero_index.tolist()
            random.shuffle(non_zero_index)  # shuffles in place, returns None
            non_zero_index = t.tensor(non_zero_index)
            if len(non_zero_index) > 8:
                non_zero_index = non_zero_index[0:8]
            non_zero_index = non_zero_index.cuda()
            pos_relation_input_ids = t.index_select(relation_input_ids, 0,
                                                    non_zero_index)  # select the positive samples
            pos_relation_token_type_ids = t.index_select(
                relation_token_type_ids, 0, non_zero_index)
            pos_attention_mask = t.index_select(relation_attention_mask, 0,
                                                non_zero_index)
            pos_labels = t.index_select(relation_labels, 0, non_zero_index)

            zero_index = [
                i for i in range(len(relation_labels))
                if not relation_labels[i]
            ]
            zero_index = t.tensor(zero_index)
            zero_index = zero_index.squeeze()
            zero_index = zero_index.tolist()
            random.shuffle(zero_index)
            zero_index = t.tensor(zero_index)
            if len(zero_index) > 8:
                zero_index = zero_index[0:8]
            zero_index = zero_index.cuda()
            # select the negative samples
            neg_relation_input_ids = t.index_select(relation_input_ids, 0,
                                                    zero_index)
            neg_relation_token_type_ids = t.index_select(
                relation_token_type_ids, 0, zero_index)
            neg_attention_mask = t.index_select(relation_attention_mask, 0,
                                                zero_index)
            neg_labels = t.index_select(relation_labels, 0, zero_index)

            # concatenate the positive and negative samples
            relation_input_ids = t.cat(
                (pos_relation_input_ids, neg_relation_input_ids), 0)
            relation_attention_mask = t.cat(
                (pos_attention_mask, neg_attention_mask), 0)
            relation_token_type_ids = t.cat(
                (pos_relation_token_type_ids, neg_relation_token_type_ids), 0)
            relation_labels = t.cat((pos_labels, neg_labels), 0)
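            # Hedged sketch (equivalent rewrite, not from the original code): the
            # same 8 positive / 8 negative sampling could replace the block above
            # using randperm instead of list shuffling, e.g.
            # pos_idx = t.nonzero(relation_labels, as_tuple=False).view(-1)
            # neg_idx = t.nonzero(relation_labels == 0, as_tuple=False).view(-1)
            # pos_idx = pos_idx[t.randperm(pos_idx.numel(), device=pos_idx.device)[:8]]
            # neg_idx = neg_idx[t.randperm(neg_idx.numel(), device=neg_idx.device)[:8]]
            # keep = t.cat((pos_idx, neg_idx), 0)
            # relation_input_ids = relation_input_ids[keep]
            # relation_token_type_ids = relation_token_type_ids[keep]
            # relation_attention_mask = relation_attention_mask[keep]
            # relation_labels = relation_labels[keep]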

            # this model returns the loss directly
            out = model_relation(input_ids=relation_input_ids,
                                 token_type_ids=relation_token_type_ids,
                                 attention_mask=relation_attention_mask,
                                 labels=relation_labels)
            loss = out.loss
            loss.backward()
            optimizer.step()
            #lr_scheduler.step()
            optimizer.zero_grad()

            avg_loss = loss.item() / relation_input_ids.size(0)
            logger.info(f"average loss per sample: avg_loss = {avg_loss}")
            if avg_loss > 2:  # pay extra attention to samples with such a large loss
                logger.info(f"{batch_origin_info}")
            logging_loss += avg_loss
            # logging
            if global_step % logging_steps == 0 and global_step:
                viz.line([logging_loss], [global_step],
                         win=win,
                         update="append")
                logging_loss = 0

            # total_neg_cnt += batch_neg_cnt
            # total_pos_cnt += batch_pos_cnt
            # logger.info(f"batch_neg_cnt:{batch_neg_cnt}\n,\
            #             batch_pos_cnt ={batch_pos_cnt}\n,\
            #             total_neg_cnt={total_neg_cnt}\n,\
            #             total_pos_cnt={total_pos_cnt}")
            global_step += 1

            if global_step % evaluate_step == 0 and global_step:
                # run predictions on the dev set
                pred_file_path = os.path.splitext(args.dev_data_path)[0] \
                    + f"_roberta_{global_step}_object_predict.txt"
                if os.path.exists(pred_file_path):
                    os.remove(pred_file_path)

                f1 = evaluate(model_relation, dev_data_loader, tokenizer,
                              pred_file_path)
                if f1 > max_f1:  # keep the model with the best f1
                    save_path = f"{args.output_dir}/model_relation_{global_step}_roberta_f1_{f1}.pdparams"
                    t.save(model_relation.state_dict(), save_path)
                    max_f1 = f1

        # save the model after every epoch
        # t.save(model_relation.state_dict(),os.path.join(args.output_dir,
        #                 "model_relation_%d_roberta_epoch.pdparams" % (global_step)))
    logger.info("\n=====training complete=====")
Example 5
def do_train_2(model_subject_path, model_object_path, model_relation_path):
    # Does predictions.
    logger.info(
        "\n====================start predicting / evaluating ===================="
    )
    subject_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_subject = SubjectModel(subject_name_or_path,
                                 768,
                                 out_fea=subject_class_num)
    model_subject.load_state_dict(t.load(model_subject_path))
    model_subject = model_subject.cuda()

    object_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_object = ObjectModel(object_name_or_path, 768, object_class_num)
    model_object = model_object.cuda()
    model_object.load_state_dict(t.load(model_object_path))

    relation_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_relation = RelationModel(relation_name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    model_relation.load_state_dict(t.load(model_relation_path))
    tokenizer = BertTokenizerFast.from_pretrained(
        "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch")

    # Loads dataset.
    # TrainSubjectDataset is used here because the raw records are needed; the relation training data is built from them.
    logger.info(f"Preprocessing data, loaded from {args.train_data_path}")
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length, True)
    # DistributedBatchSampler (paddle) was replaced with DistributedSampler (torch).
    # With DistributedSampler the data would be loaded by multiple worker processes.
    # train_batch_sampler = DistributedSampler(
    #     train_dataset,
    #     shuffle=True,
    #     drop_last=True
    #     )
    collator = TrainSubjectDataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        #batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    model_subject.eval()
    viz = Visdom()
    win = "train_loss_negative_2"
    res = []  # final prediction results
    subject_invalid_num = 0  # number of samples where no subject was predicted
    all_known_subjects = get_all_subjects(
        train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")
    # freeze both models by disabling their gradients
    for param in model_subject.parameters():
        param.requires_grad = False
    for param in model_object.parameters():
        param.requires_grad = False
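    # Hedged note (assumption): since model_subject and model_object are only
    # used for inference here, the forward passes below could also be wrapped
    # in t.no_grad() and model_object switched to eval mode, which avoids
    # storing activations and keeps dropout in eval behaviour, e.g.
    # model_object.eval()
    # with t.no_grad():
    #     logits_1 = model_subject(...)
    #     logits_2 = model_object(...)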

    optimizer = t.optim.AdamW([
        {
            'params': model_relation.parameters(),
            'lr': 2e-5
        },
    ], )

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 5000
    step = 1
    logging_loss = 0
    for epoch in tqdm(range(args.num_train_epochs)):
        total_neg_cnt = 0
        total_pos_cnt = 0
        for batch in tqdm(train_data_loader):
            # origin_info is the raw JSON-formatted record
            input_ids, token_type_ids, attention_mask, batch_origin_info, batch_labels, offset_mapping = batch
            # labels size = [batch_size,max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask)
            #logits size [batch_size,max_seq_len,class_num]
            # decode the predicted subjects
            # temp = get_rid_of_number_in_str(origin_info[0]['text'])
            # origin_info[0]['text'] = temp
            batch_subjects, batch_subject_labels = decode_subject(
                logits_1, id2subject_map, input_ids, tokenizer,
                batch_origin_info, offset_mapping, all_known_subjects)

            # deduplicate the subjects
            # batch_subjects may be empty; ideally this case would be handled like the normal one
            if len(batch_subjects[0]) == 0:
                #print("----- no subject was predicted ----------")
                subject_invalid_num += 1
                continue

            # write the predicted subjects to a file
            object_invalid_num = 0
            # ====== build the training data for subtask 2 from origin_info ==========
            # object_input_ids here may be slightly larger than args.batch_size
            object_input_ids, object_token_type_ids,object_attention_mask,\
            object_labels,object_origin_info,object_offset_mapping = from_dict2object(batch_subjects=batch_subjects,
                                                        batch_origin_dict=batch_origin_info,
                                                        tokenizer=tokenizer,
                                                        max_length=args.max_seq_length,
                                                        )
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()

            logits_2 = model_object(input_ids=object_input_ids,
                                    token_type_ids=object_token_type_ids,
                                    attention_mask=object_attention_mask)
            batch_objects, batch_object_labels = decode_object(
                logits_2, id2object_map, tokenizer, object_input_ids,
                object_origin_info, object_offset_mapping, logger)

            relation_input_ids,relation_token_type_ids,relation_attention_mask,relation_labels,batch_neg_cnt,batch_pos_cnt \
                = get_negative_relation_data(batch_subjects,batch_objects,batch_origin_info,tokenizer,max_length=128)
            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            relation_labels = t.tensor(relation_labels).cuda()

            if relation_input_ids.size(0) < 1:
                continue
            logger.info(
                f"relation_input_ids.size(0) = {relation_input_ids.size(0)}")
            if relation_input_ids.size(0) > 32:
                out = model_relation(
                    input_ids=relation_input_ids[0:32, :],
                    token_type_ids=relation_token_type_ids[0:32, :],
                    attention_mask=relation_attention_mask[0:32, :],
                    labels=relation_labels[0:32])
                logger.info(f"{batch_origin_info}")
            else:
                # this model returns the loss directly
                out = model_relation(input_ids=relation_input_ids,
                                     token_type_ids=relation_token_type_ids,
                                     attention_mask=relation_attention_mask,
                                     labels=relation_labels)
            loss = out.loss
            loss.backward()
            optimizer.step()
            #lr_scheduler.step()
            optimizer.zero_grad()

            if relation_input_ids.size(0) > 32:
                avg_loss = loss.item() / 32
            else:
                avg_loss = loss.item() / relation_input_ids.size(0)
            logger.info(f"average loss per sample: avg_loss = {avg_loss}")
            if avg_loss > 2:  # pay extra attention to samples with such a large loss
                logger.info(f"{batch_origin_info}")
            logging_loss += avg_loss
            # logging
            if global_step % logging_steps == 0 and global_step:
                viz.line([logging_loss], [global_step],
                         win=win,
                         update="append")
                logging_loss = 0

            # save the model
            if global_step % save_steps == 0 and global_step != 0:
                logger.info(
                    f"saving checkpoint model_relation_{513882+global_step}.pdparams to {args.output_dir}"
                )
                cur_model_name = os.path.join(
                    args.output_dir, "model_relation_%d_roberta.pdparams" %
                    (513882 + global_step))
                t.save(model_relation.state_dict(), cur_model_name)

            total_neg_cnt += batch_neg_cnt
            total_pos_cnt += batch_pos_cnt
            logger.info(f"batch_neg_cnt:{batch_neg_cnt}\n,\
                        batch_pos_cnt ={batch_pos_cnt}\n,\
                        total_neg_cnt={total_neg_cnt}\n,\
                        total_pos_cnt={total_pos_cnt}")
            step += 1
            global_step += 1

        # save the model after every epoch
        t.save(
            model_relation.state_dict(),
            os.path.join(
                args.output_dir, "model_relation_%d_roberta_epoch.pdparams" %
                (513882 + global_step)))
    logger.info("\n=====training complete=====")
Example 6
def do_train():
    # ========================== subtask 3: predict the relation ==========================
    name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_relation = RelationModel(name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    tokenizer = BertTokenizerFast.from_pretrained(
        "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch")

    # Loads dataset.
    # TrainSubjectDataset is used here because the raw records are needed; the relation training data is built from them.
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length, True)

    collator = TrainSubjectDataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        #batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    dev_dataset = TrainSubjectDataset.from_file(args.dev_data_path, tokenizer,
                                                args.max_seq_length, True)

    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        #batch_sampler=dev_batch_sampler,
        collate_fn=collator,
    )

    # The parameters of all the models would need to be merged here.
    optimizer = t.optim.AdamW([{
        'params': model_relation.parameters(),
        'lr': 2e-5
    }],
                              #weight_decay=args.weight_decay,
                              )

    # Defines learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='min')

    # Starts training.
    global_step = 0
    logging_steps = 100
    tic_train = time.time()
    viz = Visdom()
    win = "train_loss"
    for epoch in tqdm(range(args.num_train_epochs)):
        print("\n=====start training of %d epochs=====" % epoch)
        tic_epoch = time.time()
        # switch to training mode
        model_relation.train()  # predict the relation given subject + object
        logger_loss = 0
        step = 1
        for batch in tqdm(train_data_loader):
            batch_input_ids, batch_token_type_ids, batch_attention_mask, batch_origin_info, batch_labels, batch_offset_mapping = batch

            # ====== build the training data for subtask 3 from origin_info ==========
            # predict the relation given subject + object
            relation_input_ids, relation_token_type_ids, relation_attention_mask, relation_labels = from_dict2_relation(
                batch_subjects=None,
                batch_objects=None,
                batch_origin_info=batch_origin_info,
                tokenizer=tokenizer,
                max_length=args.max_seq_length)

            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            relation_labels = t.tensor(relation_labels).cuda()

            # this model returns the loss directly
            out = model_relation(input_ids=relation_input_ids,
                                 token_type_ids=relation_token_type_ids,
                                 attention_mask=relation_attention_mask,
                                 labels=relation_labels)
            loss = out.loss
            loss.backward()
            optimizer.step()
            #lr_scheduler.step()
            optimizer.zero_grad()
            loss_item = loss.item()
            logger_loss += loss_item
            logger.info(f"loss:{loss_item}")

            # logging
            if global_step % logging_steps == 0 and global_step:
                logger.info(
                    f"epoch:{epoch}/{args.num_train_epochs},  steps:{step}/{steps_by_epoch},   loss:{loss_item},  speed: {logging_steps / (time.time() - tic_train)} step/s"
                )
                tic_train = time.time()
                viz.line([logger_loss], [global_step],
                         win=win,
                         update="append")
                logger_loss = 0

            step += 1
            global_step += 1
        # evaluate the model on the dev set
        evaluate(model_relation, dev_data_loader, tokenizer, pred_file_path)
        # save the model after each epoch
        logger.info(
            f"saving checkpoing model_relation_{global_step}.pdparams to {args.output_dir}"
        )
        cur_model_name = os.path.join(
            args.output_dir,
            "model_relation_%d_roberta.pdparams" % (global_step))
        t.save(model_relation.state_dict(), cur_model_name)
    logger.info("\n=====training complete=====")