Example #1
object_class_num = len(object_map.keys())  # number of object classes

# Reads object_map.
# id2object_1.json uses a finer-grained version of the labels
id2object_map_path = os.path.join(args.data_path, "id2object_1.json")
if not (os.path.exists(id2object_map_path)
        and os.path.isfile(id2object_map_path)):
    sys.exit("{} does not exist or is not a file.".format(id2object_map_path))
with open(id2object_map_path, 'r', encoding='utf8') as fp:
    id2object_map = json.load(fp)

roberta_base_name = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
roberta_large_name = "/pretrains/pt/clue-roberta-chinese-clue-large"
model_object = ObjectModel(roberta_base_name, 768, object_class_num)
if args.init_checkpoint is not None and os.path.exists(args.init_checkpoint):
    model_object.load_state_dict(t.load(args.init_checkpoint))
model_object = model_object.cuda()
criterion = nn.CrossEntropyLoss()  # use cross entropy to compute the loss
tokenizer = BertTokenizerFast.from_pretrained(
    "/home/lawson/pretrain/bert-base-chinese")
collator = TrainSubjectDataCollator()


def set_random_seed(seed):
    """sets random seed"""
    random.seed(seed)
    np.random.seed(seed)
    #t.seed(seed)  # why does torch also need this seed?
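    # Note: t.seed() takes no argument in PyTorch; for a reproducible run one
    # would typically call t.manual_seed(seed) (and t.cuda.manual_seed_all(seed)
    # on GPU) in addition to seeding random and numpy.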


import time
Example #2
def predict_subject_object(model_subject_path,model_object_path):
    # Does predictions.
    print("\n====================start predicting / evaluating ====================")
    #name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    subject_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_subject = SubjectModel(subject_name_or_path,768,out_fea=subject_class_num)
    model_subject.load_state_dict(t.load(model_subject_path))
    model_subject = model_subject.cuda()


    object_name_or_path = "/home/lawson/pretrain/bert-base-chinese"        
    model_object = ObjectModel(object_name_or_path,768,object_class_num)
    model_object = model_object.cuda()
    model_object.load_state_dict(t.load(model_object_path))
    
    tokenizer = BertTokenizerFast.from_pretrained("/home/lawson/pretrain/bert-base-chinese")    
    # Loads dataset.
    dev_dataset = PredictSubjectDataset.from_file(                
        args.dev_data_path,
        tokenizer,
        args.max_seq_length,
        True
        )
    
    collator = PredictSubjectDataCollator()
    dev_data_loader = DataLoader(        
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=collator, # custom collator
        )

    model_subject.eval()
    model_object.eval()
    all_known_subjects = get_all_subjects("/home/lawson/program/DuIE_py/data/train_data.json")
    res = [] # final prediction results
    subject_invalid_num = 0 # number of subjects that failed to be predicted
    temp = (args.dev_data_path).split("/")[-1].split('.')[0]
    subject_object_predict_file =  f"./{temp}_subject_object_predict.txt"
    if os.path.exists(subject_object_predict_file):
        os.remove(subject_object_predict_file)
    with t.no_grad():        
        for batch in tqdm(dev_data_loader):
            # origin_info is the original information in JSON format
            input_ids,token_type_ids,attention_mask, batch_origin_info,offset_mapping = batch
            # labels size = [batch_size,max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                    token_type_ids=token_type_ids,
                                    attention_mask=attention_mask
                                    )
            #logits size [batch_size,max_seq_len,class_num]  
            # get the predicted subjects
            # temp = get_rid_of_number_in_str(origin_info[0]['text'])
            # origin_info[0]['text'] = temp
            batch_subjects, batch_subject_labels = decode_subject(
                logits_1,
                id2subject_map,
                input_ids,
                tokenizer,
                batch_origin_info,
                offset_mapping,
                all_known_subjects)
                        
            
            logger.info("\n====================start predicting object ====================")                    
            # write the subject predictions to a file
            object_invalid_num = 0
            # ====== build the subtask 2 training data from origin_info ==========
            # the size of object_input_ids here is no longer args.batch_size; it may be slightly larger
            object_input_ids, object_token_type_ids,object_attention_mask,\
            object_labels,object_origin_info,object_offset_mapping = from_dict2object(batch_subjects=batch_subjects,
                                                        batch_origin_dict=batch_origin_info,
                                                        tokenizer=tokenizer,
                                                        max_length=args.max_seq_length,
                                                        )
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()
            
            logits_2 = model_object(input_ids = object_input_ids,
                                    token_type_ids=object_token_type_ids,
                                    attention_mask=object_attention_mask
                                    )
            batch_objects, batch_object_labels = decode_object(
                logits_2,
                id2object_map,
                tokenizer,
                object_input_ids,
                object_origin_info,
                object_offset_mapping,
                logger
            )

            # visualize the subject + object predictions
            visualize_subject_object(subject_object_predict_file,batch_subjects,batch_objects)            
    
    # evaluation
    cal_subject_object_metric(subject_object_predict_file,args.dev_data_path)
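

# A minimal invocation sketch for the two-stage pipeline above; the checkpoint
# paths are hypothetical and args is assumed to have been parsed elsewhere.
if __name__ == "__main__":
    predict_subject_object(
        model_subject_path="./checkpoints/model_subject.pdparams",
        model_object_path="./checkpoints/model_object.pdparams")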
Example #3
def do_predict(model_subject_path,model_object_path,model_relation_path):
    # Does predictions.
    logger.info("\n====================start predicting====================")
    logger.info("\n===============本次运行,参数配置如下:================")
    for k,v in (vars(args).items()):
        logger.info(f"{k,v}")
    bert_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    roberta_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"    
    model_subject = SubjectModel(bert_name_or_path,768,out_fea=subject_class_num)
    #model_subject = SubjectModel(roberta_name_or_path,768,out_fea=subject_class_num)
    model_subject.load_state_dict(t.load(model_subject_path))
    model_subject = model_subject.cuda()    

    model_object = ObjectModel(bert_name_or_path,768,object_class_num)
    model_object = model_object.cuda()
    model_object.load_state_dict(t.load(model_object_path))

    model_relation = RelationModel(roberta_name_or_path,relation_class_num)
    model_relation = model_relation.cuda()
    model_relation.load_state_dict(t.load(model_relation_path))

    tokenizer = BertTokenizerFast.from_pretrained("/home/lawson/pretrain/bert-base-chinese")
    #predict_file_path = os.path.join(args.data_path, 'train_data_2_predict.json') 
    
    #subject_name = model_subject_path.    
    
    dev_data_path = (args.dev_data_path).split("/")[-1].split(".")[0]
    
    a = (args.model_relation_path).split("/")
    a = "_".join(a[-2::])
    a = a.split(".")[0]
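    # e.g. a (hypothetical) model_relation_path of ".../checkpoints/model_relation_600000.pdparams"
    # yields a == "checkpoints_model_relation_600000"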

    predict_file_path = os.path.join(args.data_path, dev_data_path)+f"_predict_{a}.json"
    batch_file_path = f"/home/lawson/program/DuIE_py/data/{dev_data_path}_subject_object_relation_{a}.txt"
    # Loads dataset.
    dev_dataset = PredictSubjectDataset.from_file(
        #os.path.join(args.data_path, 'train_data_2.json'),
        args.dev_data_path,
        tokenizer,
        args.max_seq_length,
        True
        )
    
    collator = PredictSubjectDataCollator()
    dev_data_loader = DataLoader(        
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=collator, # 重写一个 collator
        )

    model_subject.eval()
    model_object.eval()
    model_relation.eval()

    all_known_subjects = get_all_subjects(train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")
    # write the subject predictions to a file
    all_country = get_all_country(train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")
    if os.path.exists(batch_file_path):
        logger.info("存在文件subject_object_relation.txt,请处理")
        sys.exit(0)
        
    res = [] # final prediction results
    invalid_num = 0 # number of failed predictions
    with t.no_grad():
        for batch in tqdm(dev_data_loader):
            # origin_info is the original information in JSON format
            input_ids,token_type_ids,attention_mask, batch_origin_info,offset_mapping = batch
            # labels size = [batch_size,max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                    token_type_ids=token_type_ids,
                                    attention_mask=attention_mask
                                    )
            #logits size [batch_size,max_seq_len,class_num]  
            # get the predicted subjects
            # temp = get_rid_of_number_in_str(origin_info[0]['text'])
            # origin_info[0]['text'] = temp
            batch_subjects, batch_subject_labels = decode_subject(
                logits_1,
                id2subject_map,
                input_ids,
                tokenizer,
                batch_origin_info,
                offset_mapping,
                all_known_subjects)
                        
            # write the subject predictions to a file
            object_invalid_num = 0
            # ====== build the subtask 2 training data from origin_info ==========
            # the size of object_input_ids here is no longer args.batch_size; it may be slightly larger
            object_input_ids, object_token_type_ids,object_attention_mask,\
            object_labels,object_origin_info,object_offset_mapping = from_dict2object(batch_subjects=batch_subjects,
                                                        batch_origin_dict=batch_origin_info,
                                                        tokenizer=tokenizer,
                                                        max_length=args.max_seq_length,
                                                        )
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()
            
            logits_2 = model_object(input_ids = object_input_ids,
                                    token_type_ids=object_token_type_ids,
                                    attention_mask=object_attention_mask
                                    )
            batch_objects, batch_object_labels = decode_object(
                logits_2,
                id2object_map,
                tokenizer,
                object_input_ids,
                object_origin_info,
                object_offset_mapping,
                logger
            )

            if(len(batch_objects[0]) == 0):
                invalid_num+=1
                #print("----- 未预测到 object ----------")        
                continue
            # ====== build the subtask 3 test data from subject + object ==========
            relation_input_ids, relation_token_type_ids,\
            relation_attention_mask, relation_labels = from_dict2_relation(batch_subjects,
                                                                               batch_objects,
                                                                               batch_origin_info,
                                                                               tokenizer,
                                                                               args.max_seq_length
                                                                               )
            
            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()        
            if relation_input_ids.size(0) == 0:
                continue
            
            # this model returns the loss directly
            out = model_relation(input_ids=relation_input_ids,
                                    token_type_ids=relation_token_type_ids,
                                    attention_mask=relation_attention_mask,
                                    labels = None                                
                                    )
            logits = out.logits # the final classification scores
            # size [batch_size, relation_class_num]

            batch_relations = decode_relation_class(logits,id2relation_map)

            # batch_relations = add_relation_of_country(batch_subjects,batch_subject_labels,
            # batch_objects,batch_object_labels,batch_relations,batch_origin_info)

            # get the final results
            cur_res = post_process_2(batch_subjects, # 5                        
                        batch_objects, # 5                        
                        batch_relations,
                        batch_origin_info
            )
            res.extend(cur_res)

            # write out the results of the three stages separately
            with open(batch_file_path,'a') as f:
                a = str(batch_subjects)
                b = str(batch_objects)
                c = str(batch_relations)
                f.write(a+"\n")
                f.write(b+"\n")
                f.write(c+"\n")
                f.write("\n")

    # write out the final predictions
    with open(predict_file_path,"w",encoding="utf-8") as f:
        for line in res:        
            json_str = json.dumps(line,ensure_ascii=False)                        
            #print(json_str)
            f.write(json_str)
            f.write('\n')

    logger.info(f"未预测到的个数是:{invalid_num}")
    logger.info("=====predicting complete=====")
Example #4
def do_train_2(model_subject_path, model_object_path, model_relation_path):
    # Trains the relation model while the subject and object models stay frozen.
    logger.info(
        "\n====================start training the relation model ===================="
    )
    subject_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_subject = SubjectModel(subject_name_or_path,
                                 768,
                                 out_fea=subject_class_num)
    model_subject.load_state_dict(t.load(model_subject_path))
    model_subject = model_subject.cuda()

    object_name_or_path = "/home/lawson/pretrain/bert-base-chinese"
    model_object = ObjectModel(object_name_or_path, 768, object_class_num)
    model_object = model_object.cuda()
    model_object.load_state_dict(t.load(model_object_path))

    relation_name_or_path = "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch"
    model_relation = RelationModel(relation_name_or_path, relation_class_num)
    model_relation = model_relation.cuda()
    model_relation.load_state_dict(t.load(model_relation_path))
    tokenizer = BertTokenizerFast.from_pretrained(
        "/pretrains/pt/chinese_RoBERTa-wwm-ext_pytorch")

    # Loads dataset.
    # TrainSubjectDataset is used here because the raw data has to be loaded; the relation training data can only be built from the raw data
    logger.info(f"Preprocessing data, loaded from {args.train_data_path}")
    train_dataset = TrainSubjectDataset.from_file(args.train_data_path,
                                                  tokenizer,
                                                  args.max_seq_length, True)
    # DistributedBatchSampler (paddle) was replaced with DistributedSampler (torch) here
    # with DistributedSampler, the data would be loaded by multiple processes
    # train_batch_sampler = DistributedSampler(
    #     train_dataset,
    #     shuffle=True,
    #     drop_last=True
    #     )
    collator = TrainSubjectDataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        #batch_sampler=train_batch_sampler,
        batch_size=args.batch_size,
        collate_fn=collator,  # custom collator
    )

    model_subject.eval()
    viz = Visdom()
    win = "train_loss_negative_2"
    res = []  # final prediction results
    subject_invalid_num = 0  # number of subjects that failed to be predicted
    all_known_subjects = get_all_subjects(
        train_data_path="/home/lawson/program/DuIE_py/data/train_data.json")
    # freeze the gradients of both models
    for param in model_subject.parameters():
        param.requires_grad = False
    for param in model_object.parameters():
        param.requires_grad = False

    optimizer = t.optim.AdamW([
        {
            'params': model_relation.parameters(),
            'lr': 2e-5
        },
    ], )
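    # Only model_relation's parameters are handed to the optimizer; the frozen
    # subject and object models are used purely for inference in the loop below.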

    # Starts training.
    global_step = 0
    logging_steps = 50
    save_steps = 5000
    step = 1
    logging_loss = 0
    for epoch in tqdm(range(args.num_train_epochs)):
        total_neg_cnt = 0
        total_pos_cnt = 0
        for batch in tqdm(train_data_loader):
            # origin_info is the original information in JSON format
            input_ids, token_type_ids, attention_mask, batch_origin_info, batch_labels, offset_mapping = batch
            # labels size = [batch_size,max_seq_length]
            logits_1 = model_subject(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask)
            #logits size [batch_size,max_seq_len,class_num]
            # get the predicted subjects
            # temp = get_rid_of_number_in_str(origin_info[0]['text'])
            # origin_info[0]['text'] = temp
            batch_subjects, batch_subject_labels = decode_subject(
                logits_1, id2subject_map, input_ids, tokenizer,
                batch_origin_info, offset_mapping, all_known_subjects)

            # deduplicate the elements in subjects
            # batch_subjects may be empty and needs checking; ideally handle it the same way as ordinary subjects
            if (len(batch_subjects[0]) == 0):
                #print("----- 未预测到subject ----------")
                subject_invalid_num += 1
                continue

            # write the subject predictions to a file
            object_invalid_num = 0
            # ====== build the subtask 2 training data from origin_info ==========
            # the size of object_input_ids here is no longer args.batch_size; it may be slightly larger
            object_input_ids, object_token_type_ids,object_attention_mask,\
            object_labels,object_origin_info,object_offset_mapping = from_dict2object(batch_subjects=batch_subjects,
                                                        batch_origin_dict=batch_origin_info,
                                                        tokenizer=tokenizer,
                                                        max_length=args.max_seq_length,
                                                        )
            object_input_ids = t.tensor(object_input_ids).cuda()
            object_token_type_ids = t.tensor(object_token_type_ids).cuda()
            object_attention_mask = t.tensor(object_attention_mask).cuda()

            logits_2 = model_object(input_ids=object_input_ids,
                                    token_type_ids=object_token_type_ids,
                                    attention_mask=object_attention_mask)
            batch_objects, batch_object_labels = decode_object(
                logits_2, id2object_map, tokenizer, object_input_ids,
                object_origin_info, object_offset_mapping, logger)

            relation_input_ids,relation_token_type_ids,relation_attention_mask,relation_labels,batch_neg_cnt,batch_pos_cnt \
                = get_negative_relation_data(batch_subjects,batch_objects,batch_origin_info,tokenizer,max_length=128)
            relation_input_ids = t.tensor(relation_input_ids).cuda()
            relation_token_type_ids = t.tensor(relation_token_type_ids).cuda()
            relation_attention_mask = t.tensor(relation_attention_mask).cuda()
            relation_labels = t.tensor(relation_labels).cuda()

            if relation_input_ids.size(0) < 1:
                continue
            logger.info(
                f"relation_input_ids.size(0) = {relation_input_ids.size(0)}")
            if relation_input_ids.size(0) > 32:
                out = model_relation(
                    input_ids=relation_input_ids[0:32, :],
                    token_type_ids=relation_token_type_ids[0:32, :],
                    attention_mask=relation_attention_mask[0:32, :],
                    labels=relation_labels[0:32])
                logger.info(f"{batch_origin_info}")
            else:
                # this model returns the loss directly
                out = model_relation(input_ids=relation_input_ids,
                                     token_type_ids=relation_token_type_ids,
                                     attention_mask=relation_attention_mask,
                                     labels=relation_labels)
            loss = out.loss
            loss.backward()
            optimizer.step()
            #lr_scheduler.step()
            optimizer.zero_grad()

            if relation_input_ids.size(0) > 32:
                avg_loss = loss.item() / 32
            else:
                avg_loss = loss.item() / relation_input_ids.size(0)
            logger.info(f"平均每个样本的损失时:avg_loss = {avg_loss}")
            if avg_loss > 2:  # 重点关注一下这种损失的数据
                logger.info(f"{batch_origin_info}")
            logging_loss += avg_loss
            # logging
            if global_step % logging_steps == 0 and global_step:
                viz.line([logging_loss], [global_step],
                         win=win,
                         update="append")
                logging_loss = 0

            # save the model checkpoint
            if global_step % save_steps == 0 and global_step != 0:
                logger.info(
                    f"saving checkpoing model_relation_{513882+global_step}.pdparams to {args.output_dir}"
                )
                cur_model_name = os.path.join(
                    args.output_dir, "model_relation_%d_roberta.pdparams" %
                    (513882 + global_step))
                t.save(model_relation.state_dict(), cur_model_name)

            total_neg_cnt += batch_neg_cnt
            total_pos_cnt += batch_pos_cnt
            logger.info(f"batch_neg_cnt:{batch_neg_cnt}\n,\
                        batch_pos_cnt ={batch_pos_cnt}\n,\
                        total_neg_cnt={total_neg_cnt}\n,\
                        total_pos_cnt={total_pos_cnt}")
            step += 1
            global_step += 1

        # save the model after each epoch
        t.save(
            model_relation.state_dict(),
            os.path.join(
                args.output_dir, "model_relation_%d_roberta_epoch.pdparams" %
                (513882 + global_step)))
    logger.info("\n=====training complete=====")