Example #1
def predict(model, test_dataloader, params, mode):
    """
    Predict and write the results to a file.
    Args:
        mode (str): 'val' or 'test'
    """
    model.eval()
    # init: collect result rows and build the DataFrame once at the end
    pre_rows = []

    # idx to label
    cate_idx2label = {idx: value for idx, value in enumerate(params.tag_list)}

    # get data
    for batch in tqdm(test_dataloader, unit='Batch', ascii=True):
        # to device
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, segment_ids, _, _, ne_cate, split_to_ori, example_ids = batch

        # inference
        with torch.no_grad():
            start_pre, end_pre = model(input_ids, segment_ids, input_mask)

        # predict label
        start_label = start_pre.detach().cpu().numpy().tolist()
        end_label = end_pre.detach().cpu().numpy().tolist()
        # mask
        input_mask = input_mask.to("cpu").detach().numpy().tolist()
        ne_cate = ne_cate.to("cpu").numpy().tolist()
        split_to_ori = split_to_ori.to('cpu').numpy().tolist()  # (bs, max_len)
        example_ids = example_ids.to('cpu').numpy().tolist()  # (bs,)

        # get result
        for start_p, end_p, input_mask_s, ne_cate_s, s_t_o, example_id in zip(
                start_label, end_label, input_mask, ne_cate, split_to_ori,
                example_ids):
            ne_cate_str = cate_idx2label[ne_cate_s]
            # query length
            q_len = len(EN2QUERY[ne_cate_str])
            # valid (non-padding) length
            act_len = sum(input_mask_s[q_len + 2:-1])
            # convert pointer labels to BIO tags
            pre_bio_labels = pointer2bio(start_p[q_len + 2:q_len + 2 +
                                                 act_len],
                                         end_p[q_len + 2:q_len + 2 + act_len],
                                         en_cate=ne_cate_str)
            # collect row (per-row DataFrame.append is slow and was
            # removed in pandas 2.x, so rows are gathered in a list)
            pre_rows.append({
                'example_id': int(example_id),
                'tags': pre_bio_labels,
                'split_to_ori': s_t_o[q_len + 2:q_len + 2 + act_len]
            })

    pre_result = pd.DataFrame(pre_rows)
    pre_result.to_csv(path_or_buf=params.params_path / f'{mode}_tags_pre.csv',
                      encoding='utf-8',
                      index=False)
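
All of the examples below rely on a pointer2bio helper that is imported elsewhere and not shown. A minimal sketch of what such a helper might look like, assuming binary start/end pointer labels and one entity category per call (the real implementations in these repositories may differ):

def pointer2bio(start_labels, end_labels, ne_cate):
    # Hypothetical reimplementation for illustration only.
    # Pairs each start pointer with the next end pointer and emits
    # B-/I- tags for that span; every other position stays 'O'.
    bio_labels = ['O'] * len(start_labels)
    idx = 0
    while idx < len(start_labels):
        if start_labels[idx] == 1:
            end = idx
            while end < len(end_labels) and end_labels[end] != 1:
                end += 1
            if end < len(end_labels):
                bio_labels[idx] = 'B-{}'.format(ne_cate)
                for j in range(idx + 1, end + 1):
                    bio_labels[j] = 'I-{}'.format(ne_cate)
                idx = end + 1
                continue
        idx += 1
    return bio_labels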
Example #2
def predict(model, test_dataloader, params, mode):
    """预测并将结果输出至文件
    :param mode: 'val' or 'test'
    """
    model.eval()
    # init
    pre_result = []
    cate_result = []
    mask_lst = []

    # idx to label
    cate_idx2label = {
        idx: value
        for idx, value in enumerate(params.label_list)
    }

    # get data
    for batch in tqdm(test_dataloader, unit='Batch'):
        # to device
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, segment_ids, _, _, ne_cate = batch
        # inference
        with torch.no_grad():
            start_logits, end_logits = model(input_ids,
                                             token_type_ids=segment_ids,
                                             attention_mask=input_mask)

        # predict label
        start_label = start_logits.detach().cpu().numpy().tolist()
        end_label = end_logits.detach().cpu().numpy().tolist()
        # mask
        input_mask = input_mask.to("cpu").detach().numpy().tolist()
        ne_cate = ne_cate.to("cpu").numpy().tolist()

        # get result
        for start_p, end_p, ne_cate_s in zip(start_label, end_label, ne_cate):
            ne_cate_str = cate_idx2label[ne_cate_s]
            pre_bio_labels = pointer2bio(start_p, end_p, ne_cate=ne_cate_str)
            pre_result.append(pre_bio_labels)
            cate_result.append(ne_cate_str)

        # save mask
        mask_lst += input_mask

    # write to file
    with open(params.data_dir / f'{mode}_tags_pre.txt', 'w',
              encoding='utf-8') as file_tags:
        for cate, tag, mask in zip(cate_result, pre_result, mask_lst):
            # query length
            q_len = len(IO2QUERY[cate])
            # valid (non-padding) length
            act_len = sum(mask[q_len + 2:-1])
            # predicted tags for the original text tokens
            file_tags.write('{}\n'.format(' '.join(tag[q_len + 2:q_len + 2 +
                                                       act_len])))
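
The q_len + 2 offset used in these examples assumes an MRC-style input of the form [CLS] query [SEP] text [SEP], where the query comes from the EN2QUERY/IO2QUERY mapping for the entity category. A small sketch of that assumption (token values are made up for illustration):

query = ['who', 'is', 'mentioned']            # hypothetical query for one category
text = ['john', 'lives', 'in', 'paris']       # hypothetical original text tokens
tokens = ['[CLS]'] + query + ['[SEP]'] + text + ['[SEP]']
q_len = len(query)
# the original text starts after [CLS] + query + [SEP], i.e. at offset q_len + 2
assert tokens[q_len + 2:q_len + 2 + len(text)] == text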
Example #3
def predict(model, test_dataloader, params, mode):
    """预测并将结果输出至文件
    :param mode: 'val' or 'test'
    """
    model.eval()
    # init
    pre_result = []

    # idx to label
    cate_idx2label = {
        idx: value
        for idx, value in enumerate(params.label_list)
    }

    # get data
    for input_ids, input_mask, segment_ids, start_pos, end_pos, ner_cate in test_dataloader:
        # to device
        input_ids = input_ids.to(params.device)
        input_mask = input_mask.to(params.device)
        segment_ids = segment_ids.to(params.device)

        # inference
        with torch.no_grad():
            start_logits, end_logits = model(input_ids, segment_ids,
                                             input_mask)

        # predict label
        start_label = start_logits.detach().cpu().numpy().tolist()
        end_label = end_logits.detach().cpu().numpy().tolist()
        # mask
        input_mask = input_mask.to("cpu").detach().numpy().tolist()
        ner_cate = ner_cate.to("cpu").numpy().tolist()

        # get result
        for start_p, end_p, input_mask_s, ner_cate_s in zip(
                start_label, end_label, input_mask, ner_cate):
            ner_cate_str = cate_idx2label[ner_cate_s]
            # query length
            q_len = len(IO2QUERY[ner_cate_str])
            # valid (non-padding) length
            act_len = sum(input_mask_s[q_len + 2:-1])
            # convert pointer labels to BIO tags
            pre_bio_labels = pointer2bio(start_p[q_len + 2:q_len + 2 +
                                                 act_len],
                                         end_p[q_len + 2:q_len + 2 + act_len],
                                         ne_cate=ner_cate_str)

            pre_result.append(pre_bio_labels)

    # write to file
    with open(params.data_dir / f'{mode}_tags_pre.txt', 'w',
              encoding='utf-8') as file_tags:
        for tag in pre_result:
            # predicted tags
            file_tags.write('{}\n'.format(' '.join(tag)))
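
A hypothetical call site for this example (the surrounding setup code differs per repository; only the attribute and argument names come from the example itself):

# run prediction on the test split and write one space-separated BIO
# sequence per line to params.data_dir / 'test_tags_pre.txt'
predict(model, test_dataloader, params, mode='test')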
Example #4
def predict(model, test_dataloader, params, mode):
    """预测并将结果输出至文件
    :param mode: 'val' or 'test'
    """
    model.eval()
    # init
    pre_result = []
    mask_lst = []

    # idx to label
    cate_idx2label = {
        idx: int(idx + 1)
        for idx, _ in enumerate(params.label_list)
    }

    # get data
    for batch in test_dataloader:
        # to device
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, start_pos, end_pos = batch
        # inference
        with torch.no_grad():
            start_logits, end_logits = model(input_ids,
                                             attention_mask=input_mask)

        # predict label
        start_label = start_logits.detach().cpu().numpy().transpose(
            (0, 2, 1)).tolist()
        end_label = end_logits.detach().cpu().numpy().transpose(
            (0, 2, 1)).tolist()
        # mask
        input_mask = input_mask.to("cpu").detach().numpy().tolist()

        # get result
        for start_p_s, end_p_s, input_mask_s in zip(start_label, end_label,
                                                    input_mask):
            # valid (non-padding) length
            act_len = sum(input_mask_s)
            for idx, (start_p, end_p) in enumerate(zip(start_p_s, end_p_s)):
                pre_bio_labels = pointer2bio(start_p[:act_len],
                                             end_p[:act_len],
                                             ne_cate=cate_idx2label[idx])
                pre_result.append(pre_bio_labels)

    # write to file
    with open(params.data_dir / f'{mode}_tags_pre.txt', 'w',
              encoding='utf-8') as file_tags:
        for idx, tag in enumerate(pre_result):
            # predicted tags
            file_tags.write('{}\n'.format(' '.join(tag)))
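
Example #4 decodes one pointer sequence per label, so the logits are transposed from (batch, seq_len, num_labels) to (batch, num_labels, seq_len) before iterating over label channels. A small shape check with made-up sizes:

import numpy as np

batch, seq_len, num_labels = 2, 8, 3
start_logits = np.zeros((batch, seq_len, num_labels))
per_label = start_logits.transpose((0, 2, 1))
# each per_label[i][k] is now one length-seq_len pointer sequence for label k
assert per_label.shape == (batch, num_labels, seq_len)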
Example #5
def evaluate(args, model, eval_dataloader, params):
    model.eval()
    # track the running average loss
    loss_avg = utils.RunningAverage()
    # init
    pre_result = []
    gold_result = []

    # get data
    for batch in tqdm(eval_dataloader, unit='Batch'):
        # to device
        batch = tuple(t.to(params.device) for t in batch)
        input_ids, input_mask, segment_ids, start_pos, end_pos, ne_cate = batch

        with torch.no_grad():
            # get loss
            loss = model(input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         start_positions=start_pos,
                         end_positions=end_pos)
            if params.n_gpu > 1 and args.multi_gpu:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # update the average loss
            loss_avg.update(loss.item())

            # inference
            start_logits, end_logits = model(input_ids=input_ids,
                                             token_type_ids=segment_ids,
                                             attention_mask=input_mask)

        # gold label
        start_pos = start_pos.to("cpu").numpy().tolist()
        end_pos = end_pos.to("cpu").numpy().tolist()
        input_mask = input_mask.to('cpu').numpy().tolist()
        ne_cate = ne_cate.to("cpu").numpy().tolist()

        # predict label
        start_label = start_logits.detach().cpu().numpy().tolist()
        end_label = end_logits.detach().cpu().numpy().tolist()

        # idx to label
        cate_idx2label = {
            idx: value
            for idx, value in enumerate(params.label_list)
        }

        # get bio result
        for start_p, end_p, start_g, end_g, input_mask_s, ne_cate_s in zip(
                start_label, end_label, start_pos, end_pos, input_mask,
                ne_cate):
            ne_cate_str = cate_idx2label[ne_cate_s]
            # query length
            q_len = len(IO2QUERY[ne_cate_str])
            # valid (non-padding) length
            act_len = sum(input_mask_s[q_len + 2:-1])
            # get BIO labels
            pre_bio_labels = pointer2bio(start_p[q_len + 2:q_len + 2 +
                                                 act_len],
                                         end_p[q_len + 2:q_len + 2 + act_len],
                                         ne_cate=ne_cate_str)
            gold_bio_labels = pointer2bio(start_g[q_len + 2:q_len + 2 +
                                                  act_len],
                                          end_g[q_len + 2:q_len + 2 + act_len],
                                          ne_cate=ne_cate_str)
            pre_result.append(pre_bio_labels)
            gold_result.append(gold_bio_labels)

    # metrics
    f1 = f1_score(y_true=gold_result, y_pred=pre_result)
    acc = accuracy_score(y_true=gold_result, y_pred=pre_result)

    # f1, acc
    metrics = {'loss': loss_avg(), 'f1': f1, 'acc': acc}
    metrics_str = "; ".join("{}: {:05.2f}".format(k, v)
                            for k, v in metrics.items())
    logging.info("- {} metrics: ".format('Val') + metrics_str)
    # f1 classification report
    report = classification_report(y_true=gold_result, y_pred=pre_result)
    logging.info(report)

    return metrics
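
The f1_score, accuracy_score and classification_report calls in Example #5 operate on lists of BIO tag sequences, which matches the seqeval API; the imports are not shown, so treating them as seqeval is an assumption. A minimal usage sketch:

from seqeval.metrics import accuracy_score, classification_report, f1_score

gold = [['B-PER', 'I-PER', 'O', 'B-LOC']]
pred = [['B-PER', 'I-PER', 'O', 'O']]
print(f1_score(y_true=gold, y_pred=pred))          # entity-level F1
print(accuracy_score(y_true=gold, y_pred=pred))    # token-level accuracy
print(classification_report(y_true=gold, y_pred=pred))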