Example #1
def evaluate_model(config: Config, model: BertCRF, batch_insts_ids, name: str,
                   insts: List[Instance]):
    # evaluation
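    # metrics accumulates [#correct entities, #predicted entities, #gold entities] over all batches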
    metrics = np.asarray([0, 0, 0], dtype=int)
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                batch_size]

        input_ids, input_seq_lens, annotation_mask, labels = batch
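        # token id 0 is taken to be the padding id, so the attention mask flags real tokens only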
        input_masks = input_ids.gt(0)
        # get the prediction results
        batch_max_scores, batch_max_ids = model(
            input_ids,
            input_seq_lens=input_seq_lens,
            annotation_mask=annotation_mask,
            labels=None,
            attention_mask=input_masks)

        metrics += evaluate_batch_insts(batch_insts=one_batch_insts,
                                        batch_pred_ids=batch_max_ids,
                                        batch_gold_ids=batch[-1],
                                        word_seq_lens=batch[1],
                                        idx2label=config.idx2labels)
        batch_id += 1
    # calculate the precision, recall and f1 score
    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (
        precision + recall) if precision != 0 or recall != 0 else 0
    logging.info("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
                 (name, precision, recall, fscore))
    return [precision, recall, fscore]
Example #2
def evaluate_model(config: Config, model: NNCRF_sl, batch_insts_ids, name: str,
                   insts: List[Instance]):
    ## evaluation
    metrics = np.asarray([0, 0, 0], dtype=int)
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:

        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                batch_size]
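        # decode returns the best sequence score and the predicted label ids for each sentence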
        batch_max_scores, batch_max_ids = model.decode(batch)
        metrics += evaluate_batch_insts(batch_insts=one_batch_insts,
                                        batch_pred_ids=batch_max_ids,
                                        batch_gold_ids=batch[-2],
                                        word_seq_lens=batch[1],
                                        idx2label=config.idx2labels)
        batch_id += 1
    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (
        precision + recall) if precision != 0 or recall != 0 else 0
    print("[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (name, precision, recall, fscore),
          flush=True)
    return [precision, recall, fscore]
Example #3
def evaluate_model(config: Config,
                   model: NNCRF,
                   batch_insts_ids,
                   name: str,
                   insts: List[Instance],
                   print_each_type_metric: bool = False):
    ## evaluation
    p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(), Counter()
    batch_id = 0
    batch_size = config.batch_size
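    # no gradients are needed at evaluation time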
    with torch.no_grad():
        for batch in batch_insts_ids:
            one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                    batch_size]
            batch_max_scores, batch_max_ids = model.decode(**batch)
            batch_p, batch_predict, batch_total = evaluate_batch_insts(
                one_batch_insts, batch_max_ids, batch["labels"],
                batch["word_seq_lens"], config.idx2labels)
            p_dict += batch_p
            total_predict_dict += batch_predict
            total_entity_dict += batch_total
            batch_id += 1
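    # optionally print precision/recall/F1 for every entity type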
    if print_each_type_metric:
        for key in total_entity_dict:
            precision_key, recall_key, fscore_key = get_metric(
                p_dict[key], total_entity_dict[key], total_predict_dict[key])
            print(
                f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}"
            )

    total_p = sum(list(p_dict.values()))
    total_predict = sum(list(total_predict_dict.values()))
    total_entity = sum(list(total_entity_dict.values()))
    precision, recall, fscore = get_metric(total_p, total_entity,
                                           total_predict)
    print(colored(
        f"[{name} set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, F1: {fscore:.2f}",
        'blue'),
          flush=True)

    return [precision, recall, fscore]
Example #4
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str,
                   insts: List[Instance]):
    ## evaluation
    p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(), Counter()
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                batch_size]
        batch_max_scores, batch_max_ids = model.decode(batch)
        batch_p, batch_predict, batch_total = evaluate_batch_insts(
            one_batch_insts, batch_max_ids, batch[-1], batch[1],
            config.idx2labels, config.use_crf_layer)
        p_dict += batch_p
        total_predict_dict += batch_predict
        total_entity_dict += batch_total
        batch_id += 1

    for key in total_entity_dict:
        precision_key, recall_key, fscore_key = get_metric(
            p_dict[key], total_entity_dict[key], total_predict_dict[key])
        print("[%s] Prec.: %.2f, Rec.: %.2f, F1: %.2f" %
              (key, precision_key, recall_key, fscore_key))
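        # keep the metrics for the configured new entity type; they may be used for model selection below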
        if key == config.new_type:
            precision_new_type, recall_new_type, fscore_new_type = get_metric(
                p_dict[key], total_entity_dict[key], total_predict_dict[key])

    total_p = sum(list(p_dict.values()))
    total_predict = sum(list(total_predict_dict.values()))
    total_entity = sum(list(total_entity_dict.values()))
    precision, recall, fscore = get_metric(total_p, total_entity,
                                           total_predict)
    print(colored(
        "[%s set Total] Prec.: %.2f, Rec.: %.2f, F1: %.2f" %
        (name, precision, recall, fscore), 'blue'),
          flush=True)
    if config.choose_by_new_type:
        return [precision_new_type, recall_new_type, fscore_new_type]
    else:
        return [precision, recall, fscore]
Example #5
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str,
                   insts: List[Instance]):
    ## evaluation
    metrics_exact = np.asarray([0, 0, 0], dtype=int)
    metrics_overlap = np.asarray([0, 0, 0], dtype=int)

    dict_exact = {}
    dict_overlap = {}
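    # per-label counts under the exact-match and overlap-match criteria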

    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                batch_size]
        batch_max_scores, batch_max_ids = model.decode(batch)
        results = evaluate_batch_insts(one_batch_insts, batch_max_ids,
                                       batch[-1], batch[1], config.idx2labels)
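        # results holds exact-match counts, overlap counts, and the corresponding per-label dicts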

        metrics_exact += results[0]
        metrics_overlap += results[1]

        for key in results[2]:
            if key not in dict_exact:
                dict_exact[key] = [0, 0, 0]
            dict_exact[key][0] += results[2][key][0]
            dict_exact[key][1] += results[2][key][1]
            dict_exact[key][2] += results[2][key][2]

        for key in results[3]:
            if key not in dict_overlap:
                dict_overlap[key] = [0, 0, 0]
            dict_overlap[key][0] += results[3][key][0]
            dict_overlap[key][1] += results[3][key][1]
            dict_overlap[key][2] += results[3][key][2]

        batch_id += 1

    p_exact, total_predict, total_entity = (metrics_exact[0], metrics_exact[1],
                                            metrics_exact[2])
    precision_exact = p_exact * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall_exact = p_exact * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore_exact = 2.0 * precision_exact * recall_exact / (
        precision_exact +
        recall_exact) if precision_exact != 0 or recall_exact != 0 else 0
    print("[%s set - Exact] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (name, precision_exact, recall_exact, fscore_exact),
          flush=True)
    #print_report(dict_exact)

    p_overlap, total_predict, total_entity = (metrics_overlap[0],
                                              metrics_overlap[1],
                                              metrics_overlap[2])
    precision_overlap = p_overlap * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall_overlap = p_overlap * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore_overlap = 2.0 * precision_overlap * recall_overlap / (
        precision_overlap +
        recall_overlap) if precision_overlap != 0 or recall_overlap != 0 else 0
    print("[%s set - Overlap] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (name, precision_overlap, recall_overlap, fscore_overlap),
          flush=True)
    #print_report(dict_overlap)

    return [precision_exact, recall_exact,
            fscore_exact], [precision_overlap, recall_overlap,
                            fscore_overlap], dict_exact, dict_overlap
Example #6
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str,
                   insts: List[Instance]):
    ## evaluation
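    # tp/fp/tn/fn accumulate pair-level (Task 2) classification counts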
    tp, fp, tn, fn = 0, 0, 0, 0
    # metrics, metrics_e2e = np.asarray([0, 0, 0], dtype=int), np.asarray([0, 0, 0], dtype=int)
    metrics = np.asarray([0, 0, 0], dtype=int)
    metrics_e2e = np.zeros((1, 3), dtype=int)
    pair_metrics = np.asarray([0, 0, 0], dtype=int)
    batch_idx = 0
    batch_size = config.batch_size
    # print('insts',len(insts))
    for batch in batch_insts_ids:
        # print('batch_idx * batch_size:(batch_idx + 1) * batch_size', batch_idx* batch_size,(batch_idx + 1) * batch_size )
        one_batch_insts = insts[batch_idx * batch_size:(batch_idx + 1) *
                                batch_size]

        processed_batched_data = simple_batching(config, batch)
        # print(len(one_batch_insts))
        batch_max_scores, batch_max_ids, pair_ids = model.decode(
            processed_batched_data)

        metrics += evaluate_batch_insts(one_batch_insts, batch_max_ids,
                                        processed_batched_data[-6],
                                        processed_batched_data[2],
                                        config.idx2labels)
        # print(processed_batched_data[-1])
        metrics_e2e += evaluate_batch_insts_e2e(
            one_batch_insts, batch_max_ids, processed_batched_data[-6],
            processed_batched_data[2], config.idx2labels,
            processed_batched_data[-8], pair_ids, processed_batched_data[-1])

        word_seq_lens = processed_batched_data[2].tolist()
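        # compare gold and predicted pair matrices sentence by sentence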
        for batch_id in range(batch_max_ids.size()[0]):
            # print('batch_max_ids[batch_id]:  ',batch_max_ids[batch_id].size(),batch_max_ids[batch_id])
            length = word_seq_lens[batch_id]
            # prediction = batch_max_ids[batch_id][:length]
            # prediction = torch.flip(prediction,dims = [0])

            gold = processed_batched_data[-6][batch_id][:length]
            # gold = torch.flip(gold, dims=[0])

            # s_id = (prediction == 2).nonzero()
            # b_id = (prediction == 3).nonzero()
            # e_id = (prediction == 4).nonzero()
            # i_id = (prediction == 5).nonzero()
            # pred_id = torch.cat([s_id, b_id, e_id, i_id]).squeeze(1)
            # pred_id,_ = pred_id.sort(0, descending=False)
            # pred_id = pred_id[pred_id < processed_batched_data[-1][batch_id]]

            s_id = (gold == 2).nonzero()
            b_id = (gold == 3).nonzero()
            e_id = (gold == 4).nonzero()
            i_id = (gold == 5).nonzero()
            gold_id = torch.cat([s_id, b_id, e_id, i_id]).squeeze(1)
            gold_id, _ = gold_id.sort(0, descending=False)
            gold_id = gold_id[gold_id < processed_batched_data[-1][batch_id]]

            # argu_id = torch.LongTensor(list(set(gold_id.tolist()).intersection(set(pred_id.tolist()))))
            argu_id = torch.LongTensor(list(set(gold_id.tolist())))
            # print('gold_id', gold_id, 'pred_id', pred_id, 'argu_id', argu_id)

            # print(pair_ids[batch_id].size(), batch[-3][batch_id].size())
            one_batch_insts[batch_id].gold2 = processed_batched_data[-3][
                batch_id].tolist()
            one_batch_insts[batch_id].pred2 = pair_ids[batch_id].squeeze(
                2).tolist()

            # print(one_batch_insts[batch_id].gold2)
            # print(torch.sum(one_batch_insts[batch_id].pred2, dim=1))

            # pred2 = one_batch_insts[batch_id].pred2[argu_id]
            pred2 = pair_ids[batch_id].squeeze(2)
            # gold2 = one_batch_insts[batch_id].gold2[argu_id]
            gold2 = processed_batched_data[-3][batch_id]

            # print('argu_id:  ',argu_id.size(),argu_id)
            # print('one_batch_insts[batch_id].pred2:  ',one_batch_insts[batch_id].pred2.size(),one_batch_insts[batch_id].pred2)

            gold_pairs = gold2.flatten()
            pred_pairs = pred2.flatten()

            # print(gold_pairs,pred_pairs)
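            # gold + pred == 2 is a true positive, == 0 a true negative; negative entries (presumably padding) are dropped by the >= 0 filter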
            sum_table = gold_pairs + pred_pairs
            # print(sum_table.size(),sum_table[:100])
            sum_table_sliced = sum_table[sum_table >= 0]
            # print(sum_table_sliced.size(),sum_table_sliced)
            tp_tmp = len(sum_table_sliced[sum_table_sliced == 2])
            tn_tmp = len(sum_table_sliced[sum_table_sliced == 0])
            tp += tp_tmp
            tn += tn_tmp
            ones = len(gold_pairs[gold_pairs == 1])
            zeros = len(gold_pairs[gold_pairs == 0])
            fp += (zeros - tn_tmp)
            fn += (ones - tp_tmp)
            # print(tp,tp_tmp,tn,tn_tmp,ones,zeros,fp,fn)

        batch_idx += 1
    print('tp, fp, fn, tn: ', tp, fp, fn, tn)
    precision_2 = 1.0 * tp / (tp + fp) * 100 if tp + fp != 0 else 0
    recall_2 = 1.0 * tp / (tp + fn) * 100 if tp + fn != 0 else 0
    f1_2 = 2.0 * precision_2 * recall_2 / (
        precision_2 + recall_2) if precision_2 + recall_2 != 0 else 0
    acc = 1.0 * (tp + tn) / (fp + fn + tp +
                             tn) * 100 if fp + fn + tp + tn != 0 else 0
    p, total_predict, total_entity = metrics[0], metrics[1], metrics[2]
    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (
        precision + recall) if precision != 0 or recall != 0 else 0

    p_e2e, total_predict_e2e, total_entity_e2e = (metrics_e2e[:, 0],
                                                  metrics_e2e[:, 1],
                                                  metrics_e2e[:, 2])
    # precision_e2e = p_e2e * 1.0 / total_predict_e2e * 100 if total_predict_e2e != 0 else 0
    # recall_e2e = p_e2e * 1.0 / total_entity_e2e * 100 if total_entity_e2e != 0 else 0
    # fscore_e2e = 2.0 * precision_e2e * recall_e2e / (precision_e2e + recall_e2e) if precision_e2e != 0 or recall_e2e != 0 else 0
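    # avoid division by zero: a zero denominator becomes sys.maxsize so the resulting metric is ~0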
    total_predict_e2e[total_predict_e2e == 0] = sys.maxsize
    total_entity_e2e[total_entity_e2e == 0] = sys.maxsize

    precision_e2e = p_e2e * 1.0 / total_predict_e2e * 100
    recall_e2e = p_e2e * 1.0 / total_entity_e2e * 100

    sum_e2e = precision_e2e + recall_e2e
    sum_e2e[sum_e2e == 0] = sys.maxsize
    fscore_e2e = 2.0 * precision_e2e * recall_e2e / sum_e2e

    print("Task1: ", p, total_predict, total_entity)
    # print("Overall: ", p_e2e, total_predict_e2e, total_entity_e2e)

    print("Task1: [%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (name, precision, recall, fscore),
          flush=True)
    print(
        "Task2: [%s set] Precision: %.2f, Recall: %.2f, F1: %.2f, acc: %.2f" %
        (name, precision_2, recall_2, f1_2, acc),
        flush=True)
    percs = [0.9]
    #percs = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]
    for i in range(len(percs)):
        print("Overall ",
              percs[i],
              ": [%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
              (name, precision_e2e[i], recall_e2e[i], fscore_e2e[i]),
              flush=True)
    return [
        precision, recall, fscore, precision_2, recall_2, f1_2, acc,
        precision_e2e, recall_e2e, fscore_e2e
    ]
Example #7
def evaluate_model(config: Config, model: NNCRF, batch_insts_ids, name: str,
                   insts: List[Instance]):
    ## evaluation
    i = 0
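    # 26 counters: overall and special-kind entity counts plus error-type and span-length breakdowns (see unpacking below)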
    metrics = np.zeros(26, dtype=int)
    batch_id = 0
    batch_size = config.batch_size
    for batch in batch_insts_ids:
        i += 1
        flag = 0
        one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                batch_size]
        batch_max_scores, batch_max_ids = model.decode(batch)
        if i == len(batch_insts_ids) - 1:
            flag = 1
        metrics += evaluate_batch_insts(one_batch_insts, batch_max_ids,
                                        batch[-1], batch[1], config.idx2labels,
                                        config.use_crf_layer, config.test_kind,
                                        flag)
        batch_id += 1
    (p, p_special, total_predict, total_entity, special_predict,
     special_entity) = metrics[:6]

    wrong_prediction = {}
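    # tally specific error categories reported by evaluate_batch_insts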
    wrong_prediction["BLater"] = metrics[6]
    wrong_prediction["BEarlier"] = metrics[7]
    wrong_prediction["ILater"] = metrics[8]
    wrong_prediction["IEarlier"] = metrics[9]
    wrong_prediction["O2misc"] = metrics[10]
    wrong_prediction["misc2O"] = metrics[11]
    wrong_prediction[1] = metrics[12]
    wrong_prediction[2] = metrics[13]
    wrong_prediction[3] = metrics[14]
    wrong_prediction[4] = metrics[15]
    wrong_prediction[5] = metrics[16]
    wrong_prediction[6] = metrics[17]
    wrong_prediction[7] = metrics[18]
    wrong_prediction["length1"] = metrics[19]
    wrong_prediction["length2"] = metrics[20]
    wrong_prediction["length3"] = metrics[21]
    wrong_prediction["length4"] = metrics[22]
    wrong_prediction["length5"] = metrics[23]
    wrong_prediction["length6"] = metrics[24]
    wrong_prediction["length7"] = metrics[25]

    precision = p * 1.0 / total_predict * 100 if total_predict != 0 else 0
    recall = p * 1.0 / total_entity * 100 if total_entity != 0 else 0
    fscore = 2.0 * precision * recall / (
        precision + recall) if precision != 0 or recall != 0 else 0

    precision_special = p_special * 1.0 / special_predict * 100 if special_predict != 0 else 0
    recall_special = p_special * 1.0 / special_entity * 100 if special_entity != 0 else 0
    fscore_special = 2.0 * precision_special * recall_special / (precision_special + recall_special) \
                    if precision_special != 0 or recall_special != 0 else 0
    print("---[%s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (name, precision, recall, fscore),
          flush=True)
    print("---[%s of %s set] Precision: %.2f, Recall: %.2f, F1: %.2f" %
          (config.test_kind, name, precision_special, recall_special,
           fscore_special),
          flush=True)

    print(p_special, special_entity, special_predict)
    for inn in wrong_prediction.keys():
        if str(inn).startswith("length"):
            print(wrong_prediction[inn], end=" ")
    print()
    print(wrong_prediction)

    if name == "test":
        print()
    return [precision, recall, fscore]