Example #1
import os
import json

from tqdm import tqdm
from transformers import BertTokenizerFast

# `Preprocessor` comes from this project's utils module; adjust the import
# path to match your repo layout. `config` is assumed to be loaded earlier.
from common.utils import Preprocessor
def handle_normal_dataset(dataset, ignore_subword_match=False):
    """
    If ignore_subword_match is True, only match entities surrounded by whitespace,
    e.g. "entity" -> " entity ", so that matches inside longer words are skipped.
    """
    # load the preprocessor
    if config["encoder"] == "BERT":
        tokenizer = BertTokenizerFast.from_pretrained(config["bert_path"],
                                                      add_special_tokens=False,
                                                      do_lower_case=False)
        tokenize = tokenizer.tokenize
        get_tok2char_span_map = lambda text: tokenizer.encode_plus(
            text, return_offsets_mapping=True, add_special_tokens=False)[
                "offset_mapping"]
    elif config["encoder"] == "BiLSTM":
        tokenize = lambda text: text.split(" ")

        def get_tok2char_span_map(text):
            tokens = tokenize(text)
            tok2char_span = []
            char_num = 0
            for tok in tokens:
                tok2char_span.append((char_num, char_num + len(tok)))
                char_num += len(tok) + 1  # +1: whitespace
            return tok2char_span
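        # illustrative (hypothetical input): get_tok2char_span_map("New York")
        # -> [(0, 3), (4, 8)], the char offsets of each whitespace-separated token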

    preprocessor = Preprocessor(
        tokenize_func=tokenize,
        get_tok2char_span_map_func=get_tok2char_span_map)
    # add char span
    dataset, miss_sample_list = preprocessor.add_char_span(
        dataset, ignore_subword_match=ignore_subword_match)
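    # illustrative: a subject string "Alice" in the text "Alice works at Acme"
    # would get a char span like [0, 5] (exclusive end; key names assume this
    # project's sample schema)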

    if len(miss_sample_list) > 0:
        print("=========存在不匹配实体,请检查===========")
        print(miss_sample_list)
        print("========================================")

    # add token span
    dataset = preprocessor.add_tok_span(dataset)
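    # illustrative: with whitespace tokenization, a char span [0, 5] covering
    # "Alice" maps to a token span [0, 1] (one token, exclusive end index)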

    return dataset
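
A minimal usage sketch for handle_normal_dataset. The toy sample below is
hypothetical and only illustrates the expected input shape (a list of dicts
with a raw "text" field plus the entity strings the Preprocessor matches
against); it assumes config has been loaded as in the surrounding code.

toy_dataset = [
    {
        "text": "Alice works at Acme",
        "relation_list": [
            {"subject": "Alice", "object": "Acme", "predicate": "works_at"},
        ],
    },
]
processed = handle_normal_dataset(toy_dataset, ignore_subword_match=True)
print(processed[0])  # the sample should now carry char- and token-level spans
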
error_statistics = {}
for file_name, data in file_name2data.items():
    print("file name: ", file_name)
    assert len(data) > 0
    if "relation_list" in data[0]:  # train or valid data
        # remove redundant whitespace and, if config["separate_char_by_white"]
        # is set, separate every character by whitespace
        data = preprocessor.clean_data_wo_span(
            data, separate=config["separate_char_by_white"])
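        # illustrative: "New  York " -> "New York" after the whitespace cleanup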
        error_statistics[file_name] = {}
        # add char span
        if config["add_char_span"]:
            # locate each entity's start/end character positions in the text
            data, miss_sample_list = preprocessor.add_char_span(
                data, config["ignore_subword"])

            error_statistics[file_name]["miss_samples"] = len(miss_sample_list)
            data_path = os.path.join(data_out_dir,
                                     "{}_miss.json".format(file_name))
            json.dump(miss_sample_list,
                      open(data_path, "w", encoding="utf-8"),
                      ensure_ascii=False,
                      indent=2)

        # optional: validate char spans and drop bad samples
        # data, bad_samples_w_char_span_error = preprocessor.clean_data_w_span(data)
        # error_statistics[file_name]["char_span_error"] = len(bad_samples_w_char_span_error)

        # collect relation types and entity types
        for sample in tqdm(