def handle_normal_dataset(dataset, ignore_subword_match=False):
    """
    if ignore_subword_match is true, find entities with whitespace around,
    e.g. "entity" -> " entity "
    """
    # load the preprocessor
    if config["encoder"] == "BERT":
        tokenizer = BertTokenizerFast.from_pretrained(config["bert_path"],
                                                      add_special_tokens=False,
                                                      do_lower_case=False)
        tokenize = tokenizer.tokenize
        get_tok2char_span_map = lambda text: tokenizer.encode_plus(
            text, return_offsets_mapping=True,
            add_special_tokens=False)["offset_mapping"]
    elif config["encoder"] == "BiLSTM":
        tokenize = lambda text: text.split(" ")

        def get_tok2char_span_map(text):
            tokens = tokenize(text)
            tok2char_span = []
            char_num = 0
            for tok in tokens:
                tok2char_span.append((char_num, char_num + len(tok)))
                char_num += len(tok) + 1  # +1: whitespace
            return tok2char_span

    preprocessor = Preprocessor(
        tokenize_func=tokenize,
        get_tok2char_span_map_func=get_tok2char_span_map)

    # add char spans
    dataset, miss_sample_list = preprocessor.add_char_span(
        dataset, ignore_subword_match=ignore_subword_match)
    if len(miss_sample_list) > 0:
        print("========= unmatched entities found, please check ===========")
        print(miss_sample_list)
        print("========================================")

    # add token spans
    dataset = preprocessor.add_tok_span(dataset)
    return dataset
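# Illustrative sketch (not part of the original script): what the whitespace-based
# get_tok2char_span_map in the BiLSTM branch above computes, i.e. one (start, end)
# character span per token. The helper name and sample sentence are hypothetical.
def _demo_whitespace_tok2char_span(text):
    spans, char_num = [], 0
    for tok in text.split(" "):
        spans.append((char_num, char_num + len(tok)))
        char_num += len(tok) + 1  # +1 skips the separating whitespace
    return spans

# _demo_whitespace_tok2char_span("Barack Obama was born in Hawaii")
# -> [(0, 6), (7, 12), (13, 16), (17, 21), (22, 24), (25, 31)]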
error_statistics = {}
for file_name, data in file_name2data.items():
    print("file name: ", file_name)
    assert len(data) > 0
    if "relation_list" in data[0]:  # train or valid data
        # rm redundant whitespaces
        # separate by whitespaces
        data = preprocessor.clean_data_wo_span(
            data, separate=config["separate_char_by_white"])
        error_statistics[file_name] = {}
        # if file_name != "train_data":
        #     set_trace()

        # add char span
        if config["add_char_span"]:
            # start/end character positions of each entity in the text
            data, miss_sample_list, miss_sample = preprocessor.add_char_span(
                data, config["ignore_subword"])
            error_statistics[file_name]["miss_samples"] = len(miss_sample_list)
            data_path = os.path.join(data_out_dir, "{}_miss.json".format(file_name))
            with open(data_path, "w", encoding="utf-8") as f:
                json.dump(miss_sample, f, ensure_ascii=False, indent=2)

        # # clean
        # data, bad_samples_w_char_span_error = preprocessor.clean_data_w_span(data)
        # error_statistics[file_name]["char_span_error"] = len(bad_samples_w_char_span_error)

        # collect relation types and entity types
        for sample in tqdm(