def extract_sentence(content):
    """Extract labeled spans from ``content`` and return them keyed by cleaned text.

    A single non-greedy pattern is built from the module-level ``label`` pair
    (``label[0]`` followed by anything, including newlines, up to ``label[1]``)
    and handed to ``dataloader.labeled_text``. Each key of the mapping that
    comes back has its ``<...>`` spans removed and is stripped of surrounding
    whitespace; keys that end up empty are dropped.

    Args:
        content: Text passed unchanged to ``dataloader.labeled_text``.

    Returns:
        dict: cleaned key -> the value ``labeled_text`` stored for the original
        key. If two original keys clean to the same text, the later one wins.
    """
    # NOTE(review): `label` and `dataloader` are not defined in this function;
    # the sibling functions use `dataLoader` (capital L) -- confirm that both
    # names really exist at module level.
    # Raw strings: "\s" in a normal literal is an invalid escape sequence.
    regex = label[0] + r"[\s\S]*?" + label[1]
    label_regex = [[label[0], regex]]
    result = dataloader.labeled_text(content, label_regex)

    new_result = {}
    for sen, value in result.items():
        new_sen = re.sub(r"<.*?>", "", sen).strip()
        if not new_sen:
            continue
        new_result[new_sen] = value
    return new_result
def extract_setence(content, two_class=True, paragraph=False):
    """Label the sentences of ``content`` and render them as tab-separated lines.

    The label definitions are read from a CSV under ``Dir.resourceDir``
    (the paragraph-level file with the ``remove`` filter when ``paragraph`` is
    true, otherwise the sheet1 file with the ``nothing`` filter), converted to
    regexes and applied through ``dataLoader.labeled_text``.

    Args:
        content: Text passed to ``dataLoader.labeled_text``.
        two_class: When true, the second column is ``"1"`` for a sentence with
            a non-empty label and ``"0"`` otherwise. When false, the second
            column is the label itself, or ``"null"`` when the label is empty.
        paragraph: Selects the paragraph-level label file and filter.

    Returns:
        list[str]: One ``"<sentence>\\t<tag>\\n"`` entry per sentence that is
        non-empty after stripping whitespace.
    """
    if paragraph:
        label_file = Dir.resourceDir + "标签-paragraph.csv"
        text_filter = remove
    else:
        label_file = Dir.resourceDir + "标签-sheet1.csv"
        text_filter = nothing

    separator = "\t"
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))
    labeled_content = dataLoader.labeled_text(
        content, label_regex=label_regex, filter=text_filter
    )

    result = []
    for raw_sentence, label in labeled_content.items():
        sentence = raw_sentence.strip()
        if sentence == "":
            continue
        if two_class:
            tag = "1" if label else "0"
        else:
            # NOTE(review): a non-empty label is concatenated as-is, so it is
            # assumed to be a str here -- confirm against labeled_text.
            tag = label if label else "null"
        result.append(sentence + separator + tag + "\n")
    return result


# import Dir
#
# Typical cases: 111 documents
# # Basic cases: 299 documents - already labeled
# dir_classic = Dir.resourceDir+"已标注文书-txt/paragraph_labeled/"
# content = transfer(dir_classic,two_class=False)
# savepath = Dir.projectDir+"/src1_result/new_extract_data/data_labeled_two"
# print(content.__len__())
# save(content,savepath)
# save_dir = Dir.projectDir+"/src1_result/label_data/all"
# extract_label_data(dir_classic,save_dir)
# check(savepath)
# check_transfer(content)
def extract_label_data(dir, save_dir):
    """Group labeled sentences by label and write one text file per label.

    Every document returned by ``dataLoader.get_all_data(dir)[2]`` is run
    through ``dataLoader.labeled_text``. Sentences whose label is a string are
    collected under that label; sentences with any other kind of label are
    collected under ``"null"``. Each group is then written to
    ``<save_dir>/<label>.txt`` (UTF-8, one sentence per line, overwriting any
    existing file).

    Args:
        dir: Passed to ``dataLoader.get_all_data``; element ``[2]`` of its
            result is iterated as ``name -> content`` pairs.
        save_dir: Directory receiving the per-label files. It is created if it
            does not exist yet.

    Returns:
        None.
    """
    import os  # local import so the block does not rely on a file-level one

    data = dataLoader.get_all_data(dir)[2]

    result = {}
    for _doc_name, content in data.items():
        labeled_content = dataLoader.labeled_text(content)
        for sentence, label in labeled_content.items():
            # Anything that is not a string label goes into the "null" bucket.
            key = label if isinstance(label, str) else "null"
            result.setdefault(key, []).append(sentence + "\n")

    if not result:
        return

    # open(..., "w") does not create missing directories, so do it up front.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for label, sentences in result.items():
        # "/" would be read as a path separator, so drop it from the file name.
        file_name = label.replace("/", "")
        savepath = save_dir + "/" + file_name + ".txt"
        with open(savepath, mode="w", encoding="utf-8") as file:
            file.writelines(sentences)
def transfer(dir, two_class=True, label_file=Dir.resourceDir + "标签-paragraph.csv", filter=nothing):
    """Label every document under ``dir`` and render the sentences as TSV lines.

    The documents come from ``dataLoader.get_all_data(dir)[2]``. The label
    definitions in ``label_file`` are converted to regexes once and applied to
    each document through ``dataLoader.labeled_text``.

    Args:
        dir: Passed to ``dataLoader.get_all_data``; element ``[2]`` of its
            result is iterated as ``name -> content`` pairs.
        two_class: When true, the second column is ``"1"`` for a sentence with
            a non-empty label and ``"0"`` otherwise. When false, the second
            column is the label itself, or ``"null"`` when the label is empty.
        label_file: Path of the label definition file handed to
            ``dataLoader.read_label``.
        filter: Forwarded unchanged to ``dataLoader.labeled_text``.

    Returns:
        list[str]: ``"<sentence>\\t<tag>\\n"`` entries for all documents, in
        iteration order, skipping sentences that are empty after stripping.
    """
    data = dataLoader.get_all_data(dir)[2]
    separator = "\t"
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))

    result = []
    for _doc_name, content in data.items():
        labeled_content = dataLoader.labeled_text(content, label_regex=label_regex, filter=filter)
        # NOTE: a per-document sanity check (check_transfer_details on this
        # document's two-class lines, printing the document name on failure)
        # used to be wired in here and is currently disabled.
        for raw_sentence, label in labeled_content.items():
            sentence = raw_sentence.strip()
            if sentence == "":
                continue
            if two_class:
                tag = "1" if label else "0"
            else:
                # NOTE(review): a non-empty label is concatenated as-is, so it
                # is assumed to be a str here -- confirm against labeled_text.
                tag = label if label else "null"
            result.append(sentence + separator + tag + "\n")
    return result