def extract_setence(content, two_class=True, paragraph=False):
    """Convert labeled *content* into TSV lines of "sentence<TAB>label".

    With two_class=True each non-empty sentence is tagged "1" (it carries at
    least one label) or "0"; otherwise the sentence's own label value is used,
    with "null" for unlabeled sentences.  When paragraph=True the
    paragraph-level label file and the `remove` filter are used instead of the
    sentence-level defaults.

    Fixes over the original: the dead `tmp` accumulator (built but never read)
    is removed and the duplicated two_class/else branch bodies are merged.
    NOTE(review): `nothing` and `remove` are filter callables defined elsewhere
    in this project — confirm they are in scope at module level.
    """
    label_file = Dir.resourceDir + "标签-sheet1.csv"
    filter = nothing
    if paragraph:
        label_file = Dir.resourceDir + "标签-paragraph.csv"
        filter = remove
    seperate = "\t"
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))
    labeled_content = dataLoader.labeled_text(content, label_regex=label_regex, filter=filter)
    result = []
    for sen in labeled_content:
        sentence = sen.strip()
        if sentence == "":
            continue
        if two_class:
            # any label at all -> positive class
            tag = "1" if labeled_content[sen] else "0"
        else:
            tag = labeled_content[sen] if labeled_content[sen] else "null"
        result.append(sentence + seperate + tag + "\n")
    return result


# import Dir
# # 典型案例111篇
# # 基础案例299篇-已标注
# dir_classic = Dir.resourceDir+"已标注文书-txt/paragraph_labeled/"
# content = transfer(dir_classic,two_class=False)
# savepath = Dir.projectDir+"/src1_result/new_extract_data/data_labeled_two"
# print(content.__len__())
# save(content,savepath)
# save_dir = Dir.projectDir+"/src1_result/label_data/all"
# extract_label_data(dir_classic,save_dir)
# check(savepath)
# check_transfer(content)
def order_sentences(text, sentences):
    """Re-order (sentence, label) pairs by each sentence's position in *text*.

    *text* is pre-processed first; each sentence is located with str.index,
    so a sentence missing from the text raises ValueError.  Returns a list of
    [sentence, label] pairs sorted by first occurrence.  If the same sentence
    appears more than once in *sentences*, the last label wins (dict overwrite),
    matching the original behavior.
    """
    processed = dataLoader.pre_process(text)
    label_by_sentence = {}
    positions = []
    for sentence, label in sentences:
        label_by_sentence[sentence] = label
        positions.append([sentence, processed.index(sentence)])
    positions.sort(key=lambda item: item[1])
    return [[sentence, label_by_sentence[sentence]] for sentence, _ in positions]
def extract_label_data(dir, save_dir):
    """Group labeled sentences by label and write one <label>.txt per label.

    Reads every document under *dir*, buckets sentences by their string label
    (sentences whose label is not a string go under "null"), then writes each
    bucket to save_dir/<label>.txt with "/" characters stripped from the file
    name so it stays a valid path component.
    """
    documents = dataLoader.get_all_data(dir)[2]
    grouped = {}
    for doc_name, content in documents.items():
        labeled_content = dataLoader.labeled_text(content)
        for sentence, label in labeled_content.items():
            key = label if isinstance(label, str) else "null"
            if key not in grouped:
                grouped[key] = []
            grouped[key].append(sentence + "\n")
    for label, sentences in grouped.items():
        file_name = label.replace("/", "") if "/" in label else label
        savepath = save_dir + "/" + file_name + ".txt"
        with open(savepath, mode="w", encoding="utf-8") as file:
            file.writelines(sentences)
def extract_sentence(content):
    """Extract labeled sentences from *content*, stripping "<...>" markup.

    Builds a single label-regex pair from `label`, runs the project's
    labeled_text over the content, then returns a dict mapping each
    tag-stripped, whitespace-trimmed, non-empty sentence to its label value.
    If two raw sentences clean to the same text, the later one wins
    (dict overwrite), matching the original behavior.

    NOTE(review): `label` is a free variable — presumably a module-level
    [open_tag, close_tag] pair; confirm it is defined at module scope.
    """
    regex = label[0] + "[\s\S]*?" + label[1]
    label_regex = [[label[0], regex]]
    tagged = dataloader.labeled_text(content, label_regex)
    cleaned = {}
    for raw_sentence, value in tagged.items():
        stripped = re.sub("<.*?>", "", raw_sentence).strip()
        if len(stripped) == 0:
            continue
        cleaned[stripped] = value
    return cleaned
def transfer(dir, two_class=True, label_file=Dir.resourceDir + "标签-paragraph.csv", filter=nothing):
    """Turn every document under *dir* into TSV lines "sentence<TAB>label".

    With two_class=True each non-empty sentence becomes "1" (has any label)
    or "0"; otherwise the sentence's own label value is used, with "null" for
    unlabeled sentences.  Lines from all documents are concatenated into one
    flat list and returned.

    NOTE(review): `nothing` (the default filter) is a project-level callable;
    confirm it is in scope at import time since it is evaluated as a default.
    """
    data = dataLoader.get_all_data(dir)[2]
    seperate = "\t"
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))
    result = []
    for name, content in data.items():
        labeled_content = dataLoader.labeled_text(content, label_regex=label_regex, filter=filter)
        if two_class:
            tmp = []  # per-document lines; only consumed by the commented-out check below
            for sen in labeled_content:
                sentence = sen.strip()
                if sentence == "":
                    continue
                tag = "1" if labeled_content[sen] else "0"
                line = sentence + seperate + tag + "\n"
                result.append(line)
                tmp.append(line)
            # check_res = check_transfer_details(tmp)
            # if check_res.__len__()>0:
            #     print(name)
            #     print(check_res)
        else:
            for sen in labeled_content:
                sentence = sen.strip()
                if sentence == "":
                    continue
                if labeled_content[sen]:
                    result.append(sentence + seperate + labeled_content[sen] + "\n")
                else:
                    result.append(sentence + seperate + "null" + "\n")
    return result
def first_filter():
    """Count documents in the paragraph-labeled corpus where
    locate_paragraph() finds a hit (i.e. its first element is not "null").

    Returns the count.  Fixes over the original: `count` was computed and then
    silently discarded, and the `datas` list was accumulated but never read —
    the count is now returned (backward compatible: callers that ignored the
    previous implicit None return are unaffected) and the dead list is gone.
    """
    file_dir = Dir.resourceDir + "/已标注文书-txt/paragraph_labeled/"
    data = dataloader.get_all_data(file_dir)[2]
    count = 0
    for name, content in data.items():
        # locate_paragraph returns a sequence; result[0] == "null" means no match
        result = locate_paragraph(content)
        if result[0] == "null":
            continue
        count += 1
    return count
def demo():
    """Run locate() over every document in the classic corpus and print
    per-document counts plus micro-averaged precision, recall and F-score.

    Each locate() call yields (matched, precision_denominator,
    recall_denominator) counts; the totals are summed across documents before
    the ratios are taken.  Raises ZeroDivisionError if a denominator total is
    zero, matching the original behavior.
    """
    dir_classic = Dir.resourceDir + "已标注文书-txt/paragraph_labeled/"
    classic, all_labeled, origindata = dataloader.get_all_data(dir_classic)
    stats = []
    for name, content in origindata.items():
        both, precision, recall = locate(content)
        stats.append([both, precision, recall])
    for row in stats:
        print(row)
    both_total = sum(row[0] for row in stats)
    precision_total = sum(row[1] for row in stats)
    recall_total = sum(row[2] for row in stats)
    precision = both_total / precision_total
    recall_ = both_total / recall_total
    f_score = 2 * precision * recall_ / (precision + recall_)
    print(precision, recall_, f_score)
def check_if_contain(filepath):
    """Count how many documents under *filepath* contain each ellipsis
    variant: "...", "......", "。。。", "。。。。。。", then print the counts
    and the full text of every document containing "......".

    Counts overlap deliberately: a document with "......" also contains "..."
    and is counted in both slots, matching the original intent.

    Bug fix: the "。。。。。。" branch incremented `content[2]` — indexing the
    document string, which would raise TypeError on the augmented assignment —
    instead of `counter[2]`, which was otherwise never touched.
    """
    data = dataLoader.get_all_data(filepath)[2]
    counter = [0, 0, 0, 0]
    result = [[], [], [], []]
    for name, content in data.items():
        if "..." in content:
            counter[0] += 1
            result[0].append(content)
        if "......" in content:
            counter[-1] += 1
            result[-1].append(content)
        if "。。。" in content:
            counter[1] += 1
        if "。。。。。。" in content:
            counter[2] += 1  # was: content[2] += 1 (TypeError on str)
    print("...", "......", "。。。", "。。。。。。")
    print(counter)
    for cont in result[-1]:
        print(cont)
def single(name):
    """Load the document called *name* from the classic paragraph-labeled
    corpus and run simple() on its raw text."""
    corpus_dir = Dir.resourceDir + "已标注文书-txt/paragraph_labeled/"
    classic, all_labeled, origindata = dataloader.get_all_data(corpus_dir)
    simple(origindata[name])
def preprocess(content):
    """Pre-process *content* and normalize line breaks to single '#' markers.

    Runs the project's pre_process step, replaces every newline with "##",
    then collapses any run of '#' characters down to a single '#'.
    """
    processed = dataloader.pre_process(content)
    processed = processed.replace("\n", "##")
    return re.sub("#+", "#", processed)