def first_filter():
    """Count the labeled documents for which a paragraph can be located.

    Loads the documents found under the ``paragraph_labeled`` resource
    directory, runs ``locate_paragraph`` on the content of each one and
    counts the documents whose first result element is not ``"null"``.

    Returns:
        int: the number of documents whose ``locate_paragraph`` result does
        not start with the string ``"null"``.
    """
    file_dir = Dir.resourceDir + "/已标注文书-txt/paragraph_labeled/"
    # Element 2 of get_all_data's result is the name -> content mapping
    # iterated below (presumably the raw document texts -- TODO confirm).
    data = dataloader.get_all_data(file_dir)[2]

    count = 0
    for _name, content in data.items():
        result = locate_paragraph(content)
        # "null" in the first slot marks a document where nothing was located.
        if result[0] != "null":
            count += 1
    return count
def demo():
    """Run ``locate`` on every labeled document and print aggregate scores.

    For each document in the ``paragraph_labeled`` resource directory the
    three values returned by ``locate`` are collected and printed.  The
    first values are then summed into a numerator, the second and third
    values into the denominators of precision and recall respectively
    (presumably hit / predicted / expected counts -- TODO confirm against
    ``locate``).  Finally precision, recall and their harmonic mean are
    printed on one line.

    A zero denominator (for example when no documents are loaded) yields a
    score of ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    dir_classic = Dir.resourceDir + "已标注文书-txt/paragraph_labeled/"
    _classic, _all_labeled, origindata = dataloader.get_all_data(dir_classic)

    result = []
    for _name, content in origindata.items():
        both, precision, recall = locate(content)
        result.append([both, precision, recall])
    for res in result:
        print(res)

    both_total = sum(row[0] for row in result)
    precision_total = sum(row[1] for row in result)
    recall_total = sum(row[2] for row in result)

    # Guard every division: empty input or all-zero columns must not crash.
    precision = both_total / precision_total if precision_total else 0.0
    recall_ = both_total / recall_total if recall_total else 0.0
    score_sum = precision + recall_
    f_score = 2 * precision * recall_ / score_sum if score_sum else 0.0
    print(precision, recall_, f_score)
def check_if_contain(filepath):
    """Print how many documents contain each kind of ellipsis run.

    Four substrings are checked in every document content: three and six
    ASCII dots, and three and six ideographic full stops.  A header line
    with the four patterns is printed, followed by the list of per-pattern
    document counts in the same order as the header, followed by the
    content of every document that contains six ASCII dots.

    Args:
        filepath: location passed unchanged to ``dataLoader.get_all_data``.
    """
    # Element 2 of get_all_data's result is the name -> content mapping.
    data = dataLoader.get_all_data(filepath)[2]

    patterns = ("...", "......", "。。。", "。。。。。。")
    # counter[i] belongs to patterns[i], so the printed counts line up with
    # the printed header.
    counter = [0] * len(patterns)
    six_dot_contents = []

    for _name, content in data.items():
        for index, pattern in enumerate(patterns):
            if pattern in content:
                counter[index] += 1
        if "......" in content:
            six_dot_contents.append(content)

    print(*patterns)
    print(counter)
    for cont in six_dot_contents:
        print(cont)
def extract_label_data(dir, save_dir):
    """Group labeled sentences by label and write one text file per label.

    Every document content loaded from *dir* is passed to
    ``dataLoader.labeled_text``, which yields sentence -> label pairs.
    Sentences whose label is a string are collected under that label; all
    other sentences are collected under the key ``"null"``.  Each group is
    written to ``<save_dir>/<label>.txt`` (UTF-8, one sentence per line),
    with any ``/`` removed from the label to form the file name.

    Args:
        dir: location passed unchanged to ``dataLoader.get_all_data``.
        save_dir: existing directory that receives the ``.txt`` files.
    """
    documents = dataLoader.get_all_data(dir)[2]

    sentences_by_label = {}
    for _doc_name, text in documents.items():
        for sentence, label in dataLoader.labeled_text(text).items():
            key = label if isinstance(label, str) else "null"
            sentences_by_label.setdefault(key, []).append(sentence + "\n")

    for label, lines in sentences_by_label.items():
        # A slash in a label would otherwise be read as a path separator.
        file_stem = label.replace("/", "")
        out_path = save_dir + "/" + file_stem + ".txt"
        with open(out_path, mode="w", encoding="utf-8") as handle:
            handle.writelines(lines)
def transfer(dir, two_class=True, label_file=Dir.resourceDir + "标签-paragraph.csv", filter=nothing):
    """Convert labeled documents into tab-separated ``sentence<TAB>tag`` lines.

    Every document content loaded from *dir* is split into sentence -> label
    pairs by ``dataLoader.labeled_text``.  Sentences that are empty after
    stripping whitespace are skipped; every other sentence produces one
    output line terminated by a newline.

    Args:
        dir: location passed unchanged to ``dataLoader.get_all_data``.
        two_class: when true the tag is ``"1"`` for a sentence with a
            non-empty label and ``"0"`` otherwise; when false the tag is the
            label itself, or ``"null"`` when the label is empty.
        label_file: label definition file read with ``dataLoader.read_label``
            and turned into a regex by ``dataLoader.get_label_regex``.
        filter: forwarded to ``dataLoader.labeled_text`` as ``filter``.

    Returns:
        list[str]: the generated lines, in document and sentence order.
    """
    documents = dataLoader.get_all_data(dir)[2]
    separator = "\t"
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))

    lines = []
    for _doc_name, text in documents.items():
        labeled = dataLoader.labeled_text(text, label_regex=label_regex, filter=filter)
        for raw_sentence, label in labeled.items():
            sentence = raw_sentence.strip()
            if sentence == "":
                continue
            has_label = len(label) > 0
            if two_class:
                tag = "1" if has_label else "0"
            else:
                tag = label if has_label else "null"
            lines.append(sentence + separator + tag + "\n")
    return lines
def single(name):
    """Run ``simple`` on the content of one labeled document.

    Args:
        name: key of the document in the third mapping returned by
            ``dataloader.get_all_data`` for the ``paragraph_labeled``
            resource directory.  A missing key propagates the lookup error.
    """
    labeled_dir = Dir.resourceDir + "已标注文书-txt/paragraph_labeled/"
    _classic, _all_labeled, documents = dataloader.get_all_data(labeled_dir)
    simple(documents[name])