def extract_setence(content, two_class=True, paragraph=False):
    """Label the sentences of ``content`` and format them as TSV lines.

    The label definitions are read from a CSV file under
    ``Dir.resourceDir`` and turned into regexes via ``dataLoader``;
    ``dataLoader.labeled_text`` then returns a mapping that is iterated
    here as ``sentence -> labels``.

    Args:
        content: Text handed to ``dataLoader.labeled_text`` unchanged.
        two_class: When true, emit ``"1"`` for sentences that carry at
            least one label and ``"0"`` otherwise. When false, emit the
            label value itself, or ``"null"`` when it is empty.
        paragraph: When true, use the paragraph-level label file and the
            ``remove`` filter; otherwise use the sheet1 label file and the
            ``nothing`` filter.

    Returns:
        A list of strings of the form ``"<sentence>\\t<label>\\n"``.
        Sentences that are empty after stripping are skipped.
    """
    # ``text_filter`` avoids shadowing the ``filter`` builtin locally; the
    # keyword passed to ``labeled_text`` keeps its original name.
    if paragraph:
        label_file = Dir.resourceDir + "标签-paragraph.csv"
        text_filter = remove
    else:
        label_file = Dir.resourceDir + "标签-sheet1.csv"
        text_filter = nothing

    separator = "\t"
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))
    labeled_content = dataLoader.labeled_text(
        content, label_regex=label_regex, filter=text_filter
    )

    result = []
    for raw_sentence, labels in labeled_content.items():
        sentence = raw_sentence.strip()
        if sentence == "":
            continue
        has_label = len(labels) > 0
        if two_class:
            label = "1" if has_label else "0"
        else:
            # In multi-class mode the label value is concatenated as-is,
            # so it is expected to already be a string.
            label = labels if has_label else "null"
        result.append(sentence + separator + label + "\n")
    return result


# Example driver script (disabled):
# import Dir
# # Classic cases: 111 documents
# # Basic cases: 299 documents - already labeled
# dir_classic = Dir.resourceDir+"已标注文书-txt/paragraph_labeled/"
# content = transfer(dir_classic,two_class=False)
# savepath = Dir.projectDir+"/src1_result/new_extract_data/data_labeled_two"
# print(content.__len__())
# save(content,savepath)
# save_dir = Dir.projectDir+"/src1_result/label_data/all"
# extract_label_data(dir_classic,save_dir)
# check(savepath)
# check_transfer(content)
def transfer(dir, two_class=True, label_file=Dir.resourceDir + "标签-paragraph.csv", filter=nothing):
    """Label every document found under ``dir`` and format the sentences as TSV lines.

    Documents are taken from element ``[2]`` of
    ``dataLoader.get_all_data(dir)``, which is iterated here as a
    ``name -> content`` mapping. Each document is labeled with
    ``dataLoader.labeled_text`` using regexes built from ``label_file``.

    Args:
        dir: Directory handed to ``dataLoader.get_all_data``.
        two_class: When true, emit ``"1"`` for sentences that carry at
            least one label and ``"0"`` otherwise. When false, emit the
            label value itself, or ``"null"`` when it is empty.
        label_file: Path of the CSV file with the label definitions.
        filter: Filter forwarded to ``dataLoader.labeled_text``.

    Returns:
        A single list, covering all documents in iteration order, of
        strings of the form ``"<sentence>\\t<label>\\n"``. Sentences that
        are empty after stripping are skipped.
    """
    data = dataLoader.get_all_data(dir)[2]
    separator = "\t"
    # The regexes depend only on the label file, so build them once.
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))

    result = []
    for name, content in data.items():
        labeled_content = dataLoader.labeled_text(
            content, label_regex=label_regex, filter=filter
        )

        doc_lines = []
        for raw_sentence, labels in labeled_content.items():
            sentence = raw_sentence.strip()
            if sentence == "":
                continue
            has_label = len(labels) > 0
            if two_class:
                label = "1" if has_label else "0"
            else:
                # In multi-class mode the label value is concatenated
                # as-is, so it is expected to already be a string.
                label = labels if has_label else "null"
            doc_lines.append(sentence + separator + label + "\n")

        # Debug hook (disabled): validate this document's lines and print
        # the document name when the check reports problems.
        # check_res = check_transfer_details(doc_lines)
        # if len(check_res) > 0:
        #     print(name)
        #     print(check_res)
        result.extend(doc_lines)
    return result