Example #1
0
def extract_setence(content, two_class=True, paragraph=False):
    """Label the sentences of ``content`` and return them as tagged lines.

    The label definitions are read from a CSV file under ``Dir.resourceDir``
    and turned into a regex via ``dataLoader``; ``dataLoader.labeled_text``
    then maps each sentence of ``content`` to the label(s) it matched.

    Args:
        content: Text handed to ``dataLoader.labeled_text`` unchanged.
        two_class: If true, tag each sentence ``"1"`` (has a label) or
            ``"0"`` (no label). Otherwise tag it with the label value
            itself, or ``"null"`` when it has none.
        paragraph: If true, use the paragraph-level label file and the
            ``remove`` filter; otherwise the sheet1 label file and the
            ``nothing`` filter.

    Returns:
        A list of strings of the form ``"<sentence>\\t<tag>\\n"``, one per
        non-blank sentence, in the iteration order of the labelled mapping.
    """
    if paragraph:
        label_file = Dir.resourceDir + "标签-paragraph.csv"
        text_filter = remove
    else:
        label_file = Dir.resourceDir + "标签-sheet1.csv"
        text_filter = nothing

    separator = "\t"
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))
    labeled_content = dataLoader.labeled_text(content,
                                              label_regex=label_regex,
                                              filter=text_filter)

    result = []
    for sen in labeled_content.keys():
        sentence = sen.strip()
        if sentence == "":
            # Keys that are empty after stripping carry no sentence text.
            continue
        labels = labeled_content[sen]
        has_label = len(labels) > 0
        if two_class:
            tag = "1" if has_label else "0"
        else:
            # NOTE(review): the label value is concatenated directly, so it
            # is presumably a str here -- confirm against
            # dataLoader.labeled_text.
            tag = labels if has_label else "null"
        result.append(sentence + separator + tag + "\n")
    return result


# import Dir
# # 典型案例111篇
# # 基础案例299篇-已标注
# dir_classic = Dir.resourceDir+"已标注文书-txt/paragraph_labeled/"
# content = transfer(dir_classic,two_class=False)
# savepath =  Dir.projectDir+"/src1_result/new_extract_data/data_labeled_two"
# print(content.__len__())
# save(content,savepath)

# save_dir =  Dir.projectDir+"/src1_result/label_data/all"
# extract_label_data(dir_classic,save_dir)

# check(savepath)
# check_transfer(content)
Example #2
0
def transfer(dir,
             two_class=True,
             label_file=Dir.resourceDir + "标签-paragraph.csv",
             filter=nothing):
    """Label every document found under ``dir`` and return tagged lines.

    The documents come from element ``[2]`` of
    ``dataLoader.get_all_data(dir)``, which is iterated as a mapping of
    name -> content. Each content is passed through
    ``dataLoader.labeled_text`` with a regex built from ``label_file``, and
    the resulting sentences of all documents are collected into one list.

    Args:
        dir: Location handed to ``dataLoader.get_all_data``.
        two_class: If true, tag each sentence ``"1"`` (has a label) or
            ``"0"`` (no label). Otherwise tag it with the label value
            itself, or ``"null"`` when it has none.
        label_file: Path of the label definition file read by
            ``dataLoader.read_label``.
        filter: Filter callable forwarded to ``dataLoader.labeled_text``.

    Returns:
        A list of strings of the form ``"<sentence>\\t<tag>\\n"``, one per
        non-blank sentence across all documents.
    """
    data = dataLoader.get_all_data(dir)[2]
    separator = "\t"
    label_regex = dataLoader.get_label_regex(dataLoader.read_label(label_file))

    result = []
    for _name, content in data.items():
        labeled_content = dataLoader.labeled_text(content,
                                                  label_regex=label_regex,
                                                  filter=filter)
        for sen in labeled_content.keys():
            sentence = sen.strip()
            if sentence == "":
                # Keys that are empty after stripping carry no sentence text.
                continue
            labels = labeled_content[sen]
            has_label = len(labels) > 0
            if two_class:
                tag = "1" if has_label else "0"
            else:
                # NOTE(review): the label value is concatenated directly, so
                # it is presumably a str here -- confirm against
                # dataLoader.labeled_text.
                tag = labels if has_label else "null"
            result.append(sentence + separator + tag + "\n")
    return result