import json
import math
import random

# FullTokenizer is assumed to come from the bert-tensorflow package layout;
# the per-record helpers used below (cut_off_sentence, get_entity_head,
# get_sub_token, get_cls_sep, get_speaker, get_sentence_map,
# finally_get_cluster, count_sentence_entity) are defined elsewhere in this
# project, and read_jsonline/write_jsonline are sketched after the first
# function.
from bert.tokenization import FullTokenizer


def cut_off_jsonlines(path, dest_file):
    file_list = read_jsonline(path)
    new_file_list = []
    for file in file_list:
        new_file = cut_off_sentence(file)
        new_file_list.append(new_file)
    # print(len(new_file_list))
    write_jsonline(dest_file, new_file_list)
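
# The jsonlines I/O helpers are used throughout this file but were not part
# of the original listing; a minimal sketch, assuming one JSON object per
# line:
def read_jsonline(path):
    # Parse each non-empty line as one JSON record.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def write_jsonline(path, dic_list):
    # Serialize one record per line.
    with open(path, "w", encoding="utf-8") as f:
        for dic in dic_list:
            f.write(json.dumps(dic, ensure_ascii=False) + "\n")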

def del_empty_cluster(path, dest_path):
    # Drop every record whose coreference cluster list is empty.
    file_list = read_jsonline(path)
    for file in file_list:
        if len(file["clusters"]) == 0:
            print(file["clusters"])  # debug: one line per dropped record
    new_file_list = [file for file in file_list if len(file["clusters"]) != 0]
    write_jsonline(dest_path, new_file_list)
    print(len(new_file_list))
    print(len(file_list))

def merge_two_jsonlines(path, path2, dest_path):
    # Concatenate two jsonlines files and re-key every record as "nw<i>".
    file1 = read_jsonline(path)
    file2 = read_jsonline(path2)
    file1.extend(file2)
    for tag, i in enumerate(file1):
        i["doc_key"] = "nw" + str(tag)
        print(i["doc_key"])
    print(len(file1))
    write_jsonline(dest_path, file1)

def split_train_eval_dataset(path, train_path, eval_path):
    # Shuffle the records and split them 85/15 into train and eval sets.
    file_list = read_jsonline(path)
    print(len(file_list))
    random.shuffle(file_list)
    train_nums = math.floor(len(file_list) * 0.85)
    train_data = file_list[:train_nums]
    eval_data = file_list[train_nums:]
    print(len(train_data))
    print(len(eval_data))
    write_jsonline(train_path, train_data)
    write_jsonline(eval_path, eval_data)
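
# For a reproducible split, seed the RNG before calling the function; the
# seed value below is an arbitrary example, not part of the original code:
#
#     random.seed(42)
#     split_train_eval_dataset(path, train_path, eval_path)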

def batch_get_head(path, dest_path):
    # Replace each record's mention spans with the head spans returned by
    # get_entity_head, then print the first cluster's two mentions. Span
    # offsets index the flattened token list and ends are inclusive, hence
    # the +1 when slicing (e.g. span [0, 1] over ["The", "cat", "sat"]
    # yields ["The", "cat"]).
    file_list = read_jsonline(path)
    for file in file_list:
        new_clusters = get_entity_head(file)
        file["clusters"] = new_clusters
        flat_tokens = sum(file["sentences"], [])
        print(flat_tokens[file["clusters"][0][0][0]:file["clusters"][0][0][1] + 1])
        print(flat_tokens[file["clusters"][0][1][0]:file["clusters"][0][1][1] + 1])
    # print(len(file_list))
    write_jsonline(dest_path, file_list)

def all_file(path, dest_path, vocab_file, length):
    # Full BERT preprocessing pipeline for one jsonlines file: sub-token
    # (WordPiece) conversion, [CLS]/[SEP] insertion, speaker and
    # sentence-map construction, and remapping of cluster offsets to
    # subtoken positions.
    file_l = read_jsonline(path)
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    new_file = []
    for i in file_l:
        dic = get_sub_token(i, tokenizer, length)
        dic = get_cls_sep(dic)
        dic = get_speaker(dic)
        dic = get_sentence_map(dic)
        dic = finally_get_cluster(dic)
        new_file.append(dic)
        print("_______________________")  # progress separator
    write_jsonline(dest_path, new_file)
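
# Example invocation; the paths and the 256 max length are placeholders in
# the spirit of the bert_256 naming used in __main__ below:
#
#     all_file("data/merged.jsonlines", "data/bert_256.jsonlines",
#              "cased_L-12_H-768_A-12/vocab.txt", 256)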

def jsonlines_count(path, dest_path):
    # count_sentence_entity(dic) returns a rewritten record, or {} / 1 for
    # records that should be discarded; only real dicts are kept.
    jsonlines_list = read_jsonline(path)
    new_jsonline_list = []
    a = 0  # number of records flagged with 1 (tracked but otherwise unused)
    for dic in jsonlines_list:
        new_dic = count_sentence_entity(dic)
        if new_dic == 1:
            a += 1
        if new_dic != {} and new_dic != 1:
            new_jsonline_list.append(new_dic)
    write_jsonline(dest_path, new_jsonline_list)
    phrases_count = len(new_jsonline_list)
    sentences_count = len(jsonlines_list) - phrases_count
    print(sentences_count / len(jsonlines_list))
    print(len(jsonlines_list))
    print('new', len(new_jsonline_list))
    return sentences_count, phrases_count

def del_all_overlap(path, dest_path):
    # Drop duplicate records: two records are duplicates when they share
    # the same "pn" and the same token sequence for their first mention.
    # For every duplicate group only the lowest-index record is kept.
    file_l = read_jsonline(path)
    print(len(file_l))
    sorted_l = []
    for raw_index, raw_file in enumerate(file_l):
        raw_sentences = sum(raw_file["sentences"], [])
        raw_cluster = raw_file["clusters"]
        r_entity_s = raw_cluster[0][0][0]
        r_entity_e = raw_cluster[0][0][1]
        r_pn = raw_file["pn"]
        new_l = []
        for new_index, new_file in enumerate(file_l):
            new_sentences = sum(new_file["sentences"], [])
            new_cluster = new_file["clusters"]
            n_entity_s = new_cluster[0][0][0]
            n_entity_e = new_cluster[0][0][1]
            n_pn = new_file["pn"]
            # End offsets are inclusive, hence the +1 when slicing.
            if new_index != raw_index and r_pn == n_pn and new_sentences[
                    n_entity_s:n_entity_e + 1] == raw_sentences[
                        r_entity_s:r_entity_e + 1]:
                new_l.append(raw_index)
                new_l.append(new_index)
        new_l = sorted(set(new_l))
        if len(new_l) > 0:
            sorted_l.append(new_l)

    # Deduplicate the groups, keep the first index of each group, and
    # collect the remaining indices for removal.
    final_l = []
    for i in sorted_l:
        if i not in final_l:
            final_l.append(i)
    final_l2 = [i[1:] for i in final_l]
    final_l3 = sum(final_l2, [])
    print(len(final_l3))
    overlap_l = []
    for raw_index, raw_file in enumerate(file_l):
        if raw_index not in final_l3:
            overlap_l.append(raw_file)
    write_jsonline(dest_path, overlap_l)
    print(len(overlap_l))
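
# A minimal single-pass alternative to the quadratic scan above, keying
# each record on (pn, first-mention tokens); dedupe_by_mention is a
# hypothetical name, not part of the original code:
def dedupe_by_mention(path, dest_path):
    file_l = read_jsonline(path)
    seen = set()
    kept = []
    for file in file_l:
        tokens = sum(file["sentences"], [])
        s, e = file["clusters"][0][0]
        key = (file["pn"], tuple(tokens[s:e + 1]))  # inclusive end offset
        if key not in seen:  # keep only the first occurrence
            seen.add(key)
            kept.append(file)
    write_jsonline(dest_path, kept)
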
def create_jsonline(dest_file, all_dic_list):
    write_jsonline(dest_file, all_dic_list)

# NOTE: the following block is apparently the tail of cut_off_sentence(dic),
# the function used in __main__ below. Its signature and the loop that walks
# dic["sentences"] to compute num_count (a running token count), id/tag (the
# sentence index at the cut point), and sentence_index_tag (the token offset
# of the cut point) were truncated in the original listing.
            if num_count == min(clusters):
                tag = id
                break

    if tag > 1:
        # Drop the sentences before the earliest mention and shift the flat
        # cluster offsets [s1, e1, s2, e2] by the removed token count, then
        # rebuild the nested [[s1, e1], [s2, e2]] structure.
        sentences = sentences[tag:]
        speakers = speakers[tag:]
        new_clusters = [(i - sentence_index_tag) for i in clusters]
        new_clusters = [[new_clusters[0], new_clusters[1]],
                        [new_clusters[2], new_clusters[3]]]
        dic["clusters"] = [new_clusters]
        # print(dic["clusters"])
        dic["sentences"] = sentences
        dic["speakers"] = speakers
    print(len(dic["sentences"]))
    print('___')
    return dic


if __name__ == '__main__':
    # Equivalent to calling cut_off_jsonlines(path, dest_path) above.
    path = "/home/patsnap/PycharmProjects/webanno_preprocess/data/jsonline_data/bert_test/bert_256_merge_x4_z5_x1_z3.jsonlines"
    dest_path = "/home/patsnap/PycharmProjects/webanno_preprocess/data/jsonline_data/bert_test/cut_bert_256_merge_x4_z5_x1_z3.jsonlines"
    file_list = read_jsonline(path)
    new = []
    for dic in file_list:
        # print(len(dic["sentences"]), '??')
        dic = cut_off_sentence(dic)
        # print(len(dic["sentences"]))
        new.append(dic)
    write_jsonline(dest_path, new)