import math
import random

# read_jsonline / write_jsonline and FullTokenizer (Google BERT's
# tokenization.py) are assumed to be imported elsewhere in this module.


def cut_off_jsonlines(path, dest_file):
    # Apply cut_off_sentence to every record and write the result.
    file_list = read_jsonline(path)
    new_file_list = []
    for file in file_list:
        new_file = cut_off_sentence(file)
        new_file_list.append(new_file)
    # print(len(new_file_list))
    write_jsonline(dest_file, new_file_list)

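# Hedged sketch of the record layout these helpers assume. The field names
# below are inferred from the accesses in this file, not from a formal
# schema; the values are illustrative:
#
# {
#     "doc_key": "nw0",
#     "sentences": [["The", "battery", ...], ...],  # token lists, one per sentence
#     "speakers": [["spk1", "spk1", ...], ...],     # parallel to "sentences"
#     "clusters": [[[3, 5], [12, 12]], ...],        # inclusive token-offset spans
#     "pn": "...",                                  # grouping key used by del_all_overlap
# }
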
def del_empty_cluster(path, dest_path):
    file_list = read_jsonline(path)
    for file in file_list:
        if len(file["clusters"]) == 0:
            print(file["clusters"])
    new_file_list = [file for file in file_list if len(file["clusters"]) != 0]
    write_jsonline(dest_path, new_file_list)
    print(len(new_file_list))
    print(len(file_list))

def merge_two_jsonlines(path, path2, dest_path):
    # Concatenate two jsonlines files and renumber doc_key as "nw0", "nw1", ...
    file1 = read_jsonline(path)
    file2 = read_jsonline(path2)
    file1.extend(file2)
    for tag, i in enumerate(file1):
        i["doc_key"] = "nw" + str(tag)
        print(i["doc_key"])
    print(len(file1))
    write_jsonline(dest_path, file1)

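# Illustrative call (file names hypothetical): concatenates the two inputs
# and renumbers every doc_key:
# merge_two_jsonlines("a.jsonlines", "b.jsonlines", "merged.jsonlines")
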
def split_train_eval_dataset(path, train_path, eval_path):
    # Shuffle, then split 85% / 15% into train and eval sets.
    file_list = read_jsonline(path)
    print(len(file_list))
    random.shuffle(file_list)
    train_nums = math.floor(len(file_list) * 0.85)
    train_data = file_list[:train_nums]
    eval_data = file_list[train_nums:]
    print(len(train_data))
    print(len(eval_data))
    write_jsonline(train_path, train_data)
    write_jsonline(eval_path, eval_data)

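# Illustrative call (file names hypothetical): writes an 85%/15% split.
# split_train_eval_dataset("all.jsonlines", "train.jsonlines", "eval.jsonlines")
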
def batch_get_head(path, dest_path):
    # Replace each record's clusters with entity-head spans, printing the
    # first cluster's two mention texts as a sanity check (spans are
    # inclusive, hence the + 1 on the slice end).
    file_list = read_jsonline(path)
    for file in file_list:
        new_clusters = get_entity_head(file)
        file["clusters"] = new_clusters
        tokens = sum(file["sentences"], [])
        print(tokens[file["clusters"][0][0][0]:file["clusters"][0][0][1] + 1])
        print(tokens[file["clusters"][0][1][0]:file["clusters"][0][1][1] + 1])
    # print(len(file_list))
    write_jsonline(dest_path, file_list)

def all_file(path, dest_path, vocab_file, length):
    # Full BERT preprocessing pipeline: subtokenize, add [CLS]/[SEP],
    # attach speakers and a sentence map, then remap cluster offsets.
    file_l = read_jsonline(path)
    tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=False)
    new_file = []
    for i in file_l:
        dic = get_sub_token(i, tokenizer, length)
        dic = get_cls_sep(dic)
        dic = get_speaker(dic)
        dic = get_sentence_map(dic)
        dic = finally_get_cluster(dic)
        new_file.append(dic)
        print("_______________________")
    write_jsonline(dest_path, new_file)

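# Illustrative call (file names hypothetical). vocab_file should be a cased
# BERT vocabulary, since the tokenizer is built with do_lower_case=False;
# `length` is presumably the maximum subtoken segment length consumed by
# get_sub_token:
# all_file("raw.jsonlines", "bert_256.jsonlines", "vocab.txt", 256)
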
def jsonlines_count(path, dest_path):
    # Keep only records that count_sentence_entity maps to a non-empty dict;
    # returns of 1 (tallied in `a`, otherwise unused) or {} are dropped.
    jsonlines_list = read_jsonline(path)
    new_jsonline_list = []
    a = 0
    for dic in jsonlines_list:
        new_dic = count_sentence_entity(dic)
        if new_dic == 1:
            a += 1
        if new_dic != {} and new_dic != 1:
            new_jsonline_list.append(new_dic)
    write_jsonline(dest_path, new_jsonline_list)
    phrases_count = len(new_jsonline_list)
    sentences_count = len(jsonlines_list) - phrases_count
    print(sentences_count / len(jsonlines_list))
    print(len(jsonlines_list))
    print('new', len(new_jsonline_list))
    return sentences_count, phrases_count

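# Illustrative call (file names hypothetical); the function both writes the
# kept records and returns the two counts:
# sentences_count, phrases_count = jsonlines_count("in.jsonlines", "out.jsonlines")
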
def del_all_overlap(path, dest_path):
    # Group records that share the same "pn" and the same first-mention
    # text, then keep only the first record of each duplicate group.
    file_l = read_jsonline(path)
    print(len(file_l))
    sorted_l = []
    for raw_index, raw_file in enumerate(file_l):
        raw_sentences = sum(raw_file["sentences"], [])
        raw_cluster = raw_file["clusters"]
        r_entity_s = raw_cluster[0][0][0]
        r_entity_e = raw_cluster[0][0][1]
        r_pn = raw_file["pn"]
        new_l = []
        for new_index, new_file in enumerate(file_l):
            new_sentences = sum(new_file["sentences"], [])
            new_cluster = new_file["clusters"]
            n_entity_s = new_cluster[0][0][0]
            n_entity_e = new_cluster[0][0][1]
            n_pn = new_file["pn"]
            # Spans are stored inclusive (see batch_get_head), so compare
            # through end + 1 to cover the full mention text.
            if (new_index != raw_index and r_pn == n_pn
                    and new_sentences[n_entity_s:n_entity_e + 1]
                    == raw_sentences[r_entity_s:r_entity_e + 1]):
                new_l.append(raw_index)
                new_l.append(new_index)
        new_l = sorted(set(new_l))
        if len(new_l) > 0:
            sorted_l.append(new_l)
    # print(sorted_l)
    # Deduplicate the groups, then collect every index except the first
    # member of each group.
    final_l = []
    for i in sorted_l:
        if i not in final_l:
            final_l.append(i)
    final_l2 = [i[1:] for i in final_l]
    final_l3 = sum(final_l2, [])
    print(len(final_l3))
    overlap_l = []
    for raw_index, raw_file in enumerate(file_l):
        if raw_index not in final_l3:
            overlap_l.append(raw_file)
    write_jsonline(dest_path, overlap_l)
    print(len(overlap_l))

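# Worked example of the grouping above (indices hypothetical): if records
# 2, 5 and 7 share one "pn" and the same first-mention text, new_l becomes
# [2, 5, 7]; the i[1:] step marks 5 and 7 for removal, so only record 2
# survives in the output.
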
def create_jsonline(dest_file, all_dic_list):
    write_jsonline(dest_file, all_dic_list)

        # Tail of cut_off_sentence (the enclosing loop that accumulates
        # num_count and yields `id` is elided above).
        if num_count == min(clusters):
            tag = id
            break
    if tag > 1:
        sentences = sentences[tag:]
        speakers = speakers[tag:]
    new_clusters = [(i - sentence_index_tag) for i in clusters]
    new_clusters = [[new_clusters[0], new_clusters[1]],
                    [new_clusters[2], new_clusters[3]]]
    dic["clusters"] = [new_clusters]
    # print(dic["clusters"])
    dic["sentences"] = sentences
    dic["speakers"] = speakers
    print(len(dic["sentences"]))
    print('___')
    return dic


if __name__ == '__main__':
    path = "/home/patsnap/PycharmProjects/webanno_preprocess/data/jsonline_data/bert_test/bert_256_merge_x4_z5_x1_z3.jsonlines"
    dest_path = "/home/patsnap/PycharmProjects/webanno_preprocess/data/jsonline_data/bert_test/cut_bert_256_merge_x4_z5_x1_z3.jsonlines"
    file_list = read_jsonline(path)
    new = []
    for dic in file_list:
        # print(len(dic["sentences"]), '??')
        dic = cut_off_sentence(dic)
        # print(len(dic["sentences"]))
        new.append(dic)
    write_jsonline(dest_path, new)