def __init__(self, raw_path, punc_conf_path, seg_conf_path, seed=12, max_num=200000):
    """
    :param raw_path: root directory of the raw text (book) files
    :param punc_conf_path: path to the punctuation config file
    :param seg_conf_path: path to the segmentation config file (tab-separated label/symbol lines)
    :param seed: random seed
    :param max_num: maximum paragraph number in one file
    """
    random.seed(seed)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    self.logger = logging.getLogger(__name__)

    self.dir = raw_path
    self.logger.info(" root = %s", self.dir)
    self.book_list = load_file_name(self.dir)[2]
    self.logger.info(" training books = %s", ' '.join(self.book_list))

    # sentence / paragraph size limits
    self.sentence_end_symbol = list("。!??!")
    self.max_num = max_num
    self.max_sentence_in_paragraph = 8
    self.max_char_in_sentence = 128
    self.min_char_in_paragraph = 64
    self.max_char_in_paragraph = 128
    self.max_char_in_batch = 512

    # punctuation set and backslash-escaped patterns
    self.puncs = set(load_txt_data(punc_conf_path))
    self.puncs_pattern = ['\\' + x for x in self.puncs]

    # segmentation config: each line is "label<TAB>symbol"
    seg_conf = load_txt_data(seg_conf_path)
    self.segment_labels = [x.split('\t')[0] for x in seg_conf]
    self.segment_symbol = [x.split('\t')[1] for x in seg_conf]
    self.segment_label_symbol_dict = dict(zip(self.segment_labels, self.segment_symbol))
    self.place_holder = 'b'

    self.raw_data = self.read_file(self.book_list)
    self.raw_data = self.format_data()
    self.punc_data = self.tag_punc_data()
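# A minimal instantiation sketch for the class this __init__ belongs to. The class
# name PuncDataBuilder and the raw-data path are hypothetical placeholders; only the
# config paths under ./utils/config/ appear elsewhere in this repo. The punctuation
# file is assumed to hold one mark per line, and the segmentation file tab-separated
# "label<TAB>symbol" lines, as the parsing above implies.
builder = PuncDataBuilder(
    raw_path='./data/raw/',
    punc_conf_path='./utils/config/punctuation.dat',
    seg_conf_path='./utils/config/segmentation.dat',
    seed=12,
    max_num=200000)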
def split_doc2(data_path, out_path):
    """Convert "document,abstract" CSV lines into CNN/DailyMail-style .story files."""
    import re
    data = load_txt_data(data_path)
    doc_index = 0
    for i in tqdm(range(len(data))):
        try:
            line = data[i].split(',')
            if len(line[0]) < 100:  # skip rows whose document part is too short
                continue
            abstract = re.sub("[\" ]", "", line[1])
            abstract = ' '.join(abstract)          # space-separate the abstract characters
            tmp = re.sub("[\" ]", "", line[0])
            tmp = tmp.split('。')                   # split the document on full stops
            document = [' '.join(x) for x in tmp]  # space-separate each sentence's characters
        except IndexError:
            continue
        for j in range(len(document)):
            document[j] = document[j] + '\n'
        new_doc = document + ['@highlight\n'] + [abstract]
        save_txt_file(new_doc, out_path + str(doc_index) + '.story')
        doc_index += 1
def tokenize_format_lines(args):
    """Tokenize raw CSV lines into {'src': ..., 'tgt': ...} dicts of character lists."""
    input_path = args.raw_path
    output_path = args.json_path + args.data_name + '.json'
    input_data = load_txt_data(input_path)
    pun = oneOf(list("。,,;:()()!?\\—、丨/"))  # punctuation used to split sentences
    json_data_set = []
    for i in tqdm(range(len(input_data))):
        json_dict = {}
        raw = input_data[i].split(',', 4)
        if args.vy_predict:
            # prediction mode: no gold abstract; column 0 is the doc id, column 4 the document
            abstract = [list('NONE')]
            sentence = pun.split(re.sub("\"", '', raw[4]))
        else:
            # training mode: column 0 is the abstract, column 1 the document
            abstract = [list(raw[0])]
            sentence = pun.split(raw[1])
        # keep non-empty sentences, each as a list of characters
        split_sentence = []
        for split_content in sentence:
            split_content = list(split_content)
            if split_content:
                split_sentence.append(split_content)
        json_dict['src'] = split_sentence
        json_dict['tgt'] = abstract
        if args.vy_predict:
            json_dict['doc_id'] = raw[0]
        json_data_set.append(json_dict)
    # with open(output_path, 'w', encoding='utf-8') as save:
    #     save.write(json.dumps(json_data_set, ensure_ascii=False))
    return json_data_set
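# A small standalone sketch of the punctuation splitting used above: pyparsing's
# oneOf(...) matches any of the listed marks, and ParserElement.split() yields the
# text fragments between matches. The sample sentence is invented for illustration.
from pyparsing import oneOf

pun = oneOf(list("。,,;:()()!?\\—、丨/"))
fragments = [frag for frag in pun.split("今天天气不错。我们去公园,好吗?") if frag]
# fragments should come out roughly as ['今天天气不错', '我们去公园', '好吗'];
# the `if frag` filter drops empty pieces, mirroring the `if split_content`
# check inside tokenize_format_lines.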
def get_labels(self):
    # NER-style label set kept for reference:
    # ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "[CLS]", "[SEP]"]
    from utils.dataio import load_txt_data

    # punctuation labels are loaded from the config file instead of a hard-coded list
    puncs = [x for x in load_txt_data('./utils/config/punctuation.dat')]
    segs = [
        x.split('\t')[1]
        for x in load_txt_data('./utils/config/segmentation.dat')
    ]
    # puncs = puncs + [x + '#end#' for x in puncs] + ["[CLS]", "[SEP]"]
    puncs += ["[CLS]", "[SEP]"]
    segs += ["[CLS]", "[SEP]"]  # note: segs is built but not returned here
    return puncs
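# A hedged sketch of how a label list like the one returned above is typically
# consumed in a BERT-style token-classification pipeline: labels are mapped to ids
# starting from 1, with 0 usually reserved for padding. The processor class name is
# hypothetical and the mapping is an assumption about the surrounding trainer code,
# which is not shown in this excerpt.
processor = PuncProcessor()  # hypothetical owner of get_labels()
label_list = processor.get_labels()
label_map = {label: i for i, label in enumerate(label_list, 1)}
num_labels = len(label_list) + 1  # +1 for the padding id 0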
def filter_data(path):
    """Keep only rows whose document part has at least 100 characters; rewrite the file in place."""
    data = load_txt_data(path)
    res = []
    for item in tqdm(data, desc='Filter'):
        raw = item.split(',')
        abst = raw[0]
        doc = raw[1]
        if len(doc) >= 100:
            res.append('{},{}'.format(abst, doc))
    save_txt_file(res, path)
def revers_index(path):
    """Swap the two comma-separated columns (doc,abstract -> abstract,doc) in place."""
    data = load_txt_data(path)
    res = []
    for item in data:
        raw = item.split(',')
        doc = raw[0]
        try:
            abst = raw[1]
        except IndexError:
            # skip malformed rows that lack a second column
            continue
        res.append('{},{}'.format(abst, doc))
    save_txt_file(res, path)
def split_doc(data_path, out_path):
    """Convert "abstract,document" CSV lines into zero-padded .story files."""
    from pyparsing import oneOf

    data = load_txt_data(data_path)
    punc = oneOf(list("。,;;!?"))  # sentence-splitting punctuation
    doc_index = 0
    for i in tqdm(range(len(data)), desc='split_doc'):
        line = data[i].split(',')
        abstract = " ".join(line[0])  # space-separate the abstract characters
        document = [' '.join(x) for x in punc.split(line[1])]
        for j in range(len(document)):
            document[j] = document[j] + '\n'
        new_doc = document + ['@highlight\n'] + [abstract]
        _doc_index = str(doc_index).zfill(9)  # zero-pad the file index to 9 digits
        save_txt_file(new_doc, out_path + _doc_index + '.story')
        doc_index += 1
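# Illustrative sketch of what split_doc writes: one .story file per CSV row, with the
# space-separated document sentences first, then an @highlight marker, then the
# abstract (the CNN/DailyMail story convention used by many summarization pipelines).
# The sample text below is invented.
#
#   000000000.story
#   ---------------
#   这 是 第 一 句
#   这 是 第 二 句
#   @highlight
#   这 是 摘 要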
def read_file(self, file_names):
    """
    Read the raw files paragraph by paragraph (one natural paragraph per entry).
    :param file_names: list of file names under self.data_dir
    :return: load data:
        [
            ["para1"],
            ["para2"],
        ]
    """
    res = []
    count = 0
    for file_name in tqdm(file_names, desc='Load Data'):
        raw = load_txt_data(self.data_dir + file_name, origin=True)
        count += len(raw)
        for item in tqdm(raw, desc='adding data from {}'.format(file_name)):
            res.append(item)
    self.logger.info(" Num paragraph = %d", count)
    return res
def read_file(self, file_names):
    """
    Read the documents and keep only well-formed paragraphs.
    :param file_names: list of file names under self.dir
    :return: load data:
        [
            ["sentence1", "sentence2", "sentence3"],
            ["sentence1", "sentence2", "sentence3"],
        ]
    """
    res = []
    for file_name in file_names:
        raw = load_txt_data(self.dir + file_name, origin=True)[:self.max_num]
        # raw = load_txt_data(self.dir + file_name, origin=True)[-50:]
        for i in trange(len(raw), desc='Load {}'.format(file_name)):
            paragraph: str = raw[i].strip()
            paragraph_char_num = self.count_info(paragraph)
            # keep only paragraphs whose character count falls within the configured range
            if self.min_char_in_paragraph < paragraph_char_num < self.max_char_in_paragraph:
                paragraph: str = self.remove_none_puncs_paragraph(paragraph)
                if paragraph:
                    paragraph: str = re.sub('\\u3000', ':', paragraph)  # replace full-width spaces (U+3000) with ':'
                    paragraph: str = self.format_puncs_space(paragraph)
                    paragraph: list = self.split_paragraph(paragraph)
                    paragraph: list = [
                        x for x in paragraph
                        if len(x) <= self.max_char_in_sentence
                    ]
                    res.append(paragraph)
    import random
    random.seed(1024)
    random.shuffle(res)
    self.logger.info(" Num paragraph = %d", len(res))
    return res
def merge_files(path_list, merge_path):
    data = []
    for path in path_list:
        data += load_txt_data(path)
    save_txt_file(data, merge_path)
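# Example call with made-up paths: concatenates the listed text files in order and
# writes the combined lines to a single output file via save_txt_file.
merge_files(['./data/part_a.txt', './data/part_b.txt'], './data/merged.txt')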