import json
import os

from tqdm import tqdm

import pre_process_data as ppd  # project module that handles custom data loading


def build_files(raw_data_path, tokenized_data_path, full_tokenizer, num_pieces):
    if ppd.is_default_file_type():  # default file type is JSON, assumed to be utf-8 encoded
        if ppd.DEFAULT_FILE_TYPE in raw_data_path:
            with open(raw_data_path, 'r', encoding='utf8') as f:
                print('reading lines')
                lines = json.load(f)
                # Replace newlines with [SEP] so paragraph boundaries survive tokenization.
                lines = [line.replace('\n', ' [SEP] ') for line in lines]
        else:
            raise Exception('Please use a JSON file or a custom file type; '
                            'see the load method in pre_process_data.py')
    else:
        # Custom data source: delegate loading to pre_process_data.py.
        lines = ppd.load()
    # Concatenate all articles into one string and cut it into num_pieces chunks by character offset.
    single = ''.join(lines)
    len_single = len(single)
    if not os.path.exists(tokenized_data_path):
        os.mkdir(tokenized_data_path)
    for i in tqdm(range(num_pieces)):
        single_ids = full_tokenizer.convert_tokens_to_ids(
            full_tokenizer.tokenize(single[len_single // num_pieces * i: len_single // num_pieces * (i + 1)]))
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'w') as f:
            for token_id in single_ids[:-1]:
                f.write(str(token_id) + ' ')
            f.write(str(single_ids[-1]))
            f.write('\n')
    print('finish')
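
# A minimal usage sketch for the variant above, assuming a BERT-style vocabulary
# tokenizer (the function only needs tokenize() and convert_tokens_to_ids()).
# The tokenizer name, paths, and num_pieces value are illustrative placeholders,
# not values taken from the original project.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
build_files(raw_data_path='data/train.json',
            tokenized_data_path='data/tokenized/',  # must end with '/' because paths are joined by string concatenation
            full_tokenizer=tokenizer,
            num_pieces=100)
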
def build_files(data_path, tokenized_data_path, num_pieces, full_tokenizer, min_length):
    if ppd.is_default_file_type():  # default file type is JSON, assumed to be utf-8 encoded
        if ppd.DEFAULT_FILE_TYPE in data_path:
            with open(data_path, 'r', encoding='utf8') as f:
                print('reading lines')
                lines = json.load(f)
                # Replace newlines with [SEP] so paragraph boundaries survive tokenization.
                lines = [line.replace('\n', ' [SEP] ') for line in lines]
        else:
            raise Exception('Please use a JSON file or a custom file type; '
                            'see the load method in pre_process_data.py')
    else:
        # Custom data source: delegate loading to pre_process_data.py.
        lines = ppd.load()
    all_len = len(lines)
    if not os.path.exists(tokenized_data_path):
        os.mkdir(tokenized_data_path)
    for i in tqdm(range(num_pieces)):
        sublines = lines[all_len // num_pieces * i: all_len // num_pieces * (i + 1)]
        if i == num_pieces - 1:
            sublines.extend(lines[all_len // num_pieces * (i + 1):])  # append the leftover articles to the last piece
        sublines = [full_tokenizer.tokenize(line) for line in sublines if len(line) > min_length]  # keep only articles longer than min_length
        sublines = [full_tokenizer.convert_tokens_to_ids(line) for line in sublines]
        full_line = []
        for subline in sublines:
            full_line.append(full_tokenizer.convert_tokens_to_ids('[MASK]'))  # [MASK] marks the start of an article
            full_line.extend(subline)
            full_line.append(full_tokenizer.convert_tokens_to_ids('[CLS]'))  # [CLS] marks the end of an article
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'w') as f:
            for token_id in full_line:
                f.write(str(token_id) + ' ')
    print('finish')
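
# A minimal usage sketch for the variant above, which splits the data article by
# article, drops articles shorter than min_length, and brackets each article with
# [MASK] (start) and [CLS] (end) ids instead of cutting one concatenated string by
# character offset. The tokenizer name, paths, and numbers are illustrative
# placeholders, not values taken from the original project.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
build_files(data_path='data/train.json',
            tokenized_data_path='data/tokenized/',
            num_pieces=100,
            full_tokenizer=tokenizer,
            min_length=128)

# Each piece is written as a flat, space-separated list of token ids, so a
# training loop can read it back directly (placeholder path):
with open('data/tokenized/tokenized_train_0.txt', 'r') as f:
    token_ids = [int(x) for x in f.read().split()]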