Example #1
import json
import os

from tqdm import tqdm

import pre_process_data as ppd  # project-local module providing is_default_file_type(), DEFAULT_FILE_TYPE and load()


def build_files(raw_data_path, tokenized_data_path, full_tokenizer, num_pieces):
    if ppd.is_default_file_type():  # whether to use the default JSON file type; default encoding is utf-8
        if ppd.DEFAULT_FILE_TYPE in raw_data_path:
            with open(raw_data_path, 'r', encoding='utf8') as f:
                print('reading lines')
                lines = json.load(f)
                lines = [line.replace('\n', ' [SEP] ') for line in lines]  # [SEP] marks line breaks; between paragraphs it marks the end of a paragraph
        else:
            raise Exception("请使用json文件类型,或者自定义文件类型,请看pre_process_data.py文件load方法")
    else:  # for a custom data source, call the load method in pre_process_data.py
        lines = ppd.load()
    single = ''.join(lines)
    len_single = len(single)
    if not os.path.exists(tokenized_data_path):
        os.mkdir(tokenized_data_path)
    for i in tqdm(range(num_pieces)):
        single_ids = full_tokenizer.convert_tokens_to_ids(
            full_tokenizer.tokenize(single[len_single // num_pieces * i: len_single // num_pieces * (i + 1)]))
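        # write this piece to its own file as space-separated token ids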
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'w') as f:
            for id in single_ids[:-1]:
                f.write(str(id) + ' ')
            f.write(str(single_ids[-1]))
            f.write('\n')

    print('finish')
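
A minimal usage sketch for this example (assumptions: the paths below and the use of transformers' BertTokenizer are illustrative; any tokenizer exposing tokenize() and convert_tokens_to_ids() works):

from transformers import BertTokenizer

# Hypothetical paths; tokenized_data_path must end with '/' because the function concatenates filenames onto it.
full_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
build_files(raw_data_path='data/train.json',
            tokenized_data_path='data/tokenized/',
            full_tokenizer=full_tokenizer,
            num_pieces=100)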
Example #2
def build_files(data_path, tokenized_data_path, num_pieces, full_tokenizer, min_length):
    if ppd.is_default_file_type():  # whether to use the default JSON file type; default encoding is utf-8
        if ppd.DEFAULT_FILE_TYPE in data_path:
            with open(data_path, 'r', encoding='utf8') as f:
                print('reading lines')
                lines = json.load(f)
                lines = [line.replace('\n', ' [SEP] ') for line in lines]  # [SEP] marks line breaks; between paragraphs it marks the end of a paragraph
        else:
            raise Exception("请使用json文件类型,或者自定义文件类型,请看pre_process_data.py文件load方法")
    else:  # for a custom data source, call the load method in pre_process_data.py
        lines = ppd.load()
    all_len = len(lines)
    if not os.path.exists(tokenized_data_path):
        os.mkdir(tokenized_data_path)
    for i in tqdm(range(num_pieces)):
        sublines = lines[all_len // num_pieces * i: all_len // num_pieces * (i + 1)]
        if i == num_pieces - 1:
            sublines.extend(lines[all_len // num_pieces * (i + 1):])  # append the leftover tail examples to the last piece
        sublines = [full_tokenizer.tokenize(line) for line in sublines if len(line) > min_length]  # only keep lines longer than min_length
        sublines = [full_tokenizer.convert_tokens_to_ids(line) for line in sublines]
        full_line = []
        for subline in sublines:
            full_line.append(full_tokenizer.convert_tokens_to_ids('[MASK]'))  # prepend [MASK] to mark the start of an article
            full_line.extend(subline)
            full_line.append(full_tokenizer.convert_tokens_to_ids('[CLS]'))  # append [CLS] to mark the end of an article
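        # write the piece as space-separated token ids, one file per piece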
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'w') as f:
            for id in full_line:
                f.write(str(id) + ' ')
    print('finish')
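
A minimal usage sketch for this variant (assumptions: hypothetical paths and values; min_length is the cutoff below which lines are discarded before tokenization):

from transformers import BertTokenizer

# Hypothetical paths and values; the tokenizer choice is illustrative.
full_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
build_files(data_path='data/train.json',
            tokenized_data_path='data/tokenized/',
            num_pieces=100,
            full_tokenizer=full_tokenizer,
            min_length=128)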