# NOTE: these preprocess templates rely on module-level helpers from the
# enclosing run scripts (os, utils, curr_dir, and train_helper for the
# multi-turn template below); they are not re-imported here.
def preprocess_raw_data(file, tokenize, token2id_dct, **kwargs):
    """
    # Template for preprocessing your own data
    # Input file format: sentence1\tsentence2
    # [filter] filtering
    # [segment] tokenization
    # [build vocab] build the vocabulary
    # [split] train-dev-test
    """
    seg_file = file.rsplit('.', 1)[0] + '_seg.txt'
    if not os.path.exists(seg_file):
        items = utils.file2items(file)
        # Filtering
        # filter here

        print('number of examples after filtering', len(items))

        # Tokenization
        for i, item in enumerate(items):
            item[0] = ' '.join(tokenize(item[0]))
            item[1] = ' '.join(tokenize(item[1]))

        utils.list2file(seg_file, items)
        print('saved tokenized data', 'number of examples', len(items), seg_file)
    else:
        # Load already-tokenized data
        items = utils.file2items(seg_file)

    # Split into train/dev (no test set)
    train_items, dev_items = utils.split_file(items, ratio='19:1', shuffle=True, seed=1234)

    # Build vocabularies (optional)
    need_to_rebuild = []
    for token2id_name in token2id_dct:
        if not token2id_dct[token2id_name]:
            print(f'vocab {token2id_name} failed to load, will build and save it')
            need_to_rebuild.append(token2id_name)

    if need_to_rebuild:
        print(f'building missing vocab files...{need_to_rebuild}')
        for items in [train_items, dev_items]:  # count over train and dev only
            for item in items:
                if 'word2id' in need_to_rebuild:
                    token2id_dct['word2id'].to_count(item[0].split(' '))
                    token2id_dct['word2id'].to_count(item[1].split(' '))
        if 'word2id' in need_to_rebuild:
            token2id_dct['word2id'].rebuild_by_counter(restrict=['<pad>', '<unk>', '<eos>'], min_freq=5, max_vocab_size=30000)
            token2id_dct['word2id'].save(f'{curr_dir}/../data/s2s_word2id.dct')
    else:
        print('using existing vocab files...')

    return train_items, dev_items, None
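# The templates in this file only assume that each value in token2id_dct is
# falsy until a vocab has been loaded and exposes to_count /
# rebuild_by_counter / save. The class below is a minimal illustrative
# stand-in for that interface (hypothetical _DemoVocab, NOT the project's
# real vocab class); it is here only to document the assumed contract.
from collections import Counter


class _DemoVocab:
    """Illustrative vocab counter matching the interface used above."""

    def __init__(self):
        self.counter = Counter()
        self.token2id = {}

    def __bool__(self):
        # Falsy until a vocab has been built or loaded, which is what
        # `if not token2id_dct[name]` checks in the templates.
        return bool(self.token2id)

    def to_count(self, tokens):
        # Accumulate token frequencies from one example.
        self.counter.update(tokens)

    def rebuild_by_counter(self, restrict=(), min_freq=1, max_vocab_size=None):
        # Reserved tokens first, then the most frequent remaining tokens.
        vocab = list(restrict)
        for tok, freq in self.counter.most_common():
            if freq < min_freq or tok in restrict:
                continue
            if max_vocab_size and len(vocab) >= max_vocab_size:
                break
            vocab.append(tok)
        self.token2id = {tok: i for i, tok in enumerate(vocab)}

    def save(self, path):
        # Persist the vocab as "token<TAB>id" lines.
        with open(path, 'w', encoding='utf8') as f:
            for tok, idx in self.token2id.items():
                f.write(f'{tok}\t{idx}\n')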
def preprocess_common_dataset_Toutiao(file, tokenize, token2id_dct, **kwargs):
    train_file = f'{curr_dir}/../data/train.toutiao.cls.txt'
    dev_file = f'{curr_dir}/../data/valid.toutiao.cls.txt'
    test_file = f'{curr_dir}/../data/test.toutiao.cls.txt'
    items_lst = []
    for file in [train_file, dev_file, test_file]:
        seg_file = file.rsplit('.', 1)[0] + '_seg.txt'  # tokenize the raw text and save it with a _seg.txt suffix
        if not os.path.exists(seg_file):
            items = utils.file2items(file, deli='\t')
            # Filtering
            # filter here

            print('number of examples after filtering', len(items))

            # Tokenization
            for i, item in enumerate(items):
                item[0] = ' '.join(tokenize(item[0]))
            utils.list2file(seg_file, items)
            print('saved tokenized data', 'number of examples', len(items), seg_file)
            items_lst.append(items)
        else:
            # Load already-tokenized data
            items_lst.append(utils.file2items(seg_file))
    train_items, dev_items, test_items = items_lst

    # Build vocabularies (optional)
    need_to_rebuild = []
    for token2id_name in token2id_dct:
        if not token2id_dct[token2id_name]:
            print(f'vocab {token2id_name} failed to load, will build and save it')
            need_to_rebuild.append(token2id_name)

    if need_to_rebuild:
        print(f'building missing vocab files...{need_to_rebuild}')
        for items in [train_items, dev_items]:  # count over train and dev only
            for item in items:
                if 'word2id' in need_to_rebuild:
                    token2id_dct['word2id'].to_count(item[0].split(' '))
                if 'label2id' in need_to_rebuild:
                    token2id_dct['label2id'].to_count([item[1]])
        if 'word2id' in need_to_rebuild:
            token2id_dct['word2id'].rebuild_by_counter(restrict=['<pad>', '<unk>'], min_freq=1, max_vocab_size=20000)
            token2id_dct['word2id'].save(f'{curr_dir}/../data/toutiao_cls_word2id.dct')
        if 'label2id' in need_to_rebuild:
            token2id_dct['label2id'].rebuild_by_counter(restrict=['<unk>'])
            token2id_dct['label2id'].save(f'{curr_dir}/../data/toutiao_cls_label2id.dct')
    else:
        print('using existing vocab files...')

    return train_items, dev_items, test_items
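# Illustrative only: one line of the Toutiao classification data is
# "sentence\tlabel"; after tokenization, item[0] holds the space-joined words
# and item[1] the label. The sentence, its tokenization, and the label value
# below are example data, not taken from the actual files.
def _demo_toutiao_item():
    """Shows the [tokenized_sentence, label] item shape the function returns."""
    item = ['京城 最 值得 你 来场 文化 之旅 的 博物馆', 'news_culture']
    words_for_word2id = item[0].split(' ')   # counted into word2id
    label_for_label2id = [item[1]]           # counted into label2id
    return words_for_word2id, label_for_label2id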
def Doubanchange2items(file):
    # Convert to [multi_src, tgt] format, character-level split
    exm_lst = []
    sess_lst = utils.file2items(file)
    for sess in sess_lst:
        sess = [' '.join(s) for s in sess]  # split by character
        for i in range(1, len(sess)):
            multi_src = '$$$'.join(sess[:i])
            tgt = sess[i]
            exm_lst.append([multi_src, tgt])
    return exm_lst
def Doubanchange2items(file):
    # Convert to [multi_src, tgt] format (word-level variant; relies on a
    # module-level tokenize function from the enclosing run script)
    # Tokenization
    seg_file = file.rsplit('.', 1)[0] + '_seg.txt'
    if not os.path.exists(seg_file):
        items = utils.file2items(file)
        # Tokenization
        for i, item in enumerate(items):
            for j in range(len(item)):
                items[i][j] = ' '.join(tokenize(items[i][j]))
        utils.list2file(seg_file, items)
        print('saved tokenized data', 'number of examples', len(items), seg_file)
    else:
        items = utils.file2items(seg_file)
    exm_lst = []
    sess_lst = items
    for sess in sess_lst:
        for i in range(1, len(sess)):
            multi_src = '$$$'.join(sess[:i])
            tgt = sess[i]
            exm_lst.append([multi_src, tgt])
    return exm_lst
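# Illustrative only: how one tokenized Douban session (one turn per column)
# unrolls into [multi_src, tgt] pairs, with history turns joined by '$$$'.
# The session content is a made-up example.
def _demo_session_unroll():
    sess = ['你 好', '你 也 好', '在 干嘛 呢']  # hypothetical 3-turn session
    pairs = []
    for i in range(1, len(sess)):
        pairs.append(['$$$'.join(sess[:i]), sess[i]])
    # pairs == [['你 好', '你 也 好'],
    #           ['你 好$$$你 也 好', '在 干嘛 呢']]
    return pairs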
def change2line(file):
    exm_lst = []
    items = utils.file2items(file, deli=' ')
    curr_sent = []
    curr_bmeo = []
    for item in items:
        if len(item) == 1:  # separator line, parsed as ['']
            if curr_sent and curr_bmeo:
                exm_lst.append([' '.join(curr_sent), ' '.join(curr_bmeo)])
            curr_sent, curr_bmeo = [], []
            continue
        curr_sent.append(item[0])
        curr_bmeo.append(item[1])
    if curr_sent and curr_bmeo:
        exm_lst.append([' '.join(curr_sent), ' '.join(curr_bmeo)])
    return exm_lst
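# Illustrative only: change2line expects CoNLL-style input with one
# "char tag" pair per line and a blank line between sentences, and returns
# one [space-joined chars, space-joined tags] item per sentence. The rows
# and tags below are example data.
def _demo_change2line_shape():
    column_items = [['浦', 'B'], ['东', 'E'], ['开', 'O'], ['发', 'O'], ['']]  # [''] marks a sentence break
    # change2line would turn this into:
    expected = [['浦 东 开 发', 'B E O O']]
    return column_items, expected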
def preprocess_raw_data(file, tokenize, token2id_dct, **kwargs):
    """
    # Template for preprocessing your own data
    # Input file format: sentence (space-separated chars)\tlabels (space-separated)
    # [filter] filtering
    # [segment] tokenization: NER data is usually already split into characters
    #   separated by spaces, so no word-segmentation step is needed
    # [build vocab] build the vocabulary
    # [split] train-dev-test
    """
    items = utils.file2items(file)
    # Filtering
    # filter here

    print('number of examples after filtering', len(items))

    # Split
    train_items, dev_items, test_items = utils.split_file(items, ratio='18:1:1', shuffle=True, seed=1234)

    # Build vocabularies (optional)
    need_to_rebuild = []
    for token2id_name in token2id_dct:
        if not token2id_dct[token2id_name]:
            print(f'vocab {token2id_name} failed to load, will build and save it')
            need_to_rebuild.append(token2id_name)

    if need_to_rebuild:
        print(f'building missing vocab files...{need_to_rebuild}')
        for items in [train_items, dev_items]:  # count over train and dev only
            for item in items:
                if 'char2id' in need_to_rebuild:
                    token2id_dct['char2id'].to_count(item[0].split(' '))
                if 'bmeo2id' in need_to_rebuild:
                    token2id_dct['bmeo2id'].to_count(item[1].split(' '))
        if 'char2id' in need_to_rebuild:
            token2id_dct['char2id'].rebuild_by_counter(restrict=['<pad>', '<unk>'], min_freq=1, max_vocab_size=5000)
            token2id_dct['char2id'].save(f'{curr_dir}/../data/s2l_char2id.dct')
        if 'bmeo2id' in need_to_rebuild:
            token2id_dct['bmeo2id'].rebuild_by_counter(restrict=['<pad>', '<unk>'])
            token2id_dct['bmeo2id'].save(f'{curr_dir}/../data/s2l_bmeo2id.dct')
    else:
        print('using existing vocab files...')

    return train_items, dev_items, test_items
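# Illustrative only: one NER item is [space-separated chars, space-separated
# BMEO tags]; char2id counts item[0].split(' ') and bmeo2id counts
# item[1].split(' '). Counter stands in for the project's vocab counter,
# and the sentence/tags are example data.
def _demo_ner_item_counts():
    from collections import Counter
    item = ['上 海 天 气 好', 'B E O O O']  # hypothetical example
    char_counts = Counter(item[0].split(' '))
    bmeo_counts = Counter(item[1].split(' '))
    return char_counts, bmeo_counts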
def preprocess_raw_data(file, tokenize, token2id_dct, **kwargs):
    """
    # Template for preprocessing your own data
    # Input file format: turn1\tturn2\t...\tturn n of a multi-turn dialogue
    # [filter] filtering
    # [segment] tokenization
    # [build vocab] build the vocabulary
    # [split] train-dev-test
    """
    seg_file = file.rsplit('.', 1)[0] + '_seg.txt'
    if not os.path.exists(seg_file):
        sess_lst = utils.file2items(file)
        # Filtering
        # filter here

        print('number of sessions after filtering', len(sess_lst))

        # Tokenization
        for i, sess in enumerate(sess_lst):
            # sess_lst[i] = [' '.join(s) for s in sess]  # split by character
            sess_lst[i] = [' '.join(tokenize(s)) for s in sess]  # split by word
        utils.list2file(seg_file, sess_lst)
        print('saved tokenized data', 'number of sessions', len(sess_lst), seg_file)
    else:
        # Load already-tokenized data
        sess_lst = utils.file2items(seg_file)

    # Convert to multi-turn format: turns inside multi_src are separated by $$$
    items = []
    for sess in sess_lst:
        for i in range(1, len(sess)):
            multi_src = '$$$'.join(sess[:i])
            tgt = sess[i]
            items.append([multi_src, tgt])
    # items: [['w w w$$$w w', 'w w w'], ...]

    # Split into train/dev (no test set)
    train_items, dev_items = utils.split_file(items, ratio='19:1', shuffle=True, seed=1234)

    # Build vocabularies (optional): joint word and char vocab
    need_to_rebuild = []
    for token2id_name in token2id_dct:
        if not token2id_dct[token2id_name]:
            print(f'vocab {token2id_name} failed to load, will build and save it')
            need_to_rebuild.append(token2id_name)

    if need_to_rebuild:
        print(f'building missing vocab files...{need_to_rebuild}')
        for items in [train_items, dev_items]:  # count over train and dev only
            for item in items:
                if 'word2id' in need_to_rebuild:
                    for sent in item[0].split('$$$'):
                        token2id_dct['word2id'].to_count(sent.split(' '))
                    token2id_dct['word2id'].to_count(item[1].split(' '))
                if 'char2id' in need_to_rebuild:
                    for sent in item[0].split('$$$'):
                        token2id_dct['char2id'].to_count(list(sent.replace(' ', '')))
                    token2id_dct['char2id'].to_count(list(item[1].replace(' ', '')))
        if 'word2id' in need_to_rebuild:
            token2id_dct['word2id'].rebuild_by_counter(restrict=['<pad>', '<unk>', '<eos>'], min_freq=1, max_vocab_size=30000)
            token2id_dct['word2id'].save(f'{curr_dir}/../data/mmch_word2id.dct')
        if 'char2id' in need_to_rebuild:
            token2id_dct['char2id'].rebuild_by_counter(restrict=['<pad>', '<unk>', '<eos>'], min_freq=1, max_vocab_size=4000)
            token2id_dct['char2id'].save(f'{curr_dir}/../data/mmch_char2id.dct')
    else:
        print('using existing vocab files...')

    # Negative sampling
    train_items = train_helper.gen_pos_neg_sample(train_items, sample_idx=1, num_neg_exm=4)
    dev_items = train_helper.gen_pos_neg_sample(dev_items, sample_idx=1, num_neg_exm=4)
    return train_items, dev_items, None
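# Illustrative only: for the multi-turn match data above, word2id sees the
# space-split tokens of every '$$$'-separated turn plus the target, while
# char2id sees the same text with spaces stripped and split into characters.
# Counter stands in for the project's vocab counter; the item is example data.
def _demo_multi_turn_counts():
    from collections import Counter
    item = ['今天 天气 不错$$$是 的 呢', '出去 走走 吧']  # hypothetical item
    word_counts, char_counts = Counter(), Counter()
    for sent in item[0].split('$$$'):
        word_counts.update(sent.split(' '))
        char_counts.update(list(sent.replace(' ', '')))
    word_counts.update(item[1].split(' '))
    char_counts.update(list(item[1].replace(' ', '')))
    return word_counts, char_counts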