Example 1
def preprocess_raw_data(file, tokenize, token2id_dct, **kwargs):
    """
    # 处理自有数据函数模板
    # file文件数据格式: 句子1\t句子2
    # [filter] 过滤
    # [segment] 分词
    # [build vocab] 构造词典
    # [split] train-dev-test
    """
    seg_file = file.rsplit('.', 1)[0] + '_seg.txt'
    if not os.path.exists(seg_file):
        items = utils.file2items(file)
        # Filter
        # filter here

        print('Number of items after filtering:', len(items))

        # Word segmentation
        for i, item in enumerate(items):
            item[0] = ' '.join(tokenize(item[0]))
            item[1] = ' '.join(tokenize(item[1]))
        utils.list2file(seg_file, items)
        print('Saved segmented data.', 'Number of items:', len(items), seg_file)
    else:
        # Load the already-segmented data
        items = utils.file2items(seg_file)

    # Split into train/dev (no test set)
    train_items, dev_items = utils.split_file(items,
                                              ratio='19:1',
                                              shuffle=True,
                                              seed=1234)

    # Build the vocabularies (optional)
    need_to_rebuild = []
    for token2id_name in token2id_dct:
        if not token2id_dct[token2id_name]:
            print(f'Vocabulary {token2id_name} could not be loaded; it will be built and saved')
            need_to_rebuild.append(token2id_name)

    if need_to_rebuild:
        print(f'Building missing vocabulary files...{need_to_rebuild}')
        for items in [train_items, dev_items]:  # count vocab over train and dev only
            for item in items:
                if 'word2id' in need_to_rebuild:
                    token2id_dct['word2id'].to_count(item[0].split(' '))
                    token2id_dct['word2id'].to_count(item[1].split(' '))
        if 'word2id' in need_to_rebuild:
            token2id_dct['word2id'].rebuild_by_counter(
                restrict=['<pad>', '<unk>', '<eos>'],
                min_freq=5,
                max_vocab_size=30000)
            token2id_dct['word2id'].save(f'{curr_dir}/../data/s2s_word2id.dct')
    else:
        print('Using existing vocabulary files...')

    return train_items, dev_items, None
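
These snippets assume a project-specific vocabulary object in token2id_dct: it is truth-checked when loaded, filled with to_count, rebuilt with rebuild_by_counter, and written out with save. The real class is not shown anywhere above; the following is only a minimal sketch inferred from those call sites, with every implementation detail an assumption.

from collections import Counter


class Token2Id:
    """Minimal stand-in for the project's vocabulary class (assumed interface)."""

    def __init__(self):
        self.counter = Counter()
        self.token2id = {}

    def __bool__(self):
        # `if not token2id_dct[name]` above treats an empty mapping as "not loaded".
        return bool(self.token2id)

    def to_count(self, tokens):
        # Accumulate token frequencies from one already-split token list.
        self.counter.update(tokens)

    def rebuild_by_counter(self, restrict=(), min_freq=1, max_vocab_size=None):
        # Reserved tokens such as <pad>/<unk> come first, then tokens by frequency.
        self.token2id = {tok: i for i, tok in enumerate(restrict)}
        for tok, freq in self.counter.most_common():
            if max_vocab_size and len(self.token2id) >= max_vocab_size:
                break
            if freq < min_freq or tok in self.token2id:
                continue
            self.token2id[tok] = len(self.token2id)

    def save(self, path):
        # One "token<TAB>id" pair per line.
        with open(path, 'w', encoding='utf-8') as f:
            for tok, idx in self.token2id.items():
                f.write(f'{tok}\t{idx}\n')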
Example 2
def preprocess_common_dataset_Toutiao(file, tokenize, token2id_dct, **kwargs):
    train_file = f'{curr_dir}/../data/train.toutiao.cls.txt'
    dev_file = f'{curr_dir}/../data/valid.toutiao.cls.txt'
    test_file = f'{curr_dir}/../data/test.toutiao.cls.txt'
    items_lst = []
    for file in [train_file, dev_file, test_file]:
        seg_file = file.rsplit('.', 1)[0] + '_seg.txt'  # segment the raw text and cache it with a _seg.txt suffix
        if not os.path.exists(seg_file):
            items = utils.file2items(file, deli='\t')
            # Filter
            # filter here

            print('Number of items after filtering:', len(items))

            # Word segmentation
            for i, item in enumerate(items):
                item[0] = ' '.join(tokenize(item[0]))
            utils.list2file(seg_file, items)
            print('Saved segmented data.', 'Number of items:', len(items), seg_file)
            items_lst.append(items)
        else:
            # Load the already-segmented data
            items_lst.append(utils.file2items(seg_file))

    train_items, dev_items, test_items = items_lst

    # Build the vocabularies (optional)
    need_to_rebuild = []
    for token2id_name in token2id_dct:
        if not token2id_dct[token2id_name]:
            print(f'Vocabulary {token2id_name} could not be loaded; it will be built and saved')
            need_to_rebuild.append(token2id_name)

    if need_to_rebuild:
        print(f'Building missing vocabulary files...{need_to_rebuild}')
        for items in [train_items, dev_items]:  # count vocab over train and dev only
            for item in items:
                if 'word2id' in need_to_rebuild:
                    token2id_dct['word2id'].to_count(item[0].split(' '))
                if 'label2id' in need_to_rebuild:
                    token2id_dct['label2id'].to_count([item[1]])
        if 'word2id' in need_to_rebuild:
            token2id_dct['word2id'].rebuild_by_counter(restrict=['<pad>', '<unk>'], min_freq=1, max_vocab_size=20000)
            token2id_dct['word2id'].save(f'{curr_dir}/../data/toutiao_cls_word2id.dct')
        if 'label2id' in need_to_rebuild:
            token2id_dct['label2id'].rebuild_by_counter(restrict=['<unk>'])
            token2id_dct['label2id'].save(f'{curr_dir}/../data/toutiao_cls_label2id.dct')
    else:
        print('Using existing vocabulary files...')

    return train_items, dev_items, test_items
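
For context, a hypothetical call to the function above. jieba.lcut stands in for the word segmenter and Token2Id refers to the sketch shown after Example 1; neither is mandated by the original code, and the hard-coded data paths and utils module must exist for the call to succeed.

import jieba

token2id_dct = {
    'word2id': Token2Id(),   # empty, so it will be rebuilt and saved
    'label2id': Token2Id(),
}
train_items, dev_items, test_items = preprocess_common_dataset_Toutiao(
    file=None,             # unused: the train/dev/test paths are hard-coded inside
    tokenize=jieba.lcut,   # sentence -> list of word tokens
    token2id_dct=token2id_dct,
)
print(len(train_items), len(dev_items), len(test_items))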
Example 3
def Doubanchange2items(file):
    # Convert to [multi_src, tgt] format, character-level segmentation
    exm_lst = []
    sess_lst = utils.file2items(file)
    for sess in sess_lst:
        sess = [' '.join(s) for s in sess]  # split each turn into characters
        for i in range(1, len(sess)):
            multi_src = '$$$'.join(sess[:i])
            tgt = sess[i]
            exm_lst.append([multi_src, tgt])
    return exm_lst
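
To make the context expansion concrete, here is the same inner loop applied to one toy, already character-split session (bypassing utils.file2items, so this is purely illustrative):

sess = ['你 好', '最 近 怎 么 样', '挺 好 的']
examples = []
for i in range(1, len(sess)):
    examples.append(['$$$'.join(sess[:i]), sess[i]])
# examples == [['你 好', '最 近 怎 么 样'],
#              ['你 好$$$最 近 怎 么 样', '挺 好 的']]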
Example 4
def Doubanchange2items(file):
    # Convert to [multi_src, tgt] format
    # `tokenize` is provided by the enclosing preprocessing function from
    # which this helper was excerpted
    # Word segmentation (cached in a _seg.txt file)
    seg_file = file.rsplit('.', 1)[0] + '_seg.txt'
    if not os.path.exists(seg_file):
        items = utils.file2items(file)
        # segment every field of every item
        for i, item in enumerate(items):
            for j in range(len(item)):
                items[i][j] = ' '.join(tokenize(items[i][j]))
        utils.list2file(seg_file, items)
        print('Saved segmented data.', 'Number of items:', len(items), seg_file)
    else:
        items = utils.file2items(seg_file)

    exm_lst = []
    sess_lst = items
    for sess in sess_lst:
        for i in range(1, len(sess)):
            multi_src = '$$$'.join(sess[:i])
            tgt = sess[i]
            exm_lst.append([multi_src, tgt])
    return exm_lst
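
Both Douban helpers lean on utils.file2items and utils.list2file, which are not shown in these snippets. A plausible minimal version, assuming one item per line with fields joined by a delimiter:

def file2items(path, deli='\t'):
    # Read each line as one item: a list of delimiter-separated fields.
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n').split(deli) for line in f]


def list2file(path, items, deli='\t'):
    # Inverse of file2items: one item per line, fields joined by the delimiter.
    with open(path, 'w', encoding='utf-8') as f:
        for item in items:
            f.write(deli.join(item) + '\n')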
Example 5
def change2line(file):
    exm_lst = []
    items = utils.file2items(file, deli=' ')
    curr_sent = []
    curr_bmeo = []

    for item in items:
        if len(item) == 1:  # a blank line splits into [''] and marks the end of a sentence
            if curr_sent and curr_bmeo:
                exm_lst.append([' '.join(curr_sent), ' '.join(curr_bmeo)])
                curr_sent, curr_bmeo = [], []
            continue
        curr_sent.append(item[0])
        curr_bmeo.append(item[1])
    if curr_sent and curr_bmeo:
        exm_lst.append([' '.join(curr_sent), ' '.join(curr_bmeo)])
    return exm_lst
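
change2line expects CoNLL-style NER data: one "char label" row per line, with a blank line between sentences (which utils.file2items with deli=' ' turns into ['']). A toy run of the same grouping logic:

rows = [['我', 'O'], ['爱', 'O'], ['北', 'B-LOC'], ['京', 'E-LOC'], ['']]
sent, bmeo, pairs = [], [], []
for row in rows:
    if len(row) == 1:  # blank-line separator: flush the current sentence
        if sent:
            pairs.append([' '.join(sent), ' '.join(bmeo)])
            sent, bmeo = [], []
        continue
    sent.append(row[0])
    bmeo.append(row[1])
# pairs == [['我 爱 北 京', 'O O B-LOC E-LOC']]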
Example 6
def preprocess_raw_data(file, tokenize, token2id_dct, **kwargs):
    """
    # 处理自有数据函数模板
    # file文件数据格式: 句子(以空格分好)\t标签(以空格分好)
    # [filter] 过滤
    # [segment] 分词 ner一般仅分字,用空格隔开,不需分词步骤
    # [build vocab] 构造词典
    # [split] train-dev-test
    """
    items = utils.file2items(file)
    # Filter
    # filter here

    print('Number of items after filtering:', len(items))

    # Split into train/dev/test
    train_items, dev_items, test_items = utils.split_file(items, ratio='18:1:1', shuffle=True, seed=1234)

    # Build the vocabularies (optional)
    need_to_rebuild = []
    for token2id_name in token2id_dct:
        if not token2id_dct[token2id_name]:
            print(f'Vocabulary {token2id_name} could not be loaded; it will be built and saved')
            need_to_rebuild.append(token2id_name)

    if need_to_rebuild:
        print(f'Building missing vocabulary files...{need_to_rebuild}')
        for items in [train_items, dev_items]:  # count vocab over train and dev only
            for item in items:
                if 'char2id' in need_to_rebuild:
                    token2id_dct['char2id'].to_count(item[0].split(' '))
                if 'bmeo2id' in need_to_rebuild:
                    token2id_dct['bmeo2id'].to_count(item[1].split(' '))
        if 'char2id' in need_to_rebuild:
            token2id_dct['char2id'].rebuild_by_counter(restrict=['<pad>', '<unk>'], min_freq=1, max_vocab_size=5000)
            token2id_dct['char2id'].save(f'{curr_dir}/../data/s2l_char2id.dct')
        if 'bmeo2id' in need_to_rebuild:
            token2id_dct['bmeo2id'].rebuild_by_counter(restrict=['<pad>', '<unk>'])
            token2id_dct['bmeo2id'].save(f'{curr_dir}/../data/s2l_bmeo2id.dct')
    else:
        print('Using existing vocabulary files...')

    return train_items, dev_items, test_items
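
utils.split_file is also project-specific; from its call sites it takes a ratio string such as '18:1:1' plus shuffle/seed arguments and returns one list per ratio part. A minimal sketch under those assumptions:

import random


def split_file(items, ratio='18:1:1', shuffle=False, seed=None):
    items = list(items)
    if shuffle:
        random.Random(seed).shuffle(items)
    parts = [int(p) for p in ratio.split(':')]
    total = sum(parts)
    splits, start = [], 0
    for k, part in enumerate(parts):
        # The last split takes whatever remains so no item is lost to rounding.
        end = len(items) if k == len(parts) - 1 else start + len(items) * part // total
        splits.append(items[start:end])
        start = end
    return splits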
Example 7
def preprocess_raw_data(file, tokenize, token2id_dct, **kwargs):
    """
    # 处理自有数据函数模板
    # file文件数据格式: 多轮对话句子1\t多轮对话句子2\t...\t多轮对话句子n
    # [filter] 过滤
    # [segment] 分词
    # [build vocab] 构造词典
    # [split] train-dev-test
    """
    seg_file = file.rsplit('.', 1)[0] + '_seg.txt'
    if not os.path.exists(seg_file):
        sess_lst = utils.file2items(file)
        # Filter
        # filter here

        print('Number of sessions after filtering:', len(sess_lst))

        # Word segmentation
        for i, sess in enumerate(sess_lst):
            # sess_lst[i] = [' '.join(s) for s in sess]  # character-level
            sess_lst[i] = [' '.join(tokenize(s)) for s in sess]  # word-level
        utils.list2file(seg_file, sess_lst)
        print('Saved segmented data.', 'Number of sessions:', len(sess_lst), seg_file)
    else:
        # Load the already-segmented data
        sess_lst = utils.file2items(seg_file)

    # Convert to multi-turn format; turns in the context are joined with $$$
    items = []
    for sess in sess_lst:
        for i in range(1, len(sess)):
            multi_src = '$$$'.join(sess[:i])
            tgt = sess[i]
            items.append([multi_src, tgt])
    # items: [['w w w$$$w w', 'w w w'],...]

    # Split into train/dev (no test set)
    train_items, dev_items = utils.split_file(items,
                                              ratio='19:1',
                                              shuffle=True,
                                              seed=1234)

    # Build the vocabularies (optional): joint word- and char-level
    need_to_rebuild = []
    for token2id_name in token2id_dct:
        if not token2id_dct[token2id_name]:
            print(f'Vocabulary {token2id_name} could not be loaded; it will be built and saved')
            need_to_rebuild.append(token2id_name)

    if need_to_rebuild:
        print(f'Building missing vocabulary files...{need_to_rebuild}')
        for items in [train_items, dev_items]:  # count vocab over train and dev only
            for item in items:
                if 'word2id' in need_to_rebuild:
                    for sent in item[0].split('$$$'):
                        token2id_dct['word2id'].to_count(sent.split(' '))
                    token2id_dct['word2id'].to_count(item[1].split(' '))
                if 'char2id' in need_to_rebuild:
                    for sent in item[0].split('$$$'):
                        token2id_dct['char2id'].to_count(
                            list(sent.replace(' ', '')))
                    token2id_dct['char2id'].to_count(
                        list(item[1].replace(' ', '')))
        if 'word2id' in need_to_rebuild:
            token2id_dct['word2id'].rebuild_by_counter(
                restrict=['<pad>', '<unk>', '<eos>'],
                min_freq=1,
                max_vocab_size=30000)
            token2id_dct['word2id'].save(
                f'{curr_dir}/../data/mmch_word2id.dct')
        if 'char2id' in need_to_rebuild:
            token2id_dct['char2id'].rebuild_by_counter(
                restrict=['<pad>', '<unk>', '<eos>'],
                min_freq=1,
                max_vocab_size=4000)
            token2id_dct['char2id'].save(
                f'{curr_dir}/../data/mmch_char2id.dct')
    else:
        print('Using existing vocabulary files...')

    # Negative sampling
    train_items = train_helper.gen_pos_neg_sample(train_items,
                                                  sample_idx=1,
                                                  num_neg_exm=4)
    dev_items = train_helper.gen_pos_neg_sample(dev_items,
                                                sample_idx=1,
                                                num_neg_exm=4)
    return train_items, dev_items, None
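
train_helper.gen_pos_neg_sample is not shown either. Judging from the call (sample_idx=1 points at the response column, num_neg_exm negatives per positive), it plausibly pairs each context with randomly drawn responses and appends a match label; the sketch below is only one such guess, not the project's implementation.

import random


def gen_pos_neg_sample(items, sample_idx=1, num_neg_exm=4, seed=1234):
    rng = random.Random(seed)
    candidates = [item[sample_idx] for item in items]
    out = []
    for item in items:
        out.append(item + ['1'])                      # the original (positive) pair
        for _ in range(num_neg_exm):
            neg = item[:]
            neg[sample_idx] = rng.choice(candidates)  # randomly swapped-in response
            out.append(neg + ['0'])
    return out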