Example #1
    def __init__(self, start_ratio=0.0, end_ratio=0.8, sample_rate=1.0):
        # load data from files
        zh_data, en_data = wmt_news.zh_en()

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        data = list(zip(zh_data, en_data))
        random.shuffle(data)

        # split data according to the ratio (for train set, val set and test set)
        data = self.__split_data(data, start_ratio, end_ratio)

        self.__src_data, self.__tar_data = list(zip(*data))
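The __split_data helper is not shown in these snippets. Based on how it is called with fractional start/end ratios over a shuffled list, a minimal sketch could look like the following; the actual implementation in the repository may round or handle edge cases differently.

    # Hedged sketch of __split_data, inferred only from its call sites above.
    def __split_data(self, data, start_ratio, end_ratio):
        # turn the fractional boundaries into absolute indices
        start_index = int(len(data) * start_ratio)
        end_index = int(len(data) * end_ratio)
        # keep only the slice covered by [start_ratio, end_ratio)
        return data[start_index: end_index]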
Example #2
    def __init__(self, start_ratio=0.0, end_ratio=0.9, sample_rate=1.0):
        # load data from wmt_news
        zh_data, en_data = wmt_news.zh_en()

        # reproduce the shuffling that NMT performs before building its train set
        random.seed(self.RANDOM_STATE)
        data = list(zip(zh_data, en_data))
        random.shuffle(data)

        # get the train set
        data = self.__split_data(data, 0.0, self.NMT_TRAIN_RATIO)

        # split dataset
        data = self.__split_data(data, start_ratio, end_ratio)

        if start_ratio == 0. or sample_rate < 1.:
            # sample data
            data = self.sample_data(data, sample_rate)

        self.__src_data, self.__tar_data = list(zip(*data))
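sample_data is likewise only referenced here. Assuming it keeps a random fraction of the sentence pairs controlled by sample_rate, a sketch might be the following; the repository's version may seed or sample differently.

    # Hedged sketch of sample_data; treat the exact sampling strategy as an assumption.
    def sample_data(self, data, sample_rate=1.0):
        if sample_rate >= 1.0:
            return data
        # keep a random subset whose size is proportional to sample_rate
        sample_size = int(len(data) * sample_rate)
        random.seed(self.RANDOM_STATE)
        return random.sample(data, sample_size)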
Example #3
    def __init__(self, start_ratio=0.0, end_ratio=0.8, sample_rate=1.0):
        # load data from files
        data = news_commentary.zh_en()
        data = self.__split_data(data, start_ratio, end_ratio)

        zh_data, en_data = wmt_news.zh_en()
        wmt_data = list(zip(zh_data, en_data))
        wmt_data = self.__split_data(wmt_data, start_ratio, end_ratio)

        # flatten the nested news_commentary splits into a single list of pairs
        data = reduce(lambda x, y: x + y, data)
        data += wmt_data

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # sample the data when the corpus is too large (low-resource setting)
        data = self.sample_data(data, sample_rate)

        self.__src_data, self.__tar_data = list(zip(*data))
Example #4

    def __load_from_wmt_news(self):
        zh_data, en_data = wmt_news.zh_en()
        wmt_news_data = list(zip(zh_data, en_data))
        return self.__split_data(wmt_news_data, 0.0,
                                 self.NMT_TRAIN_RATIO_WMT_NEWS)
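None of the loader snippets show how the stored pairs are read back out, but Example #6 calls Loader(...).data(). Assuming that accessor simply returns the two parallel lists, it could be as small as the sketch below; the exact return types are an assumption.

    # Assumed accessor exposing the parallel source/target lists built in __init__.
    def data(self):
        return list(self.__src_data), list(self.__tar_data)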
Example #5
    {
        'name': 'join_list_token_2_string',
        'func': utils.join_list_token_2_string,
        'input_keys': ['input_1'],
        'output_keys': 'input_1',
        'show_dict': {
            'lan': 'input_1'
        },
    },
]

if __name__ == '__main__':
    from nmt.preprocess.corpus import wmt_news
    from nmt.preprocess.inputs.zh_en import seg_zh_by_jieba_pipeline, remove_space_pipeline

    zh_data, en_data = wmt_news.zh_en()
    params = {
        'src_vocab_size': 2**13,
        'tar_vocab_size': 2**13,
        'max_src_seq_len': 50,
        'max_tar_seq_len': 60,
    }

    print('\n------------------- Encoding -------------------------')
    zh_data, en_data, zh_tokenizer, en_tokenizer = utils.pipeline(
        preprocess_pipeline=seg_zh_by_jieba_pipeline +
        train_tokenizer_pipeline + encode_pipeline,
        lan_data_1=zh_data,
        lan_data_2=en_data,
        params=params)
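The step dict shown at the top of this example ('func', 'input_keys', 'output_keys', 'show_dict') suggests that utils.pipeline walks a list of such steps and threads a shared state dict through them. The loop below is only an illustrative guess at that mechanism, not the library's actual code: the name run_pipeline, the state keys, and the omission of 'show_dict' handling (which presumably only controls logging) are all assumptions.

# Illustrative sketch of executing a list of step dicts like the one above.
def run_pipeline(steps, lan_data_1, lan_data_2, params=None):
    # shared state: the two raw inputs plus any hyper-parameters
    state = {'input_1': lan_data_1, 'input_2': lan_data_2}
    state.update(params or {})
    for step in steps:
        # gather the declared inputs and apply the step function
        args = [state[key] for key in step['input_keys']]
        result = step['func'](*args)
        # write the result back under the declared output key(s)
        output_keys = step['output_keys']
        if isinstance(output_keys, str):
            state[output_keys] = result
        else:
            for key, value in zip(output_keys, result):
                state[key] = value
    return state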
Example #6
    ]


if __name__ == '__main__':
    from nmt.preprocess.corpus import wmt_news
    from lib.preprocess import utils
    from nmt.preprocess.inputs import noise_pl, tfds_share_pl, zh_en
    from pretrain.preprocess.inputs import pl
    from pretrain.preprocess.inputs.decode import decode_pl
    from pretrain.load.token_translation import Loader
    from pretrain.preprocess.inputs.sampling import sample_pl

    # token_loader = Loader(0.0, 1.0)
    # token_zh_data, token_en_data = token_loader.data()

    origin_zh_data, origin_en_data = wmt_news.zh_en()
    params = {
        'vocab_size': 40000,
        'max_src_seq_len': 60,
        'max_tar_seq_len': 60,
        'max_src_ground_seq_len': 12,
        'max_tar_ground_seq_len': 12,
    }

    # tokenizer_pl = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise + tfds_share_pl.train_tokenizer
    # tokenizer = utils.pipeline(tokenizer_pl,
    #                            token_zh_data + list(origin_zh_data[:1000]), token_en_data + list(origin_en_data[:1000]), params)

    pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise + tfds_share_pl.train_tokenizer
    # pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise
    pipeline += pl.sent_2_tokens + sample_pl(2.0) + combine_pl(