def __init__(self, start_ratio=0.0, end_ratio=0.8, sample_rate=1.0):
    # load data from files
    zh_data, en_data = wmt_news.zh_en()

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    data = list(zip(zh_data, en_data))
    random.shuffle(data)

    # split data according to the ratio (for train set, val set and test set)
    data = self.__split_data(data, start_ratio, end_ratio)

    self.__src_data, self.__tar_data = list(zip(*data))
def __init__(self, start_ratio=0.0, end_ratio=0.9, sample_rate=1.0):
    # load data from wmt_news
    zh_data, en_data = wmt_news.zh_en()

    # reproduce the process that NMT goes through to get its train set: shuffle the data
    random.seed(self.RANDOM_STATE)
    data = list(zip(zh_data, en_data))
    random.shuffle(data)

    # get the train set
    data = self.__split_data(data, 0.0, self.NMT_TRAIN_RATIO)

    # split the dataset
    data = self.__split_data(data, start_ratio, end_ratio)

    # sample data
    if start_ratio == 0. or sample_rate < 1.:
        data = self.sample_data(data, sample_rate)

    self.__src_data, self.__tar_data = list(zip(*data))
def __init__(self, start_ratio=0.0, end_ratio=0.8, sample_rate=1.0):
    # load data from files
    data = news_commentary.zh_en()
    data = self.__split_data(data, start_ratio, end_ratio)

    zh_data, en_data = wmt_news.zh_en()
    wmt_data = list(zip(zh_data, en_data))
    wmt_data = self.__split_data(wmt_data, start_ratio, end_ratio)

    # flatten the news_commentary portions and merge in the wmt_news pairs
    data = reduce(lambda x, y: x + y, data)
    data += wmt_data

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # sample data if the data size is too big (low-resource setting)
    data = self.sample_data(data, sample_rate)

    self.__src_data, self.__tar_data = list(zip(*data))
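# The three constructors above all rely on self.__split_data and self.sample_data,
# whose definitions are not shown in this excerpt. Below is a minimal sketch of what
# those helpers could look like, assuming ratio-based slicing and seeded random
# subsampling; the class name and the exact behavior are assumptions, not the
# repository's actual implementation.
import random


class _LoaderHelpersSketch:
    RANDOM_STATE = 42  # assumed seed constant

    def __split_data(self, data, start_ratio, end_ratio):
        # take the slice of the shuffled (zh, en) pairs between the two ratios,
        # e.g. 0.0-0.8 for a train split
        start = int(len(data) * start_ratio)
        end = int(len(data) * end_ratio)
        return data[start:end]

    def sample_data(self, data, sample_rate):
        # keep only a seeded random fraction of the pairs (low-resource setting)
        if sample_rate >= 1.0:
            return data
        random.seed(self.RANDOM_STATE)
        return random.sample(data, int(len(data) * sample_rate))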
def __load_from_wmt_news(self):
    zh_data, en_data = wmt_news.zh_en()
    wmt_news_data = list(zip(zh_data, en_data))
    return self.__split_data(wmt_news_data, 0.0, self.NMT_TRAIN_RATIO_WMT_NEWS)
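# A hedged usage example: loader classes like the ones above are typically
# instantiated with adjacent ratio ranges so that the train, validation and test
# portions never overlap. The 0.8/0.9 boundaries below are illustrative only,
# and Loader stands in for any of the loader classes shown above.
from pretrain.load.token_translation import Loader  # or any loader class above

train_loader = Loader(start_ratio=0.0, end_ratio=0.8)   # first 80% of the shuffled pairs
val_loader = Loader(start_ratio=0.8, end_ratio=0.9)     # next 10%
test_loader = Loader(start_ratio=0.9, end_ratio=1.0)    # final 10%
small_train = Loader(start_ratio=0.0, end_ratio=0.8, sample_rate=0.1)  # low-resource: 10% of the train pairs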
    {
        'name': 'join_list_token_2_string',
        'func': utils.join_list_token_2_string,
        'input_keys': ['input_1'],
        'output_keys': 'input_1',
        'show_dict': {'lan': 'input_1'},
    },
]

if __name__ == '__main__':
    from nmt.preprocess.corpus import wmt_news
    from nmt.preprocess.inputs.zh_en import seg_zh_by_jieba_pipeline, remove_space_pipeline

    zh_data, en_data = wmt_news.zh_en()

    params = {
        'src_vocab_size': 2 ** 13,
        'tar_vocab_size': 2 ** 13,
        'max_src_seq_len': 50,
        'max_tar_seq_len': 60,
    }

    print('\n------------------- Encoding -------------------------')
    zh_data, en_data, zh_tokenizer, en_tokenizer = utils.pipeline(
        preprocess_pipeline=seg_zh_by_jieba_pipeline + train_tokenizer_pipeline + encode_pipeline,
        lan_data_1=zh_data, lan_data_2=en_data, params=params)
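# The dict entry above ('name' / 'func' / 'input_keys' / 'output_keys' / 'show_dict')
# appears to be the step format consumed by utils.pipeline. A minimal sketch of how a
# runner could apply one such step under that contract is shown below; run_one_step is
# a hypothetical illustration, not the library's actual implementation.
def run_one_step(step, data):
    # gather the declared inputs from the shared data dict
    args = [data[key] for key in step['input_keys']]
    # call the step's function and write the result back under the declared output key
    data[step['output_keys']] = step['func'](*args)
    return data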
]

if __name__ == '__main__':
    from nmt.preprocess.corpus import wmt_news
    from lib.preprocess import utils
    from nmt.preprocess.inputs import noise_pl, tfds_share_pl, zh_en
    from pretrain.preprocess.inputs import pl
    from pretrain.preprocess.inputs.decode import decode_pl
    from pretrain.load.token_translation import Loader
    from pretrain.preprocess.inputs.sampling import sample_pl

    # token_loader = Loader(0.0, 1.0)
    # token_zh_data, token_en_data = token_loader.data()
    origin_zh_data, origin_en_data = wmt_news.zh_en()

    params = {
        'vocab_size': 40000,
        'max_src_seq_len': 60,
        'max_tar_seq_len': 60,
        'max_src_ground_seq_len': 12,
        'max_tar_ground_seq_len': 12,
    }

    # tokenizer_pl = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise + tfds_share_pl.train_tokenizer
    # tokenizer = utils.pipeline(tokenizer_pl,
    #                            token_zh_data + list(origin_zh_data[:1000]),
    #                            token_en_data + list(origin_en_data[:1000]), params)

    pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise + tfds_share_pl.train_tokenizer
    # pipeline = zh_en.seg_zh_by_jieba_pipeline + noise_pl.remove_noise
    pipeline += pl.sent_2_tokens + sample_pl(2.0) + combine_pl(