def __init__(self, _is_train, _dataset='cdlm'):
    # initialize variables
    self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

    # initialize wmt news loader
    start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

    # initialize news commentary loader
    start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio)

    # load the data
    zh_data, en_data = zh_en_wmt_loader.data()
    zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

    # combine data
    zh_data += zh_data_2
    en_data += en_data_2
    data = list(zip(zh_data, en_data))

    # shuffle the data (seeded, so train/val splits are reproducible across runs)
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    self.gen_data(data, self.BATCH_SIZE_PER_FILE)
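
# The start_ratio/end_ratio pattern above carves each corpus into contiguous
# train/validation slices. A minimal self-contained sketch of that split, under
# the assumption that the Loader classes slice by ratio like this (the actual
# Loader internals and PRETRAIN_TRAIN_RATIO value live elsewhere in the repo):
def _split_by_ratio(samples, start_ratio, end_ratio):
    # take the contiguous slice [start_ratio, end_ratio) of the corpus
    start = int(len(samples) * start_ratio)
    end = int(len(samples) * end_ratio)
    return samples[start:end]

# e.g. with a hypothetical PRETRAIN_TRAIN_RATIO of 0.98 and 100_000 pairs:
#   train      -> _split_by_ratio(pairs, 0.0, 0.98)   # first 98,000 pairs
#   validation -> _split_by_ratio(pairs, 0.98, 1.0)   # last 2,000 pairs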

def __init__(self, _is_train, _sample_rate=1.0, data_params=None, tokenizer_pl=None,
             encoder_pl=None, _tokenizer_dir='cdlm', _dataset='cdlm'):
    # initialize variables; None defaults avoid the shared-mutable-default pitfall
    # of the original data_params={}, tokenizer_pl=[], encoder_pl=[] signature
    self.__data_params = data_params if data_params is not None else {}
    self.__tokenizer_pl = tokenizer_pl if tokenizer_pl is not None else []
    self.__encoder_pl = encoder_pl if encoder_pl is not None else []
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(
        create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # initialize wmt news loader
    start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

    # initialize news commentary loader
    start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_news_commentary_loader = zh_en_news_commentary.Loader(
        start_ratio, end_ratio, 0.2)

    # load the data
    zh_data, en_data = zh_en_wmt_loader.data()
    zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

    # combine data
    zh_data += zh_data_2
    en_data += en_data_2
    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # reuse a cached tokenizer if one was already fitted; otherwise fit one now
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()

    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
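
# The tokenizer branch above is a build-once, cache-to-disk pattern. A minimal
# standalone sketch of the same idea, assuming load_pkl/save are thin pickle
# wrappers (this repo's actual helpers may differ):
import os
import pickle

def load_or_build(cache_path, build_fn):
    # reuse the cached artifact when present...
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    # ...otherwise build it once and persist it for later runs
    artifact = build_fn()
    with open(cache_path, 'wb') as f:
        pickle.dump(artifact, f)
    return artifact

# usage sketch: tokenizer = load_or_build('tokenizer.pkl', fit_tokenizer_fn)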

def __init__(self, _is_train, _dataset='cdlm'):
    # initialize variables
    self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

    # initialize wmt news loader
    start_ratio = 0.0 if _is_train else zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_wmt_news.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_wmt_loader = zh_en_wmt_news.Loader(start_ratio, end_ratio)

    # initialize news commentary loader
    start_ratio = 0.0 if _is_train else zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO
    end_ratio = zh_en_news_commentary.Loader.PRETRAIN_TRAIN_RATIO if _is_train else 1.0
    zh_en_news_commentary_loader = zh_en_news_commentary.Loader(start_ratio, end_ratio)

    # load the data
    zh_data, en_data = zh_en_wmt_loader.data()
    zh_data_2, en_data_2 = zh_en_news_commentary_loader.data()

    # um corpus data is only for training
    if _is_train:
        zh_data_3, en_data_3 = um_corpus.zh_en(get_test=False)

        # combine um corpus data
        zh_data += tuple(zh_data_3)
        en_data += tuple(en_data_3)

    # combine news commentary data
    zh_data += zh_data_2
    en_data += en_data_2

    # word segmentation for zh_data
    zh_data = utils.pipeline(seg_zh_by_jieba_pipeline, zh_data)

    data = list(zip(zh_data, en_data))

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    self.gen_data(data, self.BATCH_SIZE_PER_FILE)
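
# The jieba step above turns raw Chinese strings into space-delimited tokens
# before pairing them with the English side. A minimal sketch of what such a
# pipeline stage typically does; the repo's real seg_zh_by_jieba_pipeline may
# chain extra steps:
import jieba

def seg_zh(sentences):
    # jieba.cut yields tokens; join with spaces so downstream tokenizers can
    # split Chinese on whitespace just like English
    return [' '.join(jieba.cut(s)) for s in sentences]

# e.g. seg_zh(['今天天气很好']) -> something like ['今天 天气 很 好']
# (the exact segmentation depends on jieba's dictionary)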