def __init__(self, use_cache=True):
    # read data from cache;
    # if no cache, then load the data and preprocess it, then store it to cache
    cache_name = f'{self.TRAIN_NAME}_nmt_preprocessed_data_{md5(self.M.data_params)}.pkl'
    data = read_cache(cache_name) if use_cache else None
    if data is not None:
        self.__train_src, \
        self.__train_tar, \
        self.__train_src_encode, \
        self.__train_tar_encode, \
        self.__val_src, \
        self.__val_tar, \
        self.__val_src_encode, \
        self.__val_tar_encode, \
        self.__test_src, \
        self.__test_tar, \
        self.__test_src_encode, \
        self.__test_tar_encode, \
        self.__src_tokenizer, \
        self.__tar_tokenizer, \
        self.__src_vocab_size, \
        self.__tar_vocab_size = data

    else:
        self.__load_data()
        self.__preprocess()

        cache(cache_name, [
            self.__train_src,
            self.__train_tar,
            self.__train_src_encode,
            self.__train_tar_encode,
            self.__val_src,
            self.__val_tar,
            self.__val_src_encode,
            self.__val_tar_encode,
            self.__test_src,
            self.__test_tar,
            self.__test_src_encode,
            self.__test_tar_encode,
            self.__src_tokenizer,
            self.__tar_tokenizer,
            self.__src_vocab_size,
            self.__tar_vocab_size,
        ])

    print('src_vocab_size: {}\ntar_vocab_size: {}'.format(
        self.__src_vocab_size, self.__tar_vocab_size))
    print('train_size: {}\ntest_size: {}'.format(len(self.__train_src), len(self.__test_src)))
    print('train_x.shape: {}\ntrain_y.shape: {}'.format(
        self.__train_src_encode.shape, self.__train_tar_encode.shape))
    print('val_x.shape: {}\nval_y.shape: {}'.format(
        self.__val_src_encode.shape, self.__val_tar_encode.shape))
    print('test_x.shape: {}\ntest_y.shape: {}'.format(
        self.__test_src_encode.shape, self.__test_tar_encode.shape))
def __init__(self, use_cache=True):
    # read data from cache;
    # if no cache, then load the data and preprocess it, then store it to cache
    cache_name = f'pre{self.TRAIN_NAME}_preprocessed_data_{md5(self.M.data_params)}.pkl'
    data = read_cache(cache_name) if use_cache else None
    if data is not None:
        self.train_x, \
        self.train_y, \
        self.train_lan_x, \
        self.train_lan_y, \
        self.train_pos_y, \
        self.test_x, \
        self.test_y, \
        self.test_lan_x, \
        self.test_lan_y, \
        self.test_pos_y, \
        self.tokenizer, \
        self.vocab_size = data

    else:
        self.load_data()
        self.preprocess_tokenizer()
        self.preprocess()

        cache(cache_name, [
            self.train_x,
            self.train_y,
            self.train_lan_x,
            self.train_lan_y,
            self.train_pos_y,
            self.test_x,
            self.test_y,
            self.test_lan_x,
            self.test_lan_y,
            self.test_pos_y,
            self.tokenizer,
            self.vocab_size,
        ])

    print(f'vocab_size: {self.vocab_size}\n')
    print(f'train_x.shape: {self.train_x.shape}\ntrain_y.shape: {self.train_y.shape}')
    print(f'train_lan_x.shape: {self.train_lan_x.shape}\ntrain_lan_y.shape: {self.train_lan_y.shape}')
    print(f'train_pos_y.shape: {self.train_pos_y.shape}')
    print(f'test_x.shape: {self.test_x.shape}\ntest_y.shape: {self.test_y.shape}')
    print(f'test_lan_x.shape: {self.test_lan_x.shape}\ntest_lan_y.shape: {self.test_lan_y.shape}')
    print(f'test_pos_y.shape: {self.test_pos_y.shape}')
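# Both constructors above lean on the md5 / read_cache / cache helpers imported
# from lib.utils. Their real implementations are not shown in this section; the
# call sites are consistent with a pickle round-trip keyed by an md5 hash of the
# data params. The following is only a minimal sketch under that assumption,
# not the repo's actual code (paths, error handling, etc. may differ).
import hashlib
import os
import pickle


def md5(params):
    """Hash a params dict into a short, stable cache-key suffix."""
    return hashlib.md5(str(sorted(params.items())).encode('utf-8')).hexdigest()


def cache(cache_name, data, cache_dir='runtime/cache'):
    """Pickle `data` to disk under `cache_dir`."""
    os.makedirs(cache_dir, exist_ok=True)
    with open(os.path.join(cache_dir, cache_name), 'wb') as f:
        pickle.dump(data, f)


def read_cache(cache_name, cache_dir='runtime/cache'):
    """Return the cached object, or None if no cache file exists yet."""
    path = os.path.join(cache_dir, cache_name)
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)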
def __preprocess(self):
    """ Preprocess the data into lists of token indices. """
    print('\nProcessing data ... ')

    if self.tokenizer_dir:
        self.__src_tokenizer = load_pkl(
            get_file_path(data_dir, 'tokenizer', self.tokenizer_dir, 'tokenizer.pkl'))
        self.__tar_tokenizer = self.__src_tokenizer

    elif self.M.checkpoint_params['load_model']:
        load_model_params = self.M.checkpoint_params['load_model']
        tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                            load_model_params[0], load_model_params[1],
                                            'tokenizer.pkl')
        self.__src_tokenizer = self.__tar_tokenizer = read_cache(tokenizer_path)

    else:
        self.__src_tokenizer = utils.pipeline(
            self.M.tokenizer_pl,
            self.__tokenizer_data_src,
            self.__tokenizer_data_tar,
            self.M.data_params,
        )
        self.__tar_tokenizer = self.__src_tokenizer
        del self.__tokenizer_data_src
        del self.__tokenizer_data_tar

    params = {
        **self.M.data_params,
        'tokenizer': self.__src_tokenizer,
        'src_tokenizer': self.__src_tokenizer,
        'tar_tokenizer': self.__tar_tokenizer,
    }

    self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__train_src, self.__train_tar, params)
    self.__val_src_encode, self.__val_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__val_src, self.__val_tar, params)
    self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
        self.M.encode_pipeline, self.__test_src, self.__test_tar, params)

    # get vocabulary sizes
    self.__src_vocab_size = self.__src_tokenizer.vocab_size
    self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

    print('\nFinish preprocessing ')
def __preprocess(self):
    """ Preprocess the data into lists of token indices. """
    print('\nProcessing data ... ')

    load_model_params = Model.checkpoint_params['load_model']
    if load_model_params:
        # reuse the tokenizer that was saved with the loaded checkpoint
        tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                            load_model_params[0], load_model_params[1],
                                            'tokenizer.pkl')
        self.__src_tokenizer = self.__tar_tokenizer = read_cache(tokenizer_path)

        self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
            Model.encode_pipeline, self.__train_src, self.__train_tar, {
                **Model.data_params,
                'tokenizer': self.__src_tokenizer,
                'src_tokenizer': self.__src_tokenizer,
                'tar_tokenizer': self.__tar_tokenizer,
            })

    else:
        # fit new tokenizers while encoding the training data
        self.__train_src_encode, self.__train_tar_encode, self.__src_tokenizer, self.__tar_tokenizer = utils.pipeline(
            Model.preprocess_pipeline,
            self.__train_src,
            self.__train_tar,
            Model.data_params,
        )

    params = {
        **Model.data_params,
        'tokenizer': self.__src_tokenizer,
        'src_tokenizer': self.__src_tokenizer,
        'tar_tokenizer': self.__tar_tokenizer,
    }
    self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
        Model.encode_pipeline, self.__test_src, self.__test_tar, params)

    # get vocabulary sizes
    self.__src_vocab_size = self.__src_tokenizer.vocab_size
    self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

    print('\nFinish preprocessing ')
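# The encode / preprocess calls above all go through utils.pipeline, whose real
# implementation is not shown in this section. The call sites are consistent
# with a simple reducer along the following lines; this is only a sketch under
# that assumption (the actual signature and state layout in lib.utils may differ).
def pipeline(stages, src_data, tar_data, params):
    # feed an initial (src, tar, params) state through the ordered stages;
    # the return value is whatever the last stage yields, which is why callers
    # unpack a single tokenizer in one place and a
    # (src_encode, tar_encode, src_tokenizer, tar_tokenizer) tuple in another
    state = (src_data, tar_data, dict(params))
    for stage in stages:
        state = stage(state)
    return state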
def preprocess_tokenizer(self):
    print('\nProcessing tokenizer ... ')

    # get tokenizer
    load_model_params = self.M.checkpoint_params['load_model']
    if not load_model_params:
        self.tokenizer = utils.pipeline(
            self.M.tokenizer_pl,
            self.train_tokenizer_src,
            self.train_tokenizer_tar,
            self.M.data_params,
        )
        del self.train_tokenizer_src
        del self.train_tokenizer_tar

    # load tokenizer from cache
    else:
        tokenizer_path = get_relative_file_path('runtime', 'tokenizer',
                                                load_model_params[0], load_model_params[1],
                                                'tokenizer.pkl')
        self.tokenizer = read_cache(tokenizer_path)
def get(url, file_name):
    data_path = os.path.join(data_dir, file_name)
    cache_name = os.path.splitext(data_path)[0] + '.pkl'
    if os.path.exists(cache_name):
        return read_cache(cache_name)

    # download and unzip data
    utils.download(url, data_path)
    with gzip.open(data_path, 'rb') as f:
        _data = f.read().decode('utf-8')

    # normalize characters, then split the corpus into blocks of tab-separated lines
    _data = utils.full_2_half(utils.unicode_to_ascii(_data))
    _data = _data.replace('\r', '').strip().split('\n\t\n')

    # turn each tab-separated line into a [second-column, first-column] sentence pair
    _data = list(
        map(
            lambda x: list(
                map(
                    lambda line: [line.split('\t')[1].strip(), line.split('\t')[0].strip()],
                    x.split('\n'))),
            _data))

    cache(cache_name, _data)
    return _data
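# Hypothetical usage of get(); the URL and file name below are placeholders,
# not a real dataset location, and `data_dir` must point at a writable directory.
if __name__ == '__main__':
    pairs = get('https://example.com/corpora/ro-en.txt.gz', 'ro_en.txt.gz')
    # each element of `pairs` is one block of [second-column, first-column]
    # sentence pairs taken from that block's tab-separated lines
    print(f'{len(pairs)} blocks, first pair: {pairs[0][0]}')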
import os
from pretrain.preprocess.config import dictionary_dir, merged_en_ro_dict_path, merged_ro_en_dict_path
from nmt.load import ro_en
from pretrain.preprocess.dictionary import map_dict_ro_en as map_dict
from lib.utils import load_json, write_json, cache, read_cache
from lib.preprocess.utils import stem, zh_word_seg_by_jieba
from pretrain.preprocess.dictionary.preprocess_string import filter_duplicate

cache_dict_path = os.path.join(dictionary_dir, 'cache_dict_ro_en.pkl')

if os.path.exists(cache_dict_path):
    ro_word_dict, en_word_dict = read_cache(cache_dict_path)
else:
    ro_word_dict = {}
    en_word_dict = {}


def __add_to_dict(lan_data, is_ro):
    length = len(lan_data)
    for i, sentence in enumerate(lan_data):
        if i % 20 == 0:
            progress = float(i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        # strip trailing punctuation and split the sentence into tokens
        words = sentence.strip('.').strip('?').strip('!').strip(';').strip().split(' ')
        list_of_token = list(map(lambda x: x.strip(), words))

        # collect n-grams so multi-word dictionary entries can be matched
        list_of_2_gram = map_dict.n_grams(list_of_token, 2)
        list_of_3_gram = map_dict.n_grams(list_of_token, 3)
        list_of_4_gram = map_dict.n_grams(list_of_token, 4)