Example #1
    def __init__(self, use_cache=True):
        # read the data from cache;
        # if no cache exists, load and preprocess the data, then store it to the cache
        cache_name = f'{self.TRAIN_NAME}_nmt_preprocessed_data_{md5(self.M.data_params)}.pkl'
        data = read_cache(cache_name) if use_cache else None
        if data is not None:
            (self.__train_src,
             self.__train_tar,
             self.__train_src_encode,
             self.__train_tar_encode,
             self.__val_src,
             self.__val_tar,
             self.__val_src_encode,
             self.__val_tar_encode,
             self.__test_src,
             self.__test_tar,
             self.__test_src_encode,
             self.__test_tar_encode,
             self.__src_tokenizer,
             self.__tar_tokenizer,
             self.__src_vocab_size,
             self.__tar_vocab_size) = data

        else:
            self.__load_data()
            self.__preprocess()

            cache(cache_name, [
                self.__train_src,
                self.__train_tar,
                self.__train_src_encode,
                self.__train_tar_encode,
                self.__val_src,
                self.__val_tar,
                self.__val_src_encode,
                self.__val_tar_encode,
                self.__test_src,
                self.__test_tar,
                self.__test_src_encode,
                self.__test_tar_encode,
                self.__src_tokenizer,
                self.__tar_tokenizer,
                self.__src_vocab_size,
                self.__tar_vocab_size,
            ])

        print('src_vocab_size: {}\ntar_vocab_size: {}'.format(
            self.__src_vocab_size, self.__tar_vocab_size))
        print('train_size: {}\ntest_size: {}'.format(len(self.__train_src),
                                                     len(self.__test_src)))
        print('train_x.shape: {}\ntrain_y.shape: {}'.format(
            self.__train_src_encode.shape, self.__train_tar_encode.shape))
        print('val_x.shape: {}\nval_y.shape: {}'.format(
            self.__val_src_encode.shape, self.__val_tar_encode.shape))
        print('test_x.shape: {}\ntest_y.shape: {}'.format(
            self.__test_src_encode.shape, self.__test_tar_encode.shape))
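
The constructor above relies on read_cache and cache helpers that are not shown in these examples. A minimal sketch of what they might look like, assuming they are thin pickle wrappers that return None on a cache miss; the helper names match the example, but the cache directory and exact behavior are assumptions, not the project's actual implementation:

import os
import pickle

CACHE_DIR = 'runtime/cache'  # assumed location; the real project may differ


def read_cache(name):
    """Return the unpickled object stored under name, or None on a cache miss."""
    path = os.path.join(CACHE_DIR, name)
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)


def cache(name, data):
    """Pickle data under name, creating the cache directory if needed."""
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(os.path.join(CACHE_DIR, name), 'wb') as f:
        pickle.dump(data, f)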
Example #2
    def __init__(self, use_cache=True):
        # read the data from cache;
        # if no cache exists, load and preprocess the data, then store it to the cache
        cache_name = f'pre{self.TRAIN_NAME}_preprocessed_data_{md5(self.M.data_params)}.pkl'
        data = read_cache(cache_name) if use_cache else None
        if data is not None:
            (self.train_x,
             self.train_y,
             self.train_lan_x,
             self.train_lan_y,
             self.train_pos_y,
             self.test_x,
             self.test_y,
             self.test_lan_x,
             self.test_lan_y,
             self.test_pos_y,
             self.tokenizer,
             self.vocab_size) = data

        else:
            self.load_data()
            self.preprocess_tokenizer()
            self.preprocess()

            cache(cache_name, [
                self.train_x,
                self.train_y,
                self.train_lan_x,
                self.train_lan_y,
                self.train_pos_y,
                self.test_x,
                self.test_y,
                self.test_lan_x,
                self.test_lan_y,
                self.test_pos_y,
                self.tokenizer,
                self.vocab_size,
            ])

        print(f'vocab_size: {self.vocab_size}\n')
        print(
            f'train_x.shape: {self.train_x.shape}\ntrain_y.shape: {self.train_y.shape}'
        )
        print(
            f'train_lan_x.shape: {self.train_lan_x.shape}\ntrain_lan_y.shape: {self.train_lan_y.shape}'
        )
        print(f'train_pos_y.shape: {self.train_pos_y.shape}')
        print(
            f'test_x.shape: {self.test_x.shape}\ntest_y.shape: {self.test_y.shape}'
        )
        print(
            f'test_lan_x.shape: {self.test_lan_x.shape}\ntest_lan_y.shape: {self.test_lan_y.shape}'
        )
        print(f'test_pos_y.shape: {self.test_pos_y.shape}')
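
Both constructors derive the cache file name by hashing data_params with an md5 helper that is not shown. A minimal sketch, assuming data_params is a JSON-serializable dict and a stable key order is wanted so identical params always map to the same file:

import hashlib
import json


def md5(params):
    """Return a deterministic md5 hex digest for a JSON-serializable dict."""
    # sort_keys makes the digest independent of dict insertion order
    serialized = json.dumps(params, sort_keys=True)
    return hashlib.md5(serialized.encode('utf-8')).hexdigest()

Under this scheme, changing any preprocessing parameter changes the file name, so stale cache entries are never read back by accident.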
Example #3
    def __preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        if self.tokenizer_dir:
            self.__src_tokenizer = load_pkl(
                get_file_path(data_dir, 'tokenizer', self.tokenizer_dir,
                              'tokenizer.pkl'))
            self.__tar_tokenizer = self.__src_tokenizer

        elif self.M.checkpoint_params['load_model']:
            load_model_params = self.M.checkpoint_params['load_model']

            tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                                load_model_params[0],
                                                load_model_params[1],
                                                'tokenizer.pkl')
            self.__src_tokenizer = self.__tar_tokenizer = read_cache(
                tokenizer_path)

        else:
            self.__src_tokenizer = utils.pipeline(
                self.M.tokenizer_pl,
                self.__tokenizer_data_src,
                self.__tokenizer_data_tar,
                self.M.data_params,
            )
            self.__tar_tokenizer = self.__src_tokenizer
            del self.__tokenizer_data_src
            del self.__tokenizer_data_tar

        params = {
            **self.M.data_params,
            'tokenizer': self.__src_tokenizer,
            'src_tokenizer': self.__src_tokenizer,
            'tar_tokenizer': self.__tar_tokenizer,
        }

        self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__train_src, self.__train_tar, params)

        self.__val_src_encode, self.__val_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__val_src, self.__val_tar, params)

        self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
            self.M.encode_pipeline, self.__test_src, self.__test_tar, params)

        # get vocabulary size
        self.__src_vocab_size = self.__src_tokenizer.vocab_size
        self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

        print('\nFinished preprocessing')
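
utils.pipeline is used throughout these examples but never defined in them. A minimal sketch of one plausible reading, assuming each pipeline entry is a callable that takes the running positional values plus the params dict, and that a terminal step may return a single object (a trained tokenizer) or a tuple (the encoded data the callers unpack); the real signature in the project may differ:

def pipeline(steps, src, tar, params):
    """Thread (src, tar) through each step; return whatever the last step yields."""
    data = (src, tar)
    for step in steps:
        # tuples are splatted back into the next step; single values are passed as-is
        data = step(*data, params) if isinstance(data, tuple) else step(data, params)
    return data

Under this reading, tokenizer_pl would end with a step returning the trained tokenizer, while encode_pipeline ends with a step returning the (src_encode, tar_encode, src_tokenizer, tar_tokenizer) tuple seen above.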
Example #4
    def __preprocess(self):
        """ preprocess the data to list of list token idx """
        print('\nProcessing data ... ')

        load_model_params = Model.checkpoint_params['load_model']
        if load_model_params:
            tokenizer_path = create_dir_in_root('runtime', 'tokenizer',
                                                load_model_params[0],
                                                load_model_params[1],
                                                'tokenizer.pkl')
            self.__src_tokenizer = self.__tar_tokenizer = read_cache(
                tokenizer_path)

            self.__train_src_encode, self.__train_tar_encode, _, _ = utils.pipeline(
                Model.encode_pipeline, self.__train_src, self.__train_tar, {
                    **Model.data_params,
                    'tokenizer': self.__src_tokenizer,
                    'src_tokenizer': self.__src_tokenizer,
                    'tar_tokenizer': self.__tar_tokenizer,
                })

        else:
            self.__train_src_encode, self.__train_tar_encode, self.__src_tokenizer, self.__tar_tokenizer = utils.pipeline(
                Model.preprocess_pipeline,
                self.__train_src,
                self.__train_tar,
                Model.data_params,
            )

        params = {
            **Model.data_params,
            'tokenizer': self.__src_tokenizer,
            'src_tokenizer': self.__src_tokenizer,
            'tar_tokenizer': self.__tar_tokenizer,
        }

        self.__test_src_encode, self.__test_tar_encode, _, _ = utils.pipeline(
            Model.encode_pipeline, self.__test_src, self.__test_tar, params)

        # get vocabulary size
        self.__src_vocab_size = self.__src_tokenizer.vocab_size
        self.__tar_vocab_size = self.__tar_tokenizer.vocab_size

        print('\nFinished preprocessing')
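
Examples #3 and #4 resolve the cached tokenizer path via create_dir_in_root. A minimal sketch, assuming the helper joins path segments under a project root, creates the intermediate directories, and returns the full path; the root location is an assumption:

import os

ROOT_DIR = os.path.abspath('.')  # assumed project root


def create_dir_in_root(*segments):
    """Join segments under the project root, create the directory part if
    missing, and return the full path (the last segment may be a file name)."""
    path = os.path.join(ROOT_DIR, *segments)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    return path

Here load_model_params[0] and load_model_params[1] appear to identify a previously trained model (e.g. a name and a run id), but their exact meaning is not shown in these examples.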
Example #5
    def preprocess_tokenizer(self):
        print('\nProcessing tokenizer ... ')

        # get tokenizer
        load_model_params = self.M.checkpoint_params['load_model']
        if not load_model_params:
            self.tokenizer = utils.pipeline(
                self.M.tokenizer_pl,
                self.train_tokenizer_src,
                self.train_tokenizer_tar,
                self.M.data_params,
            )
            del self.train_tokenizer_src
            del self.train_tokenizer_tar

        # load tokenizer from cache
        else:
            tokenizer_path = get_relative_file_path('runtime', 'tokenizer',
                                                    load_model_params[0],
                                                    load_model_params[1],
                                                    'tokenizer.pkl')
            self.tokenizer = read_cache(tokenizer_path)
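
Example #5 only reads the tokenizer from the checkpoint directory; the writing side is not shown. For the load branch to work, a training run would have to persist the tokenizer to the same path. A hypothetical counterpart, reusing the get_relative_file_path helper from the example, with placeholder arguments standing in for load_model_params[0] and load_model_params[1]:

import pickle


def save_tokenizer(tokenizer, model_name, run_id):
    """Hypothetical counterpart: persist a freshly trained tokenizer where a
    later run with checkpoint_params['load_model'] set will find it."""
    tokenizer_path = get_relative_file_path('runtime', 'tokenizer',
                                            model_name, run_id,
                                            'tokenizer.pkl')
    with open(tokenizer_path, 'wb') as f:
        pickle.dump(tokenizer, f)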
Example #6
def get(url, file_name):
    data_path = os.path.join(data_dir, file_name)
    cache_name = os.path.splitext(data_path)[0] + '.pkl'
    if os.path.exists(cache_name):
        return read_cache(cache_name)

    # download and unzip data
    utils.download(url, data_path)
    with gzip.open(data_path, 'rb') as f:
        _data = f.read().decode('utf-8')

    _data = utils.full_2_half(utils.unicode_to_ascii(_data))
    _data = _data.replace('\r', '').strip().split('\n\t\n')
    # each block holds one group of lines; every line has two tab-separated
    # columns, which are swapped so each pair becomes [column 2, column 1]
    _data = [
        [[line.split('\t')[1].strip(), line.split('\t')[0].strip()]
         for line in block.split('\n')]
        for block in _data
    ]

    cache(cache_name, _data)
    return _data
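
A usage sketch for the loader above. The URL and file name are placeholders, not the project's real corpus; the parsed result is a list of blocks, each block a list of swapped [column 2, column 1] sentence pairs:

# hypothetical call; the real corpus URL and file name are not shown here
data = get('https://example.com/corpus.txt.gz', 'corpus.txt.gz')

first_block = data[0]        # one '\n\t\n'-separated block of the corpus
first_pair = first_block[0]  # [second column, first column] of one line
print(len(data), first_pair)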
Example #7
import os
from pretrain.preprocess.config import dictionary_dir, merged_en_ro_dict_path, merged_ro_en_dict_path
from nmt.load import ro_en
from pretrain.preprocess.dictionary import map_dict_ro_en as map_dict
from lib.utils import load_json, write_json, cache, read_cache
from lib.preprocess.utils import stem, zh_word_seg_by_jieba
from pretrain.preprocess.dictionary.preprocess_string import filter_duplicate

cache_dict_path = os.path.join(dictionary_dir, 'cache_dict_ro_en.pkl')
if os.path.exists(cache_dict_path):
    ro_word_dict, en_word_dict = read_cache(cache_dict_path)

else:
    ro_word_dict = {}
    en_word_dict = {}


def __add_to_dict(lan_data, is_ro):
    length = len(lan_data)
    for i, sentence in enumerate(lan_data):
        if i % 20 == 0:
            progress = float(i + 1) / length * 100.
            print('\rprogress: %.2f%% ' % progress, end='')

        # one strip() call with the full character set removes any mix of
        # surrounding punctuation; the original chained strips could miss
        # combinations such as a trailing '?!'
        words = sentence.strip('.?!; ').split(' ')

        list_of_token = list(map(lambda x: x.strip(), words))
        list_of_2_gram = map_dict.n_grams(list_of_token, 2)
        list_of_3_gram = map_dict.n_grams(list_of_token, 3)
        list_of_4_gram = map_dict.n_grams(list_of_token, 4)
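
map_dict.n_grams is used above without being shown. A minimal sketch, assuming it returns every contiguous window of n tokens as a list; whether the real helper returns lists, tuples, or joined phrases is not visible here:

def n_grams(tokens, n):
    """Return every contiguous window of n tokens as a list."""
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

For example, n_grams(['a', 'b', 'c'], 2) yields [['a', 'b'], ['b', 'c']].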