def __init__(self,
                 data_params={},
                 tokenizer_pl=[],
                 _tokenizer_dir='only_news_commentary'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.tokenizer_dir = f'{_tokenizer_dir}_{self.__data_params["vocab_size"]}'
        self.tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', self.tokenizer_dir),
            'tokenizer.pkl')

        if os.path.isfile(self.tokenizer_path):
            return

        # load data from files
        data = news_commentary.zh_en()

        data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)
        data = reduce(lambda x, y: x + y, data)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
        self.get_tokenizer()

    def __init__(self, start_ratio=0.0, end_ratio=0.8, sample_rate=1.0):
        # load data from files
        data = news_commentary.zh_en()

        data = self.__split_data(data, start_ratio, end_ratio)
        data = reduce(lambda x, y: x + y, data)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # subsample the data if it is too large (low-resource setting)
        data = self.sample_data(data, sample_rate)

        self.__src_data, self.__tar_data = list(zip(*data))
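The constructors above call two helpers that are not shown in these snippets. Below is a minimal sketch of what they are assumed to do, inferred only from how they are called; the names, signatures, and behaviour are assumptions, not the project's actual implementation.

import random

def split_data(data, start_ratio, end_ratio):
    # Assumed behaviour of __split_data: take the [start_ratio, end_ratio)
    # slice of each parallel corpus, where `data` is a list of corpora and
    # each corpus is a list of (source, target) sentence pairs.
    result = []
    for corpus in data:
        start = int(len(corpus) * start_ratio)
        end = int(len(corpus) * end_ratio)
        result.append(corpus[start:end])
    return result

def sample_data(data, sample_rate):
    # Assumed behaviour of sample_data: keep a random fraction of the
    # sentence pairs (low-resource setting); sample_rate == 1.0 keeps everything.
    if sample_rate >= 1.0:
        return data
    return random.sample(data, int(len(data) * sample_rate))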
Example #3
    def __init__(self,
                 start_ratio=0.0,
                 end_ratio=0.98,
                 _sample_rate=1.0,
                 data_params={},
                 tokenizer_pl=[],
                 encoder_pl=[],
                 _tokenizer_dir='cdlm',
                 _dataset='cdlm'):
        # initialize variables
        self.__data_params = data_params
        self.__tokenizer_pl = tokenizer_pl
        self.__encoder_pl = encoder_pl
        self.__sample_rate = _sample_rate

        self.__tokenizer_path = os.path.join(
            create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
        self.__processed_dir_path = create_dir(data_dir, 'preprocessed',
                                               _dataset)

        # load data from files
        data = news_commentary.zh_en()

        data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get tokenizer
        if os.path.isfile(self.__tokenizer_path):
            self.__tokenizer = load_pkl(self.__tokenizer_path)
        else:
            tmp_data = reduce(lambda x, y: x + y, data)
            self.__tokenizer_src, self.__tokenizer_tar = list(zip(*tmp_data))
            self.get_tokenizer()

        # get the data set (train, validation, or test)
        data = self.__split_data(data, start_ratio, end_ratio)

        data = reduce(lambda x, y: x + y, data)

        self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
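create_dir and load_pkl are likewise utilities assumed by these snippets, used to build the tokenizer path and restore a cached tokenizer. A plausible minimal version, not the project's actual code:

import os
import pickle

def create_dir(*parts):
    # Assumed behaviour: join the path components, create the directory
    # if it does not exist yet, and return the resulting path.
    path = os.path.join(*parts)
    os.makedirs(path, exist_ok=True)
    return path

def load_pkl(path):
    # Assumed behaviour: load and return a pickled object (here, the cached tokenizer).
    with open(path, 'rb') as f:
        return pickle.load(f)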
Example #4
    def __init__(self, start_ratio=0.0, end_ratio=0.98, _dataset='cdlm'):
        # initialize variables
        self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

        # load data from files
        data = news_commentary.zh_en()

        data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        # get the data set (train, validation, or test)
        data = self.__split_data(data, start_ratio, end_ratio)

        data = reduce(lambda x, y: x + y, data)

        # shuffle the data
        random.seed(self.RANDOM_STATE)
        random.shuffle(data)

        self.gen_data(data, self.BATCH_SIZE_PER_FILE)
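Both constructors above end by writing the preprocessed pairs to disk in fixed-size chunks (gen_preprocessed_data / gen_data with BATCH_SIZE_PER_FILE). The sketch below shows that step under the assumption that each chunk is pickled to its own file in the processed directory; the function name and file layout are illustrative only.

import math
import os
import pickle

def gen_data_files(data, batch_size_per_file, out_dir):
    # Assumed behaviour: split the pairs into chunks of batch_size_per_file
    # and write each chunk to its own pickle file inside out_dir.
    num_files = math.ceil(len(data) / batch_size_per_file)
    for i in range(num_files):
        chunk = data[i * batch_size_per_file:(i + 1) * batch_size_per_file]
        with open(os.path.join(out_dir, f'{i}.pkl'), 'wb') as f:
            pickle.dump(chunk, f)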
    def __load_from_news_commentary(self):
        data = news_commentary.zh_en()
        data = self.__split_data(data, 0.,
                                 self.NMT_TRAIN_RATIO_NEWS_COMMENTARY)
        return reduce(lambda x, y: x + y, data)
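Putting the pieces together, the load / split / flatten / unzip flow used throughout these examples can be exercised end to end on a toy corpus. The toy data below is a stand-in for whatever news_commentary.zh_en() actually returns (assumed here to be a list of corpora, each a list of (Chinese, English) sentence pairs), split_data is the sketch given earlier, and the seed 42 stands in for RANDOM_STATE:

from functools import reduce
import random

# Toy stand-in for news_commentary.zh_en(): two tiny parallel corpora.
toy_data = [
    [('你好。', 'Hello.'), ('谢谢。', 'Thank you.')],
    [('早上好。', 'Good morning.'), ('晚安。', 'Good night.'), ('再见。', 'Goodbye.')],
]

train = split_data(toy_data, 0.0, 0.8)      # per-corpus slice, as assumed above
train = reduce(lambda x, y: x + y, train)   # flatten into one list of pairs
random.seed(42)                             # placeholder for RANDOM_STATE
random.shuffle(train)
src, tar = zip(*train)                      # unzip into source / target sides
print(src)
print(tar)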