def _load_paired_data(self, source_file, target_file):
    if self.overlength_strategy == 'drop':
        # Load both sides without truncation, then drop any pair where
        # either side exceeds its length limit, keeping the files aligned.
        loaded_source_text = load_data(source_file, self.tokenize_strategy,
                                       'none', self.max_source_length,
                                       self.source_language)
        loaded_target_text = load_data(target_file, self.tokenize_strategy,
                                       'none', self.max_target_length,
                                       self.target_language)
        assert len(loaded_source_text) == len(loaded_target_text)
        source_text = []
        target_text = []
        for src, tgt in zip(loaded_source_text, loaded_target_text):
            if (len(src) <= self.max_source_length
                    and len(tgt) <= self.max_target_length):
                source_text.append(src)
                target_text.append(tgt)
    else:
        # Other strategies (e.g. truncation) are handled inside load_data.
        source_text = load_data(source_file, self.tokenize_strategy,
                                self.overlength_strategy,
                                self.max_source_length,
                                self.source_language)
        target_text = load_data(target_file, self.tokenize_strategy,
                                self.overlength_strategy,
                                self.max_target_length,
                                self.target_language)

    return source_text, target_text
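Every example in this listing leans on a shared load_data helper whose signature evidently varies between dataset classes (compare the five-argument call above with the six-argument calls further down). A minimal sketch of the variant used here, assuming line-oriented files and whitespace tokenization; this is an illustration, not the library's code:

def load_data(dataset_file, tokenize_strategy, overlength_strategy,
              max_length, language):
    """Hypothetical sketch: read one example per line, tokenize it,
    and apply the overlength strategy."""
    text_data = []
    with open(dataset_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            # Whitespace split stands in for whichever tokenizers
            # tokenize_strategy and language would actually select.
            tokens = line.strip().split()
            if overlength_strategy == 'truncate':
                tokens = tokens[:max_length]
            # 'none' keeps overlength examples, so a caller (like the
            # 'drop' branch above) can filter aligned pairs afterwards.
            text_data.append(tokens)
    return text_data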
Example #2
def _load_source_data(self):
    # Requires self.target_text to be populated already: each source
    # split is checked against the matching target split for alignment.
    for i, prefix in enumerate(['train', 'valid', 'test']):
        filename = os.path.join(self.dataset_path, f'{prefix}.src')
        text_data = load_data(filename, self.tokenize_strategy,
                              self.source_max_length, self.source_language,
                              self.source_multi_sentence,
                              self.source_max_num)
        assert len(text_data) == len(self.target_text[i])
        self.source_text.append(text_data)
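The assert means the target side must be loaded before the source side. A hypothetical driver (the name _load_data and the empty-list initialization are assumptions, not from the source) showing the required ordering:

def _load_data(self):
    # The source loader asserts each split against self.target_text,
    # so targets must be loaded first.
    self.source_text, self.target_text = [], []
    self._load_target_data()
    self._load_source_data()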
Example #3
def _load_single_data(self, dataset_path):
    """Load the full corpus.

    This is designed for the single-sentence format, unconditional task.

    Args:
        dataset_path (str): path of the dataset directory.
    """
    dataset_file = os.path.join(dataset_path, 'corpus.txt')
    self.text_data = load_data(dataset_file, self.tokenize_strategy,
                               self.overlength_strategy,
                               self.max_seq_length, self.language)
    # Split the single corpus into train/valid/test by self.split_ratio.
    self.text_data = split_data([self.text_data], self.split_ratio)[0]
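split_data is assumed here to partition each corpus into contiguous train/valid/test chunks by ratio, which is consistent with the [corpus] in / [0] out call pattern above. A minimal sketch under that assumption (the real implementation may shuffle or round differently):

def split_data(corpora, split_ratio):
    """Hypothetical sketch: split each corpus by split_ratio,
    e.g. [0.8, 0.1, 0.1] -> [train, valid, test]."""
    split_corpora = []
    for corpus in corpora:
        splits, start = [], 0
        for ratio in split_ratio[:-1]:
            end = start + int(len(corpus) * ratio)
            splits.append(corpus[start:end])
            start = end
        splits.append(corpus[start:])  # the last split takes the remainder
        split_corpora.append(splits)
    return split_corpora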
Example #4
def _load_target_data(self):
    """Load dataset from target files (train, valid, test).

    This is designed for the single-sentence format.
    """
    for prefix in ['train', 'valid', 'test']:
        filename = os.path.join(self.dataset_path, f'{prefix}.tgt')
        text_data = load_data(
            filename, self.tokenize_strategy, self.target_max_length,
            self.target_language, self.target_multi_sentence, self.target_max_num
        )
        self.target_text.append(text_data)
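The target_multi_sentence / target_max_num arguments suggest a single line can carry several sentences. A hedged sketch of what that branch of load_data might do, assuming a '<sep>' delimiter (the actual delimiter is not visible in these examples):

def split_multi_sentence(line, max_length, max_num, delimiter='<sep>'):
    # Hypothetical helper: break a line into at most max_num sentences,
    # each tokenized and truncated to max_length tokens.
    sentences = [part.strip().split() for part in line.split(delimiter)]
    return [tokens[:max_length] for tokens in sentences[:max_num]]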
Example #5
def _load_split_data(self, dataset_path):
    """Load dataset from split files (train, dev, test).

    This is designed for the single-sentence format, unconditional task.

    Args:
        dataset_path (str): path of the dataset directory.
    """
    for prefix in ['train', 'dev', 'test']:
        filename = os.path.join(dataset_path, f'{prefix}.txt')
        text_data = load_data(filename, self.tokenize_strategy,
                              self.overlength_strategy,
                              self.max_seq_length, self.language)
        self.text_data.append(text_data)
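The two unconditional loaders above cover the two common layouts: a single corpus.txt split by ratio, or pre-split per-prefix files. A hypothetical dispatcher (the name and the existence check are assumptions) choosing between them:

import os

def _load_unconditional_data(self, dataset_path):
    # Prefer pre-split files when present; otherwise split the
    # single corpus by self.split_ratio.
    if os.path.exists(os.path.join(dataset_path, 'train.txt')):
        self._load_split_data(dataset_path)
    else:
        self._load_single_data(dataset_path)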
Example #6
def _load_source_data(self):
    for i, prefix in enumerate(['train', 'valid', 'test']):
        filename = os.path.join(self.dataset_path, f'{prefix}.src')

        text_data = load_data(filename, self.tokenize_strategy,
                              self.source_max_length, self.source_language,
                              True, self.source_max_num)
        assert len(text_data) == len(self.target_text[i])
        key_data = []
        for doc in text_data:
            key = []
            for kv in doc:
                # Each attribute's first token fuses key and value as
                # 'key<kv>value': split it, keep the value part in place,
                # and collect the key in a parallel structure.
                k, kv[0] = kv[0].split('<kv>')
                key.append(k)
            key_data.append(key)
        self.source_value_text.append(text_data)
        self.source_key_text.append(key_data)
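The in-place '<kv>' split above is easy to misread, since it mutates kv[0] while building the key list. A standalone demonstration on made-up data, mirroring the inner loop:

# One hypothetical document: each attribute is a tokenized 'sentence'
# whose first token fuses the key and first value token as 'key<kv>value'.
doc = [['name<kv>walter', 'white'], ['occupation<kv>chemistry', 'teacher']]

key = []
for kv in doc:
    k, kv[0] = kv[0].split('<kv>')  # key goes out, value stays in place
    key.append(k)

print(key)  # ['name', 'occupation']
print(doc)  # [['walter', 'white'], ['chemistry', 'teacher']]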