def _get_eval_iterator(self, task):
    split_words = self.args.input_type == 'word'
    src_data = TextLookupDataset(task.src_dataset, self.src_dict, split_words,
                                 bos=False, eos=False, lower=self.args.lower)
    tgt_data = None
    if task.tgt_dataset is not None:
        tgt_data = TextLookupDataset(task.tgt_dataset, self.tgt_dict, split_words,
                                     lower=self.args.lower)
    dataset = ParallelDataset(src_data, tgt_data)
    return dataset.get_iterator(batch_size=self.args.batch_size,
                                num_workers=self.args.data_loader_threads,
                                cuda=self.args.cuda)
def _get_eval_iterator(self, task):
    split_words = self.args.input_type == 'word'
    # Source side: apply the same noise model as in training, so the
    # denoising objective is evaluated on corrupted inputs.
    noisy_data = NoisyTextDataset(
        TextLookupDataset(task.src_dataset, self.dictionary, split_words,
                          bos=False, eos=False, lower=self.args.lower),
        self.args.word_shuffle,
        self.args.noise_word_dropout,
        self.args.word_blank,
        self.args.bpe_symbol)
    # Target side: the clean sentence, wrapped with BOS/EOS for teacher forcing.
    clean_data = TextLookupDataset(task.tgt_dataset, self.dictionary, split_words,
                                   bos=True, eos=True, lower=self.args.lower)
    dataset = ParallelDataset(noisy_data, clean_data)
    return dataset.get_iterator(batch_size=self.args.batch_size,
                                num_workers=self.args.data_loader_threads,
                                cuda=self.args.cuda)
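
# Illustration only (not part of this module): the word_shuffle, word_dropout
# and word_blank arguments passed to NoisyTextDataset above correspond to the
# usual denoising-autoencoder noise model (shuffle words within a small window,
# drop words with some probability, replace words with a blank token).
# A minimal sketch of that noise model on plain token lists, assuming the
# standard formulation; the real NoisyTextDataset may differ in details such
# as BPE handling via bpe_symbol.
import random

def add_noise_example(tokens, word_shuffle=3, word_dropout=0.1,
                      word_blank=0.2, blank_token='<blank>'):
    # Shuffle: each word may move at most `word_shuffle` positions from its origin.
    if word_shuffle > 0:
        keys = [i + random.uniform(0, word_shuffle) for i in range(len(tokens))]
        tokens = [t for _, t in sorted(zip(keys, tokens), key=lambda p: p[0])]
    # Dropout: remove each word with probability `word_dropout`,
    # but never return an empty sentence.
    if word_dropout > 0 and tokens:
        kept = [t for t in tokens if random.random() >= word_dropout]
        tokens = kept if kept else [random.choice(tokens)]
    # Blanking: replace each word with a placeholder with probability `word_blank`.
    if word_blank > 0:
        tokens = [blank_token if random.random() < word_blank else t for t in tokens]
    return tokens
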
def load_data(self, model_args=None):
    logger.info('Loading training data')
    split_words = self.args.input_type == 'word'

    train_src_name = os.path.basename(self.args.train_src)
    train_tgt_name = os.path.basename(self.args.train_tgt)

    # Load the raw text either fully into memory or via pre-computed line offsets.
    if self.args.load_into_memory:
        src_data = TextLineDataset.load_into_memory(self.args.train_src)
        tgt_data = TextLineDataset.load_into_memory(self.args.train_tgt)
    else:
        offsets_src = os.path.join(self.args.data_dir, train_src_name + '.idx.npy')
        offsets_tgt = os.path.join(self.args.data_dir, train_tgt_name + '.idx.npy')
        src_data = TextLineDataset.load_indexed(self.args.train_src, offsets_src)
        tgt_data = TextLineDataset.load_indexed(self.args.train_tgt, offsets_tgt)

    src_data = TextLookupDataset(src_data, self.src_dict, words=split_words,
                                 bos=False, eos=False,
                                 trunc_len=self.args.src_seq_length_trunc,
                                 lower=self.args.lower)
    tgt_data = TextLookupDataset(tgt_data, self.tgt_dict, words=split_words,
                                 bos=True, eos=True,
                                 trunc_len=self.args.tgt_seq_length_trunc,
                                 lower=self.args.lower)
    dataset = ParallelDataset(src_data, tgt_data)
    logger.info('Number of training sentences: {:,d}'.format(len(dataset)))

    # Pre-computed sentence lengths are used for length filtering and batching.
    src_len_filename = os.path.join(self.args.data_dir, train_src_name + '.len.npy')
    tgt_len_filename = os.path.join(self.args.data_dir, train_tgt_name + '.len.npy')
    src_lengths = np.load(src_len_filename)
    tgt_lengths = np.load(tgt_len_filename)

    def filter_fn(i):
        return (src_lengths[i] <= self.args.src_seq_length
                and tgt_lengths[i] <= self.args.tgt_seq_length)

    logger.info('Generating batches')
    batches = data_utils.generate_length_based_batches_from_lengths(
        np.maximum(src_lengths, tgt_lengths),
        self.args.batch_size_words,
        self.args.batch_size_sents,
        self.args.batch_size_multiplier,
        self.args.pad_count,
        key_fn=lambda i: (tgt_lengths[i], src_lengths[i]),
        filter_fn=filter_fn)
    logger.info('Number of training batches: {:,d}'.format(len(batches)))

    filtered = len(src_lengths) - sum(len(batch) for batch in batches)
    logger.info('Filtered {:,d}/{:,d} training examples for length'.format(
        filtered, len(src_lengths)))

    sampler = PreGeneratedBatchSampler(batches, self.args.curriculum == 0)

    model = self.build_model(model_args)
    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    lr_scheduler, optimizer = self._build_optimizer(params)
    return TrainData(model, dataset, sampler, lr_scheduler, optimizer,
                     self._get_training_metrics())
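
# Illustration only: a simplified version of the token-count batching that
# generate_length_based_batches_from_lengths presumably performs. Indices are
# sorted by a length key, then packed greedily so that no batch exceeds the
# word or sentence budget. The real implementation also accounts for pad_count
# and batch_size_multiplier; this hypothetical helper ignores those.
def make_length_based_batches_example(lengths, batch_size_words, batch_size_sents,
                                      key_fn=None, filter_fn=None):
    indices = [i for i in range(len(lengths)) if filter_fn is None or filter_fn(i)]
    indices.sort(key=key_fn or (lambda i: lengths[i]))

    batches, current, max_len = [], [], 0
    for i in indices:
        new_max = max(max_len, lengths[i])
        # Batch cost is counted as padded tokens: longest sentence * batch size.
        if current and (new_max * (len(current) + 1) > batch_size_words
                        or len(current) + 1 > batch_size_sents):
            batches.append(current)
            current, max_len = [], 0
            new_max = lengths[i]
        current.append(i)
        max_len = new_max
    if current:
        batches.append(current)
    return batches
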
def _get_train_dataset(self):
    logger.info('Loading training data')
    split_words = self.args.input_type == 'word'

    src_data, src_lengths = TextLookupDataset.load(
        self.args.train_src, self.src_dict, self.args.data_dir,
        self.args.load_into_memory, split_words,
        bos=False, eos=False,
        trunc_len=self.args.src_seq_length_trunc,
        lower=self.args.lower)
    # Optionally corrupt the source side with the denoising noise model.
    if self.args.translation_noise:
        src_data = NoisyTextDataset(src_data,
                                    self.args.word_shuffle,
                                    self.args.noise_word_dropout,
                                    self.args.word_blank,
                                    self.args.bpe_symbol)
    tgt_data, tgt_lengths = TextLookupDataset.load(
        self.args.train_tgt, self.tgt_dict, self.args.data_dir,
        self.args.load_into_memory, split_words,
        bos=True, eos=True,
        trunc_len=self.args.tgt_seq_length_trunc,
        lower=self.args.lower)
    src_data.lengths = src_lengths
    tgt_data.lengths = tgt_lengths
    dataset = ParallelDataset(src_data, tgt_data)
    logger.info('Number of training sentences: {:,d}'.format(len(dataset)))
    return dataset
def _get_eval_dataset(self, task: TranslationTask):
    split_words = self.args.input_type == 'word'
    src_dataset = TextLookupDataset(task.src_dataset, self.src_dict,
                                    words=split_words, lower=task.lower,
                                    bos=False, eos=False,
                                    trunc_len=self.args.src_seq_length_trunc)
    if self.args.eval_noise:
        src_dataset = NoisyTextDataset(src_dataset,
                                       self.args.word_shuffle,
                                       self.args.noise_word_dropout,
                                       self.args.word_blank,
                                       self.args.bpe_symbol)
    if task.tgt_dataset is not None:
        tgt_dataset = TextLookupDataset(task.tgt_dataset, self.tgt_dict,
                                        words=split_words, lower=task.lower,
                                        bos=True, eos=True,
                                        trunc_len=self.args.tgt_seq_length_trunc)
    else:
        tgt_dataset = None
    dataset = ParallelDataset(src_dataset, tgt_dataset)
    return dataset
def load_data(self, model_args=None):
    logger.info('Loading training data')
    split_words = self.args.input_type == 'word'

    train_clean_name = os.path.basename(self.args.train_clean)
    if self.args.load_into_memory:
        clean_data = TextLineDataset.load_into_memory(self.args.train_clean)
    else:
        offsets = os.path.join(self.args.data_dir, train_clean_name + '.idx.npy')
        clean_data = TextLineDataset.load_indexed(self.args.train_clean, offsets)

    # If no separate noisy corpus is given, the noisy side is generated
    # on the fly from the clean corpus.
    if self.args.train_noisy is not None:
        train_noisy_name = os.path.basename(self.args.train_noisy)
        if self.args.load_into_memory:
            noisy_data = TextLineDataset.load_into_memory(self.args.train_noisy)
        else:
            offsets = os.path.join(self.args.data_dir, train_noisy_name + '.idx.npy')
            noisy_data = TextLineDataset.load_indexed(self.args.train_noisy, offsets)
    else:
        noisy_data = clean_data

    noisy_data = NoisyTextDataset(
        TextLookupDataset(noisy_data, self.dictionary, words=split_words,
                          bos=False, eos=False,
                          trunc_len=self.args.seq_length_trunc,
                          lower=self.args.lower),
        self.args.word_shuffle,
        self.args.word_dropout,
        self.args.word_blank,
        self.args.bpe_symbol)
    clean_data = TextLookupDataset(clean_data, self.dictionary, words=split_words,
                                   bos=True, eos=True,
                                   trunc_len=self.args.seq_length_trunc,
                                   lower=self.args.lower)
    dataset = ParallelDataset(noisy_data, clean_data)
    logger.info('Number of training sentences: {:,d}'.format(len(dataset)))

    # Lengths of the clean side drive filtering and batch generation.
    clean_len_filename = os.path.join(self.args.data_dir, train_clean_name + '.len.npy')
    lengths = np.load(clean_len_filename)

    def filter_fn(i):
        return lengths[i] <= self.args.seq_length

    logger.info('Generating batches')
    batches = generate_length_based_batches_from_lengths(
        lengths,
        self.args.batch_size_words,
        self.args.batch_size_sents,
        self.args.batch_size_multiplier,
        self.args.pad_count,
        filter_fn=filter_fn)
    logger.info('Number of training batches: {:,d}'.format(len(batches)))

    filtered = len(lengths) - sum(len(batch) for batch in batches)
    logger.info('Filtered {:,d}/{:,d} training examples for length'.format(
        filtered, len(lengths)))

    sampler = PreGeneratedBatchSampler(batches, self.args.curriculum == 0)

    model = self.build_model(model_args)
    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    lr_scheduler, optimizer = self._build_optimizer(params)
    return TrainData(model, dataset, sampler, lr_scheduler, optimizer,
                     self._get_training_metrics())
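
# Illustration only: PreGeneratedBatchSampler(batches, self.args.curriculum == 0)
# receives the pre-built index batches plus a flag that, judging by its use here,
# enables shuffling once the curriculum phase is over. A minimal sketch of such
# a sampler on top of torch.utils.data; the real class is hypothetical here and
# may expose more (e.g. state for checkpoint resume).
import random
from torch.utils.data import Sampler

class PreGeneratedBatchSamplerExample(Sampler):
    def __init__(self, batches, shuffle=True):
        self.batches = batches
        self.shuffle = shuffle

    def __iter__(self):
        # Yield each pre-generated batch of example indices, optionally
        # visiting the batches in a random order every epoch.
        order = list(range(len(self.batches)))
        if self.shuffle:
            random.shuffle(order)
        for idx in order:
            yield self.batches[idx]

    def __len__(self):
        return len(self.batches)
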