Example #1
    def _get_eval_iterator(self, task):
        split_words = self.args.input_type == 'word'
        src_data = TextLookupDataset(task.src_dataset, self.src_dict, split_words, bos=False, eos=False,
                                     lower=self.args.lower)
        tgt_data = None
        if task.tgt_dataset is not None:
            tgt_data = TextLookupDataset(task.tgt_dataset, self.tgt_dict, split_words,
                                         lower=self.args.lower)
        dataset = ParallelDataset(src_data, tgt_data)
        return dataset.get_iterator(batch_size=self.args.batch_size,
                                    num_workers=self.args.data_loader_threads,
                                    cuda=self.args.cuda)
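Throughout these examples, TextLookupDataset wraps a line dataset together with a dictionary: it splits each line on whitespace when input_type is 'word' (otherwise, presumably, into characters), optionally lowercases it, maps every token to its dictionary index, and can prepend/append BOS and EOS markers. As a rough, hypothetical sketch of that lookup step (illustrative only, not the project's implementation):

import torch

# Hypothetical sketch of a dictionary-backed lookup; the real TextLookupDataset
# may differ in details such as unknown-word handling and truncation.
def lookup(line, vocab, split_words=True, lower=False, bos=None, eos=None, unk=1):
    if lower:
        line = line.lower()
    # 'word' input type splits on whitespace; otherwise fall back to characters
    tokens = line.split() if split_words else list(line.strip())
    ids = [vocab.get(tok, unk) for tok in tokens]
    if bos is not None:
        ids.insert(0, bos)  # beginning-of-sentence marker
    if eos is not None:
        ids.append(eos)     # end-of-sentence marker
    return torch.tensor(ids, dtype=torch.long)

vocab = {'hello': 4, 'world': 5}
print(lookup('Hello world', vocab, lower=True, bos=2, eos=3))  # tensor([2, 4, 5, 3])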
Example #2
    def _get_eval_iterator(self, task):
        split_words = self.args.input_type == 'word'
        noisy_data = NoisyTextDataset(
            TextLookupDataset(task.src_dataset,
                              self.dictionary,
                              split_words,
                              bos=False,
                              eos=False,
                              lower=self.args.lower), self.args.word_shuffle,
            self.args.noise_word_dropout, self.args.word_blank,
            self.args.bpe_symbol)

        clean_data = TextLookupDataset(task.tgt_dataset,
                                       self.dictionary,
                                       split_words,
                                       bos=True,
                                       eos=True,
                                       lower=self.args.lower)

        dataset = ParallelDataset(noisy_data, clean_data)
        return dataset.get_iterator(batch_size=self.args.batch_size,
                                    num_workers=self.args.data_loader_threads,
                                    cuda=self.args.cuda)
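The noisy/clean pairing above is the usual denoising-autoencoder setup: the encoder reads a corrupted copy of a sentence and the decoder learns to reproduce the clean one. NoisyTextDataset is project-specific, but its parameters (word_shuffle, noise_word_dropout, word_blank) match the noise functions popularised by unsupervised NMT: limited local shuffling, random word dropout, and random word blanking; bpe_symbol presumably lets the noise operate on whole words rather than subword pieces. A standalone sketch of those three operations (illustrative assumptions, not the project's code):

import random

# Illustrative noise functions in the spirit of word_shuffle / word_dropout /
# word_blank; the exact parameter semantics are assumptions.
def shuffle_words(tokens, k):
    # Move each word by at most k positions from its original index.
    if k <= 0:
        return tokens
    keys = [i + random.uniform(0, k) for i in range(len(tokens))]
    return [tok for _, tok in sorted(zip(keys, tokens))]

def drop_words(tokens, p):
    # Remove each word with probability p, keeping at least one token.
    kept = [tok for tok in tokens if random.random() >= p]
    return kept if kept else [random.choice(tokens)]

def blank_words(tokens, p, blank='<blank>'):
    # Replace each word with a placeholder with probability p.
    return [blank if random.random() < p else tok for tok in tokens]

random.seed(0)
sentence = 'the quick brown fox jumps'.split()
print(blank_words(drop_words(shuffle_words(sentence, 3), 0.1), 0.2))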
Example #3
    def load_data(self, model_args=None):
        logger.info('Loading training data')
        split_words = self.args.input_type == 'word'

        train_src_name = os.path.basename(self.args.train_src)
        train_tgt_name = os.path.basename(self.args.train_tgt)

        if self.args.load_into_memory:
            src_data = TextLineDataset.load_into_memory(self.args.train_src)
            tgt_data = TextLineDataset.load_into_memory(self.args.train_tgt)
        else:
            offsets_src = os.path.join(self.args.data_dir, train_src_name + '.idx.npy')
            offsets_tgt = os.path.join(self.args.data_dir, train_tgt_name + '.idx.npy')
            src_data = TextLineDataset.load_indexed(self.args.train_src, offsets_src)
            tgt_data = TextLineDataset.load_indexed(self.args.train_tgt, offsets_tgt)
        src_data = TextLookupDataset(src_data, self.src_dict, words=split_words, bos=False, eos=False,
                                     trunc_len=self.args.src_seq_length_trunc, lower=self.args.lower)
        tgt_data = TextLookupDataset(tgt_data, self.tgt_dict, words=split_words, bos=True, eos=True,
                                     trunc_len=self.args.tgt_seq_length_trunc, lower=self.args.lower)
        dataset = ParallelDataset(src_data, tgt_data)
        logger.info('Number of training sentences: {:,d}'.format(len(dataset)))

        src_len_filename = os.path.join(self.args.data_dir, train_src_name + '.len.npy')
        tgt_len_filename = os.path.join(self.args.data_dir, train_tgt_name + '.len.npy')
        src_lengths = np.load(src_len_filename)
        tgt_lengths = np.load(tgt_len_filename)

        def filter_fn(i):
            return src_lengths[i] <= self.args.src_seq_length and tgt_lengths[i] <= self.args.tgt_seq_length

        logger.info('Generating batches')
        batches = data_utils.generate_length_based_batches_from_lengths(
            np.maximum(src_lengths, tgt_lengths), self.args.batch_size_words,
            self.args.batch_size_sents,
            self.args.batch_size_multiplier,
            self.args.pad_count,
            key_fn=lambda i: (tgt_lengths[i], src_lengths[i]),
            filter_fn=filter_fn)
        logger.info('Number of training batches: {:,d}'.format(len(batches)))

        filtered = len(src_lengths) - sum(len(batch) for batch in batches)
        logger.info('Filtered {:,d}/{:,d} training examples for length'.format(filtered, len(src_lengths)))

        sampler = PreGeneratedBatchSampler(batches, self.args.curriculum == 0)

        model = self.build_model(model_args)
        params = list(filter(lambda p: p.requires_grad, model.parameters()))
        lr_scheduler, optimizer = self._build_optimizer(params)
        return TrainData(model, dataset, sampler, lr_scheduler, optimizer, self._get_training_metrics())
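generate_length_based_batches_from_lengths is a project helper, but the idea it implements, batching by token budget rather than by a fixed sentence count, is standard: order the length-filtered indices so that similar lengths sit together, then greedily fill each batch until a word budget or a sentence budget would be exceeded. A simplified, hypothetical version that ignores the batch_size_multiplier and pad_count refinements:

import numpy as np

# Simplified sketch of token-budgeted batching; the project's helper also
# handles batch_size_multiplier and pad_count, which are omitted here.
def length_based_batches(lengths, max_words, max_sents, key_fn=None, filter_fn=None):
    indices = [i for i in range(len(lengths)) if filter_fn is None or filter_fn(i)]
    if key_fn is not None:
        indices.sort(key=key_fn)  # group similar lengths to limit padding
    batches, batch, longest = [], [], 0
    for i in indices:
        # the padded cost of a batch is (#sentences) x (longest sentence in it)
        if batch and ((len(batch) + 1) * max(longest, lengths[i]) > max_words
                      or len(batch) == max_sents):
            batches.append(batch)
            batch, longest = [], 0
        batch.append(i)
        longest = max(longest, lengths[i])
    if batch:
        batches.append(batch)
    return batches

lengths = np.array([5, 7, 3, 12, 4, 9])
print(length_based_batches(lengths, max_words=16, max_sents=4, key_fn=lambda i: lengths[i]))
# [[2, 4, 0], [1], [5], [3]]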
Example #4
    def _get_train_dataset(self):
        logger.info('Loading training data')
        split_words = self.args.input_type == 'word'

        src_data, src_lengths = TextLookupDataset.load(self.args.train_src, self.src_dict, self.args.data_dir,
                                                       self.args.load_into_memory, split_words,
                                                       bos=False, eos=False, trunc_len=self.args.src_seq_length_trunc,
                                                       lower=self.args.lower)

        if self.args.translation_noise:
            src_data = NoisyTextDataset(src_data, self.args.word_shuffle, self.args.noise_word_dropout,
                                        self.args.word_blank, self.args.bpe_symbol)

        tgt_data, tgt_lengths = TextLookupDataset.load(self.args.train_tgt, self.tgt_dict, self.args.data_dir,
                                                       self.args.load_into_memory, split_words,
                                                       bos=True, eos=True, trunc_len=self.args.tgt_seq_length_trunc,
                                                       lower=self.args.lower)
        src_data.lengths = src_lengths
        tgt_data.lengths = tgt_lengths
        dataset = ParallelDataset(src_data, tgt_data)
        logger.info('Number of training sentences: {:,d}'.format(len(dataset)))
        return dataset
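TextLookupDataset.load condenses the loading branch spelled out in Example #3: depending on load_into_memory it either reads the whole file into RAM or, presumably, pairs the raw text file with a precomputed array of byte offsets (the .idx.npy files) so that any line can be fetched with a single seek. A minimal, hypothetical illustration of that offset-indexing pattern:

import numpy as np

# Hypothetical sketch of offset-indexed line access, in the spirit of the
# .idx.npy files used by TextLineDataset.load_indexed.
def build_offsets(path):
    # Record the byte offset at which every line starts.
    offsets, pos = [], 0
    with open(path, 'rb') as f:
        for line in f:
            offsets.append(pos)
            pos += len(line)
    return np.array(offsets, dtype=np.int64)

class IndexedText:
    def __init__(self, path, offsets):
        self.path, self.offsets = path, offsets

    def __len__(self):
        return len(self.offsets)

    def __getitem__(self, i):
        # Seek straight to line i instead of keeping the whole file in memory
        # (a real implementation would keep the file handle open).
        with open(self.path, 'rb') as f:
            f.seek(self.offsets[i])
            return f.readline().decode('utf-8').rstrip('\n')

# The offsets could be cached with np.save(path + '.idx.npy', offsets) and
# reloaded with np.load, matching the offsets_src/offsets_tgt files above.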
Example #5
    def _get_eval_dataset(self, task: TranslationTask):
        split_words = self.args.input_type == 'word'
        src_dataset = TextLookupDataset(task.src_dataset,
                                        self.src_dict,
                                        words=split_words,
                                        lower=task.lower,
                                        bos=False, eos=False,
                                        trunc_len=self.args.src_seq_length_trunc)

        if self.args.eval_noise:
            src_dataset = NoisyTextDataset(src_dataset, self.args.word_shuffle, self.args.noise_word_dropout,
                                           self.args.word_blank, self.args.bpe_symbol)

        if task.tgt_dataset is not None:
            tgt_dataset = TextLookupDataset(task.tgt_dataset,
                                            self.tgt_dict,
                                            words=split_words,
                                            lower=task.lower,
                                            bos=True, eos=True,
                                            trunc_len=self.args.tgt_seq_length_trunc)
        else:
            tgt_dataset = None
        dataset = ParallelDataset(src_dataset, tgt_dataset)
        return dataset
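ParallelDataset pairs the source example with the target example at the same index, and the target side is optional so the same code path works at inference time when no reference is available. Conceptually it behaves like a zip over two indexable datasets; a rough sketch under that assumption (hypothetical class, not the project's implementation):

# Rough sketch of index-aligned pairing, assuming ParallelDataset behaves like
# a zip over two equally sized, indexable datasets with an optional target side.
class PairDataset:
    def __init__(self, src, tgt=None):
        if tgt is not None and len(src) != len(tgt):
            raise ValueError('source and target must have the same number of lines')
        self.src, self.tgt = src, tgt

    def __len__(self):
        return len(self.src)

    def __getitem__(self, i):
        if self.tgt is None:
            return self.src[i], None  # inference: no reference available
        return self.src[i], self.tgt[i]

pairs = PairDataset(['ein test'], ['a test'])
print(pairs[0])  # ('ein test', 'a test')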
Example #6
    def load_data(self, model_args=None):
        logger.info('Loading training data')
        split_words = self.args.input_type == 'word'

        train_clean_name = os.path.basename(self.args.train_clean)

        if self.args.load_into_memory:
            clean_data = TextLineDataset.load_into_memory(
                self.args.train_clean)
        else:
            offsets = os.path.join(self.args.data_dir,
                                   train_clean_name + '.idx.npy')
            clean_data = TextLineDataset.load_indexed(self.args.train_clean,
                                                      offsets)

        if self.args.train_noisy is not None:
            train_noisy_name = os.path.basename(self.args.train_noisy)
            if self.args.load_into_memory:
                noisy_data = TextLineDataset.load_into_memory(
                    self.args.train_noisy)
            else:
                offsets = os.path.join(self.args.data_dir,
                                       train_noisy_name + '.idx.npy')
                noisy_data = TextLineDataset.load_indexed(
                    self.args.train_noisy, offsets)
        else:
            noisy_data = clean_data

        noisy_data = NoisyTextDataset(
            TextLookupDataset(noisy_data,
                              self.dictionary,
                              words=split_words,
                              bos=False,
                              eos=False,
                              trunc_len=self.args.seq_length_trunc,
                              lower=self.args.lower), self.args.word_shuffle,
            self.args.word_dropout, self.args.word_blank, self.args.bpe_symbol)

        clean_data = TextLookupDataset(clean_data,
                                       self.dictionary,
                                       words=split_words,
                                       bos=True,
                                       eos=True,
                                       trunc_len=self.args.seq_length_trunc,
                                       lower=self.args.lower)
        dataset = ParallelDataset(noisy_data, clean_data)
        logger.info('Number of training sentences: {:,d}'.format(len(dataset)))

        clean_len_filename = os.path.join(self.args.data_dir,
                                          train_clean_name + '.len.npy')
        lengths = np.load(clean_len_filename)

        def filter_fn(i):
            return lengths[i] <= self.args.seq_length

        logger.info('Generating batches')
        batches = generate_length_based_batches_from_lengths(
            lengths,
            self.args.batch_size_words,
            self.args.batch_size_sents,
            self.args.batch_size_multiplier,
            self.args.pad_count,
            filter_fn=filter_fn)
        logger.info('Number of training batches: {:,d}'.format(len(batches)))

        filtered = len(lengths) - sum(len(batch) for batch in batches)
        logger.info('Filtered {:,d}/{:,d} training examples for length'.format(
            filtered, len(lengths)))

        sampler = PreGeneratedBatchSampler(batches, self.args.curriculum == 0)

        model = self.build_model(model_args)
        params = list(filter(lambda p: p.requires_grad, model.parameters()))
        lr_scheduler, optimizer = self._build_optimizer(params)
        return TrainData(model, dataset, sampler, lr_scheduler, optimizer,
                         self._get_training_metrics())
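PreGeneratedBatchSampler hands the precomputed batches to the data loader; its second argument (self.args.curriculum == 0) is a boolean, which presumably toggles shuffling of the batch order, so that batches are visited in their generated order when curriculum learning is enabled. A minimal stand-in with that assumed behaviour, relying on the fact that a DataLoader's batch_sampler only needs to be an iterable of index lists:

import random

# Minimal stand-in for a pre-generated batch sampler; the real
# PreGeneratedBatchSampler may differ (e.g. re-shuffling between epochs).
class PrecomputedBatchSampler:
    def __init__(self, batches, shuffle=True):
        self.batches = batches
        self.shuffle = shuffle

    def __iter__(self):
        order = list(range(len(self.batches)))
        if self.shuffle:
            random.shuffle(order)
        for b in order:
            yield self.batches[b]  # one list of example indices per batch

    def __len__(self):
        return len(self.batches)

# Usage with PyTorch:
#   torch.utils.data.DataLoader(dataset, batch_sampler=PrecomputedBatchSampler(batches),
#                               collate_fn=...)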