Example #1
    def build_criterion(self, args):
        from fairseq import criterions

        if self.data_cfg.prepend_tgt_lang_tag and args.ignore_prefix_size != 1:
            raise ValueError('Please set "--ignore-prefix-size 1" since '
                             "target language ID token is prepended as BOS.")
        return criterions.build_criterion(args, self)
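The check above exists because the criterion must skip the prepended language-ID position when computing the loss. A minimal sketch with hypothetical tensor shapes, mirroring the slicing that fairseq's label_smoothed_cross_entropy criterion applies when --ignore-prefix-size is set:

import torch

# Hypothetical batch: (batch, tgt_len, vocab) log-probs, (batch, tgt_len) targets.
lprobs = torch.randn(2, 5, 100).log_softmax(dim=-1)
target = torch.randint(0, 100, (2, 5))

ignore_prefix_size = 1  # skip the prepended target-language ID token
lprobs = lprobs[:, ignore_prefix_size:, :].contiguous()
target = target[:, ignore_prefix_size:].contiguous()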
Example #2
    def _gpu_train_step(self, test_args):
        samples, src_dict, tgt_dict = test_utils.prepare_inputs(test_args)
        model = models.build_model(test_args, src_dict, tgt_dict)
        # Legacy fairseq signature: the criterion is built directly from the
        # dictionaries rather than from a task object.
        criterion = criterions.build_criterion(test_args, src_dict, tgt_dict)
        trainer = Trainer(test_args, model, criterion)
        logging_dict = trainer.train_step(next(samples))
        return trainer, logging_dict
Example #3
    def build_criterion(self, args: Namespace):
        """
        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for
        this task.
        Args:
            args (argparse.Namespace): parsed command-line arguments
        Returns:
            a :class:`~fairseq.criterions.FairseqCriterion` instance
        """
        from fairseq import criterions

        return criterions.build_criterion(args, self)
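For context, a hedged sketch of how this hook is typically reached: parse training args, set up the task, then build the criterion from the same namespace. The data directory below is hypothetical, and tasks.setup_task would need a real preprocessed data-bin to succeed:

from fairseq import options, tasks

parser = options.get_training_parser()
args = options.parse_args_and_arch(parser, input_args=[
    "data-bin/example",  # hypothetical preprocessed data directory
    "--task", "translation",
    "--arch", "transformer",
    "--criterion", "label_smoothed_cross_entropy",
])
task = tasks.setup_task(args)  # loads dictionaries from the data directory
criterion = task.build_criterion(args)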
Example #4
    def build_criterion(self, args):
        """
        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for
        this task.

        Args:
            args (argparse.Namespace): parsed command-line arguments

        Returns:
            a :class:`~fairseq.criterions.FairseqCriterion` instance
        """
        from fairseq import criterions
        return criterions.build_criterion(args, self)
Example #5
    def build_criterion(self, cfg: DictConfig):
        """
        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for
        this task.

        Args:
            cfg (omegaconf.DictConfig): configuration object

        Returns:
            a :class:`~fairseq.criterions.FairseqCriterion` instance
        """
        from fairseq import criterions

        return criterions.build_criterion(cfg, self)
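Continuing the Example #3 sketch: with fairseq's Hydra-era configs, a legacy namespace can be converted with a real fairseq helper and the criterion sub-config passed to this hook, which is roughly what fairseq's train entry point does:

from fairseq.dataclass.utils import convert_namespace_to_omegaconf

cfg = convert_namespace_to_omegaconf(args)  # args from the Example #3 sketch
criterion = task.build_criterion(cfg.criterion)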
Example #6
    def build_criterion(self, args):
        from fairseq import criterions

        if len(self.multitask_tasks) > 0:
            if self.args.target_is_code and args._name != "speech_to_unit":
                raise ValueError(
                    "set --criterion speech_to_unit for speech-to-unit loss with multitask"
                )
            elif not self.args.target_is_code and args._name != "speech_to_spectrogram":
                raise ValueError(
                    "set --criterion speech_to_spectrogram for speech-to-spectrogram loss with multitask"
                )

        return criterions.build_criterion(args, self)
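The args._name checked above is how fairseq's dataclass-based configs record which criterion was selected with --criterion. A quick standalone illustration with plain omegaconf and a hypothetical value:

from omegaconf import OmegaConf

criterion_cfg = OmegaConf.create({"_name": "speech_to_unit"})
assert criterion_cfg._name == "speech_to_unit"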
Example #7
    def build_criterion(self, args):
        """
        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for
        this task.

        Args:
            args (argparse.Namespace): parsed command-line arguments

        Returns:
            a :class:`~fairseq.criterions.FairseqCriterion` instance
        """
        from fairseq import criterions
        criterion = criterions.build_criterion(args, self)
        assert isinstance(
            criterion, criterions.fairseq_criterion.FairseqSequenceCriterion)
        return criterion
Example #8
    def build_criterion(self, args, criterion_type=None):
        from fairseq import criterions
        return criterions.build_criterion(args, self, criterion_type)
Example #9
    def build_criterion(self, args):
        from fairseq import criterions
        return criterions.build_criterion(args, self)
Example #10
    def __init__(self, opt, shared=None):
        # In general use a basic TorchAgent wherever possible
        super().__init__(opt, shared)
        if not shared:
            # this is not a shared instance of this class, so do full initialization

            # check early if we're going to be loading the model from a checkpoint
            model_file_exists = (self.opt.get('model_file')
                                 and os.path.isfile(self.opt['model_file']))

            # fairseq expects options to be in argparse format, instead of a dict
            # We also need to do some argument postprocessing and whatnot
            # We'll skip pretrained embeddings if we're going to override them with
            # a model checkpoint anyway
            self.args, self.opt = _fairseq_opt_wrapper(opt, model_file_exists)

            # seed the RNG
            torch.manual_seed(self.args.seed)

            # Just some identifying info
            self.id = "fairseq:{}".format(self.args.arch)

            # We need a placeholder task for fairseq
            self.task = _ParlaiTask(self.dict)

            # actually construct the model and generator
            self.model = self.build_model()

            # Construct the generator and scorer
            self.generator = SequenceGenerator(
                [self.model],
                tgt_dict=self.dict,
                beam_size=self.args.beam,
                stop_early=(not self.args.no_early_stop),
                normalize_scores=(not self.args.unnormalized),
                len_penalty=self.args.lenpen,
                unk_penalty=self.args.unkpen,
                sampling=self.args.sampling,
                sampling_topk=self.args.sampling_topk,
                sampling_temperature=self.args.sampling_temperature,
            )
            self.scorer = SequenceScorer([self.model], self.dict)

            # set up the grader and the trainer
            self.criterion = criterions.build_criterion(self.args, self.task)

            if getattr(self.args, 'fp16', None):
                self.trainer = fp16_trainer.FP16Trainer(
                    self.args, self.task, self.model, self.criterion)
            else:
                # TODO: we might choose to add a --no-fp16 opt in the future to
                # explicitly disable fp16 instead
                if torch.cuda.get_device_capability(0)[0] >= 7:
                    print("Heads up: using --fp16 could be a lot faster!")
                self.trainer = trainer.Trainer(self.args, self.task,
                                               self.model, self.criterion)

            # if the model already existed, let's preload it and the trainer
            if model_file_exists:
                print('Loading existing model params from ' +
                      self.opt['model_file'])
                self.load(self.opt.get('model_file'))

            # move things to the GPU if possible
            if self.use_cuda:
                self.model = self.model.cuda()
                self.generator = self.generator.cuda()
        else:
            self.model = shared['model']
            self.trainer = shared['trainer']
            self.generator = shared['generator']
            self.dict = shared['dict']
            self.args = shared['args']

        # Start things off clean
        self.reset()
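A hedged sketch of the share() counterpart implied by the else branch above, showing where those keys would come from. This follows ParlAI's shared-instance convention but is an illustration, not the agent's actual code:

    def share(self):
        # Expose the expensive objects so batch worker copies can reuse them.
        shared = super().share()
        shared['model'] = self.model
        shared['trainer'] = self.trainer
        shared['generator'] = self.generator
        shared['dict'] = self.dict
        shared['args'] = self.args
        return shared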
Example #11
    def build_criterion(self, args):
        from fairseq import criterions
        return criterions.build_criterion(args, self)
Example #12
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_path),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_path),
        weights_file=args.train_weights_path if hasattr(
            args, "train_weights_path") else None,
    )

    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_path),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_path),
        weights_file=None,
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    use_char_source = args.arch == "char_source"
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
        use_char_source=use_char_source,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")

    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    print("building criterion")
    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: \
        {sum(p.numel() for p in model.parameters())}")

    # Load pretrained model weights if applicable
    if args.pretrained_weights_file:
        utils.load_model_state(args.pretrained_weights_file,
                               model,
                               cuda_device=torch.cuda.current_device())

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(
            f"| Restoring individual models from {args.multi_model_restore_files}"
        )
        extra_state = multi_model.import_individual_models(
            args.multi_model_restore_files, trainer)
    else:
        extra_state = load_existing_checkpoint(checkpoint_path, trainer)
    return extra_state, trainer, dataset
Example #13
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.train_source_binary_path),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.train_target_binary_path),
        weights_file=args.train_weights_path if hasattr(
            args, "train_weights_path") else None,
    )

    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang, data_file=args.eval_source_binary_path),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang, data_file=args.eval_target_binary_path),
        weights_file=None,
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    use_char_source = args.arch == "char_source"
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
        use_char_source=use_char_source,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")

    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    print("building criterion")
    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: \
        {sum(p.numel() for p in model.parameters())}")

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    os.makedirs(args.save_dir, exist_ok=True)

    # If --restore-file is already present under --save-dir, use that one
    # instead of the --restore-file that may be present under
    # --restore-checkpoint-dir. The idea is that --restore-checkpoint-dir
    # allows the user to specify restoring from a different run's
    # checkpoint (possibly with different training params), while not
    # polluting the previous run's checkpoint directory with new checkpoints.
    # However, if training gets interrupted and the user restarts training,
    # we want to resume from the checkpoints under --save-dir, instead of
    # restarting again from the old run's checkpoint under
    # --restore-checkpoint-dir.
    #
    # Note that if args.restore_file is an absolute path, os.path.join() will
    # ignore previous directory args and just use the absolute path as is.
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    if os.path.exists(checkpoint_path):
        print(f"Using --save-dir={args.save_dir}, "
              f"--restore-file={args.restore_file}.")
    elif args.restore_checkpoint_dir:
        checkpoint_path = os.path.join(args.restore_checkpoint_dir,
                                       args.restore_file)
        print(f"Using --restore-checkpoint-dir={args.restore_checkpoint_dir}, "
              f"--restore-file={args.restore_file}.")

    if not os.path.isfile(checkpoint_path) and args.multi_model_restore_files:
        print(
            f"| Restoring individual models from {args.multi_model_restore_files}"
        )
        extra_state = multi_model.import_individual_models(
            args.multi_model_restore_files, trainer)
    else:
        extra_state = load_existing_checkpoint(
            checkpoint_path=checkpoint_path,
            trainer=trainer,
            restore_state=args.restore_checkpoint_state,
        )
    return extra_state, trainer, dataset
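The note about absolute paths in the snippet above relies on standard os.path.join semantics, which a quick check confirms (POSIX paths):

import os

assert os.path.join("/save", "checkpoint_last.pt") == "/save/checkpoint_last.pt"
# An absolute restore file discards the directory argument entirely:
assert os.path.join("/save", "/elsewhere/last.pt") == "/elsewhere/last.pt"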
Example #14
def main(args):
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.source_lang,
                                    args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits,
                                             args.source_lang,
                                             args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    if 0.0 < args.rank_scale < 1.0:
        patch_transformer(args, model)
        if args.wd2fd:
            no_decay, skiplist = [
                'fc1', 'fc2', 'embed_tokens', 'embed_positions', 'out_embed'
            ], []
        else:
            no_decay, skiplist = [], [
                'fc1', 'fc2', 'embed_tokens', 'embed_positions', 'out_embed'
            ]
    else:
        no_decay, skiplist = [], []
    spectral_init(args, model)

    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())))

    # Build trainer
    no_decay, skiplist = [], []
    if args.wd2fd_quekey:
        no_decay.extend(['_query.weight', '_key.weight'])
    else:
        skiplist.append('quekey')
    if args.wd2fd_outval:
        no_decay.extend(['_value.weight', 'output_perform.weight'])
    else:
        skiplist.append('outval')

    trainer = Trainer(args,
                      model,
                      criterion,
                      skiplist=skiplist,
                      no_decay=no_decay)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available
    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(
            checkpoint_path, epoch))
        if batch_offset == 0:
            trainer.lr_step(epoch)
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    if args.distributed_rank <= 0:
        writer = SummaryWriter(args.save_dir)
        with open(os.path.join(args.save_dir, 'args.json'), 'w') as f:
            json.dump(vars(args), f, indent=4)
    else:
        writer = SummaryWriter(
            os.path.join(args.save_dir, str(args.distributed_rank)))

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:

        if args.distributed_rank <= 0:
            writer.add_scalar('hyper/lr', lr, epoch)
            for form in ['QueKey', 'OutVal']:
                frobnorm, nucnorm, bound, nonorth = [], [], [], []
                for module in model.modules():
                    if hasattr(module, form.lower()):
                        U, VT = getattr(module, form.lower()).get_UVT()
                        for u, vt in zip(U, VT):
                            frobnorm.append(frobenius_norm(u, vt))
                            nucnorm.append(
                                torch.norm(torch.matmul(u, vt), 'nuc'))
                            bound.append(
                                (u.pow(2).sum() + vt.pow(2).sum()) / 2.)
                            nonorth.append(sum(non_orthogonality(u, vt)) / 2.)
                writer.add_scalar('FrobNorm/' + form,
                                  sum(frobnorm) / len(frobnorm), epoch)
                writer.add_scalar('NucNorm/' + form,
                                  sum(nucnorm) / len(nucnorm), epoch)
                writer.add_scalar('NucNorm/' + form + '-Bound',
                                  sum(bound) / len(bound), epoch)
                writer.add_scalar('NonOrth/' + form,
                                  sum(nonorth) / len(nonorth), epoch)
            frobnorm, nucnorm, bound, nonorth = [], [], [], []
            for name, module in model.named_modules():
                if not any(
                        block in name for block in
                    ['embed', '_query', '_key', '_value', 'output_perform']):
                    if hasattr(module,
                               'frobgrad') and not hasattr(module, 'get_UVT'):
                        U, VT = module.U.data, module.VT.data
                        frobnorm.append(frobenius_norm(U, VT))
                        nucnorm.append(torch.norm(torch.matmul(U, VT), 'nuc'))
                        nonorth.append(sum(non_orthogonality(U, VT)) / 2.)
                        bound.append((U.pow(2).sum() + VT.pow(2).sum()) / 2.)
                    elif hasattr(module, 'weight'):
                        frobnorm.append(torch.norm(module.weight.data))
                        nucnorm.append(torch.norm(module.weight.data, 'nuc'))
            writer.add_scalar('FrobNorm/Linear',
                              sum(frobnorm) / len(frobnorm), epoch)
            writer.add_scalar('NucNorm/Linear',
                              sum(nucnorm) / len(nucnorm), epoch)
            if nonorth:
                writer.add_scalar('NucNorm/Linear-Bound',
                                  sum(bound) / len(bound), epoch)
                writer.add_scalar('NonOrth/Linear',
                                  sum(nonorth) / len(nonorth), epoch)

        # train for one epoch
        train(args, trainer, dataset, epoch, batch_offset)

        # evaluate on the validation set
        if epoch % args.validate_interval == 0:
            for k, subset in enumerate(args.valid_subset.split(',')):
                val_loss = validate(args, trainer, dataset, subset, epoch)
                if k == 0:
                    # only use first validation loss to update the learning schedule
                    lr = trainer.lr_step(epoch, val_loss)

                    # save checkpoint
                    if not args.no_save:
                        save_checkpoint(trainer, args, epoch, 0, val_loss)
            for k in ['loss', 'nll_loss']:
                writer.add_scalar('valid/' + k,
                                  trainer.meters['valid_' + k].avg, epoch)
                writer.add_scalar('train/' + k,
                                  trainer.meters['train_' + k].avg, epoch)
        else:
            lr = trainer.lr_step(epoch)

        epoch += 1
        batch_offset = 0

        if trainer.get_num_updates() >= max_update:
            break
    train_meter.stop()

    print('| done training in {:.1f} seconds'.format(train_meter.sum))
    writer.flush()
    newpar = sum(p.numel() for p in model.parameters())
    if 0.0 < args.rank_scale < 1.0:
        args.rank_scale = 1.0
        origpar = sum(p.numel() for p in models.build_model(
            args, dataset.src_dict, dataset.dst_dict).parameters())
    else:
        origpar = newpar
    if args.distributed_rank <= 0:
        with open(os.path.join(args.save_dir, 'results.json'), 'w') as f:
            json.dump(
                {
                    'final validation loss':
                    trainer.meters['valid_nll_loss'].avg,
                    'original parameter count': origpar,
                    'compressed parameter count': newpar,
                    'compression ratio': newpar / origpar
                },
                f,
                indent=4)
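frobenius_norm and non_orthogonality above are helpers from the surrounding project and are not shown here. A minimal stand-in for the first, under the assumption that it measures the Frobenius norm of the factored product U @ VT (non_orthogonality is project-specific and is left out):

import torch

def frobenius_norm(U: torch.Tensor, VT: torch.Tensor) -> torch.Tensor:
    # ||U @ VT||_F, materializing the product for clarity (assumed definition).
    return torch.norm(torch.matmul(U, VT))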
Example #15
    def build_criterion(self):
        """Set up the grader."""
        # TorchAgent will call this without ready=True before self.args is ready
        return criterions.build_criterion(self.args, self.task)
Example #16
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError("Training on CPU is not supported")
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    validate_and_set_default_args(args)

    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.train_source_binary_prefix),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.train_target_binary_prefix),
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.eval_source_binary_prefix),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.eval_target_binary_prefix),
    )

    if args.log_verbose:
        print("Starting to load binarized data files.", flush=True)
    dataset = pytorch_translate_data.load_binarized_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
    )
    if args.log_verbose:
        print("Finished loading dataset", flush=True)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print(f"| [{dataset.src}] dictionary: {len(dataset.src_dict)} types")
    print(f"| [{dataset.dst}] dictionary: {len(dataset.dst_dict)} types")

    for split in splits:
        print(f"| {split} {len(dataset.splits[split])} examples")

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print(f"| model {args.arch}, criterion {criterion.__class__.__name__}")
    print(f"| num. model params: \
        {sum(p.data.numel() for p in model.parameters())}")

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print(f"| training on {args.distributed_world_size} GPUs")
    print(
        f"| max tokens per GPU = {args.max_tokens} and "
        f"max sentences per GPU = {args.max_sentences}",
        flush=True,
    )

    extra_state = load_existing_checkpoint(args.save_dir, args.restore_file,
                                           trainer)

    return extra_state, trainer, dataset
Example #17
def main(args):
    print(args)

    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(
            args.data, splits, args.source_lang, args.target_lang)
    else:
        dataset = data.load_raw_text_dataset(
            args.data, splits, args.source_lang, args.target_lang)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split, len(dataset.splits[split])))

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict, dataset.dst_dict)
    print('| model {}, criterion {}'.format(args.arch, criterion.__class__.__name__))
    print('| num. model params: {}'.format(sum(p.data.numel() for p in model.parameters())))

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Load the latest checkpoint if one is available
    os.makedirs(args.save_dir, exist_ok=True)
    checkpoint_path = os.path.join(args.save_dir, args.restore_file)
    extra_state = trainer.load_checkpoint(checkpoint_path)
    if extra_state is not None:
        epoch = extra_state['epoch']
        batch_offset = extra_state['batch_offset']
        print('| loaded checkpoint {} (epoch {})'.format(checkpoint_path, epoch))
        if batch_offset == 0:
            trainer.lr_step(epoch)
            epoch += 1
    else:
        epoch, batch_offset = 1, 0

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    while lr > args.min_lr and epoch <= max_epoch:
        # train for one epoch
        train(args, trainer, dataset, epoch, batch_offset)

        # evaluate on the validation set
        for k, subset in enumerate(args.valid_subset.split(',')):
            val_loss = validate(args, trainer, dataset, subset, epoch)
            if k == 0:
                # only use first validation loss to update the learning schedule
                lr = trainer.lr_step(epoch, val_loss)

                # save checkpoint
                if not args.no_save:
                    save_checkpoint(trainer, args, epoch, 0, val_loss)

        epoch += 1
        batch_offset = 0
    train_meter.stop()

    print('| done training in {:.1f} seconds'.format(train_meter.sum))
Example #18
def setup_training(args):
    """Parse args, load dataset, and load model trainer."""
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)
    torch.manual_seed(args.seed)

    # Load dataset
    splits = [args.train_subset, args.valid_subset]

    if args.source_lang is None:
        args.source_lang = 'src'
    if args.target_lang is None:
        args.target_lang = 'tgt'

    assert_corpora_files_specified(args)
    train_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.train_source_text_file,
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.train_target_text_file,
        ),
    )
    eval_corpus = pytorch_translate_data.ParallelCorpusConfig(
        source=pytorch_translate_data.CorpusConfig(
            dialect=args.source_lang,
            data_file=args.eval_source_text_file,
        ),
        target=pytorch_translate_data.CorpusConfig(
            dialect=args.target_lang,
            data_file=args.eval_target_text_file,
        ),
    )

    if args.log_verbose:
        print('Starting to load raw text files.', flush=True)
    dataset = pytorch_translate_data.load_raw_text_dataset(
        train_corpus=train_corpus,
        eval_corpus=eval_corpus,
        train_split=args.train_subset,
        eval_split=args.valid_subset,
        args=args,
    )
    if args.log_verbose:
        print('Finished loading dataset', flush=True)
    if args.source_lang is None or args.target_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.source_lang, args.target_lang = dataset.src, dataset.dst

    print('| [{}] dictionary: {} types'.format(
        dataset.src,
        len(dataset.src_dict),
    ))
    print('| [{}] dictionary: {} types'.format(
        dataset.dst,
        len(dataset.dst_dict),
    ))
    for split in splits:
        print('| {} {} examples'.format(
            split,
            len(dataset.splits[split]),
        ))

    # Build model and criterion
    model = models.build_model(args, dataset.src_dict, dataset.dst_dict)
    criterion = criterions.build_criterion(args, dataset.src_dict,
                                           dataset.dst_dict)
    print('| model {}, criterion {}'.format(
        args.arch,
        criterion.__class__.__name__,
    ))
    print('| num. model params: {}'.format(
        sum(p.data.numel() for p in model.parameters())))

    # Build trainer
    trainer = Trainer(args, model, criterion)
    print('| training on {} GPUs'.format(args.distributed_world_size))
    print(
        '| max tokens per GPU = {} and max sentences per GPU = {}'.format(
            args.max_tokens,
            args.max_sentences,
        ),
        flush=True,
    )

    extra_state = load_existing_checkpoint(
        args.save_dir,
        args.restore_file,
        trainer,
    )

    return extra_state, trainer, dataset