Example #1
    def _build_optimizer(self):
        if self.args.optimizer != 'adam_cbn':
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters()),
                ))
        else:
            # selection
            from fairseq.modules.norms.constraint_bn_v2 import Constraint_Lagrangian
            constraint_param = []
            for m in self.model.modules():
                if isinstance(m, Constraint_Lagrangian):
                    constraint_param.extend(list(map(id, m.parameters())))
            params_lag = list(
                filter(lambda p: id(p) in constraint_param,
                       self.model.parameters()))
            params = list(
                filter(
                    lambda p: id(p) not in constraint_param and p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters())))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            # check cbn
            if self.args.optimizer != 'adam_cbn':
                self._optimizer = optim.build_optimizer(self.args, params)
            else:
                self._optimizer = optim.build_optimizer(
                    self.args, params, params_lag)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
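
The split in Example #1 works by collecting the object ids of the constraint parameters and filtering every parameter against that set. A minimal sketch of the same idiom in plain PyTorch, where nn.LayerNorm merely stands in for Constraint_Lagrangian (it is not the fairseq module):

import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))

# ids of the "special" parameters that should get their own optimizer group
special_ids = {id(p) for m in model.modules()
               if isinstance(m, nn.LayerNorm)
               for p in m.parameters()}

params_special = [p for p in model.parameters() if id(p) in special_ids]
params_regular = [p for p in model.parameters()
                  if id(p) not in special_ids and p.requires_grad]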
Example #2
 def build_optimizer(cls, args, params):
     """
     Args:
         args (argparse.Namespace): fairseq args
         params (iterable): iterable of parameters to optimize
     """
     flatten = not getattr(args, 'fp16_no_flatten_grads', False)
     fp32_params = cls.build_fp32_params(params, flatten=flatten)
     if flatten:
         fp32_optimizer = optim.build_optimizer(args, [fp32_params])
     else:
         fp32_optimizer = optim.build_optimizer(args, fp32_params)
     return cls(args, params, fp32_optimizer, fp32_params)
Example #3
    def __init__(self, args, model):

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')

        self.args = args

        self.model = model.cuda()
        self.criterion = CRITERION_REGISTRY[args.criterion](args).cuda()
        self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

        if args.amp:
            model, optimizer = amp.initialize(
                    self.model,
                    self.optimizer._optimizer, 
                    opt_level=self.args.amp_level if self.args.amp_level else 'O2',
                    max_loss_scale=2**15,
                    cast_model_outputs=torch.float16
                    )

        if self.args.distributed_world_size > 1:
            self.model = DDP(model)

        self._buffered_stats = defaultdict(lambda: [])
        self._flat_grads = None
        self._num_updates = 0
        self._num_val_iterations = 0
        self._optim_history = None
        self.throughput_meter = TimeMeter()
Example #4
    def __init__(self, args, model, criterion):

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')

        self.args = args

        # copy model and criterion to current device
        self.model = model.cuda()
        self.criterion = criterion.cuda()

        # initialize optimizer and LR scheduler
        self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

        # initialize meters
        self.meters = OrderedDict()
        self.meters['train_loss'] = AverageMeter()
        self.meters['train_nll_loss'] = AverageMeter()
        self.meters['valid_loss'] = AverageMeter()
        self.meters['valid_nll_loss'] = AverageMeter()
        self.meters['wps'] = TimeMeter()       # words per second
        self.meters['ups'] = TimeMeter()       # updates per second
        self.meters['wpb'] = AverageMeter()    # words per batch
        self.meters['bsz'] = AverageMeter()    # sentences per batch
        self.meters['gnorm'] = AverageMeter()  # gradient norm
        self.meters['clip'] = AverageMeter()   # % of updates clipped
        self.meters['oom'] = AverageMeter()    # out of memory

        self._max_bsz_seen = 0
        self._num_updates = 0
Example #5
    def _build_optimizer(self):

        params = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        print('| num. model params: {} (num. optimized: {})'.format(
            sum(p.numel() for p in self.model.parameters()),
            sum(p.numel() for p in self._optimizer.params if p.requires_grad),
        ))
        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
Example #6
    def load_checkpoint(self, filename):
        """Load all training state from a checkpoint file."""
        extra_state, self._optim_history, last_optim_state = utils.load_model_state(
            filename, self.model, cuda_device=torch.cuda.current_device())

        if last_optim_state is not None:
            # rebuild optimizer after loading model, since params may have changed
            self.optimizer = optim.build_optimizer(self.args,
                                                   self.model.parameters())
            self.lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.optimizer)

            # only reload optimizer and lr_scheduler if they match
            last_optim = self._optim_history[-1]
            if last_optim['criterion_name'] == self.criterion.__class__.__name__:
                self.lr_scheduler.load_state_dict(last_optim['lr_scheduler_state'])
                if last_optim['optimizer_name'] == self.optimizer.__class__.__name__:
                    self.optimizer.load_state_dict(last_optim_state)

            self._num_updates = last_optim['num_updates']

        return extra_state
Example #7
    def build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                self.model.parameters(),
            ))

        if self.args.fp16:
            self.args.fp16_scale_window = 2**14 / self.args.world_size / self.args.gradient_accumulation_steps
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self.optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self.optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self.optimizer = optim.build_optimizer(self.args, params)

        self.lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self.lr_scheduler.step_update(0)
Example #8
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print('| WARNING: your device does NOT support faster training with --fp16, '
                      'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print('| NOTICE: your device may support faster training with --fp16')
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Example #9
 def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
     """
     Args:
         args (argparse.Namespace): fairseq args
         params (iterable): iterable of parameters to optimize
     """
     fp16_optimizer = optim.build_optimizer(cfg.optimizer, params)
     return cls(cfg, params, fp16_optimizer, **kwargs)
Example #10
 def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
     """
     Args:
         cfg (omegaconf.DictConfig): fairseq args
         params (iterable): iterable of parameters to optimize
     """
     fp32_optimizer = optim.build_optimizer(cfg.optimizer, params)
     return cls(cfg, params, fp32_optimizer, **kwargs)
Example #11
 def build_optimizer(cls, args, params):
     """
     Args:
         args (argparse.Namespace): fairseq args
         params (iterable): iterable of parameters to optimize
     """
     fp16_optimizer = optim.build_optimizer(args, params)
     return cls(args, params, fp16_optimizer)
Example #12
 def build_optimizer(cls, args, params):
     """
     Args:
         args (argparse.Namespace): fairseq args
         params (iterable): iterable of parameters to optimize
     """
     flatten = not getattr(args, 'fp16_no_flatten_grads', False)
     fp32_params = cls.build_fp32_params(params, flatten=flatten)
     if flatten:
         fp32_optimizer = optim.build_optimizer(args, [fp32_params])
     else:
         fp32_optimizer = optim.build_optimizer(args, fp32_params)
     if flatten and not fp32_optimizer.supports_flat_params:
         raise RuntimeError(
             'chosen optimizer does not support flat params, '
             'please set --fp16-no-flatten-grads')
     return cls(args, params, fp32_optimizer, fp32_params)
Example #13
def load_existing_checkpoint(checkpoint_path, trainer, restore_state=True):
    extra_state = None
    loaded = False
    if restore_state:
        extra_state = trainer.load_checkpoint(checkpoint_path)
        if extra_state is None:
            loaded = False
            print(
                f"Failed to load checkpoint and state from {checkpoint_path}.")
        else:
            loaded = True
            print(
                f"| loaded checkpoint {checkpoint_path} (epoch {extra_state['epoch']})\n"
                f"| extra_state {extra_state}")
            # batch_offset being None denotes this was a checkpoint saved at
            # the end of an epoch (after the last batch).
            if extra_state["batch_offset"] is None:
                trainer.lr_step(extra_state["epoch"])
                extra_state["epoch"] += 1
                extra_state["batch_offset"] = 0

            # check availability for checkpoint backward compatibility
            if "start_time" not in extra_state:
                extra_state["start_time"] = time.time()

            if "last_bleu_eval" not in extra_state:
                extra_state["last_bleu_eval"] = 0

    else:
        # TODO(weiho): use trainer.load_checkpoint(load_optim=False) after
        # that's been synced to open-source fairseq.
        dummy_state, _, _ = utils.load_model_state(
            checkpoint_path,
            trainer.model,
            cuda_device=torch.cuda.current_device())
        trainer.optimizer = optim.build_optimizer(trainer.args,
                                                  trainer.model.parameters())
        trainer.lr_scheduler = optim.lr_scheduler.build_lr_scheduler(
            trainer.args, trainer.optimizer)
        trainer._optim_history = []

        if dummy_state is None:
            loaded = False
            print(f"Failed to load checkpoint weights from {checkpoint_path}.")
        else:
            loaded = True
            print(f"Loaded checkpoint weights from {checkpoint_path}.")

    if extra_state is None:
        extra_state = {
            "epoch": 1,
            "batch_offset": 0,
            "val_loss": None,
            "start_time": time.time(),
            "last_bleu_eval": 0,
        }

    return loaded, extra_state
Example #14
 def build_optimizer(cls, args, params):
     """
     Args:
         args (argparse.Namespace): fairseq args
         params (iterable): iterable of parameters to optimize
     """
     fp32_params = cls.build_fp32_params(params)
     fp32_optimizer = optim.build_optimizer(args, [fp32_params])
     return cls(args, params, fp32_optimizer, fp32_params)
Example #15
    def _build_optimizer(self):
        params_nmt = self.model.get_nmt_parameters()

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                raise NotImplementedError
            else:
                self._optimizer_nmt = optim.FP16Optimizer.build_optimizer(
                    self.args, params_nmt)
                self._optimizer_adv = optim.FP16Optimizer.build_optimizer(
                    self.args, self.model.get_adv_parameters())

        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer_nmt = optim.build_optimizer(self.args, params_nmt)

            self._optimizer_adv = optim.build_optimizer(
                self.args, self.model.get_adv_parameters())

        self._optimizer = {}
        self._optimizer['nmt'] = self._optimizer_nmt
        self._optimizer['adv'] = self._optimizer_adv

        print(
            '| num. model params: {} (num. optimized: {} ( nmt: {}, adv classifier: {} ) )'
            .format(
                sum(p.numel() for p in self.model.parameters()),
                sum(p.numel() for optim in self._optimizer.values()
                    for p in optim.params if p.requires_grad),
                sum(p.numel() for p in self._optimizer['nmt'].params
                    if p.requires_grad),
                sum(p.numel() for p in self._optimizer['adv'].params
                    if p.requires_grad)))
        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer['nmt'])
Example #16
 def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
     """
     Args:
         cfg (omegaconf.DictConfig): fairseq args
         params (iterable): iterable of parameters to optimize
     """
     flatten = not getattr(cfg.common, "fp16_no_flatten_grads", False)
     if getattr(cfg.common, "bf16", False):
         flatten = False  # mixed precision is faster on TPUs without flat grads
     fp32_params = cls.build_fp32_params(cfg.optimizer, params, flatten=flatten)
     if flatten:
         fp32_optimizer = optim.build_optimizer(cfg.optimizer, [fp32_params])
     else:
         fp32_optimizer = optim.build_optimizer(cfg.optimizer, fp32_params)
     if flatten and not fp32_optimizer.supports_flat_params:
         raise RuntimeError(
             f"chosen optimizer {fp32_optimizer.__class__.__name__} does not support flat params, please set --fp16-no-flatten-grads"
         )
     return cls(cfg, params, fp32_optimizer, fp32_params, **kwargs)
Example #17
 def build_optimizer(cls, args, params):
     """
     Args:
         args (argparse.Namespace): fairseq args
         params (iterable): iterable of parameters to optimize
     """
     flatten = not getattr(args, "fp16_no_flatten_grads", False)
     if getattr(args, "bf16", False):
         flatten = False  # mixed precision is faster on TPUs without flat grads
     fp32_params = cls.build_fp32_params(args, params, flatten=flatten)
     if flatten:
         fp32_optimizer = optim.build_optimizer(args, [fp32_params])
     else:
         fp32_optimizer = optim.build_optimizer(args, fp32_params)
     if flatten and not fp32_optimizer.supports_flat_params:
         raise RuntimeError(
             "chosen optimizer does not support flat params, "
             "please set --fp16-no-flatten-grads")
     return cls(args, params, fp32_optimizer, fp32_params)
Example #18
    def _build_optimizer(self):
        # params = list(
        #     filter(
        #         lambda p: p.requires_grad,
        #         chain(self.model.parameters(), self.criterion.parameters()),
        #     )
        # )
        params_dict = {}
        _default_manifold = Euclidean()
        for name, p in chain(self.model.named_parameters(),
                             self.criterion.named_parameters()):
            if not p.requires_grad:
                continue
            if isinstance(p, (ManifoldParameter, ManifoldTensor)):
                _manifold = p.manifold
            else:
                _manifold = _default_manifold
            _manifold_name = _manifold.__class__.__name__
            if _manifold_name not in params_dict:
                ref_grad = _manifold.egrad2rgrad(p.new_zeros(1), p.new_ones(1))
                coef = 1 if ref_grad == 1 else 1
                #print(f"lr={self.args.lr}, ref={ref_grad.item()}")
                params_dict[_manifold_name] = dict(
                    params=[],
                    lr_rectifier=ref_grad.reciprocal().item() * coef)
            params_dict[_manifold_name]['params'].append(p)
        params = params_dict.values()

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Example #19
    def _build_optimizer(self):
        params = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
        # for n, p in list(self.model.named_parameters()):
        #     if p.requires_grad and n.startswith('encoder.bert'):
        #         print(n)
        #     else:
        #         print('=====%s',n)
        # params = [(n, p) for n, p in list(self.model.parameters()) if n.startswith('bert.model')]

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            if self.args.sep_optim:
                # bert_params = [(n, p) for n, p in list(self.model.named_parameters()) if n.startswith('encoder.bert')]
                # dec_params = [(n, p) for n, p in list(self.model.named_parameters()) if not n.startswith('encoder.bert')]
                bert_params = [
                    p for n, p in list(self.model.named_parameters())
                    if n.startswith('encoder.bert')
                ]
                dec_params = [
                    p for n, p in list(self.model.named_parameters())
                    if not n.startswith('encoder.bert')
                ]
                self._optimizer = optim.build_optimizer_bert(
                    self.args, bert_params)
                self._dec_optimizer = optim.build_optimizer_dec(
                    self.args, dec_params)
            else:
                self._optimizer = optim.build_optimizer(self.args, params)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        if self.args.sep_optim:
            self._lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.optimizer)
            self._dec_lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self._dec_optimizer, decoder=True)
        else:
            self._lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.optimizer)
Example #20
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )

        if self.cfg.common.fp16 or self.cfg.common.bf16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster"
                )
            if (
                self.cfg.common.memory_efficient_fp16
                or self.cfg.common.memory_efficient_bf16
            ):
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.cfg, params
                )
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info("NOTE: your device may support faster training with --fp16")
            self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)

        if self.cfg.optimization.use_bmuf:
            self._optimizer = optim.FairseqBMUF(
                self.cfg.bmuf,
                self._optimizer,
            )

        if self.cfg.distributed_training.zero_sharding == "os":
            if (
                self.cfg.common.fp16
                and not self.cfg.common.memory_efficient_fp16
                and not self.cfg.common.memory_efficient_bf16
            ) and not self.cfg.common.fp16_no_flatten_grads:
                raise ValueError(
                    "ZeRO is incomptabile with fp16 and flattened grads. "
                    "Please use --fp16-no-flatten-grads"
                )
            else:
                optim.shard_(self._optimizer, self.data_parallel_process_group)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.cfg.lr_scheduler,
            self.optimizer,
        )
        self._lr_scheduler.step_update(0)
Example #21
    def _build_optimizer(self):
        if self.ulmfit:
            params = []
            multiplier_map = []
            for n, p in self.model.named_parameters():
                if p.requires_grad:
                    params.append(p)

                    param_name_split = n.split('.')

                    if param_name_split[2] == 'lm_head':  # last layer
                        multiplier = 1.
                    elif param_name_split[4].isdigit():  # encoder layer
                        layer = int(param_name_split[4])

                        multiplier = self.decay_rate_lrc**-(self.num_layers -
                                                            layer)
                    else:  # first layer
                        multiplier = self.decay_rate_lrc**-(self.num_layers +
                                                            1)

                    multiplier_map.append(multiplier)
        else:
            params = list(
                filter(lambda p: p.requires_grad, self.model.parameters()))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, params,
                                                self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
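
Example #21 builds a per-parameter multiplier map so that layers closest to the output keep the full learning rate while earlier layers are decayed. A minimal sketch of the same layer-wise decay with plain PyTorch parameter groups; the decay factor and layer count below are illustrative assumptions, not values from the example:

import torch
import torch.nn as nn

base_lr = 1e-3
decay = 2.6  # assumed ULMFiT-style decay factor
layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(4)])
num_layers = len(layers)

# layer i trains at base_lr * decay**-(num_layers - 1 - i): the last layer gets
# the full base_lr, earlier layers get exponentially smaller rates
groups = [{'params': layer.parameters(),
           'lr': base_lr * decay ** -(num_layers - 1 - i)}
          for i, layer in enumerate(layers)]
optimizer = torch.optim.Adam(groups)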
Example #22
    def _build_optimizer(self):
        if self.args.fp16:
            if torch.cuda.get_device_capability(0)[0] < 7:
                print('| WARNING: your device does NOT support faster training with --fp16, '
                      'please switch to FP32 which is likely to be faster')
            params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
            self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if torch.cuda.get_device_capability(0)[0] >= 7:
                print('| NOTICE: your device may support faster training with --fp16')
            self._optimizer = optim.build_optimizer(self.args, self.model.parameters())

        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self._optimizer)
Example #23
    def _build_optimizer(self):
        if self.freeze_bart:
            for name, param in self.model.named_parameters():
                if name.startswith('encoder') and name not in [
                        "encoder.structure_att.exparam",
                        "encoder.structure_att.tp_linear.weight",
                        "encoder.structure_att.tp_linear.bias",
                        "encoder.structure_att.tc_linear.weight",
                        "encoder.structure_att.tc_linear.bias",
                        "encoder.structure_att.fi_linear.weight",
                        "encoder.structure_att.bilinear._weight_matrix",
                        "encoder.structure_att.bilinear._bias",
                        "encoder.structure_att.fzlinear.weight",
                        "encoder.structure_att.fzlinear.bias",
                        "encoder.str_to_enc_linear.weight",
                        "encoder.str_to_enc_linear.bias"
                ]:
                    param.requires_grad = False
            print("Freezing parameters")
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
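
Example #23 first freezes most encoder parameters by name and only then collects whatever still requires gradients. The same idiom in plain PyTorch, with a made-up module layout and allow-list (these names are hypothetical, not the BART/structured-attention names above):

import torch.nn as nn

model = nn.ModuleDict({'encoder': nn.Linear(4, 4), 'decoder': nn.Linear(4, 4)})
keep = {'encoder.bias'}  # hypothetical allow-list of encoder params left trainable

for name, param in model.named_parameters():
    if name.startswith('encoder') and name not in keep:
        param.requires_grad = False

params = [p for p in model.parameters() if p.requires_grad]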
Example #24
    def build_optimizer(args, params):
        # create FP32 copy of parameters and grads
        total_param_size = sum(p.data.numel() for p in params)
        fp32_params = params[0].new(0).float().new(total_param_size)
        offset = 0
        for p in params:
            numel = p.data.numel()
            fp32_params[offset:offset + numel].copy_(p.data.view(-1))
            offset += numel
        fp32_params = torch.nn.Parameter(fp32_params)
        fp32_params.grad = fp32_params.data.new(total_param_size)

        fp32_optimizer = optim.build_optimizer(args, [fp32_params])
        return FP16Optimizer(args, params, fp32_optimizer, fp32_params)
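
Example #24 allocates one flat FP32 master copy of all FP16 parameters and lets the wrapped optimizer step on that buffer. A sketch of a single update with this layout in plain PyTorch; this is not the fairseq FP16Optimizer itself, and loss scaling plus gradient clipping are deliberately omitted:

import torch

# fp16 "model" parameters
fp16_params = [torch.nn.Parameter(torch.randn(6).half()),
               torch.nn.Parameter(torch.randn(4).half())]

# flat fp32 master copy, mirroring the example above
total = sum(p.data.numel() for p in fp16_params)
fp32_flat = fp16_params[0].data.new(total).float()
offset = 0
for p in fp16_params:
    n = p.data.numel()
    fp32_flat[offset:offset + n].copy_(p.data.view(-1))
    offset += n
fp32_flat = torch.nn.Parameter(fp32_flat)
fp32_flat.grad = fp32_flat.data.new(total).zero_()
fp32_optimizer = torch.optim.SGD([fp32_flat], lr=0.1)

# forward/backward on the fp16 params, then copy grads into the flat fp32 buffer
loss = sum((p.float() ** 2).sum() for p in fp16_params)
loss.backward()
offset = 0
for p in fp16_params:
    n = p.grad.numel()
    fp32_flat.grad[offset:offset + n].copy_(p.grad.view(-1))
    offset += n

# step on the fp32 master weights, then copy the result back into the fp16 params
fp32_optimizer.step()
offset = 0
for p in fp16_params:
    n = p.data.numel()
    p.data.copy_(fp32_flat.data[offset:offset + n].view_as(p.data))
    offset += n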
Example #25
    def build_model(self, args):
        self.main_model = super().build_model(args)

        if not self.baseline:
            from copy import deepcopy
            model_args = deepcopy(args)
            model_args.classifier_att_context_size = getattr(
                args, 'classifier_att_context_size', 512)
            model_args.classifier_num_layers = getattr(
                args, 'classifier_num_layers', 2)
            model_args.classifier_input_size = getattr(
                args, 'classifier_input_size', 768)
            model_args.classifier_hidden_size = getattr(
                args, 'classifier_hidden_size', 512)
            model_args.fp16 = True
            model_args.arch = 'lang_test'
            self.adversary_model = super().build_model(model_args)
            # self.adversary_model.half()
            # self.adversary_model.decoder.half()

            use_cuda = torch.cuda.is_available() and not self.args.cpu
            if use_cuda:
                self.adversary_model.cuda()
            from copy import deepcopy
            optim_args = deepcopy(self.args)
            optim_args.lr = [0.0001]
            optim_args.learning_rate = [0.0001]
            self._adversarial_optimizer = optim.build_optimizer(
                optim_args, self.adversary_model.decoder.parameters())

            optim_args = deepcopy(self.args)
            optim_args.lr = [0.00001]
            optim_args.learning_rate = [0.00001]
            self._wasserstein_optimizer = optim.build_optimizer(
                optim_args, self.main_model.encoder.parameters())

        return self.main_model
Example #26
 def _build_optimizer(self, model):
     if self.args.fp16:
         if torch.cuda.get_device_capability(0)[0] < 7:
             print(
                 "| WARNING: your device does NOT support faster training "
                 "with --fp16, please switch to FP32 which is likely to be"
                 " faster"
             )
         params = list(filter(lambda p: p.requires_grad, model.parameters()))
         self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
     else:
         if torch.cuda.get_device_capability(0)[0] >= 7:
             print("| NOTICE: your device may support faster training with --fp16")
         self._optimizer = optim.build_optimizer(self.args, model.parameters())
     return self._optimizer
Example #27
    def _build_optimizer(self):
        from itertools import chain
        if hasattr(self.args, 'encoder_layers'):
            params = get_decayed_param_groups(
                chain(self.model.named_parameters(),
                      self.criterion.named_parameters()),
                num_layers=self.args.encoder_layers,
                weight_decay=self.args.weight_decay,
                weight_decay_exclude=self.args.weight_decay_exclude,
                freeze_encoder=self.args.freeze_encoder,
                freeze_embedding=self.args.freeze_embedding,
                lr=float(self.args.lr[0]),
                lr_decay=float(self.args.lr_decay),
            )
        else:
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters()),
                ))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Example #28
    def train_step(self,
                   sample,
                   model,
                   criterion,
                   optimizer,
                   ignore_grad=False):
        """
        Do forward and backward, and return the loss as computed by *criterion*
        for the given *model* and *sample*.

        Args:
            sample (dict): the mini-batch. The format is defined by the
                :class:`~fairseq.data.FairseqDataset`.
            model (~fairseq.models.BaseFairseqModel): the model
            criterion (~fairseq.criterions.FairseqCriterion): the criterion
            optimizer (~fairseq.optim.FairseqOptimizer): the optimizer
            ignore_grad (bool): multiply loss by 0 if this is set to True

        Returns:
            tuple:
                - the loss
                - the sample size, which is used as the denominator for the
                  gradient
                - logging outputs to display while training
        """
        p = self.get_mask_rate()
        sample = self.process_sample(sample, p=p)

        if self.discriminator_optimizer is None:
            params = list(
                filter(lambda p: p.requires_grad,
                       criterion.discriminator.parameters()))
            self.discriminator_optimizer = optim.build_optimizer(
                self.args, params)

        if self._step_counter % self.update_discr_every == 0:
            discriminator_logging_output = self.train_discriminator(
                model, criterion.discriminator, ignore_grad)
        else:
            discriminator_logging_output = {"loss": -1.}

        loss, sample_size, generator_logging_output = self.generator_train_step(
            sample, model, criterion, optimizer)

        logging_output = self.merge_logging_outputs(
            generator_logging_output, discriminator_logging_output)
        self._step_counter += 1
        return loss, sample_size, logging_output
Example #29
    def _build_optimizer(self):
        # create FP32 copy of parameters and grads
        params = [p for p in self.model.parameters() if p.requires_grad]
        total_param_size = sum(p.data.numel() for p in params)
        self.fp32_params = params[0].new(0).float().new(total_param_size)
        offset = 0
        for p in params:
            numel = p.data.numel()
            self.fp32_params[offset:offset+numel].copy_(p.data.view(-1))
            offset += numel
        self.fp32_params = torch.nn.Parameter(self.fp32_params)
        self.fp32_params.grad = self.fp32_params.data.new(total_param_size)

        # create optimizer using the copied FP32 params
        self._optimizer = optim.build_optimizer(self.args, [self.fp32_params])
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
Example #30
    def _build_optimizer(self):

        param_groups = self.task.get_task_params(self.model, self.criterion)

        if (not hasattr(self.args, "lr_list")) or (self.args.lr_list is None):
            lr_list = [self.args.lr[0] for _ in param_groups]
        else:
            lr_list = [
                float(lr.strip()) for lr in self.args.lr_list.split(",")
            ]

        for params, curr_lr in zip(param_groups, lr_list):
            if self.args.fp16:
                if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                    print(
                        '| WARNING: your device does NOT support faster training with --fp16, '
                        'please switch to FP32 which is likely to be faster')
                if self.args.memory_efficient_fp16:
                    optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                        self.args, params)
                else:
                    optimizer = optim.FP16Optimizer.build_optimizer(
                        self.args, params)
            else:
                if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                    print(
                        '| NOTICE: your device may support faster training with --fp16'
                    )
                optimizer = optim.build_optimizer(self.args, params)

            if self.args.use_bmuf:
                optimizer = optim.FairseqBMUF(self.args, optimizer)

            self._optimizers.append(optimizer)

            # We should initialize the learning rate scheduler immediately after
            # building the optimizer, so that the initial learning rate is set.
            self.args.lr = [curr_lr]
            lrs = lr_scheduler.build_lr_scheduler(self.args, optimizer)
            lrs.step_update(0)
            self._lr_schedulers.append(lrs)

        self.args.lr = None

        self.set_current_optimizer()
Example #31
 def _build_optimizer(self):
     self._optimizer = optim.build_optimizer(self.args, self.model.parameters())
     self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self._optimizer)
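
For completeness, a hypothetical end-to-end driver for the pattern shared by all of these examples, assuming an argparse-based fairseq release with the adam optimizer registered. The Namespace fields below are assumptions that vary between fairseq versions, not a fixed API:

import argparse
import torch.nn as nn
from fairseq import optim

args = argparse.Namespace(
    optimizer='adam',        # key that optim.build_optimizer dispatches on
    lr=[0.0005],
    adam_betas='(0.9, 0.98)',
    adam_eps=1e-8,
    weight_decay=0.0,
)
model = nn.Linear(16, 16)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.build_optimizer(args, params)  # a FairseqOptimizer wrapper
print(type(optimizer).__name__, optimizer.get_lr())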