Code example #1
    def _build_optimizer(self):
        params = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
        # for n, p in list(self.model.named_parameters()):
        #     if p.requires_grad and n.startswith('encoder.bert'):
        #         print(n)
        #     else:
        #         print('=====%s',n)
        # params = [(n, p) for n, p in list(self.model.parameters()) if n.startswith('bert.model')]

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            if self.args.sep_optim:
                # bert_params = [(n, p) for n, p in list(self.model.named_parameters()) if n.startswith('encoder.bert')]
                # dec_params = [(n, p) for n, p in list(self.model.named_parameters()) if not n.startswith('encoder.bert')]
                bert_params = [
                    p for n, p in list(self.model.named_parameters())
                    if n.startswith('encoder.bert')
                ]
                dec_params = [
                    p for n, p in list(self.model.named_parameters())
                    if not n.startswith('encoder.bert')
                ]
                self._optimizer = optim.build_optimizer_bert(
                    self.args, bert_params)
                self._dec_optimizer = optim.build_optimizer_dec(
                    self.args, dec_params)
            else:
                self._optimizer = optim.build_optimizer(self.args, params)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        if self.args.sep_optim:
            self._lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.optimizer)
            self._dec_lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self._dec_optimizer, decoder=True)
        else:
            self._lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.optimizer)
Code example #2
    def load_checkpoint(self, filename, load_optim=True):
        """Load all training state from a checkpoint file."""
        extra_state, optim_history, last_optim_state = \
            utils.load_model_state(filename, self.get_model())

        if last_optim_state is not None:
            # rebuild optimizer after loading model, since params may have changed
            #self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
            self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

            if load_optim:
                self._optim_history = optim_history
                # only reload optimizer and lr_scheduler if they match
                last_optim = self._optim_history[-1]
                if last_optim['criterion_name'] == self.criterion.__class__.__name__:
                    self.lr_scheduler.load_state_dict(last_optim['lr_scheduler_state'])
                    if last_optim['optimizer_name'] == self.optimizer.__class__.__name__:
                        self.optimizer.load_state_dict(last_optim_state)

                self._num_updates = last_optim['num_updates']

        if self.args.amp and extra_state is not None and 'amp_state_dict' in extra_state:
            self.optimizer.optimizer._lazy_init_maybe_master_weights()
            self.optimizer.optimizer._amp_stash.lazy_init_called = True
            self.optimizer.optimizer.load_state_dict(last_optim_state)
            for param, saved_param in zip(amp.master_params(self.optimizer.optimizer), extra_state['amp_master_params']):
                param.data.copy_(saved_param.data)
 
            amp.load_state_dict(extra_state['amp_state_dict'])

        return extra_state
Code example #3
File: trainer.py Project: zsquaredz/XSum
    def __init__(self, args, model, criterion):

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')

        self.args = args

        # copy model and criterion to current device
        self.model = model.cuda()
        self.criterion = criterion.cuda()

        # initialize optimizer and LR scheduler
        self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

        # initialize meters
        self.meters = OrderedDict()
        self.meters['train_loss'] = AverageMeter()
        self.meters['train_nll_loss'] = AverageMeter()
        self.meters['valid_loss'] = AverageMeter()
        self.meters['valid_nll_loss'] = AverageMeter()
        self.meters['wps'] = TimeMeter()       # words per second
        self.meters['ups'] = TimeMeter()       # updates per second
        self.meters['wpb'] = AverageMeter()    # words per batch
        self.meters['bsz'] = AverageMeter()    # sentences per batch
        self.meters['gnorm'] = AverageMeter()  # gradient norm
        self.meters['clip'] = AverageMeter()   # % of updates clipped
        self.meters['oom'] = AverageMeter()    # out of memory

        self._max_bsz_seen = 0
        self._num_updates = 0
Code example #4
    def __init__(self, args, model):

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')

        self.args = args

        self.model = model.cuda()
        self.criterion = CRITERION_REGISTRY[args.criterion](args).cuda()
        self.optimizer = optim.build_optimizer(self.args, self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)

        if args.amp:
            model, optimizer = amp.initialize(
                    self.model,
                    self.optimizer._optimizer, 
                    opt_level=self.args.amp_level if self.args.amp_level else 'O2',
                    max_loss_scale=2**15,
                    cast_model_outputs=torch.float16
                    )

        if self.args.distributed_world_size > 1:
            self.model = DDP(model)

        self._buffered_stats = defaultdict(lambda: [])
        self._flat_grads = None
        self._num_updates = 0
        self._num_val_iterations = 0
        self._optim_history = None
        self.throughput_meter = TimeMeter()
Code example #5
    def load_checkpoint(self, filename):
        """Load all training state from a checkpoint file."""
        extra_state, self._optim_history, last_optim_state = utils.load_model_state(
            filename, self.model, cuda_device=torch.cuda.current_device())

        if last_optim_state is not None:
            # rebuild optimizer after loading model, since params may have changed
            self.optimizer = optim.build_optimizer(self.args,
                                                   self.model.parameters())
            self.lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.optimizer)

            # only reload optimizer and lr_scheduler if they match
            last_optim = self._optim_history[-1]
            if last_optim[
                    'criterion_name'] == self.criterion.__class__.__name__:
                self.lr_scheduler.load_state_dict(
                    last_optim['lr_scheduler_state'])
                if last_optim[
                        'optimizer_name'] == self.optimizer.__class__.__name__:
                    self.optimizer.load_state_dict(last_optim_state)

            self._num_updates = last_optim['num_updates']

        return extra_state
Code example #6
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print('| WARNING: your device does NOT support faster training with --fp16, '
                      'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print('| NOTICE: your device may support faster training with --fp16')
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Code example #7
File: composite.py Project: hfxunlp/fairseq-py
    def __init__(self, cfg: CompositeOptimizerConfig, params):
        super().__init__(cfg)

        assert (
            len(params) > 1
        ), "Composite optimizer only works when there are multiple parameter groups (try fp16_no_flatten_grads: true)"

        groupped_params = defaultdict(list)
        for p in params:
            group = getattr(p, "param_group", "default")
            groupped_params[group].append(p)

        assert groupped_params.keys() == cfg.groups.keys(), (
            f"Parameter groups {groupped_params.keys()} and optimizer groups {cfg.groups.keys()} are not the same! "
            "Try setting 'param_group' on your parameters in the model.")

        for group, group_params in groupped_params.items():
            group_cfg = cfg.groups[group]
            with open_dict(group_cfg):
                group_cfg.optimizer.lr = group_cfg.lr
                group_cfg.lr_scheduler.lr = group_cfg.lr
            self.optimizers[group] = _build_optimizer(group_cfg.optimizer,
                                                      group_params)
            if group_cfg.lr_scheduler is not None:
                self.lr_schedulers[group] = build_lr_scheduler(
                    group_cfg.lr_scheduler, self.optimizers[group])

        if len(self.lr_schedulers) > 0:
            assert len(self.lr_schedulers) == len(self.optimizers), (
                f"Please provide an lr scheduler for each optimizer to use pass_through scheduler. "
                f"Optimizers: {self.optimizers}; Lr scheds: {self.lr_schedulers}"
            )
            self.lr_scheduler = CompositeLRScheduler(self.lr_schedulers)

        self._optimizer = CompositeOptimizer(self.optimizers)
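Note: the composite optimizer above routes each parameter via getattr(p, "param_group", "default"), and its assert message suggests setting 'param_group' on the parameters in the model. A minimal sketch, not taken from the project, of how a model might tag its parameters (the module names here are illustrative):

    import torch.nn as nn

    class TaggedModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.encoder = nn.Linear(512, 512)
            self.decoder = nn.Linear(512, 512)
            # nn.Parameter objects accept arbitrary Python attributes, so tagging
            # is enough for the composite optimizer to group them; untagged
            # parameters fall back to the "default" group.
            for p in self.encoder.parameters():
                p.param_group = "encoder"
            for p in self.decoder.parameters():
                p.param_group = "decoder"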
Code example #8
    def build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                self.model.parameters(),
            ))

        if self.args.fp16:
            self.args.fp16_scale_window = 2**14 / self.args.world_size / self.args.gradient_accumulation_steps
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self.optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self.optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self.optimizer = optim.build_optimizer(self.args, params)

        self.lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self.lr_scheduler.step_update(0)
Code example #9
File: trainer_dtn.py Project: wangyong1122/dtn
    def _build_optimizer(self):

        params = list(
            filter(lambda p: p.requires_grad, self.model.parameters()))
        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        print('| num. model params: {} (num. optimized: {})'.format(
            sum(p.numel() for p in self.model.parameters()),
            sum(p.numel() for p in self._optimizer.params if p.requires_grad),
        ))
        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
Code example #10
File: trainer.py Project: lianqing01/transformer-cbn
    def _build_optimizer(self):
        if self.args.optimizer != 'adam_cbn':
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters()),
                ))
        else:
            # selection
            from fairseq.modules.norms.constraint_bn_v2 import Constraint_Lagrangian
            constraint_param = []
            for m in self.model.modules():
                if isinstance(m, Constraint_Lagrangian):
                    constraint_param.extend(list(map(id, m.parameters())))
            params_lag = list(
                filter(lambda p: id(p) in constraint_param,
                       chain(self.model.parameters())))
            params = list(
                filter(
                    lambda p: id(p) not in constraint_param and p.
                    requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters())))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            # check cbn
            if self.args.optimizer != 'adam_cbn':
                self._optimizer = optim.build_optimizer(self.args, params)
            else:
                self._optimizer = optim.build_optimizer(
                    self.args, params, params_lag)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Code example #11
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )

        if self.cfg.common.fp16 or self.cfg.common.bf16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster"
                )
            if (
                self.cfg.common.memory_efficient_fp16
                or self.cfg.common.memory_efficient_bf16
            ):
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.cfg, params
                )
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info("NOTE: your device may support faster training with --fp16")
            self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)

        if self.cfg.optimization.use_bmuf:
            self._optimizer = optim.FairseqBMUF(
                self.cfg.bmuf,
                self._optimizer,
            )

        if self.cfg.distributed_training.zero_sharding == "os":
            if (
                self.cfg.common.fp16
                and not self.cfg.common.memory_efficient_fp16
                and not self.cfg.common.memory_efficient_bf16
            ) and not self.cfg.common.fp16_no_flatten_grads:
                raise ValueError(
                    "ZeRO is incomptabile with fp16 and flattened grads. "
                    "Please use --fp16-no-flatten-grads"
                )
            else:
                optim.shard_(self._optimizer, self.data_parallel_process_group)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.cfg.lr_scheduler,
            self.optimizer,
        )
        self._lr_scheduler.step_update(0)
Code example #12
    def _build_optimizer(self):
        # params = list(
        #     filter(
        #         lambda p: p.requires_grad,
        #         chain(self.model.parameters(), self.criterion.parameters()),
        #     )
        # )
        params_dict = {}
        _default_manifold = Euclidean()
        for name, p in chain(self.model.named_parameters(),
                             self.criterion.named_parameters()):
            if not p.requires_grad:
                continue
            if isinstance(p, (ManifoldParameter, ManifoldTensor)):
                _manifold = p.manifold
            else:
                _manifold = _default_manifold
            _manifold_name = _manifold.__class__.__name__
            if not _manifold_name in params_dict:
                ref_grad = _manifold.egrad2rgrad(p.new_zeros(1), p.new_ones(1))
                coef = 1 if ref_grad == 1 else 1
                #print(f"lr={self.args.lr}, ref={ref_grad.item()}")
                params_dict[_manifold_name] = dict(
                    params=[],
                    lr_rectifier=ref_grad.reciprocal().item() * coef)
            params_dict[_manifold_name]['params'].append(p)
        params = params_dict.values()

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Code example #13
    def _build_optimizer(self):
        if self.ulmfit:
            params = []
            multiplier_map = []
            for n, p in self.model.named_parameters():
                if p.requires_grad:
                    params.append(p)

                    param_name_split = n.split('.')

                    if param_name_split[2] == 'lm_head':  # last layer
                        multiplier = 1.
                    elif param_name_split[4].isdigit():  # encoder layer
                        layer = int(param_name_split[4])

                        multiplier = self.decay_rate_lrc**-(self.num_layers -
                                                            layer)
                    else:  # first layer
                        multiplier = self.decay_rate_lrc**-(self.num_layers +
                                                            1)

                    multiplier_map.append(multiplier)
        else:
            params = list(
                filter(lambda p: p.requires_grad, self.model.parameters()))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, params,
                                                self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
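Note: the ULMFiT branch in code example #13 builds multiplier_map, one layer-dependent learning-rate multiplier per trainable parameter, but the snippet does not show where it is consumed. A minimal sketch of how such multipliers are commonly applied, assuming plain per-parameter groups in a torch.optim optimizer rather than the project's actual wiring:

    import torch

    def build_param_groups(params, multipliers, base_lr):
        # One group per parameter so each gets its own decayed learning rate.
        return [{"params": [p], "lr": base_lr * m}
                for p, m in zip(params, multipliers)]

    # e.g. (hypothetical): torch.optim.Adam(build_param_groups(params, multiplier_map, base_lr=1e-4))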
Code example #14
File: trainer.py Project: roeeaharoni/fairseq
    def _build_optimizer(self):
        if self.args.fp16:
            if torch.cuda.get_device_capability(0)[0] < 7:
                print('| WARNING: your device does NOT support faster training with --fp16, '
                      'please switch to FP32 which is likely to be faster')
            params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
            self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if torch.cuda.get_device_capability(0)[0] >= 7:
                print('| NOTICE: your device may support faster training with --fp16')
            self._optimizer = optim.build_optimizer(self.args, self.model.parameters())

        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self._optimizer)
Code example #15
    def _build_optimizer(self):
        if self.freeze_bart:
            for name, param in self.model.named_parameters():
                if name.startswith('encoder') and name not in [
                        "encoder.structure_att.exparam",
                        "encoder.structure_att.tp_linear.weight",
                        "encoder.structure_att.tp_linear.bias",
                        "encoder.structure_att.tc_linear.weight",
                        "encoder.structure_att.tc_linear.bias",
                        "encoder.structure_att.fi_linear.weight",
                        "encoder.structure_att.bilinear._weight_matrix",
                        "encoder.structure_att.bilinear._bias",
                        "encoder.structure_att.fzlinear.weight",
                        "encoder.structure_att.fzlinear.bias",
                        "encoder.str_to_enc_linear.weight",
                        "encoder.str_to_enc_linear.bias"
                ]:
                    param.requires_grad = False
            print("Freezing parameters")
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Code example #16
File: trainer.py Project: ecchochan/fairseq
    def _build_optimizer(self):
        from itertools import chain
        if hasattr(self.args, 'encoder_layers'):
            params = get_decayed_param_groups(
                chain(self.model.named_parameters(),
                      self.criterion.named_parameters()),
                num_layers=self.args.encoder_layers,
                weight_decay=self.args.weight_decay,
                weight_decay_exclude=self.args.weight_decay_exclude,
                freeze_encoder=self.args.freeze_encoder,
                freeze_embedding=self.args.freeze_embedding,
                lr=float(self.args.lr[0]),
                lr_decay=float(self.args.lr_decay),
            )
        else:
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters()),
                ))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Code example #17
File: fp16_trainer.py Project: yxlin1/fairseq
    def _build_optimizer(self):
        # create FP32 copy of parameters and grads
        params = [p for p in self.model.parameters() if p.requires_grad]
        total_param_size = sum(p.data.numel() for p in params)
        self.fp32_params = params[0].new(0).float().new(total_param_size)
        offset = 0
        for p in params:
            numel = p.data.numel()
            self.fp32_params[offset:offset+numel].copy_(p.data.view(-1))
            offset += numel
        self.fp32_params = torch.nn.Parameter(self.fp32_params)
        self.fp32_params.grad = self.fp32_params.data.new(total_param_size)

        # create optimizer using the copied FP32 params
        self._optimizer = optim.build_optimizer(self.args, [self.fp32_params])
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
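Note: code example #17 keeps a single flat FP32 master copy of the FP16 model parameters and hands only that tensor to the optimizer. A minimal sketch, not from the project, of the per-update flow such a setup implies: copy the FP16 gradients into the flat FP32 grad buffer, step in FP32, then copy the updated values back into the model.

    import torch

    def fp32_master_step(model, fp32_params, optimizer, grad_scale=1.0):
        # Gather the model's (FP16) gradients into the flat FP32 grad buffer.
        offset = 0
        for p in (q for q in model.parameters() if q.requires_grad):
            numel = p.data.numel()
            grad = p.grad.data if p.grad is not None else torch.zeros_like(p.data)
            fp32_params.grad.data[offset:offset + numel].copy_(grad.view(-1))
            offset += numel
        fp32_params.grad.data.div_(grad_scale)
        optimizer.step()
        # Copy the updated FP32 values back into the model parameters.
        offset = 0
        for p in (q for q in model.parameters() if q.requires_grad):
            numel = p.data.numel()
            p.data.copy_(fp32_params.data[offset:offset + numel].view_as(p.data))
            offset += numel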
Code example #18
    def _build_optimizer(self):

        param_groups = self.task.get_task_params(self.model, self.criterion)

        if (not hasattr(self.args, "lr_list")) or (self.args.lr_list is None):
            lr_list = [self.args.lr[0] for _ in param_groups]
        else:
            lr_list = [
                float(lr.strip()) for lr in self.args.lr_list.split(",")
            ]

        for params, curr_lr in zip(param_groups, lr_list):
            if self.args.fp16:
                if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                    print(
                        '| WARNING: your device does NOT support faster training with --fp16, '
                        'please switch to FP32 which is likely to be faster')
                if self.args.memory_efficient_fp16:
                    optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                        self.args, params)
                else:
                    optimizer = optim.FP16Optimizer.build_optimizer(
                        self.args, params)
            else:
                if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                    print(
                        '| NOTICE: your device may support faster training with --fp16'
                    )
                optimizer = optim.build_optimizer(self.args, params)

            if self.args.use_bmuf:
                optimizer = optim.FairseqBMUF(self.args, optimizer)

            self._optimizers.append(optimizer)

            # We should initialize the learning rate scheduler immediately after
            # building the optimizer, so that the initial learning rate is set.
            self.args.lr = [curr_lr]
            lrs = lr_scheduler.build_lr_scheduler(self.args, optimizer)
            lrs.step_update(0)
            self._lr_schedulers.append(lrs)

        self.args.lr = None

        self.set_current_optimizer()
Code example #19
    def _build_optimizer(self):
        params_nmt = self.model.get_nmt_parameters()

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                raise NotImplementedError
            else:
                self._optimizer_nmt = optim.FP16Optimizer.build_optimizer(
                    self.args, params_nmt)
                self._optimizer_adv = optim.FP16Optimizer.build_optimizer(
                    self.args, self.model.get_adv_parameters())

        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer_nmt = optim.build_optimizer(self.args, params_nmt)

            self._optimizer_adv = optim.build_optimizer(
                self.args, self.model.get_adv_parameters())

        self._optimizer = {}
        self._optimizer['nmt'] = self._optimizer_nmt
        self._optimizer['adv'] = self._optimizer_adv

        print(
            '| num. model params: {} (num. optimized: {} ( nmt: {}, adv classifier: {} ) )'
            .format(
                sum(p.numel() for p in self.model.parameters()),
                sum(p.numel() for optim in self._optimizer.values()
                    for p in optim.params if p.requires_grad),
                sum(p.numel() for p in self._optimizer['nmt'].params
                    if p.requires_grad),
                sum(p.numel() for p in self._optimizer['adv'].params
                    if p.requires_grad)))
        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer['nmt'])
Code example #20
    def __init__(self, args, model):

        if not torch.cuda.is_available():
            raise NotImplementedError('Training on CPU is not supported')

        self.args = args

        self.model = model.cuda()
        self.criterion = CRITERION_REGISTRY[args.criterion](args).cuda()
        self.optimizer = optim.build_optimizer(self.args,
                                               self.model.parameters())
        self.lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self.scaler = amp.GradScaler(enabled=self.args.amp, init_scale=2**15)

        if self.args.distributed_world_size > 1:
            self.model = DDP(model)

        self._buffered_stats = defaultdict(lambda: [])
        self._num_updates = 0
        self._optim_history = None
        self.throughput_meter = TimeMeter()
        self.avg_loss_meter = AverageMeter()
Code example #21
File: trainer.py Project: ffzhai-2019/fairseq
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                ## chain(): takes a list of iterables as input and returns an iterator that yields the contents of each in sequence
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )  ## Collect the list of parameters to optimize. It comes from nn.Module.parameters(), which recursively walks all of the Module's submodules and their torch.nn.Parameter objects; these submodules and parameters are registered by type when they are assigned as Module attributes

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    "| WARNING: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    "| NOTICE: your device may support faster training with --fp16"
                )
            # build_optimizer, constructed in optim/__init__.py, dispatches on the choice given by
            # args.optimizer and calls the corresponding optimizer's constructor directly;
            # e.g. if args.optimizer is set to adam, build_optimizer here is the FairseqAdam constructor
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Code example #22
File: trainer.py Project: ecwu/bert-fairseq
    def _build_optimizer(self):
        # params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
        model = self.model
        no_decay = [
            'bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight'
        ]
        params = [
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay) and (
                        'bert' in n or 'embedding_token' in n)
                ],
                'weight_decay':
                0.01,
                'lr_scale':
                self.args.encoder_lr_scale
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay) and (
                        'bert' not in n and 'embedding_token' not in n)
                ],
                'weight_decay':
                0.01,
                'lr_scale':
                self.args.decoder_lr_scale
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay) and (
                        'bert' in n or 'embedding_token' in n)
                ],
                'weight_decay':
                0.0,
                'lr_scale':
                self.args.encoder_lr_scale
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay) and (
                        'bert' not in n and 'embedding_token' not in n)
                ],
                'weight_decay':
                0.0,
                'lr_scale':
                self.args.decoder_lr_scale
            },
        ]
        # params = [{"params":[p for n, p in model.named_parameters()], "lr_scale":1}]
        # params = [p for n, p in model.named_parameters() if p.require_grad]
        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
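Note: the parameter groups in code example #22 attach a custom lr_scale key next to weight_decay, but the snippet does not show the optimizer logic that reads it. A minimal sketch, assuming a torch.optim-style optimizer whose param_groups preserve extra keys (the helper name is hypothetical, not the project's API):

    def apply_lr_scale(optimizer, base_lr):
        # Scale each group's learning rate by its 'lr_scale' entry, if present.
        for group in optimizer.param_groups:
            group["lr"] = base_lr * group.get("lr_scale", 1.0)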
Code example #23
File: bt_translation.py Project: zjpbinary/multiDDS
    def build_model(self, args):
        from fairseq import models
        copied_lang_pairs = [p for p in self.lang_pairs]
        for lang_pair in copied_lang_pairs:
            src, tgt = lang_pair.split('-')
            key = '{}-{}'.format(tgt, src)
            self.lang_pairs.append(key)
        model = models.build_model(args, self)
        self.lang_pairs = copied_lang_pairs
        if not isinstance(model, FairseqMultiModel):
            raise ValueError(
                'SemisupervisedTranslationTask requires a FairseqMultiModel architecture'
            )
        if self.args.bt_dds:
            # set up data actor finetune optimizer
            bt_params = []
            for lang_pair in self.lang_pairs:
                bt_lang_pair = _get_dds_bt_key(lang_pair)
                for p in model.models[bt_lang_pair].parameters():
                    if p.requires_grad: bt_params.append(p)
            if self.args.bt_optimizer == "SGD":
                self.data_optimizer = torch.optim.SGD(
                    bt_params,
                    lr=self.args.data_actor_lr[0],
                    momentum=self.args.bt_optimizer_momentum,
                    nesterov=self.args.bt_optimizer_nesterov)
                #t_optim = self.args.optimizer
                #self.args.optimizer = "data_sgd"
                #self.data_optimizer = build_optimizer(self.args, bt_params)
                #self.args.optimizer = t_optim
            elif self.args.bt_optimizer == "ASGD":
                self.data_optimizer = torch.optim.ASGD(
                    bt_params, lr=self.args.data_actor_lr[0])
            if self.args.data_lr_scheduler is not None:
                print("Building lr scheduler {} for BT model...".format(
                    self.args.data_lr_scheduler))
                t_scheduler = self.args.lr_scheduler
                self.args.lr_scheduler = self.args.data_lr_scheduler
                self.data_lr_scheduler = lr_scheduler.build_lr_scheduler(
                    self.args, self.data_optimizer)
                self.data_lr_scheduler.step_update(0)
                self.args.lr_scheduler = t_scheduler
            else:
                self.data_lr_scheduler = None
            self.step = 0
        # create SequenceGenerator for each model that has backtranslation dependency on it
        self.sequence_generators = {}
        #if (self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None) and self.training:
        if self.training:
            for lang_pair in self.lang_pairs:
                src, tgt = lang_pair.split('-')
                key = '{}-{}'.format(tgt, src)
                self.sequence_generators[key] = SequenceGenerator(
                    tgt_dict=self.dicts[src],
                    beam_size=args.bt_beam_size,
                    max_len_a=args.bt_max_len_a,
                    max_len_b=args.bt_max_len_b,
                    sampling=self.args.sampling,
                    sampling_topk=self.args.sampling_topk,
                    temperature=self.args.temperature,
                )
                decoder_lang_tok_idx = self.get_decoder_langtok(src)

                def backtranslate_fn(
                    sample,
                    model=model.models[key],
                    bos_token=decoder_lang_tok_idx,
                    sequence_generator=self.sequence_generators[key],
                ):
                    return sequence_generator.generate(
                        [model],
                        sample,
                        bos_token=bos_token,
                    )

                self.backtranslators[lang_pair] = backtranslate_fn

        return model
Code example #24
File: trainer.py Project: xssstory/STAS
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )

        if getattr(self.args, 'multiple_lr', False):
            assert self.args.lr_scheduler == 'multi_lr_inverse_sqrt', 'only multi_lr_inverse_sqrt supports multiple_lr now'
            assert len(self.args.lr) == 3, 'Three learning rates for roberta, sents_encoder and decoder should be provided'
            named_params = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
            encoder_params = [(n, p) for n, p in self.model.encoder.named_parameters() if p.requires_grad]
            decoder_params = [(n, p) for n, p in self.model.decoder.named_parameters() if p.requires_grad]
            if hasattr(self.model, 'decoder_perm'):
                decoder_params += [(n, p) for n, p in self.model.decoder_perm.named_parameters() if p.requires_grad]
                
            # params = [
            #     {'params': [p for n, p in named_params if 'roberta' in n and (n.startswith('encoder') or n.startswith('module.encoder'))]},
            #     {'params': [p for n, p in named_params if not 'roberta' in n and (n.startswith('encoder') or n.startswith('module.encoder'))]},
            #     {'params': [p for n, p in named_params if n.startswith('decoder') or n.startswith('module.decoder')]}
            # ]
            params = [
                {'params': [p for n, p in encoder_params if 'roberta' in n]},
                {'params': [p for n, p in encoder_params if not 'roberta' in n]},
                {'params': [p for n, p in decoder_params]}  
            ]
            assert len(named_params) == sum([len(p['params']) for p in params]), named_params


        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    "| WARNING: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster"
                )
            if self.args.memory_efficient_fp16:
                if getattr(self.args, 'multiple_lr', False):
                    self._optimizer = optim.ConcatOptimizer(self.args, [ optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, param['params']) for param in params])
                else:
                    self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                        self.args, params
                    )
            else:
                if getattr(self.args, 'multiple_lr', False):
                    self._optimizer = optim.ConcatOptimizer(self.args, [optim.FP16Optimizer.build_optimizer(self.args, param['params']) for param in params])
                else:
                    self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print("| NOTICE: your device may support faster training with --fp16")
            if getattr(self.args, 'multiple_lr', False):
                self._optimizer = optim.ConcatOptimizer(self.args, [optim.build_optimizer(self.args, param['params']) for param in params])
            else:
                self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Code example #25
File: trainer.py Project: kevinlin311tw/fairseq
 def _build_optimizer(self):
     self._optimizer = optim.build_optimizer(self.args,
                                             self.model.parameters())
     self.lr_scheduler = lr_scheduler.build_lr_scheduler(
         self.args, self._optimizer)
Code example #26
    def _build_optimizer(self):
        #if not self.args.balance:
        '''
        count = 1
        for name, param in chain(self.model.encoder.named_parameters(), self.model.decoder.named_parameters(), self.criterion.named_parameters()):
            if param.requires_grad:
                print(count, name)
                count += 1

        print("------")
        for name, param in chain(self.model.section_positions.named_parameters(), self.model.section_layernorm_embedding.named_parameters(), self.model.section.named_parameters(),
                self.model.w_context_vector.named_parameters(), self.model.w_proj.named_parameters()):
            if param.requires_grad:
                print(count, name)
                count += 1
        '''

        #print(len(self.model.named_parameters()))
        #print(list(
        #   filter(
        #        lambda p: p.requires_grad,
        #       chain(self.model.encoder.parameters(), self.model.decoder.parameters()),
        #    )
        #))

        #params = list(
        #    filter(
        #        lambda p: p.requires_grad,
        #        chain(self.model.parameters(), self.criterion.parameters()),
        #    )
        #)

        #print("Total: ")
        #print(len(params))
        '''
        base_params = list(map(id, chain(self.model.encoder.parameters(), self.model.decoder.parameters())))
        logits_params = filter(lambda p: id(p) not in base_params and p.requires_grad, self.model.parameters())

        base_params_id = list(map(id, self.model.section_positions.parameters())) + list(map(id,net.bn1.parameters()))+\
	list(map(id,net.layer1.parameters())) + list(map(id,net.layer2.parameters())) \
		+ list(map(id,net.layer3.parameters())) + list(map(id,net.layer4.parameters()))

        new_params = filter(lambda p: id(p) not in base_params_id , net.parameters())
        base_params = filter(lambda p: id(p) in base_params_id, net.parameters())

        
        
        '''
        new_params_id = list(map(id, self.model.section_positions.parameters())) + list(map(id,self.model.section_layernorm_embedding.parameters()))+\
 list(map(id,self.model.section.parameters())) + list(map(id, self.model.w_proj.parameters())) \
  + list(map(id,self.model.w_context_vector.parameters())) + list(map(id,self.model.w_proj_layer_norm.parameters()))

        base_params = list(
            filter(lambda p: id(p) not in new_params_id and p.requires_grad,
                   self.model.parameters()))

        print("group1: ")
        print(len(base_params))

        new_params = list(
            filter(lambda p: id(p) in new_params_id and p.requires_grad,
                   self.model.parameters()))
        print("group2: ")
        print(len(new_params))

        params = [
            {
                "params": base_params
            },
            {
                "params": new_params
            },
        ]
        # "weight_decay": 0.01

        params2 = None
        '''
        if self.args.balance:
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.encoder.parameters(), self.model.decoder.parameters(), self.criterion.parameters()),
                )
            )

            params2 = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.w_proj.parameters(), self.model.w_context_vector.parameters(), self.model.section_positions.parameters(), 
                    self.model.section_layernorm_embedding.parameters()),
                )
            )
        '''

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

            self._optimizer2 = None
            if self.args.balance and params2 is not None:
                self._optimizer2 = optim.build_optimizer(self.args, params2)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer, self._optimizer2)
        self._lr_scheduler.step_update(0)
Code example #27
File: trainer.py Project: vyraun/Frequency-Agnostic
 def _build_optimizer(self):
     self._optimizer = optim.build_optimizer(self.args, self.model)
     self.lr_scheduler = lr_scheduler.build_lr_scheduler(
         self.args, self._optimizer)
Code example #28
File: trainer.py Project: fyabc/fairseq
 def _build_optimizer(self):
     self._optimizer = optim.build_optimizer(self.args, self.model.parameters())
     self.lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self._optimizer)
Code example #29
def main():
    global args, best_prec1
    args = parser.parse_args()
    args.data = args.data_dir

    os.environ["CUDA_VISIBLE_DEVICES"] = f"{args.local_rank}"

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load dataset splits
    load_dataset_splits(task, ['train', 'valid'])

    # Build criterion
    criterion = task.build_criterion(args)

    # create stages of the model
    module = importlib.import_module(args.module)
    args.arch = module.arch()
    model = module.model(criterion)

    max_positions = (args.max_source_positions, args.max_target_positions)
    dummy_batch = task.dataset('train').get_dummy_batch(
        args.max_tokens, max_positions)
    inputs = dummy_batch['net_input']
    input0 = inputs['src_tokens']
    input1 = inputs['prev_output_tokens']
    target = dummy_batch['target']

    training_tensor_shapes = {
        "input0": list(input0.size()),
        "input1": list(input1.size()),
        "target": list(target.size()),
        "ntokens": [1]
    }
    dtypes = {
        "input0": input0.dtype,
        "input1": input1.dtype,
        "target": target.dtype,
        "ntokens": torch.float32
    }
    inputs_module_destinations = {"input0": 0, "input1": 0}
    target_tensor_names = {"target", "ntokens"}
    for module_id, (stage, inputs, outputs) in enumerate(
            model[:-1]):  # Skip last layer (loss).
        input_tensors = []
        for module_input in inputs:
            if module_input in inputs_module_destinations:
                inputs_module_destinations[module_input] = module_id

            input_tensor = torch.ones(tuple(
                training_tensor_shapes[module_input]),
                                      dtype=dtypes[module_input]).cuda()
            input_tensors.append(input_tensor)
        stage.cuda()
        # PyTorch should not maintain metadata for a backward pass on
        # synthetic inputs. Without the following line, the runtime is
        # as much as 1.5x slower in a full DP configuration.
        with torch.no_grad():
            output_tensors = stage(*tuple(input_tensors))
        if not type(output_tensors) is tuple:
            output_tensors = [output_tensors]
        for output, output_tensor in zip(outputs, list(output_tensors)):
            training_tensor_shapes[output] = list(output_tensor.size())
            dtypes[output] = output_tensor.dtype

    eval_tensor_shapes = {}
    for key in training_tensor_shapes:
        eval_tensor_shapes[key] = tuple(training_tensor_shapes[key])
        training_tensor_shapes[key] = tuple(training_tensor_shapes[key])

    configuration_maps = {
        'module_to_stage_map': None,
        'stage_to_rank_map': None,
        'stage_to_depth_map': None
    }
    if args.config_path is not None:
        json_config_file = json.load(open(args.config_path, 'r'))
        configuration_maps['module_to_stage_map'] = json_config_file.get(
            "module_to_stage_map", None)
        configuration_maps['stage_to_rank_map'] = json_config_file.get(
            "stage_to_rank_map", None)
        configuration_maps['stage_to_rank_map'] = {
            int(k): v
            for (k, v) in configuration_maps['stage_to_rank_map'].items()
        }
        configuration_maps['stage_to_depth_map'] = json_config_file.get(
            "stage_to_depth_map", None)

    r = runtime.StageRuntime(
        model=model,
        distributed_backend=args.distributed_backend,
        fp16=args.fp16,
        loss_scale=args.loss_scale,
        training_tensor_shapes=training_tensor_shapes,
        eval_tensor_shapes=eval_tensor_shapes,
        training_tensor_dtypes=dtypes,
        inputs_module_destinations=inputs_module_destinations,
        target_tensor_names=target_tensor_names,
        configuration_maps=configuration_maps,
        master_addr=args.master_addr,
        rank=args.rank,
        local_rank=args.local_rank,
        num_ranks_in_server=args.num_ranks_in_server,
        verbose_freq=args.verbose_frequency,
        model_type=runtime.TRANSLATION,
        enable_recompute=args.recompute)

    # stage needed to determine if current stage is the first stage
    # num_stages needed to determine if current stage is the last stage
    # num_ranks needed to determine number of warmup_minibatches in case of pipelining
    args.stage = r.stage
    args.num_stages = r.num_stages
    args.num_ranks = r.num_ranks
    if not is_first_stage():
        args.synthetic_data = True

    # define optimizer
    if args.no_input_pipelining:
        num_versions = 1
    else:
        # number of versions is the total number of machines following the current
        # stage, shared amongst all replicas in this stage
        num_versions = r.num_warmup_minibatches + 1

    # if specified, resume from checkpoint
    if args.resume:
        checkpoint_file_path = os.path.join(
            args.checkpoint_dir,
            f"checkpoint.{r.stage}.pth.tar.epoch.{args.start_epoch}")
        assert os.path.isfile(checkpoint_file_path)
        print("=> loading checkpoint '{}'".format(checkpoint_file_path))
        checkpoint = torch.load(checkpoint_file_path)
        args.start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        r.load_state_dict(checkpoint['state_dict'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file_path, checkpoint['epoch']))

    # TODO: make this configurable by args
    use_adam_optimizer = True
    if use_adam_optimizer:
        optimizer = adam.Adam(r.master_parameters,
                              lr=args.lr,
                              betas=(0.9, 0.98),
                              weight_decay=args.weight_decay)
    else:
        optimizer = sgd.SGD(r.master_parameters,
                            lr=args.lr,
                            momentum=args.momentum,
                            weight_decay=args.weight_decay)
    scheduler = lr_scheduler.build_lr_scheduler(args, optimizer)

    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    # epoch_itr = data.EpochBatchIterator(
    #     dataset=task.dataset(args.train_subset),
    #     max_tokens=args.max_tokens,
    #     max_sentences=args.max_sentences_valid,
    #     max_positions=max_positions,
    #     ignore_invalid_inputs=True,
    #     required_batch_size_multiple=8,
    #     seed=1,
    #     num_shards=1,
    #     shard_id=0,
    # )

    def epoch_itr():
        return task.dataset('train').get_dummy_batch(args.max_tokens,
                                                     max_positions)

    distributed_sampler = False
    if configuration_maps['stage_to_rank_map'] is not None:
        num_ranks_in_first_stage = len(
            configuration_maps['stage_to_rank_map'][0])
        if num_ranks_in_first_stage > 1:
            distributed_sampler = True

    for epoch in range(args.start_epoch, args.epochs):
        if distributed_sampler:
            train_loader.sampler.set_epoch(epoch)

        # train or run forward pass only for one epoch
        if args.forward_only:
            validate(val_loader, r, epoch)
        else:
            train(epoch_itr, r, optimizer, epoch, scheduler)

            # evaluate on validation set
            # prec1 = validate(val_loader, r, epoch)
            prec1 = 0
            if r.stage != r.num_stages: prec1 = 0

            # remember best prec@1 and save checkpoint
            best_prec1 = max(prec1, best_prec1)

            should_save_checkpoint = args.checkpoint_dir_not_nfs or r.rank_in_stage == 0
            if args.checkpoint_dir and should_save_checkpoint:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': r.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict()
                    }, args.checkpoint_dir, r.stage, epoch)
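One detail worth isolating from the example above is the shape-probing loop: dummy tensors are pushed through each stage once, only to record output shapes and dtypes, and the in-code comment notes that wrapping this in torch.no_grad() avoids building autograd metadata that would otherwise slow the runtime down. A small self-contained sketch of that pattern follows; the two nn.Sequential stages and the tensor names are hypothetical, not taken from the project.

import torch
from torch import nn

# Two toy pipeline stages; only their input/output shapes matter here.
stages = [nn.Sequential(nn.Linear(32, 64), nn.ReLU()),
          nn.Sequential(nn.Linear(64, 8))]

tensor_shapes = {"input0": [4, 32]}   # dummy batch shape for the first stage
dtypes = {"input0": torch.float32}

current_name = "input0"
for stage_id, stage in enumerate(stages):
    dummy = torch.ones(tuple(tensor_shapes[current_name]),
                       dtype=dtypes[current_name])
    # The synthetic forward pass needs no autograd metadata, so run it
    # under no_grad, mirroring the comment in the example above.
    with torch.no_grad():
        out = stage(dummy)
    current_name = f"stage{stage_id}_out"
    tensor_shapes[current_name] = list(out.size())
    dtypes[current_name] = out.dtype

print(tensor_shapes)  # {'input0': [4, 32], 'stage0_out': [4, 64], 'stage1_out': [4, 8]}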
Code Example #30
File: trainer.py  Project: degerli/fairseq
    def lr_scheduler(self):
        if self._lr_scheduler is None:
            self._lr_scheduler = lr_scheduler.build_lr_scheduler(
                self.args, self.optimizer)
        return self._lr_scheduler
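Examples #27, #28, and #30 follow the same lazy-initialization idea: the optimizer and scheduler are created only when first needed, and the scheduler is always derived from the freshly built optimizer. A minimal sketch of that pattern outside fairseq is shown below; the MiniTrainer class, its use of @property, and the StepLR choice are illustrative assumptions.

import torch
from torch import nn

class MiniTrainer:
    """Builds the optimizer and LR scheduler lazily, on first access."""

    def __init__(self, model: nn.Module, lr: float = 1e-3):
        self.model = model
        self.lr = lr
        self._optimizer = None
        self._lr_scheduler = None

    @property
    def optimizer(self):
        if self._optimizer is None:
            self._optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        return self._optimizer

    @property
    def lr_scheduler(self):
        if self._lr_scheduler is None:
            # Built right after (and from) the optimizer, mirroring the examples above.
            self._lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=10)
        return self._lr_scheduler

trainer = MiniTrainer(nn.Linear(8, 8))
_ = trainer.lr_scheduler  # first access builds both the optimizer and the scheduler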
Code Example #31
File: j_trainer.py  Project: qinger521/LA-MT-COURSE
    def _build_optimizer(self):
        # params = list(filter(lambda p: p.requires_grad, self.model.parameters()))
        params = []
        id_params = []
        # get each encoder_layer
        #params.append({"params":torch.nn.ModuleList([self.model.encoder.]).parameters()})
        en_embed_tokens = torch.nn.ModuleList(
            [self.model.encoder.embed_tokens]).parameters()
        en_embed_positions = torch.nn.ModuleList(
            [self.model.encoder.embed_positions]).parameters()
        en_layer_norm = torch.nn.ModuleList([self.model.encoder.layer_norm
                                             ]).parameters()

        de_embed_tokens = torch.nn.ModuleList(
            [self.model.decoder.embed_tokens]).parameters()
        de_embed_positions = torch.nn.ModuleList(
            [self.model.decoder.embed_positions]).parameters()
        de_layer_norm = torch.nn.ModuleList([self.model.decoder.layer_norm
                                             ]).parameters()

        other_join = []
        for group in (en_embed_tokens, en_embed_positions, en_layer_norm,
                      de_embed_tokens, de_embed_positions, de_layer_norm):
            other_join.extend(group)
        params.append({"params": other_join})

        for i in range(self.encoder_layer_num):
            encoder = torch.nn.ModuleList([self.model.encoder.layers[i]
                                           ]).parameters()
            params.append({"params": list(encoder)})

        # get each decoder_layer

        for i in range(self.decoder_layer_num):
            decoder = torch.nn.ModuleList([self.model.decoder.layers[i]
                                           ]).parameters()
            params.append({"params": list(decoder)})
        '''
        for i in range(self.encoder_layer_num):
            encoder = filter(lambda p: id(p) in torch.nn.ModuleList([self.model.encoder.layers[i]]),
                             self.model.parameters())
            id_params.append(encoder)
            params.append({"params": encoder})

        # get each decoder_layer
        for i in range(self.decoder_layer_num):
            decoder = filter(lambda p: id(p) in torch.nn.ModuleList([self.model.decoder.layers[i]]),
                             self.model.parameters())
            id_params.append(decoder)
            params.append({"params": decoder})
        # get the parameters of the remaining layers
        base_params = filter(lambda p: id(p) not in id_params, self.model.parameters())
        params.append({"params": base_params})
        '''

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
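The per-layer parameter groups that this example assembles only pay off when each group later carries its own hyperparameters (for instance a layer-wise learning rate). A short sketch of that idea with plain torch.optim is shown below; the three-layer toy encoder and the 0.8 layer-wise decay factor are illustrative assumptions, not taken from the project above.

import torch
from torch import nn

# A toy "encoder" with three layers, each placed in its own parameter group.
layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(3)])

param_groups = []
for depth, layer in enumerate(layers):
    param_groups.append({
        "params": list(layer.parameters()),
        # Hypothetical layer-wise decay: deeper layers get a smaller LR.
        "lr": 1e-3 * (0.8 ** depth),
    })

optimizer = torch.optim.Adam(param_groups)
for group in optimizer.param_groups:
    print(group["lr"])  # roughly 0.001, 0.0008, 0.00064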