Example #1
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print('| WARNING: your device does NOT support faster training with --fp16, '
                      'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print('| NOTICE: your device may support faster training with --fp16')
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
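
The capability check in this example relies on torch.cuda.get_device_capability, which returns a (major, minor) compute-capability tuple; the threshold of 7 corresponds to Volta-class GPUs, whose Tensor Cores are what make --fp16 training worthwhile. A minimal standalone sketch of the same check (the helper name is illustrative, not part of fairseq):

import torch

def fp16_is_worthwhile(device_index: int = 0) -> bool:
    # Mirrors the check above: Tensor Cores arrive with compute capability 7.0.
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability(device_index)
    return major >= 7

print("--fp16 likely faster:", fp16_is_worthwhile())
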
Example #2
    def _build_optimizer(self):
        if self.args.optimizer != 'adam_cbn':
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters()),
                ))
        else:
            # Select the Lagrangian-constraint parameters so they can be
            # handed to the optimizer as a separate group.
            from fairseq.modules.norms.constraint_bn_v2 import Constraint_Lagrangian
            constraint_param = []
            for m in self.model.modules():
                if isinstance(m, Constraint_Lagrangian):
                    constraint_param.extend(list(map(id, m.parameters())))
            params_lag = list(
                filter(lambda p: id(p) in constraint_param,
                       self.model.parameters()))
            params = list(
                filter(
                    lambda p: id(p) not in constraint_param and p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters())))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            # check cbn
            if self.args.optimizer != 'adam_cbn':
                self._optimizer = optim.build_optimizer(self.args, params)
            else:
                self._optimizer = optim.build_optimizer(
                    self.args, params, params_lag)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
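
The adam_cbn branch above separates the Constraint_Lagrangian parameters from the rest by first collecting their ids and then filtering on id membership. The same id-based split in isolation, with a toy model standing in for the fairseq modules (nothing below is fairseq API):

from itertools import chain

import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
criterion = nn.CrossEntropyLoss()

# Pretend the second Linear plays the role of the constraint module.
constraint_param = {id(p) for p in model[1].parameters()}

params_lag = [p for p in model.parameters() if id(p) in constraint_param]
params = [p for p in chain(model.parameters(), criterion.parameters())
          if id(p) not in constraint_param and p.requires_grad]

print(len(params_lag), len(params))  # 2 and 2 (weight + bias of each Linear)
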
Example #3
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )

        if self.cfg.common.fp16 or self.cfg.common.bf16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster"
                )
            if (
                self.cfg.common.memory_efficient_fp16
                or self.cfg.common.memory_efficient_bf16
            ):
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.cfg, params
                )
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.cfg, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info("NOTE: your device may support faster training with --fp16")
            self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)

        if self.cfg.optimization.use_bmuf:
            self._optimizer = optim.FairseqBMUF(
                self.cfg.bmuf,
                self._optimizer,
            )

        if self.cfg.distributed_training.zero_sharding == "os":
            if (
                self.cfg.common.fp16
                and not self.cfg.common.memory_efficient_fp16
                and not self.cfg.common.memory_efficient_bf16
            ) and not self.cfg.common.fp16_no_flatten_grads:
                raise ValueError(
                    "ZeRO is incompatible with fp16 and flattened grads. "
                    "Please use --fp16-no-flatten-grads"
                )
            else:
                optim.shard_(self._optimizer, self.data_parallel_process_group)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.cfg.lr_scheduler,
            self.optimizer,
        )
        self._lr_scheduler.step_update(0)
Example #4
    def _build_optimizer(self):
        # params = list(
        #     filter(
        #         lambda p: p.requires_grad,
        #         chain(self.model.parameters(), self.criterion.parameters()),
        #     )
        # )
        params_dict = {}
        _default_manifold = Euclidean()
        for name, p in chain(self.model.named_parameters(),
                             self.criterion.named_parameters()):
            if not p.requires_grad:
                continue
            if isinstance(p, (ManifoldParameter, ManifoldTensor)):
                _manifold = p.manifold
            else:
                _manifold = _default_manifold
            _manifold_name = _manifold.__class__.__name__
            if _manifold_name not in params_dict:
                ref_grad = _manifold.egrad2rgrad(p.new_zeros(1), p.new_ones(1))
                coef = 1 if ref_grad == 1 else 1
                #print(f"lr={self.args.lr}, ref={ref_grad.item()}")
                params_dict[_manifold_name] = dict(
                    params=[],
                    lr_rectifier=ref_grad.reciprocal().item() * coef)
            params_dict[_manifold_name]['params'].append(p)
        params = params_dict.values()

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
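
Example #4 builds one optimizer param group per manifold class and attaches a per-group lr_rectifier derived from the manifold's gradient conversion. Stripped of the Riemannian machinery (ManifoldParameter, egrad2rgrad and friends come from an external geometry library), the grouping pattern reduces to plain dictionary bookkeeping; in this sketch the key and extra-option functions are illustrative:

import torch.nn as nn

def build_param_groups(module: nn.Module, key_fn, extra_fn):
    # Group trainable parameters by key_fn(name, p); extra_fn supplies
    # per-group optimizer options (e.g. a learning-rate rectifier).
    groups = {}
    for name, p in module.named_parameters():
        if not p.requires_grad:
            continue
        key = key_fn(name, p)
        if key not in groups:
            groups[key] = dict(params=[], **extra_fn(key, p))
        groups[key]["params"].append(p)
    return list(groups.values())

# Toy usage: one group for biases, one for everything else.
model = nn.Linear(8, 8)
groups = build_param_groups(
    model,
    key_fn=lambda name, p: "bias" if name.endswith("bias") else "weight",
    extra_fn=lambda key, p: {"weight_decay": 0.0 if key == "bias" else 0.01},
)
print([(len(g["params"]), g["weight_decay"]) for g in groups])
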
Example #5
    def _build_optimizer(self):
        if self.ulmfit:
            params = []
            multiplier_map = []
            for n, p in self.model.named_parameters():
                if p.requires_grad:
                    params.append(p)

                    param_name_split = n.split('.')

                    if param_name_split[2] == 'lm_head':  # last layer
                        multiplier = 1.
                    elif param_name_split[4].isdigit():  # encoder layer
                        layer = int(param_name_split[4])

                        multiplier = self.decay_rate_lrc**-(self.num_layers - layer)
                    else:  # first layer
                        multiplier = self.decay_rate_lrc**-(self.num_layers + 1)

                    multiplier_map.append(multiplier)
        else:
            params = list(
                filter(lambda p: p.requires_grad, self.model.parameters()))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    '| WARNING: your device does NOT support faster training with --fp16, '
                    'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    '| NOTICE: your device may support faster training with --fp16'
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
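
Example #5 applies ULMFiT-style discriminative fine-tuning: each parameter gets a learning-rate multiplier that decays geometrically with its distance from the output layer (decay_rate_lrc ** -(num_layers - layer)). The schedule on its own, as a small hypothetical helper (2.6 is the decay factor suggested in the ULMFiT paper):

def ulmfit_multipliers(num_layers: int, decay_rate: float = 2.6):
    # The head (layer == num_layers) keeps multiplier 1.0; every layer further
    # from the output is scaled down by another factor of decay_rate.
    return {layer: decay_rate ** -(num_layers - layer)
            for layer in range(num_layers + 1)}

print(ulmfit_multipliers(3))
# e.g. {0: 0.0569, 1: 0.1479, 2: 0.3846, 3: 1.0}
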
Example #6
    def _build_optimizer(self):
        if self.freeze_bart:
            for name, param in self.model.named_parameters():
                if name.startswith('encoder') and name not in [
                        "encoder.structure_att.exparam",
                        "encoder.structure_att.tp_linear.weight",
                        "encoder.structure_att.tp_linear.bias",
                        "encoder.structure_att.tc_linear.weight",
                        "encoder.structure_att.tc_linear.bias",
                        "encoder.structure_att.fi_linear.weight",
                        "encoder.structure_att.bilinear._weight_matrix",
                        "encoder.structure_att.bilinear._bias",
                        "encoder.structure_att.fzlinear.weight",
                        "encoder.structure_att.fzlinear.bias",
                        "encoder.str_to_enc_linear.weight",
                        "encoder.str_to_enc_linear.bias"
                ]:
                    param.requires_grad = False
            print("Freezing parameters")
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            ))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
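
Example #6 freezes most BART encoder parameters by setting requires_grad to False for every encoder parameter not on an explicit allow-list, so the later filter(lambda p: p.requires_grad, ...) drops them automatically. The same idiom as a generic helper, a sketch with illustrative names:

import torch.nn as nn

def freeze_by_prefix(model: nn.Module, prefix: str, keep=()):
    # Disable gradients for parameters whose name starts with `prefix`,
    # except for names explicitly listed in `keep`.
    frozen = []
    for name, param in model.named_parameters():
        if name.startswith(prefix) and name not in keep:
            param.requires_grad = False
            frozen.append(name)
    return frozen

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU(), nn.Linear(4, 2))
print(freeze_by_prefix(model, "0."))  # freezes '0.weight' and '0.bias'
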
Example #7
    def _build_optimizer(self):
        from itertools import chain
        if hasattr(self.args, 'encoder_layers'):
            params = get_decayed_param_groups(
                chain(self.model.named_parameters(),
                      self.criterion.named_parameters()),
                num_layers=self.args.encoder_layers,
                weight_decay=self.args.weight_decay,
                weight_decay_exclude=self.args.weight_decay_exclude,
                freeze_encoder=self.args.freeze_encoder,
                freeze_embedding=self.args.freeze_embedding,
                lr=float(self.args.lr[0]),
                lr_decay=float(self.args.lr_decay),
            )
        else:
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(),
                          self.criterion.parameters()),
                ))

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
Example #8
def setup_model_loss_criterion(args, rank, is_cuda):
    """
    setup model, criterion and optimizer based on input args
    """
    args.distributed_rank = rank
    distributed_utils.distributed_init(args)
    torch.manual_seed(1)
    model = Model(args.input_size, args.nb_classes)
    loss_fn = nn.CrossEntropyLoss()
    if is_cuda:
        model = model.cuda()
        loss_fn = loss_fn.cuda()

    optimizer = optim.sgd.SGD(args, model.parameters())
    optimizer = optim.FairseqBMUF(args, optimizer)

    return model, loss_fn, optimizer
Example #9
    def _build_optimizer(self):

        param_groups = self.task.get_task_params(self.model, self.criterion)

        if (not hasattr(self.args, "lr_list")) or (self.args.lr_list is None):
            lr_list = [self.args.lr[0] for _ in param_groups]
        else:
            lr_list = [
                float(lr.strip()) for lr in self.args.lr_list.split(",")
            ]

        for params, curr_lr in zip(param_groups, lr_list):
            if self.args.fp16:
                if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                    print(
                        '| WARNING: your device does NOT support faster training with --fp16, '
                        'please switch to FP32 which is likely to be faster')
                if self.args.memory_efficient_fp16:
                    optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                        self.args, params)
                else:
                    optimizer = optim.FP16Optimizer.build_optimizer(
                        self.args, params)
            else:
                if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                    print(
                        '| NOTICE: your device may support faster training with --fp16'
                    )
                optimizer = optim.build_optimizer(self.args, params)

            if self.args.use_bmuf:
                optimizer = optim.FairseqBMUF(self.args, optimizer)

            self._optimizers.append(optimizer)

            # We should initialize the learning rate scheduler immediately after
            # building the optimizer, so that the initial learning rate is set.
            self.args.lr = [curr_lr]
            lrs = lr_scheduler.build_lr_scheduler(self.args, optimizer)
            lrs.step_update(0)
            self._lr_schedulers.append(lrs)

        self.args.lr = None

        self.set_current_optimizer()
Example #10
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                # chain() accepts any number of iterables and returns a single
                # iterator that yields all of their items in sequence.
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )  # The list of parameters to optimize. nn.Module.parameters() collects
        # them by recursively walking every submodule and its torch.nn.Parameter
        # objects; both submodules and parameters are registered automatically
        # when they are assigned as attributes of a Module.

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    "| WARNING: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print(
                    "| NOTICE: your device may support faster training with --fp16"
                )
            # optim.build_optimizer (set up in optim/__init__.py) dispatches on
            # the choice given by args.optimizer and calls that optimizer's
            # constructor; e.g. if args.optimizer is 'adam', this builds a
            # FairseqAdam instance.
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
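
As the comments above note, itertools.chain simply concatenates iterators, which is how the model's and the criterion's parameters end up in one trainable-parameter list. A tiny demonstration outside the Trainer:

from itertools import chain

import torch.nn as nn

model = nn.Linear(4, 4)            # two parameters: weight and bias
criterion = nn.CrossEntropyLoss()  # has no parameters of its own

params = list(
    filter(lambda p: p.requires_grad,
           chain(model.parameters(), criterion.parameters())))
print(len(params))  # 2
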
Example #11
    def _build_optimizer(self):
        #if not self.args.balance:
        '''
        count = 1
        for name, param in chain(self.model.encoder.named_parameters(), self.model.decoder.named_parameters(), self.criterion.named_parameters()):
            if param.requires_grad:
                print(count, name)
                count += 1

        print("------")
        for name, param in chain(self.model.section_positions.named_parameters(), self.model.section_layernorm_embedding.named_parameters(), self.model.section.named_parameters(),
                self.model.w_context_vector.named_parameters(), self.model.w_proj.named_parameters()):
            if param.requires_grad:
                print(count, name)
                count += 1
        '''

        #print(len(self.model.named_parameters()))
        #print(list(
        #   filter(
        #        lambda p: p.requires_grad,
        #       chain(self.model.encoder.parameters(), self.model.decoder.parameters()),
        #    )
        #))

        #params = list(
        #    filter(
        #        lambda p: p.requires_grad,
        #        chain(self.model.parameters(), self.criterion.parameters()),
        #    )
        #)

        #print("Total: ")
        #print(len(params))
        '''
        base_params = list(map(id, chain(self.model.encoder.parameters(), self.model.decoder.parameters())))
        logits_params = filter(lambda p: id(p) not in base_params and p.requires_grad, self.model.parameters())

        base_params_id = list(map(id, self.model.section_positions.parameters())) + list(map(id,net.bn1.parameters()))+\
	list(map(id,net.layer1.parameters())) + list(map(id,net.layer2.parameters())) \
		+ list(map(id,net.layer3.parameters())) + list(map(id,net.layer4.parameters()))

        new_params = filter(lambda p: id(p) not in base_params_id , net.parameters())
        base_params = filter(lambda p: id(p) in base_params_id, net.parameters())

        
        
        '''
        new_params_id = (
            list(map(id, self.model.section_positions.parameters()))
            + list(map(id, self.model.section_layernorm_embedding.parameters()))
            + list(map(id, self.model.section.parameters()))
            + list(map(id, self.model.w_proj.parameters()))
            + list(map(id, self.model.w_context_vector.parameters()))
            + list(map(id, self.model.w_proj_layer_norm.parameters()))
        )

        base_params = list(
            filter(lambda p: id(p) not in new_params_id and p.requires_grad,
                   self.model.parameters()))

        print("group1: ")
        print(len(base_params))

        new_params = list(
            filter(lambda p: id(p) in new_params_id and p.requires_grad,
                   self.model.parameters()))
        print("group2: ")
        print(len(new_params))

        params = [
            {
                "params": base_params
            },
            {
                "params": new_params
            },
        ]
        # "weight_decay": 0.01

        params2 = None
        # Set up front so the lr_scheduler call at the end also works when the
        # fp16 branch is taken and no second optimizer is built.
        self._optimizer2 = None
        '''
        if self.args.balance:
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.encoder.parameters(), self.model.decoder.parameters(), self.criterion.parameters()),
                )
            )

            params2 = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.w_proj.parameters(), self.model.w_context_vector.parameters(), self.model.section_positions.parameters(), 
                    self.model.section_layernorm_embedding.parameters()),
                )
            )
        '''

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster")
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(
                    self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info(
                    "NOTE: your device may support faster training with --fp16"
                )
            self._optimizer = optim.build_optimizer(self.args, params)

            if self.args.balance and params2 is not None:
                self._optimizer2 = optim.build_optimizer(self.args, params2)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(
            self.args, self.optimizer, self._optimizer2)
        self._lr_scheduler.step_update(0)
Example #12
    def _build_optimizer(self):
        params = list(
            filter(
                lambda p: p.requires_grad,
                chain(self.model.parameters(), self.criterion.parameters()),
            )
        )

        if getattr(self.args, 'multiple_lr', False):
            assert self.args.lr_scheduler == 'multi_lr_inverse_sqrt', \
                'only multi_lr_inverse_sqrt supports multiple_lr now'
            assert len(self.args.lr) == 3, \
                'Three learning rates for roberta, sents_encoder and decoder should be provided'
            named_params = [(n, p) for n, p in self.model.named_parameters() if p.requires_grad]
            encoder_params = [(n, p) for n, p in self.model.encoder.named_parameters() if p.requires_grad]
            decoder_params = [(n, p) for n, p in self.model.decoder.named_parameters() if p.requires_grad]
            if hasattr(self.model, 'decoder_perm'):
                decoder_params += [(n, p) for n, p in self.model.decoder_perm.named_parameters() if p.requires_grad]
                
            # params = [
            #     {'params': [p for n, p in named_params if 'roberta' in n and (n.startswith('encoder') or n.startswith('module.encoder'))]},
            #     {'params': [p for n, p in named_params if not 'roberta' in n and (n.startswith('encoder') or n.startswith('module.encoder'))]},
            #     {'params': [p for n, p in named_params if n.startswith('decoder') or n.startswith('module.decoder')]}
            # ]
            params = [
                {'params': [p for n, p in encoder_params if 'roberta' in n]},
                {'params': [p for n, p in encoder_params if 'roberta' not in n]},
                {'params': [p for n, p in decoder_params]},
            ]
            assert len(named_params) == sum([len(p['params']) for p in params]), named_params


        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print(
                    "| WARNING: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster"
                )
            if self.args.memory_efficient_fp16:
                if getattr(self.args, 'multiple_lr', False):
                    self._optimizer = optim.ConcatOptimizer(self.args, [
                        optim.MemoryEfficientFP16Optimizer.build_optimizer(
                            self.args, param['params'])
                        for param in params
                    ])
                else:
                    self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                        self.args, params
                    )
            else:
                if getattr(self.args, 'multiple_lr', False):
                    self._optimizer = optim.ConcatOptimizer(self.args, [
                        optim.FP16Optimizer.build_optimizer(self.args, param['params'])
                        for param in params
                    ])
                else:
                    self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print("| NOTICE: your device may support faster training with --fp16")
            if getattr(self.args, 'multiple_lr', False):
                self._optimizer = optim.ConcatOptimizer(self.args, [
                    optim.build_optimizer(self.args, param['params'])
                    for param in params
                ])
            else:
                self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
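
Example #12 feeds three learning rates to a ConcatOptimizer by partitioning named parameters on their names (the RoBERTa part of the encoder, the rest of the encoder, and the decoder). The partitioning step can be written generically as below; split_by_name and the toy modules are illustrative, not fairseq API:

import torch.nn as nn

def split_by_name(model: nn.Module, patterns):
    # One {'params': [...]} group per pattern; a parameter joins the first
    # group whose pattern occurs in its name, otherwise the final catch-all group.
    groups = [{"params": []} for _ in range(len(patterns) + 1)]
    for name, p in model.named_parameters():
        if not p.requires_grad:
            continue
        for i, pat in enumerate(patterns):
            if pat in name:
                groups[i]["params"].append(p)
                break
        else:
            groups[-1]["params"].append(p)
    return groups

model = nn.ModuleDict({"roberta": nn.Linear(4, 4), "decoder": nn.Linear(4, 2)})
groups = split_by_name(model, ["roberta"])
print([len(g["params"]) for g in groups])  # [2, 2]
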
Example #13
    def _build_optimizer(self):
        # TODO: Rename 'optimizers' to param_groups
        use_param_groups = hasattr(self.args, 'optimizers') and len(self.args.optimizers) > 0
        freeze_prefix = getattr(self.args, 'freeze_prefix', None)

        if use_param_groups:
            params = list(
                filter(
                    lambda np: np[1].requires_grad,
                    chain(self.model.named_parameters(), self.criterion.named_parameters()),
                )
            )
            params = self._get_param_groups(params)
        elif freeze_prefix:
            params = list(
                filter(
                    lambda np: np[1].requires_grad and not (np[0].startswith(freeze_prefix) or np[0].startswith("module." + freeze_prefix)),
                    chain(self.model.named_parameters(), self.criterion.named_parameters()),
                )
            )
            frozen_params = list(
                filter(
                    lambda np: np[1].requires_grad and (np[0].startswith(freeze_prefix) or np[0].startswith("module." + freeze_prefix)),
                    chain(self.model.named_parameters(), self.criterion.named_parameters()),
                )
            )
            print('The following parameters are NOT FROZEN: %s' % (
                ','.join([param[0] for param in params]),
            ))
            print('The following parameters are FROZEN by prefix "%s": %s' % (
                freeze_prefix,
                ','.join([param[0] for param in frozen_params]),
            ))
            assert len(params) > 0
            assert len(frozen_params) > 0

            params = [param[1] for param in params]

        else:
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(), self.criterion.parameters()),
                )
            )

        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                logger.info(
                    "NOTE: your device does NOT support faster training with --fp16, "
                    "please switch to FP32 which is likely to be faster"
                )
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
                    self.args, params
                )
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                logger.info("NOTE: your device may support faster training with --fp16")
            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        if use_param_groups:
            # HACK: Current implementation of LR schedulers works fine
            # if we replace LR with numpy arrays of LRs.
            # However, FairseqOptimizer (the base class for the optimizer here)
            # always return just a single LR (from the first param group).
            # We change this behaviour for the optimizer object (so we don't need to change the base class)
            self.optimizer.set_lr = set_lr_group.__get__(self.optimizer, FairseqOptimizer)
            self.optimizer.get_lr = get_lr_group.__get__(self.optimizer, FairseqOptimizer)
            assert len(self.args.lr) == 1
            args = copy.deepcopy(self.args)
            args.lr = [np.array([param_group.get('lr', self.args.lr[0]) for param_group in self.args.optimizers])]
            self._lr_scheduler = lr_scheduler.build_lr_scheduler(args, self.optimizer)
        else:
            self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        self._lr_scheduler.step_update(0)
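
The HACK in Example #13 replaces set_lr/get_lr on a single optimizer instance by binding free functions with the descriptor protocol: function.__get__(obj, cls) returns a method bound to obj, so the class and every other instance keep their original behaviour. A minimal illustration of that trick (Optimizerish is a made-up stand-in, not FairseqOptimizer):

class Optimizerish:
    def __init__(self):
        self.lr = 0.1

    def get_lr(self):
        return self.lr

def get_lr_verbose(self):
    # Replacement behaviour, bound to one instance only.
    print("querying lr ...")
    return self.lr

opt = Optimizerish()
opt.get_lr = get_lr_verbose.__get__(opt, Optimizerish)

print(opt.get_lr())             # prints the message, then 0.1
print(Optimizerish().get_lr())  # other instances keep the original method
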
Example #14
    def _build_optimizer(self):
        self.index = 0
        self.index2 = 0
        # params = list(
        #     filter(
        #         lambda p: p.requires_grad,
        #         chain(self.model.parameters(), self.criterion.parameters()),
        #     )
        # )
        # for n, p in chain(self.model.named_parameters(), self.criterion.named_parameters()):
        #     print(n)
        # exit()

        if self.args.task == "audio_translation":
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(), self.criterion.parameters()),
                ))
            # def filter_fn(n, p):
            #     self.index += 1
            #     if not p.requires_grad:
            #         print(n)
            #     return p.requires_grad
            #     # print(n)
            #     # cond = p.requires_grad
            #     # # if self.args.fix_transformer:
            #     # # cond &= ('audio_encoder.conv_layers' not in n and 'audio_encoder.transformer_layers' not in n and  'text_encoder' not in n)
            #     # cond &= ('audio_encoder.conv_layers' not in n and 'audio_encoder.transformer_layers' not in n )
            #
            #     # if not cond:
            #     #     self.index2 += 1
            #     #     p.requires_grad=False
            #     # else:
            #     #     print(n)
            #     # return  cond
            # params = [p for n, p in chain(self.model.named_parameters(), self.criterion.named_parameters()) if filter_fn(n, p)]
            # exit()
        else:
            params = list(
                filter(
                    lambda p: p.requires_grad,
                    chain(self.model.parameters(), self.criterion.parameters()),
                )
            )
        # print(self.index)
        # print(self.index2)
        # print(len(params))
        # exit()
        # # print(len(params))
        # index=0
        # for n, p in chain(self.model.named_parameters(), self.criterion.named_parameters()):
        #     index+=1
        #     # print('{}: {}'.format(p.data[0], n))
        #     print(str(index)+" "+n)
        #     print(p.requires_grad)
        # exit()
        if self.args.fp16:
            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
                print('| WARNING: your device does NOT support faster training with --fp16, '
                      'please switch to FP32 which is likely to be faster')
            if self.args.memory_efficient_fp16:
                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(self.args, params)
            else:
                self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
        else:
            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
                print('| NOTICE: your device may support faster training with --fp16')

            self._optimizer = optim.build_optimizer(self.args, params)

        if self.args.use_bmuf:
            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)

        # We should initialize the learning rate scheduler immediately after
        # building the optimizer, so that the initial learning rate is set.
        self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer)
        self._lr_scheduler.step_update(0)