def build_optim(args, model, checkpoint): """ Build optimizer """ saved_optimizer_state_dict = None if args.train_from != "": optim = checkpoint["optim"] saved_optimizer_state_dict = optim.optimizer.state_dict() else: optim = Optimizer( args.optim, args.lr, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2, decay_method=args.decay_method, warmup_steps=args.warmup_steps, ) optim.set_parameters(list(model.named_parameters())) if args.train_from != "": optim.optimizer.load_state_dict(saved_optimizer_state_dict) if args.visible_gpus != "-1": for state in optim.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if (optim.method == "adam") and (len(optim.optimizer.state) < 1): raise RuntimeError( "Error: loaded Adam optimizer from existing model" + " but optimizer state is empty") return optim
def build_optim(args, model, checkpoint): """ Build optimizer """ saved_optimizer_state_dict = None if args.train_from != '' and checkpoint is not None: optim = checkpoint['optim'] saved_optimizer_state_dict = optim.optimizer.state_dict() else: optim = Optimizer(args.optim, args.lr, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2, decay_method=args.decay_method, warmup_steps=args.warmup_steps, weight_decay=args.l2_lambda) #self.start_decay_steps take effect when decay_method is not noam optim.set_parameters(list(model.named_parameters())) if args.train_from != '' and checkpoint is not None: optim.optimizer.load_state_dict(saved_optimizer_state_dict) if args.device == "cuda": for state in optim.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if (optim.method == 'adam') and (len(optim.optimizer.state) < 1): raise RuntimeError( "Error: loaded Adam optimizer from existing model" + " but optimizer state is empty") return optim
def build_optim_dec(args, model, checkpoint): """ Build optimizer """ if checkpoint is not None: optim = checkpoint['optims'][1] saved_optimizer_state_dict = optim.optimizer.state_dict() optim.optimizer.load_state_dict(saved_optimizer_state_dict) if args.visible_gpus != '-1': for state in optim.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if (optim.method == 'adam') and (len(optim.optimizer.state) < 1): raise RuntimeError( "Error: loaded Adam optimizer from existing model" + " but optimizer state is empty") else: optim = Optimizer(args.optim, args.lr_dec, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2, decay_method='noam', warmup_steps=args.warmup_steps_dec) params = [(n, p) for n, p in list(model.named_parameters()) if not n.startswith('bert.model')] optim.set_parameters(params) return optim
def build_optim_dec_inner(args, model, checkpoint, maml_type=None):
    """Builds the inner-loop optimizer for the decoder.

    The trained optimizer state does not need to be loaded in the inner loop.

    Args:
        model (models.model_builder.ABsSummarizer/MTLAbsSummarizer)
        checkpoint (dict)

    Returns:
        An optimizer of type models.optimizers.Optimizer.
    """
    assert maml_type == 'maml'  # only MAML is supported currently

    # NOTE: no warm-up for the inner-loop optimizer
    optim = Optimizer(
        args.inner_optim, args.lr_dec_inner, args.max_grad_norm,
        beta1=args.beta1, beta2=args.beta2)

    # NOTE: these params are placeholders; they are replaced during the forward pass
    params = [(n, p) for n, p in list(model.named_parameters())
              if not n.startswith('bert.model')]
    optim.set_parameters(params)

    return optim

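# A minimal sketch (hypothetical helper, not taken from the repo's trainer) of how such an
# inner optimizer is typically used in MAML-style adaptation: it is rebuilt for every task,
# starts with empty state, and only runs for a few steps, which is why warm-up and
# checkpointed optimizer state are unnecessary. `adapt_to_task`, `task_batches`, and the
# assumption that the model returns a scalar loss are illustrative only.
import copy

def adapt_to_task(args, model, task_batches):
    task_model = copy.deepcopy(model)  # fast weights for this task
    inner_optim = build_optim_dec_inner(args, task_model, checkpoint=None, maml_type='maml')
    for batch in task_batches:         # a handful of inner-loop updates
        loss = task_model(batch)       # assumed to return a scalar loss
        inner_optim.optimizer.zero_grad()
        loss.backward()
        inner_optim.step()             # the Optimizer wrapper applies clipping and the lr schedule
    return task_model
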
def build_optim(args, model, checkpoint): """ Build optimizer """ if checkpoint is not None and args.new_optim == False and args.few_shot == False: optim = checkpoint['optim'][0] saved_optimizer_state_dict = optim.optimizer.state_dict() optim.optimizer.load_state_dict(saved_optimizer_state_dict) if args.visible_gpus != '-1': for state in optim.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if (optim.method == 'adam') and (len(optim.optimizer.state) < 1): raise RuntimeError( "Error: loaded Adam optimizer from existing model" + " but optimizer state is empty") else: optim = Optimizer( args.optim, args.lr, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2, decay_method='noam', warmup_steps=args.warmup_steps) optim.set_parameters(list(model.named_parameters())) return optim
def build_optim(args, model, checkpoint):
    saved_optimizer_state_dict = None

    if args.train_from != '':
        optim = checkpoint['optim']
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(
            args.optim, args.learning_rate, args.max_grad_norm,
            beta1=args.beta1, beta2=args.beta2,
            decay_method=args.decay_method,
            warmup_steps=args.warmup_steps)

    optim.set_parameters(list(model.named_parameters()))

    if args.train_from != '':
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        optim.learning_rate = args.learning_rate
        for param_group in optim.optimizer.param_groups:
            param_group['lr'] = args.learning_rate

        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model"
                + " but optimizer state is empty")

    return optim

def build_optim(args, model, checkpoint): """ Build optimizer """ saved_optimizer_state_dict = None if args.train_from != '': optim = checkpoint['optim'] saved_optimizer_state_dict = optim.optimizer.state_dict() else: optim = Optimizer(args.optim, args.lr, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2, decay_method=args.decay_method, warmup_steps=args.warmup_steps, model_size=args.hidden_size) # Stage 1: # Essentially optim.set_parameters (re-)creates and optimizer using # model.paramters() as parameters that will be stored in the # optim.optimizer.param_groups field of the torch optimizer class. # Importantly, this method does not yet load the optimizer state, as # essentially it builds a new optimizer with empty optimizer state and # parameters from the model. optim.set_parameters(list(model.named_parameters())) if args.train_from != '': # Stage 2: In this stage, which is only performed when loading an # optimizer from a checkpoint, we load the saved_optimizer_state_dict # into the re-created optimizer, to set the optim.optimizer.state # field, which was previously empty. For this, we use the optimizer # state saved in the "saved_optimizer_state_dict" variable for # this purpose. # See also: https://github.com/pytorch/pytorch/issues/2830 optim.optimizer.load_state_dict(saved_optimizer_state_dict) # Convert back the state values to cuda type if applicable if args.visible_gpu != '-1': for state in optim.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() # We want to make sure that indeed we have a non-empty optimizer state # when we loaded an existing model. This should be at least the case # for Adam, which saves "exp_avg" and "exp_avg_sq" state # (Exponential moving average of gradient and squared gradient values) if (optim.method == 'adam') and (len(optim.optimizer.state) < 1): raise RuntimeError( "Error: loaded Adam optimizer from existing model" + " but optimizer state is empty") return optim
def build_optim_dec(args, model, checkpoint): """Builds optimizer for decoder. Args: model (models.model_builder.ABsSummarizer/MTLAbsSummarizer) checkpoint (dict) Returns: A optimizer in type models.optimizers.Optimizer. """ # Load optimizer if checkpoint is not None and args.init_optim == False: optim = checkpoint['optims'][1] # [0] -> encoder, [1] -> decoder optim.optimizer.load_state_dict(optim.optimizer.state_dict()) if args.visible_gpus != '-1': for state in optim.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if (optim.method == 'adam') and (len(optim.optimizer.state) < 1): raise RuntimeError( "Error: loaded Adam optimizer from existing model" + " but optimizer state is empty") else: # Disable warm up if (args.outer_no_warm_up): optim = Optimizer(args.optim, args.lr_dec, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2) else: optim = Optimizer(args.optim, args.lr_dec, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2, decay_method='noam', warmup_steps=args.warmup_steps_dec) # Feed parameters to be optimized params = [(n, p) for n, p in list(model.named_parameters()) if not n.startswith('bert.model')] optim.set_parameters(params) return optim
def build_optim(args, model, checkpoint=None): """ Build optimizer """ if checkpoint is not None and not args.transfer_learning: logger.info('Loading model optimizer...') optim = checkpoint['optim'] else: optim = Optimizer( args.optim, args.lr, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2, decay_method='noam', warmup_steps=args.warmup_steps) optim.set_parameters(list(model.named_parameters())) # optim = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False) return optim
def build_optim(args, model, checkpoint): """ Build optimizer """ saved_optimizer_state_dict = None if args.train_from or args.recover_from != '': optim = checkpoint['optim'] saved_optimizer_state_dict = optim.optimizer.state_dict() else: optim = Optimizer(args.optim, args.lr, args.max_grad_norm, beta1=args.beta1, beta2=args.beta2, decay_method=args.decay_method, warmup_steps=args.warmup_steps) if isinstance(model, list): tmp = [] for _model in model: tmp.extend(list(_model.named_parameters())) optim.set_parameters(tmp) else: optim.set_parameters(list(model.named_parameters())) if args.train_from or args.recover_from != '': optim.optimizer.load_state_dict(saved_optimizer_state_dict) if args.visible_gpus != '-1': for state in optim.optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if (optim.method == 'adam') and (len(optim.optimizer.state) < 1): raise RuntimeError( "Error: loaded Adam optimizer from existing model" + " but optimizer state is empty") return optim