def build_optim_bert(args, model, checkpoint):
    """ Build optimizer for the BERT encoder parameters. """
    if checkpoint is not None:
        # Reuse the optimizer saved in the checkpoint and restore its state.
        optim = checkpoint['optims'][0]
        saved_optimizer_state_dict = optim.optimizer.state_dict()
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if args.visible_gpus != '-1':
            # Move the restored state tensors back onto the GPU.
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")
    else:
        optim = Optimizer(
            args.optim, args.lr_bert, args.max_grad_norm,
            beta1=args.beta1, beta2=args.beta2,
            decay_method='noam',
            warmup_steps=args.warmup_steps_bert)

    # Only hand the encoder (BERT) parameters to this optimizer.
    params = [(n, p) for n, p in list(model.named_parameters())
              if n.startswith('encoder.model')]
    optim.set_parameters(params)

    return optim

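# Reference sketch for the decay_method='noam' schedule used above. This is
# the standard Transformer-paper rule, not necessarily the exact formula
# inside this project's Optimizer wrapper: the learning rate warms up
# (roughly linearly) for warmup_steps and then decays proportionally to
# step ** -0.5; variants further down that pass a model_size additionally
# scale by model_size ** -0.5.
def noam_lr(step, base_lr, warmup_steps, model_size=None):
    scale = model_size ** -0.5 if model_size is not None else 1.0
    return base_lr * scale * min(step ** -0.5, step * warmup_steps ** -1.5)

# Example (peak learning rate is reached at step == warmup_steps):
# for step in (1, 1000, 8000, 32000):
#     print(step, noam_lr(step, base_lr=2.0, warmup_steps=8000))
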
def build_optim(model, opt, checkpoint):
    """ Build optimizer """
    saved_optimizer_state_dict = None

    if opt.train_from:
        optim = checkpoint['optim']
        # We need to save a copy of optim.optimizer.state_dict() so that the
        # optimizer state can be restored later, in Stage 2 of this method:
        # optim.set_parameters(model.parameters()) will overwrite
        # optim.optimizer, discarding the values currently stored in
        # optim.optimizer.state_dict().
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_steps=opt.start_decay_steps,
            decay_steps=opt.decay_steps,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps)

    # Stage 1:
    # optim.set_parameters (re-)creates the underlying torch optimizer, using
    # model.parameters() as the parameters stored in the
    # optim.optimizer.param_groups field of the torch optimizer class.
    # Importantly, this does not yet load the optimizer state: it builds a
    # new optimizer with empty state and the parameters taken from the model.
    optim.set_parameters(model.named_parameters())

    if opt.train_from:
        # Stage 2: only performed when loading an optimizer from a
        # checkpoint. Load saved_optimizer_state_dict into the re-created
        # optimizer to populate the optim.optimizer.state field, which was
        # previously empty.
        # See also: https://github.com/pytorch/pytorch/issues/2830
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        # Convert the state values back to cuda tensors if applicable.
        if use_gpu(opt):
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        # Make sure we really have a non-empty optimizer state when loading
        # an existing model. This should at least be the case for Adam, which
        # saves "exp_avg" and "exp_avg_sq" state (exponential moving averages
        # of the gradients and of the squared gradients).
        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim

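# A minimal, self-contained sketch of the Stage 1 / Stage 2 round trip the
# comments above describe, using a plain torch.optim.Adam and a toy model
# instead of the Optimizer wrapper (hypothetical, not part of this codebase).
# Note that recent PyTorch releases already cast loaded state tensors to the
# parameters' device inside load_state_dict, so the explicit .cuda() loop
# mainly matters for the older versions discussed in the linked issue.
import torch
import torch.nn as nn

def _checkpoint_round_trip():
    model = nn.Linear(4, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Take one step so that exp_avg / exp_avg_sq exist in optimizer.state.
    model(torch.randn(8, 4)).sum().backward()
    optimizer.step()
    saved_optimizer_state_dict = optimizer.state_dict()

    # Stage 1: a freshly built optimizer over the model parameters has
    # empty state.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    assert len(optimizer.state) == 0

    # Stage 2: restore the saved state, then move state tensors to the GPU,
    # mirroring the use_gpu branch above.
    optimizer.load_state_dict(saved_optimizer_state_dict)
    if torch.cuda.is_available():
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
    assert len(optimizer.state) > 0  # the non-empty check the function enforces
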
def build_optim(model, opt, checkpoint):
    """ Build optimizer """
    saved_optimizer_state_dict = None

    if opt.train_from:
        optim = checkpoint['optim']
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(
            opt.optim, opt.learning_rate, opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_steps=opt.start_decay_steps,
            decay_steps=opt.decay_steps,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps)

    optim.set_parameters(model.named_parameters())

    if opt.train_from:
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if use_gpu(opt):
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim

def build_optim(args, model, checkpoint):
    """ Build optimizer """
    optim = Optimizer(
        args.optim, args.lr, args.max_grad_norm,
        beta1=args.beta1, beta2=args.beta2,
        decay_method=args.decay_method,
        warmup_steps=args.warmup_steps,
        model_size=args.enc_hidden_size)

    optim.set_parameters(list(model.named_parameters()))

    if args.train_from != '':
        # Restore the optimizer state from the checkpoint and move any state
        # tensors back onto the GPU.
        optim.optimizer.load_state_dict(checkpoint['optim'])
        if args.visible_gpu != '-1':
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim

def build_optim(model, opt, checkpoint):
    """ Build optimizer """
    saved_optimizer_state_dict = None

    if opt.train_from:
        optim = checkpoint["optim"]
        # Save a copy of optim.optimizer.state_dict() so that the optimizer
        # state can be restored later (Stage 2 below), since
        # optim.set_parameters(model.parameters()) overwrites optim.optimizer
        # and discards the values currently stored in
        # optim.optimizer.state_dict().
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(
            opt.optim,
            opt.learning_rate,
            opt.max_grad_norm,
            lr_decay=opt.learning_rate_decay,
            start_decay_steps=opt.start_decay_steps,
            decay_steps=opt.decay_steps,
            beta1=opt.adam_beta1,
            beta2=opt.adam_beta2,
            adagrad_accum=opt.adagrad_accumulator_init,
            decay_method=opt.decay_method,
            warmup_steps=opt.warmup_steps,
        )

    # Stage 1: rebuild the underlying torch optimizer over the model
    # parameters; its state is still empty at this point.
    optim.set_parameters(model.named_parameters())

    if opt.train_from:
        # Stage 2: restore the saved optimizer state and move state tensors
        # back to the GPU if applicable.
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if use_gpu(opt):
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        if (optim.method == "adam") and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model"
                + " but optimizer state is empty"
            )

    return optim

def create_optimizer(model_or_iterable, options=None):
    """ Build an onmt.Optim from options, accepting either a model or an iterable of parameters. """
    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    optim = onmt.Optim(
        options.optim, options.learning_rate, options.max_grad_norm,
        lr_decay=options.learning_rate_decay,
        start_decay_at=options.start_decay_at,
        opt=options)

    # Accept either a module (use its .parameters()) or a plain iterable of
    # parameters.
    try:
        optim.set_parameters(model_or_iterable.parameters())
    except AttributeError:
        optim.set_parameters(model_or_iterable)

    return optim

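# The try/except at the end of create_optimizer is a small duck-typing
# dispatch: a module-like argument is asked for .parameters(), anything else
# is treated as an iterable of parameters. A stand-alone sketch of the same
# pattern (plain PyTorch, independent of the onmt/mhf helpers above):
import torch.nn as nn

def _collect_params(model_or_iterable):
    try:
        return list(model_or_iterable.parameters())
    except AttributeError:
        return list(model_or_iterable)

# Both call forms yield the same parameter tensors.
_model = nn.Linear(4, 2)
assert all(a is b for a, b in zip(_collect_params(_model),
                                  _collect_params(_model.parameters())))
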
def build_optim(model, args):
    """ Build optimizer """
    optim = Optimizer(
        args.optim, args.learning_rate, args.max_grad_norm,
        lr_decay=args.learning_rate_decay,
        start_decay_steps=args.start_decay_steps,
        decay_steps=args.decay_steps,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        adagrad_accum=args.adagrad_accumulator_init,
        decay_method=args.decay_method,
        warmup_steps=args.warmup_steps,
        model_size=args.rnn_size)

    optim.set_parameters(model)

    return optim

def build_optim(opt, model):
    """ Build optimizer """
    optim = Optimizer(
        opt.optim, opt.learning_rate, opt.max_grad_norm,
        lr_decay=opt.learning_rate_decay,
        start_decay_steps=opt.start_decay_steps,
        decay_steps=opt.decay_steps,
        beta1=opt.adam_beta1,
        beta2=opt.adam_beta2,
        adagrad_accum=opt.adagrad_accumulator_init,
        decay_method=opt.decay_method,
        warmup_steps=opt.warmup_steps,
        model_size=opt.encoder_size)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optim.set_parameters(parameters)

    return optim

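# The comprehension above only hands trainable parameters to the optimizer,
# so anything frozen with requires_grad_(False) is silently skipped. A small
# stand-alone illustration (plain PyTorch, hypothetical two-layer model):
import torch.nn as nn

def _trainable_named_params(model):
    return [[n, p] for n, p in model.named_parameters() if p.requires_grad]

_frozen_demo = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
_frozen_demo[0].weight.requires_grad_(False)  # freeze the first layer's weight
_names = [n for n, _ in _trainable_named_params(_frozen_demo)]
assert '0.weight' not in _names and '1.weight' in _names
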
def build_optim(args, model, checkpoint, generation=False):
    """ Build optimizer """
    if checkpoint is not None:
        # Reuse the optimizer saved in the checkpoint and restore its state.
        optim = checkpoint['optims'][0]
        saved_optimizer_state_dict = optim.optimizer.state_dict()
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if args.visible_gpus != '-1':
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()

        if (optim.method == 'adam') and (len(optim.optimizer.state) < 1):
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")
    else:
        if generation:
            # Noam warmup schedule for the generation model.
            optim = Optimizer(
                args.optim, args.lr, args.max_grad_norm,
                beta1=args.beta1, beta2=args.beta2,
                decay_method='noam',
                warmup_steps=args.warmup_steps)
        else:
            # Plain exponential decay otherwise.
            optim = Optimizer(
                args.optim, args.lr, args.max_grad_norm,
                beta1=args.beta1, beta2=args.beta2,
                start_decay_steps=1, decay_steps=10, lr_decay=0.9999)

    optim.set_parameters(list(model.named_parameters()))

    return optim