def build_optim_dec(args, model, checkpoint):
    """Build (or restore from checkpoint) the optimizer for the decoder.

    All parameters whose name does not start with ``bert.model`` (i.e. the
    non-encoder weights) are handed to the optimizer.
    """
    if checkpoint is None:
        # Fresh optimizer with noam decay and decoder-specific warmup.
        optim = Optimizer(args.optim, args.lr_dec, args.max_grad_norm,
                          beta1=args.beta1, beta2=args.beta2,
                          decay_method='noam',
                          warmup_steps=args.warmup_steps_dec)
    else:
        # Reuse the decoder optimizer saved in the checkpoint (slot [1]).
        optim = checkpoint['optims'][1]
        saved_optimizer_state_dict = optim.optimizer.state_dict()
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if args.visible_gpus != '-1':
            # Restored state tensors come back on CPU; push them to the GPU.
            for state in optim.optimizer.state.values():
                for key, value in state.items():
                    if torch.is_tensor(value):
                        state[key] = value.cuda()
        # Adam keeps per-parameter state; an empty state means the
        # checkpoint did not actually contain a usable optimizer.
        if optim.method == 'adam' and len(optim.optimizer.state) < 1:
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    decoder_params = [(name, param)
                      for name, param in model.named_parameters()
                      if not name.startswith('bert.model')]
    optim.set_parameters(decoder_params)

    return optim
def build_optim(args, model, checkpoint):
    """Build a fresh optimizer, or restore the saved one when resuming."""
    resuming = args.train_from != ""
    saved_optimizer_state_dict = None

    if resuming:
        optim = checkpoint["optim"]
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(
            args.optim, args.lr, args.max_grad_norm,
            beta1=args.beta1, beta2=args.beta2,
            decay_method=args.decay_method,
            warmup_steps=args.warmup_steps,
        )

    # Binds the model parameters; this rebuilds the inner torch optimizer
    # with empty state, which the resume path below repopulates.
    optim.set_parameters(list(model.named_parameters()))

    if resuming:
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if args.visible_gpus != "-1":
            # Restored state tensors land on CPU; move them to the GPU.
            for state in optim.optimizer.state.values():
                for key, value in state.items():
                    if torch.is_tensor(value):
                        state[key] = value.cuda()
        # A resumed Adam optimizer must carry per-parameter state.
        if optim.method == "adam" and len(optim.optimizer.state) < 1:
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
def build_optim(args, model, checkpoint):
    """Build the optimizer, restoring its state when training is resumed."""
    saved_optimizer_state_dict = None
    resume = args.train_from != '' and checkpoint is not None

    if resume:
        optim = checkpoint['optim']
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        # self.start_decay_steps takes effect when decay_method is not noam.
        optim = Optimizer(args.optim, args.lr, args.max_grad_norm,
                          beta1=args.beta1, beta2=args.beta2,
                          decay_method=args.decay_method,
                          warmup_steps=args.warmup_steps,
                          weight_decay=args.l2_lambda)

    optim.set_parameters(list(model.named_parameters()))

    if resume:
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if args.device == "cuda":
            # Restored state tensors come back on CPU; push them to the GPU.
            for state in optim.optimizer.state.values():
                for key, value in state.items():
                    if torch.is_tensor(value):
                        state[key] = value.cuda()
        # Adam without per-parameter state means the checkpoint was unusable.
        if optim.method == 'adam' and len(optim.optimizer.state) < 1:
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
def build_optim_dec_inner(args, model, checkpoint, maml_type=None):
    """Builds the inner-loop optimizer for the decoder.

    The trained optimizer state is deliberately not loaded: the MAML inner
    loop starts from a fresh optimizer each time.

    Args:
        model (models.model_builder.ABsSummarizer/MTLAbsSummarizer)
        checkpoint (dict): unused here; kept for signature parity with the
            other optimizer builders.
        maml_type (str): only 'maml' is supported currently.

    Returns:
        A optimizer in type models.optimizers.Optimizer.
    """
    assert maml_type == 'maml'  # only support MAML currently

    # NOTE: no warm up / decay schedule in the inner loop.
    optim = Optimizer(args.inner_optim, args.lr_dec_inner,
                      args.max_grad_norm,
                      beta1=args.beta1, beta2=args.beta2)

    # NOTE: these params are pseudo and get replaced during forwarding.
    non_bert_params = [(name, param)
                       for name, param in model.named_parameters()
                       if not name.startswith('bert.model')]
    optim.set_parameters(non_bert_params)

    return optim
def build_optim(args, model, checkpoint):
    """Build optimizer; reuse the checkpointed one unless a fresh optimizer
    was explicitly requested via ``new_optim`` or ``few_shot``."""
    reuse_checkpoint = (checkpoint is not None
                        and args.new_optim == False
                        and args.few_shot == False)

    if reuse_checkpoint:
        optim = checkpoint['optim'][0]
        saved_optimizer_state_dict = optim.optimizer.state_dict()
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if args.visible_gpus != '-1':
            # Restored state tensors land on CPU; push them to the GPU.
            for state in optim.optimizer.state.values():
                for key, value in state.items():
                    if torch.is_tensor(value):
                        state[key] = value.cuda()
        # A resumed Adam optimizer must carry per-parameter state.
        if optim.method == 'adam' and len(optim.optimizer.state) < 1:
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")
    else:
        optim = Optimizer(
            args.optim, args.lr, args.max_grad_norm,
            beta1=args.beta1, beta2=args.beta2,
            decay_method='noam',
            warmup_steps=args.warmup_steps)

    optim.set_parameters(list(model.named_parameters()))

    return optim
def build_optim(args, model, checkpoint):
    """Build the optimizer; on resume, reload its state but force the
    learning rate back to the value supplied on the command line."""
    saved_optimizer_state_dict = None
    resuming = args.train_from != ''

    if resuming:
        optim = checkpoint['optim']
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(
            args.optim, args.learning_rate, args.max_grad_norm,
            beta1=args.beta1, beta2=args.beta2,
            decay_method=args.decay_method,
            warmup_steps=args.warmup_steps)

    optim.set_parameters(list(model.named_parameters()))

    if resuming:
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        # Override the checkpointed learning rate with the current one,
        # both on the wrapper and inside every torch param group.
        optim.learning_rate = args.learning_rate
        for param_group in optim.optimizer.param_groups:
            param_group['lr'] = args.learning_rate
        # A resumed Adam optimizer must carry per-parameter state.
        if optim.method == 'adam' and len(optim.optimizer.state) < 1:
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
def build_optim(args, model, checkpoint):
    """Build the optimizer, restoring saved state when resuming training."""
    saved_optimizer_state_dict = None
    resuming = args.train_from != ''

    if resuming:
        optim = checkpoint['optim']
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(args.optim, args.lr, args.max_grad_norm,
                          beta1=args.beta1, beta2=args.beta2,
                          decay_method=args.decay_method,
                          warmup_steps=args.warmup_steps,
                          model_size=args.hidden_size)

    # Stage 1: set_parameters (re-)creates the underlying torch optimizer
    # from model.named_parameters(). Importantly, this does not load any
    # optimizer state yet — the new optimizer starts with empty state.
    optim.set_parameters(list(model.named_parameters()))

    if resuming:
        # Stage 2: only when resuming, load the captured state dict into the
        # re-created optimizer so optim.optimizer.state is populated again.
        # See also: https://github.com/pytorch/pytorch/issues/2830
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        # Convert restored state values back to cuda tensors if applicable.
        if args.visible_gpu != '-1':
            for state in optim.optimizer.state.values():
                for key, value in state.items():
                    if torch.is_tensor(value):
                        state[key] = value.cuda()
        # Sanity check: Adam saves "exp_avg"/"exp_avg_sq" per parameter, so a
        # loaded Adam optimizer with empty state indicates a broken resume.
        if optim.method == 'adam' and len(optim.optimizer.state) < 1:
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
def build_optim(args, model, checkpoint=None):
    """Build the optimizer; reuse the checkpointed one unless this run is
    transfer learning (which always wants a fresh optimizer)."""
    if checkpoint is not None and not args.transfer_learning:
        logger.info('Loading model optimizer...')
        optim = checkpoint['optim']
    else:
        optim = Optimizer(
            args.optim, args.lr, args.max_grad_norm,
            beta1=args.beta1, beta2=args.beta2,
            decay_method='noam',
            warmup_steps=args.warmup_steps)
        optim.set_parameters(list(model.named_parameters()))
    return optim
def build_optim_dec(args, model, checkpoint):
    """Builds optimizer for decoder.

    Args:
        model (models.model_builder.ABsSummarizer/MTLAbsSummarizer)
        checkpoint (dict)

    Returns:
        A optimizer in type models.optimizers.Optimizer.
    """
    if checkpoint is not None and args.init_optim == False:
        # Reuse the decoder optimizer from the checkpoint.
        optim = checkpoint['optims'][1]  # [0] -> encoder, [1] -> decoder
        optim.optimizer.load_state_dict(optim.optimizer.state_dict())
        if args.visible_gpus != '-1':
            # Move restored state tensors back onto the GPU.
            for state in optim.optimizer.state.values():
                for key, value in state.items():
                    if torch.is_tensor(value):
                        state[key] = value.cuda()
        # Adam must carry per-parameter state after a successful restore.
        if optim.method == 'adam' and len(optim.optimizer.state) < 1:
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")
    elif args.outer_no_warm_up:
        # Warm-up disabled for the outer loop.
        optim = Optimizer(args.optim, args.lr_dec, args.max_grad_norm,
                          beta1=args.beta1, beta2=args.beta2)
    else:
        optim = Optimizer(args.optim, args.lr_dec, args.max_grad_norm,
                          beta1=args.beta1, beta2=args.beta2,
                          decay_method='noam',
                          warmup_steps=args.warmup_steps_dec)

    # Feed parameters to be optimized: everything except the BERT encoder.
    non_bert_params = [(name, param)
                       for name, param in model.named_parameters()
                       if not name.startswith('bert.model')]
    optim.set_parameters(non_bert_params)

    return optim
def build_optim(args, model, checkpoint):
    """Build the optimizer, supporting a single model or a list of models.

    When resuming (``train_from`` or ``recover_from`` is set) the optimizer
    and its state are restored from ``checkpoint['optim']``.
    """
    saved_optimizer_state_dict = None
    # The original guard read `args.train_from or args.recover_from != ''`,
    # which parses as `train_from or (recover_from != '')` — equivalent for
    # string-valued args but misleadingly precedence-dependent. Spell the
    # intent out explicitly, and compute it once so the two uses can't drift.
    resuming = args.train_from != '' or args.recover_from != ''

    if resuming:
        optim = checkpoint['optim']
        saved_optimizer_state_dict = optim.optimizer.state_dict()
    else:
        optim = Optimizer(args.optim, args.lr, args.max_grad_norm,
                          beta1=args.beta1, beta2=args.beta2,
                          decay_method=args.decay_method,
                          warmup_steps=args.warmup_steps)

    if isinstance(model, list):
        # Jointly optimize the parameters of all sub-models.
        named_params = []
        for sub_model in model:
            named_params.extend(sub_model.named_parameters())
        optim.set_parameters(named_params)
    else:
        optim.set_parameters(list(model.named_parameters()))

    if resuming:
        optim.optimizer.load_state_dict(saved_optimizer_state_dict)
        if args.visible_gpus != '-1':
            # Restored state tensors come back on CPU; push them to the GPU.
            for state in optim.optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()
        # A resumed Adam optimizer must carry per-parameter state.
        if optim.method == 'adam' and len(optim.optimizer.state) < 1:
            raise RuntimeError(
                "Error: loaded Adam optimizer from existing model" +
                " but optimizer state is empty")

    return optim
def run(mtd="fold_split"):
    """Entry point: split the dataset, preprocess it, or train the model.

    Args:
        mtd (str): one of "fold_split", "process_data", "train".

    NOTE(review): relies on module-level globals (model, config,
    dataset_processer, label_encoder, scores, cfg, loss_factory,
    model_utils, file_utils, demo_preprocess, raw/train/dev/test paths) —
    consider passing these in explicitly.
    """

    def _eval(data):
        """Run `model` over `data` without gradients; returns (score, f1)."""
        model.eval()  # disable BatchNormalization and Dropout
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in dataset_processer.data_iter(
                    data, config['test_batch_size'], shuffle=False):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(
                    batch_data)
                batch_outputs = model(batch_inputs)
                y_pred.extend(
                    torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())
            score, dev_f1 = scores.get_score(y_true, y_pred)
        return score, dev_f1

    if mtd == "fold_split":
        demo_preprocess.split_dataset(raw_path, train_path, dev_path,
                                      test_path)
    elif mtd == "process_data":
        demo_preprocess.process_data(config, train_path, dev_path)
    elif mtd == "train":
        Train_data = file_utils.read_json(config["train_set"])
        Dev_data = file_utils.read_json(config["dev_set"])

        # Convert raw records into the format the model can consume.
        train_data = dataset_processer.get_examples(Train_data, label_encoder)
        dev_data = dataset_processer.get_examples(Dev_data, label_encoder)
        del Train_data, Dev_data

        # Number of batches in one epoch.
        batch_num = int(
            np.ceil(len(train_data) / float(config["train_batch_size"])))
        print("batch_num:{}".format(batch_num))

        optimizer = Optimizer(model.all_parameters,
                              steps=batch_num * config["epochs"])
        # criterion = nn.CrossEntropyLoss()
        criterion = loss_factory.focal_loss()

        best_train_f1, best_dev_f1 = 0, 0
        early_stop = -1
        EarlyStopEpochs = 10  # stop early after this many epochs w/o dev gain

        print("start train")
        for epoch in range(cfg.RESUME_EPOCH + 1, config["epochs"] + 1):
            optimizer.zero_grad()
            model.train()  # enable BatchNormalization and Dropout
            overall_losses = 0
            losses = 0
            y_pred = []
            y_true = []
            step = 0
            for batch_data in dataset_processer.data_iter(
                    train_data, config["train_batch_size"], shuffle=True):
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = dataset_processer.batch2tensor(
                    batch_data)
                batch_outputs = model(batch_inputs)
                # BUG FIX: this assignment had been commented out, leaving
                # `loss` undefined at loss.backward() below.
                loss = criterion(batch_outputs, batch_labels)
                loss.backward()

                loss_value = loss.detach().cpu().item()
                losses += loss_value
                overall_losses += loss_value

                y_pred.extend(
                    torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())

                # nn.utils.clip_grad_norm_(optimizer.all_params, max_norm=config["clip"])  # gradient clipping
                for cur_optim, scheduler in zip(optimizer.optims,
                                                optimizer.schedulers):
                    cur_optim.step()
                    scheduler.step()
                optimizer.zero_grad()
                step += 1

            overall_losses /= batch_num
            overall_losses = scores.reformat(overall_losses, 4)
            score, train_f1 = scores.get_score(y_true, y_pred)
            # BUG FIX: score and train_f1 were passed in swapped order
            # relative to the labels in the format string.
            print("epoch:{},train_score:{}, train_f1:{}, overall_loss:{} ".
                  format(epoch, score, train_f1, overall_losses))

            # Evaluate on the dev set; track the best f1 for early stopping.
            _, dev_f1 = _eval(data=dev_data)
            if best_dev_f1 < dev_f1:
                best_dev_f1 = dev_f1
                early_stop = 0
                best_train_f1 = train_f1
                save_path = model_utils.save_checkpoint(
                    model, epoch,
                    save_folder=os.path.join(cfg.proj_path, "data/bert_nn"))
                print("save_path:{}".format(save_path))
            else:
                early_stop += 1
                if early_stop == EarlyStopEpochs:
                    # Dev metric stalled for too long: stop training.
                    break
            # BUG FIX: dev_f1 and score were passed in swapped order
            # relative to the labels in the format string.
            print(
                "early_stop:{}, score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}"
                .format(early_stop, score, dev_f1, best_train_f1,
                        best_dev_f1))