**{ "optm": optimizer, "lrsch": lrsch, "pyrand": PyRandomState(), "thrand": THRandomState(use_cuda=use_cuda) }) num_checkpoint = cnfg.num_checkpoint cur_checkid = 0 tminerr = inf_default minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu, use_amp) logger.info("Init lr: %s, Dev Loss/Error: %.3f %.2f" % ( " ".join(tostr(getlr(optimizer))), minloss, minerr, )) if fine_tune_m is None: save_model(mymodel, wkdir + "init.h5", multi_gpu, print_func=logger.info) logger.info("Initial model saved") else: if cnt_states is not None: logger.info("Loading training states") _remain_states = state_holder.load_state_dict(torch.load(cnt_states)) remain_steps, cur_checkid = _remain_states[ "remain_steps"], _remain_states["checkpoint_id"] if "training_list" in _remain_states: _ctl = _remain_states["training_list"]
state_holder = None if statesf is None and cnt_states is None else Holder( **{ "optm": optimizer, "lrsch": lrsch, "pyrand": PyRandomState(), "thrand": THRandomState(use_cuda=use_cuda) }) num_checkpoint = cnfg.num_checkpoint cur_checkid = 0 tminerr = inf_default minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu, use_amp) logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))), ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr)))) if fine_tune_m is None: save_model(mymodel, wkdir + "init.h5", multi_gpu, print_func=logger.info) logger.info("Initial model saved") else: if cnt_states is not None: logger.info("Loading training states") _remain_states = state_holder.load_state_dict(torch.load(cnt_states)) remain_steps, cur_checkid = _remain_states[ "remain_steps"], _remain_states["checkpoint_id"] if "training_list" in _remain_states: _ctl = _remain_states["training_list"] else: shuffle(tl)
	# build the optimizer through the model when a multi-GPU optimizer is configured
	optimizer = mymodel.build_optimizer(Optimizer, lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams, multi_gpu_optimizer=multi_gpu_optimizer, contiguous_parameters=contiguous_parameters)
else:
	# otherwise instantiate the optimizer directly over the (optionally contiguous) model parameters
	optimizer = Optimizer(get_model_parameters(mymodel, contiguous_parameters=contiguous_parameters), lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams)
optimizer.zero_grad(set_to_none=optm_step_zero_grad_set_none)

lrsch = LRScheduler(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale)

# bundle optimizer, LR scheduler and RNG states only when training states are to be saved (statesf) or resumed (cnt_states)
state_holder = None if statesf is None and cnt_states is None else Holder(**{"optm": optimizer, "lrsch": lrsch, "pyrand": PyRandomState(), "thrand": THRandomState(use_cuda=use_cuda)})

num_checkpoint = cnfg.num_checkpoint
cur_checkid = 0

tminerr = inf_default

# evaluate the initial model on the development set and log the starting learning rate
minloss, minerr = eva(vd, vl, mymodel, lossf, cuda_device, multi_gpu, use_amp)
logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))), ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr))))

if fine_tune_m is None:
	save_model(mymodel, wkdir + "init.h5", multi_gpu, print_func=logger.info)
	logger.info("Initial model saved")
else:
	if cnt_states is not None:
		# resume fine-tuning: restore optimizer/scheduler/RNG states and the step/checkpoint counters
		logger.info("Loading training states")
		_remain_states = state_holder.load_state_dict(torch.load(cnt_states))
		remain_steps, cur_checkid = _remain_states["remain_steps"], _remain_states["checkpoint_id"]
		if "training_list" in _remain_states:
			_ctl = _remain_states["training_list"]
		else:
			shuffle(tl)
			_ctl = tl
		tminerr, done_tokens, cur_checkid, remain_steps, _ = train(td, _ctl, vd, vl, optimizer, lrsch, mymodel, lossf, cuda_device, logger, done_tokens, multi_gpu, tokens_optm, batch_report, save_every, chkpf, state_holder, statesf, num_checkpoint, cur_checkid, report_eva, remain_steps, False, False, scaler)
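
# Illustrative sketch only, not the toolkit's Holder/PyRandomState/THRandomState:
# it shows the state-holder pattern assumed above, i.e. bundling objects that
# expose state_dict()/load_state_dict() (optimizer, LR scheduler, RNG wrappers)
# so torch.save/torch.load can checkpoint and resume a run, while non-module
# entries such as "remain_steps" and "checkpoint_id" are passed back to the caller.
# The names SimpleStateHolder, PyRNGState and TorchRNGState are hypothetical.

import random

import torch

class PyRNGState:

	# wraps Python's global random module behind the state_dict protocol

	def state_dict(self):

		return {"state": random.getstate()}

	def load_state_dict(self, sd):

		random.setstate(sd["state"])

class TorchRNGState:

	# wraps torch's CPU RNG the same way

	def state_dict(self):

		return {"state": torch.get_rng_state()}

	def load_state_dict(self, sd):

		torch.set_rng_state(sd["state"])

class SimpleStateHolder:

	# collects objects exposing state_dict/load_state_dict under fixed keys

	def __init__(self, **modules):

		self.modules = modules

	def state_dict(self, **extra):

		# module states plus any extra scalars (e.g. remain_steps, checkpoint_id)
		_sd = {_k: _v.state_dict() for _k, _v in self.modules.items()}
		_sd.update(extra)

		return _sd

	def load_state_dict(self, sd):

		# restore the registered modules, return the leftover (non-module) entries
		for _k, _v in self.modules.items():
			if _k in sd:
				_v.load_state_dict(sd[_k])

		return {_k: _v for _k, _v in sd.items() if _k not in self.modules}

# usage sketch: saving mirrors the statesf path, loading mirrors the cnt_states path
#	holder = SimpleStateHolder(pyrand=PyRNGState(), thrand=TorchRNGState())
#	torch.save(holder.state_dict(remain_steps=1000, checkpoint_id=3), "train.states")
#	_rest = holder.load_state_dict(torch.load("train.states"))
#	remain_steps, cur_checkid = _rest["remain_steps"], _rest["checkpoint_id"]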