        eps=ieps_adam_default, weight_decay=cnfg.weight_decay, amsgrad=use_ams,
        multi_gpu_optimizer=multi_gpu_optimizer, contiguous_parameters=contiguous_parameters)
else:
    optimizer = Optimizer(
        get_model_parameters(mymodel, contiguous_parameters=contiguous_parameters),
        lr=init_lr, betas=adam_betas_default, eps=ieps_adam_default,
        weight_decay=cnfg.weight_decay, amsgrad=use_ams)
optimizer.zero_grad(set_to_none=optm_step_zero_grad_set_none)

lrsch = LRScheduler(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale)

# Bundle everything needed to resume training (optimizer, scheduler, RNG states).
state_holder = None if statesf is None and cnt_states is None else Holder(**{
    "optm": optimizer,
    "lrsch": lrsch,
    "pyrand": PyRandomState(),
    "thrand": THRandomState(use_cuda=use_cuda)})

num_checkpoint = cnfg.num_checkpoint
cur_checkid = 0
tminerr = inf_default

minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu, use_amp)
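# --- Illustrative sketch, not part of the script above. ---
# LRScheduler / GoogleLR are assumed here to follow the inverse-square-root
# ("Noam") warmup schedule of Vaswani et al. (2017); `noam_lr` is a hypothetical
# helper that only spells out that formula.
def noam_lr(step, isize, warm_step, scale=1.0):
    """lr = scale * isize^-0.5 * min(step^-0.5, step * warm_step^-1.5)"""
    step = max(step, 1)  # guard against step 0
    return scale * (isize ** -0.5) * min(step ** -0.5, step * (warm_step ** -1.5))
# Example: with isize=512 and warm_step=8000, the lr rises roughly linearly for
# the first 8000 steps and then decays as 1/sqrt(step).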
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)

fine_tune_state = cnfg.fine_tune_state
if fine_tune_state is not None:
    logger.info("Load optimizer state from: " + fine_tune_state)
    optimizer.load_state_dict(h5load(fine_tune_state))

lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale)

num_checkpoint = cnfg.num_checkpoint
cur_checkid = 0
tminerr = inf_default

minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu, use_amp)
logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))), ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr))))

if fine_tune_m is None:
    save_model(mymodel, wkdir + "init.h5", multi_gpu, logger)
    logger.info("Initial model saved")
else:
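# --- Illustrative sketch, not part of the script above. ---
# eva(..., use_amp) is assumed to run its forward passes under PyTorch automatic
# mixed precision when use_amp is True; a minimal version of that pattern (with
# hypothetical `model`, `batch`, `target` and `lossf` objects) looks like this:
import torch

def amp_eval_step(model, lossf, batch, target, use_amp=False):
    model.eval()
    with torch.no_grad(), torch.cuda.amp.autocast(enabled=use_amp):
        output = model(batch)          # forward in reduced precision where safe
        loss = lossf(output, target)   # loss computed under the same autocast context
    return loss.item()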
mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)

fine_tune_state = cnfg.fine_tune_state
if fine_tune_state is not None:
    logger.info("Load optimizer state from: " + fine_tune_state)
    optimizer.load_state_dict(torch.load(fine_tune_state))

lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step)
lrsch.step()

num_checkpoint = cnfg.num_checkpoint
cur_checkid = 0
tminerr = float("inf")

minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu)
logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))), ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr))))

if fine_tune_m is None:
    save_model(mymodel, wkdir + "init.t7", multi_gpu)
    logger.info("Initial model saved")
else:
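# --- Illustrative sketch, not part of the script above. ---
# The fine_tune_state file loaded above is assumed to be a torch-serialized
# optimizer state dict; the in-memory round trip below mirrors what saving a
# "*.optm.t7" file and the fine_tune_state branch do (all names are hypothetical).
import io
import torch
from torch import nn, optim

_m = nn.Linear(4, 2)
_opt = optim.Adam(_m.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)
_buf = io.BytesIO()
torch.save(_opt.state_dict(), _buf)       # analogous to torch.save(optimizer.state_dict(), ...)
_buf.seek(0)
_opt.load_state_dict(torch.load(_buf))    # analogous to optimizer.load_state_dict(torch.load(...))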
def main():
    '''Main training entry point.'''

    rid = cnfg.runid  # run ID from the cnfg file; training artifacts are stored under it
    if len(sys.argv) > 1:
        rid = sys.argv[1]  # optionally take the run ID from the command line

    earlystop = cnfg.earlystop  # early-stopping patience
    epochs = cnfg.epochs
    tokens_optm = cnfg.tokens_optm  # target tokens to accumulate per optimizer step (see the accumulation sketch after main())
    done_tokens = tokens_optm
    batch_report = cnfg.batch_report
    report_eva = cnfg.report_eva
    use_cuda = cnfg.use_cuda
    gpuid = cnfg.gpuid

    # GPU configuration
    if use_cuda and torch.cuda.is_available():
        use_cuda = True
        if len(gpuid.split(",")) > 1:
            cuda_device = torch.device(gpuid[:gpuid.find(",")].strip())
            cuda_devices = [int(_.strip()) for _ in gpuid[gpuid.find(":") + 1:].split(",")]
            print('[Info] using multiple gpu', cuda_devices)
            multi_gpu = True
        else:
            cuda_device = torch.device(gpuid)
            multi_gpu = False
            print('[Info] using single gpu', cuda_device)
            cuda_devices = None
        torch.cuda.set_device(cuda_device.index)
    else:
        cuda_device = False
        print('[Info] using cpu')
        multi_gpu = False
        cuda_devices = None

    use_ams = cnfg.use_ams  # whether to use the AMSGrad variant of Adam
    save_optm_state = cnfg.save_optm_state
    save_every = cnfg.save_every
    epoch_save = cnfg.epoch_save
    remain_steps = cnfg.training_steps

    # create the model directory
    wkdir = "".join((cnfg.work_dir, cnfg.data_dir, "/", rid, "/"))
    if not path_check(wkdir):
        makedirs(wkdir)

    chkpt = None
    chkptoptf = None
    chkptstatesf = None
    if save_every is not None:
        chkpt = wkdir + "checkpoint.t7"
        if save_optm_state:
            chkptoptf = wkdir + "checkpoint.optm.t7"
            chkptstatesf = wkdir + "checkpoint.states"

    logger = get_logger(wkdir + "train.log")

    train_data = h5py.File(cnfg.train_data, "r")  # training data (HDF5)
    valid_data = h5py.File(cnfg.dev_data, "r")  # validation data (HDF5)
    print('[Info] Training and validation data are loaded.')

    ntrain = int(train_data["ndata"][:][0])  # number of training batches
    nvalid = int(valid_data["ndata"][:][0])  # number of validation batches
    nwordi = int(train_data["nwordi"][:][0])  # source vocabulary size
    nwordt = int(train_data["nwordt"][:][0])  # target (PE) vocabulary size [TODO: similar for MT]
    print('[Info] number of training batches:', ntrain)
    print('[Info] number of validation batches:', nvalid)
    print('[Info] source vocab size:', nwordi)
    print('[Info] target vocab size:', nwordt)

    random_seed = torch.initial_seed() if cnfg.seed is None else cnfg.seed
    rpyseed(random_seed)
    if use_cuda:
        torch.cuda.manual_seed_all(random_seed)
        print('[Info] setting random seed on CUDA.')
    else:
        torch.manual_seed(random_seed)

    logger.info("Design models with seed: %d" % torch.initial_seed())
    mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.num_src_layer, cnfg.num_mt_layer,
                  cnfg.num_pe_layer, cnfg.ff_hsize, cnfg.drop, cnfg.attn_drop,
                  cnfg.share_emb, cnfg.nhead, cnfg.cache_len, cnfg.attn_hsize,
                  cnfg.norm_output, cnfg.bindDecoderEmb, cnfg.forbidden_indexes)  # TODO: needs documentation

    tl = [("i" + str(i), "m" + str(i), "t" + str(i)) for i in range(ntrain)]  # training list of (src, mt, pe) batch keys

    fine_tune_m = cnfg.fine_tune_m  # pre-trained model to fine-tune, if any
    if fine_tune_m is not None:
        logger.info("Load pre-trained model from: " + fine_tune_m)
        mymodel = load_model_cpu(fine_tune_m, mymodel)

    lossf = LabelSmoothingLoss(nwordt, cnfg.label_smoothing, ignore_index=0,
                               reduction='sum', forbidden_index=cnfg.forbidden_indexes)

    if use_cuda:
        mymodel.to(cuda_device)
        lossf.to(cuda_device)

    if fine_tune_m is None:
        for p in mymodel.parameters():
            if p.requires_grad and (p.dim() > 1):
                xavier_uniform_(p)
        if cnfg.src_emb is not None:
            _emb = torch.load(cnfg.src_emb, map_location='cpu')
            if nwordi < _emb.size(0):
                _emb = _emb.narrow(0, 0, nwordi).contiguous()
            if use_cuda:
                _emb = _emb.to(cuda_device)
            mymodel.enc.wemb.weight.data = _emb
            if cnfg.freeze_srcemb:
                mymodel.enc.wemb.weight.requires_grad_(False)
            else:
                mymodel.enc.wemb.weight.requires_grad_(True)
        if cnfg.tgt_emb is not None:
            _emb = torch.load(cnfg.tgt_emb, map_location='cpu')
            if nwordt < _emb.size(0):
                _emb = _emb.narrow(0, 0, nwordt).contiguous()
            if use_cuda:
                _emb = _emb.to(cuda_device)
            mymodel.dec.wemb.weight.data = _emb
            if cnfg.freeze_tgtemb:
                mymodel.dec.wemb.weight.requires_grad_(False)
            else:
                mymodel.dec.wemb.weight.requires_grad_(True)
        mymodel.apply(init_fixing)

    # The lr set here is overwritten by GoogleLR before it is used.
    optimizer = optim.Adam(mymodel.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9,
                           weight_decay=cnfg.weight_decay, amsgrad=use_ams)

    # TODO: multi-GPU wrapping still needs to be implemented.
    '''if multi_gpu:
        # mymodel = nn.DataParallel(mymodel, device_ids=cuda_devices, output_device=cuda_device.index)
        mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True, gather_output=False)
        lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index, replicate_once=True)'''

    # Load the optimizer state for fine-tuning, if one is declared.
    fine_tune_state = cnfg.fine_tune_state
    if fine_tune_state is not None:
        logger.info("Load optimizer state from: " + fine_tune_state)
        optimizer.load_state_dict(torch.load(fine_tune_state))

    lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step)
    lrsch.step()

    num_checkpoint = cnfg.num_checkpoint
    cur_checkid = 0  # current checkpoint index
    tminerr = float("inf")  # minimum training error seen so far

    minloss, minerr = eva(valid_data, nvalid, mymodel, lossf, cuda_device, multi_gpu)
    logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))), ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr))))

    # ============================================ Fine tune ============================================
    if fine_tune_m is None:
        save_model(mymodel, wkdir + "init.t7", multi_gpu)
        logger.info("Initial model saved")
    else:
        cnt_states = cnfg.train_statesf
        if (cnt_states is not None) and path_check(cnt_states):
            logger.info("Continue last epoch")
            tminerr, done_tokens, cur_checkid, remain_steps, _ = train(
                train_data, load_states(cnt_states), valid_data, nvalid, optimizer, lrsch,
                mymodel, lossf, cuda_device, logger, done_tokens, multi_gpu, tokens_optm,
                batch_report, save_every, chkpt, chkptoptf, chkptstatesf, num_checkpoint,
                cur_checkid, report_eva, remain_steps, False)
            vloss, vprec = eva(valid_data, nvalid, mymodel, lossf, cuda_device, multi_gpu)
            logger.info("Epoch: 0, train loss: %.3f, valid loss/error: %.3f %.2f" % (tminerr, vloss, vprec))
            save_model(mymodel, wkdir + "train_0_%.3f_%.3f_%.2f.t7" % (tminerr, vloss, vprec), multi_gpu)
            if save_optm_state:
                torch.save(optimizer.state_dict(), wkdir + "train_0_%.3f_%.3f_%.2f.optm.t7" % (tminerr, vloss, vprec))
            logger.info("New best model saved")

    # Assume the continued model has already been through the sorted batches, so shuffle the training list.
    shuffle(tl)
    # ====================================================================================================

    # ================================== Dynamic sentence sampling ======================================
    if cnfg.dss_ws is not None and cnfg.dss_ws > 0.0 and cnfg.dss_ws < 1.0:
        dss_ws = int(cnfg.dss_ws * ntrain)
        _Dws = {}
        _prev_Dws = {}
        _crit_inc = {}
        if cnfg.dss_rm is not None and cnfg.dss_rm > 0.0 and cnfg.dss_rm < 1.0:
            dss_rm = int(cnfg.dss_rm * ntrain * (1.0 - cnfg.dss_ws))
        else:
            dss_rm = 0
    else:
        dss_ws = 0
        dss_rm = 0
        _Dws = None
    # ====================================================================================================

    namin = 0  # epochs since the last dev improvement

    # TRAINING EPOCHS
    for i in range(1, epochs + 1):
        terr, done_tokens, cur_checkid, remain_steps, _Dws = train(
            train_data, tl, valid_data, nvalid, optimizer, lrsch, mymodel, lossf,
            cuda_device, logger, done_tokens, multi_gpu, tokens_optm, batch_report,
            save_every, chkpt, chkptoptf, chkptstatesf, num_checkpoint, cur_checkid,
            report_eva, remain_steps, dss_ws > 0)

        # VALIDATION
        vloss, vprec = eva(valid_data, nvalid, mymodel, lossf, cuda_device, multi_gpu)
        logger.info("Epoch: %d ||| train loss: %.3f ||| valid loss/error: %.3f/%.2f" % (i, terr, vloss, vprec))

        # CONDITION TO SAVE MODELS
        if (vprec <= minerr) or (vloss <= minloss):
            save_model(mymodel, wkdir + "eva_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu)
            if save_optm_state:
                torch.save(optimizer.state_dict(), wkdir + "eva_%d_%.3f_%.3f_%.2f.optm.t7" % (i, terr, vloss, vprec))
            logger.info("New best model saved")  # [TODO: calculate BLEU on the validation set]
            namin = 0
            if vprec < minerr:
                minerr = vprec
            if vloss < minloss:
                minloss = vloss
        else:
            if terr < tminerr:
                tminerr = terr
                save_model(mymodel, wkdir + "train_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu)
                if save_optm_state:
                    torch.save(optimizer.state_dict(), wkdir + "train_%d_%.3f_%.3f_%.2f.optm.t7" % (i, terr, vloss, vprec))
            elif epoch_save:
                save_model(mymodel, wkdir + "epoch_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec), multi_gpu)
            namin += 1

            # CONDITION TO EARLY STOP
            if namin >= earlystop:
                if done_tokens > 0:
                    if multi_gpu:
                        mymodel.collect_gradients()
                    optimizer.step()
                    # lrsch.step()
                    done_tokens = 0
                    # optimizer.zero_grad()
                logger.info("early stop")
                break

        if remain_steps is not None and remain_steps <= 0:
            logger.info("Last training step reached")
            break

        '''if dss_ws > 0:
            if _prev_Dws:
                for _key, _value in _Dws.items():
                    if _key in _prev_Dws:
                        _ploss = _prev_Dws[_key]
                        _crit_inc[_key] = (_ploss - _value) / _ploss
                tl = dynamic_sample(_crit_inc, dss_ws, dss_rm)
            _prev_Dws = _Dws'''

        shuffle(tl)

        '''oldlr = getlr(optimizer)
        lrsch.step(terr)
        newlr = getlr(optimizer)
        if updated_lr(oldlr, newlr):
            logger.info("".join(("lr update from: ", ",".join(tostr(oldlr)), ", to: ", ",".join(tostr(newlr)))))
            hook_lr_update(optimizer, use_ams)'''

    # Flush any gradients still accumulated when training ends.
    if done_tokens > 0:
        if multi_gpu:
            mymodel.collect_gradients()
        optimizer.step()
        # lrsch.step()
        # done_tokens = 0
        # optimizer.zero_grad()

    save_model(mymodel, wkdir + "last.t7", multi_gpu)
    if save_optm_state:
        torch.save(optimizer.state_dict(), wkdir + "last.optm.t7")
    logger.info("model saved")

    train_data.close()
    valid_data.close()
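# --- Illustrative sketch, not part of the script above. ---
# train() is assumed to accumulate gradients until roughly `tokens_optm` target
# tokens have been processed (tracked in `done_tokens`) before calling
# optimizer.step(); the hypothetical loop below shows that pattern, which is also
# why main() ends with an "if done_tokens > 0" flush of the leftover gradients.
def accumulate_by_tokens(batches, model, lossf, optimizer, tokens_optm):
    done_tokens = 0
    for src, tgt in batches:
        loss = lossf(model(src), tgt)
        loss.backward()                       # gradients accumulate across batches
        done_tokens += int((tgt != 0).sum())  # count non-padding target tokens
        if done_tokens >= tokens_optm:
            optimizer.step()                  # one update per ~tokens_optm tokens
            optimizer.zero_grad()
            done_tokens = 0
    return done_tokens                        # leftover tokens still holding gradients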