Example #1
        eps=ieps_adam_default,
        weight_decay=cnfg.weight_decay,
        amsgrad=use_ams,
        multi_gpu_optimizer=multi_gpu_optimizer,
        contiguous_parameters=contiguous_parameters)
else:
    optimizer = Optimizer(get_model_parameters(
        mymodel, contiguous_parameters=contiguous_parameters),
                          lr=init_lr,
                          betas=adam_betas_default,
                          eps=ieps_adam_default,
                          weight_decay=cnfg.weight_decay,
                          amsgrad=use_ams)
optimizer.zero_grad(set_to_none=optm_step_zero_grad_set_none)

lrsch = LRScheduler(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale)

state_holder = None if statesf is None and cnt_states is None else Holder(
    **{
        "optm": optimizer,
        "lrsch": lrsch,
        "pyrand": PyRandomState(),
        "thrand": THRandomState(use_cuda=use_cuda)
    })

num_checkpoint = cnfg.num_checkpoint
cur_checkid = 0

tminerr = inf_default

minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu,
                      use_amp)
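
The Holder above bundles the optimizer, the LR scheduler, and both the Python and Torch RNG states so that an interrupted run can be resumed deterministically. A minimal sketch of what the PyRandomState/THRandomState entries presumably capture, using only standard-library and PyTorch APIs (the function names here are illustrative, not this repo's):

import random
import torch

def dump_rng_states(use_cuda=False):
    # snapshot Python's and Torch's RNGs (and every CUDA device's, if used)
    states = {"py": random.getstate(), "th": torch.get_rng_state()}
    if use_cuda and torch.cuda.is_available():
        states["cuda"] = torch.cuda.get_rng_state_all()
    return states

def load_rng_states(states):
    # restore the snapshots so data shuffling and dropout replay identically
    random.setstate(states["py"])
    torch.set_rng_state(states["th"])
    if "cuda" in states:
        torch.cuda.set_rng_state_all(states["cuda"])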
Example #2
    mymodel = DataParallelMT(mymodel,
                             device_ids=cuda_devices,
                             output_device=cuda_device.index,
                             host_replicate=True,
                             gather_output=False)
    lossf = DataParallelCriterion(lossf,
                                  device_ids=cuda_devices,
                                  output_device=cuda_device.index,
                                  replicate_once=True)

fine_tune_state = cnfg.fine_tune_state
if fine_tune_state is not None:
    logger.info("Load optimizer state from: " + fine_tune_state)
    optimizer.load_state_dict(h5load(fine_tune_state))

lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step, scale=cnfg.lr_scale)

num_checkpoint = cnfg.num_checkpoint
cur_checkid = 0

tminerr = inf_default

minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu,
                      use_amp)
logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))),
                     ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr))))

if fine_tune_m is None:
    save_model(mymodel, wkdir + "init.h5", multi_gpu, logger)
    logger.info("Initial model saved")
else:
Example #3
    mymodel = DataParallelMT(mymodel,
                             device_ids=cuda_devices,
                             output_device=cuda_device.index,
                             host_replicate=True,
                             gather_output=False)
    lossf = DataParallelCriterion(lossf,
                                  device_ids=cuda_devices,
                                  output_device=cuda_device.index,
                                  replicate_once=True)

fine_tune_state = cnfg.fine_tune_state
if fine_tune_state is not None:
    logger.info("Load optimizer state from: " + fine_tune_state)
    optimizer.load_state_dict(torch.load(fine_tune_state))

lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step)
lrsch.step()

num_checkpoint = cnfg.num_checkpoint
cur_checkid = 0

tminerr = float("inf")

minloss, minerr = eva(vd, nvalid, mymodel, lossf, cuda_device, multi_gpu)
logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))),
                     ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr))))

if fine_tune_m is None:
    save_model(mymodel, wkdir + "init.t7", multi_gpu)
    logger.info("Initial model saved")
else:
Example #4
def main():
    ''' Main function '''
    rid = cnfg.runid  # run ID from the cnfg file; training outputs are stored under it
    if len(sys.argv) > 1:
        rid = sys.argv[1]  # override the run ID from the command line

    earlystop = cnfg.earlystop  # early-stop patience: epochs allowed without improvement
    epochs = cnfg.epochs  # maximum number of training epochs

    tokens_optm = cnfg.tokens_optm  # number of target tokens to accumulate per optimizer step

    done_tokens = tokens_optm

    batch_report = cnfg.batch_report
    report_eva = cnfg.report_eva

    use_cuda = cnfg.use_cuda
    gpuid = cnfg.gpuid

    # GPU configuration
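    # gpuid is presumably a string like "cuda:0, 1, 2": the text before the
    # first comma selects the primary device, and the ids after the colon
    # give the full device list parsed below.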
    if use_cuda and torch.cuda.is_available():
        if len(gpuid.split(",")) > 1:
            cuda_device = torch.device(gpuid[:gpuid.find(",")].strip())
            cuda_devices = [
                int(_.strip()) for _ in gpuid[gpuid.find(":") + 1:].split(",")
            ]
            print('[Info] using multiple gpu', cuda_devices)
            multi_gpu = True
        else:
            cuda_device = torch.device(gpuid)
            multi_gpu = False
            print('[Info] using single gpu', cuda_device)
            cuda_devices = None
        torch.cuda.set_device(cuda_device.index)
    else:
        use_cuda = False  # requested CUDA may be unavailable
        cuda_device = False
        print('[Info] using cpu')
        multi_gpu = False
        cuda_devices = None

    use_ams = cnfg.use_ams  # whether to use the AMSGrad variant of Adam

    save_optm_state = cnfg.save_optm_state

    save_every = cnfg.save_every

    epoch_save = cnfg.epoch_save

    remain_steps = cnfg.training_steps

    wkdir = "".join((cnfg.work_dir, cnfg.data_dir, "/", rid,
                     "/"))  # CREATING MODEL DIRECTORY
    if not path_check(wkdir):
        makedirs(wkdir)

    chkpt = None
    chkptoptf = None
    chkptstatesf = None
    if save_every is not None:
        chkpt = wkdir + "checkpoint.t7"
        if save_optm_state:
            chkptoptf = wkdir + "checkpoint.optm.t7"
            chkptstatesf = wkdir + "checkpoint.states"

    logger = get_logger(wkdir + "train.log")  # Logger object

    train_data = h5py.File(cnfg.train_data,
                           "r")  # training data read from h5 file
    valid_data = h5py.File(cnfg.dev_data,
                           "r")  # validation data read from h5 file

    print('[Info] Training and Validation data are loaded.')

    ntrain = int(
        train_data["ndata"][:][0])  # number of batches for TRAINING DATA
    nvalid = int(
        valid_data["ndata"][:][0])  # number of batches for VALIDATION DATA
    nwordi = int(train_data["nwordi"][:][0])  # VOCAB SIZE FOR SOURCE
    nwordt = int(
        train_data["nwordt"][:][0])  # VOCAB SIZE FOR PE [TODO: SIMILAR FOR MT]

    print('[INFO] number of batches for TRAINING DATA: ', ntrain)
    print('[INFO] number of batches for VALIDATION DATA: ', nvalid)
    print('[INFO] Source vocab size: ', nwordi)
    print('[INFO] Target vocab size: ', nwordt)

    random_seed = torch.initial_seed() if cnfg.seed is None else cnfg.seed

    rpyseed(random_seed)

    torch.manual_seed(random_seed)
    if use_cuda:
        torch.cuda.manual_seed_all(random_seed)
        print('[Info] Random seed also set for CUDA devices.')

    logger.info("Design models with seed: %d" % torch.initial_seed())

    mymodel = NMT(cnfg.isize, nwordi, nwordt, cnfg.num_src_layer,
                  cnfg.num_mt_layer, cnfg.num_pe_layer, cnfg.ff_hsize,
                  cnfg.drop, cnfg.attn_drop, cnfg.share_emb, cnfg.nhead,
                  cnfg.cache_len, cnfg.attn_hsize, cnfg.norm_output,
                  cnfg.bindDecoderEmb,
                  cnfg.forbidden_indexes)  # TODO NEED DOCUMENTATION

    tl = [("i" + str(i), "m" + str(i), "t" + str(i))
          for i in range(ntrain)]  # TRAINING LIST

    fine_tune_m = cnfg.fine_tune_m
    # Fine tune model

    if fine_tune_m is not None:
        logger.info("Load pre-trained model from: " + fine_tune_m)
        mymodel = load_model_cpu(fine_tune_m, mymodel)

    lossf = LabelSmoothingLoss(nwordt,
                               cnfg.label_smoothing,
                               ignore_index=0,
                               reduction='sum',
                               forbidden_index=cnfg.forbidden_indexes)
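    # reduction='sum' keeps the loss a per-token sum, which matches the
    # token-count-based optimizer stepping; ignore_index=0 skips padding.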

    if use_cuda:
        mymodel.to(cuda_device)
        lossf.to(cuda_device)

    if fine_tune_m is None:
        for p in mymodel.parameters():
            if p.requires_grad and (p.dim() > 1):
                xavier_uniform_(p)
        if cnfg.src_emb is not None:
            _emb = torch.load(cnfg.src_emb, map_location='cpu')
            if nwordi < _emb.size(0):
                _emb = _emb.narrow(0, 0, nwordi).contiguous()
            if use_cuda:
                _emb = _emb.to(cuda_device)
            mymodel.enc.wemb.weight.data = _emb
            if cnfg.freeze_srcemb:
                mymodel.enc.wemb.weight.requires_grad_(False)
            else:
                mymodel.enc.wemb.weight.requires_grad_(True)
        if cnfg.tgt_emb is not None:
            _emb = torch.load(cnfg.tgt_emb, map_location='cpu')
            if nwordt < _emb.size(0):
                _emb = _emb.narrow(0, 0, nwordt).contiguous()
            if use_cuda:
                _emb = _emb.to(cuda_device)
            mymodel.dec.wemb.weight.data = _emb
            if cnfg.freeze_tgtemb:
                mymodel.dec.wemb.weight.requires_grad_(False)
            else:
                mymodel.dec.wemb.weight.requires_grad_(True)
        mymodel.apply(init_fixing)

    # lr will be overwritten by GoogleLR before it is used
    optimizer = optim.Adam(mymodel.parameters(),
                           lr=1e-4,
                           betas=(0.9, 0.98),
                           eps=1e-9,
                           weight_decay=cnfg.weight_decay,
                           amsgrad=use_ams)
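    # betas=(0.9, 0.98) and eps=1e-9 match the Adam settings used for the
    # original Transformer in "Attention Is All You Need".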

    # TODO: Need to implement
    '''if multi_gpu:
        # mymodel = nn.DataParallel(mymodel, device_ids=cuda_devices, output_device=cuda_device.index)
        mymodel = DataParallelMT(mymodel, device_ids=cuda_devices, output_device=cuda_device.index, host_replicate=True,
                                 gather_output=False)
        lossf = DataParallelCriterion(lossf, device_ids=cuda_devices, output_device=cuda_device.index,
                                      replicate_once=True)'''

    # Load fine-tune optimizer state if specified
    fine_tune_state = cnfg.fine_tune_state
    if fine_tune_state is not None:
        logger.info("Load optimizer state from: " + fine_tune_state)
        optimizer.load_state_dict(torch.load(fine_tune_state))

    lrsch = GoogleLR(optimizer, cnfg.isize, cnfg.warm_step)
    lrsch.step()
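    # GoogleLR presumably implements the inverse-square-root warmup schedule
    # from the Transformer paper:
    #   lr = isize ** -0.5 * min(step ** -0.5, step * warm_step ** -1.5)
    # so the initial step() applies the step-1 learning rate before training.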

    num_checkpoint = cnfg.num_checkpoint
    cur_checkid = 0  # current checkpoint index

    tminerr = float("inf")  # minimum error during training

    minloss, minerr = eva(valid_data, nvalid, mymodel, lossf, cuda_device,
                          multi_gpu)
    logger.info("".join(("Init lr: ", ",".join(tostr(getlr(optimizer))),
                         ", Dev Loss/Error: %.3f %.2f" % (minloss, minerr))))

    # ================================== Fine tune ==================================
    if fine_tune_m is None:
        save_model(mymodel, wkdir + "init.t7", multi_gpu)
        logger.info("Initial model saved")
    else:
        cnt_states = cnfg.train_statesf
        if (cnt_states is not None) and path_check(cnt_states):
            logger.info("Continue last epoch")
            args = {}
            tminerr, done_tokens, cur_checkid, remain_steps, _ = train(
                train_data, load_states(cnt_states), valid_data, nvalid,
                optimizer, lrsch, mymodel, lossf, cuda_device, logger,
                done_tokens, multi_gpu, tokens_optm, batch_report, save_every,
                chkpt, chkptoptf, chkptstatesf, num_checkpoint, cur_checkid,
                report_eva, remain_steps, False)
            vloss, vprec = eva(valid_data, nvalid, mymodel, lossf, cuda_device,
                               multi_gpu)
            logger.info(
                "Epoch: 0, train loss: %.3f, valid loss/error: %.3f %.2f" %
                (tminerr, vloss, vprec))
            save_model(
                mymodel,
                wkdir + "train_0_%.3f_%.3f_%.2f.t7" % (tminerr, vloss, vprec),
                multi_gpu)
            if save_optm_state:
                torch.save(
                    optimizer.state_dict(), wkdir +
                    "train_0_%.3f_%.3f_%.2f.optm.t7" % (tminerr, vloss, vprec))
            logger.info("New best model saved")

        # assume a continued model has already been through the sorted-data pass, so shuffle the training list
        shuffle(tl)
    # ====================================================================================================

    # ================================ Dynamic sentence sampling ================================
    if cnfg.dss_ws is not None and 0.0 < cnfg.dss_ws < 1.0:
        dss_ws = int(cnfg.dss_ws * ntrain)
        _Dws = {}
        _prev_Dws = {}
        _crit_inc = {}
        if cnfg.dss_rm is not None and 0.0 < cnfg.dss_rm < 1.0:
            dss_rm = int(cnfg.dss_rm * ntrain * (1.0 - cnfg.dss_ws))
        else:
            dss_rm = 0
    else:
        dss_ws = 0
        dss_rm = 0
        _Dws = None
    # ====================================================================================================
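    # Judging by the commented-out block in the epoch loop below, dynamic
    # sentence sampling re-weights batches by their relative loss improvement
    # (_ploss - _value) / _ploss, keeping dss_ws batches by that criterion
    # and sampling a further dss_rm from the remainder.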

    namin = 0  # epochs since the last improvement on the dev set

    # TRAINING EPOCH STARTS
    for i in range(1, epochs + 1):
        terr, done_tokens, cur_checkid, remain_steps, _Dws = train(
            train_data, tl, valid_data, nvalid, optimizer, lrsch, mymodel,
            lossf, cuda_device, logger, done_tokens, multi_gpu, tokens_optm,
            batch_report, save_every, chkpt, chkptoptf, chkptstatesf,
            num_checkpoint, cur_checkid, report_eva, remain_steps, dss_ws > 0)
        # VALIDATION
        vloss, vprec = eva(valid_data, nvalid, mymodel, lossf, cuda_device,
                           multi_gpu)
        logger.info(
            "Epoch: %d ||| train loss: %.3f ||| valid loss/error: %.3f/%.2f" %
            (i, terr, vloss, vprec))

        # CONDITION TO SAVE MODELS
        if (vprec <= minerr) or (vloss <= minloss):
            save_model(
                mymodel,
                wkdir + "eva_%d_%.3f_%.3f_%.2f.t7" % (i, terr, vloss, vprec),
                multi_gpu)
            if save_optm_state:
                torch.save(
                    optimizer.state_dict(), wkdir +
                    "eva_%d_%.3f_%.3f_%.2f.optm.t7" % (i, terr, vloss, vprec))
            logger.info("New best model saved"
                        )  # [TODO CALCULATE BLEU FOR VALIDATION SET]

            namin = 0

            if vprec < minerr:
                minerr = vprec
            if vloss < minloss:
                minloss = vloss

        else:
            if terr < tminerr:
                tminerr = terr
                save_model(
                    mymodel, wkdir + "train_%d_%.3f_%.3f_%.2f.t7" %
                    (i, terr, vloss, vprec), multi_gpu)
                if save_optm_state:
                    torch.save(
                        optimizer.state_dict(),
                        wkdir + "train_%d_%.3f_%.3f_%.2f.optm.t7" %
                        (i, terr, vloss, vprec))
            elif epoch_save:
                save_model(
                    mymodel, wkdir + "epoch_%d_%.3f_%.3f_%.2f.t7" %
                    (i, terr, vloss, vprec), multi_gpu)

            namin += 1
            # EARLY-STOP CONDITION
            if namin >= earlystop:
                if done_tokens > 0:
                    if multi_gpu:
                        mymodel.collect_gradients()
                    optimizer.step()
                    # lrsch.step()
                    done_tokens = 0
                # optimizer.zero_grad()
                logger.info("early stop")
                break

        if remain_steps is not None and remain_steps <= 0:
            logger.info("Last training step reached")
            break
        '''if dss_ws > 0:
            if _prev_Dws:
                for _key, _value in _Dws.items():
                    if _key in _prev_Dws:
                        _ploss = _prev_Dws[_key]
                        _crit_inc[_key] = (_ploss - _value) / _ploss
                tl = dynamic_sample(_crit_inc, dss_ws, dss_rm)
            _prev_Dws = _Dws'''

        shuffle(tl)
        '''oldlr = getlr(optimizer)
        lrsch.step(terr)
        newlr = getlr(optimizer)
        if updated_lr(oldlr, newlr):
          logger.info("".join(("lr update from: ", ",".join(tostr(oldlr)), ", to: ", ",".join(tostr(newlr)))))
          hook_lr_update(optimizer, use_ams)'''

    if done_tokens > 0:
        if multi_gpu:
            mymodel.collect_gradients()
        optimizer.step()
    # lrsch.step()
    # done_tokens = 0
    # optimizer.zero_grad()

    save_model(mymodel, wkdir + "last.t7", multi_gpu)
    if save_optm_state:
        torch.save(optimizer.state_dict(), wkdir + "last.optm.t7")
    logger.info("model saved")

    train_data.close()
    valid_data.close()
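
The done_tokens/tokens_optm pair threaded through train() implies token-based gradient accumulation: gradients are summed until roughly tokens_optm target tokens have been processed, and only then is an optimizer step taken, which is also why any leftover done_tokens are flushed with a final step above. A minimal sketch of the pattern, with model, lossf, and the batch format as hypothetical stand-ins rather than this repo's actual train() signature:

def train_step_by_tokens(model, lossf, optimizer, lrsch, batches, tokens_optm):
    done_tokens = 0
    for src, tgt, ntokens in batches:
        loss = lossf(model(src), tgt)  # reduction='sum' keeps loss proportional to token count
        loss.backward()
        done_tokens += ntokens
        if done_tokens >= tokens_optm:
            # the effective batch size is measured in tokens, not sentences
            optimizer.step()
            lrsch.step()
            optimizer.zero_grad()
            done_tokens = 0
    return done_tokens  # leftover tokens are flushed by the caller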