Example #1
def main():
    model = Model(cf.segment_class, cf.level_class, cf.image_scale)
    if torch.cuda.is_available():
        model.cuda()
    else:
        print("No cuda QAQ")
    trainer = Trainer(model,
                      torch.optim.Adam(model.parameters(), cf.lr),
                      epoch=cf.epoch,
                      use_cuda=torch.cuda.is_available(),
                      loss_weight=cf.loss_weight,
                      loss_func=3)
    trainer.train(init_from_exist=cf.import_model)
    trainer.test()
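The `cf` object above is an external config module the snippet never shows. A minimal stand-in, with every attribute inferred from the call sites and all values purely illustrative:

# Hypothetical config; attribute names come from the usage above, values are made up.
from types import SimpleNamespace

cf = SimpleNamespace(
    segment_class=5,         # number of segment classes (assumed)
    level_class=4,           # number of level classes (assumed)
    image_scale=0.5,         # input image scaling factor (assumed)
    lr=1e-3,                 # Adam learning rate
    epoch=50,                # total training epochs
    loss_weight=(1.0, 1.0),  # relative loss weighting (assumed)
    import_model=None,       # checkpoint path to resume from, or None
)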
Example #2
def main(alpha=None, gamma=None):
    config = Config(args.config_path)
    if args.mode:
        config.mode = args.mode
    if args.train_id:
        config.train_id = args.train_id
    if args.num_epochs:
        config.num_epochs = args.num_epochs
    if args.base_dir:
        config.base_dir = args.base_dir

    config.use_bayes_opt = args.use_bayes_opt
    config.use_preprocess = args.use_preprocess
    config.use_swa = args.use_swa

    train_path = os.path.join(config.base_dir, config.train_dir, config.train_id)
    result_path = os.path.join(config.base_dir, config.result_dir, config.train_id)
    data_path = os.path.join(config.base_dir, config.data_dir)

    if not os.path.isdir(train_path):
        os.mkdir(train_path)

    if not os.path.isdir(result_path):
        os.mkdir(result_path)

    init_logger(os.path.join(result_path, 'log.txt'))
    set_seed(config)

    # get data loader
    tokenizer = AutoTokenizer.from_pretrained(config.bert_model_name)

    param = {"root": data_path, "batch_size": config.batch_size, "tokenizer": tokenizer, "config": config}
    train_dataloader = data_loader(**param, phase='train')
    validate_dataloader = data_loader(**param, phase='validate')
    test_dataloader = data_loader(**param, phase='test')

    # create model; check the config
    model = Trainer(config, train_dataloader, validate_dataloader, test_dataloader)

    if config.mode == 'train':
        result = model.train(alpha=alpha, gamma=gamma)
    elif config.mode == 'test':
        model.load_model(config.model_weight_file)
        result = model.evaluate('test')
    else:
        raise ValueError(f"Unknown mode: {config.mode}")  # result would otherwise be unbound

    del model
    return result
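Since `config.use_bayes_opt` is wired through above, `main(alpha, gamma)` is presumably invoked repeatedly by a hyperparameter search. A sketch of how that could look with the third-party bayesian-optimization package; the bounds and iteration counts are illustrative, not from the original project:

from bayes_opt import BayesianOptimization

def objective(alpha, gamma):
    # main() is assumed to return a validation score to maximize.
    return main(alpha=alpha, gamma=gamma)

bo = BayesianOptimization(
    f=objective,
    pbounds={"alpha": (0.1, 1.0), "gamma": (0.5, 5.0)},
    random_state=42,
)
bo.maximize(init_points=2, n_iter=10)
print(bo.max)  # best score and the parameters that produced it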
Example #3
def main():
    # get args
    parser = argparse.ArgumentParser(description="Im2Latex Training Program")
    # parser.add_argument('--path', required=True, help='root of the model')

    # model args
    parser.add_argument("--emb_dim",
                        type=int,
                        default=80,
                        help="Embedding size")
    parser.add_argument("--dec_rnn_h",
                        type=int,
                        default=512,
                        help="The hidden state of the decoder RNN")
    parser.add_argument("--data_path",
                        type=str,
                        default="./data/",
                        help="The dataset's dir")
    parser.add_argument("--add_position_features",
                        action='store_true',
                        default=False,
                        help="Use position embeddings or not")
    # training args
    parser.add_argument("--max_len",
                        type=int,
                        default=150,
                        help="Max size of formula")
    parser.add_argument("--dropout",
                        type=float,
                        default=0.4,
                        help="Dropout probility")
    parser.add_argument("--cuda",
                        action='store_true',
                        default=True,
                        help="Use cuda or not")
    parser.add_argument("--batch_size", type=int, default=16)  # 指定batch_size
    parser.add_argument("--epoches", type=int, default=15)
    parser.add_argument("--lr", type=float, default=3e-4, help="Learning Rate")
    parser.add_argument("--min_lr",
                        type=float,
                        default=3e-5,
                        help="Learning Rate")
    parser.add_argument("--sample_method",
                        type=str,
                        default="teacher_forcing",
                        choices=('teacher_forcing', 'exp', 'inv_sigmoid'),
                        help="The method to schedule sampling")
    parser.add_argument(
        "--decay_k",
        type=float,
        default=1.,
        help="Base of Exponential decay for Schedule Sampling. "
        "When sample method is Exponential deca;"
        "Or a constant in Inverse sigmoid decay Equation. "
        "See details in https://arxiv.org/pdf/1506.03099.pdf")

    parser.add_argument("--lr_decay",
                        type=float,
                        default=0.5,
                        help="Learning Rate Decay Rate")
    parser.add_argument("--lr_patience",
                        type=int,
                        default=3,
                        help="Learning Rate Decay Patience")
    parser.add_argument("--clip",
                        type=float,
                        default=2.0,
                        help="The max gradient norm")
    parser.add_argument("--save_dir",
                        type=str,
                        default="./ckpts",
                        help="The dir to save checkpoints")
    parser.add_argument("--print_freq",
                        type=int,
                        default=100,
                        help="The frequency to print message")
    parser.add_argument("--seed",
                        type=int,
                        default=2020,
                        help="The random seed for reproducing ")
    parser.add_argument("--from_check_point",
                        action='store_true',
                        default=False,
                        help="Training from checkpoint or not")  # 是否finetune
    parser.add_argument("--exp", default="")  # 实验名称,ckpt的名称

    args = parser.parse_args()
    max_epoch = args.epoches
    from_check_point = args.from_check_point
    if from_check_point:
        checkpoint_path = get_checkpoint(args.save_dir)
        checkpoint = torch.load(checkpoint_path)
        args = checkpoint['args']
    print("Training args:", args)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Building vocab
    print("Load vocab...")
    vocab = load_vocab(args.data_path)

    use_cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # data loader
    print("Construct data loader...")
    train_loader = DataLoader(
        Im2LatexDataset(args.data_path, 'train', args.max_len),  # for quick tests, occasionally swap in the 'test' split
        # Im2LatexDataset(args.data_path, 'test', args.max_len),
        batch_size=args.batch_size,
        collate_fn=partial(collate_fn, vocab.sign2id),
        pin_memory=use_cuda,  # page-locked memory speeds up host-to-GPU transfer, at the cost of more host RAM
        num_workers=4)
    val_loader = DataLoader(Im2LatexDataset(args.data_path, 'validate',
                                            args.max_len),
                            batch_size=args.batch_size,
                            collate_fn=partial(collate_fn, vocab.sign2id),
                            pin_memory=use_cuda,
                            num_workers=4)

    # construct model
    print("Construct model")
    vocab_size = len(vocab)
    model = Im2LatexModel(vocab_size,
                          args.emb_dim,
                          args.dec_rnn_h,
                          add_pos_feat=args.add_position_features,
                          dropout=args.dropout)
    model = model.to(device)
    print("Model Settings:")
    print(model)

    # construct optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    lr_scheduler = ReduceLROnPlateau(optimizer,
                                     "min",
                                     factor=args.lr_decay,
                                     patience=args.lr_patience,
                                     verbose=True,
                                     min_lr=args.min_lr)

    if from_check_point:
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        lr_scheduler.load_state_dict(checkpoint['lr_sche'])
        # init trainer from checkpoint
        max_epoch = epoch + max_epoch  # fix: resume and run max_epoch additional epochs
        print('From %s To %s...' % (epoch, max_epoch))
        trainer = Trainer(optimizer,
                          model,
                          lr_scheduler,
                          train_loader,
                          val_loader,
                          args,
                          use_cuda=use_cuda,
                          init_epoch=epoch,
                          last_epoch=max_epoch)
    else:
        trainer = Trainer(optimizer,
                          model,
                          lr_scheduler,
                          train_loader,
                          val_loader,
                          args,
                          use_cuda=use_cuda,
                          init_epoch=1,
                          last_epoch=args.epoches,
                          exp=args.exp)
    # begin training
    trainer.train()
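`get_checkpoint` is not shown in the snippet. A plausible implementation that returns the newest checkpoint in `save_dir`; the `*.pt` naming is an assumption:

import glob
import os

def get_checkpoint(save_dir):
    # Hypothetical helper: pick the newest checkpoint by modification time.
    ckpts = glob.glob(os.path.join(save_dir, "*.pt"))
    if not ckpts:
        raise FileNotFoundError("no checkpoint found in %s" % save_dir)
    return max(ckpts, key=os.path.getmtime)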
Example #4
def train(hps, device, batch_size, test_batch_size, epochs, learning_rate,
          num_gpus, hosts, backend, current_host, model_dir, output_dir, seed,
          log_interval, beta1, nz, nc, ngf, ndf, dataloader):

    trainer = Trainer(nz,
                      nc,
                      ngf,
                      ndf,
                      weights_init,
                      device=device,
                      num_gpus=num_gpus)
    trainer.fixed_noise = torch.randn(batch_size, nz, 1, 1, device=device)

    # setup optimizer
    trainer.optimizerD = optim.Adam(trainer.netD.parameters(),
                                    lr=learning_rate,
                                    betas=(beta1, 0.999))
    trainer.optimizerG = optim.Adam(trainer.netG.parameters(),
                                    lr=learning_rate,
                                    betas=(beta1, 0.999))

    for epoch in range(epochs):
        trainer.train(epoch=epoch,
                      epochs=epochs,
                      log_batch=log_batch,
                      sample_batch=sample_batch,
                      dataloader=dataloader,
                      log_interval=log_interval,
                      output_dir=output_dir)

        # do checkpointing
        checkpoint_epoch(trainer, epoch, output_dir)

    trainer.save_model(model_dir)

    return

    # NOTE: everything below this early return is unreachable dead code; it
    # references names (device_name, train_loader, test_loader, use_cuda,
    # Net, test, save_model) that are never defined in this function.
    is_distributed = len(hosts) > 1 and backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = hosts.index(current_host)
        os.environ['RANK'] = str(host_rank)
        dist.init_process_group(backend=backend,
                                rank=host_rank,
                                world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '
            .format(backend, dist.get_world_size()) +
            'Current host rank is {}. Number of gpus: {}'.format(
                dist.get_rank(), num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(seed)
    if device_name == "cuda":
        torch.cuda.manual_seed(seed)

    logging.getLogger().setLevel(logging.DEBUG)

    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)))

    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(1, epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not device == "cuda":
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % log_interval == 0:
                logger.info(
                    'Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                        epoch, batch_idx * len(data),
                        len(train_loader.sampler),
                        100. * batch_idx / len(train_loader), loss.item()))
        test(model, test_loader, device)
    save_model(model_dir, model)
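`_average_gradients` is referenced in the multi-machine CPU branch but never defined here. The standard implementation all-reduces each gradient and divides by the world size; a sketch, assuming `torch.distributed` has already been initialized:

import torch.distributed as dist

def _average_gradients(model):
    # Sum every gradient across workers, then divide to get the mean.
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size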
Example #5
with open(mapping_file, 'r') as file_ptr:
    actions = file_ptr.read().split('\n')[:-1]
actions_dict = dict()
for a in actions:
    actions_dict[a.split()[1]] = int(a.split()[0])

num_classes = len(actions_dict)

trainer = Trainer(num_stages,
                  num_layers,
                  num_f_maps,
                  features_dim,
                  num_classes,
                  pooling_type=pooling_type,
                  dropout=dropout)
if args.action == "train":
    batch_gen = BatchGenerator(num_classes, actions_dict, gt_path,
                               features_path, sample_rate)
    batch_gen.read_data(vid_list_file)
    trainer.train(model_dir,
                  batch_gen,
                  num_epochs=num_epochs,
                  batch_size=bz,
                  learning_rate=lr,
                  device=device)

if args.action == "predict":
    trainer.predict(model_dir, results_dir, features_path, vid_list_file_tst,
                    num_epochs, actions_dict, device, sample_rate)
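The parsing loop at the top assumes a mapping file with one "<class id> <action name>" pair per line. An equivalent, more compact version of that loop; the file name and contents are illustrative:

# mapping.txt is assumed to look like:
#   0 background
#   1 cut
#   2 pour
with open("mapping.txt") as f:
    actions_dict = {
        name: int(idx)
        for idx, name in (line.split() for line in f if line.strip())
    }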
Example #6
    valid_ground_truth_path = './data/valid/ground_truth.txt'

    test_input_path = "./data/test/test.json"
    test_ground_truth_path = "./data/test/ground_truth.txt"

    config = {
        "max_length": 512,
        "epochs": 6,
        "batch_size": 3,
        "learning_rate": 2e-5,
        "fp16": True,
        "fp16_opt_level": "O1",
        "max_grad_norm": 1.0,
        "warmup_steps": 0.1,
    }
    hyper_parameter = HyperParameters()
    hyper_parameter.__dict__ = config
    algorithm = "LFESM"

    trainer = Trainer(
        training_dataset,
        bert_pretrained_model,
        hyper_parameter,
        algorithm,
        valid_input_path,
        valid_ground_truth_path,
        test_input_path,
        test_ground_truth_path,
    )
    trainer.train(MODEL_DIR)
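The `hyper_parameter.__dict__ = config` assignment replaces the instance dictionary wholesale, turning every dict key into an attribute. A self-contained illustration of the pattern; note that `types.SimpleNamespace(**config)` is the more idiomatic equivalent:

class HyperParameters:
    pass

hp = HyperParameters()
hp.__dict__ = {"max_length": 512, "epochs": 6}
assert hp.max_length == 512 and hp.epochs == 6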
Example #7
    drop_last=True,  # drop the last batch that cannot be divided by batch_size
    pin_memory=True)
print('BUILDING MODEL')
criterion = nn.MSELoss()
encoder = resnet_encoder(7, kernel_size=31).cuda()
decoder = dresnet_decoder(7, kernel_size=31).cuda()
model = FullyConnectedConv1d_SpeechEnhancement(encoder=encoder,
                                               decoder=decoder).cuda()
resynthesizer = data.AudioResynthesizer(
    model=model,
    data_folder_path=serialized_testing_data_folder,
    saving_folder=saving_folder,
    transform=quick_transforms)
# model = WaveNet(layers=3,in_channels=1,output_length=32,kernel_size=3,bias=False,residual_channels=16).cuda()
# optimizer = torch.optim.Adam(model.parameters(),weight_decay=10.e-5)
optimizer = torch.optim.SGD(model.parameters(),
                            lr=0.1,
                            momentum=0.9,
                            weight_decay=10.e-5)
trainer = Trainer(model,
                  training_loader,
                  optimizer,
                  criterion,
                  test_loader=testing_loader,
                  verbose=True,
                  saving_folder=saving_folder,
                  resynthesizer=resynthesizer,
                  device_ids=[0, 1],
                  checkpoint=True)
trainer.train(70, drop_learning_rate=[10, 40, 50])
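`drop_learning_rate=[10, 40, 50]` presumably tells this project's Trainer to step the learning rate down at those epochs. The same schedule expressed with PyTorch's built-in MultiStepLR; the 0.1 decay factor is an assumption:

import torch

params = [torch.nn.Parameter(torch.zeros(1))]  # stand-in parameters
opt = torch.optim.SGD(params, lr=0.1, momentum=0.9, weight_decay=1e-4)
sched = torch.optim.lr_scheduler.MultiStepLR(opt, milestones=[10, 40, 50], gamma=0.1)
for epoch in range(70):
    opt.step()   # the real training step would go here
    sched.step()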
Example #8
            'dir:' + str(dir) + '\n' +  \
            'batch:' + str(batch) + '\n' +  \
            'C-SGEN layers:' + str(C_SGEN_layers) + '\n' +  \
            'epochs:' + str(iteration)

    print(setting, '\n')

    model = C_SGEN().to(torch.device('cuda'))

    trainer = Trainer(model.train(), C_SGEN_layers)
    tester = T(model.eval(), C_SGEN_layers)

    Best_MSE = 100

    for epoch in range(1, (iteration + 1)):
        train_loss = trainer.train(train_loader)
        test_loss, RMSE_test, predicted_test, true_test = tester.test(
            test_loader)
        print('Epoch:', epoch, 'MSE:', test_loss)

        if test_loss < Best_MSE:
            Best_MSE = test_loss
            Best_epoch = epoch
            T_val, P_val = np.array(true_test), np.array(predicted_test)
            pear = pearson(T_val, P_val)
            spear = spearman(T_val, P_val)
            ci_value = ci(T_val, P_val)
            print('MSE improved to', Best_MSE, 'Pearson:', pear, 'Spearman:',
                  spear, 'CI:', ci_value, '\n')
            plots(T_val,
                  P_val,
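`pearson`, `spearman`, and `ci` are project helpers the snippet never defines. Sketches matching their standard definitions; the naive O(n²) concordance index is an assumption about what `ci` computes:

import numpy as np
from scipy import stats

def pearson(y, p):
    return stats.pearsonr(y, p)[0]

def spearman(y, p):
    return stats.spearmanr(y, p)[0]

def ci(y, p):
    # Concordance index: fraction of correctly ordered prediction pairs
    # among all pairs with distinct true values; ties count as 0.5.
    pairs = [(i, j) for i in range(len(y)) for j in range(i) if y[i] != y[j]]
    if not pairs:
        return 0.0
    score = sum(
        1.0 if (p[i] - p[j]) * (y[i] - y[j]) > 0 else (0.5 if p[i] == p[j] else 0.0)
        for i, j in pairs
    )
    return score / len(pairs)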
Example #9
def main(args, device, model_load_dir, model_save_dir, results_save_dir):

    if args.action == 'train' and args.extract_save_pseudo_labels == 0:
        # load train dataset and test dataset
        print(f'Load train data: {args.train_data}')
        train_loader = DataLoader(args, args.train_data, 'train')
        print(f'Load test data: {args.test_data}')
        test_loader = DataLoader(args, args.test_data, 'test')

        print(f'Start training.')
        trainer = Trainer(
                    args.num_stages,
                    args.num_layers,
                    args.num_f_maps,
                    args.features_dim,
                    train_loader.num_classes,
                    device,
                    train_loader.weights,
                    model_save_dir
                    )

        eval_args = [
            args,
            model_save_dir,
            results_save_dir,
            test_loader.features_dict,
            test_loader.gt_dict,
            test_loader.eval_gt_dict,
            test_loader.vid_list,
            args.num_epochs,
            device,
            'eval',
            args.classification_threshold,
        ]

        batch_gen = BatchGenerator(
            train_loader.num_classes,
            train_loader.gt_dict,
            train_loader.features_dict,
            train_loader.eval_gt_dict
            )

        batch_gen.read_data(train_loader.vid_list)
        trainer.train(
            model_save_dir,
            batch_gen,
            args.num_epochs,
            args.bz,
            args.lr,
            device,
            eval_args,
            pretrained=model_load_dir)

    elif args.extract_save_pseudo_labels and args.pseudo_label_type != 'PL':
        # extract/ generate pseudo labels and save in "data/pseudo_labels"
        print(f'Load test data: {args.test_data}')
        test_loader = DataLoader(args, args.test_data, args.extract_set, results_dir=results_save_dir)
        print(f'Extract {args.pseudo_label_type}')
        
        if args.pseudo_label_type == 'local':
            get_save_local_fusion(args, test_loader.features_dict, test_loader.gt_dict)
        elif args.pseudo_label_type == 'merge':
            merge_PL_CP(args, test_loader.features_dict, test_loader.gt_dict)
        elif args.pseudo_label_type == 'CMPL':
            CMPL(args, test_loader.features_dict, test_loader.gt_dict)
        elif args.pseudo_label_type == 'CP':
            extract_CP(args, test_loader.features_dict)
        
        print('Self labelling process finished')


    else:
        print(f'Load test data: {args.test_data}')
        test_loader = DataLoader(args, args.test_data, args.extract_set, results_dir=results_save_dir)

        if args.extract_save_pseudo_labels and args.pseudo_label_type == 'PL':
            print(f'Extract {args.pseudo_label_type}')
            extract_save_PL = 1
        else:
            print(f'Start inference.')
            extract_save_PL = 0

        trainer = Trainer(
            args.num_stages,
            args.num_layers,
            args.num_f_maps,
            args.features_dim,
            test_loader.num_classes,
            device,
            test_loader.weights,
            results_save_dir)

        trainer.predict(
            args,
            model_load_dir,
            results_save_dir,
            test_loader.features_dict,
            test_loader.gt_dict,
            test_loader.eval_gt_dict,
            test_loader.vid_list,
            args.num_epochs,
            device,
            'test',
            args.classification_threshold,
            uniform=args.uniform,
            save_pslabels=extract_save_PL,
            CP_dict=test_loader.CP_dict,
            )
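This `main` consumes a large set of `args.*` attributes that the snippet never defines. A hypothetical argparse block covering them, with flag names taken from the call sites and every default purely illustrative:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["train", "predict"], default="train")
parser.add_argument("--extract_save_pseudo_labels", type=int, default=0)
parser.add_argument("--pseudo_label_type",
                    choices=["PL", "local", "merge", "CMPL", "CP"], default="PL")
parser.add_argument("--train_data", type=str, default="train")
parser.add_argument("--test_data", type=str, default="test")
parser.add_argument("--extract_set", type=str, default="train")
parser.add_argument("--num_stages", type=int, default=4)
parser.add_argument("--num_layers", type=int, default=10)
parser.add_argument("--num_f_maps", type=int, default=64)
parser.add_argument("--features_dim", type=int, default=2048)
parser.add_argument("--num_epochs", type=int, default=50)
parser.add_argument("--bz", type=int, default=8)
parser.add_argument("--lr", type=float, default=5e-4)
parser.add_argument("--classification_threshold", type=float, default=0.5)
parser.add_argument("--uniform", type=int, default=0)
args = parser.parse_args()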
Example #10
save_name = os.path.join(save_dir, save_name)

encoder = Denses(encoder_list)
encoder_list.reverse()
decoder = Denses(encoder_list)
encoder_list.reverse()
trainer = Trainer(encoder, decoder).to(device)
optimizer = optim.Adam(trainer.parameters())
loss_fn = MSE()

loss_log = []
all_len = len(train_data) // batch_size + 1

for e in range(epochs):
    loss_ = []
    trainer.train()
    for i, x in enumerate(yield_data_time(train_data, batch_size, 1, True)):
        x = x.reshape(x.shape[1], -1)
        x = torch.FloatTensor(x).to(device)
        out = trainer(x, None)
        loss = loss_fn(x, out)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_.append(loss.cpu().data.numpy())
        if i % 10 == 0:
            print('e:{}/{}  {}/{}  loss{}'.format(e, epochs, i, all_len,
                                                  loss_[-1]))

    trainer.eval()
    loss_eval_ = []
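`yield_data_time` is an external helper; from the call signature and the reshape that follows, it plausibly yields arrays shaped (time_steps, batch_size, features). A hypothetical sketch under that assumption:

import numpy as np

def yield_data_time(data, batch_size, time_steps, shuffle=False):
    # Slide a window of `time_steps` over `data` and yield batches
    # shaped (time_steps, batch_size, features).
    starts = np.arange(len(data) - time_steps + 1)
    if shuffle:
        np.random.shuffle(starts)
    for i in range(0, len(starts), batch_size):
        batch = starts[i:i + batch_size]
        yield np.stack([data[s:s + time_steps] for s in batch], axis=1)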
Example #11
def main(args):

    train_dir = args.train_dir
    train_csv = args.train_csv
    test_dir = args.test_dir
    test_csv = args.test_csv

    ratio = args.train_valid_ratio
    batch_size = args.batch_size
    epochs = args.epochs

    train_flag = args.train
    pretrain_weight = args.pretrain_weight
    verbose = args.verbose

    if (train_flag == 0):
        if (verbose == 2):
            print("Reading Training Data...")

        train_csv = pd.read_csv(train_csv)
        train_csv, valid_csv = train_valid_split(train_csv, ratio)

        train = RetinopathyDataset(train_csv, train_dir)
        valid = RetinopathyDataset(valid_csv, train_dir)

        if (verbose == 2):
            print("Creating DataLoader...")

        train_dataloader = DataLoader(train,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=4)
        valid_dataloader = DataLoader(valid,
                                      batch_size=batch_size * 4,
                                      shuffle=False,
                                      num_workers=4)

        if (verbose == 2):
            print("Creating EfficientNet Model...")

        model = EfficientNetFinetune(
            level="efficientnet-b5",
            finetune=False,
            pretrain_weight="./weights/pretrained/aptos2018.pth")

        trainer = Trainer(model,
                          train_dataloader,
                          valid_dataloader,
                          epochs,
                          early_stop="QK",
                          verbose=verbose)

        if (verbose == 2):
            print("Strat Training...")
        trainer.train()

    if (train_flag == 1):
        if (verbose == 2):
            print("Strat Predicting...")

        test_csv = pd.read_csv(test_csv)
        test = RetinopathyDataset(test_csv, test_dir, test=True)
        test_dataloader = DataLoader(test,
                                     batch_size=batch_size * 4,
                                     shuffle=False,
                                     num_workers=4)
        model = EfficientNetFinetune(level="efficientnet-b5",
                                     finetune=False,
                                     test=True,
                                     pretrain_weight=pretrain_weight)
        tester(model, test_dataloader, verbose)
def main():

    # get args
    parser = argparse.ArgumentParser(description="Im2Latex Training Program")
    # parser.add_argument('--path', required=True, help='root of the model')

    # model args
    parser.add_argument("--emb_dim",
                        type=int,
                        default=80,
                        help="Embedding size")
    parser.add_argument("--dec_rnn_h",
                        type=int,
                        default=512,
                        help="The hidden state of the decoder RNN")
    parser.add_argument("--data_path",
                        type=str,
                        default="/root/private/im2latex/data/",
                        help="The dataset's dir")
    parser.add_argument("--add_position_features",
                        action='store_true',
                        default=False,
                        help="Use position embeddings or not")
    # training args
    parser.add_argument("--max_len",
                        type=int,
                        default=150,
                        help="Max size of formula")
    parser.add_argument("--dropout",
                        type=float,
                        default=0.,
                        help="Dropout probility")
    parser.add_argument("--cuda",
                        action='store_true',
                        default=True,
                        help="Use cuda or not")
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--epoches", type=int, default=200)
    parser.add_argument("--lr", type=float, default=3e-4, help="Learning Rate")
    parser.add_argument("--min_lr",
                        type=float,
                        default=3e-5,
                        help="Learning Rate")
    parser.add_argument("--sample_method",
                        type=str,
                        default="teacher_forcing",
                        choices=('teacher_forcing', 'exp', 'inv_sigmoid'),
                        help="The method to schedule sampling")
    parser.add_argument("--decay_k", type=float, default=1.)

    parser.add_argument("--lr_decay",
                        type=float,
                        default=0.5,
                        help="Learning Rate Decay Rate")
    parser.add_argument("--lr_patience",
                        type=int,
                        default=3,
                        help="Learning Rate Decay Patience")
    parser.add_argument("--clip",
                        type=float,
                        default=2.0,
                        help="The max gradient norm")
    parser.add_argument("--save_dir",
                        type=str,
                        default="./ckpts",
                        help="The dir to save checkpoints")
    parser.add_argument("--print_freq",
                        type=int,
                        default=100,
                        help="The frequency to print message")
    parser.add_argument("--seed",
                        type=int,
                        default=2020,
                        help="The random seed for reproducing ")
    parser.add_argument("--from_check_point",
                        action='store_true',
                        default=False,
                        help="Training from checkpoint or not")
    parser.add_argument("--batch_size_per_gpu", type=int, default=16)
    parser.add_argument("--gpu_num", type=int, default=4)
    device_ids = [0, 1, 2, 3]

    args = parser.parse_args()
    max_epoch = args.epoches
    from_check_point = args.from_check_point
    if from_check_point:
        checkpoint_path = get_checkpoint(args.save_dir)
        checkpoint = torch.load(checkpoint_path)
        args = checkpoint['args']
    print("Training args:", args)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # Building vocab
    print("Load vocab...")
    vocab = load_vocab(args.data_path)

    use_cuda = args.cuda and torch.cuda.is_available()
    print(use_cuda)
    device = torch.device("cuda" if use_cuda else "cpu")

    # data loader
    print("Construct data loader...")
    # train_loader = DataLoader(
    #     Im2LatexDataset(args.data_path, 'train', args.max_len),
    #     batch_size=args.batch_size,
    #     collate_fn=partial(collate_fn, vocab.token2idx),
    #     pin_memory=True if use_cuda else False,
    #     num_workers=4)
    train_loader = DataLoader(
        Im2LatexDataset(args.data_path, 'train', args.max_len),
        batch_size=args.batch_size_per_gpu * args.gpu_num,
        collate_fn=partial(collate_fn, vocab.token2idx),
        pin_memory=True if use_cuda else False,
        num_workers=2)
    # val_loader = DataLoader(
    #     Im2LatexDataset(args.data_path, 'validate', args.max_len),
    #     batch_size=args.batch_size,
    #     collate_fn=partial(collate_fn, vocab.token2idx),
    #     pin_memory=True if use_cuda else False,
    #     num_workers=4)
    val_loader = DataLoader(Im2LatexDataset(args.data_path, 'validate',
                                            args.max_len),
                            batch_size=args.batch_size_per_gpu * args.gpu_num,
                            collate_fn=partial(collate_fn, vocab.token2idx),
                            pin_memory=True if use_cuda else False,
                            num_workers=2)

    # construct model
    print("Construct model")
    vocab_size = len(vocab)
    model = Im2LatexModel(vocab_size,
                          args.emb_dim,
                          args.dec_rnn_h,
                          add_pos_feat=args.add_position_features,
                          dropout=args.dropout)
    model = nn.DataParallel(model, device_ids=device_ids)
    model = model.cuda()
    print("Model Settings:")
    print(model)

    # construct optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    lr_scheduler = ReduceLROnPlateau(optimizer,
                                     "min",
                                     factor=args.lr_decay,
                                     patience=args.lr_patience,
                                     verbose=True,
                                     min_lr=args.min_lr)

    if from_check_point:
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
        lr_scheduler.load_state_dict(checkpoint['lr_sche'])
        # init trainer from checkpoint
        trainer = Trainer(optimizer,
                          model,
                          lr_scheduler,
                          train_loader,
                          val_loader,
                          args,
                          use_cuda=use_cuda,
                          init_epoch=epoch,
                          last_epoch=max_epoch)
    else:
        trainer = Trainer(optimizer,
                          model,
                          lr_scheduler,
                          train_loader,
                          val_loader,
                          args,
                          use_cuda=use_cuda,
                          init_epoch=1,
                          last_epoch=args.epoches)
    # begin training
    trainer.train()
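One caveat with the `nn.DataParallel` wrapper used above: it prefixes every state_dict key with "module.", so a checkpoint saved from the wrapped model will not load into a bare `Im2LatexModel`. A minimal demonstration of the gotcha and the usual fix of saving `.module` instead:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)             # stand-in for Im2LatexModel
wrapped = nn.DataParallel(net)
assert all(k.startswith("module.") for k in wrapped.state_dict())

torch.save(wrapped.module.state_dict(), "ckpt.pt")  # unprefixed keys
net.load_state_dict(torch.load("ckpt.pt"))          # loads cleanly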