Example #1
0
def main(opt):
    """Train an RNEEP model on partial discrete flashing ratchet trajectories.

    Simulates a train/test pair of trajectories, runs the optimization loop
    with periodic validation and checkpointing, then writes CSV logs and the
    hyper-parameters as JSON under ``opt.save``.
    """
    banner = "=" * 80
    print(banner)
    print("Generating partial discrete flashing ratchet trajectories")
    raw = simulation(2, opt.n_step, opt.potential, seed=0)
    # Collapse the raw states modulo 3 (partially observed state space).
    train_states = raw[0] % 3
    test_states = raw[1] % 3
    print("Done")
    print(banner)

    use_cuda = torch.cuda.is_available() and not opt.no_cuda
    torch.manual_seed(opt.seed)
    opt.device = torch.device("cuda" if use_cuda else "cpu")

    model = RNEEP(opt).to(opt.device)
    optim = torch.optim.Adam(model.parameters(), opt.lr, weight_decay=opt.wd)

    # Single long trajectory per split, shaped (1, n_step).
    trajs_t = torch.from_numpy(train_states).to(opt.device).long().view(1, -1)
    test_trajs_t = torch.from_numpy(test_states).to(opt.device).long().view(1, -1)

    train_sampler = CartesianSeqSampler(1, opt.n_step, opt.seq_len,
                                        opt.batch_size)
    test_sampler = CartesianSeqSampler(1, opt.n_step, opt.seq_len,
                                       opt.test_batch_size, train=False)

    train_records, test_records = [], []

    if not os.path.exists(opt.save):
        os.makedirs(opt.save)

    for step in tqdm(range(1, opt.n_iter + 1)):
        record_now = step == 1 or step % opt.record_freq == 0
        if record_now:
            preds, train_loss = validate(opt, model, trajs_t, train_sampler)
            train_log = logging_rneep(step, train_loss, opt.seq_len, preds)

            preds, test_loss = validate(opt, model, test_trajs_t, test_sampler)
            test_log = logging_rneep(step, test_loss, opt.seq_len, preds,
                                     train=False)

            if step == 1:
                # First measurement seeds the running best; no checkpoint yet.
                best_loss = test_loss
                best_pred_rate = test_log["pred_rate"]
            else:
                is_best = test_loss < best_loss
                if is_best:
                    best_loss = test_loss
                    best_pred_rate = test_log["pred_rate"]
                state = {
                    "iteration": step,
                    "state_dict": model.state_dict(),
                    "best_loss": best_loss,
                    "best_pred_rate": best_pred_rate,
                    "optimizer": optim.state_dict(),
                }
                save_checkpoint(state, is_best, opt.save)

            test_log["best_loss"] = best_loss
            test_log["best_pred_rate"] = best_pred_rate
            train_records.append(train_log)
            test_records.append(test_log)
            # validate() presumably switches the sampler out of training
            # mode — restore it before resuming optimization.
            train_sampler.train()

        train(opt, model, optim, trajs_t, train_sampler)

    pd.DataFrame(train_records).to_csv(os.path.join(opt.save, "train_log.csv"),
                                       index=False)
    pd.DataFrame(test_records).to_csv(os.path.join(opt.save, "test_log.csv"),
                                      index=False)
    # torch.device is not JSON-serializable; store it back as a plain string.
    opt.device = "cuda" if use_cuda else "cpu"
    with open(os.path.join(opt.save, "hparams.json"), "w") as f:
        f.write(json.dumps(vars(opt)))
Example #2
0
def train(opt):
    """Full training loop for a captioning model.

    Builds the dataloader, model, loss wrapper and optimizer (optionally
    resuming all of them from ``opt.start_from``), then trains until
    ``opt.max_epochs``, logging to tensorboard and checkpointing whenever the
    validation score improves. On RuntimeError/KeyboardInterrupt a final
    checkpoint is saved before the traceback is printed.
    """

    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    infos = {
        'iter': 0,
        'epoch': 0,
        'loader_state_dict': None,
        'vocab': loader.get_vocab(),
    }
    # Load old infos(if there is) and check if models are compatible
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl')):
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert getattr(saved_model_opt, checkme) == getattr(opt, checkme), "Command line argument and saved model disagree on '%s' " % checkme
    infos['opt'] = opt

    #########################
    # Build logger
    #########################
    # naive dict logger
    histories = defaultdict(dict)
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')):
        with open(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl'), 'rb') as f:
            histories.update(utils.pickle_load(f))

    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    # Drop the vocab from opt again so it is not pickled into infos/hparams.
    del opt.vocab
    # Load pretrained weights:
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, 'model.pth')):
        model.load_state_dict(torch.load(os.path.join(opt.start_from, 'model.pth')))

    # Wrap generation model with loss function(used for training)
    # This allows loss function computed separately on each machine
    lw_model = LossWrapper(model, opt)
    # Wrap with dataparallel
    dp_model = torch.nn.DataParallel(model)
    dp_lw_model = torch.nn.DataParallel(lw_model)

    ##########################
    #  Build optimizer
    ##########################
    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup)
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from,"optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    # For back compatibility
    if 'iterators' in infos:
        infos['loader_state_dict'] = {split: {'index_list': infos['split_ix'][split], 'iter_counter': infos['iterators'][split]} for split in ['train', 'val', 'test']}
    loader.load_state_dict(infos['loader_state_dict'])
    # BUG FIX: best_val_score must exist even when opt.load_best_score != 1,
    # because it is read unconditionally when deciding whether a checkpoint
    # is the new best (previously a NameError in that configuration).
    best_val_score = None
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    if opt.noamopt:
        optimizer._step = iteration
    # flag indicating finish of an epoch
    # Always set to True at the beginning to initialize the lr or etc.
    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()

    # Start training
    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate ** frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac, opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob

                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False

                # If start structure loss training
                if opt.structure_after != -1 and epoch >= opt.structure_after:
                    struc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    struc_flag = False

                epoch_done = False

            start = time.time()
            # Load data from train split (0)
            data = loader.get_batch('train')
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()

            tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            optimizer.zero_grad()
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks, att_masks, data['gts'], torch.arange(0, len(data['gts'])), sc_flag, struc_flag)

            loss = model_out['loss'].mean()

            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()
            if struc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, model_out['lm_loss'].mean().item(), model_out['struc_loss'].mean().item(), end - start))
            elif not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, model_out['reward'].mean(), end - start))

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                tb_summary_writer.add_scalar('train_loss', train_loss, iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr, iteration)
                tb_summary_writer.add_scalar('scheduled_sampling_prob', model.ss_prob, iteration)
                if sc_flag:
                    tb_summary_writer.add_scalar('avg_reward', model_out['reward'].mean(), iteration)
                elif struc_flag:
                    tb_summary_writer.add_scalar('lm_loss', model_out['lm_loss'].mean().item(), iteration)
                    tb_summary_writer.add_scalar('struc_loss', model_out['struc_loss'].mean().item(), iteration)
                    tb_summary_writer.add_scalar('reward', model_out['reward'].mean().item(), iteration)

                histories['loss_history'][iteration] = train_loss if not sc_flag else model_out['reward'].mean()
                histories['lr_history'][iteration] = opt.current_lr
                histories['ss_prob_history'][iteration] = model.ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['loader_state_dict'] = loader.state_dict()

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val',
                               'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    dp_model, lw_model.crit, loader, eval_kwargs)

                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                tb_summary_writer.add_scalar('validation loss', val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)
                histories['val_result_history'][iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = - val_loss

                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score

                utils.save_checkpoint(opt, model, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    utils.save_checkpoint(opt, model, infos, optimizer, append=str(iteration))

                if best_flag:
                    utils.save_checkpoint(opt, model, infos, optimizer, append='best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        # NOTE(review): histories is omitted here, unlike the periodic
        # checkpoint above — confirm save_checkpoint's default covers this.
        utils.save_checkpoint(opt, model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
def main():
    """Train PointNet classification, validating each epoch and keeping the
    checkpoint (model + optimizer state) with the best accuracy."""
    global opt
    best_prec1 = 0
    # Nonzero only when training resumes from a saved optimizer state.
    resume_epoch = 0

    # Data loaders; drop_last defaults to False.
    # NOTE(review): shuffle=True on the test loader is unusual — confirm intended.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=opt.batchSize,
        shuffle=True,
        num_workers=int(opt.workers),
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=opt.batchSize,
        shuffle=True,
        num_workers=int(opt.workers),
    )

    # Create the model.
    # For modelnet40, opt.num_points is set to 2048 and opt.num_classes to 40.
    model = pointnet.PointNetCls(num_points=opt.num_points, k=opt.num_classes)
    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    criterion = nn.CrossEntropyLoss()
    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = optim.SGD(
        model.parameters(),
        opt.lr,
        momentum=opt.momentum,
        weight_decay=opt.weight_decay,
    )

    # Optionally resume epoch counter, best score and optimizer state.
    if opt.optim_state_from != '':
        print('loading optim_state_from {0}'.format(opt.optim_state_from))
        resumed = torch.load(opt.optim_state_from)
        resume_epoch = resumed['epoch']
        best_prec1 = resumed['best_prec1']
        optimizer.load_state_dict(resumed['optim_state_best'])

    for epoch in range(resume_epoch, opt.max_epochs):
        # One training pass, then measure accuracy.
        train(train_loader, model, criterion, optimizer, epoch, opt)
        prec1 = validate(test_loader, model, criterion, epoch, opt)

        # Checkpoint only when validation accuracy improves.
        if best_prec1 < prec1:
            best_prec1 = prec1
            utils.save_checkpoint(
                model.state_dict(),
                '{0}/model_best.pth'.format(opt.checkpoint_folder))

            # Persist the optimizer state that produced the best model.
            state = {
                'epoch': epoch + 1,  # because epoch starts from 0
                'best_prec1': best_prec1,
                'optim_state_best': optimizer.state_dict(),
            }
            utils.save_checkpoint(
                state,
                '{0}/optim_state_best.pth'.format(opt.checkpoint_folder))

        print('best accuracy: ', best_prec1)
Example #4
0
def main(opt):
    """Train a NEEP model on simulated bead trajectories.

    Simulates train/test trajectory sets, runs the optimization loop with
    periodic validation and checkpointing, then writes CSV logs and the
    hyper-parameters as JSON under ``opt.save``.
    """
    trajs = simulation(opt.n_trj,
                       opt.n_step,
                       opt.n_bead,
                       opt.time_step,
                       seed=0)
    test_trajs = simulation(opt.n_trj,
                            opt.n_step,
                            opt.n_bead,
                            opt.time_step,
                            seed=3)

    # Normalization statistics over the trajectory and time axes.
    mean = trajs.mean(axis=(0, 1)).to(opt.device)
    std = trajs.std(axis=(0, 1)).to(opt.device)
    # BUG FIX: the original single-line form
    #   transform = lambda x: (x - mean) / std if opt.normalize else lambda x: x
    # parsed the conditional INSIDE the lambda body, so with
    # opt.normalize falsy transform(x) returned a lambda, not x.
    if opt.normalize:
        transform = lambda x: (x - mean) / std
    else:
        transform = lambda x: x

    opt.n_input = opt.n_bead
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)

    model = NEEP(opt)
    model = model.to(opt.device)
    optim = torch.optim.Adam(model.parameters(), opt.lr, weight_decay=opt.wd)

    train_sampler = CartesianSampler(opt.n_trj, opt.n_step, opt.batch_size)
    test_sampler = CartesianSampler(opt.n_trj,
                                    opt.n_step,
                                    opt.test_batch_size,
                                    train=False)

    # Reference entropy values for the test trajectories (see tot_entpy).
    ents = tot_entpy(test_trajs)

    ret_train = []
    ret_test = []

    if not os.path.exists(opt.save):
        os.makedirs(opt.save)

    for i in tqdm(range(1, opt.n_iter + 1)):
        if i % opt.record_freq == 0 or i == 1:
            # NOTE(review): the training-set validation uses test_sampler;
            # confirm train_sampler was not intended here.
            preds, train_loss = validate(opt, model, trajs, test_sampler,
                                         transform)
            train_log = logging(i, train_loss, opt.time_step, preds)

            preds, test_loss = validate(opt, model, test_trajs, test_sampler,
                                        transform)
            test_log = logging_r(i, test_loss, opt.time_step, ents, preds)
            if i == 1:
                # First measurement seeds the running best; no checkpoint yet.
                best_loss = test_loss
                best_pred_rate = test_log["pred_rate"]
            else:
                is_best = test_loss < best_loss
                if is_best:
                    best_loss = test_loss
                    best_pred_rate = test_log["pred_rate"]
                save_checkpoint(
                    {
                        "iteration": i,
                        "state_dict": model.state_dict(),
                        "best_loss": best_loss,
                        "best_pred_rate": best_pred_rate,
                        "optimizer": optim.state_dict(),
                    },
                    is_best,
                    opt.save,
                )
            test_log["best_loss"] = best_loss
            test_log["best_pred_rate"] = best_pred_rate
            ret_train.append(train_log)
            ret_test.append(test_log)
            # validate() presumably left the sampler in eval mode; restore it.
            train_sampler.train()

        train(opt, model, optim, trajs, train_sampler, transform)

    train_df = pd.DataFrame(ret_train)
    test_df = pd.DataFrame(ret_test)

    train_df.to_csv(os.path.join(opt.save, "train_log.csv"), index=False)
    test_df.to_csv(os.path.join(opt.save, "test_log.csv"), index=False)
    # BUG FIX: `use_cuda` was never defined in this function (NameError on the
    # original line). Stringifying the device keeps vars(opt) JSON-serializable
    # and preserves "cuda"/"cpu" (str(torch.device("cuda")) == "cuda").
    opt.device = str(opt.device)
    hparams = json.dumps(vars(opt))
    with open(os.path.join(opt.save, "hparams.json"), "w") as f:
        f.write(hparams)
def main():
    """Train PointNet part segmentation, keeping the best checkpoint by
    validation accuracy."""
    global opt
    best_prec1 = 0
    # Only used when we resume training from some checkpoint model.
    resume_epoch = 0

    # Data loaders; drop_last defaults to False.
    # NOTE(review): shuffle=True on the validation loader is unusual —
    # confirm intended.
    loader_kwargs = dict(batch_size=opt.batch_size,
                         shuffle=True,
                         num_workers=int(opt.workers))
    train_loader = torch.utils.data.DataLoader(train_dataset, **loader_kwargs)
    val_loader = torch.utils.data.DataLoader(val_dataset, **loader_kwargs)

    # Model hyper-parameters derived from the training dataset.
    # For modelnet40, opt.num_points is 2048 and opt.num_classes is 40.
    opt.num_seg_classes = train_dataset.num_seg_classes
    opt.num_points = train_dataset.num_points
    opt.num_classes = train_dataset.num_classes

    model = pointnet.PointNetPartDenseCls(num_points=opt.num_points,
                                          k=opt.num_seg_classes)
    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # Per-point segmentation loss.
    criterion = nn.NLLLoss()

    if opt.cuda:
        print('shift model and criterion to GPU .. ')
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = optim.SGD(model.parameters(),
                          opt.lr,
                          momentum=opt.momentum,
                          weight_decay=opt.weight_decay)

    # Optionally resume epoch counter, best score and optimizer state.
    if opt.optim_state_from != '':
        print('loading optim_state_from {0}'.format(opt.optim_state_from))
        saved = torch.load(opt.optim_state_from)
        resume_epoch = saved['epoch']
        best_prec1 = saved['best_prec1']
        optimizer.load_state_dict(saved['optim_state_best'])

    for epoch in range(resume_epoch, opt.max_epochs):
        # One pass over the training data, then evaluate.
        train(train_loader, model, criterion, optimizer, epoch, opt)
        prec1 = validate(val_loader, model, criterion, epoch, opt)

        improved = best_prec1 < prec1
        if improved:
            best_prec1 = prec1
            ckpt_dir = opt.checkpoint_folder
            utils.save_checkpoint(model.state_dict(),
                                  '{0}/model_best.pth'.format(ckpt_dir))

            # Save the optimizer state that produced the best model.
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,  # because epoch starts from 0
                    'best_prec1': best_prec1,
                    'optim_state_best': optimizer.state_dict(),
                },
                '{0}/optim_state_best.pth'.format(ckpt_dir))

        print('best accuracy: ', best_prec1)
Example #6
0
        if (sum(recall[0]) + sum(recall[1]) > best_rec):
            best_rec = sum(recall[0]) + sum(recall[1])
            is_best = True

        state = {
            'epoch': epoch,
            'state_dict': join_emb.state_dict(),
            'best_rec': best_rec,
            'args_dict': args,
            'optimizer': optimizer.state_dict(),
        }

        log_epoch(logger, epoch, train_loss, val_loss,
                  optimizer.param_groups[0]['lr'], batch_train, batch_val,
                  data_train, data_val, recall)
        save_checkpoint(state, is_best, args.name, epoch)

        # Optimizing the text pipeline after one epoch
        if epoch == 1:
            for param in join_emb.cap_emb.parameters():
                param.requires_grad = True
            optimizer.add_param_group({
                'params': join_emb.cap_emb.parameters(),
                'lr': optimizer.param_groups[0]['lr'],
                'initial_lr': args.lr
            })
            lr_scheduler = MultiStepLR(optimizer,
                                       args.lrd[1:],
                                       gamma=args.lrd[0])

        # Starting the finetuning of the whole model
Example #7
0
def train(opt):

    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    infos = {
        'iter': 0,
        'epoch': 0,
        'loader_state_dict': None,
        'vocab': loader.get_vocab(),
    }
    # Load old infos(if there is) and check if models are compatible
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'),
                  'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert getattr(saved_model_opt, checkme) == getattr(
                    opt, checkme
                ), "Command line argument and saved model disagree on '%s' " % checkme
    infos['opt'] = opt

    #########################
    # Build logger
    #########################
    # naive dict logger
    histories = defaultdict(dict)
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'),
                  'rb') as f:
            histories.update(utils.pickle_load(f))

    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    multi_models_list = []
    for order in range(opt.number_of_models):
        multi_models_list.append(models.setup(opt).cuda())
    for order in range(opt.number_of_models):
        multi_models_list.append(models.setup(opt).cuda())
    for order in range(opt.number_of_models, 2 * opt.number_of_models):
        for param in multi_models_list[order].parameters():
            param.detach_()
    for order in range(opt.number_of_models):
        for param, param_ema in zip(
                multi_models_list[order].parameters(),
                multi_models_list[order + opt.number_of_models].parameters()):
            param_ema.data = param.data.clone()
    # multi_models = MultiModels(multi_models_list)
    # multi_models_list.append(SenEncodeModel(opt).cuda())
    multi_models = nn.ModuleList(multi_models_list)
    del opt.vocab
    # Load pretrained weights:
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'model.pth')):
        multi_models.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'model.pth')))

    # Wrap generation model with loss function(used for training)
    # This allows loss function computed separately on each machine
    lw_models = nn.ModuleList([
        LossWrapper(multi_models[index], opt)
        for index in range(opt.number_of_models)
    ])
    kdlw_models = nn.ModuleList([
        KDLossWrapper(multi_models[index], opt)
        for index in range(opt.number_of_models)
    ])
    lw_models_ema = nn.ModuleList([
        LossWrapper(multi_models[opt.number_of_models + index], opt)
        for index in range(opt.number_of_models)
    ])
    kdlw_models_ema = nn.ModuleList([
        KDLossWrapper(multi_models[opt.number_of_models + index], opt)
        for index in range(opt.number_of_models)
    ])
    # Wrap with dataparallel
    dp_models = nn.ModuleList([
        torch.nn.DataParallel(multi_models[index])
        for index in range(opt.number_of_models)
    ])
    dp_lw_models = nn.ModuleList([
        torch.nn.DataParallel(lw_models[index])
        for index in range(opt.number_of_models)
    ])
    dp_kdlw_models = nn.ModuleList([
        torch.nn.DataParallel(kdlw_models[index])
        for index in range(opt.number_of_models)
    ])
    dp_models_ema = nn.ModuleList([
        torch.nn.DataParallel(multi_models[opt.number_of_models + index])
        for index in range(opt.number_of_models)
    ])
    dp_lw_models_ema = nn.ModuleList([
        torch.nn.DataParallel(lw_models_ema[index])
        for index in range(opt.number_of_models)
    ])
    dp_kdlw_models_ema = nn.ModuleList([
        torch.nn.DataParallel(kdlw_models_ema[index])
        for index in range(opt.number_of_models)
    ])

    ##########################
    #  Build optimizer
    ##########################
    if opt.noamopt:
        assert opt.caption_model in [
            'transformer', 'bert', 'm2transformer'
        ], 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(multi_models,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(multi_models.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(multi_models.parameters(), opt)
    # Load the optimizer
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    ##########################
    #  Build loss
    ##########################
    # triplet_loss = nn.TripletMarginLoss()

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    # For back compatibility
    if 'iterators' in infos:
        infos['loader_state_dict'] = {
            split: {
                'index_list': infos['split_ix'][split],
                'iter_counter': infos['iterators'][split]
            }
            for split in [
                'paired_train', 'unpaired_images_train',
                'unpaired_captions_train', 'train', 'val', 'test'
            ]
        }
    loader.load_state_dict(infos['loader_state_dict'])
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    if opt.noamopt:
        optimizer._step = iteration
    # flag indicating finish of an epoch
    # Always set to True at the beginning to initialize the lr or etc.
    epoch_done = True
    # Assure in training mode
    dp_lw_models.train()
    dp_kdlw_models.train()
    dp_lw_models_ema.train()
    dp_kdlw_models_ema.train()

    # Build the ensemble model
    # # Setup the model
    model_ensemble = AttEnsemble(multi_models_list[opt.number_of_models:2 *
                                                   opt.number_of_models],
                                 weights=None)
    # model_ensemble.seq_length = 20
    model_ensemble.cuda()
    # model_ensemble.eval()
    kd_model_outs_list = []

    # Start training
    try:
        while True:
            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break

            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start
                                ) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer,
                                 opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start
                            ) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(
                        opt.scheduled_sampling_increase_prob * frac,
                        opt.scheduled_sampling_max_prob)
                    for index in range(opt.number_of_models):
                        multi_models[index].ss_prob = opt.ss_prob

                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False

                # If start structure loss training
                if opt.structure_after != -1 and epoch >= opt.structure_after:
                    struc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    struc_flag = False

                if epoch >= opt.paired_train_epoch:
                    opt.current_lambda_x = opt.hyper_parameter_lambda_x * \
                                         (epoch - (opt.paired_train_epoch - 1)) /\
                                         (opt.max_epochs - opt.paired_train_epoch)
                    opt.current_lambda_y = opt.hyper_parameter_lambda_y * \
                                           (epoch - (opt.paired_train_epoch - 1)) / \
                                           (opt.max_epochs - opt.paired_train_epoch)

                epoch_done = False

            start = time.time()
            # Load data from train split (0)
            if epoch < opt.language_pretrain_epoch:
                data = loader.get_batch('unpaired_captions_train')
            elif epoch < opt.paired_train_epoch:
                data = loader.get_batch('paired_train')
            else:
                data = loader.get_batch('paired_train')
                unpaired_data = loader.get_batch('unpaired_images_train')
                unpaired_caption = loader.get_batch('unpaired_captions_train')
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()
            if epoch < opt.language_pretrain_epoch:
                tmp = [
                    data['fc_feats'] * 0, data['att_feats'] * 0,
                    data['labels'], data['masks'], data['att_masks']
                ]
            elif epoch < opt.paired_train_epoch:
                tmp = [
                    data['fc_feats'], data['att_feats'], data['labels'],
                    data['masks'], data['att_masks']
                ]
            else:
                tmp = [
                    data['fc_feats'], data['att_feats'], data['labels'],
                    data['masks'], data['att_masks']
                ]
                unpaired_tmp = [
                    unpaired_data['fc_feats'], unpaired_data['att_feats'],
                    unpaired_data['labels'], unpaired_data['masks'],
                    unpaired_data['att_masks']
                ]
                unpaired_caption_tmp = [
                    unpaired_caption['fc_feats'] * 0,
                    unpaired_caption['att_feats'] * 0,
                    unpaired_caption['labels'], unpaired_caption['masks'],
                    unpaired_caption['att_masks']
                ]

            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            if epoch >= opt.paired_train_epoch:
                unpaired_tmp = [
                    _ if _ is None else _.cuda() for _ in unpaired_tmp
                ]
                unpaired_fc_feats, unpaired_att_feats, unpaired_labels, unpaired_masks, unpaired_att_masks = unpaired_tmp
                unpaired_caption_tmp = [
                    _ if _ is None else _.cuda() for _ in unpaired_caption_tmp
                ]
                unpaired_caption_fc_feats, unpaired_caption_att_feats, unpaired_caption_labels, unpaired_caption_masks, unpaired_caption_att_masks = unpaired_caption_tmp
                unpaired_caption_fc_feats = unpaired_caption_fc_feats.repeat(
                    5, 1)
                unpaired_caption_fc_feats = opt.std_pseudo_visual_feature * torch.randn_like(
                    unpaired_caption_fc_feats)
                unpaired_caption_att_feats = unpaired_caption_att_feats.repeat(
                    5, 1, 1)
                unpaired_caption_fc_feats.requires_grad = True
                unpaired_caption_att_feats.requires_grad = True
                unpaired_caption_labels = unpaired_caption_labels.reshape(
                    unpaired_caption_fc_feats.shape[0], -1)
                unpaired_caption_masks = unpaired_caption_masks.reshape(
                    unpaired_caption_fc_feats.shape[0], -1)

            optimizer.zero_grad()
            if epoch < opt.language_pretrain_epoch:
                language_loss = 0
                model_outs_list = []
                for index in range(opt.number_of_models):
                    model_out = dp_lw_models[index](
                        fc_feats, att_feats, labels, masks,
                        att_masks, data['gts'],
                        torch.arange(0, len(data['gts'])), sc_flag, struc_flag)
                    model_outs_list.append(model_out)
                    language_loss += model_out['loss'].mean()

                loss = language_loss
            elif epoch < opt.paired_train_epoch:
                language_loss = 0
                model_outs_list = []
                for index in range(opt.number_of_models):
                    model_out = dp_lw_models[index](
                        fc_feats, att_feats, labels, masks,
                        att_masks, data['gts'],
                        torch.arange(0, len(data['gts'])), sc_flag, struc_flag)
                    model_outs_list.append(model_out)
                    language_loss += model_out['loss'].mean()

                loss = language_loss
            else:
                language_loss = 0
                model_outs_list = []
                for index in range(opt.number_of_models):
                    model_out = dp_lw_models[index](
                        fc_feats, att_feats, labels, masks,
                        att_masks, data['gts'],
                        torch.arange(0, len(data['gts'])), sc_flag, struc_flag)
                    model_outs_list.append(model_out)
                    language_loss += model_out['loss'].mean()
                loss = language_loss

                # else:
                # for unpaired image sentences
                # # Setup the model
                # model_ensemble = AttEnsemble(multi_models_list[:opt.number_of_models], weights=None)
                # model_ensemble.seq_length = 16
                # model_ensemble.cuda()
                # model_ensemble.eval()

                model_ensemble.eval()
                eval_kwargs = dict()
                eval_kwargs.update(vars(opt))

                with torch.no_grad():
                    seq, seq_logprobs = model_ensemble(unpaired_fc_feats,
                                                       unpaired_att_feats,
                                                       unpaired_att_masks,
                                                       opt=eval_kwargs,
                                                       mode='sample')
                    # val_loss, predictions, lang_stats = eval_utils.eval_split(model_ensemble, lw_models[0].crit, loader,
                    #                                                           eval_kwargs)
                # print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in
                #                  model_ensemble.done_beams[0]]))
                # print('++' * 10)
                # for ii in range(10):
                #     sents = utils.decode_sequence(loader.get_vocab(), seq[ii].unsqueeze(0))
                #     gt_sent = utils.decode_sequence(loader.get_vocab(), labels[ii,0].unsqueeze(0))
                #     a=1

                model_ensemble.train()

                model_ensemble_sudo_labels = labels.new_zeros(
                    (opt.batch_size, opt.beam_size,
                     eval_kwargs['max_length'] + 2))
                model_ensemble_sudo_log_prob = masks.new_zeros(
                    (opt.batch_size,
                     opt.beam_size, eval_kwargs['max_length'] + 2,
                     len(loader.get_vocab()) + 1))
                model_ensemble_sum_log_prob = masks.new_zeros(
                    (opt.batch_size, opt.beam_size))

                for batch_index in range(opt.batch_size):
                    for beam_index in range(opt.beam_size):
                        # for beam_index in range(3):
                        pred = model_ensemble.done_beams[batch_index][
                            beam_index]['seq']
                        log_prob = model_ensemble.done_beams[batch_index][
                            beam_index]['logps']
                        model_ensemble_sudo_labels[batch_index, beam_index,
                                                   1:pred.shape[0] + 1] = pred
                        model_ensemble_sudo_log_prob[batch_index, beam_index,
                                                     1:pred.shape[0] +
                                                     1] = log_prob
                        model_ensemble_sum_log_prob[batch_index][
                            beam_index] = model_ensemble.done_beams[
                                batch_index][beam_index]['p']

                # model_ensemble_prob = F.softmax(model_ensemble_sum_log_prob)

                data_ensemble_sudo_gts = list()
                for data_ensemble_sudo_gts_index in range(
                        model_ensemble_sudo_labels.shape[0]):
                    data_ensemble_sudo_gts.append(model_ensemble_sudo_labels[
                        data_ensemble_sudo_gts_index, :,
                        1:-1].data.cpu().numpy())

                # generated_sentences = list()
                # for i in range(unpaired_fc_feats.shape[0]):
                #     generated_sentences.append(
                #         [utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in
                #          model_ensemble.done_beams[i]])
                #
                # pos_tag_results = list()
                # for i in range(unpaired_fc_feats.shape[0]):
                #     generated_sentences_i = generated_sentences[i]
                #     pos_tag_results_i = []
                #     for text in generated_sentences_i:
                #         text_tokenize = nltk.word_tokenize(text)
                #         pos_tag_results_i_jbeam = []
                #         for vob, vob_type in nltk.pos_tag(text_tokenize):
                #             if vob_type == 'NN' or vob_type == 'NNS':
                #                 pos_tag_results_i_jbeam.append(vob)
                #         pos_tag_results_i.append(pos_tag_results_i_jbeam)
                #     pos_tag_results.append(pos_tag_results_i)

                # for i in range(fc_feats.shape[0]):
                #     print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in
                #                      model_ensemble.done_beams[i]]))
                #     print('--' * 10)
                # dets = data['dets']
                #
                # promising_flag = labels.new_zeros(opt.batch_size, opt.beam_size)
                # for batch_index in range(opt.batch_size):
                #     dets_batch = dets[batch_index]
                #     for beam_index in range(opt.beam_size):
                #         indicator = [0] * len(dets_batch)
                #         pos_tag_batch_beam = pos_tag_results[batch_index][beam_index]
                #         for pos_tag_val in pos_tag_batch_beam:
                #             for ii in range(len(dets_batch)):
                #                 possible_list = vob_transform_list[dets_batch[ii]]
                #                 if pos_tag_val in possible_list:
                #                     indicator[ii] = 1
                #         if sum(indicator) == len(dets_batch) or sum(indicator) >= 2:
                #             promising_flag[batch_index, beam_index] = 1
                #
                # # model_ensemble_sudo_log_prob = model_ensemble_sudo_log_prob * promising_flag.unsqueeze(-1).unsqueeze(-1)
                # model_ensemble_sudo_labels = model_ensemble_sudo_labels * promising_flag.unsqueeze(-1)

                #sudo_masks_for_model = sudo_masks_for_model.detach()
                distilling_loss = 0
                # We use the random study machinism
                who_to_study = random.randint(0, opt.number_of_models - 1)

                # for index in range(opt.number_of_models):
                #     model_out = dp_kdlw_models[index](unpaired_fc_feats, unpaired_att_feats, model_ensemble_sudo_labels,
                #                                     model_ensemble_sudo_log_prob, att_masks, data_ensemble_sudo_gts,
                #                                     torch.arange(0, len(data_ensemble_sudo_gts)), sc_flag,
                #                                     struc_flag, model_ensemble_sum_log_prob)
                #     kd_model_outs_list.append(model_out)

                model_out = dp_kdlw_models[who_to_study](
                    unpaired_fc_feats, unpaired_att_feats,
                    model_ensemble_sudo_labels, model_ensemble_sudo_log_prob,
                    att_masks, data_ensemble_sudo_gts,
                    torch.arange(0, len(data_ensemble_sudo_gts)), sc_flag,
                    struc_flag, model_ensemble_sum_log_prob)
                # kd_model_outs_list.append(model_out)
                distilling_loss += model_out['loss'].mean()
                loss += opt.number_of_models * opt.current_lambda_x * distilling_loss

                ###################################################################
                # use unlabelled captions
                # simple_sgd = utils.gradient_descent(unpaired_caption_fc_feats, stepsize=1e3)
                simple_sgd = utils.gradient_descent_adagrad(
                    unpaired_caption_fc_feats, stepsize=1)
                gts_tmp = unpaired_caption['gts']
                new_gts = []
                for ii in range(len(data['gts'])):
                    for jj in range(gts_tmp[ii].shape[0]):
                        new_gts.append(gts_tmp[ii][jj])
                unpaired_caption['gts'] = new_gts
                for itr in range(opt.inner_iteration):
                    unlabelled_caption_model_out = dp_lw_models_ema[
                        itr % opt.number_of_models](
                            unpaired_caption_fc_feats,
                            unpaired_caption_att_feats,
                            unpaired_caption_labels, unpaired_caption_masks,
                            unpaired_caption_att_masks,
                            unpaired_caption['gts'],
                            torch.arange(0, len(unpaired_caption['gts'])),
                            sc_flag, struc_flag)
                    unlabelled_caption_loss = unlabelled_caption_model_out[
                        'loss'].mean()
                    unlabelled_caption_loss.backward()
                    # print(unlabelled_caption_loss)
                    simple_sgd.update(unpaired_caption_fc_feats)
                    # a=1

                unpaired_caption_fc_feats.requires_grad = False
                unpaired_caption_att_feats.requires_grad = False
                unlabelled_caption_model_out = dp_lw_models[who_to_study](
                    unpaired_caption_fc_feats, unpaired_caption_att_feats,
                    unpaired_caption_labels, unpaired_caption_masks,
                    unpaired_caption_att_masks, unpaired_caption['gts'],
                    torch.arange(0, len(unpaired_caption['gts'])), sc_flag,
                    struc_flag)
                unlabelled_caption_loss = unlabelled_caption_model_out[
                    'loss'].mean()
                loss += opt.number_of_models * opt.current_lambda_y * unlabelled_caption_loss

            loss.backward()
            if opt.grad_clip_value != 0:
                getattr(torch.nn.utils, 'clip_grad_%s_' %
                        (opt.grad_clip_mode))(multi_models.parameters(),
                                              opt.grad_clip_value)
            optimizer.step()

            for order in range(opt.number_of_models):
                for param, param_ema in zip(
                        multi_models_list[order].parameters(),
                        multi_models_list[order +
                                          opt.number_of_models].parameters()):
                    param_ema.data = opt.alpha * param_ema.data + (
                        1 - opt.alpha) * param.data

            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()
            # if struc_flag:
            #     print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \
            #         .format(iteration, epoch, train_loss, model_out['lm_loss'].mean().item(), model_out['struc_loss'].mean().item(), end - start))
            # elif not sc_flag:
            #     print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            #         .format(iteration, epoch, train_loss, end - start))
            # else:
            #     print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
            #         .format(iteration, epoch, model_out['reward'].mean(), end - start))
            if struc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss/opt.number_of_models, sum([model_outs_list[index]['lm_loss'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models,
                            sum([model_outs_list[index]['struc_loss'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models,
                            end - start))
            elif not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, language_loss.item()/opt.number_of_models, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, sum([model_outs_list[index]['reward'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models, end - start))

            # Update the iteration and epoch
            iteration += 1
            if epoch < opt.paired_train_epoch:
                if data['bounds']['wrapped']:
                    epoch += 1
                    epoch_done = True
            else:
                if data['bounds']['wrapped']:
                    epoch += 1
                    epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                # tb_summary_writer.add_scalar('train_loss', train_loss, iteration)
                for index in range(opt.number_of_models):
                    model_id = 'model_{}'.format(index)
                    tb_summary_writer.add_scalars('language_loss', {
                        model_id:
                        model_outs_list[index]['loss'].mean().item()
                    }, iteration)
                if epoch >= opt.paired_train_epoch:
                    # for index in range(opt.number_of_models):
                    #     model_id = 'model_{}'.format(index)
                    #     kd_model_outs_val = 0 if len(kd_model_outs_list) == 0 else kd_model_outs_list[index]['loss'].mean().item()
                    #     tb_summary_writer.add_scalars('distilling_loss',
                    #                                   {model_id: kd_model_outs_val},
                    #                                   iteration)
                    tb_summary_writer.add_scalar('distilling_loss',
                                                 distilling_loss.item(),
                                                 iteration)
                    tb_summary_writer.add_scalar(
                        'unlabelled_caption_loss',
                        unlabelled_caption_loss.item(), iteration)
                    tb_summary_writer.add_scalar('hyper_parameter_lambda_x',
                                                 opt.current_lambda_x,
                                                 iteration)
                    tb_summary_writer.add_scalar('hyper_parameter_lambda_y',
                                                 opt.current_lambda_y,
                                                 iteration)
                # tb_summary_writer.add_scalar('triplet_loss', triplet_loss_val.item(), iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr,
                                             iteration)
                tb_summary_writer.add_scalar('scheduled_sampling_prob',
                                             multi_models[0].ss_prob,
                                             iteration)
                if sc_flag:
                    for index in range(opt.number_of_models):
                        # tb_summary_writer.add_scalar('avg_reward', model_out['reward'].mean(), iteration)
                        model_id = 'model_{}'.format(index)
                        tb_summary_writer.add_scalars(
                            'avg_reward', {
                                model_id:
                                model_outs_list[index]['reward'].mean().item()
                            }, iteration)
                elif struc_flag:
                    # tb_summary_writer.add_scalar('lm_loss', model_out['lm_loss'].mean().item(), iteration)
                    # tb_summary_writer.add_scalar('struc_loss', model_out['struc_loss'].mean().item(), iteration)
                    # tb_summary_writer.add_scalar('reward', model_out['reward'].mean().item(), iteration)
                    # tb_summary_writer.add_scalar('reward_var', model_out['reward'].var(1).mean(), iteration)
                    model_id = 'model_{}'.format(index)
                    for index in range(opt.number_of_models):
                        tb_summary_writer.add_scalars(
                            'lm_loss', {
                                model_id:
                                model_outs_list[index]
                                ['lm_loss'].mean().item()
                            }, iteration)
                        tb_summary_writer.add_scalars(
                            'struc_loss', {
                                model_id:
                                model_outs_list[index]
                                ['struc_loss'].mean().item()
                            }, iteration)
                        tb_summary_writer.add_scalars(
                            'reward', {
                                model_id:
                                model_outs_list[index]['reward'].mean().item()
                            }, iteration)
                        tb_summary_writer.add_scalars(
                            'reward_var', {
                                model_id:
                                model_outs_list[index]['reward'].var(1).mean()
                            }, iteration)

                histories['loss_history'][
                    iteration] = train_loss if not sc_flag else sum([
                        model_outs_list[index]['reward'].mean().item()
                        for index in range(opt.number_of_models)
                    ]) / opt.number_of_models
                histories['lr_history'][iteration] = opt.current_lr
                histories['ss_prob_history'][iteration] = multi_models[
                    0].ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['loader_state_dict'] = loader.state_dict()

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0 and not opt.save_every_epoch and epoch >= opt.paired_train_epoch) or \
                (epoch_done and opt.save_every_epoch and epoch >= opt.paired_train_epoch):
                # load ensemble
                # Setup the model
                model = AttEnsemble(multi_models_list[opt.number_of_models:2 *
                                                      opt.number_of_models],
                                    weights=None)
                model.seq_length = opt.max_length
                model.cuda()
                model.eval()
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                # eval_kwargs['beam_size'] = 5
                # eval_kwargs['verbose_beam'] = 1
                # eval_kwargs['verbose_loss'] = 1
                # val_loss, predictions, lang_stats = eval_utils.eval_split(
                #     dp_model, lw_model.crit, loader, eval_kwargs)
                with torch.no_grad():
                    val_loss, predictions, lang_stats = eval_utils.eval_split(
                        model, lw_models[0].crit, loader, eval_kwargs)
                model.train()

                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                tb_summary_writer.add_scalar('validation loss', val_loss,
                                             iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)
                histories['val_result_history'][iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss

                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score

                utils.save_checkpoint(opt, multi_models, infos, optimizer,
                                      histories)
                if opt.save_history_ckpt:
                    utils.save_checkpoint(
                        opt,
                        multi_models,
                        infos,
                        optimizer,
                        append=str(epoch)
                        if opt.save_every_epoch else str(iteration))

                if best_flag:
                    utils.save_checkpoint(opt,
                                          multi_models,
                                          infos,
                                          optimizer,
                                          append='best')

            # if epoch_done and epoch == opt.paired_train_epoch:
            #     utils.save_checkpoint(opt, multi_models, infos, optimizer, histories)
            #     if opt.save_history_ckpt:
            #         utils.save_checkpoint(opt, multi_models, infos, optimizer,
            #                               append=str(epoch) if opt.save_every_epoch else str(iteration))
            #     cmd = 'cp -r ' + 'log_' + opt.id + ' ' + 'log_' + opt.id + '_backup'
            #     os.system(cmd)

    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        utils.save_checkpoint(opt, multi_models, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
def main(opt):
    """Train a sketch-based 3D-shape retrieval model with triplet-center loss.

    Builds a multi-view CNN pipeline for shapes (Net_Prev_Pool -> View_And_Pool
    -> Net_After_Pool), a whole-image CNN for sketches (Net_Whole), and a shared
    classifier head; trains them jointly with cross-entropy plus a triplet-center
    loss, and checkpoints the best model by retrieval top-1.

    Args:
        opt: parsed command-line namespace (backbone, data paths, lr, margin,
             loss weights w1/w2, resume path, etc.).
    """
    # Seed NumPy and PyTorch for reproducibility; cudnn.benchmark trades exact
    # determinism for speed on fixed-size inputs.
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    cudnn.benchmark = True
    # Checkpoint folder name encodes the backbone (and whether the sketch
    # branch is finetuned) so runs with different configs don't collide.
    opt.checkpoint_folder += '_'+opt.backbone
    if opt.sketch_finetune:
        opt.checkpoint_folder += '_finetune'
    if not os.path.exists(opt.checkpoint_folder):
        os.makedirs(opt.checkpoint_folder)

    print(opt)
    # Redirect print to both console and log file
    # if not opt.evaluate:
    #    sys.stdout = Logger(os.path.join(opt.logs_dir, opt.log_name))

    # Create data loaders
    # Default input resolution when none is given on the command line.
    if opt.height is None or opt.width is None:
        opt.height, opt.width = (224, 224)

    train_sketch_loader, train_shape_loader, test_sketch_loader, test_shape_loader  =  get_data(opt.train_shape_views_folder, 
                            opt.test_shape_views_folder, opt.train_shape_flist, opt.test_shape_flist, 
                            opt.train_sketch_folder, opt.test_sketch_folder, opt.train_sketch_flist, opt.test_sketch_flist, 
                            opt.height, opt.width, opt.batch_size, opt.workers, pk_flag=False)

    # Create model
    #if opt.pool_idx is None:
    #    opt.pool_idx = set_default_pool
    kwargs = {'pool_idx': opt.pool_idx} if opt.pool_idx is not None else {} 
    # NOTE(review): eval() resolves the backbone module by name from trusted
    # command-line config; fine here, but do not feed it untrusted input.
    backbone = eval('models.'+opt.backbone)
    net_bp = backbone.Net_Prev_Pool(**kwargs)
    net_vp = backbone.View_And_Pool()
    net_ap = backbone.Net_After_Pool(**kwargs)
    if opt.sketch_finetune:
        net_whole = backbone.Net_Whole(nclasses = 10, use_finetuned=True)
    else:
        net_whole = backbone.Net_Whole(nclasses = 10)
    # for alexnet or vgg, feat_dim = 4096
    # for resnet, feat_dim = 2048
    net_cls = backbone.Net_Classifier(nclasses = 10)
    # Criterion
    # criterion = nn.CrossEntropyLoss().cuda()
    # if opt.balance: # current no balancing
    #    crt_cls = nn.CrossEntropyLoss().cuda()
    # else:
    # classification loss 
    crt_cls = nn.CrossEntropyLoss().cuda()
    # triplet center loss 
    crt_tlc = custom_loss.TripletCenterLoss(margin=opt.margin).cuda()
    # Optionally apply weight normalization to the learned class centers.
    if opt.wn:
        crt_tlc = torch.nn.utils.weight_norm(crt_tlc, name='centers')
    # Bundle both losses and their mixing weights; consumed by train/validate.
    criterion = [crt_cls, crt_tlc, opt.w1, opt.w2]

    # Load from checkpoint
    start_epoch = best_top1 = 0
    if opt.resume:
        checkpoint = torch.load(opt.resume)
        # Restore every sub-network plus the center parameters of the
        # triplet-center loss, and resume epoch/best-accuracy bookkeeping.
        net_bp.load_state_dict(checkpoint['net_bp'])
        net_ap.load_state_dict(checkpoint['net_ap'])
        net_whole.load_state_dict(checkpoint['net_whole'])
        net_cls.load_state_dict(checkpoint['net_cls'])
        crt_tlc.load_state_dict(checkpoint['centers'])
        start_epoch = checkpoint['epoch']
        best_top1 = checkpoint['best_prec']
        # start_epoch = checkpoint['epoch']
        # best_top1 = checkpoint['best_top1']
        # print("=> Start epoch {}  best top1 {:.1%}"
        #      .format(start_epoch, best_top1))
    
    # model = nn.DataParallel(model).cuda()
    # Wrap the heavy sub-nets in DataParallel; the cheap view-pooling stage
    # (net_vp) runs on a single GPU.
    net_bp = nn.DataParallel(net_bp).cuda()
    net_vp = net_vp.cuda()
    net_ap = nn.DataParallel(net_ap).cuda()
    net_whole = nn.DataParallel(net_whole).cuda()
    net_cls = nn.DataParallel(net_cls).cuda()
    # wrap multiple models in optimizer 
    # Shape branch: net_bp gets a fixed smaller lr (1e-3 group override).
    optim_shape = optim.SGD([{'params': net_ap.parameters()},
                            {'params': net_bp.parameters(), 'lr':1e-3},
                            {'params': net_cls.parameters()}],
                          lr=0.001, momentum=0.9, weight_decay=opt.weight_decay)

    # Sketch branch: pretrained feature layers train at 0.1x lr, new layers at 1x.
    base_param_ids = set(map(id, net_whole.module.features.parameters()))
    new_params = [p for p in net_whole.parameters() if id(p) not in base_param_ids]
    param_groups = [
    {'params': net_whole.module.features.parameters(), 'lr_mult':0.1},
    {'params':new_params, 'lr_mult':1.0}]

    # optim_sketch = optim.SGD(net_whole.module.parameters(), lr=0.01)
    optim_sketch = optim.SGD(param_groups, lr=0.001, momentum=0.9, weight_decay=opt.weight_decay)
    # The loss centers get their own, much larger learning rate.
    optim_centers = optim.SGD(crt_tlc.parameters(), lr=0.1)

    optimizer = (optim_sketch, optim_shape, optim_centers)
    model = (net_whole, net_bp, net_vp, net_ap, net_cls)

    # Schedule learning rate
    # NOTE(review): adjust_lr is defined but every call site below is
    # commented out, so the lr schedule is currently inactive.
    def adjust_lr(epoch, optimizer):
        # Step-decay: divide lr by 10 every `step_size` epochs.
        step_size = 800 if opt.pk_flag else 80 # 40
        lr = opt.lr * (0.1 ** (epoch // step_size))
        for g in optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)

    # Start training
    top1 = 0.0
    if opt.evaluate:
        # validate and compute mAP
        _, top1 = validate(test_sketch_loader, test_shape_loader, model, criterion, 0, opt)
        exit()
    best_epoch = -1
    best_metric = None
    # total_epochs = opt.max_epochs*10 if opt.pk_flag else opt.max_epochs
    for epoch in range(start_epoch, opt.max_epochs):
        # adjust_lr(epoch, optim_sketch)
        # adjust_lr(epoch, optim_shape)
        # adjust_lr(epoch, optim_centers)
        # cls acc top1
        train_top1 = train(train_sketch_loader, train_shape_loader, model, criterion, optimizer, epoch, opt)
        # NOTE(review): this skips validation only on interval epochs before
        # start_save; the intent (warm-up skip?) looks inverted — confirm.
        if epoch < opt.start_save and (epoch % opt.interval == 0):
            continue

        # Only validate once training accuracy is above a sanity threshold.
        if train_top1 > 0.1:
            print("Test:")
            cur_metric = validate(test_sketch_loader, test_shape_loader, model, criterion, epoch, opt)
            top1 = cur_metric[-1]

        # NOTE(review): if the branch above is skipped on an epoch where
        # is_best becomes True, `cur_metric` may be unbound (NameError) or
        # stale from an earlier epoch — verify.
        is_best = top1 > best_top1
        if is_best:
            best_epoch = epoch + 1
            best_metric = cur_metric
        best_top1 = max(top1, best_top1)


        
        # Snapshot every sub-network (unwrapped from DataParallel via .module)
        # plus the loss centers; always save "latest", and "best" on improvement.
        checkpoint = {} 
        checkpoint['epoch'] = epoch + 1
        checkpoint['current_prec'] = top1
        checkpoint['best_prec'] = best_top1
        checkpoint['net_bp'] = net_bp.module.state_dict() 
        checkpoint['net_ap'] = net_ap.module.state_dict() 
        checkpoint['net_whole'] = net_whole.module.state_dict() 
        checkpoint['net_cls'] = net_cls.module.state_dict() 
        checkpoint['centers'] = crt_tlc.state_dict()
        
        path_checkpoint = '{0}/model_latest.pth'.format(opt.checkpoint_folder)
        utils.save_checkpoint(checkpoint, path_checkpoint)
        
        if is_best: # save checkpoint 
            path_checkpoint = '{0}/model_best.pth'.format(opt.checkpoint_folder)
            utils.save_checkpoint(checkpoint, path_checkpoint)
            # Also preserve the feature matrix dumped during validation.
            if opt.sf:
              shutil.copyfile(opt.checkpoint_folder+'/test_feat_temp.mat', opt.checkpoint_folder+'/test_feat_best.mat')

        print('\n * Finished epoch {:3d}  top1: {:5.3%}  best: {:5.3%}{} @epoch {}\n'.
              format(epoch, top1, best_top1, ' *' if is_best else '', best_epoch))

        print('Best metric', best_metric)
# Example #9
def main():
    """Train a stereo depth/disparity completion model.

    Reads configuration via `options`, sets up logging (file + tensorboard),
    builds the dataloaders/model/criterion/optimizer, optionally resumes from
    a pretrained checkpoint, then runs the epoch loop with on-the-fly
    validation and periodic checkpointing.
    """
    # Setup workspace and backup files
    cfg = options.get_config()
    workspace = utils.setup_workspace(cfg.workspace)
    # Append to the existing log when resuming from a pretrained model,
    # otherwise start a fresh log file.
    if cfg.pretrained is not None:
        logger = utils.Logger(os.path.join(workspace.log, 'train_log.txt'),
                              mode='a')
    else:
        logger = utils.Logger(os.path.join(workspace.log, 'train_log.txt'))
    tf_logger = SummaryWriter(workspace.log)
    logger.write('Workspace: {}'.format(cfg.workspace), 'green')
    logger.write('CUDA: {}, Multi-GPU: {}'.format(cfg.cuda, cfg.multi_gpu),
                 'green')
    logger.write('To-disparity: {}'.format(cfg.to_disparity), 'green')

    # Define dataloader
    logger.write('Dataset: {}'.format(cfg.dataset_name), 'green')
    train_dataset, val_dataset = options.get_dataset(cfg.dataset_name)
    train_loader = DataLoader(
        train_dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        num_workers=cfg.workers,
        pin_memory=True,
        sampler=None,
        worker_init_fn=lambda work_id: np.random.seed(work_id))
    # worker_init_fn ensures different sampling patterns for
    # each data loading thread
    val_loader = DataLoader(val_dataset,
                            batch_size=1,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=cfg.workers)

    # Define model
    logger.write('Model: {}'.format(cfg.model_name), 'green')
    model = options.get_model(cfg.model_name)
    if cfg.multi_gpu:
        model = nn.DataParallel(model)
    if cfg.cuda:
        model = model.cuda()

    # Define loss function
    criterion = options.get_criterion(cfg.criterion_name)
    if cfg.cuda:
        criterion = criterion.cuda()
    logger.write('Criterion: {}'.format(criterion), 'green')

    # Define optimizer and learning rate scheduler
    optim = options.get_optimizer(cfg.optimizer_name, model.parameters())
    lr_scheduler = options.get_lr_scheduler(cfg.lr_scheduler_name, optim)
    logger.write('Optimizer: {}'.format(optim), 'green')
    if lr_scheduler is not None:
        logger.write('Learning rate schedular: {}'.format(lr_scheduler),
                     'green')

    # [Optional] load pretrained model
    start_ep = 0
    global_step = 0
    local_start = 0
    if cfg.pretrained is not None:
        start_ep, global_step = utils.load_checkpoint(model, optim,
                                                      lr_scheduler,
                                                      cfg.pretrained,
                                                      cfg.weight_only)
        logger.write('Load pretrained model from {}'.format(cfg.pretrained),
                     'green')
        #global_step = len(train_dataset) * start_ep # NOTE: global step start from the beginning of the epoch
        # Resume the iteration counter mid-epoch from the saved global step.
        local_start = global_step % len(train_dataset)

    # Start training
    logger.write('Start training...', 'green')
    for ep in range(start_ep, cfg.max_epoch):
        # NOTE(review): scheduler.step() is called at the start of each epoch,
        # before any optimizer.step() — the pre-PyTorch-1.1 ordering; confirm
        # this matches the installed torch version's expectations.
        if lr_scheduler is not None:
            logger.write('Update learning rate: {} --> '.format(
                lr_scheduler.get_lr()[0]),
                         'magenta',
                         end='')
            lr_scheduler.step()
            logger.write('{}'.format(lr_scheduler.get_lr()[0]), 'magenta')

        # Train an epoch
        model.train()
        meters = metric.Metrics(cfg.train_metric_field)
        avg_meters = metric.MovingAverageEstimator(cfg.train_metric_field)
        end = time.time()
        # NOTE(review): enumerate(..., local_start) only offsets the counter
        # `it`; it does NOT skip already-seen batches, so a mid-epoch resume
        # replays the loader from its beginning — confirm this is intended.
        for it, data in enumerate(train_loader, local_start):
            # Pack data
            # Move every tensor in the batch onto the GPU.
            if cfg.cuda:
                for k in data.keys():
                    data[k] = data[k].cuda()
            inputs = dict()
            inputs['left_rgb'] = data['left_rgb']
            inputs['right_rgb'] = data['right_rgb']
            # Train either in disparity space or in depth space, depending on
            # cfg.to_disparity; input sparse maps and target switch together.
            if cfg.to_disparity:
                inputs['left_sd'] = data['left_sdisp']
                inputs['right_sd'] = data['right_sdisp']
                target = data['left_disp']
            else:
                inputs['left_sd'] = data['left_sd']
                inputs['right_sd'] = data['right_sd']
                target = data['left_d']
            data_time = time.time() - end

            # Inference, compute loss and update model
            end = time.time()
            optim.zero_grad()
            pred = model(inputs)
            # For the inverse-disparity L1 criterion the prediction is first
            # converted to depth and compared against the depth ground truth.
            if cfg.criterion_name in ['inv_disp_l1']:
                pred_d = utils.disp2depth(pred, data['width'].item())
                loss = criterion(pred_d, data['left_d'])
            else:
                loss = criterion(pred, target)
            loss.backward()
            optim.step()
            update_time = time.time() - end
            end = time.time()

            # Measure performance
            pred_np = pred.data.cpu().numpy()
            target_np = target.data.cpu().numpy()
            results = meters.compute(pred_np, target_np)
            avg_meters.update(results)

            # Print results
            if (it % cfg.print_step) == 0:
                logger.write('[{:2d}/{:2d}][{:5d}/{:5d}] data time: {:4.3f}, update time: {:4.3f}, loss: {:.4f}'\
                             .format(ep, cfg.max_epoch, it, len(train_loader), data_time,
                                     update_time, loss.item()))
                avg_results = avg_meters.compute()
                logger.write('   [Average results] ', end='')
                for key, val in avg_results.items():
                    logger.write('{}: {:5.3f} '.format(key, val), end='')
                logger.write('')
                # Reset the moving average so each printout covers only the
                # interval since the previous one.
                avg_meters.reset()

            # Log to tensorboard
            if (it % cfg.tflog_step) == 0:
                tf_logger.add_scalar('A-Loss/loss', loss.data, global_step)
                for key, val in results.items():
                    tf_logger.add_scalar('B-Train-Dense-Metric/{}'.format(key),
                                         val, global_step)
                if cfg.lr_scheduler_name is not None:
                    tf_logger.add_scalar('C-Learning-Rate',
                                         lr_scheduler.get_lr()[0], global_step)
                tf_logger.add_image('A-RGB/left', inputs['left_rgb'].data,
                                    global_step)
                tf_logger.add_image('A-RGB/right', inputs['right_rgb'].data,
                                    global_step)
                # Per-sample max of the target, kept broadcastable, so all
                # logged images are normalized into [0, 1] for display.
                norm_factor = target.data.max(-1)[0].max(-1)[0].max(
                    -1)[0][:, None, None, None]
                tf_logger.add_image('B-sD',
                                    inputs['left_sd'].data / norm_factor,
                                    global_step)
                tf_logger.add_image('C-Pred', pred.data / norm_factor,
                                    global_step)
                tf_logger.add_image('C-Ground-Truth',
                                    target.data / norm_factor, global_step)
                if cfg.dump_all_param:  # NOTE: this will require a lot of HDD memory
                    for name, param in model.named_parameters():
                        tf_logger.add_histogram(
                            name + '/vars',
                            param.data.clone().cpu().numpy(), global_step)
                        if param.requires_grad:
                            tf_logger.add_histogram(
                                name + '/grads',
                                param.grad.clone().cpu().numpy(), global_step)

            # On-the-fly validation
            if (it % cfg.val_step) == 0:  # and not (ep == 0 and it == 0):
                validate(global_step, val_loader, model, logger, tf_logger,
                         cfg)

            # Save model
            if (it % cfg.save_step) == 0:
                ckpt_path = utils.save_checkpoint(workspace.ckpt, model, optim,
                                                  lr_scheduler, ep,
                                                  global_step)
                logger.write('Save checkpoint to {}'.format(ckpt_path),
                             'magenta')

            # Update global step
            global_step += 1

            # End the epoch once the (possibly offset) counter reaches the
            # dataset size; reset the offset so later epochs start at 0.
            # NOTE(review): this compares against len(train_dataset), not
            # len(train_loader) (batch count) — confirm the intended unit.
            if it >= len(train_dataset):
                local_start = 0
                break
def train(opt):
    """Train an image-captioning model with CIDEr-driven checkpointing.

    Builds the dataloader and model from `opt`, optionally resumes model /
    optimizer / infos from opt.checkpoint_path, then loops over training
    batches until opt.max_epochs, periodically evaluating on the validation
    split and saving a 'best' checkpoint when CIDEr improves.

    Args:
        opt: parsed option namespace (checkpoint_path, id, learning_rate,
             max_epochs, logging/eval intervals, etc.).
    """
    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    # `infos` carries resumable training state (iteration, epoch, vocab,
    # best score) across runs.
    infos = {
        'iter': 0,
        'epoch': 0,
        'vocab': loader.get_vocab(),
    }
    # Load old infos (if there is) and check if models are compatible
    if opt.checkpoint_path is not None and os.path.isfile(
            os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '.pkl')):
        with open(
                os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '.pkl'),
                'rb') as f:
            infos = utils.pickle_load(f)
            print('infos load success')
    # Always record the current options (overriding any resumed ones).
    infos['opt'] = opt

    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    # The vocab is only needed during model construction; drop it so `opt`
    # stays small (it gets pickled into infos).
    del opt.vocab
    # Load pretrained weights:
    if opt.checkpoint_path is not None and os.path.isfile(
            os.path.join(opt.checkpoint_path, 'model.pth')):
        model.load_state_dict(
            torch.load(os.path.join(opt.checkpoint_path, 'model.pth')))
        print('model load success')

    # Wrap generation model with loss function(used for training)
    # This allows loss function computed separately on each machine
    lw_model = LossWrapper(model, opt)
    # Wrap with dataparallel
    dp_model = torch.nn.DataParallel(model)
    dp_lw_model = torch.nn.DataParallel(lw_model)

    ##########################
    #  Build optimizer
    ##########################
    # Adam wrapped in a reduce-on-plateau scheduler: the LR halves after 3
    # evaluations without improvement (stepped below via scheduler_step).
    optimizer = utils.ReduceLROnPlateau(optim.Adam(model.parameters(),
                                                   opt.learning_rate),
                                        factor=0.5,
                                        patience=3)
    # Load the optimizer
    if opt.checkpoint_path is not None and os.path.isfile(
            os.path.join(opt.checkpoint_path, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.checkpoint_path, 'optimizer.pth')))

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    best_val_score = infos.get('best_val_score', None)
    print('iter {}, epoch {}, best_val_score {}'.format(
        iteration, epoch, best_val_score))

    print(sorted(dict(set(vars(opt).items())).items(), key=lambda x: x[0]))
    # Start training
    # Self-critical training needs the CIDEr scorer initialized up front.
    if opt.self_critical:
        init_scorer(opt.cached_tokens)
    # Assure in training mode
    dp_lw_model.train()
    try:
        while True:
            # Stop if reaching max_epoch
            if epoch >= opt.max_epochs:
                break

            # Load data from train split (0)
            data = loader.get_batch('train')

            torch.cuda.synchronize()

            # Move the batch tensors to GPU, preserving None placeholders
            # (e.g. att_masks may legitimately be None).
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            optimizer.zero_grad()
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks,
                                    att_masks, data['gts'],
                                    torch.arange(0, len(data['gts'])))

            # Mean over the per-GPU losses returned by DataParallel.
            loss = model_out['loss'].mean()

            loss.backward()
            # Clip gradient values (not norms) to stabilize training.
            torch.nn.utils.clip_grad_value_(model.parameters(), 0.1)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()

            # Update the iteration and epoch
            iteration += 1
            # The loader signals an epoch boundary when it wraps around.
            if data['bounds']['wrapped']:
                epoch += 1

            # Write the training loss summary
            if iteration % opt.losses_log_every == 0:
                tb_summary_writer.add_scalar('train_loss', train_loss,
                                             iteration)
                opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr,
                                             iteration)
                if opt.self_critical:
                    tb_summary_writer.add_scalar('avg_reward',
                                                 model_out['reward'].mean(),
                                                 iteration)

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch

            # make evaluation on validation set, and save model
            if iteration % opt.save_checkpoint_every == 0:
                tb_summary_writer.add_scalar('epoch', epoch, iteration)
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                _, _, lang_stats = eval_utils.eval_split(
                    dp_model, loader, eval_kwargs)

                # Plateau scheduler minimizes its input, so feed -CIDEr to
                # treat a rising CIDEr as improvement.
                optimizer.scheduler_step(-lang_stats['CIDEr'])
                # Write validation result into summary
                for k, v in lang_stats.items():
                    tb_summary_writer.add_scalar(k, v, iteration)

                # Save model if is improving on validation result
                current_score = lang_stats['CIDEr']

                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscellaneous information
                infos['best_val_score'] = best_val_score

                # Always save the latest checkpoint; additionally tag a
                # 'best' copy when the validation CIDEr improved.
                utils.save_checkpoint(opt, model, infos, optimizer)
                if best_flag:
                    utils.save_checkpoint(opt,
                                          model,
                                          infos,
                                          optimizer,
                                          append='best')

    except (RuntimeError, KeyboardInterrupt):
        # NOTE(review): this silently swallows RuntimeError (and Ctrl-C)
        # without saving a checkpoint or printing the traceback, unlike the
        # sibling training script above — consider logging and saving here.
        pass