def show(opt):

    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt, is_show=True)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    infos = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], "! Command line argument and saved model disagree on '%s' " % checkme

    model = models.setup(opt)
    model.cuda()
    model.load_state_dict(torch.load(os.path.join(opt.start_from, 'model-best.pth')))

    crit = utils.LanguageModelCriterion()

    # eval model
    eval_kwargs = {}
    eval_kwargs.update(vars(opt))
    eval_kwargs.update({'split': 'show',
                        'dataset': opt.input_json,
                        'language_eval': 0,
                        'beam_size': 5,
                        'print_all_beam': True})
    val_loss, predictions, lang_stats = eval_utils.eval_split(model, crit, loader, eval_kwargs)
Exemple #2
0
def train(opt):
    # opt.use_att = utils.if_use_att(opt.caption_model)
    opt.use_att = True
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    print(opt.checkpoint_path)
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    critic_loss_history = histories.get('critic_loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    variance_history = histories.get('variance_history', {})
    time_history = histories.get('time_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    dp_model = model

    target_actor = models.setup(opt).cuda()

    ####################### Critic pretrain #####################################################################
    ##### Critic with state as input
    # if opt.critic_model == 'state_critic':
    #     critic_model = CriticModel(opt)
    # else:
    critic_model = AttCriticModel(opt)
    target_critic = AttCriticModel(opt)
    if vars(opt).get('start_from_critic', None) is not None and True:
        # check if all necessary files exist
        assert os.path.isdir(opt.start_from_critic
                             ), " %s must be a a path" % opt.start_from_critic
        print(
            os.path.join(opt.start_from_critic,
                         opt.critic_model + '_model.pth'))
        critic_model.load_state_dict(
            torch.load(
                os.path.join(opt.start_from_critic,
                             opt.critic_model + '_model.pth')))
        target_critic.load_state_dict(
            torch.load(
                os.path.join(opt.start_from_critic,
                             opt.critic_model + '_model.pth')))
    critic_model = critic_model.cuda()
    target_critic = target_critic.cuda()
    critic_optimizer = utils.build_optimizer(critic_model.parameters(), opt)
    dp_model.eval()
    critic_iter = 0
    init_scorer(opt.cached_tokens)
    critic_model.train()
    error_sum = 0
    loss_vector_sum = 0
    while opt.pretrain_critic == 1:
        if critic_iter > opt.pretrain_critic_steps:
            print('****************Finished critic training!')
            break
        data = loader.get_batch('train')
        torch.cuda.synchronize()
        start = time.time()
        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['masks'],
            data['att_masks']
        ]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp
        critic_model.train()
        critic_optimizer.zero_grad()
        assert opt.critic_model == 'att_critic_vocab'
        # crit_loss, reward, std = critic_loss_fun(fc_feats, att_feats, att_masks, dp_model, critic_model, opt, data)
        crit_loss, reward, std = target_critic_loss_fun_mask(
            fc_feats, att_feats, att_masks, dp_model, critic_model, opt, data,
            target_critic, target_actor)
        crit_loss.backward()
        critic_optimizer.step()
        #TODO update target.
        for cp, tp in zip(critic_model.parameters(),
                          target_critic.parameters()):
            tp.data = tp.data + opt.gamma_critic * (cp.data - tp.data)
        crit_train_loss = crit_loss.item()
        torch.cuda.synchronize
        end = time.time()
        error_sum += crit_train_loss**0.5 - std
        if (critic_iter % opt.losses_log_every == 0):
            print("iter {} , crit_train_loss = {:.3f}, difference = {:.3f}, difference_sum = {:.3f}, time/batch = {:.3f}" \
                .format(critic_iter, crit_train_loss**0.5, crit_train_loss**0.5-std, error_sum, end - start))
            print(opt.checkpoint_path)
            opt.importance_sampling = 1
            critic_model.eval()
            _, _, _, _ = get_rf_loss(dp_model,
                                     fc_feats,
                                     att_feats,
                                     att_masks,
                                     data,
                                     opt,
                                     loader,
                                     critic_model,
                                     test_critic=True)

        critic_iter += 1

        # make evaluation on validation set, and save model
        if (critic_iter % opt.save_checkpoint_every == 0):
            if not os.path.isdir(opt.checkpoint_path):
                os.mkdir(opt.checkpoint_path)
            checkpoint_path = os.path.join(opt.checkpoint_path,
                                           opt.critic_model + '_model.pth')
            torch.save(critic_model.state_dict(), checkpoint_path)

    ######################### Actor-critic Training #####################################################################

    update_lr_flag = True
    # Assure in training mode
    dp_model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    first_order = 0
    second_order = 0
    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(optimizer, opt.current_lr)
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False

        # Load data from train split (0)
        data = loader.get_batch('train')
        if data['bounds']['it_pos_now'] > 5000:
            loader.reset_iterator('train')
            continue
        dp_model.train()
        critic_model.eval()

        torch.cuda.synchronize()
        start = time.time()
        gen_result = None
        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['masks'],
            data['att_masks']
        ]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp
        optimizer.zero_grad()
        if not sc_flag:
            loss = crit(dp_model(fc_feats, att_feats, labels, att_masks),
                        labels[:, 1:], masks[:, 1:])
        else:
            if opt.rl_type == 'sc':
                gen_result, sample_logprobs = dp_model(fc_feats,
                                                       att_feats,
                                                       att_masks,
                                                       opt={'sample_max': 0},
                                                       mode='sample')
                reward = get_self_critical_reward(dp_model, fc_feats,
                                                  att_feats, att_masks, data,
                                                  gen_result, opt)
                loss = rl_crit(sample_logprobs, gen_result.data,
                               torch.from_numpy(reward).float().cuda())
            elif opt.rl_type == 'reinforce':
                gen_result, sample_logprobs = dp_model(fc_feats,
                                                       att_feats,
                                                       att_masks,
                                                       opt={'sample_max': 0},
                                                       mode='sample')
                reward = get_reward(data, gen_result, opt)
                loss = rl_crit(sample_logprobs, gen_result.data,
                               torch.from_numpy(reward).float().cuda())
            elif opt.rl_type == 'arsm':
                loss = get_arm_loss(dp_model, fc_feats, att_feats, att_masks,
                                    data, opt, loader)
                #print(loss)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'rf4':
                loss, _, _, _ = get_rf_loss(dp_model, fc_feats, att_feats,
                                            att_masks, data, opt, loader)
                # print(loss)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'importance_sampling':
                opt.importance_sampling = 1
                loss, gen_result, reward, sample_logprobs_total = get_rf_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt,
                    loader)
                reward = np.repeat(reward[:, np.newaxis], gen_result.shape[1],
                                   1)
                std = np.std(reward)
            elif opt.rl_type == 'importance_sampling_critic':
                opt.importance_sampling = 1
                loss, gen_result, reward, sample_logprobs_total = get_rf_loss(
                    target_actor, fc_feats, att_feats, att_masks, data, opt,
                    loader, target_critic)
                reward = np.repeat(reward[:, np.newaxis], gen_result.shape[1],
                                   1)
                std = np.std(reward)
            elif opt.rl_type == 'ar':
                loss = get_ar_loss(dp_model, fc_feats, att_feats, att_masks,
                                   data, opt, loader)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'mct_baseline':
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, mct_baseline = get_mct_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt,
                    loader)
                reward = get_reward(data, gen_result, opt)
                reward_cuda = torch.from_numpy(reward).float().cuda()
                mct_baseline[mct_baseline < 0] = reward_cuda[mct_baseline < 0]
                if opt.arm_step_sample == 'greedy':
                    sample_logprobs = sample_logprobs * probs
                loss = rl_crit(
                    sample_logprobs, gen_result.data,
                    torch.from_numpy(reward).float().cuda() - mct_baseline)
            elif opt.rl_type == 'arsm_baseline':
                opt.arm_as_baseline = 1
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, arm_baseline = get_arm_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt,
                    loader)
                reward = get_reward(data, gen_result, opt)
                reward_cuda = torch.from_numpy(reward).float().cuda()
                arm_baseline[arm_baseline < 0] = reward_cuda[arm_baseline < 0]
                if opt.arm_step_sample == 'greedy' and False:
                    sample_logprobs = sample_logprobs * probs
                loss = rl_crit(sample_logprobs, gen_result.data,
                               reward_cuda - arm_baseline)
            elif opt.rl_type == 'ars_indicator':
                opt.arm_as_baseline = 1
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, arm_baseline = get_arm_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt,
                    loader)
                reward = get_self_critical_reward(dp_model, fc_feats,
                                                  att_feats, att_masks, data,
                                                  gen_result, opt)
                reward_cuda = torch.from_numpy(reward).float().cuda()
                loss = rl_crit(sample_logprobs, gen_result.data,
                               reward_cuda * arm_baseline)
            elif opt.rl_type == 'arsm_baseline_critic':
                opt.arm_as_baseline = 1
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, arm_baseline = get_arm_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt,
                    loader, critic_model)
                reward, std = get_reward(data, gen_result, opt, critic=True)
                if opt.arm_step_sample == 'greedy':
                    sample_logprobs = sample_logprobs * probs
                loss = rl_crit(
                    sample_logprobs, gen_result.data,
                    torch.from_numpy(reward).float().cuda() - arm_baseline)
            elif opt.rl_type == 'arsm_critic':
                #print(opt.critic_model)
                tic = time.time()
                loss = get_arm_loss(dp_model, fc_feats, att_feats, att_masks,
                                    data, opt, loader, critic_model)
                #print('arm_loss time', str(time.time()-tic))
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'critic_vocab_sum':
                assert opt.critic_model == 'att_critic_vocab'
                tic = time.time()
                gen_result, sample_logprobs_total = dp_model(
                    fc_feats,
                    att_feats,
                    att_masks,
                    opt={'sample_max': 0},
                    total_probs=True,
                    mode='sample')  #batch, seq, vocab
                #print('generation time', time.time()-tic)
                gen_result_pad = torch.cat([
                    gen_result.new_zeros(
                        gen_result.size(0), 1, dtype=torch.long), gen_result
                ], 1)
                tic = time.time()
                critic_value = critic_model(gen_result_pad, fc_feats,
                                            att_feats, True, opt,
                                            att_masks)  #batch, seq, vocab
                #print('critic time', time.time() - tic)
                probs = torch.sum(
                    F.softmax(sample_logprobs_total, 2) *
                    critic_value.detach(), 2)
                mask = (gen_result > 0).float()
                mask = torch.cat(
                    [mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1)
                loss = -torch.sum(probs * mask) / torch.sum(mask)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'reinforce_critic':
                #TODO change the critic to attention
                if opt.critic_model == 'state_critic':
                    critic_value, gen_result, sample_logprobs = critic_model(
                        dp_model, fc_feats, att_feats, opt, att_masks)
                    reward, std = get_reward(data,
                                             gen_result,
                                             opt,
                                             critic=True)
                    loss = rl_crit(
                        sample_logprobs, gen_result.data,
                        torch.from_numpy(reward).float().cuda() -
                        critic_value[:, :-1].data)
                elif opt.critic_model == 'att_critic':
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    gen_result_pad = torch.cat([
                        gen_result.new_zeros(gen_result.size(0),
                                             1,
                                             dtype=torch.long), gen_result
                    ], 1)
                    critic_value = critic_model(gen_result_pad, fc_feats,
                                                att_feats, True, opt,
                                                att_masks).squeeze(2)

                    reward, std = get_reward(data,
                                             gen_result,
                                             opt,
                                             critic=True)
                    loss = rl_crit(
                        sample_logprobs, gen_result.data,
                        torch.from_numpy(reward).float().cuda() -
                        critic_value.data)
        if opt.mle_weights != 0:
            loss += opt.mle_weights * crit(
                dp_model(fc_feats, att_feats, labels, att_masks),
                labels[:, 1:], masks[:, 1:])
        #TODO make sure all sampling replaced by greedy for critic
        #### update the actor
        loss.backward()
        # with open(os.path.join(opt.checkpoint_path, 'best_embed.pkl'), 'wb') as f:
        #     cPickle.dump(list(dp_model.embed.parameters())[0].data.cpu().numpy(), f)
        # with open(os.path.join(opt.checkpoint_path, 'best_logit.pkl'), 'wb') as f:
        #     cPickle.dump(list(dp_model.logit.parameters())[0].data.cpu().numpy(), f)
        ## compute variance
        gradient = torch.zeros([0]).cuda()
        for i in model.parameters():
            gradient = torch.cat((gradient, i.grad.view(-1)), 0)
        first_order = 0.9999 * first_order + 0.0001 * gradient
        second_order = 0.9999 * second_order + 0.0001 * gradient.pow(2)
        # print(torch.max(torch.abs(gradient)))
        variance = torch.mean(torch.abs(second_order -
                                        first_order.pow(2))).item()
        if opt.rl_type != 'arsm' or not sc_flag:
            utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        # ### update the critic
        if 'critic' in opt.rl_type:
            dp_model.eval()
            critic_model.train()
            utils.set_lr(critic_optimizer, opt.critic_learning_rate)
            critic_optimizer.zero_grad()
            assert opt.critic_model == 'att_critic_vocab'
            crit_loss, reward, std = target_critic_loss_fun_mask(
                fc_feats,
                att_feats,
                att_masks,
                dp_model,
                critic_model,
                opt,
                data,
                target_critic,
                target_actor,
                gen_result=gen_result,
                sample_logprobs_total=sample_logprobs_total,
                reward=reward)
            crit_loss.backward()
            critic_optimizer.step()
            for cp, tp in zip(critic_model.parameters(),
                              target_critic.parameters()):
                tp.data = tp.data + opt.gamma_critic * (cp.data - tp.data)
            for cp, tp in zip(dp_model.parameters(),
                              target_actor.parameters()):
                tp.data = tp.data + opt.gamma_actor * (cp.data - tp.data)
            crit_train_loss = crit_loss.item()
            error_sum += crit_train_loss**0.5 - std
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if (iteration % opt.losses_log_every == 0):
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
                print(opt.checkpoint_path)
            elif 'critic' in opt.rl_type:
                print(
                    "iter {} , crit_train_loss = {:.3f}, difference = {:.3f}, difference_sum = {:.3f},variance = {:g}, time/batch = {:.3f}" \
                    .format(iteration, crit_train_loss ** 0.5, crit_train_loss ** 0.5 - std, error_sum, variance, end - start))
                print(opt.checkpoint_path)
                critic_model.eval()
                _, _, _, _ = get_rf_loss(dp_model,
                                         fc_feats,
                                         att_feats,
                                         att_masks,
                                         data,
                                         opt,
                                         loader,
                                         critic_model,
                                         test_critic=True)
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, variance = {:g}, time/batch = {:.3f}" \
                      .format(iteration, epoch, np.mean(reward[:, 0]), variance, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward), iteration)
                add_summary_value(tb_summary_writer, 'variance', variance,
                                  iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward)
            critic_loss_history[
                iteration] = crit_train_loss if 'critic' in opt.rl_type else 0
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob
            variance_history[iteration] = variance
            time_history[iteration] = end - start

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                if not os.path.isdir(opt.checkpoint_path):
                    os.mkdir(opt.checkpoint_path)
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               opt.critic_model + '_model.pth')
                torch.save(critic_model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['critic_loss_history'] = critic_loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                histories['variance_history'] = variance_history
                histories['time'] = time_history
                # histories['variance'] = 0
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):

    # Load data
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    # Tensorboard summaries (they're great!)
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    # Load pretrained model, info file, histories file
    infos = {}
    histories = {}
    if opt.start_from is not None:
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    #ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    # Create model
    model = models.setup(opt).cuda()
    #pretrained_dict = torch.load(opt.model)
    #model.load_state_dict(pretrained_dict, strict=False)

    num_params = get_n_params(model)
    print('number of parameteres:', num_params)

    dp_model = torch.nn.DataParallel(model)
    dp_model.train()

    # Loss function
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    # Optimizer and learning rate adjustment flag
    optimizer = utils.build_optimizer(model.parameters(), opt)
    update_lr_flag = True

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    # Training loop
    while True:

        # Update learning rate once per epoch
        if update_lr_flag:

            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(optimizer, opt.current_lr)

            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                #opt.ss_prob = min(opt.scheduled_sampling_increase_prob  * frac, opt.scheduled_sampling_max_prob)
                #model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False

        # Load data from train split (0)
        start = time.time()
        data = loader.get_batch('train')
        data_time = time.time() - start
        start = time.time()

        # Unpack data
        torch.cuda.synchronize()
        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['dist'],
            data['masks'], data['att_masks']
        ]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, dist_label, masks, att_masks = tmp
        batchsize = fc_feats.size(0)
        # Forward pass and loss
        optimizer.zero_grad()
        if not sc_flag:
            wordact, reconstruct = dp_model(fc_feats, att_feats, labels)
            #loss_dist = F.binary_cross_entropy(dist, dist_label.cpu().float())
            fc_feats_max, _ = att_feats.max(1)
            loss_rec = F.mse_loss(reconstruct.cpu(), fc_feats_max.cpu())
            mask = masks[:, 1:].contiguous()
            wordact = wordact[:, :, :-1]
            wordact_t = wordact.permute(0, 2, 1).contiguous()
            wordact_t = wordact_t.view(
                wordact_t.size(0) * wordact_t.size(1), -1)
            labels = labels.contiguous().view(-1, 6 * 30).cpu()
            wordclass_v = labels[:, 1:]
            wordclass_t = wordclass_v.contiguous().view(\
               wordclass_v.size(0) * wordclass_v.size(1), 1)
            maskids = torch.nonzero(mask.view(-1).cpu()).numpy().reshape(-1)
            loss_xe = F.cross_entropy(wordact_t[maskids, ...], \
               wordclass_t[maskids, ...].contiguous().view(maskids.shape[0]))
            loss = 5 * loss_xe + loss_rec
        else:
            gen_result, sample_logprobs = dp_model(fc_feats,
                                                   att_feats,
                                                   att_masks,
                                                   opt={'sample_max': 0},
                                                   mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                              att_masks, data, gen_result, opt)
            loss = rl_crit(sample_logprobs, gen_result.data,
                           torch.from_numpy(reward).float().cuda())

        # Backward pass
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()

        # Print
        total_time = time.time() - start
        if iteration % opt.print_freq == 1:
            print('Read data:', time.time() - start)
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, data_time = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, data_time, total_time))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, data_time = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, np.mean(reward[:,0]), data_time, total_time))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            #add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)
            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            #ss_prob_history[iteration] = model.ss_prob

        # Validate and save model
        if (iteration >= 60000 and iteration % opt.save_checkpoint_every == 0):
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)
            # Evaluate model
            eval_kwargs = {'split': 'test', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)
            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Our metric is CIDEr if available, otherwise validation loss
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            # Save model in checkpoint path
            best_flag = False
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

            # Dump miscalleous informations
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()
            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['lr_history'] = lr_history
            #histories['ss_prob_history'] = ss_prob_history
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'infos_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'histories_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)

            # Save model to unique file if new best model
            if best_flag:
                model_fname = 'model-best.pth'
                infos_fname = 'model-best.pkl'
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               model_fname)
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                with open(os.path.join(opt.checkpoint_path, infos_fname),
                          'wb') as f:
                    cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def eval(opt, model_name, infos_name, annFile, listener, split, iteration):
    # Input arguments and options
    # Load infos

    with open(infos_name, 'rb') as f:
        infos = cPickle.load(f, encoding='latin1')

    # For the case that we run eval not immediately after train, so arguments
    # are not exist. 'att_hid_size' is just one possible test to find out.
    if not hasattr(opt, 'att_hid_size'):
        opt = infos['opt']
    opt.split = split
    opt.beam_size = 2

    np.random.seed(123)

    # override and collect parameters
    if len(opt.input_fc_dir) == 0:
        opt.input_fc_dir = infos['opt'].input_fc_dir
        opt.input_att_dir = infos['opt'].input_att_dir
        opt.input_label_h5 = infos['opt'].input_label_h5
    if len(opt.input_json) == 0:
        opt.input_json = infos['opt'].input_json
    if opt.batch_size == 0:
        opt.batch_size = infos['opt'].batch_size
    if len(opt.id) == 0:
        opt.id = infos['opt'].id
    # if opt.initialize_retrieval == None:
    #     opt.initialize_retrieval = infos['opt'].initialize_retrieval
    ignore = [
        "id", "batch_size", "beam_size", "start_from", "language_eval",
        "initialize_retrieval", 'decoding_constraint', 'evaluation_retrieval',
        "input_fc_dir", "input_att_dir", "input_label_h5", 'seq_per_img',
        'closest_num', 'closest_file'
    ]
    # for k in vars(infos['opt']).keys():
    #     if k not in ignore:
    #         if k in vars(opt) and getattr(opt, k) is not None:
    #             assert vars(opt)[k] == vars(infos['opt'])[k], k + ' option not consistent:' + str(vars(opt)[k])+' '+ str(vars(infos['opt'])[k])
    #         else:
    #             vars(opt).update({k: vars(infos['opt'])[k]}) # copy over options from model

    vocab = infos['vocab']  # ix -> word mapping

    # assert opt.closest_num == opt.seq_per_img
    opt.vse_loss_weight = vars(opt).get('vse_loss_weight', 1)
    opt.caption_loss_weight = vars(opt).get('caption_loss_weight', 1)

    opt.cider_optimization = 0

    # Setup the model
    model = models.AlternatingJointModel(opt, iteration)
    # model = models.JointModel(opt)
    utils.load_state_dict(model, torch.load(model_name))
    if listener == 'gt':
        print('gt listener is loaded for evaluation')
        # utils.load_state_dict(model.vse, torch.load(opt.initialize_retrieval))
        utils.load_state_dict(
            model, {
                k: v
                for k, v in torch.load(opt.initialize_retrieval).items()
                if 'vse.' in k
            })

    model.cuda()
    model.eval()

    # Create the Data Loader instance
    loader = DataLoader(opt)
    # Set sample options
    loss, split_predictions, lang_stats = eval_utils.eval_split(
        model, loader, vars(opt), annFile, useGenSent=True)

    return {
        'loss': loss,
        'split_predictions': split_predictions,
        'lang_stats': lang_stats
    }
def train(opt):
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl'),'rb') as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')):
            with open(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl'),'rb') as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt)
    model.cuda()
    if opt.multi_gpu:
        model=nn.DataParallel(model)
    update_lr_flag = True
    # Assure in training mode
    model.train()

    crit = utils.LanguageModelCriterion()

    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
                # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate  ** frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr) # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob  * frac, opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            update_lr_flag = False
                
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
        tmp = [Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks = tmp
        
        optimizer.zero_grad()
        loss = crit(model(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:])
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss, iteration)
                add_summary_value(tf_summary_writer, 'learning_rate', opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
                tf_summary_writer.flush()

            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val',
                            'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(model, crit, loader, eval_kwargs)

            # Write validation result into summary
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss', val_loss, iteration)
                for k,v in lang_stats.items():
                    add_summary_value(tf_summary_writer, k, v, iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = - val_loss

            best_flag = False
            if True: # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(os.path.join(opt.checkpoint_path, 'histories_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'-best.pkl'), 'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #6
0
# Create the Data Loader instance
if len(opt.image_folder) == 0:
    loader = DataLoader(opt)
else:
    loader = DataLoaderRaw({
        'folder_path': opt.image_folder,
        'coco_json': opt.coco_json,
        'batch_size': opt.batch_size,
        'cnn_model': opt.cnn_model
    })
# When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json
# So make sure to use the vocab in infos file.
loader.ix_to_word = infos['vocab']

#opt.id = '+'.join([_+str(__) for _,__ in zip(opt.ids, opt.weights)])
# Set sample options
split_predictions_list = []
# opt.verbose_beam = 0
for order in range(opt.number_of_models):
    loss, split_predictions, lang_stats = eval_utils.eval_split(
        multi_models_list[order], crit, loader, vars(opt))
    split_predictions_list.append(split_predictions)

print('loss: ', loss)
if lang_stats:
    print(lang_stats)

if opt.dump_json == 1:
    # dump the json
    json.dump(split_predictions, open('vis/vis.json', 'w'))
def train(opt):
    # load train/valid/test data
    opt.vocab_size = get_nwords(opt.data_path)
    opt.category_size = get_nclasses(opt.data_path)
    mytrain_dset, myvalid_dset, mytest_dset = loaddset(opt)

    logger = tf and Logger(opt.checkpoint_path)
    # init or load training infos
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '-best.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["rnn_size", "num_layers"]  # optim needn't same
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from,
                             'histories_' + opt.id + '-best.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '-best.pkl')) as f:
                histories = cPickle.load(f)
        # random seed must be inherited if didn't assign it.
        if opt.seed == 0:
            opt.seed = infos['opt'].seed

    iteration = 1  #infos.get('iter', 0) + 1
    epoch = 0  #infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    else:
        best_val_score = None

    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    model = SAModel(opt)

    if opt.start_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(
            opt.start_from), " %s must be a a path" % opt.start_from
        model.load_state_dict(torch.load(
            os.path.join(opt.start_from, 'model-best.pth')),
                              strict=False)
    model.cuda()
    model.train()

    crit = LanguageModelCriterion()
    classify_crit = ClassiferCriterion()
    rl_crit = RewardCriterion()

    # select optimizer
    if opt.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=opt.learning_rate,
                               weight_decay=opt.weight_decay)
    elif opt.optim == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=1.0,
                                   weight_decay=opt.weight_decay)
        opt.learning_rate_decay_start = -1

    # training start
    tmp_patience = 0
    # each epoch
    while True:
        update_lr_flag = True  # when a new epoch start, set update_lr_flag to True
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0 and opt.optim != 'adadelta':
                frac = int((epoch - opt.learning_rate_decay_start) /
                           opt.learning_rate_decay_every)
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                myutils.set_lr(optimizer,
                               opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = int((epoch - opt.scheduled_sampling_start) /
                           opt.scheduled_sampling_increase_every)
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                myutils.init_cider_scorer(opt.reward_type)
            else:
                sc_flag = False
            update_lr_flag = False

        #loading train data
        myloader_train = DataLoader(mytrain_dset,
                                    batch_size=opt.batch_size,
                                    collate_fn=data_io.collate_fn,
                                    shuffle=True)
        torch.cuda.synchronize()
        for data, cap, cap_mask, cap_classes, class_mask, feat1, feat2, feat_mask, lens, groundtruth, image_id in myloader_train:
            start = time.time()
            cap = Variable(cap, requires_grad=False).cuda()
            cap_mask = Variable(cap_mask, requires_grad=False).cuda()
            cap_classes = Variable(
                cap_classes, requires_grad=False).cuda()  # (m, seq_len+1)
            class_mask = Variable(class_mask, requires_grad=False).cuda()
            feat1 = Variable(feat1, requires_grad=False).cuda()
            feat2 = Variable(feat2, requires_grad=False).cuda()

            feat_mask = Variable(feat_mask, requires_grad=False).cuda()

            cap_classes = torch.cat([cap_classes[:, -1:], cap_classes[:, :-1]],
                                    dim=-1)  # (m, seq_len+1)
            new_mask = torch.zeros_like(class_mask)  # (m, seq_len+1)
            for i in range(class_mask.size(0)):
                index = np.argwhere(
                    class_mask.data[i, :] != 0)[0][-1]  # posmask_i 中最后一个不为0的地方
                new_mask[i, :index + 1] = 1.0

            optimizer.zero_grad()

            if not sc_flag:
                out = model(
                    feat1, feat2, feat_mask, cap, cap_mask, cap_classes,
                    new_mask
                )  # (m,seq_len+1,n_words),(m, seq_len+1, n_classes)
                loss = classify_crit(out, cap_classes, cap_mask, class_mask)
            else:
                print('Not implement yet.')
                exit(11)
                gen_result, sample_logprobs = model.sample(
                    feat1, feat2, feat_mask, {'sample_max': 0})
                reward = myutils.get_self_critical_reward(
                    model, feat, feat_mask, groundtruth,
                    gen_result)  # (m,max_length)
                loss = rl_crit(
                    sample_logprobs, gen_result,
                    Variable(torch.from_numpy(reward).float().cuda(),
                             requires_grad=False))

            loss.backward()
            myutils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.data[0]
            torch.cuda.synchronize()
            end = time.time()

            if not sc_flag:
                print(
                    "iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print(
                    "iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                    .format(iteration, epoch, np.mean(reward[:, 0]),
                            end - start))

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                if tf is not None:
                    logger.scalar_summary('train_loss', train_loss, iteration)
                    logger.scalar_summary('learning_rate', opt.current_lr,
                                          iteration)
                    logger.scalar_summary('scheduled_sampling_prob',
                                          model.ss_prob, iteration)
                    if sc_flag:
                        logger.scalar_summary('avg_reward',
                                              np.mean(reward[:, 0]), iteration)

                loss_history[
                    iteration] = train_loss if not sc_flag else np.mean(
                        reward[:, 0])
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                print('validation and save the model...')
                time.sleep(3)
                eval_kwargs = {}
                eval_kwargs.update(
                    vars(opt))  # attend vars(opt) into eval_kwargs
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    model, crit, classify_crit, myvalid_dset, eval_kwargs)
                print('val loss: {}'.format(val_loss))
                print('validation is finish!')
                time.sleep(3)

                if tf is not None:
                    logger.scalar_summary('validation loss', val_loss,
                                          iteration)
                    if opt.language_eval == 1:
                        for tag, value in lang_stats.items():
                            if type(value) is list:
                                logger.scalar_summary(tag, value[-1],
                                                      iteration)
                            else:
                                logger.scalar_summary(tag, value, iteration)
                        for tag, value in model.named_parameters():
                            tag = tag.replace('.', '/')
                            logger.histo_summary(tag,
                                                 value.data.cpu().numpy(),
                                                 iteration)
                            logger.histo_summary(
                                tag + '/grad', (value.grad).data.cpu().numpy(),
                                iteration)

                val_result_history[iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                    #current_score = -val_loss
                else:
                    current_score = -val_loss
                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                    tmp_patience = 0
                else:
                    tmp_patience += 1

                if not os.path.exists(opt.checkpoint_path):
                    os.mkdir(opt.checkpoint_path)
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))

                # Dump miscalleous informations(current information)
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['val_score'] = lang_stats
                infos['val_sents'] = predictions

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'histories_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(histories, f)

            if tmp_patience >= opt.patience:
                break
            iteration += 1
        if tmp_patience >= opt.patience:
            print("early stop, trianing is finished!")
            break
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            print("reach max epochs, training is finished!")
            break
        epoch += 1
Exemple #8
0
def train(opt):
    # Deal with feature things before anything
    opt.use_fc, opt.use_att = utils.if_use_feat(opt.caption_model)
    if opt.use_box:
        opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'),
                  'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl'), 'rb') as f:
                histories = utils.pickle_load(f)
    else:
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['iterators'] = loader.iterators
        infos['split_ix'] = loader.split_ix
        infos['vocab'] = loader.get_vocab()
    infos['opt'] = opt

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    # cnn_model = utils.build_cnn(opt)
    cnn_model = create_extractor(
        "/root/PycharmProjects/vgg_vae_best_model.pth")
    cnn_model = cnn_model.cuda()

    if vars(opt).get('start_from', None) is not None:
        cnn_model.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'model-cnn.pth')))
        print("load cnn model parameters from {}".format(
            os.path.join(opt.start_from, 'model-cnn.pth')))

    model = models.setup(opt).cuda()
    dp_model = torch.nn.DataParallel(model)
    lw_model = LossWrapper(model, opt)
    dp_lw_model = torch.nn.DataParallel(lw_model)
    # dp_lw_model = lw_model
    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()

    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)

    # if opt.finetune_cnn_after != -1:
    #     # only finetune the layer2 to layer4
    cnn_optimizer = optim.Adam([{
        'params': module.parameters()
    } for module in cnn_model.finetune_modules],
                               lr=opt.cnn_learning_rate,
                               weight_decay=opt.cnn_weight_decay)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        if os.path.isfile(os.path.join(opt.start_from, "optimizer.pth")):
            optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        if opt.finetune_cnn_after != -1:
            if os.path.isfile(os.path.join(opt.start_from,
                                           'optimizer-cnn.pth')):
                cnn_optimizer.load_state_dict(
                    torch.load(
                        os.path.join(opt.start_from, 'optimizer-cnn.pth')))

    def save_checkpoint(model,
                        cnn_model,
                        infos,
                        optimizer,
                        cnn_optimizer,
                        histories=None,
                        append=''):
        if len(append) > 0:
            append = '-' + append
        # if checkpoint_path doesn't exist
        if not os.path.isdir(opt.checkpoint_path):
            os.makedirs(opt.checkpoint_path)
        checkpoint_path = os.path.join(opt.checkpoint_path,
                                       'model%s.pth' % (append))
        torch.save(model.state_dict(), checkpoint_path)
        print("model saved to {}".format(checkpoint_path))

        cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                           'model-cnn%s.pth' % (append))
        torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
        print("cnn model saved to {}".format(cnn_checkpoint_path))

        optimizer_path = os.path.join(opt.checkpoint_path,
                                      'optimizer%s.pth' % (append))
        torch.save(optimizer.state_dict(), optimizer_path)

        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            cnn_optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer%s-cnn.pth' % (append))
            torch.save(cnn_optimizer.state_dict(), cnn_optimizer_path)

        with open(
                os.path.join(opt.checkpoint_path,
                             'infos_' + opt.id + '%s.pkl' % (append)),
                'wb') as f:
            utils.pickle_dump(infos, f)
        if histories:
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'histories_' + opt.id + '%s.pkl' % (append)),
                    'wb') as f:
                utils.pickle_dump(histories, f)

    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start
                                ) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    # set the decayed rate
                    utils.set_lr(optimizer, opt.current_lr)
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start
                            ) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(
                        opt.scheduled_sampling_increase_prob * frac,
                        opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob

                # Update the training stage of cnn
                if opt.finetune_cnn_after == -1 or epoch < opt.finetune_cnn_after:
                    for p in cnn_model.parameters():
                        p.requires_grad = False
                    cnn_model.eval()
                else:
                    for p in cnn_model.parameters():
                        p.requires_grad = True
                    # Fix the first few layers:
                    for module in cnn_model.fixed_modules:
                        for p in module.parameters():
                            p.requires_grad = False
                        cnn_model.train()

                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False

                epoch_done = False

            start = time.time()
            # Load data from train split (0)
            data = loader.get_batch('train')
            torch.cuda.synchronize()
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()

            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            # att_feats 8x672x224
            att_feats = att_feats.view(att_feats.size(0), 3, 224, 224)
            att_feats, fc_feats = cnn_model(att_feats)
            # fc_feats = att_feats.mean(3).mean(2)
            # att_feats = torch.nn.functional.adaptive_avg_pool2d(
            #     att_feats, [7, 7]).permute(0, 2, 3, 1)
            att_feats = att_feats.permute(0, 2, 3, 1)
            att_feats = att_feats.view(att_feats.size(0), 49, -1)

            att_feats = att_feats.unsqueeze(1).expand(*((
                att_feats.size(0),
                opt.seq_per_img,
            ) + att_feats.size()[1:])).contiguous().view(
                (att_feats.size(0) * opt.seq_per_img), -1,
                att_feats.size()[-1])
            fc_feats = fc_feats.unsqueeze(1).expand(*((
                fc_feats.size(0),
                opt.seq_per_img,
            ) + fc_feats.size()[1:])).contiguous().view(
                *((fc_feats.size(0) * opt.seq_per_img, ) +
                  fc_feats.size()[1:]))

            optimizer.zero_grad()
            if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
                cnn_optimizer.zero_grad()
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks,
                                    att_masks, data['gts'],
                                    torch.arange(0, len(data['gts'])), sc_flag)

            loss = model_out['loss'].mean()

            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()

            if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
                utils.clip_gradient(cnn_optimizer, opt.grad_clip)
                cnn_optimizer.step()

            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()
            if not sc_flag:
                print(
                    "iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print(
                    "iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                    .format(iteration, epoch, model_out['reward'].mean(),
                            end - start))

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                                  iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                add_summary_value(tb_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                                  model.ss_prob, iteration)
                if sc_flag:
                    add_summary_value(tb_summary_writer, 'avg_reward',
                                      model_out['reward'].mean(), iteration)

                loss_history[
                    iteration] = train_loss if not sc_flag else model_out[
                        'reward'].mean()
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    cnn_model, model, lw_model.crit, loader, eval_kwargs)

                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                add_summary_value(tb_summary_writer, 'validation loss',
                                  val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        add_summary_value(tb_summary_writer, k, v, iteration)
                val_result_history[iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss

                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history

                save_checkpoint(model, cnn_model, infos, optimizer,
                                cnn_optimizer, histories)
                if opt.save_history_ckpt:
                    save_checkpoint(model,
                                    cnn_model,
                                    infos,
                                    optimizer,
                                    cnn_optimizer,
                                    append=str(iteration))

                if best_flag:
                    save_checkpoint(model,
                                    cnn_model,
                                    infos,
                                    optimizer,
                                    cnn_optimizer,
                                    append='best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        save_checkpoint(model, cnn_model, infos, optimizer, cnn_optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)

    # test model
    test_kwargs = {'split': 'test', 'dataset': opt.input_json}
    test_kwargs.update(vars(opt))
    val_loss, predictions, lang_stats = eval_utils.eval_split(
        cnn_model, model, lw_model.crit, loader, test_kwargs)

    if opt.reduce_on_plateau:
        if 'CIDEr' in lang_stats:
            optimizer.scheduler_step(-lang_stats['CIDEr'])
        else:
            optimizer.scheduler_step(val_loss)
            # Write validation result into summary
    add_summary_value(tb_summary_writer, 'test loss', val_loss, iteration)
    if lang_stats is not None:
        for k, v in lang_stats.items():
            add_summary_value(tb_summary_writer, k, v, iteration)
    val_result_history[iteration] = {
        'loss': val_loss,
        'lang_stats': lang_stats,
        'predictions': predictions
    }
def train(opt):
    # setup dataloader
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    #set the checkpoint path
    opt.checkpoint_path = os.path.join(opt.checkpoint_path, opt.id)
    isExists = os.path.exists(opt.checkpoint_path)
    if not isExists:
        os.makedirs(opt.checkpoint_path)
        os.makedirs(opt.checkpoint_path + '/logs')
        print(opt.checkpoint_path + ' creating !')
    else:
        print(opt.checkpoint_path + ' already exists!')

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(
                os.path.join(
                    opt.checkpoint_path, 'infos_' + opt.id +
                    format(int(opt.start_from), '04') + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "att_feat_size", "rnn_size",
                "input_encoding_size"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(
                    opt.checkpoint_path, 'histories_' + opt.id +
                    format(int(opt.start_from), '04') + '.pkl')):
            with open(
                    os.path.join(
                        opt.checkpoint_path, 'histories_' + opt.id +
                        format(int(opt.start_from), '04') + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    word_loss_history = histories.get('word_loss_history', {})
    MAD_loss_history = histories.get('MAD_loss_history', {})
    SAP_loss_history = histories.get('SAP_loss_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    lr_history = histories.get('lr_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    #set up model, assure in training mode
    threshold = opt.threshold
    sc_flag = False
    num_gpu = opt.num_gpu

    model = models.setup(opt).cuda(device=0)
    model.train()
    update_lr_flag = True
    dp_model = torch.nn.parallel.DataParallel(model)

    optimizer = optim.Adam(model.parameters(),
                           opt.learning_rate,
                           (opt.optim_alpha, opt.optim_beta),
                           opt.optim_epsilon,
                           weight_decay=opt.weight_decay)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(
                opt.checkpoint_path, 'optimizer' + opt.id +
                format(int(opt.start_from), '04') + '.pth')):
        optimizer.load_state_dict(
            torch.load(
                os.path.join(
                    opt.checkpoint_path, 'optimizer' + opt.id +
                    format(int(opt.start_from), '04') + '.pth')))

    if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
        frac = (epoch - opt.scheduled_sampling_start
                ) // opt.scheduled_sampling_increase_every
        opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                          opt.scheduled_sampling_max_prob)
        model.ss_prob = opt.ss_prob

    optimizer.zero_grad()
    accumulate_iter = 0
    train_loss = 0

    subsequent_mat = np.load('data/markov_mat.npy')
    subsequent_mat = torch.from_numpy(subsequent_mat).cuda(device=0).float()
    subsequent_mat_all = subsequent_mat.clone()
    # for multi-GPU training
    for i in range(opt.num_gpu - 1):
        subsequent_mat_all = torch.cat([subsequent_mat_all, subsequent_mat],
                                       dim=0)

    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            for group in optimizer.param_groups:
                group['lr'] = opt.current_lr

            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if sc_flag == False and opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                print('initializing CIDEr scorer...')
                s = time.time()
                global CiderD_scorer
                if (CiderD_scorer is None):
                    CiderD_scorer = CiderD(df=opt.cached_tokens)
                    #takes about 30s
                    print('initlizing CIDEr scorers in {:3f}s'.format(
                        time.time() - s))
                sc_flag = True
                opt.learning_rate_decay_every = opt.learning_rate_decay_every * 2  #default 5 for xe, 10 for scst

            update_lr_flag = False

        print('current_lr is {}'.format(opt.current_lr))
        start = time.time()
        data = loader.get_batch('train', opt.batch_size)

        torch.cuda.synchronize()

        fc_feats = None
        att_feats = None

        tmp = [
            data['fc_feats'], data['labels'], data['masks'], data['att_feats'],
            data['attr_labels'], data['subsequent_labels']
        ]
        tmp = [
            _ if _ is None else torch.from_numpy(_).cuda(device=0) for _ in tmp
        ]
        fc_feats, labels, masks, att_feats, attr_labels, subsequent_labels = tmp

        #convert 1-1000 to 0-999 (perhaps done in preprocessing)
        subsequent_labels = subsequent_labels - 1
        subsequent_mask = (subsequent_labels[:, 1:] >= 0).float()
        subsequent_labels = torch.where(
            subsequent_labels > 0, subsequent_labels,
            torch.zeros_like(subsequent_labels).int().cuda(device=0))

        print('Read and process data:', time.time() - start)

        if not sc_flag:
            SAP_loss, word_loss, MAD_loss = dp_model(
                fc_feats, att_feats, labels, masks, attr_labels,
                subsequent_labels, subsequent_mask, subsequent_mat_all)
            SAP_loss = SAP_loss.mean()
            word_loss = word_loss.mean()
            MAD_loss = MAD_loss.mean()
            accumulate_iter = accumulate_iter + 1
            loss = (word_loss + 0.2 * SAP_loss +
                    0.2 * MAD_loss) / opt.accumulate_number
            loss.backward()
        else:
            st = time.time()
            sm = torch.zeros([num_gpu, 1]).cuda(
                device=0)  #indexs for sampling by probabilities
            gen_result, sample_logprobs, _ = dp_model(fc_feats,
                                                      att_feats,
                                                      attr_labels,
                                                      subsequent_mat_all,
                                                      sm,
                                                      mode='sample')
            dp_model.eval()
            with torch.no_grad():
                greedy_res, _, _ = dp_model(fc_feats,
                                            att_feats,
                                            attr_labels,
                                            subsequent_mat_all,
                                            mode='sample')
            dp_model.train()
            ed = time.time()
            print('GPU time is : {}s'.format(ed - st))
            reward = get_self_critical_reward(gen_result, greedy_res,
                                              data['gts'])
            word_loss = dp_model(sample_logprobs,
                                 gen_result.data,
                                 torch.from_numpy(reward).float().cuda(),
                                 mode='scst_forward')
            word_loss = word_loss.mean()

            loss = word_loss

            #forward to minimize SAP loss and MAD loss
            SAP_loss, _, MAD_loss = dp_model(fc_feats, att_feats, labels,
                                             masks, attr_labels,
                                             subsequent_labels,
                                             subsequent_mask,
                                             subsequent_mat_all)
            SAP_loss = SAP_loss.mean()
            MAD_loss = MAD_loss.mean()
            loss = loss + 0.2 * SAP_loss + 0.2 * MAD_loss
            loss.backward()
            accumulate_iter = accumulate_iter + 1

        if accumulate_iter % opt.accumulate_number == 0:
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            optimizer.zero_grad()

            iteration += 1
            accumulate_iter = 0
            train_loss = loss.item() * opt.accumulate_number
            end = time.time()
            #you can record the training log if you need
            #text_file = open(opt.checkpoint_path+'/logs/train_log_'+opt.id+'.txt', "aw")
            if not sc_flag:
                print("iter {} (epoch {}), SAP_loss = {:.3f}, word_loss = {:.3f}, MAD_loss = {:.3f} time/batch = {:.3f}" \
                      .format(iteration, epoch,SAP_loss, word_loss,MAD_loss, end - start))
                #text_file.write("iter {} (epoch {}),SAP_loss = {:.3f}, word_loss {:.3f}, MAD_loss {:.3f},time/batch = {:.3f}\n" \
                #      .format(iteration, epoch,SAP_loss, word_loss, MAD_loss, end - start))

            else:
                print("iter {} (epoch {}),SAP_loss = {:.3f}, avg_reward = {:.3f},MAD_loss = {:.3f} time/batch = {:.3f}" \
                      .format(iteration, epoch,SAP_loss,np.mean(reward[:, 0]),MAD_loss, end - start))
                #text_file.write("iter {} (epoch {}), avg_reward = {:.3f} MAD_loss ={:.3f}, time/batch = {:.3f}\n" \
                #      .format(iteration, epoch, np.mean(reward[:, 0]), MAD_loss, end - start))
            #text_file.close()
        torch.cuda.synchronize()

        # Update the iteration and epoch

        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every
                == 0) and (accumulate_iter % opt.accumulate_number == 0):
            add_summary_value(tb_summary_writer, 'word_loss', word_loss.item(),
                              iteration)
            add_summary_value(tb_summary_writer, 'MAD_loss', MAD_loss.item(),
                              iteration)
            add_summary_value(tb_summary_writer, 'SAP_loss', SAP_loss.item(),
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            word_loss_history[iteration] = word_loss.item()
            SAP_loss_history[iteration] = SAP_loss.item()
            MAD_loss_history[iteration] = MAD_loss.item()

            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every
                == 0) and (accumulate_iter % opt.accumulate_number == 0):
            # eval model
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.input_json,
                'num_images': -1,
                'index_eval': 1,
                'id': opt.id,
                'beam': opt.beam,
                'verbose_loss': 1,
                'checkpoint_path': opt.checkpoint_path
            }
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats, precision, recall = eval_utils.eval_split(
                dp_model, loader, subsequent_mat_all, eval_kwargs)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            #save lang stats
            f_lang = open(
                opt.checkpoint_path + '/logs/lang_' + opt.id + '.txt', 'aw')
            f_lang.write(
                str(iteration) + ' ' +
                str(iteration / opt.save_checkpoint_every) + '\n')
            f_lang.write('val loss ' + str(val_loss) + '\n')
            for key_lang in lang_stats:
                f_lang.write(key_lang + ' ' + str(lang_stats[key_lang]) + '\n')
            f_lang.write('precision ' + str(precision) + ' recall ' +
                         str(recall) + '\n')
            f_lang.close()

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            save_id = iteration / opt.save_checkpoint_every

            if best_val_score is None or current_score > best_val_score or current_score > threshold:
                best_val_score = current_score
                best_flag = True

                ##only save the improved models or when the CIDEr-D is larger than a given threshold
                checkpoint_path = os.path.join(
                    opt.checkpoint_path,
                    'model' + opt.id + format(int(save_id), '04') + '.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(
                    opt.checkpoint_path,
                    'optimizer' + opt.id + format(int(save_id), '04') + '.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                #record the lang stats for saved mdoel
                f_lang = open(
                    opt.checkpoint_path + '/logs/Best_lang_' + opt.id + '.txt',
                    'aw')
                f_lang.write(
                    str(iteration) + ' ' +
                    str(iteration / opt.save_checkpoint_every) + '\n')
                f_lang.write('val loss ' + str(val_loss) + '\n')
                for key_lang in lang_stats:
                    f_lang.write(key_lang + ' ' + str(lang_stats[key_lang]) +
                                 '\n')
                f_lang.write('precision ' + str(precision) + ' recall ' +
                             str(recall) + '\n')
                f_lang.close()

            # Dump miscalleous informations
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()

            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['word_loss_history'] = loss_history
            histories['MAD_loss_history'] = MAD_loss_history
            histories['SAP_loss_history'] = SAP_loss_history

            histories['lr_history'] = lr_history
            histories['ss_prob_history'] = ss_prob_history

            with open(
                    os.path.join(
                        opt.checkpoint_path, 'infos_' + opt.id +
                        format(int(save_id), '04') + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(
                    os.path.join(
                        opt.checkpoint_path, 'histories_' + opt.id +
                        format(int(save_id), '04') + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #10
0
def train(opt):
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.maxlen_sen
    opt.inc_seg = loader.inc_seg
    opt.seg_ix = loader.seg_ix
    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    score_list = []
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    best_val_score = None
    best_val_score = {}
    score_splits = ['val', 'test']
    score_type = ['Bleu_4', 'METEOR', 'CIDEr']
    for split_i in score_splits:
        for score_item in score_type:
            if split_i not in best_val_score.keys():
                best_val_score[split_i] = {}
            best_val_score[split_i][score_item] = 0.0
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', best_val_score)

    model = models.setup(opt)
    device_ids = [0, 1]

    torch.cuda.set_device(device_ids[0])
    model = nn.DataParallel(model, device_ids=device_ids)
    model = model.cuda()
    update_lr_flag = True
    # Assure in training mode
    model.module.train()
    crit = utils.LanguageModelCriterion()

    optimizer = optim.Adam(model.module.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    #optimizer = nn.DataParallel(optimizer, device_ids=device_ids)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.module.ss_prob = opt.ss_prob
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['fc_feats'], data['labels'], data['x_phrase_mask_0'], data['x_phrase_mask_1'], \
               data['label_masks'], data['salicy_seg'], data['seg_mask']]
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        fc_feats, seq, phrase_mask_0, phrase_mask_1, masks, salicy_seg, seg_mask = tmp

        optimizer.zero_grad()
        remove_len = 2
        outputs, alphas = model.module(fc_feats, seq, phrase_mask_0,
                                       phrase_mask_1, masks, seg_mask,
                                       remove_len)
        loss = crit(outputs, seq[remove_len:, :].permute(1, 0),
                    masks[remove_len:, :].permute(1, 0))
        alphas = alphas.permute(1, 0, 2)
        salicy_seg = salicy_seg[:, :, :]
        seg_mask = seg_mask[:, :]
        if opt.salicy_hard == False:
            if opt.salicy_loss_type == 'l2':
                salicy_loss = (((((salicy_seg * seg_mask[:, :, None] -
                                   alphas * seg_mask[:, :, None])**2).sum(0)
                                 ).sum(-1))**(0.5)).mean()
            if opt.salicy_loss_type == 'kl':
                #alphas: len_sen, batch_size, num_frame
                salicy_loss = kullback_leibler2(
                    alphas * seg_mask[:, :, None],
                    salicy_seg * seg_mask[:, :, None])
                salicy_loss = (((salicy_loss *
                                 seg_mask[:, :, None]).sum(-1)).sum(0)).mean()
        elif opt.salicy_hard == True:
            #salicy len_sen, batch_size, num_frame
            salicy_loss = -torch.log((alphas * salicy_seg).sum(-1) + 1e-8)
            #salicy_loss len_sen, batch_size
            salicy_loss = ((salicy_loss * seg_mask).sum(0)).mean()
        loss = loss + opt.salicy_alpha * salicy_loss
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss,
                                  iteration)
                add_summary_value(tf_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob',
                                  model.module.ss_prob, iteration)
                tf_summary_writer.flush()

            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.module.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.dataset,
                'remove_len': remove_len
            }
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats, score_list_i = eval_utils.eval_split(
                model.module, crit, loader, eval_kwargs)
            score_list.append(score_list_i)
            np.savetxt('./save/train_valid_test.txt', score_list, fmt='%.3f')
            # Write validation result into summary
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss',
                                  val_loss, iteration)
                for k in lang_stats.keys():
                    for v in lang_stats[k].keys():
                        add_summary_value(tf_summary_writer, k + v,
                                          lang_stats[k][v], iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['val']['CIDEr']
            else:
                current_score = -val_loss
            best_flag = {}
            for split_i in score_splits:
                for score_item in score_type:
                    if split_i not in best_flag.keys():
                        best_flag[split_i] = {}
                    best_flag[split_i][score_item] = False
            if True:  # if true
                for split_i in score_splits:
                    for score_item in score_type:
                        if best_val_score is None or lang_stats[split_i][
                                score_item] > best_val_score[split_i][
                                    score_item]:
                            best_val_score[split_i][score_item] = lang_stats[
                                split_i][score_item]
                            best_flag[split_i][score_item] = True

                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.module.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                for split_i in score_splits:
                    for score_item in score_type:
                        if best_flag[split_i][score_item]:
                            checkpoint_path = os.path.join(
                                opt.checkpoint_path, 'model-best_' + split_i +
                                '_' + score_item + '.pth')
                            torch.save(model.module.state_dict(),
                                       checkpoint_path)
                            print("model saved to {}".format(checkpoint_path))
                            with open(
                                    os.path.join(
                                        opt.checkpoint_path,
                                        'infos_' + split_i + '_' + score_item +
                                        '_' + opt.id + '-best.pkl'),
                                    'wb') as f:
                                cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #11
0
def validation(opts, model, criterion, optimizer, loader, info, history,
               device, val_result_history, iteration, best_val_score,
               stage_id):
    logging.info("Start validation")
    # eval model
    eval_kwargs = {
        'split': 'val',
        'dataset': opts.input_json,
        'device': device,
        'stage': stage_id
    }
    eval_kwargs.update(vars(opts))

    val_loss, predictions, lang_stats = eval_utils.eval_split(
        model, criterion, loader, eval_kwargs)

    val_result_history[iteration] = {
        'loss': val_loss,
        'lang_stats': lang_stats,
        'predictions': predictions
    }

    # Save model if is improving on validation result 如果取language,就以CIDEr为准,否则是loss
    if opts.language_eval == 1:
        current_score = lang_stats['CIDEr']
    else:
        current_score = -val_loss

    best_flag = False
    if best_val_score is None or current_score > best_val_score:
        best_val_score = current_score
        best_flag = True

    # 写入检查点
    checkpoint_path = os.path.join(opts.checkpoint_path, 'model.pth')
    torch.save(model.state_dict(), checkpoint_path)
    logging.info("model saved to {}".format(checkpoint_path))
    optimizer_path = os.path.join(opts.checkpoint_path, 'optimizer.pth')
    torch.save(optimizer.state_dict(), optimizer_path)

    # Dump miscellaneous information

    info['best_val_score'] = best_val_score
    info['opts'] = opts
    info['vocabulary'] = loader.get_vocab()
    info['stage'] = stage_id

    history['val_result_history'] = val_result_history

    with open(
            os.path.join(opts.checkpoint_path,
                         'info_' + opts.train_id + '.pkl'), 'wb') as infofile:
        cPickle.dump(info, infofile)
    with open(
            os.path.join(opts.checkpoint_path,
                         'history_' + opts.train_id + '.pkl'),
            'wb') as historyfile:
        cPickle.dump(history, historyfile)
    logging.info("Checkpoint Saved")

    # 选择的最佳模型
    if best_flag:
        checkpoint_path = os.path.join(opts.checkpoint_path, 'model-best.pth')
        torch.save(model.state_dict(), checkpoint_path)
        logging.info("model saved to {}".format(checkpoint_path))
        with open(
                os.path.join(opts.checkpoint_path,
                             'info_' + opts.train_id + '-best.pkl'),
                'wb') as bestfile:
            cPickle.dump(info, bestfile)

    logging.info("validation complete")

    return val_loss, lang_stats
Exemple #12
0
def train(opt):
    opt.use_att = utils.if_use_att(opt.caption_model)

    from dataloader import DataLoader
    loader = DataLoader(opt)

    opt.vocab_size = loader.vocab_size
    opt.vocab_ccg_size = loader.vocab_ccg_size
    opt.seq_length = loader.seq_length

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)

    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    cnn_model = utils.build_cnn(opt)
    cnn_model.cuda()

    model = models.setup(opt)
    model.cuda()
    # model = DataParallel(model)

    if vars(opt).get('start_from', None) is not None:
        # check if all necessary files exist
        assert os.path.isdir(
            opt.start_from), " %s must be a a path" % opt.start_from
        assert os.path.isfile(
            os.path.join(opt.start_from, "infos_" + opt.id + ".pkl")
        ), "infos.pkl file does not exist in path %s" % opt.start_from
        model.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'model.pth')))

    update_lr_flag = True
    model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    multilabel_crit = nn.MultiLabelSoftMarginLoss().cuda()
    #    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay)
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
        print('finetune mode')
        cnn_optimizer = optim.Adam([\
            {'params': module.parameters()} for module in cnn_model._modules.values()[5:]\
            ], lr=opt.cnn_learning_rate, weight_decay=opt.cnn_weight_decay)

    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        if os.path.isfile(os.path.join(opt.start_from, 'optimizer.pth')):
            optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            if os.path.isfile(os.path.join(opt.start_from,
                                           'optimizer-cnn.pth')):
                cnn_optimizer.load_state_dict(
                    torch.load(
                        os.path.join(opt.start_from, 'optimizer-cnn.pth')))

    eval_kwargs = {'split': 'val', 'dataset': opt.input_json, 'verbose': True}
    eval_kwargs.update(vars(opt))
    val_loss, predictions, lang_stats = eval_utils.eval_split(
        cnn_model, model, crit, loader, eval_kwargs, True)
    epoch_start = time.time()
    while True:
        if update_lr_flag:
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
                #model.module.ss_prob = opt.ss_prob
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
            else:
                sc_flag = False

            # Update the training stage of cnn
            for p in cnn_model.parameters():
                p.requires_grad = True
            # Fix the first few layers:
            for module in cnn_model._modules.values()[:5]:
                for p in module.parameters():
                    p.requires_grad = False
            cnn_model.train()
            update_lr_flag = False

        cnn_model.apply(utils.set_bn_fix)
        cnn_model.apply(utils.set_bn_eval)

        start = time.time()
        torch.cuda.synchronize()
        data = loader.get_batch('train')
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:

            multilabels = [
                data['detection_infos'][i]['label']
                for i in range(len(data['detection_infos']))
            ]

            tmp = [
                data['labels'], data['masks'],
                np.array(multilabels, dtype=np.int16)
            ]
            tmp = [
                Variable(torch.from_numpy(_), requires_grad=False).cuda()
                for _ in tmp
            ]
            labels, masks, multilabels = tmp
            images = data[
                'images']  # it cannot be turned into tensor since different sizes.
            _fc_feats_2048 = []
            _fc_feats_81 = []
            _att_feats = []
            for i in range(loader.batch_size):
                x = Variable(torch.from_numpy(images[i]),
                             requires_grad=False).cuda()
                x = x.unsqueeze(0)
                att_feats, fc_feats_81 = cnn_model(x)
                fc_feats_2048 = att_feats.mean(3).mean(2).squeeze()
                att_feats = F.adaptive_avg_pool2d(att_feats,
                                                  [14, 14]).squeeze().permute(
                                                      1, 2, 0)  #(0, 2, 3, 1)
                _fc_feats_2048.append(fc_feats_2048)
                _fc_feats_81.append(fc_feats_81)
                _att_feats.append(att_feats)
            _fc_feats_2048 = torch.stack(_fc_feats_2048)
            _fc_feats_81 = torch.stack(_fc_feats_81)
            _att_feats = torch.stack(_att_feats)
            att_feats = _att_feats.unsqueeze(1).expand(*((_att_feats.size(0), loader.seq_per_img,) + \
                                                           _att_feats.size()[1:])).contiguous().view(*((_att_feats.size(0) * loader.seq_per_img,) + \
                                                           _att_feats.size()[1:]))
            fc_feats_2048 = _fc_feats_2048.unsqueeze(1).expand(*((_fc_feats_2048.size(0), loader.seq_per_img,) + \
                                                          _fc_feats_2048.size()[1:])).contiguous().view(*((_fc_feats_2048.size(0) * loader.seq_per_img,) + \
                                                          _fc_feats_2048.size()[1:]))
            fc_feats_81 = _fc_feats_81
            #
            cnn_optimizer.zero_grad()
        else:

            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks']
            ]
            tmp = [
                Variable(torch.from_numpy(_), requires_grad=False).cuda()
                for _ in tmp
            ]
            fc_feats, att_feats, labels, masks = tmp

        optimizer.zero_grad()

        if not sc_flag:
            loss1 = crit(model(fc_feats_2048, att_feats, labels),
                         labels[:, 1:], masks[:, 1:])
            loss2 = multilabel_crit(fc_feats_81.double(), multilabels.double())
            loss = 0.8 * loss1 + 0.2 * loss2.float()
        else:
            gen_result, sample_logprobs = model.sample(fc_feats_2048,
                                                       att_feats,
                                                       {'sample_max': 0})
            reward = get_self_critical_reward(model, fc_feats_2048, att_feats,
                                              data, gen_result)
            loss1 = rl_crit(
                sample_logprobs, gen_result,
                Variable(torch.from_numpy(reward).float().cuda(),
                         requires_grad=False))
            loss2 = multilabel_crit(fc_feats_81.double(), multilabels.double())
            loss3 = crit(model(fc_feats_2048, att_feats, labels),
                         labels[:, 1:], masks[:, 1:])
            loss = 0.995 * loss1 + 0.005 * (loss2.float() + loss3)
        loss.backward()

        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()

        train_loss = loss.data[0]
        mle_loss = loss1.data[0]
        multilabel_loss = loss2.data[0]
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag and iteration % 2500 == 0:
            print("iter {} (epoch {}), mle_loss = {:.3f}, multilabel_loss = {:.3f}, train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, mle_loss, multilabel_loss, train_loss, end - start))

        if sc_flag and iteration % 2500 == 0:
            print("iter {} (epoch {}), avg_reward = {:.3f}, mle_loss = {:.3f}, multilabel_loss = {:.3f}, train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), mle_loss, multilabel_loss, train_loss, end - start))
        iteration += 1
        if (iteration % opt.losses_log_every == 0):
            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        if (iteration % opt.save_checkpoint_every == 0):
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.input_json,
                'verbose': True
            }
            eval_kwargs.update(vars(opt))

            if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    cnn_model, model, crit, loader, eval_kwargs, True)
            else:
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    cnn_model, model, crit, loader, eval_kwargs, False)

            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))

                cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-cnn.pth')
                torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
                print("cnn model saved to {}".format(cnn_checkpoint_path))

                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
                    cnn_optimizer_path = os.path.join(opt.checkpoint_path,
                                                      'optimizer-cnn.pth')
                    torch.save(cnn_optimizer.state_dict(), cnn_optimizer_path)

                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))

                    cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                                       'model-cnn-best.pth')
                    torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
                    print("cnn model saved to {}".format(cnn_checkpoint_path))

                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
            print("epoch: " + str(epoch) + " during: " +
                  str(time.time() - epoch_start))
            epoch_start = time.time()

        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #13
0
def train(opt):
    print("=================Training Information==============")
    print("start from {}".format(opt.start_from))
    print("box from {}".format(opt.input_box_dir))
    print("input json {}".format(opt.input_json))
    print("attributes from {}".format(opt.input_att_dir))
    print("features from {}".format(opt.input_fc_dir))
    print("batch size ={}".format(opt.batch_size))
    print("#GPU={}".format(torch.cuda.device_count()))
    # Deal with feature things before anything
    opt.use_fc, opt.use_att = utils.if_use_feat(opt.caption_model)
    if opt.use_box:
        opt.att_feat_size = opt.att_feat_size + 5

    acc_steps = getattr(opt, 'acc_steps', 1)
    name_append = opt.name_append
    if len(name_append) > 0 and name_append[0] != '-':
        name_append = '_' + name_append

    loader = DataLoader(opt)

    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    opt.write_summary = write_summary
    if opt.write_summary:
        print("write summary to {}".format(opt.checkpoint_path))
        tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}

    if opt.start_from is not None:
        # open old infos and check if models are compatible
        infors_path = os.path.join(opt.start_from,
                                   'infos' + name_append + '.pkl')
        print("Load model information {}".format(infors_path))
        with open(infors_path, 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        histories_path = os.path.join(opt.start_from,
                                      'histories_' + name_append + '.pkl')
        if os.path.isfile(histories_path):
            with open(histories_path, 'rb') as f:
                histories = utils.pickle_load(f)
    else:  # start from scratch
        print("Initialize training process from all begining")
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['iterators'] = loader.iterators
        infos['split_ix'] = loader.split_ix
        infos['vocab'] = loader.get_vocab()
    infos['opt'] = opt

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    #  sanity check for the saved model name has a correct index
    if opt.name_append.isdigit() and int(opt.name_append) < 100:
        assert int(
            opt.name_append
        ) == epoch, "dismatch in the model index and the real epoch number"
        epoch += 1

    print(
        "==================start from {} epoch================".format(epoch))
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    # pdb.set_trace()
    loader.iterators = infos.get('iterators', loader.iterators)
    start_Img_idx = loader.iterators['train']
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    del opt.vocab
    dp_model = torch.nn.DataParallel(model)
    lw_model = LossWrapper(model, opt)  # wrap loss into model
    dp_lw_model = torch.nn.DataParallel(lw_model)

    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()

    if opt.noamopt:
        assert opt.caption_model in [
            'transformer', 'aoa'
        ], 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        optimizer_path = os.path.join(opt.start_from,
                                      'optimizer' + name_append + '.pth')
        if os.path.isfile(optimizer_path):
            print("Loading optimizer............")
            optimizer.load_state_dict(torch.load(optimizer_path))

    def save_checkpoint(model, infos, optimizer, histories=None, append=''):
        if len(append) > 0:
            append = '_' + append
        # if checkpoint_path doesn't exist
        if not os.path.isdir(opt.checkpoint_path):
            os.makedirs(opt.checkpoint_path)
        checkpoint_path = os.path.join(opt.checkpoint_path,
                                       'model%s.pth' % (append))
        torch.save(model.state_dict(), checkpoint_path)
        print("Save model state to {}".format(checkpoint_path))

        optimizer_path = os.path.join(opt.checkpoint_path,
                                      'optimizer%s.pth' % (append))
        torch.save(optimizer.state_dict(), optimizer_path)
        print("Save model optimizer to {}".format(optimizer_path))

        with open(
                os.path.join(opt.checkpoint_path,
                             'infos' + '%s.pkl' % (append)), 'wb') as f:
            utils.pickle_dump(infos, f)
            print("Save training information to {}".format(
                os.path.join(opt.checkpoint_path,
                             'infos' + '%s.pkl' % (append))))

        if histories:
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'histories_' + '%s.pkl' % (append)),
                    'wb') as f:
                utils.pickle_dump(histories, f)
                print("Save training historyes to {}".format(
                    os.path.join(opt.checkpoint_path,
                                 'histories_' + opt.id + '%s.pkl' % (append))))

    try:
        while True:
            # pdb.set_trace()
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start
                                ) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer,
                                 opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start
                            ) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(
                        opt.scheduled_sampling_increase_prob * frac,
                        opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob

                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False

                epoch_done = False
            print("{}th Epoch Training starts now!".format(epoch))
            with tqdm(total=len(loader.split_ix['train']),
                      initial=start_Img_idx) as pbar:
                for i in range(start_Img_idx, len(loader.split_ix['train']),
                               opt.batch_size):
                    # import ipdb; ipdb.set_trace()
                    start = time.time()
                    if (opt.use_warmup
                            == 1) and (iteration < opt.noamopt_warmup):
                        opt.current_lr = opt.learning_rate * (
                            iteration + 1) / opt.noamopt_warmup
                        utils.set_lr(optimizer, opt.current_lr)
                    # Load data from train split (0)
                    data = loader.get_batch('train')
                    # print('Read data:', time.time() - start)

                    if (iteration % acc_steps == 0):
                        optimizer.zero_grad()

                    torch.cuda.synchronize()
                    start = time.time()
                    tmp = [
                        data['fc_feats'], data['att_feats'], data['labels'],
                        data['masks'], data['att_masks']
                    ]
                    tmp = [_ if _ is None else _.cuda() for _ in tmp]
                    fc_feats, att_feats, labels, masks, att_masks = tmp

                    model_out = dp_lw_model(fc_feats, att_feats, labels, masks,
                                            att_masks, data['gts'],
                                            torch.arange(0, len(data['gts'])),
                                            sc_flag)

                    loss = model_out['loss'].mean()
                    loss_sp = loss / acc_steps

                    loss_sp.backward()
                    if ((iteration + 1) % acc_steps == 0):
                        utils.clip_gradient(optimizer, opt.grad_clip)
                        optimizer.step()
                    torch.cuda.synchronize()
                    train_loss = loss.item()
                    end = time.time()
                    # if not sc_flag:
                    #     print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                    #         .format(iteration, epoch, train_loss, end - start))
                    # else:
                    #     print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                    #         .format(iteration, epoch, model_out['reward'].mean(), end - start))
                    if not sc_flag:
                        pbar.set_description(
                            "iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                            .format(iteration, epoch, train_loss, end - start))
                    else:
                        pbar.set_description(
                            "iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                            .format(iteration, epoch,
                                    model_out['reward'].mean(), end - start))

                    # Update the iteration and epoch
                    iteration += 1
                    pbar.update(opt.batch_size)
                    if data['bounds']['wrapped']:
                        # save after each epoch
                        save_checkpoint(model,
                                        infos,
                                        optimizer,
                                        append=str(epoch))
                        epoch += 1
                        # infos['epoch'] = epoch
                        epoch_done = True

                    # Write validation result into summary
                    if (iteration % opt.losses_log_every
                            == 0) and opt.write_summary:
                        add_summary_value(tb_summary_writer, 'loss/train_loss',
                                          train_loss, iteration)
                        if opt.noamopt:
                            opt.current_lr = optimizer.rate()
                        elif opt.reduce_on_plateau:
                            opt.current_lr = optimizer.current_lr
                        add_summary_value(tb_summary_writer,
                                          'hyperparam/learning_rate',
                                          opt.current_lr, iteration)
                        add_summary_value(
                            tb_summary_writer,
                            'hyperparam/scheduled_sampling_prob',
                            model.ss_prob, iteration)
                        if sc_flag:
                            add_summary_value(tb_summary_writer, 'avg_reward',
                                              model_out['reward'].mean(),
                                              iteration)

                        loss_history[
                            iteration] = train_loss if not sc_flag else model_out[
                                'reward'].mean()
                        lr_history[iteration] = opt.current_lr
                        ss_prob_history[iteration] = model.ss_prob

                    # update infos
                    infos['iter'] = iteration
                    infos['epoch'] = epoch
                    infos['iterators'] = loader.iterators
                    infos['split_ix'] = loader.split_ix

                    # make evaluation on validation set, and save model
                    # TODO modify it to evaluate by each epoch
                    # ipdb.set_trace()
                    if (iteration % opt.save_checkpoint_every
                            == 0) and eval_ and epoch > 20:
                        model_path = os.path.join(
                            opt.checkpoint_path,
                            'model_itr%s.pth' % (iteration))
                        eval_kwargs = {
                            'split': 'val',
                            'dataset': opt.input_json,
                            'model': model_path
                        }
                        eval_kwargs.update(vars(opt))
                        val_loss, predictions, lang_stats = eval_utils.eval_split(
                            dp_model, lw_model.crit, loader, eval_kwargs)

                        if opt.reduce_on_plateau:
                            if 'CIDEr' in lang_stats:
                                optimizer.scheduler_step(-lang_stats['CIDEr'])
                            else:
                                optimizer.scheduler_step(val_loss)

                        # Write validation result into summary
                        if opt.write_summary:
                            add_summary_value(tb_summary_writer,
                                              'loss/validation loss', val_loss,
                                              iteration)

                            if lang_stats is not None:
                                bleu_dict = {}
                                for k, v in lang_stats.items():
                                    if 'Bleu' in k:
                                        bleu_dict[k] = v
                                if len(bleu_dict) > 0:
                                    tb_summary_writer.add_scalars(
                                        'val/Bleu', bleu_dict, epoch)

                                for k, v in lang_stats.items():
                                    if 'Bleu' not in k:
                                        add_summary_value(
                                            tb_summary_writer, 'val/' + k, v,
                                            iteration)
                        val_result_history[iteration] = {
                            'loss': val_loss,
                            'lang_stats': lang_stats,
                            'predictions': predictions
                        }

                        # Save model if is improving on validation result
                        if opt.language_eval == 1:
                            current_score = lang_stats['CIDEr']
                        else:
                            current_score = -val_loss

                        best_flag = False

                        if best_val_score is None or current_score > best_val_score:
                            best_val_score = current_score
                            best_flag = True

                        # Dump miscalleous informations
                        infos['best_val_score'] = best_val_score
                        histories['val_result_history'] = val_result_history
                        histories['loss_history'] = loss_history
                        histories['lr_history'] = lr_history
                        histories['ss_prob_history'] = ss_prob_history

                        # save_checkpoint(model, infos, optimizer, histories, append=str(iteration))
                        save_checkpoint(model, infos, optimizer, histories)
                        # if opt.save_history_ckpt:
                        #     save_checkpoint(model, infos, optimizer, append=str(iteration))

                        if best_flag:
                            save_checkpoint(model,
                                            infos,
                                            optimizer,
                                            append='best')
                            print(
                                "update best model at {} iteration--{} epoch".
                                format(iteration, epoch))

                    start_Img_idx = 0
                    # if epoch_done: # go through the set, start a new epoch loop
                    #     break
            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                print("epoch {} break all".format(epoch))
                save_checkpoint(model, infos, optimizer)
                tb_summary_writer.close()
                print("============{} Training Done !==============".format(
                    'Refine' if opt.use_test or opt.use_val else ''))
                break
    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        save_checkpoint(model, infos, optimizer, append='_interrupt')
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
Exemple #14
0
def train(opt, lossWriter):
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt)
    model.cuda()

    update_lr_flag = True
    # Assure in training mode
    model.train()

    crit = utils.LanguageModelCriterion()

    optimizer = optim.Adam(model.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['masks']
        ]
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        fc_feats, att_feats, labels, masks = tmp

        optimizer.zero_grad()
        value, _ = model(fc_feats, att_feats, labels)
        loss = crit(value, labels[:, 1:], masks[:, 1:])
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))
        lossWriter.writerow([epoch, iteration, train_loss])
        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss,
                                  iteration)
                add_summary_value(tf_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob',
                                  model.ss_prob, iteration)
                tf_summary_writer.flush()

            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, crit, loader, eval_kwargs)

            # Write validation result into summary
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss',
                                  val_loss, iteration)
                for k, v in lang_stats.items():
                    add_summary_value(tf_summary_writer, k, v, iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')):
            with open(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    # 将这些model并行执行
    dp_model = torch.nn.DataParallel(model)

    update_lr_flag = True
    # Assure in training mode
    dp_model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(os.path.join(opt.start_from,"optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
                # Assign the learning rate
                '''
                epoch:训练次数		假设有500次
                learning_rate_decay_start:开始降低学习率的训练次数		假设是100
                opt.learning_rate_decay_every:学习率每次减少的数			假设是0.01
                opt.learning_rate_decay_rate: 学习率的减小率				假设
				frac:指数
				decay_factor 降低率
                '''
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
            	# 超过阈值就改变当前的learning_rate(lr)
            	# //表示返回不大于除法结果的最大整数
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                # **表示^
                decay_factor = opt.learning_rate_decay_rate  ** frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
            	# 未超过就不变
                opt.current_lr = opt.learning_rate

            # 将当前的学习率设为优化器的学习率
            utils.set_lr(optimizer, opt.current_lr)

            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
            	# 执行SS来解决训练和预测的矛盾
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob  * frac, opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False
                
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        # 返回数据加载所用时间?
        print('Read data:', time.time() - start)

        # Waits for all kernels in all streams on current device to complete.
        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp
        
        optimizer.zero_grad()
        if not sc_flag:
            loss = crit(dp_model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:])
        else:
            gen_result, sample_logprobs = dp_model(fc_feats, att_feats, att_masks, opt={'sample_max':0}, mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats, att_masks, data, gen_result, opt)
            loss = rl_crit(sample_logprobs, gen_result.data, torch.from_numpy(reward).float().cuda())

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag:
            print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
            add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward', np.mean(reward[:,0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(reward[:,0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val',
                            'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(dp_model, crit, loader, eval_kwargs)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
            if lang_stats is not None:
                for k,v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = - val_loss

            best_flag = False
            if True: # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(os.path.join(opt.checkpoint_path, 'histories_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'-best.pkl'), 'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #16
0
]
for k in vars(infos['opt']).keys():
    if k not in ignore1 and k not in ignore2:
        if k in vars(opt):
            assert vars(opt)[k] == vars(
                infos['opt'])[k], k + ' option not consistent'
        else:
            vars(opt).update({k: vars(infos['opt'])[k]
                              })  # copy over options from model
opt.vocab = infos['vocab']  # ix -> word mapping

# Setup the model
gen_model = MultiModalGenerator(opt)
gen_model.load_state_dict(torch.load(opt.g_model_path))
gen_model.cuda()
gen_model.eval()
crit = utils.LanguageModelCriterion()

# Create the Data Loader instance
loader = DataLoader(opt)

# When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json
# So make sure to use the vocab in infos file.
loader.ix_to_word = infos['vocab']
loss, split_predictions, lang_stats, div = eval_utils.eval_split(
    gen_model, crit, loader, eval_kwargs=vars(opt))

print('loss: ', loss)
if lang_stats:
    print(lang_stats)
Exemple #17
0
def train(opt):

    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    infos = {
        'iter': 0,
        'epoch': 0,
        'loader_state_dict': None,
        'vocab': loader.get_vocab(),
    }
    # Load old infos(if there is) and check if models are compatible
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'),
                  'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert getattr(saved_model_opt, checkme) == getattr(
                    opt, checkme
                ), "Command line argument and saved model disagree on '%s' " % checkme
    infos['opt'] = opt

    #########################
    # Build logger
    #########################
    # naive dict logger
    histories = defaultdict(dict)
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'),
                  'rb') as f:
            histories.update(utils.pickle_load(f))

    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    multi_models_list = []
    for order in range(opt.number_of_models):
        multi_models_list.append(models.setup(opt).cuda())
    for order in range(opt.number_of_models):
        multi_models_list.append(models.setup(opt).cuda())
    for order in range(opt.number_of_models, 2 * opt.number_of_models):
        for param in multi_models_list[order].parameters():
            param.detach_()
    for order in range(opt.number_of_models):
        for param, param_ema in zip(
                multi_models_list[order].parameters(),
                multi_models_list[order + opt.number_of_models].parameters()):
            param_ema.data = param.data.clone()
    # multi_models = MultiModels(multi_models_list)
    # multi_models_list.append(SenEncodeModel(opt).cuda())
    multi_models = nn.ModuleList(multi_models_list)
    del opt.vocab
    # Load pretrained weights:
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'model.pth')):
        multi_models.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'model.pth')))

    # Wrap generation model with loss function(used for training)
    # This allows loss function computed separately on each machine
    lw_models = nn.ModuleList([
        LossWrapper(multi_models[index], opt)
        for index in range(opt.number_of_models)
    ])
    kdlw_models = nn.ModuleList([
        KDLossWrapper(multi_models[index], opt)
        for index in range(opt.number_of_models)
    ])
    lw_models_ema = nn.ModuleList([
        LossWrapper(multi_models[opt.number_of_models + index], opt)
        for index in range(opt.number_of_models)
    ])
    kdlw_models_ema = nn.ModuleList([
        KDLossWrapper(multi_models[opt.number_of_models + index], opt)
        for index in range(opt.number_of_models)
    ])
    # Wrap with dataparallel
    dp_models = nn.ModuleList([
        torch.nn.DataParallel(multi_models[index])
        for index in range(opt.number_of_models)
    ])
    dp_lw_models = nn.ModuleList([
        torch.nn.DataParallel(lw_models[index])
        for index in range(opt.number_of_models)
    ])
    dp_kdlw_models = nn.ModuleList([
        torch.nn.DataParallel(kdlw_models[index])
        for index in range(opt.number_of_models)
    ])
    dp_models_ema = nn.ModuleList([
        torch.nn.DataParallel(multi_models[opt.number_of_models + index])
        for index in range(opt.number_of_models)
    ])
    dp_lw_models_ema = nn.ModuleList([
        torch.nn.DataParallel(lw_models_ema[index])
        for index in range(opt.number_of_models)
    ])
    dp_kdlw_models_ema = nn.ModuleList([
        torch.nn.DataParallel(kdlw_models_ema[index])
        for index in range(opt.number_of_models)
    ])

    ##########################
    #  Build optimizer
    ##########################
    if opt.noamopt:
        assert opt.caption_model in [
            'transformer', 'bert', 'm2transformer'
        ], 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(multi_models,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(multi_models.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(multi_models.parameters(), opt)
    # Load the optimizer
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    ##########################
    #  Build loss
    ##########################
    # triplet_loss = nn.TripletMarginLoss()

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    # For back compatibility
    if 'iterators' in infos:
        infos['loader_state_dict'] = {
            split: {
                'index_list': infos['split_ix'][split],
                'iter_counter': infos['iterators'][split]
            }
            for split in [
                'paired_train', 'unpaired_images_train',
                'unpaired_captions_train', 'train', 'val', 'test'
            ]
        }
    loader.load_state_dict(infos['loader_state_dict'])
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    if opt.noamopt:
        optimizer._step = iteration
    # flag indicating finish of an epoch
    # Always set to True at the beginning to initialize the lr or etc.
    epoch_done = True
    # Assure in training mode
    dp_lw_models.train()
    dp_kdlw_models.train()
    dp_lw_models_ema.train()
    dp_kdlw_models_ema.train()

    # Build the ensemble model
    # # Setup the model
    model_ensemble = AttEnsemble(multi_models_list[opt.number_of_models:2 *
                                                   opt.number_of_models],
                                 weights=None)
    # model_ensemble.seq_length = 20
    model_ensemble.cuda()
    # model_ensemble.eval()
    kd_model_outs_list = []

    # Start training
    try:
        while True:
            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break

            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start
                                ) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer,
                                 opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start
                            ) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(
                        opt.scheduled_sampling_increase_prob * frac,
                        opt.scheduled_sampling_max_prob)
                    for index in range(opt.number_of_models):
                        multi_models[index].ss_prob = opt.ss_prob

                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False

                # If start structure loss training
                if opt.structure_after != -1 and epoch >= opt.structure_after:
                    struc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    struc_flag = False

                if epoch >= opt.paired_train_epoch:
                    opt.current_lambda_x = opt.hyper_parameter_lambda_x * \
                                         (epoch - (opt.paired_train_epoch - 1)) /\
                                         (opt.max_epochs - opt.paired_train_epoch)
                    opt.current_lambda_y = opt.hyper_parameter_lambda_y * \
                                           (epoch - (opt.paired_train_epoch - 1)) / \
                                           (opt.max_epochs - opt.paired_train_epoch)

                epoch_done = False

            start = time.time()
            # Load data from train split (0)
            if epoch < opt.language_pretrain_epoch:
                data = loader.get_batch('unpaired_captions_train')
            elif epoch < opt.paired_train_epoch:
                data = loader.get_batch('paired_train')
            else:
                data = loader.get_batch('paired_train')
                unpaired_data = loader.get_batch('unpaired_images_train')
                unpaired_caption = loader.get_batch('unpaired_captions_train')
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()
            if epoch < opt.language_pretrain_epoch:
                tmp = [
                    data['fc_feats'] * 0, data['att_feats'] * 0,
                    data['labels'], data['masks'], data['att_masks']
                ]
            elif epoch < opt.paired_train_epoch:
                tmp = [
                    data['fc_feats'], data['att_feats'], data['labels'],
                    data['masks'], data['att_masks']
                ]
            else:
                tmp = [
                    data['fc_feats'], data['att_feats'], data['labels'],
                    data['masks'], data['att_masks']
                ]
                unpaired_tmp = [
                    unpaired_data['fc_feats'], unpaired_data['att_feats'],
                    unpaired_data['labels'], unpaired_data['masks'],
                    unpaired_data['att_masks']
                ]
                unpaired_caption_tmp = [
                    unpaired_caption['fc_feats'] * 0,
                    unpaired_caption['att_feats'] * 0,
                    unpaired_caption['labels'], unpaired_caption['masks'],
                    unpaired_caption['att_masks']
                ]

            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            if epoch >= opt.paired_train_epoch:
                unpaired_tmp = [
                    _ if _ is None else _.cuda() for _ in unpaired_tmp
                ]
                unpaired_fc_feats, unpaired_att_feats, unpaired_labels, unpaired_masks, unpaired_att_masks = unpaired_tmp
                unpaired_caption_tmp = [
                    _ if _ is None else _.cuda() for _ in unpaired_caption_tmp
                ]
                unpaired_caption_fc_feats, unpaired_caption_att_feats, unpaired_caption_labels, unpaired_caption_masks, unpaired_caption_att_masks = unpaired_caption_tmp
                unpaired_caption_fc_feats = unpaired_caption_fc_feats.repeat(
                    5, 1)
                unpaired_caption_fc_feats = opt.std_pseudo_visual_feature * torch.randn_like(
                    unpaired_caption_fc_feats)
                unpaired_caption_att_feats = unpaired_caption_att_feats.repeat(
                    5, 1, 1)
                unpaired_caption_fc_feats.requires_grad = True
                unpaired_caption_att_feats.requires_grad = True
                unpaired_caption_labels = unpaired_caption_labels.reshape(
                    unpaired_caption_fc_feats.shape[0], -1)
                unpaired_caption_masks = unpaired_caption_masks.reshape(
                    unpaired_caption_fc_feats.shape[0], -1)

            optimizer.zero_grad()
            if epoch < opt.language_pretrain_epoch:
                language_loss = 0
                model_outs_list = []
                for index in range(opt.number_of_models):
                    model_out = dp_lw_models[index](
                        fc_feats, att_feats, labels, masks,
                        att_masks, data['gts'],
                        torch.arange(0, len(data['gts'])), sc_flag, struc_flag)
                    model_outs_list.append(model_out)
                    language_loss += model_out['loss'].mean()

                loss = language_loss
            elif epoch < opt.paired_train_epoch:
                language_loss = 0
                model_outs_list = []
                for index in range(opt.number_of_models):
                    model_out = dp_lw_models[index](
                        fc_feats, att_feats, labels, masks,
                        att_masks, data['gts'],
                        torch.arange(0, len(data['gts'])), sc_flag, struc_flag)
                    model_outs_list.append(model_out)
                    language_loss += model_out['loss'].mean()

                loss = language_loss
            else:
                language_loss = 0
                model_outs_list = []
                for index in range(opt.number_of_models):
                    model_out = dp_lw_models[index](
                        fc_feats, att_feats, labels, masks,
                        att_masks, data['gts'],
                        torch.arange(0, len(data['gts'])), sc_flag, struc_flag)
                    model_outs_list.append(model_out)
                    language_loss += model_out['loss'].mean()
                loss = language_loss

                # else:
                # for unpaired image sentences
                # # Setup the model
                # model_ensemble = AttEnsemble(multi_models_list[:opt.number_of_models], weights=None)
                # model_ensemble.seq_length = 16
                # model_ensemble.cuda()
                # model_ensemble.eval()

                model_ensemble.eval()
                eval_kwargs = dict()
                eval_kwargs.update(vars(opt))

                with torch.no_grad():
                    seq, seq_logprobs = model_ensemble(unpaired_fc_feats,
                                                       unpaired_att_feats,
                                                       unpaired_att_masks,
                                                       opt=eval_kwargs,
                                                       mode='sample')
                    # val_loss, predictions, lang_stats = eval_utils.eval_split(model_ensemble, lw_models[0].crit, loader,
                    #                                                           eval_kwargs)
                # print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in
                #                  model_ensemble.done_beams[0]]))
                # print('++' * 10)
                # for ii in range(10):
                #     sents = utils.decode_sequence(loader.get_vocab(), seq[ii].unsqueeze(0))
                #     gt_sent = utils.decode_sequence(loader.get_vocab(), labels[ii,0].unsqueeze(0))
                #     a=1

                model_ensemble.train()

                model_ensemble_sudo_labels = labels.new_zeros(
                    (opt.batch_size, opt.beam_size,
                     eval_kwargs['max_length'] + 2))
                model_ensemble_sudo_log_prob = masks.new_zeros(
                    (opt.batch_size,
                     opt.beam_size, eval_kwargs['max_length'] + 2,
                     len(loader.get_vocab()) + 1))
                model_ensemble_sum_log_prob = masks.new_zeros(
                    (opt.batch_size, opt.beam_size))

                for batch_index in range(opt.batch_size):
                    for beam_index in range(opt.beam_size):
                        # for beam_index in range(3):
                        pred = model_ensemble.done_beams[batch_index][
                            beam_index]['seq']
                        log_prob = model_ensemble.done_beams[batch_index][
                            beam_index]['logps']
                        model_ensemble_sudo_labels[batch_index, beam_index,
                                                   1:pred.shape[0] + 1] = pred
                        model_ensemble_sudo_log_prob[batch_index, beam_index,
                                                     1:pred.shape[0] +
                                                     1] = log_prob
                        model_ensemble_sum_log_prob[batch_index][
                            beam_index] = model_ensemble.done_beams[
                                batch_index][beam_index]['p']

                # model_ensemble_prob = F.softmax(model_ensemble_sum_log_prob)

                data_ensemble_sudo_gts = list()
                for data_ensemble_sudo_gts_index in range(
                        model_ensemble_sudo_labels.shape[0]):
                    data_ensemble_sudo_gts.append(model_ensemble_sudo_labels[
                        data_ensemble_sudo_gts_index, :,
                        1:-1].data.cpu().numpy())

                # generated_sentences = list()
                # for i in range(unpaired_fc_feats.shape[0]):
                #     generated_sentences.append(
                #         [utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in
                #          model_ensemble.done_beams[i]])
                #
                # pos_tag_results = list()
                # for i in range(unpaired_fc_feats.shape[0]):
                #     generated_sentences_i = generated_sentences[i]
                #     pos_tag_results_i = []
                #     for text in generated_sentences_i:
                #         text_tokenize = nltk.word_tokenize(text)
                #         pos_tag_results_i_jbeam = []
                #         for vob, vob_type in nltk.pos_tag(text_tokenize):
                #             if vob_type == 'NN' or vob_type == 'NNS':
                #                 pos_tag_results_i_jbeam.append(vob)
                #         pos_tag_results_i.append(pos_tag_results_i_jbeam)
                #     pos_tag_results.append(pos_tag_results_i)

                # for i in range(fc_feats.shape[0]):
                #     print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in
                #                      model_ensemble.done_beams[i]]))
                #     print('--' * 10)
                # dets = data['dets']
                #
                # promising_flag = labels.new_zeros(opt.batch_size, opt.beam_size)
                # for batch_index in range(opt.batch_size):
                #     dets_batch = dets[batch_index]
                #     for beam_index in range(opt.beam_size):
                #         indicator = [0] * len(dets_batch)
                #         pos_tag_batch_beam = pos_tag_results[batch_index][beam_index]
                #         for pos_tag_val in pos_tag_batch_beam:
                #             for ii in range(len(dets_batch)):
                #                 possible_list = vob_transform_list[dets_batch[ii]]
                #                 if pos_tag_val in possible_list:
                #                     indicator[ii] = 1
                #         if sum(indicator) == len(dets_batch) or sum(indicator) >= 2:
                #             promising_flag[batch_index, beam_index] = 1
                #
                # # model_ensemble_sudo_log_prob = model_ensemble_sudo_log_prob * promising_flag.unsqueeze(-1).unsqueeze(-1)
                # model_ensemble_sudo_labels = model_ensemble_sudo_labels * promising_flag.unsqueeze(-1)

                #sudo_masks_for_model = sudo_masks_for_model.detach()
                distilling_loss = 0
                # We use the random study machinism
                who_to_study = random.randint(0, opt.number_of_models - 1)

                # for index in range(opt.number_of_models):
                #     model_out = dp_kdlw_models[index](unpaired_fc_feats, unpaired_att_feats, model_ensemble_sudo_labels,
                #                                     model_ensemble_sudo_log_prob, att_masks, data_ensemble_sudo_gts,
                #                                     torch.arange(0, len(data_ensemble_sudo_gts)), sc_flag,
                #                                     struc_flag, model_ensemble_sum_log_prob)
                #     kd_model_outs_list.append(model_out)

                model_out = dp_kdlw_models[who_to_study](
                    unpaired_fc_feats, unpaired_att_feats,
                    model_ensemble_sudo_labels, model_ensemble_sudo_log_prob,
                    att_masks, data_ensemble_sudo_gts,
                    torch.arange(0, len(data_ensemble_sudo_gts)), sc_flag,
                    struc_flag, model_ensemble_sum_log_prob)
                # kd_model_outs_list.append(model_out)
                distilling_loss += model_out['loss'].mean()
                loss += opt.number_of_models * opt.current_lambda_x * distilling_loss

                ###################################################################
                # use unlabelled captions
                # simple_sgd = utils.gradient_descent(unpaired_caption_fc_feats, stepsize=1e3)
                simple_sgd = utils.gradient_descent_adagrad(
                    unpaired_caption_fc_feats, stepsize=1)
                gts_tmp = unpaired_caption['gts']
                new_gts = []
                for ii in range(len(data['gts'])):
                    for jj in range(gts_tmp[ii].shape[0]):
                        new_gts.append(gts_tmp[ii][jj])
                unpaired_caption['gts'] = new_gts
                for itr in range(opt.inner_iteration):
                    unlabelled_caption_model_out = dp_lw_models_ema[
                        itr % opt.number_of_models](
                            unpaired_caption_fc_feats,
                            unpaired_caption_att_feats,
                            unpaired_caption_labels, unpaired_caption_masks,
                            unpaired_caption_att_masks,
                            unpaired_caption['gts'],
                            torch.arange(0, len(unpaired_caption['gts'])),
                            sc_flag, struc_flag)
                    unlabelled_caption_loss = unlabelled_caption_model_out[
                        'loss'].mean()
                    unlabelled_caption_loss.backward()
                    # print(unlabelled_caption_loss)
                    simple_sgd.update(unpaired_caption_fc_feats)
                    # a=1

                unpaired_caption_fc_feats.requires_grad = False
                unpaired_caption_att_feats.requires_grad = False
                unlabelled_caption_model_out = dp_lw_models[who_to_study](
                    unpaired_caption_fc_feats, unpaired_caption_att_feats,
                    unpaired_caption_labels, unpaired_caption_masks,
                    unpaired_caption_att_masks, unpaired_caption['gts'],
                    torch.arange(0, len(unpaired_caption['gts'])), sc_flag,
                    struc_flag)
                unlabelled_caption_loss = unlabelled_caption_model_out[
                    'loss'].mean()
                loss += opt.number_of_models * opt.current_lambda_y * unlabelled_caption_loss

            loss.backward()
            if opt.grad_clip_value != 0:
                getattr(torch.nn.utils, 'clip_grad_%s_' %
                        (opt.grad_clip_mode))(multi_models.parameters(),
                                              opt.grad_clip_value)
            optimizer.step()

            for order in range(opt.number_of_models):
                for param, param_ema in zip(
                        multi_models_list[order].parameters(),
                        multi_models_list[order +
                                          opt.number_of_models].parameters()):
                    param_ema.data = opt.alpha * param_ema.data + (
                        1 - opt.alpha) * param.data

            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()
            # if struc_flag:
            #     print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \
            #         .format(iteration, epoch, train_loss, model_out['lm_loss'].mean().item(), model_out['struc_loss'].mean().item(), end - start))
            # elif not sc_flag:
            #     print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            #         .format(iteration, epoch, train_loss, end - start))
            # else:
            #     print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
            #         .format(iteration, epoch, model_out['reward'].mean(), end - start))
            if struc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss/opt.number_of_models, sum([model_outs_list[index]['lm_loss'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models,
                            sum([model_outs_list[index]['struc_loss'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models,
                            end - start))
            elif not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, language_loss.item()/opt.number_of_models, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, sum([model_outs_list[index]['reward'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models, end - start))

            # Update the iteration and epoch
            iteration += 1
            if epoch < opt.paired_train_epoch:
                if data['bounds']['wrapped']:
                    epoch += 1
                    epoch_done = True
            else:
                if data['bounds']['wrapped']:
                    epoch += 1
                    epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                # tb_summary_writer.add_scalar('train_loss', train_loss, iteration)
                for index in range(opt.number_of_models):
                    model_id = 'model_{}'.format(index)
                    tb_summary_writer.add_scalars('language_loss', {
                        model_id:
                        model_outs_list[index]['loss'].mean().item()
                    }, iteration)
                if epoch >= opt.paired_train_epoch:
                    # for index in range(opt.number_of_models):
                    #     model_id = 'model_{}'.format(index)
                    #     kd_model_outs_val = 0 if len(kd_model_outs_list) == 0 else kd_model_outs_list[index]['loss'].mean().item()
                    #     tb_summary_writer.add_scalars('distilling_loss',
                    #                                   {model_id: kd_model_outs_val},
                    #                                   iteration)
                    tb_summary_writer.add_scalar('distilling_loss',
                                                 distilling_loss.item(),
                                                 iteration)
                    tb_summary_writer.add_scalar(
                        'unlabelled_caption_loss',
                        unlabelled_caption_loss.item(), iteration)
                    tb_summary_writer.add_scalar('hyper_parameter_lambda_x',
                                                 opt.current_lambda_x,
                                                 iteration)
                    tb_summary_writer.add_scalar('hyper_parameter_lambda_y',
                                                 opt.current_lambda_y,
                                                 iteration)
                # tb_summary_writer.add_scalar('triplet_loss', triplet_loss_val.item(), iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr,
                                             iteration)
                tb_summary_writer.add_scalar('scheduled_sampling_prob',
                                             multi_models[0].ss_prob,
                                             iteration)
                if sc_flag:
                    for index in range(opt.number_of_models):
                        # tb_summary_writer.add_scalar('avg_reward', model_out['reward'].mean(), iteration)
                        model_id = 'model_{}'.format(index)
                        tb_summary_writer.add_scalars(
                            'avg_reward', {
                                model_id:
                                model_outs_list[index]['reward'].mean().item()
                            }, iteration)
                elif struc_flag:
                    # tb_summary_writer.add_scalar('lm_loss', model_out['lm_loss'].mean().item(), iteration)
                    # tb_summary_writer.add_scalar('struc_loss', model_out['struc_loss'].mean().item(), iteration)
                    # tb_summary_writer.add_scalar('reward', model_out['reward'].mean().item(), iteration)
                    # tb_summary_writer.add_scalar('reward_var', model_out['reward'].var(1).mean(), iteration)
                    model_id = 'model_{}'.format(index)
                    for index in range(opt.number_of_models):
                        tb_summary_writer.add_scalars(
                            'lm_loss', {
                                model_id:
                                model_outs_list[index]
                                ['lm_loss'].mean().item()
                            }, iteration)
                        tb_summary_writer.add_scalars(
                            'struc_loss', {
                                model_id:
                                model_outs_list[index]
                                ['struc_loss'].mean().item()
                            }, iteration)
                        tb_summary_writer.add_scalars(
                            'reward', {
                                model_id:
                                model_outs_list[index]['reward'].mean().item()
                            }, iteration)
                        tb_summary_writer.add_scalars(
                            'reward_var', {
                                model_id:
                                model_outs_list[index]['reward'].var(1).mean()
                            }, iteration)

                histories['loss_history'][
                    iteration] = train_loss if not sc_flag else sum([
                        model_outs_list[index]['reward'].mean().item()
                        for index in range(opt.number_of_models)
                    ]) / opt.number_of_models
                histories['lr_history'][iteration] = opt.current_lr
                histories['ss_prob_history'][iteration] = multi_models[
                    0].ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['loader_state_dict'] = loader.state_dict()

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0 and not opt.save_every_epoch and epoch >= opt.paired_train_epoch) or \
                (epoch_done and opt.save_every_epoch and epoch >= opt.paired_train_epoch):
                # load ensemble
                # Setup the model
                model = AttEnsemble(multi_models_list[opt.number_of_models:2 *
                                                      opt.number_of_models],
                                    weights=None)
                model.seq_length = opt.max_length
                model.cuda()
                model.eval()
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                # eval_kwargs['beam_size'] = 5
                # eval_kwargs['verbose_beam'] = 1
                # eval_kwargs['verbose_loss'] = 1
                # val_loss, predictions, lang_stats = eval_utils.eval_split(
                #     dp_model, lw_model.crit, loader, eval_kwargs)
                with torch.no_grad():
                    val_loss, predictions, lang_stats = eval_utils.eval_split(
                        model, lw_models[0].crit, loader, eval_kwargs)
                model.train()

                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                tb_summary_writer.add_scalar('validation loss', val_loss,
                                             iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)
                histories['val_result_history'][iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss

                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score

                utils.save_checkpoint(opt, multi_models, infos, optimizer,
                                      histories)
                if opt.save_history_ckpt:
                    utils.save_checkpoint(
                        opt,
                        multi_models,
                        infos,
                        optimizer,
                        append=str(epoch)
                        if opt.save_every_epoch else str(iteration))

                if best_flag:
                    utils.save_checkpoint(opt,
                                          multi_models,
                                          infos,
                                          optimizer,
                                          append='best')

            # if epoch_done and epoch == opt.paired_train_epoch:
            #     utils.save_checkpoint(opt, multi_models, infos, optimizer, histories)
            #     if opt.save_history_ckpt:
            #         utils.save_checkpoint(opt, multi_models, infos, optimizer,
            #                               append=str(epoch) if opt.save_every_epoch else str(iteration))
            #     cmd = 'cp -r ' + 'log_' + opt.id + ' ' + 'log_' + opt.id + '_backup'
            #     os.system(cmd)

    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        utils.save_checkpoint(opt, multi_models, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
Exemple #18
0
def train(opt):
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    infos = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    ss_prob_history = infos.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    cnn_model = utils.build_cnn(opt)
    cnn_model.cuda()
    model = models.setup(opt)
    model.cuda()

    update_lr_flag = True
    # Assure in training mode
    model.train()

    crit = utils.LanguageModelCriterion()

    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    cnn_optimizer = optim.Adam(cnn_model.parameters(),
                               lr=opt.cnn_learning_rate,
                               weight_decay=opt.cnn_weight_decay)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        if os.path.isfile(os.path.join(opt.start_from, 'optimizer.pth')):
            optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        if os.path.isfile(os.path.join(opt.start_from, 'optimizer-cnn.pth')):
            cnn_optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from, 'optimizer-cnn.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            # Update the training stage of cnn
            if opt.finetune_cnn_after == -1 or epoch < opt.finetune_cnn_after:
                for p in cnn_model.parameters():
                    p.requires_grad = False
                cnn_model.eval()
            else:
                for p in cnn_model.parameters():
                    p.requires_grad = True
                cnn_model.train()
            update_lr_flag = False

        torch.cuda.synchronize()
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        torch.cuda.synchronize()
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['images'], data['labels'], data['masks']]
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        images, labels, masks = tmp

        att_feats = cnn_model(images)
        fc_feats = att_feats.mean(2).mean(3).squeeze(2).squeeze(2)

        att_feats = att_feats.unsqueeze(1).expand(*((
            att_feats.size(0),
            opt.seq_per_img,
        ) + att_feats.size()[1:])).contiguous().view(
            *((att_feats.size(0) * opt.seq_per_img, ) + att_feats.size()[1:]))
        fc_feats = fc_feats.unsqueeze(1).expand(*((
            fc_feats.size(0),
            opt.seq_per_img,
        ) + fc_feats.size()[1:])).contiguous().view(
            *((fc_feats.size(0) * opt.seq_per_img, ) + fc_feats.size()[1:]))

        optimizer.zero_grad()
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            cnn_optimizer.zero_grad()
        loss = crit(model(fc_feats, att_feats, labels), labels[:, 1:],
                    masks[:, 1:])
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            utils.clip_gradient(cnn_optimizer, opt.grad_clip)
            cnn_optimizer.step()
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                cnn_model, model, crit, loader, eval_kwargs)

            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-cnn.pth')
                torch.save(model.state_dict(), checkpoint_path)
                torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                print("cnn model saved to {}".format(cnn_checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                cnn_optimizer_path = os.path.join(opt.checkpoint_path,
                                                  'optimizer-cnn.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                torch.save(cnn_optimizer.state_dict(), cnn_optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['val_result_history'] = val_result_history
                infos['loss_history'] = loss_history
                infos['lr_history'] = lr_history
                infos['ss_prob_history'] = ss_prob_history
                infos['vocab'] = loader.get_vocab()
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                                       'model-cnn-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    print("cnn model saved to {}".format(cnn_checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #19
0
def train(opt):
    exclude_opt = [
        'training_mode', 'tap_epochs', 'cg_epochs', 'tapcg_epochs', 'lr',
        'learning_rate_decay_start', 'learning_rate_decay_every',
        'learning_rate_decay_rate', 'self_critical_after',
        'save_checkpoint_every', 'id', "pretrain", "pretrain_path", "debug",
        "save_all_checkpoint", "min_epoch_when_save"
    ]

    save_folder, logger, tf_writer = build_floder_and_create_logger(opt)
    saved_info = {'best': {}, 'last': {}, 'history': {}}
    is_continue = opt.start_from != None

    if is_continue:
        infos_path = os.path.join(save_folder, 'info.pkl')
        with open(infos_path) as f:
            logger.info('load info from {}'.format(infos_path))
            saved_info = cPickle.load(f)
            pre_opt = saved_info[opt.start_from_mode]['opt']
            if vars(opt).get("no_exclude_opt", False):
                exclude_opt = []
            for opt_name in vars(pre_opt).keys():
                if (not opt_name in exclude_opt):
                    vars(opt).update({opt_name: vars(pre_opt).get(opt_name)})
                if vars(pre_opt).get(opt_name) != vars(opt).get(opt_name):
                    print('change opt: {} from {} to {}'.format(
                        opt_name,
                        vars(pre_opt).get(opt_name),
                        vars(opt).get(opt_name)))

    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.CG_vocab_size = loader.vocab_size
    opt.CG_seq_length = loader.seq_length

    # init training option
    epoch = saved_info[opt.start_from_mode].get('epoch', 0)
    iteration = saved_info[opt.start_from_mode].get('iter', 0)
    best_val_score = saved_info[opt.start_from_mode].get('best_val_score', 0)
    val_result_history = saved_info['history'].get('val_result_history', {})
    loss_history = saved_info['history'].get('loss_history', {})
    lr_history = saved_info['history'].get('lr_history', {})
    loader.iterators = saved_info[opt.start_from_mode].get(
        'iterators', loader.iterators)
    loader.split_ix = saved_info[opt.start_from_mode].get(
        'split_ix', loader.split_ix)
    opt.current_lr = vars(opt).get('current_lr', opt.lr)
    opt.m_batch = vars(opt).get('m_batch', 1)

    # create a tap_model,fusion_model,cg_model

    tap_model = models.setup_tap(opt)
    lm_model = CaptionGenerator(opt)
    cg_model = lm_model

    if is_continue:
        if opt.start_from_mode == 'best':
            model_pth = torch.load(os.path.join(save_folder, 'model-best.pth'))
        elif opt.start_from_mode == 'last':
            model_pth = torch.load(
                os.path.join(save_folder,
                             'model_iter_{}.pth'.format(iteration)))
        assert model_pth['iteration'] == iteration
        logger.info('Loading pth from {}, iteration:{}'.format(
            save_folder, iteration))
        tap_model.load_state_dict(model_pth['tap_model'])
        cg_model.load_state_dict(model_pth['cg_model'])

    elif opt.pretrain:
        print('pretrain {} from {}'.format(opt.pretrain, opt.pretrain_path))
        model_pth = torch.load(opt.pretrain_path)
        if opt.pretrain == 'tap':
            tap_model.load_state_dict(model_pth['tap_model'])
        elif opt.pretrain == 'cg':
            cg_model.load_state_dict(model_pth['cg_model'])
        elif opt.pretrain == 'tap_cg':
            tap_model.load_state_dict(model_pth['tap_model'])
            cg_model.load_state_dict(model_pth['cg_model'])
        else:
            assert 1 == 0, 'opt.pretrain error'

    tap_model.cuda()
    tap_model.train()  # Assure in training mode

    tap_crit = utils.TAPModelCriterion()

    tap_optimizer = optim.Adam(tap_model.parameters(),
                               lr=opt.lr,
                               weight_decay=opt.weight_decay)

    cg_model.cuda()
    cg_model.train()
    cg_optimizer = optim.Adam(cg_model.parameters(),
                              lr=opt.lr,
                              weight_decay=opt.weight_decay)
    cg_crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    cg_optimizer = optim.Adam(cg_model.parameters(),
                              lr=opt.lr,
                              weight_decay=opt.weight_decay)

    allmodels = [tap_model, cg_model]
    optimizers = [tap_optimizer, cg_optimizer]

    if is_continue:
        tap_optimizer.load_state_dict(model_pth['tap_optimizer'])
        cg_optimizer.load_state_dict(model_pth['cg_optimizer'])

    update_lr_flag = True
    loss_sum = np.zeros(5)
    bad_video_num = 0
    best_epoch = epoch
    start = time.time()

    print_opt(opt, allmodels, logger)
    logger.info('\nStart training')

    # set a var to indicate what to train in current iteration: "tap", "cg" or "tap_cg"
    flag_training_whats = get_training_list(opt, logger)

    # Iteration begin
    while True:
        if update_lr_flag:
            if (epoch > opt.learning_rate_decay_start
                    and opt.learning_rate_decay_start >= 0):
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.lr * decay_factor
            else:
                opt.current_lr = opt.lr
            for optimizer in optimizers:
                utils.set_lr(optimizer, opt.current_lr)
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(None)
            else:
                sc_flag = False
            update_lr_flag = False

        flag_training_what = flag_training_whats[epoch]
        if opt.training_mode == "alter2":
            flag_training_what = flag_training_whats[iteration]

        # get data
        data = loader.get_batch('train')

        if opt.debug:
            print('vid:', data['vid'])
            print('info:', data['infos'])

        torch.cuda.synchronize()

        if (data["proposal_num"] <= 0) or (data['fc_feats'].shape[0] <= 1):
            bad_video_num += 1  # print('vid:{} has no good proposal.'.format(data['vid']))
            continue

        ind_select_list, soi_select_list, cg_select_list, sampled_ids, = data[
            'ind_select_list'], data['soi_select_list'], data[
                'cg_select_list'], data['sampled_ids']

        if flag_training_what == 'cg' or flag_training_what == 'gt_tap_cg':
            ind_select_list = data['gts_ind_select_list']
            soi_select_list = data['gts_soi_select_list']
            cg_select_list = data['gts_cg_select_list']

        tmp = [
            data['fc_feats'], data['att_feats'], data['lda_feats'],
            data['tap_labels'], data['tap_masks_for_loss'],
            data['cg_labels'][cg_select_list],
            data['cg_masks'][cg_select_list], data['w1']
        ]

        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]

        c3d_feats, att_feats, lda_feats, tap_labels, tap_masks_for_loss, cg_labels, cg_masks, w1 = tmp

        if (iteration - 1) % opt.m_batch == 0:
            tap_optimizer.zero_grad()
            cg_optimizer.zero_grad()

        tap_feats, pred_proposals = tap_model(c3d_feats)
        tap_loss = tap_crit(pred_proposals, tap_masks_for_loss, tap_labels, w1)

        loss_sum[0] = loss_sum[0] + tap_loss.item()

        # Backward Propagation
        if flag_training_what == 'tap':
            tap_loss.backward()
            utils.clip_gradient(tap_optimizer, opt.grad_clip)
            if iteration % opt.m_batch == 0:
                tap_optimizer.step()
        else:
            if not sc_flag:
                pred_captions = cg_model(tap_feats,
                                         c3d_feats,
                                         lda_feats,
                                         cg_labels,
                                         ind_select_list,
                                         soi_select_list,
                                         mode='train')
                cg_loss = cg_crit(pred_captions, cg_labels[:, 1:],
                                  cg_masks[:, 1:])

            else:
                gen_result, sample_logprobs, greedy_res = cg_model(
                    tap_feats,
                    c3d_feats,
                    lda_feats,
                    cg_labels,
                    ind_select_list,
                    soi_select_list,
                    mode='train_rl')
                sentence_info = data['sentences_batch'] if (
                    flag_training_what != 'cg'
                    and flag_training_what != 'gt_tap_cg'
                ) else data['gts_sentences_batch']

                reward = get_self_critical_reward2(
                    greedy_res, (data['vid'], sentence_info),
                    gen_result,
                    vocab=loader.get_vocab(),
                    opt=opt)
                cg_loss = rl_crit(sample_logprobs, gen_result,
                                  torch.from_numpy(reward).float().cuda())

            loss_sum[1] = loss_sum[1] + cg_loss.item()

            if flag_training_what == 'cg' or flag_training_what == 'gt_tap_cg' or flag_training_what == 'LP_cg':
                cg_loss.backward()

                utils.clip_gradient(cg_optimizer, opt.grad_clip)
                if iteration % opt.m_batch == 0:
                    cg_optimizer.step()
                if flag_training_what == 'gt_tap_cg':
                    utils.clip_gradient(tap_optimizer, opt.grad_clip)
                    if iteration % opt.m_batch == 0:
                        tap_optimizer.step()
            elif flag_training_what == 'tap_cg':
                total_loss = opt.lambda1 * tap_loss + opt.lambda2 * cg_loss
                total_loss.backward()
                utils.clip_gradient(tap_optimizer, opt.grad_clip)
                utils.clip_gradient(cg_optimizer, opt.grad_clip)
                if iteration % opt.m_batch == 0:
                    tap_optimizer.step()
                    cg_optimizer.step()

                loss_sum[2] = loss_sum[2] + total_loss.item()

        torch.cuda.synchronize()

        # Updating epoch num
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Print losses, Add to summary
        if iteration % opt.losses_log_every == 0:
            end = time.time()
            losses = np.round(loss_sum / opt.losses_log_every, 3)
            logger.info(
                "iter {} (epoch {}, lr {}), avg_iter_loss({}) = {}, time/batch = {:.3f}, bad_vid = {:.3f}" \
                    .format(iteration, epoch, opt.current_lr, flag_training_what, losses,
                            (end - start) / opt.losses_log_every,
                            bad_video_num))

            tf_writer.add_scalar('lr', opt.current_lr, iteration)
            tf_writer.add_scalar('train_tap_loss', losses[0], iteration)
            tf_writer.add_scalar('train_tap_prop_loss', losses[3], iteration)
            tf_writer.add_scalar('train_tap_bound_loss', losses[4], iteration)
            tf_writer.add_scalar('train_cg_loss', losses[1], iteration)
            tf_writer.add_scalar('train_total_loss', losses[2], iteration)
            if sc_flag and (not flag_training_what == 'tap'):
                tf_writer.add_scalar('avg_reward', np.mean(reward[:, 0]),
                                     iteration)
            loss_history[iteration] = losses
            lr_history[iteration] = opt.current_lr
            loss_sum = np.zeros(5)
            start = time.time()
            bad_video_num = 0

        # Evaluation, and save model
        if (iteration % opt.save_checkpoint_every
                == 0) and (epoch >= opt.min_epoch_when_save):
            eval_kwargs = {
                'split': 'val',
                'val_all_metrics': 0,
                'topN': 100,
            }

            eval_kwargs.update(vars(opt))

            # eval_kwargs['num_vids_eval'] = int(491)
            eval_kwargs['topN'] = 100

            eval_kwargs2 = {
                'split': 'val',
                'val_all_metrics': 1,
                'num_vids_eval': 4917,
            }
            eval_kwargs2.update(vars(opt))

            if not opt.num_vids_eval:
                eval_kwargs['num_vids_eval'] = int(4917.)
                eval_kwargs2['num_vids_eval'] = 4917

            crits = [tap_crit, cg_crit]
            pred_json_path_T = os.path.join(save_folder, 'pred_sent',
                                            'pred_num{}_iter{}.json')

            # if 'alter' in opt.training_mode:
            if flag_training_what == 'tap':
                eval_kwargs['topN'] = 1000
                predictions, eval_score, val_loss = eval_utils.eval_split(
                    allmodels,
                    crits,
                    loader,
                    pred_json_path_T.format(eval_kwargs['num_vids_eval'],
                                            iteration),
                    eval_kwargs,
                    flag_eval_what='tap')
            else:
                if vars(opt).get('fast_eval_cg', False) == False:
                    predictions, eval_score, val_loss = eval_utils.eval_split(
                        allmodels,
                        crits,
                        loader,
                        pred_json_path_T.format(eval_kwargs['num_vids_eval'],
                                                iteration),
                        eval_kwargs,
                        flag_eval_what='tap_cg')

                predictions2, eval_score2, val_loss2 = eval_utils.eval_split(
                    allmodels,
                    crits,
                    loader,
                    pred_json_path_T.format(eval_kwargs2['num_vids_eval'],
                                            iteration),
                    eval_kwargs2,
                    flag_eval_what='cg')

                if (not vars(opt).get('fast_eval_cg', False)
                        == False) or (not vars(opt).get(
                            'fast_eval_cg_top10', False) == False):
                    eval_score = eval_score2
                    val_loss = val_loss2
                    predictions = predictions2

            # else:
            #    predictions, eval_score, val_loss = eval_utils.eval_split(allmodels, crits, loader, pred_json_path,
            #                                                              eval_kwargs,
            #                                                              flag_eval_what=flag_training_what)

            f_f1 = lambda x, y: 2 * x * y / (x + y)
            f1 = f_f1(eval_score['Recall'], eval_score['Precision']).mean()
            if flag_training_what != 'tap':  # if only train tap, use the mean of precision and recall as final score
                current_score = np.array(eval_score['METEOR']).mean() * 100
            else:  # if train tap_cg, use avg_meteor as final score
                current_score = f1

            for model in allmodels:
                for name, param in model.named_parameters():
                    tf_writer.add_histogram(name,
                                            param.clone().cpu().data.numpy(),
                                            iteration,
                                            bins=10)
                    if param.grad is not None:
                        tf_writer.add_histogram(
                            name + '_grad',
                            param.grad.clone().cpu().data.numpy(),
                            iteration,
                            bins=10)

            tf_writer.add_scalar('val_tap_loss', val_loss[0], iteration)
            tf_writer.add_scalar('val_cg_loss', val_loss[1], iteration)
            tf_writer.add_scalar('val_tap_prop_loss', val_loss[3], iteration)
            tf_writer.add_scalar('val_tap_bound_loss', val_loss[4], iteration)
            tf_writer.add_scalar('val_total_loss', val_loss[2], iteration)
            tf_writer.add_scalar('val_score', current_score, iteration)
            if flag_training_what != 'tap':
                tf_writer.add_scalar('val_score_gt_METEOR',
                                     np.array(eval_score2['METEOR']).mean(),
                                     iteration)
                tf_writer.add_scalar('val_score_gt_Bleu_4',
                                     np.array(eval_score2['Bleu_4']).mean(),
                                     iteration)
                tf_writer.add_scalar('val_score_gt_CIDEr',
                                     np.array(eval_score2['CIDEr']).mean(),
                                     iteration)
            tf_writer.add_scalar('val_recall', eval_score['Recall'].mean(),
                                 iteration)
            tf_writer.add_scalar('val_precision',
                                 eval_score['Precision'].mean(), iteration)
            tf_writer.add_scalar('f1', f1, iteration)

            val_result_history[iteration] = {
                'val_loss': val_loss,
                'eval_score': eval_score
            }

            if flag_training_what == 'tap':
                logger.info(
                    'Validation the result of iter {}, score(f1/meteor):{},\n all:{}'
                    .format(iteration, current_score, eval_score))
            else:
                mean_score = {
                    k: np.array(v).mean()
                    for k, v in eval_score.items()
                }
                gt_mean_score = {
                    k: np.array(v).mean()
                    for k, v in eval_score2.items()
                }

                metrics = ['Bleu_4', 'CIDEr', 'METEOR', 'ROUGE_L']
                gt_avg_score = np.array([
                    v for metric, v in gt_mean_score.items()
                    if metric in metrics
                ]).sum()
                logger.info(
                    'Validation the result of iter {}, score(f1/meteor):{},\n all:{}\n mean:{} \n\n gt:{} \n mean:{}\n avg_score: {}'
                    .format(iteration, current_score, eval_score, mean_score,
                            eval_score2, gt_mean_score, gt_avg_score))

            # Save model .pth
            saved_pth = {
                'iteration': iteration,
                'cg_model': cg_model.state_dict(),
                'tap_model': tap_model.state_dict(),
                'cg_optimizer': cg_optimizer.state_dict(),
                'tap_optimizer': tap_optimizer.state_dict(),
            }

            if opt.save_all_checkpoint:
                checkpoint_path = os.path.join(
                    save_folder, 'model_iter_{}.pth'.format(iteration))
            else:
                checkpoint_path = os.path.join(save_folder, 'model.pth')
            torch.save(saved_pth, checkpoint_path)
            logger.info('Save model at iter {} to checkpoint file {}.'.format(
                iteration, checkpoint_path))

            # save info.pkl
            if current_score > best_val_score:
                best_val_score = current_score
                best_epoch = epoch
                saved_info['best'] = {
                    'opt': opt,
                    'iter': iteration,
                    'epoch': epoch,
                    'iterators': loader.iterators,
                    'flag_training_what': flag_training_what,
                    'split_ix': loader.split_ix,
                    'best_val_score': best_val_score,
                    'vocab': loader.get_vocab(),
                }

                best_checkpoint_path = os.path.join(save_folder,
                                                    'model-best.pth')
                torch.save(saved_pth, best_checkpoint_path)
                logger.info(
                    'Save Best-model at iter {} to checkpoint file.'.format(
                        iteration))

            saved_info['last'] = {
                'opt': opt,
                'iter': iteration,
                'epoch': epoch,
                'iterators': loader.iterators,
                'flag_training_what': flag_training_what,
                'split_ix': loader.split_ix,
                'best_val_score': best_val_score,
                'vocab': loader.get_vocab(),
            }
            saved_info['history'] = {
                'val_result_history': val_result_history,
                'loss_history': loss_history,
                'lr_history': lr_history,
            }
            with open(os.path.join(save_folder, 'info.pkl'), 'w') as f:
                cPickle.dump(saved_info, f)
                logger.info('Save info to info.pkl')

            # Stop criterion
            if epoch >= len(flag_training_whats):
                tf_writer.close()
                break
Exemple #20
0
def train(opt):
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    torch.cuda.set_device(opt.device_num)  #AK
    print(torch.cuda.current_device())  #AK

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            #             need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"] #for now removed caption_model
            need_be_same = ["rnn_type", "rnn_size",
                            "num_layers"]  #for now removed caption_model
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

#     pdb.set_trace()
    model = models.setup(opt).cuda(opt.device_num)
    dp_model = model  #torch.nn.DataParallel(model) variable length in RNN, unable to use parallelism

    update_lr_flag = True
    # Assure in training mode
    dp_model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    if opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth'),
                       map_location="cuda:" + str(opt.device_num)))


#         optimizer.to(opt.device_num)

    while True:
        if update_lr_flag:
            if not opt.reduce_on_plateau:
                # Assign the learning rate
                if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                    frac = (epoch - opt.learning_rate_decay_start
                            ) // opt.learning_rate_decay_every
                    decay_factor = opt.learning_rate_decay_rate**frac
                    opt.current_lr = opt.learning_rate * decay_factor
                else:
                    opt.current_lr = opt.learning_rate
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate

            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)
        '''checking data object'''
        #         print(type(data))
        #         print(list(data))
        #         pdb.set_trace()

        torch.cuda.synchronize()
        start = time.time()
        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['masks'],
            data['att_masks']
        ]
        tmp = [
            _ if _ is None else torch.from_numpy(_).cuda(opt.device_num)
            for _ in tmp
        ]
        #         pdb.set_trace()
        fc_feats, att_feats, labels, masks, att_masks = tmp

        optimizer.zero_grad()
        if not sc_flag:
            #             pdb.set_trace()
            loss = crit(dp_model(fc_feats, att_feats, labels, att_masks),
                        labels[:, 1:], masks[:, 1:])

        else:
            gen_result, sample_logprobs = dp_model(fc_feats,
                                                   att_feats,
                                                   att_masks,
                                                   opt={'sample_max': 0},
                                                   mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                              att_masks, data, gen_result, opt)
            loss = rl_crit(
                sample_logprobs, gen_result.data,
                torch.from_numpy(reward).float().cuda(opt.device_num))

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag:
            #             pass
            print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            #             pass
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
            print("Iter: {}, Loss: {}, Epoc: {}, LR: {}".format(
                iteration, train_loss, epoch, opt.current_lr))

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            if opt.reduce_on_plateau:
                opt.current_lr = optimizer.current_lr
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)

            if opt.reduce_on_plateau:
                if 'CIDEr' in lang_stats:
                    optimizer.scheduler_step(-lang_stats['CIDEr'])
                else:
                    optimizer.scheduler_step(val_loss)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #21
0
def train(opt):
    opt.use_att = utils.if_use_att(opt)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tf_summary_writer = tf and SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
        best_val_score_vse = infos.get('best_val_score_vse', None)

    model = models.JointModel(opt)
    model.cuda()

    update_lr_flag = True
    # Assure in training mode
    model.train()

    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, 'optimizer.pth')):
        state_dict = torch.load(os.path.join(opt.start_from, 'optimizer.pth'))
        if len(state_dict['state']) == len(optimizer.state_dict()['state']):
            optimizer.load_state_dict(state_dict)
        else:
            print(
                'Optimizer param group number not matched? There must be new parameters. Reinit the optimizer.'
            )

    init_scorer(opt.cached_tokens)
    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.caption_generator.ss_prob = opt.ss_prob
            # Assign retrieval loss weight
            if epoch > opt.retrieval_reward_weight_decay_start and opt.retrieval_reward_weight_decay_start >= 0:
                frac = (epoch - opt.retrieval_reward_weight_decay_start
                        ) // opt.retrieval_reward_weight_decay_every
                model.retrieval_reward_weight = opt.retrieval_reward_weight * (
                    opt.retrieval_reward_weight_decay_rate**frac)
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [
            data['fc_feats'], data['att_feats'], data['att_masks'],
            data['labels'], data['masks']
        ]
        tmp = utils.var_wrapper(tmp)
        fc_feats, att_feats, att_masks, labels, masks = tmp

        optimizer.zero_grad()

        loss = model(fc_feats, att_feats, att_masks, labels, masks, data)
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))
        prt_str = ""
        for k, v in model.loss().items():
            prt_str += "{} = {:.3f} ".format(k, v)
        print(prt_str)

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                tf_summary_writer.add_scalar('train_loss', train_loss,
                                             iteration)
                for k, v in model.loss().items():
                    tf_summary_writer.add_scalar(k, v, iteration)
                tf_summary_writer.add_scalar('learning_rate', opt.current_lr,
                                             iteration)
                tf_summary_writer.add_scalar('scheduled_sampling_prob',
                                             model.caption_generator.ss_prob,
                                             iteration)
                tf_summary_writer.add_scalar('retrieval_reward_weight',
                                             model.retrieval_reward_weight,
                                             iteration)
                tf_summary_writer.file_writer.flush()

            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.caption_generator.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            # Load the retrieval model for evaluation
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, loader, eval_kwargs)

            # Write validation result into summary
            if tf is not None:
                for k, v in val_loss.items():
                    tf_summary_writer.add_scalar('validation ' + k, v,
                                                 iteration)
                for k, v in lang_stats.items():
                    tf_summary_writer.add_scalar(k, v, iteration)
                tf_summary_writer.add_text(
                    'Captions',
                    '.\n\n'.join([_['caption'] for _ in predictions[:100]]),
                    iteration)
                #tf_summary_writer.add_image('images', utils.make_summary_image(), iteration)
                #utils.make_html(opt.id, iteration)
                tf_summary_writer.file_writer.flush()

            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['SPICE'] * 100
            else:
                current_score = -val_loss['loss_cap']
            current_score_vse = val_loss.get(opt.vse_eval_criterion, 0) * 100

            best_flag = False
            best_flag_vse = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                if best_val_score_vse is None or current_score_vse > best_val_score_vse:
                    best_val_score_vse = current_score_vse
                    best_flag_vse = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model-%d.pth' % (iteration))
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['best_val_score_vse'] = best_val_score_vse
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(
                            opt.checkpoint_path,
                            'infos_' + opt.id + '-%d.pkl' % (iteration)),
                        'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)
                if best_flag_vse:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model_vse-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_vse_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #22
0
def train(rank, model, opt, optimizer=None):
    torch.manual_seed(opt.seed + rank)
    if opt.use_cuda:
        torch.cuda.manual_seed(opt.seed + rank)

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    infos = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(
                os.path.join(opt.start_from,
                             'infos_' + opt.load_model_id + '.pkl'),
                'rb') as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    ss_prob_history = infos.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_image_id = infos.get('split_image_id', loader.split_image_id)
    best_val_score = 0
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    update_lr_flag = True
    if opt.caption_model == 'show_tell':
        crit = utils.LanguageModelCriterion(opt)

    elif opt.caption_model == 'review_net':
        crit = utils.ReviewNetCriterion(opt)

    elif opt.caption_model == 'recurrent_fusion_model':
        crit = utils.ReviewNetEnsembleCriterion(opt)

    else:
        raise Exception("caption_model not supported: {}".format(
            opt.caption_model))

    if optimizer is None:
        if opt.optim == 'adam':
            optimizer = optim.Adam(model.parameters(),
                                   lr=opt.optim_lr,
                                   betas=(opt.optim_adam_beta1,
                                          opt.optim_adam_beta2),
                                   weight_decay=opt.optim_weight_decay)
        elif opt.optim == 'rmsprop':
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=opt.optim_lr,
                                      momentum=opt.optim_momentum,
                                      alpha=opt.optim_rmsprop_alpha,
                                      weight_decay=opt.weight_decay)
        elif opt.optim == 'sgd':
            optimizer = optim.SGD(model.parameters(),
                                  lr=opt.optim_lr,
                                  momentum=opt.optim_momentum,
                                  weight_decay=opt.optim_weight_decay)
        elif opt.optim == 'adagrad':
            optimizer = optim.Adagrad(model.parameters(),
                                      lr=opt.optim_lr,
                                      lr_decay=opt.optim_lr_decay,
                                      weight_decay=opt.optim_weight_decay)
        elif opt.optim == 'adadelta':
            optimizer = optim.Adadelta(model.parameters(),
                                       rho=opt.optim_rho,
                                       eps=opt.optim_epsilon,
                                       lr=opt.optim_lr,
                                       weight_decay=opt.optim_weight_decay)
        else:
            raise Exception("optim not supported: {}".format(opt.feature_type))

        # Load the optimizer
        if vars(opt).get('start_from', None) is not None:
            optimizer.load_state_dict(
                torch.load(
                    os.path.join(opt.start_from,
                                 'optimizer_' + opt.load_model_id + '.pth')))

    num_period_best = 0
    current_score = 0
    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.optim_lr * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.optim_lr
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')

        if opt.use_cuda:
            torch.cuda.synchronize()

        if opt.feature_type == 'feat_array':
            fc_feat_array = data['fc_feats_array']
            att_feat_array = data['att_feats_array']
            assert (len(fc_feat_array) == len(att_feat_array))
            for feat_id in range(len(fc_feat_array)):
                if opt.use_cuda:
                    fc_feat_array[feat_id] = Variable(
                        torch.from_numpy(fc_feat_array[feat_id]),
                        requires_grad=False).cuda()
                    att_feat_array[feat_id] = Variable(
                        torch.from_numpy(att_feat_array[feat_id]),
                        requires_grad=False).cuda()
                else:
                    fc_feat_array[feat_id] = Variable(torch.from_numpy(
                        fc_feat_array[feat_id]),
                                                      requires_grad=False)
                    att_feat_array[feat_id] = Variable(torch.from_numpy(
                        att_feat_array[feat_id]),
                                                       requires_grad=False)

            tmp = [data['labels'], data['masks'], data['top_words']]
            if opt.use_cuda:
                tmp = [
                    Variable(torch.from_numpy(_), requires_grad=False).cuda()
                    for _ in tmp
                ]
            else:
                tmp = [
                    Variable(torch.from_numpy(_), requires_grad=False)
                    for _ in tmp
                ]
            labels, masks, top_words = tmp

        else:
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['top_words']
            ]
            if opt.use_cuda:
                tmp = [
                    Variable(torch.from_numpy(_), requires_grad=False).cuda()
                    for _ in tmp
                ]
            else:
                tmp = [
                    Variable(torch.from_numpy(_), requires_grad=False)
                    for _ in tmp
                ]
            fc_feats, att_feats, labels, masks, top_words = tmp

        optimizer.zero_grad()

        if opt.caption_model == 'show_tell':
            log_prob = model(fc_feats, att_feats, labels)  # (80L, 16L, 9488L)
            loss = crit(log_prob, labels[:, 1:], masks[:, 1:])

        elif opt.caption_model == 'review_net':
            log_prob, top_pred = model(fc_feats, att_feats,
                                       labels)  # (80L, 16L, 9488L)
            loss = crit(log_prob, labels[:, 1:], masks[:, 1:], top_pred,
                        top_words, opt.reason_weight)

        elif opt.caption_model == 'recurrent_fusion_model':
            log_prob, top_pred = model(fc_feat_array, att_feat_array,
                                       labels)  # (80L, 16L, 9488L)
            loss = crit(log_prob, labels[:, 1:], masks[:, 1:], top_pred,
                        top_words, opt.reason_weight)

        else:
            raise Exception("caption_model not supported: {}".format(
                opt.caption_model))

        loss.backward()

        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.data[0]
        if opt.use_cuda:
            torch.cuda.synchronize()
        end = time.time()

        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if iteration % opt.losses_log_every == 0:
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if iteration % opt.save_checkpoint_every == 0:
            # eval model
            eval_kwargs = {
                'eval_split': 'val',
                'dataset': opt.input_json,
                'caption_model': opt.caption_model,
                'reason_weight': opt.reason_weight,
                'guiding_l1_penality': opt.guiding_l1_penality,
                'use_cuda': opt.use_cuda,
                'feature_type': opt.feature_type,
                'rank': rank,
                'val_images_use': opt.val_images_use,
                'language_eval': 1
            }
            eval_kwargs.update(vars(opt))
            eval_kwargs['eval_split'] = 'val'
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, crit, loader, eval_kwargs)

            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False

            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
                num_period_best = 1
            else:
                num_period_best = num_period_best + 1

            # Dump miscalleous informations
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_image_id'] = loader.split_image_id
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['val_result_history'] = val_result_history
            infos['loss_history'] = loss_history
            infos['lr_history'] = lr_history
            infos['ss_prob_history'] = ss_prob_history
            infos['vocab'] = loader.get_vocab()
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'infos_' + opt.id + '_' + str(rank) + '.pkl'),
                    'wb') as f:
                cPickle.dump(infos, f)

            if best_flag:
                checkpoint_path = os.path.join(
                    opt.checkpoint_path,
                    'model_' + opt.id + '_' + str(rank) + '-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                optimizer_path = os.path.join(
                    opt.checkpoint_path,
                    'optimizer_' + opt.id + '_' + str(rank) + '-best.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                print("model saved to {}".format(checkpoint_path))
                with open(
                        os.path.join(
                            opt.checkpoint_path,
                            'infos_' + opt.id + '_' + str(rank) + '-best.pkl'),
                        'wb') as f:
                    cPickle.dump(infos, f)

            if num_period_best >= opt.num_eval_no_improve:
                print('no improvement, exit')
                sys.exit()

        print(
            "rank {}, iter {}, (epoch {}), train loss: {}, learning rate: {}, current cider: {:.3f}, best cider: {:.3f}, time: {:.3f}"
            .format(rank, iteration, epoch, train_loss, opt.current_lr,
                    current_score, best_val_score, (end - start)))
        iteration += 1
        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
Exemple #23
0
def train(opt):
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    infos = {}
    if opt.start_from is not None and len(opt.start_from) > 0:
        print("start from %s" % (opt.start_from))
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model_cnn = models.setup_resnet152(opt)
    model_cnn.cuda()

    model = models.setup_mixer(opt)
    model.cuda()

    update_lr_flag = True
    is_reinforce = False

    model_cnn.train()
    model.train()

    fc_expander = utils.FeatExpander(5)
    att_expander = utils.FeatExpander(5)

    crit = Criterion.LanguageModelCriterion()
    crit_reinforce = ReinforceCriterion.ReinforceCriterion()

    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    optimizer_cnn = optim.Adam(model_cnn.parameters(),
                               lr=opt.cnn_learning_rate,
                               weight_decay=opt.cnn_weight_decay)

    # Load the optimizer
    if opt.start_from is not None and len(opt.start_from) > 0:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        optimizer_cnn.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer_cnn.pth')))

    while True:
        if update_lr_flag:

            # Assign the learning rate
            if opt.learning_rate_decay_start >= 0 and epoch >= opt.learning_rate_decay_start:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate

            if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
                for p in model_cnn.parameters():
                    p.requires_grad = True
                model_cnn.train()
            else:
                for p in model_cnn.parameters():
                    p.requires_grad = False
                model_cnn.eval()

            update_lr_flag = False

        if opt.reinforce_start >= 0 and epoch >= opt.reinforce_start:
            is_reinforce = True

        start_total = time.time()

        data = loader.get_batch('train')

        tmp = [data['images'], data['labels']]
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        images, labels = tmp

        images = utils.prepro(images, True)
        fc_feats, att_feats = model_cnn(images)

        fc_feats_ext = fc_expander(fc_feats)
        att_feats_ext = att_expander(att_feats)

        optimizer.zero_grad()
        if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
            optimizer_cnn.zero_grad()

        output = model(fc_feats_ext, att_feats_ext, labels, is_reinforce)

        if is_reinforce:
            loss = crit_reinforce(output, labels)
        else:
            loss = crit(output, labels)

        loss.backward()

        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
            utils.clip_gradient(optimizer_cnn, opt.grad_clip)
            optimizer_cnn.step()

        train_loss = loss.data[0]

        print("iter {} (epoch {}), train_loss = {:.3f}, reinforce = {} time/batch = {:.3f}" \
              .format(iteration, epoch, train_loss, is_reinforce, time.time() - start_total))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats, str_stats = eval_utils.eval_split(
                model_cnn, model, crit, loader, eval_kwargs)

            if not os.path.exists(opt.eval_result_path):
                os.makedirs(opt.eval_result_path)

            eval_result_file = os.path.join(opt.eval_result_path,
                                            opt.id + ".csv")
            with open(eval_result_file, 'a') as f:
                f.write(str_stats + "\n")

            predictions_file = os.path.join(opt.eval_result_path,
                                            opt.id + ".json")
            with open(predictions_file, 'w') as f:
                json.dump(predictions, f)

            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                if not os.path.exists(opt.checkpoint_path):
                    os.makedirs(opt.checkpoint_path)

                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))

                checkpoint_path_cnn = os.path.join(opt.checkpoint_path,
                                                   'model_cnn.pth')
                torch.save(model_cnn.state_dict(), checkpoint_path_cnn)
                print("model cnn saved to {}".format(checkpoint_path_cnn))

                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                print("optimizer saved to {}".format(optimizer_path))

                optimizer_path_cnn = os.path.join(opt.checkpoint_path,
                                                  'optimizer_cnn.pth')
                torch.save(optimizer_cnn.state_dict(), optimizer_path_cnn)
                print("optimizer cnn saved to {}".format(optimizer_path_cnn))

                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['val_result_history'] = val_result_history
                infos['loss_history'] = loss_history
                infos['lr_history'] = lr_history
                infos['vocab'] = loader.get_vocab()

                info_path = os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '.pkl')
                with open(info_path, 'wb') as f:
                    cPickle.dump(infos, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model_best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))

                    checkpoint_path_cnn = os.path.join(opt.checkpoint_path,
                                                       'model_cnn_best.pth')
                    torch.save(model_cnn.state_dict(), checkpoint_path_cnn)
                    print("model cnn saved to {}".format(checkpoint_path_cnn))

                    info_path = os.path.join(opt.checkpoint_path,
                                             'infos_' + opt.id + '_best.pkl')
                    with open(info_path, 'wb') as f:
                        cPickle.dump(infos, f)

        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    # Deal with feature things before anything
    opt.use_fc, opt.use_att = utils.if_use_feat(opt.caption_model)
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    acc_steps = getattr(opt, 'acc_steps', 1)

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'),
                  'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl'), 'rb') as f:
                histories = utils.pickle_load(f)
    else:
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['iterators'] = loader.iterators
        infos['split_ix'] = loader.split_ix
        infos['vocab'] = loader.get_vocab()
    infos['opt'] = opt

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    del opt.vocab
    dp_model = torch.nn.DataParallel(model)
    lw_model = LossWrapper(model, opt)
    dp_lw_model = torch.nn.DataParallel(lw_model)

    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()

    if opt.noamopt:
        assert opt.caption_model in [
            'transformer', 'aoa'
        ], 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    def save_checkpoint(model, infos, optimizer, histories=None, append=''):
        if len(append) > 0:
            append = '-' + append
        # if checkpoint_path doesn't exist
        if not os.path.isdir(opt.checkpoint_path):
            os.makedirs(opt.checkpoint_path)
        checkpoint_path = os.path.join(opt.checkpoint_path,
                                       'model%s.pth' % (append))
        torch.save(model.state_dict(), checkpoint_path)
        print("model saved to {}".format(checkpoint_path))
        optimizer_path = os.path.join(opt.checkpoint_path,
                                      'optimizer%s.pth' % (append))
        torch.save(optimizer.state_dict(), optimizer_path)
        with open(
                os.path.join(opt.checkpoint_path,
                             'infos_' + opt.id + '%s.pkl' % (append)),
                'wb') as f:
            utils.pickle_dump(infos, f)
        if histories:
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'histories_' + opt.id + '%s.pkl' % (append)),
                    'wb') as f:
                utils.pickle_dump(histories, f)

    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start
                                ) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer,
                                 opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start
                            ) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(
                        opt.scheduled_sampling_increase_prob * frac,
                        opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob

                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False

                epoch_done = False

            start = time.time()
            if (opt.use_warmup == 1) and (iteration < opt.noamopt_warmup):
                opt.current_lr = opt.learning_rate * (iteration +
                                                      1) / opt.noamopt_warmup
                utils.set_lr(optimizer, opt.current_lr)
            # Load data from train split (0)
            data = loader.get_batch('train')
            print('Read data:', time.time() - start)

            if (iteration % acc_steps == 0):
                optimizer.zero_grad()

            torch.cuda.synchronize()
            start = time.time()
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            model_out = dp_lw_model(fc_feats, att_feats, labels, masks,
                                    att_masks, data['gts'],
                                    torch.arange(0, len(data['gts'])), sc_flag)

            loss = model_out['loss'].mean()
            loss_sp = loss / acc_steps

            loss_sp.backward()
            if ((iteration + 1) % acc_steps == 0):
                utils.clip_gradient(optimizer, opt.grad_clip)
                optimizer.step()
            torch.cuda.synchronize()
            train_loss = loss.item()
            end = time.time()
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, model_out['reward'].mean(), end - start))

            if opt.caption_model == 'aat':
                attention_steps = np.array(model_out['att_step']).transpose()
                avg_att_time = model_out['avg_att_time']
                loss_ = model_out['loss_'].mean().item()
                aat_loss = model_out['aat_loss'].mean().item()
                print("AAT: loss_ = {:.3f}, att_loss = {:.3f}, avg_att_time = {:.3f}" \
                    .format(loss_, aat_loss, avg_att_time))
                print(attention_steps[0])

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                                  iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                add_summary_value(tb_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                                  model.ss_prob, iteration)
                if sc_flag:
                    add_summary_value(tb_summary_writer, 'avg_reward',
                                      model_out['reward'].mean(), iteration)

                if opt.caption_model == 'aat':
                    add_summary_value(tb_summary_writer, 'loss_', loss_,
                                      iteration)
                    add_summary_value(tb_summary_writer, 'aat_loss', aat_loss,
                                      iteration)
                    add_summary_value(tb_summary_writer, 'avg_att_time',
                                      avg_att_time, iteration)

                loss_history[
                    iteration] = train_loss if not sc_flag else model_out[
                        'reward'].mean()
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    dp_model, lw_model.crit, loader, eval_kwargs)

                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                add_summary_value(tb_summary_writer, 'validation loss',
                                  val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        add_summary_value(tb_summary_writer, k, v, iteration)
                val_result_history[iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss

                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history

                save_checkpoint(model, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    save_checkpoint(model,
                                    infos,
                                    optimizer,
                                    append=str(iteration))

                if best_flag:
                    save_checkpoint(model, infos, optimizer, append='best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        save_checkpoint(model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
Exemple #25
0
def train(opt):
    # tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)
    if not os.path.exists(opt.checkpoint_path):
        os.makedirs(opt.checkpoint_path)

    with open(os.path.join(opt.checkpoint_path, 'config.json'), 'w') as f:
        json.dump(vars(opt), f, indent=4)

    writer = None
    if tb is not None:
        import shutil
        now = datetime.now()
        if opt.reset_tensorboard:
            for d in os.listdir(opt.checkpoint_path):
                d = os.path.join(opt.checkpoint_path, d)
                if os.path.isdir(d) and 'tb_' in d:
                    shutil.rmtree(d)
                    print('remove', d)
        logdir = os.path.join(opt.checkpoint_path,
                              'tb_' + now.strftime("%Y%m%d-%H%M%S") + "/")
        writer = tb.SummaryWriter(logdir)

    # Load iterators
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.vocab = loader.get_vocab()
    opt.blank_token = loader.get_blank_token()
    opt.seq_length = loader.seq_length

    opt.unique_characters = loader.unique_characters
    opt.max_characters = loader.max_characters
    if opt.glove is not None:
        opt.glove_npy = loader.build_glove(opt.glove)
    else:
        opt.glove_npy = None

    # set up models
    gen_model = FillInCharacter(opt)
    gen_model = gen_model.cuda()

    if torch.cuda.device_count() > 1:
        gen_model = nn.DataParallel(gen_model)
    gen_model.train()
    gen_optimizer = utils.build_optimizer(gen_model.parameters(), opt)

    # keep track of iteration
    g_iter = 0
    g_epoch = 0
    update_lr_flag = True

    # Load from checkpoint path
    infos = {'opt': opt}
    histories = {}
    infos['vocab'] = loader.get_vocab()
    if opt.start_from is not None:
        # Open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos.pkl'), 'rb') as f:
            infos = pickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        # Load train/val histories
        with open(os.path.join(opt.start_from, 'histories.pkl'), 'rb') as f:
            histories = pickle.load(f)

        # Load generator
        start_epoch = opt.start_epoch
        g_model_path = os.path.join(opt.start_from, "gen_%s.pth" % start_epoch)
        g_optimizer_path = os.path.join(opt.start_from,
                                        "gen_optimizer_%s.pth" % start_epoch)
        assert os.path.isfile(g_model_path) and os.path.isfile(
            g_optimizer_path)
        gen_model.load_state_dict(torch.load(g_model_path))
        gen_optimizer.load_state_dict(torch.load(g_optimizer_path))
        if "latest" not in start_epoch and "best" != start_epoch:
            g_epoch = int(start_epoch) + 1
            g_iter = (g_epoch) * loader.split_size['train'] // opt.batch_size
        elif start_epoch == "best":
            g_epoch = infos['g_epoch_' + start_epoch] + 1
            g_iter = (g_epoch) * loader.split_size['train'] // opt.batch_size
        else:
            g_epoch = infos['g_epoch_' + start_epoch] + 1
            g_iter = infos['g_iter_' + start_epoch]
        print('loaded %s (epoch: %d iter: %d)' %
              (g_model_path, g_epoch, g_iter))
    infos['opt'] = opt
    loader.iterators = infos.get('g_iterators', loader.iterators)

    # misc
    best_val_score = infos.get('g_best_score', None)
    opt.seq_length = loader.seq_length
    opt.video = 1
    g_val_result_history = histories.get('g_val_result_history', {})
    g_loss_history = histories.get('g_loss_history', {})
    """ START TRAINING """
    while g_epoch < opt.pre_nepoch:
        # gc.collect()
        # set every epoch
        if update_lr_flag:
            # Assign the learning rate for generator
            if g_epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (g_epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(gen_optimizer, opt.current_lr)

            # Assign the scheduled sampling prob
            if g_epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (g_epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                gen_model.ss_prob = opt.ss_prob

            update_lr_flag = False
        """ TRAIN GENERATOR """
        gen_model.train()
        start = time.time()
        gen_loss, wrapped, sent_num = train_generator(gen_model, gen_optimizer,
                                                      loader, opt.grad_clip)
        end = time.time()

        # Print Info
        if g_iter % opt.losses_print_every == 0:
            print("g_iter {} (g_epoch {}), gen_loss = {:.3f}, time/batch = {:.3f}" \
                .format(g_iter, g_epoch, gen_loss, end - start))

        # Log Losses
        if g_iter % opt.losses_log_every == 0:
            g_loss = gen_loss
            loss_history = {'g_loss': g_loss, 'g_epoch': g_epoch}
            g_loss_history[g_iter] = loss_history
            log_metrics(writer, g_iter, loss_history)

        # Update the iteration
        g_iter += 1

        #########################
        # Evaluate & Save Model #
        #########################
        if wrapped:
            # evaluate model on dev set
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.input_json,
                'sample_max': 1,
                'eval_accuracy': opt.eval_accuracy,
                'id': opt.val_id,
                'val_videos_use': opt.val_videos_use,
                'remove': 1
            }  # remove generated caption
            val_loss, predictions, accuracy = eval_split(
                gen_model, loader, eval_kwargs=eval_kwargs)
            if opt.eval_accuracy == 1:
                current_score = accuracy[
                    'Class Accuracy'] if 'Class Accuracy' in accuracy else accuracy[
                        'Instance Accuracy']
            else:
                current_score = -val_loss
            g_val_result_history[g_epoch] = {
                'g_val_loss': val_loss,
                'g_val_score': current_score
            }
            print('validation:', g_val_result_history[g_epoch])

            # Save the best generator model
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'gen_best.pth')
                torch.save(
                    gen_optimizer.state_dict(),
                    os.path.join(opt.checkpoint_path,
                                 'gen_optimizer_best.pth'))
                infos['g_epoch_best'] = g_epoch
                infos['g_iter_best'] = g_iter
                infos['g_best_score'] = best_val_score
                torch.save(gen_model.state_dict(), checkpoint_path)
                print("best fill in model saved to {} with score {}".format(
                    checkpoint_path, current_score))

            # Dump miscalleous informations and save
            infos['g_epoch_latest'] = g_epoch
            infos['g_iter_latest'] = g_iter
            infos['g_iterators'] = loader.iterators
            histories['g_val_result_history'] = g_val_result_history
            histories['g_loss_history'] = g_loss_history
            with open(os.path.join(opt.checkpoint_path, 'infos.pkl'),
                      'wb') as f:
                pickle.dump(infos, f)
            with open(os.path.join(opt.checkpoint_path, 'histories.pkl'),
                      'wb') as f:
                pickle.dump(histories, f)
            log_metrics(writer, g_iter, g_val_result_history[g_epoch])

            # save the latest model
            if opt.save_checkpoint_every > 0 and g_epoch % opt.save_checkpoint_every == 0:
                torch.save(
                    gen_model.state_dict(),
                    os.path.join(opt.checkpoint_path, 'gen_%d.pth' % g_epoch))
                torch.save(gen_model.state_dict(),
                           os.path.join(opt.checkpoint_path, 'gen_latest.pth'))
                torch.save(
                    gen_optimizer.state_dict(),
                    os.path.join(opt.checkpoint_path,
                                 'gen_optimizer_%d.pth' % g_epoch))
                torch.save(
                    gen_optimizer.state_dict(),
                    os.path.join(opt.checkpoint_path,
                                 'gen_optimizer_latest.pth'))
                print("fill in model saved to {} at epoch {}".format(
                    opt.checkpoint_path, g_epoch))

            # update epoch and lr
            g_epoch += 1
            update_lr_flag = True
Exemple #26
0
def train(opt):

    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    infos = {
        'iter': 0,
        'epoch': 0,
        'loader_state_dict': None,
        'vocab': loader.get_vocab(),
    }
    # Load old infos(if there is) and check if models are compatible
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl')):
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert getattr(saved_model_opt, checkme) == getattr(opt, checkme), "Command line argument and saved model disagree on '%s' " % checkme
    infos['opt'] = opt

    #########################
    # Build logger
    #########################
    # naive dict logger
    histories = defaultdict(dict)
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')):
        with open(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl'), 'rb') as f:
            histories.update(utils.pickle_load(f))

    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    del opt.vocab
    # Load pretrained weights:
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, 'model.pth')):
        model.load_state_dict(torch.load(os.path.join(opt.start_from, 'model.pth')))
    
    # Wrap generation model with loss function(used for training)
    # This allows loss function computed separately on each machine
    lw_model = LossWrapper(model, opt)
    # Wrap with dataparallel
    dp_model = torch.nn.DataParallel(model)
    dp_lw_model = torch.nn.DataParallel(lw_model)

    ##########################
    #  Build optimizer
    ##########################
    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup)
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from,"optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    # For back compatibility
    if 'iterators' in infos:
        infos['loader_state_dict'] = {split: {'index_list': infos['split_ix'][split], 'iter_counter': infos['iterators'][split]} for split in ['train', 'val', 'test']}
    loader.load_state_dict(infos['loader_state_dict'])
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    if opt.noamopt:
        optimizer._step = iteration
    # flag indicating finish of an epoch
    # Always set to True at the beginning to initialize the lr or etc.
    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()

    # Start training
    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate  ** frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer, opt.current_lr) # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(opt.scheduled_sampling_increase_prob  * frac, opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob

                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False
                
                # If start structure loss training
                if opt.structure_after != -1 and epoch >= opt.structure_after:
                    struc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    struc_flag = False

                epoch_done = False
                    
            start = time.time()
            # Load data from train split (0)
            data = loader.get_batch('train')
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()

            tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp
            
            optimizer.zero_grad()
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks, att_masks, data['gts'], torch.arange(0, len(data['gts'])), sc_flag, struc_flag)

            loss = model_out['loss'].mean()

            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()
            if struc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, model_out['lm_loss'].mean().item(), model_out['struc_loss'].mean().item(), end - start))
            elif not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, model_out['reward'].mean(), end - start))

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                tb_summary_writer.add_scalar('train_loss', train_loss, iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr, iteration)
                tb_summary_writer.add_scalar('scheduled_sampling_prob', model.ss_prob, iteration)
                if sc_flag:
                    tb_summary_writer.add_scalar('avg_reward', model_out['reward'].mean(), iteration)
                elif struc_flag:
                    tb_summary_writer.add_scalar('lm_loss', model_out['lm_loss'].mean().item(), iteration)
                    tb_summary_writer.add_scalar('struc_loss', model_out['struc_loss'].mean().item(), iteration)
                    tb_summary_writer.add_scalar('reward', model_out['reward'].mean().item(), iteration)

                histories['loss_history'][iteration] = train_loss if not sc_flag else model_out['reward'].mean()
                histories['lr_history'][iteration] = opt.current_lr
                histories['ss_prob_history'][iteration] = model.ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['loader_state_dict'] = loader.state_dict()
            
            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val',
                                'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    dp_model, lw_model.crit, loader, eval_kwargs)

                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                tb_summary_writer.add_scalar('validation loss', val_loss, iteration)
                if lang_stats is not None:
                    for k,v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)
                histories['val_result_history'][iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = - val_loss

                best_flag = False

                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score

                utils.save_checkpoint(opt, model, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    utils.save_checkpoint(opt, model, infos, optimizer, append=str(iteration))

                if best_flag:
                    utils.save_checkpoint(opt, model, infos, optimizer, append='best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        utils.save_checkpoint(opt, model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
Exemple #27
0
else:
    raise Exception("caption_model not supported: {}".format(opt.caption_model))

loader = DataLoader(opt)
eval_kwargs = {'eval_split': opt.eval_split,
               'beam_size': opt.beam_size,
               'dataset': opt.input_json,
               'caption_model': opt.caption_model,
               'reason_weight': opt.reason_weight,  ##
               'guiding_l1_penality': opt.guiding_l1_penality,
               'use_cuda': opt.use_cuda,
               'feature_type': opt.feature_type,
               'language_eval': opt.language_eval,
               'val_images_use': opt.val_images_use,
               'verbose': opt.verbose,
               'sample_max': opt.sample_max,
               'print_beam_candidate': opt.print_beam_candidate,
               'id': opt.id,
               'print_top_words': 0
               }


# Set sample options
# loss, split_predictions, lang_stats = eval_utils.eval_eval(model, crit, loader, vars(opt))
loss, split_predictions, lang_stats = eval_utils.eval_split(model, crit, loader, eval_kwargs)

print('loss: ', loss)
if lang_stats:
    print(lang_stats)
Exemple #28
0
    loader = DataLoader(opt)
else:
    loader = DataLoaderRaw({
        'folder_path': opt.image_folder,
        'coco_json': opt.coco_json,
        'batch_size': opt.batch_size
    })
# When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json
# So make sure to use the vocab in infos file.
loader.ix_to_word = infos['vocab']

# Set sample options
loss, split_predictions, lang_stats = eval_utils.eval_split(
    cnn_model,
    model,
    crit,
    loader,
    vars(opt),
    return_attention=opt.return_attention)

print('loss: ', loss)
if lang_stats:
    print(lang_stats)

if opt.dump_json == 1:
    # dump the json
    json.dump(
        split_predictions,
        open(
            'vis/vis_' + vars(infos['opt'])['caption_model'] + '_' +
            vars(opt)['save_name'] + '.json', 'w'))
Exemple #29
0
            vars(opt).update({k: vars(infos['opt'])[k]
                              })  # copy over options from model
print(opt)
torch.manual_seed(opt.seed)
torch.cuda.manual_seed(opt.seed)
# load train/valid/test data
mytest_dset = test_dataio(opt)
# set model
model = SAModel(opt)
model.load_state_dict(torch.load(opt.model), strict=False)
model.cuda()
model.eval()
crit = LanguageModelCriterion()
classify_crit = ClassiferCriterion()

print("testing starts ...")
test_loss, predictions, lang_stats = eval_utils.eval_split(
    model, crit, classify_crit, mytest_dset, vars(opt))

test_result = {}
test_result['test_loss'] = test_loss
test_result['predictions'] = predictions
test_result['scores'] = lang_stats
#with open(os.path.join(opt.checkpoint_path,'test_result_fuxian.pkl'), 'wb') as f:
#	cPickle.dump(test_result, f)
print("testing finish !\n")

print('loss: ', test_loss)
if lang_stats:
    print(lang_stats)
model.load_state_dict(torch.load(opt.model))
model.cuda()
model.eval()
crit = utils.LanguageModelCriterion()

# Create the Data Loader instance
if len(opt.image_folder) == 0:
    loader = DataLoader(opt)
else:
    loader = DataLoaderRaw({
        'folder_path': opt.image_folder,
        'coco_json': opt.coco_json,
        'batch_size': opt.batch_size,
        'cnn_model': opt.cnn_model
    })
# When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json
# So make sure to use the vocab in infos file.
loader.ix_to_word = infos['vocab']

# Set sample options
loss, split_predictions, lang_stats = eval_utils.eval_split(
    model, crit, loader, vars(opt))

print('loss: ', loss)
if lang_stats:
    print(lang_stats)

if opt.dump_json == 1:
    # dump the json
    json.dump(split_predictions, open('vis/vis.json', 'w'))
def train(opt):
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)

    # log information
    folder_id = 'log_result'
    file_id = 'twin_show_attend_tell'
    log_file_name = os.path.join(folder_id, file_id + '.txt')
    log_file = open(log_file_name, 'w')

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt)
    model.cuda()
    back_model = models.setup(opt, reverse=True)  # True for twin-net
    back_model.cuda()

    update_lr_flag = True
    # Assure in training mode
    model.train()
    back_model.train()

    crit = utils.LanguageModelCriterion()  # define the loss criterion
    all_param = chain(model.parameters(), back_model.parameters())
    optimizer = optim.Adam(all_param,
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        # flip the masks and labels for twin-net
        reverse_labels = np.flip(data['labels'], 1).copy()
        reverse_masks = np.flip(data['masks'], 1).copy()

        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'],
            reverse_labels, data['masks'], reverse_masks
        ]
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        fc_feats, att_feats, labels, reverse_labels, masks, reverse_masks = tmp

        optimizer.zero_grad()
        out, states = model(fc_feats, att_feats, labels)
        back_out, back_states = back_model(fc_feats, att_feats, reverse_labels)
        idx = [i for i in range(back_states.size()[1] - 1, -1, -1)]
        # print (back_states.size(), back_states.size()[1])
        # print (type(idx))
        # print (idx)

        idx = torch.LongTensor(idx)
        idx = Variable(idx).cuda()
        invert_backstates = back_states.index_select(1, idx)

        # print (states.size(), back_states.size())

        # check if the back states are inverted
        # back = back_states.index_select(1, Variable(torch.LongTensor([2])).cuda())
        # forw = invert_backstates.index_select(1, Variable(torch.LongTensor([14])).cuda())
        # print (forw, back)
        # print (back_states.index_select(1, Variable(torch.LongTensor([3])).cuda()))
        # print (invert_backstates.size())

        loss = crit(out, labels[:, 1:],
                    masks[:, 1:])  # compute using the defined criterion

        back_loss = crit(back_out, reverse_labels[:, :-1],
                         reverse_masks[:, :-1])

        invert_backstates = invert_backstates.detach()
        l2_loss = ((states - invert_backstates)**2).mean()

        all_loss = loss + 1.5 * l2_loss + back_loss

        all_loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()

        # store the relevant values
        train_l2_loss = l2_loss.data[0]
        train_loss = loss.data[0]
        train_all_loss = all_loss.data[0]
        train_back_loss = back_loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, l2_loss = {:.3f}, back_loss = {:.3f}, all_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, train_l2_loss, train_back_loss, train_all_loss, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss,
                                  iteration)
                add_summary_value(tf_summary_writer, 'l2_loss', train_l2_loss,
                                  iteration)
                add_summary_value(tf_summary_writer, 'all_loss',
                                  train_all_loss, iteration)
                add_summary_value(tf_summary_writer, 'back_loss',
                                  train_back_loss, iteration)
                add_summary_value(tf_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob',
                                  model.ss_prob, iteration)
                tf_summary_writer.flush()

            log_line = 'Epoch [%d], Step [%d], all loss: %f,back_loss %f,train_l2_loss %f, train_loss %f, time %f ' % (
                epoch, iteration, train_all_loss, train_back_loss,
                train_l2_loss, train_loss, time.clock())
            log_file.write(log_line + '\n')

            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, crit, loader, eval_kwargs)

            # Write validation result into summary
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss',
                                  val_loss, iteration)
                for k, v in lang_stats.items():
                    add_summary_value(tf_summary_writer, k, v, iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
model = models.setup(opt)
model.load_state_dict(torch.load(opt.model))
model.cuda()
model.eval()
crit = utils.LanguageModelCriterion()

# Create the Data Loader instance
if len(opt.image_folder) == 0:
    loader = DataLoader(opt)
else:
    loader = DataLoaderRaw({'folder_path': opt.image_folder,
                            'coco_json': opt.coco_json,
                            'batch_size': opt.batch_size,
                            'cnn_model': opt.cnn_model})
# When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json
# So make sure to use the vocab in infos file.
loader.ix_to_word = infos['vocab']


# Set sample options
loss, split_predictions, lang_stats = eval_utils.eval_split(
    model, crit, loader,  vars(opt))

print('loss: ', loss)
if lang_stats:
    print(lang_stats)

if opt.dump_json == 1:
    # dump the json
    json.dump(split_predictions, open('vis/vis.json', 'w'))
Exemple #33
0
def train(opt):
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box: opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from_path is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from_path,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from_path,
                             'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from_path,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    #print(val_result_history.get(3000))
    #exit(0)
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    no = sum(p.numel() for p in model.parameters())
    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)
    print("Trainable Params:" + str(pytorch_total_params))
    print("Total Params:" + str(no))
    #exit(0)
    dp_model = torch.nn.DataParallel(model)

    epoch_done = True
    # Assure in training mode
    dp_model.train()
    if (opt.use_obj_mcl_loss == 1):
        mcl_crit = utils.MultiLabelClassification()
    if opt.label_smoothing > 0:
        crit = utils.LabelSmoothing(smoothing=opt.label_smoothing)
    else:
        crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from_path', None) is not None and os.path.isfile(
            os.path.join(opt.start_from_path, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from_path, 'optimizer.pth')))

    time_epoch_start = time.time()
    data_time_sum = 0
    batch_time_sum = 0
    while True:
        if epoch_done:
            torch.cuda.synchronize()
            time_epoch_end = time.time()
            time_elapsed = (time_epoch_end - time_epoch_start)
            print('[DEBUG] Epoch Time: ' + str(time_elapsed))
            print('[DEBUG] Sum Data Time: ' + str(data_time_sum))
            print('[DEBUG] Sum Batch Time: ' + str(batch_time_sum))
            #if epoch==1:
            #    exit(0)
            if not opt.noamopt and not opt.reduce_on_plateau:
                # Assign the learning rate
                if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                    frac = (epoch - opt.learning_rate_decay_start
                            ) // opt.learning_rate_decay_every
                    decay_factor = opt.learning_rate_decay_rate**frac
                    opt.current_lr = opt.learning_rate * decay_factor
                else:
                    opt.current_lr = opt.learning_rate
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            epoch_done = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)
        data_time_sum += time.time() - start
        torch.cuda.synchronize()
        start = time.time()

        if (opt.use_obj_mcl_loss == 0):
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp
        else:
            if opt.use_obj_att and opt.use_seg_feat:
                tmp = [
                    data['fc_feats'], data['att_feats'], data['obj_att_feats'],
                    data['seg_feat_feats'], data['labels'], data['masks'],
                    data['obj_labels'], data['att_masks'],
                    data['obj_att_masks'], data['seg_feat_masks']
                ]
                tmp = [
                    _ if _ is None else torch.from_numpy(_).cuda() for _ in tmp
                ]
                fc_feats, att_feats, obj_att_feats, seg_feat_feats, labels, masks, obj_labels, att_masks, obj_att_masks, seg_feat_masks = tmp
            elif not opt.use_obj_att and opt.use_seg_feat:
                tmp = [
                    data['fc_feats'], data['att_feats'],
                    data['seg_feat_feats'], data['labels'], data['masks'],
                    data['obj_labels'], data['att_masks'],
                    data['seg_feat_masks']
                ]
                tmp = [
                    _ if _ is None else torch.from_numpy(_).cuda() for _ in tmp
                ]
                fc_feats, att_feats, seg_feat_feats, labels, masks, obj_labels, att_masks, seg_feat_masks = tmp
            elif not opt.use_obj_att and not opt.use_seg_feat:
                tmp = [
                    data['fc_feats'], data['att_feats'], data['labels'],
                    data['masks'], data['obj_labels'], data['att_masks']
                ]
                tmp = [
                    _ if _ is None else torch.from_numpy(_).cuda() for _ in tmp
                ]
                fc_feats, att_feats, labels, masks, obj_labels, att_masks = tmp
            elif opt.use_obj_att and not opt.use_seg_feat:
                tmp = [
                    data['fc_feats'], data['att_feats'], data['obj_att_feats'],
                    data['labels'], data['masks'], data['obj_labels'],
                    data['att_masks'], data['obj_att_masks']
                ]
                tmp = [
                    _ if _ is None else torch.from_numpy(_).cuda() for _ in tmp
                ]
                fc_feats, att_feats, obj_att_feats, labels, masks, obj_labels, att_masks, obj_att_masks = tmp

        optimizer.zero_grad()
        if (opt.use_obj_mcl_loss == 0):
            if not sc_flag:
                loss = crit(dp_model(fc_feats, att_feats, labels, att_masks),
                            labels[:, 1:], masks[:, 1:])
            else:
                gen_result, sample_logprobs = dp_model(fc_feats,
                                                       att_feats,
                                                       att_masks,
                                                       opt={'sample_max': 0},
                                                       mode='sample')
                reward = get_self_critical_reward(dp_model, fc_feats,
                                                  att_feats, att_masks, data,
                                                  gen_result, opt)
                loss = rl_crit(sample_logprobs, gen_result.data,
                               torch.from_numpy(reward).float().cuda())
        else:
            if opt.use_obj_att and opt.use_seg_feat:
                if not sc_flag:
                    logits, out = dp_model(
                        fc_feats, [att_feats, obj_att_feats, seg_feat_feats],
                        labels, [att_masks, obj_att_masks, seg_feat_masks])
                    caption_loss = crit(logits, labels[:, 1:], masks[:, 1:])
                    obj_loss = mcl_crit(out, obj_labels)
                    loss = opt.lambda_caption * caption_loss + opt.lambda_obj * obj_loss
                    #loss = 0.1*caption_loss + obj_loss
                    #loss = caption_loss + 0 * obj_loss
                else:
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    reward = get_self_critical_reward(dp_model, fc_feats,
                                                      att_feats, att_masks,
                                                      data, gen_result, opt)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda())
            elif not opt.use_obj_att and opt.use_seg_feat:
                if not sc_flag:
                    logits, out = dp_model(fc_feats,
                                           [att_feats, seg_feat_feats], labels,
                                           [att_masks, seg_feat_masks])
                    caption_loss = crit(logits, labels[:, 1:], masks[:, 1:])
                    obj_loss = mcl_crit(out, obj_labels)
                    loss = opt.lambda_caption * caption_loss + opt.lambda_obj * obj_loss
                    #loss = caption_loss + 0 * obj_loss
                else:
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    reward = get_self_critical_reward(dp_model, fc_feats,
                                                      att_feats, att_masks,
                                                      data, gen_result, opt)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda())
            if not opt.use_obj_att and not opt.use_seg_feat:
                if not sc_flag:
                    logits, out = dp_model(fc_feats, att_feats, labels,
                                           att_masks)
                    caption_loss = crit(logits, labels[:, 1:], masks[:, 1:])
                    obj_loss = mcl_crit(out, obj_labels)
                    loss = opt.lambda_caption * caption_loss + opt.lambda_obj * obj_loss
                    #loss = caption_loss + 0 * obj_loss
                else:
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    reward = get_self_critical_reward(dp_model, fc_feats,
                                                      att_feats, att_masks,
                                                      data, gen_result, opt)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda())
            elif opt.use_obj_att and not opt.use_seg_feat:
                if not sc_flag:
                    logits, out = dp_model(fc_feats,
                                           [att_feats, obj_att_feats], labels,
                                           [att_masks, obj_att_masks])
                    caption_loss = crit(logits, labels[:, 1:], masks[:, 1:])
                    obj_loss = mcl_crit(out, obj_labels)
                    loss = 0.1 * caption_loss + obj_loss
                    #loss = caption_loss + 0 * obj_loss
                else:
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    reward = get_self_critical_reward(dp_model, fc_feats,
                                                      att_feats, att_masks,
                                                      data, gen_result, opt)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda())
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        batch_time_sum += end - start
        if not sc_flag:
            if (opt.use_obj_mcl_loss == 1):
                print("iter {} (epoch {}), train_loss = {:.3f}, caption_loss = {:.3f}, object_loss = {:.3f}, time/batch = {:.3f}" \
                      .format(iteration, epoch, train_loss, caption_loss.item(), obj_loss.item(), end - start))
            else:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            epoch_done = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            if (opt.use_obj_mcl_loss == 1):
                add_summary_value(tb_summary_writer, 'obj_loss',
                                  obj_loss.item(), iteration)
                add_summary_value(tb_summary_writer, 'caption_loss',
                                  caption_loss.item(), iteration)
            if opt.noamopt:
                opt.current_lr = optimizer.rate()
            elif opt.reduce_on_plateau:
                opt.current_lr = optimizer.current_lr
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            orig_batch_size = opt.batch_size
            opt.batch_size = 1
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            loader.batch_size = eval_kwargs.get('batch_size', 1)
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)
            opt.batch_size = orig_batch_size
            loader.batch_size = orig_batch_size

            if opt.reduce_on_plateau:
                if 'CIDEr' in lang_stats:
                    optimizer.scheduler_step(-lang_stats['CIDEr'])
                else:
                    optimizer.scheduler_step(val_loss)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            for k, v in lang_stats.items():
                add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break