def show(opt):
    """Load the best saved model and print beam-search captions for the 'show' split.

    Expects ``opt.start_from`` to point at a checkpoint directory containing
    ``infos_<id>.pkl`` and ``model-best.pth``; asserts that the saved model's
    architecture options match the command-line options before loading.
    """
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt, is_show=True)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    infos = {}
    if opt.start_from is not None:
        # Open old infos and check if models are compatible.
        # FIX: pickle files are binary; open with 'rb' (text mode breaks
        # cPickle.load under Python 3). Matches the 'rb' usage elsewhere in
        # this file.
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'), 'rb') as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "! Command line argument and saved model disagree on '%s' " % checkme
    model = models.setup(opt)
    model.cuda()
    # NOTE(review): this load assumes opt.start_from is not None even though
    # the guard above only covers the infos file — confirm callers always set it.
    model.load_state_dict(
        torch.load(os.path.join(opt.start_from, 'model-best.pth')))
    crit = utils.LanguageModelCriterion()
    # Eval model: force the 'show' split, disable language eval, and print
    # every beam (beam_size 5) for inspection.
    eval_kwargs = {}
    eval_kwargs.update(vars(opt))
    eval_kwargs.update({'split': 'show',
                        'dataset': opt.input_json,
                        'language_eval': 0,
                        'beam_size': 5,
                        'print_all_beam': True})
    val_loss, predictions, lang_stats = eval_utils.eval_split(
        model, crit, loader, eval_kwargs)
def train(opt):
    """Actor-critic / policy-gradient training loop for a captioning model.

    Optionally pre-trains an attention critic (``opt.pretrain_critic == 1``),
    then runs the main loop where the actor loss is selected by
    ``opt.rl_type`` (self-critical, REINFORCE, ARSM variants, critic-based,
    importance sampling, ...). Checkpoints model/critic/optimizer and dumps
    infos/histories pickles every ``opt.save_checkpoint_every`` iterations.

    Fixes relative to the original:
      * pickle files are opened in binary mode ('rb') — text mode breaks
        cPickle.load under Python 3;
      * ``torch.cuda.synchronize`` in the critic-pretrain loop was referenced
        but not called (missing parentheses), so the printed time/batch was
        measured without a GPU sync;
      * ``best_val_score`` is now initialised unconditionally — previously it
        was only bound when ``opt.load_best_score == 1`` but read at every
        checkpoint, raising UnboundLocalError otherwise.
    """
    # opt.use_att = utils.if_use_att(opt.caption_model)
    opt.use_att = True
    if opt.use_box:
        # Box features append 5 geometry values per region.
        opt.att_feat_size = opt.att_feat_size + 5
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    print(opt.checkpoint_path)
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # Open old infos and check if models are compatible.
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'), 'rb') as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'), 'rb') as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    critic_loss_history = histories.get('critic_loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    variance_history = histories.get('variance_history', {})
    time_history = histories.get('time_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # FIX: always bind best_val_score; it is read at every checkpoint below.
    best_val_score = None
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    dp_model = model
    # Target actor for soft (Polyak) updates, mirroring the target critic.
    target_actor = models.setup(opt).cuda()

    ####################### Critic pretrain ###################################
    # Critic with state as input was an alternative here:
    # if opt.critic_model == 'state_critic': critic_model = CriticModel(opt)
    critic_model = AttCriticModel(opt)
    target_critic = AttCriticModel(opt)
    if vars(opt).get('start_from_critic', None) is not None and True:
        # Check if all necessary files exist.
        assert os.path.isdir(opt.start_from_critic), \
            " %s must be a a path" % opt.start_from_critic
        print(os.path.join(opt.start_from_critic, opt.critic_model + '_model.pth'))
        critic_model.load_state_dict(torch.load(
            os.path.join(opt.start_from_critic, opt.critic_model + '_model.pth')))
        target_critic.load_state_dict(torch.load(
            os.path.join(opt.start_from_critic, opt.critic_model + '_model.pth')))
    critic_model = critic_model.cuda()
    target_critic = target_critic.cuda()
    critic_optimizer = utils.build_optimizer(critic_model.parameters(), opt)

    dp_model.eval()  # actor is frozen while the critic is pretrained
    critic_iter = 0
    init_scorer(opt.cached_tokens)
    critic_model.train()
    error_sum = 0
    loss_vector_sum = 0
    while opt.pretrain_critic == 1:
        if critic_iter > opt.pretrain_critic_steps:
            print('****************Finished critic training!')
            break
        data = loader.get_batch('train')
        torch.cuda.synchronize()
        start = time.time()
        tmp = [data['fc_feats'], data['att_feats'], data['labels'],
               data['masks'], data['att_masks']]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp
        critic_model.train()
        critic_optimizer.zero_grad()
        assert opt.critic_model == 'att_critic_vocab'
        crit_loss, reward, std = target_critic_loss_fun_mask(
            fc_feats, att_feats, att_masks, dp_model, critic_model, opt, data,
            target_critic, target_actor)
        crit_loss.backward()
        critic_optimizer.step()
        # Soft update of the target critic towards the online critic.
        for cp, tp in zip(critic_model.parameters(), target_critic.parameters()):
            tp.data = tp.data + opt.gamma_critic * (cp.data - tp.data)
        crit_train_loss = crit_loss.item()
        # FIX: was `torch.cuda.synchronize` (no call) — a no-op reference, so
        # the timing below did not actually wait for the GPU.
        torch.cuda.synchronize()
        end = time.time()
        error_sum += crit_train_loss**0.5 - std
        if (critic_iter % opt.losses_log_every == 0):
            print("iter {} , crit_train_loss = {:.3f}, difference = {:.3f}, difference_sum = {:.3f}, time/batch = {:.3f}" \
                .format(critic_iter, crit_train_loss**0.5, crit_train_loss**0.5 - std, error_sum, end - start))
            print(opt.checkpoint_path)
            opt.importance_sampling = 1
            critic_model.eval()
            _, _, _, _ = get_rf_loss(dp_model, fc_feats, att_feats, att_masks,
                                     data, opt, loader, critic_model,
                                     test_critic=True)
        critic_iter += 1
        # Periodically save the pretrained critic.
        if (critic_iter % opt.save_checkpoint_every == 0):
            if not os.path.isdir(opt.checkpoint_path):
                os.mkdir(opt.checkpoint_path)
            checkpoint_path = os.path.join(opt.checkpoint_path,
                                           opt.critic_model + '_model.pth')
            torch.save(critic_model.state_dict(), checkpoint_path)

    ######################### Actor-critic Training ###########################
    update_lr_flag = True
    dp_model.train()  # assure in training mode
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer state if resuming.
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    # Running moments of the flattened gradient, used for a variance estimate.
    first_order = 0
    second_order = 0
    while True:
        if update_lr_flag:
            # Assign the learning rate with step decay.
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(optimizer, opt.current_lr)
            # Assign the scheduled sampling prob.
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            # If start self critical training.
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False
            update_lr_flag = False

        # Load data from train split (0); cap the per-epoch position at 5000.
        data = loader.get_batch('train')
        if data['bounds']['it_pos_now'] > 5000:
            loader.reset_iterator('train')
            continue
        dp_model.train()
        critic_model.eval()
        torch.cuda.synchronize()
        start = time.time()
        gen_result = None
        tmp = [data['fc_feats'], data['att_feats'], data['labels'],
               data['masks'], data['att_masks']]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp

        optimizer.zero_grad()
        if not sc_flag:
            # Plain cross-entropy (teacher forcing) before self-critical kicks in.
            loss = crit(dp_model(fc_feats, att_feats, labels, att_masks),
                        labels[:, 1:], masks[:, 1:])
        else:
            # Pick the RL objective by opt.rl_type.
            if opt.rl_type == 'sc':
                gen_result, sample_logprobs = dp_model(
                    fc_feats, att_feats, att_masks,
                    opt={'sample_max': 0}, mode='sample')
                reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                                  att_masks, data, gen_result, opt)
                loss = rl_crit(sample_logprobs, gen_result.data,
                               torch.from_numpy(reward).float().cuda())
            elif opt.rl_type == 'reinforce':
                gen_result, sample_logprobs = dp_model(
                    fc_feats, att_feats, att_masks,
                    opt={'sample_max': 0}, mode='sample')
                reward = get_reward(data, gen_result, opt)
                loss = rl_crit(sample_logprobs, gen_result.data,
                               torch.from_numpy(reward).float().cuda())
            elif opt.rl_type == 'arsm':
                loss = get_arm_loss(dp_model, fc_feats, att_feats, att_masks,
                                    data, opt, loader)
                reward = np.zeros([2, 2])  # placeholder so logging below works
            elif opt.rl_type == 'rf4':
                loss, _, _, _ = get_rf_loss(dp_model, fc_feats, att_feats,
                                            att_masks, data, opt, loader)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'importance_sampling':
                opt.importance_sampling = 1
                loss, gen_result, reward, sample_logprobs_total = get_rf_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt, loader)
                reward = np.repeat(reward[:, np.newaxis], gen_result.shape[1], 1)
                std = np.std(reward)
            elif opt.rl_type == 'importance_sampling_critic':
                opt.importance_sampling = 1
                loss, gen_result, reward, sample_logprobs_total = get_rf_loss(
                    target_actor, fc_feats, att_feats, att_masks, data, opt,
                    loader, target_critic)
                reward = np.repeat(reward[:, np.newaxis], gen_result.shape[1], 1)
                std = np.std(reward)
            elif opt.rl_type == 'ar':
                loss = get_ar_loss(dp_model, fc_feats, att_feats, att_masks,
                                   data, opt, loader)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'mct_baseline':
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, mct_baseline = get_mct_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt, loader)
                reward = get_reward(data, gen_result, opt)
                reward_cuda = torch.from_numpy(reward).float().cuda()
                # Fall back to the raw reward where the MCT baseline is invalid.
                mct_baseline[mct_baseline < 0] = reward_cuda[mct_baseline < 0]
                if opt.arm_step_sample == 'greedy':
                    sample_logprobs = sample_logprobs * probs
                loss = rl_crit(sample_logprobs, gen_result.data,
                               torch.from_numpy(reward).float().cuda() - mct_baseline)
            elif opt.rl_type == 'arsm_baseline':
                opt.arm_as_baseline = 1
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, arm_baseline = get_arm_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt, loader)
                reward = get_reward(data, gen_result, opt)
                reward_cuda = torch.from_numpy(reward).float().cuda()
                arm_baseline[arm_baseline < 0] = reward_cuda[arm_baseline < 0]
                if opt.arm_step_sample == 'greedy' and False:  # disabled on purpose
                    sample_logprobs = sample_logprobs * probs
                loss = rl_crit(sample_logprobs, gen_result.data,
                               reward_cuda - arm_baseline)
            elif opt.rl_type == 'ars_indicator':
                opt.arm_as_baseline = 1
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, arm_baseline = get_arm_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt, loader)
                reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                                  att_masks, data, gen_result, opt)
                reward_cuda = torch.from_numpy(reward).float().cuda()
                # NOTE: reward is multiplied (not subtracted) by the baseline here.
                loss = rl_crit(sample_logprobs, gen_result.data,
                               reward_cuda * arm_baseline)
            elif opt.rl_type == 'arsm_baseline_critic':
                opt.arm_as_baseline = 1
                opt.rf_demean = 0
                gen_result, sample_logprobs, probs, arm_baseline = get_arm_loss(
                    dp_model, fc_feats, att_feats, att_masks, data, opt, loader,
                    critic_model)
                reward, std = get_reward(data, gen_result, opt, critic=True)
                if opt.arm_step_sample == 'greedy':
                    sample_logprobs = sample_logprobs * probs
                loss = rl_crit(sample_logprobs, gen_result.data,
                               torch.from_numpy(reward).float().cuda() - arm_baseline)
            elif opt.rl_type == 'arsm_critic':
                tic = time.time()
                loss = get_arm_loss(dp_model, fc_feats, att_feats, att_masks,
                                    data, opt, loader, critic_model)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'critic_vocab_sum':
                assert opt.critic_model == 'att_critic_vocab'
                tic = time.time()
                # (batch, seq, vocab) log-probabilities over the whole vocab.
                gen_result, sample_logprobs_total = dp_model(
                    fc_feats, att_feats, att_masks,
                    opt={'sample_max': 0}, total_probs=True, mode='sample')
                # Prepend the BOS (0) token for the critic input.
                gen_result_pad = torch.cat([
                    gen_result.new_zeros(gen_result.size(0), 1, dtype=torch.long),
                    gen_result], 1)
                tic = time.time()
                critic_value = critic_model(gen_result_pad, fc_feats, att_feats,
                                            True, opt, att_masks)  # batch, seq, vocab
                # Expected critic value under the policy distribution.
                probs = torch.sum(
                    F.softmax(sample_logprobs_total, 2) * critic_value.detach(), 2)
                mask = (gen_result > 0).float()
                mask = torch.cat(
                    [mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1)
                loss = -torch.sum(probs * mask) / torch.sum(mask)
                reward = np.zeros([2, 2])
            elif opt.rl_type == 'reinforce_critic':
                # TODO change the critic to attention
                if opt.critic_model == 'state_critic':
                    critic_value, gen_result, sample_logprobs = critic_model(
                        dp_model, fc_feats, att_feats, opt, att_masks)
                    reward, std = get_reward(data, gen_result, opt, critic=True)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda()
                                   - critic_value[:, :-1].data)
                elif opt.critic_model == 'att_critic':
                    gen_result, sample_logprobs = dp_model(
                        fc_feats, att_feats, att_masks,
                        opt={'sample_max': 0}, mode='sample')
                    gen_result_pad = torch.cat([
                        gen_result.new_zeros(gen_result.size(0), 1,
                                             dtype=torch.long), gen_result], 1)
                    critic_value = critic_model(gen_result_pad, fc_feats,
                                                att_feats, True, opt,
                                                att_masks).squeeze(2)
                    reward, std = get_reward(data, gen_result, opt, critic=True)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda()
                                   - critic_value.data)

        if opt.mle_weights != 0:
            # Mix in a weighted MLE term alongside the RL objective.
            loss += opt.mle_weights * crit(
                dp_model(fc_feats, att_feats, labels, att_masks),
                labels[:, 1:], masks[:, 1:])
        # TODO make sure all sampling replaced by greedy for critic

        #### update the actor
        loss.backward()
        ## Compute a running gradient-variance estimate (EMA of 1st/2nd moments).
        gradient = torch.zeros([0]).cuda()
        for i in model.parameters():
            gradient = torch.cat((gradient, i.grad.view(-1)), 0)
        first_order = 0.9999 * first_order + 0.0001 * gradient
        second_order = 0.9999 * second_order + 0.0001 * gradient.pow(2)
        variance = torch.mean(torch.abs(second_order - first_order.pow(2))).item()
        if opt.rl_type != 'arsm' or not sc_flag:
            utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()

        ### update the critic
        if 'critic' in opt.rl_type:
            dp_model.eval()
            critic_model.train()
            utils.set_lr(critic_optimizer, opt.critic_learning_rate)
            critic_optimizer.zero_grad()
            assert opt.critic_model == 'att_critic_vocab'
            crit_loss, reward, std = target_critic_loss_fun_mask(
                fc_feats, att_feats, att_masks, dp_model, critic_model, opt,
                data, target_critic, target_actor, gen_result=gen_result,
                sample_logprobs_total=sample_logprobs_total, reward=reward)
            crit_loss.backward()
            critic_optimizer.step()
            # Soft updates of both target networks.
            for cp, tp in zip(critic_model.parameters(), target_critic.parameters()):
                tp.data = tp.data + opt.gamma_critic * (cp.data - tp.data)
            for cp, tp in zip(dp_model.parameters(), target_actor.parameters()):
                tp.data = tp.data + opt.gamma_actor * (cp.data - tp.data)
            crit_train_loss = crit_loss.item()
            error_sum += crit_train_loss**0.5 - std

        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if (iteration % opt.losses_log_every == 0):
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
                print(opt.checkpoint_path)
            elif 'critic' in opt.rl_type:
                print(
                    "iter {} , crit_train_loss = {:.3f}, difference = {:.3f}, difference_sum = {:.3f},variance = {:g}, time/batch = {:.3f}" \
                    .format(iteration, crit_train_loss ** 0.5,
                            crit_train_loss ** 0.5 - std, error_sum, variance,
                            end - start))
                print(opt.checkpoint_path)
                critic_model.eval()
                _, _, _, _ = get_rf_loss(dp_model, fc_feats, att_feats,
                                         att_masks, data, opt, loader,
                                         critic_model, test_critic=True)
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, variance = {:g}, time/batch = {:.3f}" \
                    .format(iteration, epoch, np.mean(reward[:, 0]), variance,
                            end - start))

        # Update the iteration and epoch.
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary.
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
            add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward', np.mean(reward), iteration)
                add_summary_value(tb_summary_writer, 'variance', variance, iteration)
            loss_history[iteration] = train_loss if not sc_flag else np.mean(reward)
            critic_loss_history[iteration] = crit_train_loss if 'critic' in opt.rl_type else 0
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob
            variance_history[iteration] = variance
            time_history[iteration] = end - start

        # Make evaluation on validation set, and save model.
        if (iteration % opt.save_checkpoint_every == 0):
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)
            # Write validation result into summary.
            add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {'loss': val_loss,
                                             'lang_stats': lang_stats,
                                             'predictions': predictions}
            # Save model if it is improving on validation result.
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss
            best_flag = False
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
            if not os.path.isdir(opt.checkpoint_path):
                os.mkdir(opt.checkpoint_path)
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_path = os.path.join(opt.checkpoint_path,
                                           opt.critic_model + '_model.pth')
            torch.save(critic_model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

            # Dump miscellaneous information for resuming.
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()
            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['critic_loss_history'] = critic_loss_history
            histories['lr_history'] = lr_history
            histories['ss_prob_history'] = ss_prob_history
            histories['variance_history'] = variance_history
            histories['time'] = time_history
            with open(os.path.join(opt.checkpoint_path,
                                   'infos_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(os.path.join(opt.checkpoint_path,
                                   'histories_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)
            if best_flag:
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                with open(os.path.join(opt.checkpoint_path,
                                       'infos_' + opt.id + '-best.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)

        # Stop if reaching max epochs.
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    """XE + feature-reconstruction training loop with optional self-critical RL.

    NOTE(review): this module defines `train` more than once; this later
    definition shadows the earlier actor-critic `train` at import time —
    confirm which one callers intend to use.

    Fixes relative to the original:
      * pickle reads opened in binary mode ('rb') — text mode breaks
        cPickle.load under Python 3;
      * ``best_val_score`` initialised unconditionally (was UnboundLocalError
        when ``opt.load_best_score != 1``);
      * the per-iteration log line labelled 'Read data:' now prints the actual
        data-loading time instead of the compute time.
    """
    # Load data.
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    # Tensorboard summaries (they're great!)
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    # Load pretrained model, info file, histories file.
    infos = {}
    histories = {}
    if opt.start_from is not None:
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'), 'rb') as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'), 'rb') as f:
                histories = cPickle.load(f)
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    #ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # FIX: always bind best_val_score; it is read at every checkpoint below.
    best_val_score = None
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    # Create model.
    model = models.setup(opt).cuda()
    #pretrained_dict = torch.load(opt.model)
    #model.load_state_dict(pretrained_dict, strict=False)
    num_params = get_n_params(model)
    print('number of parameteres:', num_params)
    dp_model = torch.nn.DataParallel(model)
    dp_model.train()

    # Loss functions.
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    # Optimizer and learning rate adjustment flag.
    optimizer = utils.build_optimizer(model.parameters(), opt)
    update_lr_flag = True
    # Load the optimizer state if resuming.
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    # Training loop.
    while True:
        # Update learning rate once per epoch.
        if update_lr_flag:
            # Assign the learning rate with step decay.
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(optimizer, opt.current_lr)
            # Scheduled sampling is disabled for this variant (frac kept for
            # the commented-out lines below).
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                #opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac, opt.scheduled_sampling_max_prob)
                #model.ss_prob = opt.ss_prob
            # If start self critical training.
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False
            update_lr_flag = False

        # Load data from train split (0).
        start = time.time()
        data = loader.get_batch('train')
        data_time = time.time() - start
        start = time.time()

        # Unpack data.
        torch.cuda.synchronize()
        tmp = [data['fc_feats'], data['att_feats'], data['labels'],
               data['dist'], data['masks'], data['att_masks']]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, dist_label, masks, att_masks = tmp

        # Forward pass and loss.
        optimizer.zero_grad()
        if not sc_flag:
            wordact, reconstruct = dp_model(fc_feats, att_feats, labels)
            #loss_dist = F.binary_cross_entropy(dist, dist_label.cpu().float())
            # Reconstruction target: max-pooled attention features.
            fc_feats_max, _ = att_feats.max(1)
            loss_rec = F.mse_loss(reconstruct.cpu(), fc_feats_max.cpu())
            mask = masks[:, 1:].contiguous()
            wordact = wordact[:, :, :-1]
            wordact_t = wordact.permute(0, 2, 1).contiguous()
            wordact_t = wordact_t.view(wordact_t.size(0) * wordact_t.size(1), -1)
            # NOTE(review): assumes 6 captions x 30 tokens per image — confirm
            # against the data layout.
            labels = labels.contiguous().view(-1, 6 * 30).cpu()
            wordclass_v = labels[:, 1:]
            wordclass_t = wordclass_v.contiguous().view(
                wordclass_v.size(0) * wordclass_v.size(1), 1)
            # Only score positions where the mask is nonzero.
            maskids = torch.nonzero(mask.view(-1).cpu()).numpy().reshape(-1)
            loss_xe = F.cross_entropy(
                wordact_t[maskids, ...],
                wordclass_t[maskids, ...].contiguous().view(maskids.shape[0]))
            loss = 5 * loss_xe + loss_rec
        else:
            gen_result, sample_logprobs = dp_model(
                fc_feats, att_feats, att_masks,
                opt={'sample_max': 0}, mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                              att_masks, data, gen_result, opt)
            loss = rl_crit(sample_logprobs, gen_result.data,
                           torch.from_numpy(reward).float().cuda())

        # Backward pass.
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()

        # Print.
        total_time = time.time() - start
        if iteration % opt.print_freq == 1:
            # FIX: was printing `time.time() - start` (compute time) under the
            # 'Read data:' label; data_time is the actual load time.
            print('Read data:', data_time)
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, data_time = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, data_time, total_time))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, data_time = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, np.mean(reward[:, 0]), data_time, total_time))

        # Update the iteration and epoch.
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary.
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
            add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
            #add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward', np.mean(reward[:, 0]), iteration)
            loss_history[iteration] = train_loss if not sc_flag else np.mean(reward[:, 0])
            lr_history[iteration] = opt.current_lr
            #ss_prob_history[iteration] = model.ss_prob

        # Validate and save model (checkpointing only starts after 60k iters).
        if (iteration >= 60000 and iteration % opt.save_checkpoint_every == 0):
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

            # Evaluate model.
            eval_kwargs = {'split': 'test', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)

            # Write validation result into summary.
            add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {'loss': val_loss,
                                             'lang_stats': lang_stats,
                                             'predictions': predictions}

            # Our metric is CIDEr if available, otherwise validation loss.
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            # Save model in checkpoint path.
            best_flag = False
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

            # Dump miscellaneous information for resuming.
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()
            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['lr_history'] = lr_history
            #histories['ss_prob_history'] = ss_prob_history
            with open(os.path.join(opt.checkpoint_path,
                                   'infos_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(os.path.join(opt.checkpoint_path,
                                   'histories_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)

            # Save model to unique file if new best model.
            if best_flag:
                model_fname = 'model-best.pth'
                infos_fname = 'model-best.pkl'
                checkpoint_path = os.path.join(opt.checkpoint_path, model_fname)
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                with open(os.path.join(opt.checkpoint_path, infos_fname), 'wb') as f:
                    cPickle.dump(infos, f)

        # Stop if reaching max epochs.
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def eval(opt, model_name, infos_name, annFile, listener, split, iteration):
    """Evaluate a saved AlternatingJointModel checkpoint on one split.

    NOTE(review): this function shadows the builtin ``eval``; renaming would
    break callers, so it is left as-is.

    :param opt: argparse-style namespace; selectively overridden from the
        saved infos below (order of the overrides matters).
    :param model_name: path to the model state-dict to load.
    :param infos_name: path to the pickled infos file saved at train time.
    :param annFile: annotation file handed through to eval_split.
    :param listener: if 'gt', additionally load the saved retrieval ('vse.')
        weights from opt.initialize_retrieval.
    :param split: dataset split to evaluate.
    :param iteration: passed to the AlternatingJointModel constructor.
    :return: dict with 'loss', 'split_predictions' and 'lang_stats'.
    """
    # Input arguments and options
    # Load infos (latin1 encoding lets Python 3 read Python 2 pickles).
    with open(infos_name, 'rb') as f:
        infos = cPickle.load(f, encoding='latin1')
    # For the case that we run eval not immediately after train, so arguments
    # are not exist. 'att_hid_size' is just one possible test to find out.
    if not hasattr(opt, 'att_hid_size'):
        opt = infos['opt']
    opt.split = split
    opt.beam_size = 2
    # Fixed seed so repeated evaluations are comparable.
    np.random.seed(123)
    # Override and collect parameters: anything left empty/zero on the command
    # line falls back to the value stored with the checkpoint.
    if len(opt.input_fc_dir) == 0:
        opt.input_fc_dir = infos['opt'].input_fc_dir
        opt.input_att_dir = infos['opt'].input_att_dir
        opt.input_label_h5 = infos['opt'].input_label_h5
    if len(opt.input_json) == 0:
        opt.input_json = infos['opt'].input_json
    if opt.batch_size == 0:
        opt.batch_size = infos['opt'].batch_size
    if len(opt.id) == 0:
        opt.id = infos['opt'].id
    # if opt.initialize_retrieval == None:
    #     opt.initialize_retrieval = infos['opt'].initialize_retrieval
    # Options exempt from the (currently disabled) consistency check below.
    ignore = [
        "id", "batch_size", "beam_size", "start_from", "language_eval",
        "initialize_retrieval", 'decoding_constraint', 'evaluation_retrieval',
        "input_fc_dir", "input_att_dir", "input_label_h5", 'seq_per_img',
        'closest_num', 'closest_file'
    ]
    # for k in vars(infos['opt']).keys():
    #     if k not in ignore:
    #         if k in vars(opt) and getattr(opt, k) is not None:
    #             assert vars(opt)[k] == vars(infos['opt'])[k], k + ' option not consistent:' + str(vars(opt)[k])+' '+ str(vars(infos['opt'])[k])
    #         else:
    #             vars(opt).update({k: vars(infos['opt'])[k]})  # copy over options from model
    vocab = infos['vocab']  # ix -> word mapping
    # assert opt.closest_num == opt.seq_per_img
    opt.vse_loss_weight = vars(opt).get('vse_loss_weight', 1)
    opt.caption_loss_weight = vars(opt).get('caption_loss_weight', 1)
    opt.cider_optimization = 0
    # Setup the model
    model = models.AlternatingJointModel(opt, iteration)
    # model = models.JointModel(opt)
    utils.load_state_dict(model, torch.load(model_name))
    if listener == 'gt':
        print('gt listener is loaded for evaluation')
        # utils.load_state_dict(model.vse, torch.load(opt.initialize_retrieval))
        # Overwrite only the retrieval ('vse.') parameters with the saved ones.
        utils.load_state_dict(
            model, {
                k: v
                for k, v in torch.load(opt.initialize_retrieval).items()
                if 'vse.' in k
            })
    model.cuda()
    model.eval()
    # Create the Data Loader instance
    loader = DataLoader(opt)
    # Set sample options
    loss, split_predictions, lang_stats = eval_utils.eval_split(
        model, loader, vars(opt), annFile, useGenSent=True)
    return {
        'loss': loss,
        'split_predictions': split_predictions,
        'lang_stats': lang_stats
    }
def train(opt):
    """Train a captioning model with cross-entropy loss.

    Runs an endless batch loop (until ``opt.max_epochs``), periodically
    logging to TensorFlow summaries, evaluating on the 'val' split, and
    checkpointing model/optimizer/infos/histories under
    ``opt.checkpoint_path``. Resumes from ``opt.start_from`` if given.

    Fixes vs. previous version:
      * ``best_val_score`` was only bound when ``opt.load_best_score == 1``
        but read unconditionally later -> NameError; now always initialized.
      * ``lang_stats`` may be None when ``opt.language_eval != 1``; the
        summary-writing loop now guards against that (matching the other
        train variant in this file).
      * Removed the no-op ``if True:`` wrapper around the checkpoint code.
    """
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    # `tf` may be None when tensorflow is unavailable; short-circuit then.
    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'), 'rb') as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'), 'rb') as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # BUGFIX: always bind best_val_score; it is compared against below even
    # when opt.load_best_score != 1 (previously raised NameError).
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    else:
        best_val_score = None

    model = models.setup(opt)
    model.cuda()
    if opt.multi_gpu:
        model = nn.DataParallel(model)

    update_lr_flag = True
    # Assure in training mode
    model.train()

    crit = utils.LanguageModelCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)

    # Load the optimizer state when resuming
    if vars(opt).get('start_from', None) is not None:
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate (stepwise exponential decay per epoch)
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate ** frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
        tmp = [Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks = tmp

        optimizer.zero_grad()
        # labels[:, 0] is the BOS token; loss is computed on the shifted tail.
        loss = crit(model(fc_feats, att_feats, labels), labels[:, 1:], masks[:, 1:])
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        # NOTE(review): loss.data[0] is the pre-0.4 PyTorch scalar API,
        # consistent with the Variable usage above; kept for compatibility.
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
              .format(iteration, epoch, train_loss, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss, iteration)
                add_summary_value(tf_summary_writer, 'learning_rate', opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
                tf_summary_writer.flush()
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(model, crit, loader, eval_kwargs)

            # Write validation result into summary
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss', val_loss, iteration)
                # BUGFIX: lang_stats is None when language_eval is off;
                # guard before iterating (matches the other train() variant).
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        add_summary_value(tf_summary_writer, k, v, iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {'loss': val_loss,
                                             'lang_stats': lang_stats,
                                             'predictions': predictions}

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

            # Dump miscalleous informations
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()

            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['lr_history'] = lr_history
            histories['ss_prob_history'] = ss_prob_history
            with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(os.path.join(opt.checkpoint_path, 'histories_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)

            if best_flag:
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '-best.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
# Create the Data Loader instance if len(opt.image_folder) == 0: loader = DataLoader(opt) else: loader = DataLoaderRaw({ 'folder_path': opt.image_folder, 'coco_json': opt.coco_json, 'batch_size': opt.batch_size, 'cnn_model': opt.cnn_model }) # When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json # So make sure to use the vocab in infos file. loader.ix_to_word = infos['vocab'] #opt.id = '+'.join([_+str(__) for _,__ in zip(opt.ids, opt.weights)]) # Set sample options split_predictions_list = [] # opt.verbose_beam = 0 for order in range(opt.number_of_models): loss, split_predictions, lang_stats = eval_utils.eval_split( multi_models_list[order], crit, loader, vars(opt)) split_predictions_list.append(split_predictions) print('loss: ', loss) if lang_stats: print(lang_stats) if opt.dump_json == 1: # dump the json json.dump(split_predictions, open('vis/vis.json', 'w'))
def train(opt):
    """Train the SAModel video-captioning model with early stopping.

    Outer loop iterates epochs; inner loop iterates mini-batches from a
    fresh DataLoader each epoch. Checkpoints ('model.pth' / 'model-best.pth'
    plus infos/histories pickles) are written every
    ``opt.save_checkpoint_every`` iterations. Training stops on
    ``opt.patience`` consecutive non-improving validations or on
    ``opt.max_epochs``.
    """
    # load train/valid/test data
    opt.vocab_size = get_nwords(opt.data_path)
    opt.category_size = get_nclasses(opt.data_path)
    mytrain_dset, myvalid_dset, mytest_dset = loaddset(opt)
    # `tf` may be None when tensorflow is unavailable; short-circuit then.
    logger = tf and Logger(opt.checkpoint_path)

    # init or load training infos
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '-best.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["rnn_size", "num_layers"]  # optim needn't same
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(os.path.join(opt.start_from, 'histories_' + opt.id + '-best.pkl')):
            with open(os.path.join(opt.start_from, 'histories_' + opt.id + '-best.pkl')) as f:
                histories = cPickle.load(f)
        # random seed must be inherited if didn't assign it.
        if opt.seed == 0:
            opt.seed = infos['opt'].seed

    # Iteration/epoch counters restart even when resuming (by design here).
    iteration = 1  #infos.get('iter', 0) + 1
    epoch = 0  #infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    else:
        best_val_score = None

    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)

    model = SAModel(opt)
    if opt.start_from is not None:
        # check if all necessary files exist
        assert os.path.isdir(opt.start_from), " %s must be a a path" % opt.start_from
        # strict=False: tolerate missing/extra keys in the checkpoint
        model.load_state_dict(torch.load(os.path.join(opt.start_from, 'model-best.pth')),
                              strict=False)
    model.cuda()
    model.train()

    crit = LanguageModelCriterion()
    classify_crit = ClassiferCriterion()
    rl_crit = RewardCriterion()

    # select optimizer
    if opt.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate,
                               weight_decay=opt.weight_decay)
    elif opt.optim == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr=1.0,
                                   weight_decay=opt.weight_decay)
        opt.learning_rate_decay_start = -1  # adadelta adapts lr itself

    # training start
    tmp_patience = 0  # consecutive validations without improvement
    # each epoch
    while True:
        update_lr_flag = True
        # when a new epoch start, set update_lr_flag to True
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0 and opt.optim != 'adadelta':
                frac = int((epoch - opt.learning_rate_decay_start) / opt.learning_rate_decay_every)
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                myutils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = int((epoch - opt.scheduled_sampling_start) / opt.scheduled_sampling_increase_every)
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                myutils.init_cider_scorer(opt.reward_type)
            else:
                sc_flag = False
            update_lr_flag = False

        #loading train data (a fresh shuffled loader each epoch)
        myloader_train = DataLoader(mytrain_dset,
                                    batch_size=opt.batch_size,
                                    collate_fn=data_io.collate_fn,
                                    shuffle=True)
        torch.cuda.synchronize()
        for data, cap, cap_mask, cap_classes, class_mask, feat1, feat2, feat_mask, lens, groundtruth, image_id in myloader_train:
            start = time.time()
            cap = Variable(cap, requires_grad=False).cuda()
            cap_mask = Variable(cap_mask, requires_grad=False).cuda()
            cap_classes = Variable(cap_classes, requires_grad=False).cuda()  # (m, seq_len+1)
            class_mask = Variable(class_mask, requires_grad=False).cuda()
            feat1 = Variable(feat1, requires_grad=False).cuda()
            feat2 = Variable(feat2, requires_grad=False).cuda()
            feat_mask = Variable(feat_mask, requires_grad=False).cuda()

            # Rotate class labels right by one position (BOS-style shift).
            cap_classes = torch.cat([cap_classes[:, -1:], cap_classes[:, :-1]], dim=-1)  # (m, seq_len+1)

            # new_mask marks all positions up to (and including) the last
            # nonzero entry of class_mask per sample.
            new_mask = torch.zeros_like(class_mask)  # (m, seq_len+1)
            for i in range(class_mask.size(0)):
                index = np.argwhere(class_mask.data[i, :] != 0)[0][-1]  # last nonzero position in mask i
                new_mask[i, :index + 1] = 1.0

            optimizer.zero_grad()
            if not sc_flag:
                out = model(feat1, feat2, feat_mask, cap, cap_mask, cap_classes,
                            new_mask)  # (m,seq_len+1,n_words),(m, seq_len+1, n_classes)
                loss = classify_crit(out, cap_classes, cap_mask, class_mask)
            else:
                # Self-critical branch is disabled: exits before the code
                # below runs.
                print('Not implement yet.')
                exit(11)
                # NOTE(review): dead code below — `feat` is undefined here
                # (probably feat1 and/or feat2); must be fixed before
                # enabling this branch.
                gen_result, sample_logprobs = model.sample(
                    feat1, feat2, feat_mask, {'sample_max': 0})
                reward = myutils.get_self_critical_reward(
                    model, feat, feat_mask, groundtruth, gen_result)  # (m,max_length)
                loss = rl_crit(
                    sample_logprobs, gen_result,
                    Variable(torch.from_numpy(reward).float().cuda(),
                             requires_grad=False))

            loss.backward()
            myutils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            # pre-0.4 PyTorch scalar access, consistent with Variable above
            train_loss = loss.data[0]
            torch.cuda.synchronize()
            end = time.time()
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, np.mean(reward[:, 0]), end - start))

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                if tf is not None:
                    logger.scalar_summary('train_loss', train_loss, iteration)
                    logger.scalar_summary('learning_rate', opt.current_lr, iteration)
                    logger.scalar_summary('scheduled_sampling_prob', model.ss_prob, iteration)
                    if sc_flag:
                        logger.scalar_summary('avg_reward', np.mean(reward[:, 0]), iteration)
                loss_history[iteration] = train_loss if not sc_flag else np.mean(reward[:, 0])
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                print('validation and save the model...')
                time.sleep(3)
                eval_kwargs = {}
                eval_kwargs.update(vars(opt))  # attend vars(opt) into eval_kwargs
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    model, crit, classify_crit, myvalid_dset, eval_kwargs)
                print('val loss: {}'.format(val_loss))
                print('validation is finish!')
                time.sleep(3)
                if tf is not None:
                    logger.scalar_summary('validation loss', val_loss, iteration)
                    if opt.language_eval == 1:
                        for tag, value in lang_stats.items():
                            if type(value) is list:
                                logger.scalar_summary(tag, value[-1], iteration)
                            else:
                                logger.scalar_summary(tag, value, iteration)
                    # parameter and gradient histograms for every weight
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.histo_summary(tag, value.data.cpu().numpy(), iteration)
                        logger.histo_summary(tag + '/grad',
                                             (value.grad).data.cpu().numpy(), iteration)
                val_result_history[iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                    #current_score = -val_loss
                else:
                    current_score = -val_loss

                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                    tmp_patience = 0
                else:
                    tmp_patience += 1

                if not os.path.exists(opt.checkpoint_path):
                    os.mkdir(opt.checkpoint_path)
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))

                # Dump miscalleous informations(current information)
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['val_score'] = lang_stats
                infos['val_sents'] = predictions

                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(os.path.join(opt.checkpoint_path, 'histories_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '-best.pkl'), 'wb') as f:
                        cPickle.dump(infos, f)
                    with open(os.path.join(opt.checkpoint_path, 'histories_' + opt.id + '-best.pkl'), 'wb') as f:
                        cPickle.dump(histories, f)

                # early stop: leave the batch loop
                if tmp_patience >= opt.patience:
                    break
            iteration += 1

        if tmp_patience >= opt.patience:
            print("early stop, trianing is finished!")
            break
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            print("reach max epochs, training is finished!")
            break
        epoch += 1
def train(opt):
    """Train a captioning model jointly with a finetunable CNN feature extractor.

    Wraps the model in a LossWrapper + DataParallel, supports Noam / plateau
    LR schedules, optional self-critical training, CNN finetuning after
    ``opt.finetune_cnn_after`` epochs, and checkpointing (including on
    RuntimeError/KeyboardInterrupt). Finishes with a 'test'-split evaluation.
    """
    # Deal with feature things before anything
    opt.use_fc, opt.use_att = utils.if_use_feat(opt.caption_model)
    if opt.use_box:
        opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    # `tb` may be None when tensorboard is unavailable; short-circuit then.
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'), 'rb') as f:
                histories = utils.pickle_load(f)
    else:
        # fresh run: seed the infos dict
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['iterators'] = loader.iterators
        infos['split_ix'] = loader.split_ix
        infos['vocab'] = loader.get_vocab()
        infos['opt'] = opt

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # NOTE(review): best_val_score is unbound when opt.load_best_score != 1
    # but is read unconditionally below -> NameError; should get an
    # `else: best_val_score = None` branch.
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    # cnn_model = utils.build_cnn(opt)
    # NOTE(review): hard-coded absolute checkpoint path — should come from opt.
    cnn_model = create_extractor(
        "/root/PycharmProjects/vgg_vae_best_model.pth")
    cnn_model = cnn_model.cuda()
    if vars(opt).get('start_from', None) is not None:
        cnn_model.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'model-cnn.pth')))
        print("load cnn model parameters from {}".format(
            os.path.join(opt.start_from, 'model-cnn.pth')))

    model = models.setup(opt).cuda()
    dp_model = torch.nn.DataParallel(model)
    lw_model = LossWrapper(model, opt)
    dp_lw_model = torch.nn.DataParallel(lw_model)
    # dp_lw_model = lw_model
    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()

    # Optimizer selection: Noam schedule (transformer only), plateau
    # scheduler wrapper, or a plain optimizer from opt.
    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model, factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # if opt.finetune_cnn_after != -1:
    #     # only finetune the layer2 to layer4
    cnn_optimizer = optim.Adam([{
        'params': module.parameters()
    } for module in cnn_model.finetune_modules],
                               lr=opt.cnn_learning_rate,
                               weight_decay=opt.cnn_weight_decay)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        if os.path.isfile(os.path.join(opt.start_from, "optimizer.pth")):
            optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        if opt.finetune_cnn_after != -1:
            if os.path.isfile(os.path.join(opt.start_from, 'optimizer-cnn.pth')):
                cnn_optimizer.load_state_dict(
                    torch.load(os.path.join(opt.start_from, 'optimizer-cnn.pth')))

    def save_checkpoint(model, cnn_model, infos, optimizer, cnn_optimizer,
                        histories=None, append=''):
        # Save model + cnn + optimizer(s) + infos (+ histories) under
        # opt.checkpoint_path; `append` distinguishes e.g. '-best' files.
        if len(append) > 0:
            append = '-' + append
        # if checkpoint_path doesn't exist
        if not os.path.isdir(opt.checkpoint_path):
            os.makedirs(opt.checkpoint_path)
        checkpoint_path = os.path.join(opt.checkpoint_path, 'model%s.pth' % (append))
        torch.save(model.state_dict(), checkpoint_path)
        print("model saved to {}".format(checkpoint_path))
        cnn_checkpoint_path = os.path.join(opt.checkpoint_path, 'model-cnn%s.pth' % (append))
        torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
        print("cnn model saved to {}".format(cnn_checkpoint_path))
        optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer%s.pth' % (append))
        torch.save(optimizer.state_dict(), optimizer_path)
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            cnn_optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer%s-cnn.pth' % (append))
            torch.save(cnn_optimizer.state_dict(), cnn_optimizer_path)
        with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '%s.pkl' % (append)), 'wb') as f:
            utils.pickle_dump(infos, f)
        if histories:
            with open(os.path.join(opt.checkpoint_path, 'histories_' + opt.id + '%s.pkl' % (append)), 'wb') as f:
                utils.pickle_dump(histories, f)

    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    # set the decayed rate
                    utils.set_lr(optimizer, opt.current_lr)
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                      opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob
                # Update the training stage of cnn: frozen until
                # finetune_cnn_after, then trainable except fixed_modules.
                if opt.finetune_cnn_after == -1 or epoch < opt.finetune_cnn_after:
                    for p in cnn_model.parameters():
                        p.requires_grad = False
                    cnn_model.eval()
                else:
                    for p in cnn_model.parameters():
                        p.requires_grad = True
                    # Fix the first few layers:
                    for module in cnn_model.fixed_modules:
                        for p in module.parameters():
                            p.requires_grad = False
                    cnn_model.train()
                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False
                epoch_done = False

            start = time.time()
            # Load data from train split (0)
            data = loader.get_batch('train')
            torch.cuda.synchronize()
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()

            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp
            # att_feats arrives flattened (e.g. 8x672x224); reshape to raw
            # RGB images and run them through the CNN.
            # assumes 224x224 inputs — TODO confirm against the loader
            att_feats = att_feats.view(att_feats.size(0), 3, 224, 224)
            att_feats, fc_feats = cnn_model(att_feats)
            # fc_feats = att_feats.mean(3).mean(2)
            # att_feats = torch.nn.functional.adaptive_avg_pool2d(
            #     att_feats, [7, 7]).permute(0, 2, 3, 1)
            # NCHW -> N,49,C spatial attention features (7x7 grid)
            att_feats = att_feats.permute(0, 2, 3, 1)
            att_feats = att_feats.view(att_feats.size(0), 49, -1)
            # Repeat each image's features seq_per_img times so they align
            # with the per-caption labels.
            att_feats = att_feats.unsqueeze(1).expand(*((
                att_feats.size(0),
                opt.seq_per_img,
            ) + att_feats.size()[1:])).contiguous().view(
                (att_feats.size(0) * opt.seq_per_img), -1,
                att_feats.size()[-1])
            fc_feats = fc_feats.unsqueeze(1).expand(*((
                fc_feats.size(0),
                opt.seq_per_img,
            ) + fc_feats.size()[1:])).contiguous().view(
                *((fc_feats.size(0) * opt.seq_per_img, ) +
                  fc_feats.size()[1:]))

            optimizer.zero_grad()
            if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
                cnn_optimizer.zero_grad()
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks, att_masks,
                                    data['gts'],
                                    torch.arange(0, len(data['gts'])), sc_flag)
            loss = model_out['loss'].mean()
            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
                utils.clip_gradient(cnn_optimizer, opt.grad_clip)
                cnn_optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, model_out['reward'].mean(),
                              end - start))

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
                add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
                if sc_flag:
                    add_summary_value(tb_summary_writer, 'avg_reward',
                                      model_out['reward'].mean(), iteration)
                loss_history[iteration] = train_loss if not sc_flag else model_out['reward'].mean()
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    cnn_model, model, lw_model.crit, loader, eval_kwargs)

                if opt.reduce_on_plateau:
                    # Plateau scheduler steps on -CIDEr when available,
                    # otherwise on the validation loss.
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)

                # Write validation result into summary
                add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        add_summary_value(tb_summary_writer, k, v, iteration)
                val_result_history[iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss

                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history

                save_checkpoint(model, cnn_model, infos, optimizer,
                                cnn_optimizer, histories)
                if opt.save_history_ckpt:
                    save_checkpoint(model, cnn_model, infos, optimizer,
                                    cnn_optimizer, append=str(iteration))
                if best_flag:
                    save_checkpoint(model, cnn_model, infos, optimizer,
                                    cnn_optimizer, append='best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        # Best-effort checkpoint on crash/interrupt; execution then falls
        # through to the final test evaluation below.
        print('Save ckpt on exception ...')
        save_checkpoint(model, cnn_model, infos, optimizer, cnn_optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)

    # test model
    test_kwargs = {'split': 'test', 'dataset': opt.input_json}
    test_kwargs.update(vars(opt))
    val_loss, predictions, lang_stats = eval_utils.eval_split(
        cnn_model, model, lw_model.crit, loader, test_kwargs)
    if opt.reduce_on_plateau:
        if 'CIDEr' in lang_stats:
            optimizer.scheduler_step(-lang_stats['CIDEr'])
        else:
            optimizer.scheduler_step(val_loss)
    # Write validation result into summary
    add_summary_value(tb_summary_writer, 'test loss', val_loss, iteration)
    if lang_stats is not None:
        for k, v in lang_stats.items():
            add_summary_value(tb_summary_writer, k, v, iteration)
    val_result_history[iteration] = {
        'loss': val_loss,
        'lang_stats': lang_stats,
        'predictions': predictions
    }
def train(opt):
    """Train the captioning model with word / SAP / MAD losses.

    Starts with cross-entropy training and switches to self-critical (SCST)
    training once ``epoch >= opt.self_critical_after``.  Supports gradient
    accumulation (``opt.accumulate_number``), data-parallel multi-GPU
    execution, periodic validation, and checkpointing of model, optimizer,
    ``infos`` and ``histories`` under ``opt.checkpoint_path``.

    Fixes vs. previous revision:
      * ``histories['word_loss_history']`` was assigned from ``loss_history``
        instead of ``word_loss_history``, so the saved word-loss curve was a
        duplicate of the overall loss curve.
      * Log files were opened with the invalid mode string ``'aw'`` (raises
        ``ValueError`` on Python 3); plain append mode ``'a'`` is used.
    """
    # ---- dataloader and derived options -------------------------------
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    # ---- checkpoint directory (one sub-dir per experiment id) ---------
    opt.checkpoint_path = os.path.join(opt.checkpoint_path, opt.id)
    isExists = os.path.exists(opt.checkpoint_path)
    if not isExists:
        os.makedirs(opt.checkpoint_path)
        os.makedirs(opt.checkpoint_path + '/logs')
        print(opt.checkpoint_path + ' creating !')
    else:
        print(opt.checkpoint_path + ' already exists!')

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    # ---- optionally resume: load infos/histories and sanity-check -----
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(
                os.path.join(
                    opt.checkpoint_path, 'infos_' + opt.id +
                    format(int(opt.start_from), '04') + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            # these options must agree between the saved model and the CLI
            need_be_same = [
                "caption_model", "att_feat_size", "rnn_size",
                "input_encoding_size"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(
                os.path.join(
                    opt.checkpoint_path, 'histories_' + opt.id +
                    format(int(opt.start_from), '04') + '.pkl')):
            with open(
                    os.path.join(
                        opt.checkpoint_path, 'histories_' + opt.id +
                        format(int(opt.start_from), '04') + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    word_loss_history = histories.get('word_loss_history', {})
    MAD_loss_history = histories.get('MAD_loss_history', {})
    SAP_loss_history = histories.get('SAP_loss_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    lr_history = histories.get('lr_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    # ---- model / optimizer setup --------------------------------------
    threshold = opt.threshold  # CIDEr threshold above which we always checkpoint
    sc_flag = False            # becomes True once SCST training starts
    num_gpu = opt.num_gpu
    model = models.setup(opt).cuda(device=0)
    model.train()
    update_lr_flag = True
    dp_model = torch.nn.parallel.DataParallel(model)

    optimizer = optim.Adam(model.parameters(),
                           opt.learning_rate,
                           (opt.optim_alpha, opt.optim_beta),
                           opt.optim_epsilon,
                           weight_decay=opt.weight_decay)
    # Load the optimizer state when resuming
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(
                opt.checkpoint_path, 'optimizer' + opt.id +
                format(int(opt.start_from), '04') + '.pth')):
        optimizer.load_state_dict(
            torch.load(
                os.path.join(
                    opt.checkpoint_path, 'optimizer' + opt.id +
                    format(int(opt.start_from), '04') + '.pth')))

    # restore scheduled-sampling probability for the resumed epoch
    if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
        frac = (epoch - opt.scheduled_sampling_start
                ) // opt.scheduled_sampling_increase_every
        opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                          opt.scheduled_sampling_max_prob)
        model.ss_prob = opt.ss_prob

    optimizer.zero_grad()
    accumulate_iter = 0
    train_loss = 0

    # Markov transition matrix used by the MAD loss; replicated once per GPU
    # so DataParallel can scatter one copy to each device.
    subsequent_mat = np.load('data/markov_mat.npy')
    subsequent_mat = torch.from_numpy(subsequent_mat).cuda(device=0).float()
    subsequent_mat_all = subsequent_mat.clone()
    for i in range(opt.num_gpu - 1):
        subsequent_mat_all = torch.cat([subsequent_mat_all, subsequent_mat],
                                       dim=0)

    # ---- main training loop -------------------------------------------
    while True:
        if update_lr_flag:
            # Assign the learning rate (step decay per epoch)
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            for group in optimizer.param_groups:
                group['lr'] = opt.current_lr
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            # Switch to self-critical training when its start epoch is reached
            if sc_flag == False and opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                print('initializing CIDEr scorer...')
                s = time.time()
                global CiderD_scorer
                if (CiderD_scorer is None):
                    CiderD_scorer = CiderD(df=opt.cached_tokens)  # takes about 30s
                print('initlizing CIDEr scorers in {:3f}s'.format(
                    time.time() - s))
                sc_flag = True
                # default 5 for xe, 10 for scst
                opt.learning_rate_decay_every = opt.learning_rate_decay_every * 2
            update_lr_flag = False

        print('current_lr is {}'.format(opt.current_lr))
        start = time.time()
        data = loader.get_batch('train', opt.batch_size)
        torch.cuda.synchronize()

        fc_feats = None
        att_feats = None
        tmp = [
            data['fc_feats'], data['labels'], data['masks'],
            data['att_feats'], data['attr_labels'], data['subsequent_labels']
        ]
        tmp = [
            _ if _ is None else torch.from_numpy(_).cuda(device=0) for _ in tmp
        ]
        fc_feats, labels, masks, att_feats, attr_labels, subsequent_labels = tmp

        # convert 1-1000 to 0-999 (perhaps done in preprocessing);
        # entries that become negative are padding and get masked out.
        subsequent_labels = subsequent_labels - 1
        subsequent_mask = (subsequent_labels[:, 1:] >= 0).float()
        subsequent_labels = torch.where(
            subsequent_labels > 0, subsequent_labels,
            torch.zeros_like(subsequent_labels).int().cuda(device=0))
        print('Read and process data:', time.time() - start)

        if not sc_flag:
            # Cross-entropy phase: combined word + SAP + MAD losses,
            # scaled for gradient accumulation.
            SAP_loss, word_loss, MAD_loss = dp_model(
                fc_feats, att_feats, labels, masks, attr_labels,
                subsequent_labels, subsequent_mask, subsequent_mat_all)
            SAP_loss = SAP_loss.mean()
            word_loss = word_loss.mean()
            MAD_loss = MAD_loss.mean()
            accumulate_iter = accumulate_iter + 1
            loss = (word_loss + 0.2 * SAP_loss +
                    0.2 * MAD_loss) / opt.accumulate_number
            loss.backward()
        else:
            # SCST phase: sampled captions vs. greedy baseline.
            st = time.time()
            # indexs for sampling by probabilities
            sm = torch.zeros([num_gpu, 1]).cuda(device=0)
            gen_result, sample_logprobs, _ = dp_model(fc_feats,
                                                      att_feats,
                                                      attr_labels,
                                                      subsequent_mat_all,
                                                      sm,
                                                      mode='sample')
            dp_model.eval()
            with torch.no_grad():
                greedy_res, _, _ = dp_model(fc_feats,
                                            att_feats,
                                            attr_labels,
                                            subsequent_mat_all,
                                            mode='sample')
            dp_model.train()
            ed = time.time()
            print('GPU time is : {}s'.format(ed - st))
            reward = get_self_critical_reward(gen_result, greedy_res,
                                              data['gts'])
            word_loss = dp_model(sample_logprobs, gen_result.data,
                                 torch.from_numpy(reward).float().cuda(),
                                 mode='scst_forward')
            word_loss = word_loss.mean()
            loss = word_loss
            # extra forward to keep minimizing SAP loss and MAD loss
            SAP_loss, _, MAD_loss = dp_model(fc_feats, att_feats, labels,
                                             masks, attr_labels,
                                             subsequent_labels,
                                             subsequent_mask,
                                             subsequent_mat_all)
            SAP_loss = SAP_loss.mean()
            MAD_loss = MAD_loss.mean()
            loss = loss + 0.2 * SAP_loss + 0.2 * MAD_loss
            loss.backward()
            accumulate_iter = accumulate_iter + 1

        # Apply accumulated gradients every opt.accumulate_number batches.
        if accumulate_iter % opt.accumulate_number == 0:
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            iteration += 1
            accumulate_iter = 0
            train_loss = loss.item() * opt.accumulate_number
            end = time.time()
            # (training log could additionally be written to
            #  opt.checkpoint_path + '/logs/' here if needed)
            if not sc_flag:
                print("iter {} (epoch {}), SAP_loss = {:.3f}, word_loss = {:.3f}, MAD_loss = {:.3f} time/batch = {:.3f}" \
                    .format(iteration, epoch,SAP_loss, word_loss,MAD_loss, end - start))
            else:
                print("iter {} (epoch {}),SAP_loss = {:.3f}, avg_reward = {:.3f},MAD_loss = {:.3f} time/batch = {:.3f}" \
                    .format(iteration, epoch,SAP_loss,np.mean(reward[:, 0]),MAD_loss, end - start))
        torch.cuda.synchronize()

        # Update the iteration and epoch
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0) and (
                accumulate_iter % opt.accumulate_number == 0):
            add_summary_value(tb_summary_writer, 'word_loss',
                              word_loss.item(), iteration)
            add_summary_value(tb_summary_writer, 'MAD_loss', MAD_loss.item(),
                              iteration)
            add_summary_value(tb_summary_writer, 'SAP_loss', SAP_loss.item(),
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)
            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            word_loss_history[iteration] = word_loss.item()
            SAP_loss_history[iteration] = SAP_loss.item()
            MAD_loss_history[iteration] = MAD_loss.item()
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0) and (
                accumulate_iter % opt.accumulate_number == 0):
            # eval model
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.input_json,
                'num_images': -1,
                'index_eval': 1,
                'id': opt.id,
                'beam': opt.beam,
                'verbose_loss': 1,
                'checkpoint_path': opt.checkpoint_path
            }
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats, precision, recall = eval_utils.eval_split(
                dp_model, loader, subsequent_mat_all, eval_kwargs)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # append language metrics to the per-run log file
            # (mode 'a', not the invalid 'aw')
            f_lang = open(opt.checkpoint_path + '/logs/lang_' + opt.id + '.txt',
                          'a')
            f_lang.write(
                str(iteration) + ' ' +
                str(iteration / opt.save_checkpoint_every) + '\n')
            f_lang.write('val loss ' + str(val_loss) + '\n')
            for key_lang in lang_stats:
                f_lang.write(key_lang + ' ' + str(lang_stats[key_lang]) + '\n')
            f_lang.write('precision ' + str(precision) + ' recall ' +
                         str(recall) + '\n')
            f_lang.close()

            # Save model if it improves on the validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss
            best_flag = False
            save_id = iteration / opt.save_checkpoint_every
            # only save improved models, or any model whose CIDEr-D exceeds
            # the given threshold
            if best_val_score is None or current_score > best_val_score or current_score > threshold:
                best_val_score = current_score
                best_flag = True
                checkpoint_path = os.path.join(
                    opt.checkpoint_path,
                    'model' + opt.id + format(int(save_id), '04') + '.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(
                    opt.checkpoint_path,
                    'optimizer' + opt.id + format(int(save_id), '04') + '.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # record the lang stats for the saved model (append mode)
                f_lang = open(
                    opt.checkpoint_path + '/logs/Best_lang_' + opt.id + '.txt',
                    'a')
                f_lang.write(
                    str(iteration) + ' ' +
                    str(iteration / opt.save_checkpoint_every) + '\n')
                f_lang.write('val loss ' + str(val_loss) + '\n')
                for key_lang in lang_stats:
                    f_lang.write(key_lang + ' ' + str(lang_stats[key_lang]) +
                                 '\n')
                f_lang.write('precision ' + str(precision) + ' recall ' +
                             str(recall) + '\n')
                f_lang.close()

                # Dump miscellaneous information for resuming.
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                # FIX: previously this stored loss_history under the
                # word_loss_history key, losing the real word-loss curve.
                histories['word_loss_history'] = word_loss_history
                histories['MAD_loss_history'] = MAD_loss_history
                histories['SAP_loss_history'] = SAP_loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(
                            opt.checkpoint_path, 'infos_' + opt.id +
                            format(int(save_id), '04') + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(
                            opt.checkpoint_path, 'histories_' + opt.id +
                            format(int(save_id), '04') + '.pkl'), 'wb') as f:
                    cPickle.dump(histories, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    """Training loop for a phrase-mask captioning model with a saliency loss.

    Trains with cross-entropy plus a saliency alignment loss on the attention
    weights (``alphas``), logs to TensorFlow 1.x summaries (``tf.summary``),
    and tracks best validation/test scores per metric (Bleu_4, METEOR, CIDEr),
    saving a separate "best" checkpoint for each (split, metric) pair.

    NOTE(review): written for pre-0.4 PyTorch (``Variable``, ``loss.data[0]``)
    and TF1 ``FileWriter``; hard-codes two GPUs via ``device_ids = [0, 1]``.
    """
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.maxlen_sen
    opt.inc_seg = loader.inc_seg
    opt.seg_ix = loader.seg_ix
    # tf may be None (import guarded elsewhere); then no summaries are written
    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)
    infos = {}
    histories = {}
    score_list = []
    if opt.start_from is not None:
        # open old infos and check if models are compatible with the CLI opts
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)
    # restore counters/histories when resuming (defaults otherwise)
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # best_val_score is a nested dict: best_val_score[split][metric] -> float
    # (the initial None assignment is immediately overwritten)
    best_val_score = None
    best_val_score = {}
    score_splits = ['val', 'test']
    score_type = ['Bleu_4', 'METEOR', 'CIDEr']
    for split_i in score_splits:
        for score_item in score_type:
            if split_i not in best_val_score.keys():
                best_val_score[split_i] = {}
            best_val_score[split_i][score_item] = 0.0
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', best_val_score)
    # model setup: DataParallel over two fixed GPUs
    model = models.setup(opt)
    device_ids = [0, 1]
    torch.cuda.set_device(device_ids[0])
    model = nn.DataParallel(model, device_ids=device_ids)
    model = model.cuda()
    update_lr_flag = True
    # Assure in training mode
    model.module.train()
    crit = utils.LanguageModelCriterion()
    optimizer = optim.Adam(model.module.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    #optimizer = nn.DataParallel(optimizer, device_ids=device_ids)
    # Load the optimizer when resuming
    if vars(opt).get('start_from', None) is not None:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
    while True:
        if update_lr_flag:
            # Assign the learning rate (step decay per epoch)
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.module.ss_prob = opt.ss_prob
            update_lr_flag = False
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)
        torch.cuda.synchronize()
        start = time.time()
        tmp = [data['fc_feats'], data['labels'], data['x_phrase_mask_0'], data['x_phrase_mask_1'], \
               data['label_masks'], data['salicy_seg'], data['seg_mask']]
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        fc_feats, seq, phrase_mask_0, phrase_mask_1, masks, salicy_seg, seg_mask = tmp
        optimizer.zero_grad()
        # first `remove_len` tokens are dropped from the target side
        remove_len = 2
        outputs, alphas = model.module(fc_feats, seq, phrase_mask_0,
                                       phrase_mask_1, masks, seg_mask,
                                       remove_len)
        # seq/masks appear to be laid out (len, batch); permuted to
        # (batch, len) for the criterion — TODO confirm against DataLoader
        loss = crit(outputs, seq[remove_len:, :].permute(1, 0),
                    masks[remove_len:, :].permute(1, 0))
        alphas = alphas.permute(1, 0, 2)
        salicy_seg = salicy_seg[:, :, :]
        seg_mask = seg_mask[:, :]
        # Saliency loss: align attention weights with saliency targets,
        # either soft (l2 / KL) or hard (negative log of attended saliency).
        if opt.salicy_hard == False:
            if opt.salicy_loss_type == 'l2':
                salicy_loss = (((((salicy_seg * seg_mask[:, :, None] -
                                   alphas * seg_mask[:, :, None])**2).sum(0)
                                 ).sum(-1))**(0.5)).mean()
            if opt.salicy_loss_type == 'kl':
                # alphas: len_sen, batch_size, num_frame
                salicy_loss = kullback_leibler2(
                    alphas * seg_mask[:, :, None],
                    salicy_seg * seg_mask[:, :, None])
                salicy_loss = (((salicy_loss *
                                 seg_mask[:, :, None]).sum(-1)).sum(0)).mean()
        elif opt.salicy_hard == True:
            # salicy: len_sen, batch_size, num_frame
            salicy_loss = -torch.log((alphas * salicy_seg).sum(-1) + 1e-8)
            # salicy_loss: len_sen, batch_size
            salicy_loss = ((salicy_loss * seg_mask).sum(0)).mean()
        loss = loss + opt.salicy_alpha * salicy_loss
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))
        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss,
                                  iteration)
                add_summary_value(tf_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob',
                                  model.module.ss_prob, iteration)
                tf_summary_writer.flush()
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.module.ss_prob
        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.dataset,
                'remove_len': remove_len
            }
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats, score_list_i = eval_utils.eval_split(
                model.module, crit, loader, eval_kwargs)
            score_list.append(score_list_i)
            np.savetxt('./save/train_valid_test.txt', score_list, fmt='%.3f')
            # Write validation result into summary
            # lang_stats here is nested: lang_stats[split][metric]
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss',
                                  val_loss, iteration)
                for k in lang_stats.keys():
                    for v in lang_stats[k].keys():
                        add_summary_value(tf_summary_writer, k + v,
                                          lang_stats[k][v], iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }
            # Save model if it is improving on the validation result
            if opt.language_eval == 1:
                current_score = lang_stats['val']['CIDEr']
            else:
                current_score = -val_loss
            # best_flag mirrors best_val_score's split/metric structure
            best_flag = {}
            for split_i in score_splits:
                for score_item in score_type:
                    if split_i not in best_flag.keys():
                        best_flag[split_i] = {}
                    best_flag[split_i][score_item] = False
            if True:  # if true (checkpoint is always written at this point)
                for split_i in score_splits:
                    for score_item in score_type:
                        if best_val_score is None or lang_stats[split_i][
                                score_item] > best_val_score[split_i][
                                    score_item]:
                            best_val_score[split_i][score_item] = lang_stats[
                                split_i][score_item]
                            best_flag[split_i][score_item] = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.module.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                # Dump miscellaneous information needed for resuming
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)
                # one "best" checkpoint per (split, metric) that improved
                for split_i in score_splits:
                    for score_item in score_type:
                        if best_flag[split_i][score_item]:
                            checkpoint_path = os.path.join(
                                opt.checkpoint_path, 'model-best_' + split_i +
                                '_' + score_item + '.pth')
                            torch.save(model.module.state_dict(),
                                       checkpoint_path)
                            print("model saved to {}".format(checkpoint_path))
                            with open(
                                    os.path.join(
                                        opt.checkpoint_path,
                                        'infos_' + split_i + '_' + score_item +
                                        '_' + opt.id + '-best.pkl'),
                                    'wb') as f:
                                cPickle.dump(infos, f)
        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def validation(opts, model, criterion, optimizer, loader, info, history,
               device, val_result_history, iteration, best_val_score,
               stage_id):
    """Run one validation pass, checkpoint the model, and persist bookkeeping.

    Evaluates on the 'val' split, records the result in
    ``val_result_history``, always writes ``model.pth`` / ``optimizer.pth``
    plus the info/history pickles, and additionally writes ``model-best.pth``
    (with its own info pickle) when the score improves on ``best_val_score``.
    The score is CIDEr when ``opts.language_eval == 1``, otherwise the
    negated validation loss.

    Returns (val_loss, lang_stats).
    """
    logging.info("Start validation")

    # Evaluate on the validation split.
    eval_kwargs = dict(vars(opts))
    eval_kwargs.update({
        'split': 'val',
        'dataset': opts.input_json,
        'device': device,
        'stage': stage_id
    })
    val_loss, predictions, lang_stats = eval_utils.eval_split(
        model, criterion, loader, eval_kwargs)
    val_result_history[iteration] = {
        'loss': val_loss,
        'lang_stats': lang_stats,
        'predictions': predictions
    }

    # Pick the comparison score: CIDEr when language eval is on, else -loss.
    current_score = lang_stats[
        'CIDEr'] if opts.language_eval == 1 else -val_loss
    is_best = best_val_score is None or current_score > best_val_score
    if is_best:
        best_val_score = current_score

    # Always write the latest model and optimizer checkpoints.
    checkpoint_path = os.path.join(opts.checkpoint_path, 'model.pth')
    torch.save(model.state_dict(), checkpoint_path)
    logging.info("model saved to {}".format(checkpoint_path))
    torch.save(optimizer.state_dict(),
               os.path.join(opts.checkpoint_path, 'optimizer.pth'))

    # Persist the bookkeeping dicts needed to resume training later.
    info['best_val_score'] = best_val_score
    info['opts'] = opts
    info['vocabulary'] = loader.get_vocab()
    info['stage'] = stage_id
    history['val_result_history'] = val_result_history
    info_path = os.path.join(opts.checkpoint_path,
                             'info_' + opts.train_id + '.pkl')
    with open(info_path, 'wb') as f:
        cPickle.dump(info, f)
    history_path = os.path.join(opts.checkpoint_path,
                                'history_' + opts.train_id + '.pkl')
    with open(history_path, 'wb') as f:
        cPickle.dump(history, f)
    logging.info("Checkpoint Saved")

    # Keep a separate copy of the best-scoring model.
    if is_best:
        best_path = os.path.join(opts.checkpoint_path, 'model-best.pth')
        torch.save(model.state_dict(), best_path)
        logging.info("model saved to {}".format(best_path))
        best_info_path = os.path.join(opts.checkpoint_path,
                                      'info_' + opts.train_id + '-best.pkl')
        with open(best_info_path, 'wb') as f:
            cPickle.dump(info, f)

    logging.info("validation complete")
    return val_loss, lang_stats
def train(opt):
    """Training loop with joint CNN finetuning and a multilabel auxiliary loss.

    Extracts per-image features through ``cnn_model`` when CNN finetuning is
    active, trains the caption model with cross-entropy (later SCST with an
    added multilabel / MLE term), and periodically validates and checkpoints
    both the caption model and the CNN.

    NOTE(review): written for pre-0.4 PyTorch (``Variable``, ``loss.data[0]``)
    and Python 2 (``cPickle``, slicing ``dict.values()``).
    """
    opt.use_att = utils.if_use_att(opt.caption_model)
    from dataloader import DataLoader
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.vocab_ccg_size = loader.vocab_ccg_size
    opt.seq_length = loader.seq_length
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible with the CLI opts
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)
    # restore counters/histories when resuming (defaults otherwise)
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    # CNN feature extractor + caption model
    cnn_model = utils.build_cnn(opt)
    cnn_model.cuda()
    model = models.setup(opt)
    model.cuda()
    # model = DataParallel(model)
    if vars(opt).get('start_from', None) is not None:
        # check if all necessary files exist before loading weights
        assert os.path.isdir(
            opt.start_from), " %s must be a a path" % opt.start_from
        assert os.path.isfile(
            os.path.join(opt.start_from, "infos_" + opt.id + ".pkl")
        ), "infos.pkl file does not exist in path %s" % opt.start_from
        model.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'model.pth')))
    update_lr_flag = True
    model.train()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    multilabel_crit = nn.MultiLabelSoftMarginLoss().cuda()
    # optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate, weight_decay=opt.weight_decay)
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    # separate optimizer for the CNN tail (layers after the first five),
    # only built once finetuning has started
    if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
        print('finetune mode')
        cnn_optimizer = optim.Adam([\
            {'params': module.parameters()} for module in cnn_model._modules.values()[5:]\
            ], lr=opt.cnn_learning_rate, weight_decay=opt.cnn_weight_decay)
    # Load optimizer state(s) when resuming
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        if os.path.isfile(os.path.join(opt.start_from, 'optimizer.pth')):
            optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            if os.path.isfile(os.path.join(opt.start_from,
                                           'optimizer-cnn.pth')):
                cnn_optimizer.load_state_dict(
                    torch.load(
                        os.path.join(opt.start_from, 'optimizer-cnn.pth')))
    # initial validation pass before training starts
    eval_kwargs = {'split': 'val', 'dataset': opt.input_json, 'verbose': True}
    eval_kwargs.update(vars(opt))
    val_loss, predictions, lang_stats = eval_utils.eval_split(
        cnn_model, model, crit, loader, eval_kwargs, True)
    epoch_start = time.time()
    while True:
        if update_lr_flag:
            # Assign the learning rate (step decay per epoch)
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
                #model.module.ss_prob = opt.ss_prob
            # enable self-critical training after the configured epoch
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
            else:
                sc_flag = False
            # Update the training stage of cnn
            for p in cnn_model.parameters():
                p.requires_grad = True
            # Fix the first few layers:
            for module in cnn_model._modules.values()[:5]:
                for p in module.parameters():
                    p.requires_grad = False
            cnn_model.train()
            update_lr_flag = False
            # keep batch-norm layers frozen/eval while finetuning the CNN
            cnn_model.apply(utils.set_bn_fix)
            cnn_model.apply(utils.set_bn_eval)
        start = time.time()
        torch.cuda.synchronize()
        data = loader.get_batch('train')
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            # Finetune path: run each image through the CNN one at a time
            # (images differ in size), then replicate features per caption.
            multilabels = [
                data['detection_infos'][i]['label']
                for i in range(len(data['detection_infos']))
            ]
            tmp = [
                data['labels'], data['masks'],
                np.array(multilabels, dtype=np.int16)
            ]
            tmp = [
                Variable(torch.from_numpy(_), requires_grad=False).cuda()
                for _ in tmp
            ]
            labels, masks, multilabels = tmp
            images = data[
                'images']  # it cannot be turned into tensor since different sizes.
            _fc_feats_2048 = []
            _fc_feats_81 = []
            _att_feats = []
            for i in range(loader.batch_size):
                x = Variable(torch.from_numpy(images[i]),
                             requires_grad=False).cuda()
                x = x.unsqueeze(0)
                att_feats, fc_feats_81 = cnn_model(x)
                # 2048-d fc feature = spatial mean of the attention map
                fc_feats_2048 = att_feats.mean(3).mean(2).squeeze()
                att_feats = F.adaptive_avg_pool2d(att_feats,
                                                  [14, 14]).squeeze().permute(
                                                      1, 2, 0)  #(0, 2, 3, 1)
                _fc_feats_2048.append(fc_feats_2048)
                _fc_feats_81.append(fc_feats_81)
                _att_feats.append(att_feats)
            _fc_feats_2048 = torch.stack(_fc_feats_2048)
            _fc_feats_81 = torch.stack(_fc_feats_81)
            _att_feats = torch.stack(_att_feats)
            # repeat each image's features seq_per_img times so they align
            # with the per-caption labels
            att_feats = _att_feats.unsqueeze(1).expand(*((_att_feats.size(0), loader.seq_per_img,) + \
                _att_feats.size()[1:])).contiguous().view(*((_att_feats.size(0) * loader.seq_per_img,) + \
                _att_feats.size()[1:]))
            fc_feats_2048 = _fc_feats_2048.unsqueeze(1).expand(*((_fc_feats_2048.size(0), loader.seq_per_img,) + \
                _fc_feats_2048.size()[1:])).contiguous().view(*((_fc_feats_2048.size(0) * loader.seq_per_img,) + \
                _fc_feats_2048.size()[1:]))
            fc_feats_81 = _fc_feats_81
            # cnn_optimizer.zero_grad()
        else:
            # Pre-extracted-features path.
            # NOTE(review): this branch defines fc_feats/att_feats only, but
            # the loss code below uses fc_feats_2048 / fc_feats_81 /
            # multilabels — a NameError if finetuning is not active. It looks
            # like this script is only ever run with finetuning enabled;
            # verify before relying on the non-finetune path.
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks']
            ]
            tmp = [
                Variable(torch.from_numpy(_), requires_grad=False).cuda()
                for _ in tmp
            ]
            fc_feats, att_feats, labels, masks = tmp
        optimizer.zero_grad()
        if not sc_flag:
            # XE phase: caption loss + multilabel detection loss
            loss1 = crit(model(fc_feats_2048, att_feats, labels),
                         labels[:, 1:], masks[:, 1:])
            loss2 = multilabel_crit(fc_feats_81.double(), multilabels.double())
            loss = 0.8 * loss1 + 0.2 * loss2.float()
        else:
            # SCST phase: policy-gradient loss with a small XE/multilabel mix
            gen_result, sample_logprobs = model.sample(fc_feats_2048,
                                                       att_feats,
                                                       {'sample_max': 0})
            reward = get_self_critical_reward(model, fc_feats_2048, att_feats,
                                              data, gen_result)
            loss1 = rl_crit(
                sample_logprobs, gen_result,
                Variable(torch.from_numpy(reward).float().cuda(),
                         requires_grad=False))
            loss2 = multilabel_crit(fc_feats_81.double(), multilabels.double())
            loss3 = crit(model(fc_feats_2048, att_feats, labels),
                         labels[:, 1:], masks[:, 1:])
            loss = 0.995 * loss1 + 0.005 * (loss2.float() + loss3)
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.data[0]
        mle_loss = loss1.data[0]
        multilabel_loss = loss2.data[0]
        torch.cuda.synchronize()
        end = time.time()
        # periodic console logging (every 2500 iterations)
        if not sc_flag and iteration % 2500 == 0:
            print("iter {} (epoch {}), mle_loss = {:.3f}, multilabel_loss = {:.3f}, train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, mle_loss, multilabel_loss, train_loss, end - start))
        if sc_flag and iteration % 2500 == 0:
            print("iter {} (epoch {}), avg_reward = {:.3f}, mle_loss = {:.3f}, multilabel_loss = {:.3f}, train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), mle_loss, multilabel_loss, train_loss, end - start))
        iteration += 1
        # record loss/lr/ss_prob history
        if (iteration % opt.losses_log_every == 0):
            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob
        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.input_json,
                'verbose': True
            }
            eval_kwargs.update(vars(opt))
            # last argument tells eval_split whether the CNN is being finetuned
            if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    cnn_model, model, crit, loader, eval_kwargs, True)
            else:
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    cnn_model, model, crit, loader, eval_kwargs, False)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss
            best_flag = False
            if True:
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                # always write the "latest" checkpoints
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-cnn.pth')
                torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
                print("cnn model saved to {}".format(cnn_checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
                    cnn_optimizer_path = os.path.join(opt.checkpoint_path,
                                                      'optimizer-cnn.pth')
                    torch.save(cnn_optimizer.state_dict(), cnn_optimizer_path)
                # dump resume information
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)
                # extra copies for the best-scoring model
                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                                       'model-cnn-best.pth')
                    torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
                    print("cnn model saved to {}".format(cnn_checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)
        # epoch boundary bookkeeping
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
            print("epoch: " + str(epoch) + " during: " +
                  str(time.time() - epoch_start))
            epoch_start = time.time()
        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    """Train a captioning model with tqdm progress, gradient accumulation and
    per-epoch checkpointing.

    Supports resuming from ``opt.start_from`` (model infos / histories /
    optimizer state are reloaded), optional Noam or reduce-on-plateau LR
    schedules, scheduled sampling, and self-critical (RL) training after
    ``opt.self_critical_after`` epochs.

    Args:
        opt: argparse-style namespace of all training options. Mutated in
            place (e.g. ``vocab_size``, ``seq_length``, ``current_lr``).
    """
    print("=================Training Information==============")
    print("start from {}".format(opt.start_from))
    print("box from {}".format(opt.input_box_dir))
    print("input json {}".format(opt.input_json))
    print("attributes from {}".format(opt.input_att_dir))
    print("features from {}".format(opt.input_fc_dir))
    print("batch size ={}".format(opt.batch_size))
    print("#GPU={}".format(torch.cuda.device_count()))
    # Deal with feature things before anything
    opt.use_fc, opt.use_att = utils.if_use_feat(opt.caption_model)
    if opt.use_box:
        # box features append 5 geometry values per region
        opt.att_feat_size = opt.att_feat_size + 5
    # number of mini-batches to accumulate before an optimizer step
    acc_steps = getattr(opt, 'acc_steps', 1)
    # normalize the checkpoint-name suffix to start with '_'
    name_append = opt.name_append
    if len(name_append) > 0 and name_append[0] != '-':
        name_append = '_' + name_append
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    # NOTE(review): `write_summary` is read from enclosing/module scope here
    # and copied onto opt — confirm it is defined at module level.
    opt.write_summary = write_summary
    if opt.write_summary:
        print("write summary to {}".format(opt.checkpoint_path))
        tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        # NOTE(review): infos file is 'infos<append>.pkl' (no underscore before
        # append) while histories below is 'histories_<append>.pkl' — verify
        # this matches what save_checkpoint() writes.
        infors_path = os.path.join(opt.start_from,
                                   'infos' + name_append + '.pkl')
        print("Load model information {}".format(infors_path))
        with open(infors_path, 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        histories_path = os.path.join(opt.start_from,
                                      'histories_' + name_append + '.pkl')
        if os.path.isfile(histories_path):
            with open(histories_path, 'rb') as f:
                histories = utils.pickle_load(f)
    else:
        # start from scratch
        print("Initialize training process from all begining")
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['iterators'] = loader.iterators
        infos['split_ix'] = loader.split_ix
        infos['vocab'] = loader.get_vocab()
    infos['opt'] = opt
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    # sanity check for the saved model name has a correct index
    if opt.name_append.isdigit() and int(opt.name_append) < 100:
        assert int(
            opt.name_append
        ) == epoch, "dismatch in the model index and the real epoch number"
        # resume from the epoch AFTER the saved one
        epoch += 1
    print(
        "==================start from {} epoch================".format(epoch))
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    # image index within the train split to resume the progress bar from
    start_Img_idx = loader.iterators['train']
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    # vocab is only needed for model construction; drop it so opt stays picklable
    del opt.vocab
    dp_model = torch.nn.DataParallel(model)
    lw_model = LossWrapper(model, opt)  # wrap loss into model
    dp_lw_model = torch.nn.DataParallel(lw_model)
    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()
    if opt.noamopt:
        assert opt.caption_model in [
            'transformer', 'aoa'
        ], 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer state when resuming
    if vars(opt).get('start_from', None) is not None:
        optimizer_path = os.path.join(opt.start_from,
                                      'optimizer' + name_append + '.pth')
        if os.path.isfile(optimizer_path):
            print("Loading optimizer............")
            optimizer.load_state_dict(torch.load(optimizer_path))

    def save_checkpoint(model, infos, optimizer, histories=None, append=''):
        """Persist model/optimizer/infos (and optionally histories) to
        opt.checkpoint_path, suffixing filenames with '_<append>'."""
        if len(append) > 0:
            append = '_' + append
        # create checkpoint_path if it doesn't exist
        if not os.path.isdir(opt.checkpoint_path):
            os.makedirs(opt.checkpoint_path)
        checkpoint_path = os.path.join(opt.checkpoint_path,
                                       'model%s.pth' % (append))
        torch.save(model.state_dict(), checkpoint_path)
        print("Save model state to {}".format(checkpoint_path))
        optimizer_path = os.path.join(opt.checkpoint_path,
                                      'optimizer%s.pth' % (append))
        torch.save(optimizer.state_dict(), optimizer_path)
        print("Save model optimizer to {}".format(optimizer_path))
        with open(
                os.path.join(opt.checkpoint_path,
                             'infos' + '%s.pkl' % (append)), 'wb') as f:
            utils.pickle_dump(infos, f)
            print("Save training information to {}".format(
                os.path.join(opt.checkpoint_path,
                             'infos' + '%s.pkl' % (append))))
        if histories:
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'histories_' + '%s.pkl' % (append)),
                    'wb') as f:
                utils.pickle_dump(histories, f)
                # NOTE(review): this message includes opt.id but the file
                # actually written above does not — the print is misleading.
                print("Save training historyes to {}".format(
                    os.path.join(opt.checkpoint_path, 'histories_' + opt.id +
                                 '%s.pkl' % (append))))

    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate (step decay every
                    # learning_rate_decay_every epochs after decay_start)
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start
                                ) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start
                            ) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(
                        opt.scheduled_sampling_increase_prob * frac,
                        opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob
                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False
                epoch_done = False
            print("{}th Epoch Training starts now!".format(epoch))
            with tqdm(total=len(loader.split_ix['train']),
                      initial=start_Img_idx) as pbar:
                for i in range(start_Img_idx, len(loader.split_ix['train']),
                               opt.batch_size):
                    start = time.time()
                    # linear LR warmup for the first noamopt_warmup iterations
                    if (opt.use_warmup == 1) and (iteration < opt.noamopt_warmup):
                        opt.current_lr = opt.learning_rate * (
                            iteration + 1) / opt.noamopt_warmup
                        utils.set_lr(optimizer, opt.current_lr)
                    # Load data from train split (0)
                    data = loader.get_batch('train')
                    # only zero grads at an accumulation boundary
                    if (iteration % acc_steps == 0):
                        optimizer.zero_grad()
                    torch.cuda.synchronize()
                    start = time.time()
                    tmp = [
                        data['fc_feats'], data['att_feats'], data['labels'],
                        data['masks'], data['att_masks']
                    ]
                    tmp = [_ if _ is None else _.cuda() for _ in tmp]
                    fc_feats, att_feats, labels, masks, att_masks = tmp
                    model_out = dp_lw_model(fc_feats, att_feats, labels, masks,
                                            att_masks, data['gts'],
                                            torch.arange(0, len(data['gts'])),
                                            sc_flag)
                    loss = model_out['loss'].mean()
                    # scale loss so accumulated gradients average correctly
                    loss_sp = loss / acc_steps
                    loss_sp.backward()
                    # step only when a full accumulation window is complete
                    if ((iteration + 1) % acc_steps == 0):
                        utils.clip_gradient(optimizer, opt.grad_clip)
                        optimizer.step()
                    torch.cuda.synchronize()
                    train_loss = loss.item()
                    end = time.time()
                    if not sc_flag:
                        pbar.set_description(
                            "iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                            .format(iteration, epoch, train_loss, end - start))
                    else:
                        pbar.set_description(
                            "iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                            .format(iteration, epoch,
                                    model_out['reward'].mean(), end - start))
                    # Update the iteration and epoch
                    iteration += 1
                    pbar.update(opt.batch_size)
                    if data['bounds']['wrapped']:
                        # save after each epoch
                        save_checkpoint(model, infos, optimizer,
                                        append=str(epoch))
                        epoch += 1
                        epoch_done = True
                    # Write training stats into summary
                    if (iteration % opt.losses_log_every == 0) and opt.write_summary:
                        add_summary_value(tb_summary_writer, 'loss/train_loss',
                                          train_loss, iteration)
                        if opt.noamopt:
                            opt.current_lr = optimizer.rate()
                        elif opt.reduce_on_plateau:
                            opt.current_lr = optimizer.current_lr
                        add_summary_value(tb_summary_writer,
                                          'hyperparam/learning_rate',
                                          opt.current_lr, iteration)
                        add_summary_value(tb_summary_writer,
                                          'hyperparam/scheduled_sampling_prob',
                                          model.ss_prob, iteration)
                        if sc_flag:
                            add_summary_value(tb_summary_writer, 'avg_reward',
                                              model_out['reward'].mean(),
                                              iteration)
                        # in RL mode the "loss" history records avg reward
                        loss_history[
                            iteration] = train_loss if not sc_flag else model_out[
                                'reward'].mean()
                        lr_history[iteration] = opt.current_lr
                        ss_prob_history[iteration] = model.ss_prob
                    # update infos
                    infos['iter'] = iteration
                    infos['epoch'] = epoch
                    infos['iterators'] = loader.iterators
                    infos['split_ix'] = loader.split_ix
                    # make evaluation on validation set, and save model
                    # NOTE(review): `eval_` is read from enclosing/module
                    # scope, and evaluation is skipped for the first 20 epochs.
                    if (iteration % opt.save_checkpoint_every == 0) and eval_ and epoch > 20:
                        model_path = os.path.join(
                            opt.checkpoint_path,
                            'model_itr%s.pth' % (iteration))
                        eval_kwargs = {
                            'split': 'val',
                            'dataset': opt.input_json,
                            'model': model_path
                        }
                        eval_kwargs.update(vars(opt))
                        val_loss, predictions, lang_stats = eval_utils.eval_split(
                            dp_model, lw_model.crit, loader, eval_kwargs)
                        if opt.reduce_on_plateau:
                            # plateau scheduler tracks -CIDEr (higher is better)
                            if 'CIDEr' in lang_stats:
                                optimizer.scheduler_step(-lang_stats['CIDEr'])
                            else:
                                optimizer.scheduler_step(val_loss)
                        # Write validation result into summary
                        if opt.write_summary:
                            add_summary_value(tb_summary_writer,
                                              'loss/validation loss', val_loss,
                                              iteration)
                            if lang_stats is not None:
                                # group all BLEU-n scores on a single chart
                                bleu_dict = {}
                                for k, v in lang_stats.items():
                                    if 'Bleu' in k:
                                        bleu_dict[k] = v
                                if len(bleu_dict) > 0:
                                    tb_summary_writer.add_scalars(
                                        'val/Bleu', bleu_dict, epoch)
                                for k, v in lang_stats.items():
                                    if 'Bleu' not in k:
                                        add_summary_value(
                                            tb_summary_writer, 'val/' + k, v,
                                            iteration)
                        val_result_history[iteration] = {
                            'loss': val_loss,
                            'lang_stats': lang_stats,
                            'predictions': predictions
                        }
                        # Save model if is improving on validation result
                        if opt.language_eval == 1:
                            current_score = lang_stats['CIDEr']
                        else:
                            current_score = -val_loss
                        best_flag = False
                        if best_val_score is None or current_score > best_val_score:
                            best_val_score = current_score
                            best_flag = True
                        # Dump miscalleous informations
                        infos['best_val_score'] = best_val_score
                        histories['val_result_history'] = val_result_history
                        histories['loss_history'] = loss_history
                        histories['lr_history'] = lr_history
                        histories['ss_prob_history'] = ss_prob_history
                        save_checkpoint(model, infos, optimizer, histories)
                        if best_flag:
                            save_checkpoint(model, infos, optimizer,
                                            append='best')
                            print(
                                "update best model at {} iteration--{} epoch".
                                format(iteration, epoch))
                # next epoch starts from the beginning of the split
                start_Img_idx = 0
            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                print("epoch {} break all".format(epoch))
                save_checkpoint(model, infos, optimizer)
                tb_summary_writer.close()
                print("============{} Training Done !==============".format(
                    'Refine' if opt.use_test or opt.use_val else ''))
                break
    except (RuntimeError, KeyboardInterrupt):
        # best-effort checkpoint on crash or Ctrl-C
        # NOTE(review): save_checkpoint prepends '_' itself, so this writes
        # 'model__interrupt.pth' (double underscore) — confirm intended.
        print('Save ckpt on exception ...')
        save_checkpoint(model, infos, optimizer, append='_interrupt')
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
def train(opt, lossWriter):
    """Train a captioning model with cross-entropy loss, logging each
    iteration's loss through `lossWriter` (a csv.writer-like object with a
    ``writerow`` method).

    Uses the legacy PyTorch (<0.4) API: ``Variable`` wrappers and
    ``loss.data[0]``. Periodically evaluates on the val split and saves
    ``model.pth`` / ``model-best.pth`` checkpoints under opt.checkpoint_path.

    Args:
        opt: argparse-style namespace of all training options (mutated).
        lossWriter: sink receiving [epoch, iteration, train_loss] rows.
    """
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    # tf is optional; writer is None when tensorflow is unavailable
    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)
    # resume counters and logged histories (empty dicts on fresh start)
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    model = models.setup(opt)
    model.cuda()
    update_lr_flag = True
    # Assure in training mode
    model.train()
    crit = utils.LanguageModelCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    # Load the optimizer when resuming
    if vars(opt).get('start_from', None) is not None:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
    while True:
        if update_lr_flag:
            # Assign the learning rate (step decay after decay_start epochs)
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            update_lr_flag = False
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)
        torch.cuda.synchronize()
        start = time.time()
        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['masks']
        ]
        # legacy API: numpy -> Variable on GPU
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        fc_feats, att_feats, labels, masks = tmp
        optimizer.zero_grad()
        value, _ = model(fc_feats, att_feats, labels)
        # labels[:, 0] is the BOS token, hence the shift by one
        loss = crit(value, labels[:, 1:], masks[:, 1:])
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        # legacy (<0.4) scalar extraction; .item() in modern PyTorch
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))
        lossWriter.writerow([epoch, iteration, train_loss])
        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss,
                                  iteration)
                add_summary_value(tf_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob',
                                  model.ss_prob, iteration)
                tf_summary_writer.flush()
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob
        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, crit, loader, eval_kwargs)
            # Write validation result into summary
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss',
                                  val_loss, iteration)
                for k, v in lang_stats.items():
                    add_summary_value(tf_summary_writer, k, v, iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }
            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss
            best_flag = False
            if True:  # always checkpoint; best_flag marks an improvement
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)
                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)
        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    """Train a captioning model with cross-entropy, switching to self-critical
    sequence training (SCST) after ``opt.self_critical_after`` epochs.

    The model runs under ``torch.nn.DataParallel``. Periodically evaluates on
    the val split and saves ``model.pth`` / ``model-best.pth`` checkpoints
    under opt.checkpoint_path.

    Args:
        opt: argparse-style namespace of all training options (mutated).
    """
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box:
        # box features append 5 geometry values per region
        opt.att_feat_size = opt.att_feat_size + 5
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_'+opt.id+'.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')):
            with open(os.path.join(opt.start_from, 'histories_'+opt.id+'.pkl')) as f:
                histories = cPickle.load(f)
    # resume counters and logged histories (empty dicts on fresh start)
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    model = models.setup(opt).cuda()
    # run the model data-parallel across available GPUs
    dp_model = torch.nn.DataParallel(model)
    update_lr_flag = True
    # Assure in training mode
    dp_model.train()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer when resuming
    if vars(opt).get('start_from', None) is not None and os.path.isfile(os.path.join(opt.start_from,"optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
    while True:
        if update_lr_flag:
            # Assign the learning rate.
            # After learning_rate_decay_start epochs, decay the LR by
            # learning_rate_decay_rate every learning_rate_decay_every epochs
            # (frac = number of completed decay periods, decay_factor = rate**frac).
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                # // floors the division; ** is exponentiation
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate ** frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                # before decay starts, keep the base learning rate
                opt.current_lr = opt.learning_rate
            # push the current learning rate into the optimizer
            utils.set_lr(optimizer, opt.current_lr)
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                # scheduled sampling bridges the train/inference mismatch
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac, opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False
            update_lr_flag = False
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        # report data-loading time
        print('Read data:', time.time() - start)
        # Waits for all kernels in all streams on current device to complete.
        torch.cuda.synchronize()
        start = time.time()
        tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks']]
        # att_masks may be None; move everything else to GPU
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp
        optimizer.zero_grad()
        if not sc_flag:
            # cross-entropy; labels[:, 0] is BOS, hence the shift by one
            loss = crit(dp_model(fc_feats, att_feats, labels, att_masks), labels[:,1:], masks[:,1:])
        else:
            # SCST: sample captions, reward them against the greedy baseline
            gen_result, sample_logprobs = dp_model(fc_feats, att_feats, att_masks, opt={'sample_max':0}, mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats, att_masks, data, gen_result, opt)
            loss = rl_crit(sample_logprobs, gen_result.data, torch.from_numpy(reward).float().cuda())
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag:
            print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))
        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
            add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward', np.mean(reward[:,0]), iteration)
            # in RL mode the "loss" history records avg reward instead
            loss_history[iteration] = train_loss if not sc_flag else np.mean(reward[:,0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob
        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(dp_model, crit, loader, eval_kwargs)
            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
            if lang_stats is not None:
                for k,v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}
            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = - val_loss
            best_flag = False
            if True: # always checkpoint; best_flag marks an improvement
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(os.path.join(opt.checkpoint_path, 'histories_'+opt.id+'.pkl'), 'wb') as f:
                    cPickle.dump(histories, f)
                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(opt.checkpoint_path, 'infos_'+opt.id+'-best.pkl'), 'wb') as f:
                        cPickle.dump(infos, f)
        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
] for k in vars(infos['opt']).keys(): if k not in ignore1 and k not in ignore2: if k in vars(opt): assert vars(opt)[k] == vars( infos['opt'])[k], k + ' option not consistent' else: vars(opt).update({k: vars(infos['opt'])[k] }) # copy over options from model opt.vocab = infos['vocab'] # ix -> word mapping # Setup the model gen_model = MultiModalGenerator(opt) gen_model.load_state_dict(torch.load(opt.g_model_path)) gen_model.cuda() gen_model.eval() crit = utils.LanguageModelCriterion() # Create the Data Loader instance loader = DataLoader(opt) # When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json # So make sure to use the vocab in infos file. loader.ix_to_word = infos['vocab'] loss, split_predictions, lang_stats, div = eval_utils.eval_split( gen_model, crit, loader, eval_kwargs=vars(opt)) print('loss: ', loss) if lang_stats: print(lang_stats)
def train(opt): ################################ # Build dataloader ################################ loader = DataLoader(opt) opt.vocab_size = loader.vocab_size opt.seq_length = loader.seq_length ########################## # Initialize infos ########################## infos = { 'iter': 0, 'epoch': 0, 'loader_state_dict': None, 'vocab': loader.get_vocab(), } # Load old infos(if there is) and check if models are compatible if opt.start_from is not None and os.path.isfile( os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')): with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'), 'rb') as f: infos = utils.pickle_load(f) saved_model_opt = infos['opt'] need_be_same = [ "caption_model", "rnn_type", "rnn_size", "num_layers" ] for checkme in need_be_same: assert getattr(saved_model_opt, checkme) == getattr( opt, checkme ), "Command line argument and saved model disagree on '%s' " % checkme infos['opt'] = opt ######################### # Build logger ######################### # naive dict logger histories = defaultdict(dict) if opt.start_from is not None and os.path.isfile( os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')): with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'), 'rb') as f: histories.update(utils.pickle_load(f)) # tensorboard logger tb_summary_writer = SummaryWriter(opt.checkpoint_path) ########################## # Build model ########################## opt.vocab = loader.get_vocab() multi_models_list = [] for order in range(opt.number_of_models): multi_models_list.append(models.setup(opt).cuda()) for order in range(opt.number_of_models): multi_models_list.append(models.setup(opt).cuda()) for order in range(opt.number_of_models, 2 * opt.number_of_models): for param in multi_models_list[order].parameters(): param.detach_() for order in range(opt.number_of_models): for param, param_ema in zip( multi_models_list[order].parameters(), multi_models_list[order + opt.number_of_models].parameters()): param_ema.data = 
param.data.clone() # multi_models = MultiModels(multi_models_list) # multi_models_list.append(SenEncodeModel(opt).cuda()) multi_models = nn.ModuleList(multi_models_list) del opt.vocab # Load pretrained weights: if opt.start_from is not None and os.path.isfile( os.path.join(opt.start_from, 'model.pth')): multi_models.load_state_dict( torch.load(os.path.join(opt.start_from, 'model.pth'))) # Wrap generation model with loss function(used for training) # This allows loss function computed separately on each machine lw_models = nn.ModuleList([ LossWrapper(multi_models[index], opt) for index in range(opt.number_of_models) ]) kdlw_models = nn.ModuleList([ KDLossWrapper(multi_models[index], opt) for index in range(opt.number_of_models) ]) lw_models_ema = nn.ModuleList([ LossWrapper(multi_models[opt.number_of_models + index], opt) for index in range(opt.number_of_models) ]) kdlw_models_ema = nn.ModuleList([ KDLossWrapper(multi_models[opt.number_of_models + index], opt) for index in range(opt.number_of_models) ]) # Wrap with dataparallel dp_models = nn.ModuleList([ torch.nn.DataParallel(multi_models[index]) for index in range(opt.number_of_models) ]) dp_lw_models = nn.ModuleList([ torch.nn.DataParallel(lw_models[index]) for index in range(opt.number_of_models) ]) dp_kdlw_models = nn.ModuleList([ torch.nn.DataParallel(kdlw_models[index]) for index in range(opt.number_of_models) ]) dp_models_ema = nn.ModuleList([ torch.nn.DataParallel(multi_models[opt.number_of_models + index]) for index in range(opt.number_of_models) ]) dp_lw_models_ema = nn.ModuleList([ torch.nn.DataParallel(lw_models_ema[index]) for index in range(opt.number_of_models) ]) dp_kdlw_models_ema = nn.ModuleList([ torch.nn.DataParallel(kdlw_models_ema[index]) for index in range(opt.number_of_models) ]) ########################## # Build optimizer ########################## if opt.noamopt: assert opt.caption_model in [ 'transformer', 'bert', 'm2transformer' ], 'noamopt can only work with transformer' optimizer = 
utils.get_std_opt(multi_models, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup) elif opt.reduce_on_plateau: optimizer = utils.build_optimizer(multi_models.parameters(), opt) optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3) else: optimizer = utils.build_optimizer(multi_models.parameters(), opt) # Load the optimizer if opt.start_from is not None and os.path.isfile( os.path.join(opt.start_from, "optimizer.pth")): optimizer.load_state_dict( torch.load(os.path.join(opt.start_from, 'optimizer.pth'))) ########################## # Build loss ########################## # triplet_loss = nn.TripletMarginLoss() ######################### # Get ready to start ######################### iteration = infos['iter'] epoch = infos['epoch'] # For back compatibility if 'iterators' in infos: infos['loader_state_dict'] = { split: { 'index_list': infos['split_ix'][split], 'iter_counter': infos['iterators'][split] } for split in [ 'paired_train', 'unpaired_images_train', 'unpaired_captions_train', 'train', 'val', 'test' ] } loader.load_state_dict(infos['loader_state_dict']) if opt.load_best_score == 1: best_val_score = infos.get('best_val_score', None) if opt.noamopt: optimizer._step = iteration # flag indicating finish of an epoch # Always set to True at the beginning to initialize the lr or etc. 
epoch_done = True # Assure in training mode dp_lw_models.train() dp_kdlw_models.train() dp_lw_models_ema.train() dp_kdlw_models_ema.train() # Build the ensemble model # # Setup the model model_ensemble = AttEnsemble(multi_models_list[opt.number_of_models:2 * opt.number_of_models], weights=None) # model_ensemble.seq_length = 20 model_ensemble.cuda() # model_ensemble.eval() kd_model_outs_list = [] # Start training try: while True: # Stop if reaching max epochs if epoch >= opt.max_epochs and opt.max_epochs != -1: break if epoch_done: if not opt.noamopt and not opt.reduce_on_plateau: # Assign the learning rate if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0: frac = (epoch - opt.learning_rate_decay_start ) // opt.learning_rate_decay_every decay_factor = opt.learning_rate_decay_rate**frac opt.current_lr = opt.learning_rate * decay_factor else: opt.current_lr = opt.learning_rate utils.set_lr(optimizer, opt.current_lr) # set the decayed rate # Assign the scheduled sampling prob if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0: frac = (epoch - opt.scheduled_sampling_start ) // opt.scheduled_sampling_increase_every opt.ss_prob = min( opt.scheduled_sampling_increase_prob * frac, opt.scheduled_sampling_max_prob) for index in range(opt.number_of_models): multi_models[index].ss_prob = opt.ss_prob # If start self critical training if opt.self_critical_after != -1 and epoch >= opt.self_critical_after: sc_flag = True init_scorer(opt.cached_tokens) else: sc_flag = False # If start structure loss training if opt.structure_after != -1 and epoch >= opt.structure_after: struc_flag = True init_scorer(opt.cached_tokens) else: struc_flag = False if epoch >= opt.paired_train_epoch: opt.current_lambda_x = opt.hyper_parameter_lambda_x * \ (epoch - (opt.paired_train_epoch - 1)) /\ (opt.max_epochs - opt.paired_train_epoch) opt.current_lambda_y = opt.hyper_parameter_lambda_y * \ (epoch - (opt.paired_train_epoch - 1)) / \ 
(opt.max_epochs - opt.paired_train_epoch) epoch_done = False start = time.time() # Load data from train split (0) if epoch < opt.language_pretrain_epoch: data = loader.get_batch('unpaired_captions_train') elif epoch < opt.paired_train_epoch: data = loader.get_batch('paired_train') else: data = loader.get_batch('paired_train') unpaired_data = loader.get_batch('unpaired_images_train') unpaired_caption = loader.get_batch('unpaired_captions_train') print('Read data:', time.time() - start) torch.cuda.synchronize() start = time.time() if epoch < opt.language_pretrain_epoch: tmp = [ data['fc_feats'] * 0, data['att_feats'] * 0, data['labels'], data['masks'], data['att_masks'] ] elif epoch < opt.paired_train_epoch: tmp = [ data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks'] ] else: tmp = [ data['fc_feats'], data['att_feats'], data['labels'], data['masks'], data['att_masks'] ] unpaired_tmp = [ unpaired_data['fc_feats'], unpaired_data['att_feats'], unpaired_data['labels'], unpaired_data['masks'], unpaired_data['att_masks'] ] unpaired_caption_tmp = [ unpaired_caption['fc_feats'] * 0, unpaired_caption['att_feats'] * 0, unpaired_caption['labels'], unpaired_caption['masks'], unpaired_caption['att_masks'] ] tmp = [_ if _ is None else _.cuda() for _ in tmp] fc_feats, att_feats, labels, masks, att_masks = tmp if epoch >= opt.paired_train_epoch: unpaired_tmp = [ _ if _ is None else _.cuda() for _ in unpaired_tmp ] unpaired_fc_feats, unpaired_att_feats, unpaired_labels, unpaired_masks, unpaired_att_masks = unpaired_tmp unpaired_caption_tmp = [ _ if _ is None else _.cuda() for _ in unpaired_caption_tmp ] unpaired_caption_fc_feats, unpaired_caption_att_feats, unpaired_caption_labels, unpaired_caption_masks, unpaired_caption_att_masks = unpaired_caption_tmp unpaired_caption_fc_feats = unpaired_caption_fc_feats.repeat( 5, 1) unpaired_caption_fc_feats = opt.std_pseudo_visual_feature * torch.randn_like( unpaired_caption_fc_feats) unpaired_caption_att_feats 
= unpaired_caption_att_feats.repeat( 5, 1, 1) unpaired_caption_fc_feats.requires_grad = True unpaired_caption_att_feats.requires_grad = True unpaired_caption_labels = unpaired_caption_labels.reshape( unpaired_caption_fc_feats.shape[0], -1) unpaired_caption_masks = unpaired_caption_masks.reshape( unpaired_caption_fc_feats.shape[0], -1) optimizer.zero_grad() if epoch < opt.language_pretrain_epoch: language_loss = 0 model_outs_list = [] for index in range(opt.number_of_models): model_out = dp_lw_models[index]( fc_feats, att_feats, labels, masks, att_masks, data['gts'], torch.arange(0, len(data['gts'])), sc_flag, struc_flag) model_outs_list.append(model_out) language_loss += model_out['loss'].mean() loss = language_loss elif epoch < opt.paired_train_epoch: language_loss = 0 model_outs_list = [] for index in range(opt.number_of_models): model_out = dp_lw_models[index]( fc_feats, att_feats, labels, masks, att_masks, data['gts'], torch.arange(0, len(data['gts'])), sc_flag, struc_flag) model_outs_list.append(model_out) language_loss += model_out['loss'].mean() loss = language_loss else: language_loss = 0 model_outs_list = [] for index in range(opt.number_of_models): model_out = dp_lw_models[index]( fc_feats, att_feats, labels, masks, att_masks, data['gts'], torch.arange(0, len(data['gts'])), sc_flag, struc_flag) model_outs_list.append(model_out) language_loss += model_out['loss'].mean() loss = language_loss # else: # for unpaired image sentences # # Setup the model # model_ensemble = AttEnsemble(multi_models_list[:opt.number_of_models], weights=None) # model_ensemble.seq_length = 16 # model_ensemble.cuda() # model_ensemble.eval() model_ensemble.eval() eval_kwargs = dict() eval_kwargs.update(vars(opt)) with torch.no_grad(): seq, seq_logprobs = model_ensemble(unpaired_fc_feats, unpaired_att_feats, unpaired_att_masks, opt=eval_kwargs, mode='sample') # val_loss, predictions, lang_stats = eval_utils.eval_split(model_ensemble, lw_models[0].crit, loader, # eval_kwargs) # 
print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in # model_ensemble.done_beams[0]])) # print('++' * 10) # for ii in range(10): # sents = utils.decode_sequence(loader.get_vocab(), seq[ii].unsqueeze(0)) # gt_sent = utils.decode_sequence(loader.get_vocab(), labels[ii,0].unsqueeze(0)) # a=1 model_ensemble.train() model_ensemble_sudo_labels = labels.new_zeros( (opt.batch_size, opt.beam_size, eval_kwargs['max_length'] + 2)) model_ensemble_sudo_log_prob = masks.new_zeros( (opt.batch_size, opt.beam_size, eval_kwargs['max_length'] + 2, len(loader.get_vocab()) + 1)) model_ensemble_sum_log_prob = masks.new_zeros( (opt.batch_size, opt.beam_size)) for batch_index in range(opt.batch_size): for beam_index in range(opt.beam_size): # for beam_index in range(3): pred = model_ensemble.done_beams[batch_index][ beam_index]['seq'] log_prob = model_ensemble.done_beams[batch_index][ beam_index]['logps'] model_ensemble_sudo_labels[batch_index, beam_index, 1:pred.shape[0] + 1] = pred model_ensemble_sudo_log_prob[batch_index, beam_index, 1:pred.shape[0] + 1] = log_prob model_ensemble_sum_log_prob[batch_index][ beam_index] = model_ensemble.done_beams[ batch_index][beam_index]['p'] # model_ensemble_prob = F.softmax(model_ensemble_sum_log_prob) data_ensemble_sudo_gts = list() for data_ensemble_sudo_gts_index in range( model_ensemble_sudo_labels.shape[0]): data_ensemble_sudo_gts.append(model_ensemble_sudo_labels[ data_ensemble_sudo_gts_index, :, 1:-1].data.cpu().numpy()) # generated_sentences = list() # for i in range(unpaired_fc_feats.shape[0]): # generated_sentences.append( # [utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in # model_ensemble.done_beams[i]]) # # pos_tag_results = list() # for i in range(unpaired_fc_feats.shape[0]): # generated_sentences_i = generated_sentences[i] # pos_tag_results_i = [] # for text in generated_sentences_i: # text_tokenize = nltk.word_tokenize(text) # pos_tag_results_i_jbeam = [] # for vob, 
vob_type in nltk.pos_tag(text_tokenize): # if vob_type == 'NN' or vob_type == 'NNS': # pos_tag_results_i_jbeam.append(vob) # pos_tag_results_i.append(pos_tag_results_i_jbeam) # pos_tag_results.append(pos_tag_results_i) # for i in range(fc_feats.shape[0]): # print('\n'.join([utils.decode_sequence(loader.get_vocab(), _['seq'].unsqueeze(0))[0] for _ in # model_ensemble.done_beams[i]])) # print('--' * 10) # dets = data['dets'] # # promising_flag = labels.new_zeros(opt.batch_size, opt.beam_size) # for batch_index in range(opt.batch_size): # dets_batch = dets[batch_index] # for beam_index in range(opt.beam_size): # indicator = [0] * len(dets_batch) # pos_tag_batch_beam = pos_tag_results[batch_index][beam_index] # for pos_tag_val in pos_tag_batch_beam: # for ii in range(len(dets_batch)): # possible_list = vob_transform_list[dets_batch[ii]] # if pos_tag_val in possible_list: # indicator[ii] = 1 # if sum(indicator) == len(dets_batch) or sum(indicator) >= 2: # promising_flag[batch_index, beam_index] = 1 # # # model_ensemble_sudo_log_prob = model_ensemble_sudo_log_prob * promising_flag.unsqueeze(-1).unsqueeze(-1) # model_ensemble_sudo_labels = model_ensemble_sudo_labels * promising_flag.unsqueeze(-1) #sudo_masks_for_model = sudo_masks_for_model.detach() distilling_loss = 0 # We use the random study machinism who_to_study = random.randint(0, opt.number_of_models - 1) # for index in range(opt.number_of_models): # model_out = dp_kdlw_models[index](unpaired_fc_feats, unpaired_att_feats, model_ensemble_sudo_labels, # model_ensemble_sudo_log_prob, att_masks, data_ensemble_sudo_gts, # torch.arange(0, len(data_ensemble_sudo_gts)), sc_flag, # struc_flag, model_ensemble_sum_log_prob) # kd_model_outs_list.append(model_out) model_out = dp_kdlw_models[who_to_study]( unpaired_fc_feats, unpaired_att_feats, model_ensemble_sudo_labels, model_ensemble_sudo_log_prob, att_masks, data_ensemble_sudo_gts, torch.arange(0, len(data_ensemble_sudo_gts)), sc_flag, struc_flag, 
model_ensemble_sum_log_prob) # kd_model_outs_list.append(model_out) distilling_loss += model_out['loss'].mean() loss += opt.number_of_models * opt.current_lambda_x * distilling_loss ################################################################### # use unlabelled captions # simple_sgd = utils.gradient_descent(unpaired_caption_fc_feats, stepsize=1e3) simple_sgd = utils.gradient_descent_adagrad( unpaired_caption_fc_feats, stepsize=1) gts_tmp = unpaired_caption['gts'] new_gts = [] for ii in range(len(data['gts'])): for jj in range(gts_tmp[ii].shape[0]): new_gts.append(gts_tmp[ii][jj]) unpaired_caption['gts'] = new_gts for itr in range(opt.inner_iteration): unlabelled_caption_model_out = dp_lw_models_ema[ itr % opt.number_of_models]( unpaired_caption_fc_feats, unpaired_caption_att_feats, unpaired_caption_labels, unpaired_caption_masks, unpaired_caption_att_masks, unpaired_caption['gts'], torch.arange(0, len(unpaired_caption['gts'])), sc_flag, struc_flag) unlabelled_caption_loss = unlabelled_caption_model_out[ 'loss'].mean() unlabelled_caption_loss.backward() # print(unlabelled_caption_loss) simple_sgd.update(unpaired_caption_fc_feats) # a=1 unpaired_caption_fc_feats.requires_grad = False unpaired_caption_att_feats.requires_grad = False unlabelled_caption_model_out = dp_lw_models[who_to_study]( unpaired_caption_fc_feats, unpaired_caption_att_feats, unpaired_caption_labels, unpaired_caption_masks, unpaired_caption_att_masks, unpaired_caption['gts'], torch.arange(0, len(unpaired_caption['gts'])), sc_flag, struc_flag) unlabelled_caption_loss = unlabelled_caption_model_out[ 'loss'].mean() loss += opt.number_of_models * opt.current_lambda_y * unlabelled_caption_loss loss.backward() if opt.grad_clip_value != 0: getattr(torch.nn.utils, 'clip_grad_%s_' % (opt.grad_clip_mode))(multi_models.parameters(), opt.grad_clip_value) optimizer.step() for order in range(opt.number_of_models): for param, param_ema in zip( multi_models_list[order].parameters(), multi_models_list[order + 
opt.number_of_models].parameters()): param_ema.data = opt.alpha * param_ema.data + ( 1 - opt.alpha) * param.data train_loss = loss.item() torch.cuda.synchronize() end = time.time() # if struc_flag: # print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \ # .format(iteration, epoch, train_loss, model_out['lm_loss'].mean().item(), model_out['struc_loss'].mean().item(), end - start)) # elif not sc_flag: # print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \ # .format(iteration, epoch, train_loss, end - start)) # else: # print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \ # .format(iteration, epoch, model_out['reward'].mean(), end - start)) if struc_flag: print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \ .format(iteration, epoch, train_loss/opt.number_of_models, sum([model_outs_list[index]['lm_loss'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models, sum([model_outs_list[index]['struc_loss'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models, end - start)) elif not sc_flag: print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \ .format(iteration, epoch, language_loss.item()/opt.number_of_models, end - start)) else: print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \ .format(iteration, epoch, sum([model_outs_list[index]['reward'].mean().item() for index in range(opt.number_of_models)])/opt.number_of_models, end - start)) # Update the iteration and epoch iteration += 1 if epoch < opt.paired_train_epoch: if data['bounds']['wrapped']: epoch += 1 epoch_done = True else: if data['bounds']['wrapped']: epoch += 1 epoch_done = True # Write the training loss summary if (iteration % opt.losses_log_every == 0): # tb_summary_writer.add_scalar('train_loss', train_loss, iteration) for index in range(opt.number_of_models): model_id = 
'model_{}'.format(index) tb_summary_writer.add_scalars('language_loss', { model_id: model_outs_list[index]['loss'].mean().item() }, iteration) if epoch >= opt.paired_train_epoch: # for index in range(opt.number_of_models): # model_id = 'model_{}'.format(index) # kd_model_outs_val = 0 if len(kd_model_outs_list) == 0 else kd_model_outs_list[index]['loss'].mean().item() # tb_summary_writer.add_scalars('distilling_loss', # {model_id: kd_model_outs_val}, # iteration) tb_summary_writer.add_scalar('distilling_loss', distilling_loss.item(), iteration) tb_summary_writer.add_scalar( 'unlabelled_caption_loss', unlabelled_caption_loss.item(), iteration) tb_summary_writer.add_scalar('hyper_parameter_lambda_x', opt.current_lambda_x, iteration) tb_summary_writer.add_scalar('hyper_parameter_lambda_y', opt.current_lambda_y, iteration) # tb_summary_writer.add_scalar('triplet_loss', triplet_loss_val.item(), iteration) if opt.noamopt: opt.current_lr = optimizer.rate() elif opt.reduce_on_plateau: opt.current_lr = optimizer.current_lr tb_summary_writer.add_scalar('learning_rate', opt.current_lr, iteration) tb_summary_writer.add_scalar('scheduled_sampling_prob', multi_models[0].ss_prob, iteration) if sc_flag: for index in range(opt.number_of_models): # tb_summary_writer.add_scalar('avg_reward', model_out['reward'].mean(), iteration) model_id = 'model_{}'.format(index) tb_summary_writer.add_scalars( 'avg_reward', { model_id: model_outs_list[index]['reward'].mean().item() }, iteration) elif struc_flag: # tb_summary_writer.add_scalar('lm_loss', model_out['lm_loss'].mean().item(), iteration) # tb_summary_writer.add_scalar('struc_loss', model_out['struc_loss'].mean().item(), iteration) # tb_summary_writer.add_scalar('reward', model_out['reward'].mean().item(), iteration) # tb_summary_writer.add_scalar('reward_var', model_out['reward'].var(1).mean(), iteration) model_id = 'model_{}'.format(index) for index in range(opt.number_of_models): tb_summary_writer.add_scalars( 'lm_loss', { model_id: 
model_outs_list[index] ['lm_loss'].mean().item() }, iteration) tb_summary_writer.add_scalars( 'struc_loss', { model_id: model_outs_list[index] ['struc_loss'].mean().item() }, iteration) tb_summary_writer.add_scalars( 'reward', { model_id: model_outs_list[index]['reward'].mean().item() }, iteration) tb_summary_writer.add_scalars( 'reward_var', { model_id: model_outs_list[index]['reward'].var(1).mean() }, iteration) histories['loss_history'][ iteration] = train_loss if not sc_flag else sum([ model_outs_list[index]['reward'].mean().item() for index in range(opt.number_of_models) ]) / opt.number_of_models histories['lr_history'][iteration] = opt.current_lr histories['ss_prob_history'][iteration] = multi_models[ 0].ss_prob # update infos infos['iter'] = iteration infos['epoch'] = epoch infos['loader_state_dict'] = loader.state_dict() # make evaluation on validation set, and save model if (iteration % opt.save_checkpoint_every == 0 and not opt.save_every_epoch and epoch >= opt.paired_train_epoch) or \ (epoch_done and opt.save_every_epoch and epoch >= opt.paired_train_epoch): # load ensemble # Setup the model model = AttEnsemble(multi_models_list[opt.number_of_models:2 * opt.number_of_models], weights=None) model.seq_length = opt.max_length model.cuda() model.eval() # eval model eval_kwargs = {'split': 'val', 'dataset': opt.input_json} eval_kwargs.update(vars(opt)) # eval_kwargs['beam_size'] = 5 # eval_kwargs['verbose_beam'] = 1 # eval_kwargs['verbose_loss'] = 1 # val_loss, predictions, lang_stats = eval_utils.eval_split( # dp_model, lw_model.crit, loader, eval_kwargs) with torch.no_grad(): val_loss, predictions, lang_stats = eval_utils.eval_split( model, lw_models[0].crit, loader, eval_kwargs) model.train() if opt.reduce_on_plateau: if 'CIDEr' in lang_stats: optimizer.scheduler_step(-lang_stats['CIDEr']) else: optimizer.scheduler_step(val_loss) # Write validation result into summary tb_summary_writer.add_scalar('validation loss', val_loss, iteration) if lang_stats is not 
None: for k, v in lang_stats.items(): tb_summary_writer.add_scalar(k, v, iteration) histories['val_result_history'][iteration] = { 'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions } # Save model if is improving on validation result if opt.language_eval == 1: current_score = lang_stats['CIDEr'] else: current_score = -val_loss best_flag = False if best_val_score is None or current_score > best_val_score: best_val_score = current_score best_flag = True # Dump miscalleous informations infos['best_val_score'] = best_val_score utils.save_checkpoint(opt, multi_models, infos, optimizer, histories) if opt.save_history_ckpt: utils.save_checkpoint( opt, multi_models, infos, optimizer, append=str(epoch) if opt.save_every_epoch else str(iteration)) if best_flag: utils.save_checkpoint(opt, multi_models, infos, optimizer, append='best') # if epoch_done and epoch == opt.paired_train_epoch: # utils.save_checkpoint(opt, multi_models, infos, optimizer, histories) # if opt.save_history_ckpt: # utils.save_checkpoint(opt, multi_models, infos, optimizer, # append=str(epoch) if opt.save_every_epoch else str(iteration)) # cmd = 'cp -r ' + 'log_' + opt.id + ' ' + 'log_' + opt.id + '_backup' # os.system(cmd) except (RuntimeError, KeyboardInterrupt): print('Save ckpt on exception ...') utils.save_checkpoint(opt, multi_models, infos, optimizer) print('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace)
def train(opt):
    """Train a captioning model with a finetunable CNN feature extractor.

    Builds a CNN (via ``utils.build_cnn``) and a caption model (via
    ``models.setup``), then runs an endless batch loop over the 'train'
    split: CNN forward -> mean-pooled fc features + spatial att features
    -> language-model cross-entropy loss -> SGD step.  Periodically logs
    training stats, evaluates on the 'val' split, and checkpoints model,
    CNN, optimizers, and an ``infos`` pickle under ``opt.checkpoint_path``.

    Stops only when ``epoch >= opt.max_epochs`` (and ``max_epochs != -1``).
    Requires CUDA (everything is moved to GPU unconditionally).
    """
    loader = DataLoader(opt)
    # Expose vocab/sequence geometry on opt so models.setup can size layers.
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    infos = {}
    if opt.start_from is not None:
        # Open old infos and check if models are compatible with the
        # options on the command line before resuming.
        # NOTE(review): pickle files should normally be opened 'rb';
        # this relies on Python-2 text-mode pickling — confirm.
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            # Architecture-defining options must match the saved model.
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

    # Restore training progress and histories (empty defaults on a fresh run).
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    ss_prob_history = infos.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    # NOTE(review): best_val_score is only bound when load_best_score == 1;
    # the checkpoint branch below reads it unconditionally, so running with
    # load_best_score != 1 would raise NameError there — confirm intended.
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    cnn_model = utils.build_cnn(opt)
    cnn_model.cuda()
    model = models.setup(opt)
    model.cuda()

    # Forces the lr/ss_prob/CNN-stage bookkeeping to run on the first
    # iteration and again after every epoch boundary.
    update_lr_flag = True
    # Assure in training mode
    model.train()

    crit = utils.LanguageModelCriterion()

    # Separate optimizers: caption model always steps; the CNN optimizer
    # only steps once finetuning is enabled (see finetune_cnn_after below).
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    cnn_optimizer = optim.Adam(cnn_model.parameters(),
                               lr=opt.cnn_learning_rate,
                               weight_decay=opt.cnn_weight_decay)

    # Load the optimizer state when resuming, if the files exist.
    if vars(opt).get('start_from', None) is not None:
        if os.path.isfile(os.path.join(opt.start_from, 'optimizer.pth')):
            optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        if os.path.isfile(os.path.join(opt.start_from, 'optimizer-cnn.pth')):
            cnn_optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from, 'optimizer-cnn.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate: step decay every
            # learning_rate_decay_every epochs after decay_start.
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob (curriculum: probability of
            # feeding the model its own previous prediction during training).
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            # Update the training stage of cnn: frozen (eval, no grads) until
            # finetune_cnn_after epochs have passed, then unfrozen.
            if opt.finetune_cnn_after == -1 or epoch < opt.finetune_cnn_after:
                for p in cnn_model.parameters():
                    p.requires_grad = False
                cnn_model.eval()
            else:
                for p in cnn_model.parameters():
                    p.requires_grad = True
                cnn_model.train()
            update_lr_flag = False

        # cuda.synchronize brackets make the wall-clock timings meaningful.
        torch.cuda.synchronize()
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        torch.cuda.synchronize()
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()
        tmp = [data['images'], data['labels'], data['masks']]
        # Pre-0.4 PyTorch idiom: wrap numpy arrays as Variables on GPU.
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        images, labels, masks = tmp

        # CNN forward: att_feats are the spatial feature maps; fc_feats is
        # their global average pool (mean over both spatial dims).
        att_feats = cnn_model(images)
        fc_feats = att_feats.mean(2).mean(3).squeeze(2).squeeze(2)
        # Each image has seq_per_img captions, so replicate the features
        # along a new axis and flatten back to (batch * seq_per_img, ...).
        att_feats = att_feats.unsqueeze(1).expand(*((
            att_feats.size(0),
            opt.seq_per_img,
        ) + att_feats.size()[1:])).contiguous().view(
            *((att_feats.size(0) * opt.seq_per_img, ) + att_feats.size()[1:]))
        fc_feats = fc_feats.unsqueeze(1).expand(*((
            fc_feats.size(0),
            opt.seq_per_img,
        ) + fc_feats.size()[1:])).contiguous().view(
            *((fc_feats.size(0) * opt.seq_per_img, ) + fc_feats.size()[1:]))

        optimizer.zero_grad()
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            cnn_optimizer.zero_grad()
        # Teacher-forced LM loss; labels/masks are shifted by one token
        # (column 0 is the BOS position).
        loss = crit(model(fc_feats, att_feats, labels), labels[:, 1:],
                    masks[:, 1:])
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        if opt.finetune_cnn_after != -1 and epoch >= opt.finetune_cnn_after:
            utils.clip_gradient(cnn_optimizer,
                                opt.grad_clip)
            cnn_optimizer.step()
        # NOTE(review): loss.data[0] is the pre-PyTorch-0.4 scalar accessor
        # (today: loss.item()); kept as-is for the old API this file targets.
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            # The loader wrapped around the split: one epoch finished.
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                cnn_model, model, crit, loader, eval_kwargs)

            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result; the score is
            # CIDEr when language_eval is on, otherwise negative val loss.
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                # Always save a rolling "last" checkpoint (model + CNN +
                # both optimizers), overwriting the previous one.
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-cnn.pth')
                torch.save(model.state_dict(), checkpoint_path)
                torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                print("cnn model saved to {}".format(cnn_checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                cnn_optimizer_path = os.path.join(opt.checkpoint_path,
                                                  'optimizer-cnn.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                torch.save(cnn_optimizer.state_dict(), cnn_optimizer_path)

                # Dump miscalleous informations needed to resume training.
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['val_result_history'] = val_result_history
                infos['loss_history'] = loss_history
                infos['lr_history'] = lr_history
                infos['ss_prob_history'] = ss_prob_history
                infos['vocab'] = loader.get_vocab()
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)

                if best_flag:
                    # Additionally snapshot the new best model/CNN/infos
                    # under '-best' names.
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    cnn_checkpoint_path = os.path.join(opt.checkpoint_path,
                                                       'model-cnn-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    torch.save(cnn_model.state_dict(), cnn_checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    print("cnn model saved to {}".format(cnn_checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt): exclude_opt = [ 'training_mode', 'tap_epochs', 'cg_epochs', 'tapcg_epochs', 'lr', 'learning_rate_decay_start', 'learning_rate_decay_every', 'learning_rate_decay_rate', 'self_critical_after', 'save_checkpoint_every', 'id', "pretrain", "pretrain_path", "debug", "save_all_checkpoint", "min_epoch_when_save" ] save_folder, logger, tf_writer = build_floder_and_create_logger(opt) saved_info = {'best': {}, 'last': {}, 'history': {}} is_continue = opt.start_from != None if is_continue: infos_path = os.path.join(save_folder, 'info.pkl') with open(infos_path) as f: logger.info('load info from {}'.format(infos_path)) saved_info = cPickle.load(f) pre_opt = saved_info[opt.start_from_mode]['opt'] if vars(opt).get("no_exclude_opt", False): exclude_opt = [] for opt_name in vars(pre_opt).keys(): if (not opt_name in exclude_opt): vars(opt).update({opt_name: vars(pre_opt).get(opt_name)}) if vars(pre_opt).get(opt_name) != vars(opt).get(opt_name): print('change opt: {} from {} to {}'.format( opt_name, vars(pre_opt).get(opt_name), vars(opt).get(opt_name))) opt.use_att = utils.if_use_att(opt.caption_model) loader = DataLoader(opt) opt.CG_vocab_size = loader.vocab_size opt.CG_seq_length = loader.seq_length # init training option epoch = saved_info[opt.start_from_mode].get('epoch', 0) iteration = saved_info[opt.start_from_mode].get('iter', 0) best_val_score = saved_info[opt.start_from_mode].get('best_val_score', 0) val_result_history = saved_info['history'].get('val_result_history', {}) loss_history = saved_info['history'].get('loss_history', {}) lr_history = saved_info['history'].get('lr_history', {}) loader.iterators = saved_info[opt.start_from_mode].get( 'iterators', loader.iterators) loader.split_ix = saved_info[opt.start_from_mode].get( 'split_ix', loader.split_ix) opt.current_lr = vars(opt).get('current_lr', opt.lr) opt.m_batch = vars(opt).get('m_batch', 1) # create a tap_model,fusion_model,cg_model tap_model = models.setup_tap(opt) lm_model = CaptionGenerator(opt) 
cg_model = lm_model if is_continue: if opt.start_from_mode == 'best': model_pth = torch.load(os.path.join(save_folder, 'model-best.pth')) elif opt.start_from_mode == 'last': model_pth = torch.load( os.path.join(save_folder, 'model_iter_{}.pth'.format(iteration))) assert model_pth['iteration'] == iteration logger.info('Loading pth from {}, iteration:{}'.format( save_folder, iteration)) tap_model.load_state_dict(model_pth['tap_model']) cg_model.load_state_dict(model_pth['cg_model']) elif opt.pretrain: print('pretrain {} from {}'.format(opt.pretrain, opt.pretrain_path)) model_pth = torch.load(opt.pretrain_path) if opt.pretrain == 'tap': tap_model.load_state_dict(model_pth['tap_model']) elif opt.pretrain == 'cg': cg_model.load_state_dict(model_pth['cg_model']) elif opt.pretrain == 'tap_cg': tap_model.load_state_dict(model_pth['tap_model']) cg_model.load_state_dict(model_pth['cg_model']) else: assert 1 == 0, 'opt.pretrain error' tap_model.cuda() tap_model.train() # Assure in training mode tap_crit = utils.TAPModelCriterion() tap_optimizer = optim.Adam(tap_model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay) cg_model.cuda() cg_model.train() cg_optimizer = optim.Adam(cg_model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay) cg_crit = utils.LanguageModelCriterion() rl_crit = utils.RewardCriterion() cg_optimizer = optim.Adam(cg_model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay) allmodels = [tap_model, cg_model] optimizers = [tap_optimizer, cg_optimizer] if is_continue: tap_optimizer.load_state_dict(model_pth['tap_optimizer']) cg_optimizer.load_state_dict(model_pth['cg_optimizer']) update_lr_flag = True loss_sum = np.zeros(5) bad_video_num = 0 best_epoch = epoch start = time.time() print_opt(opt, allmodels, logger) logger.info('\nStart training') # set a var to indicate what to train in current iteration: "tap", "cg" or "tap_cg" flag_training_whats = get_training_list(opt, logger) # Iteration begin while True: if update_lr_flag: if (epoch > 
opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0): frac = (epoch - opt.learning_rate_decay_start ) // opt.learning_rate_decay_every decay_factor = opt.learning_rate_decay_rate**frac opt.current_lr = opt.lr * decay_factor else: opt.current_lr = opt.lr for optimizer in optimizers: utils.set_lr(optimizer, opt.current_lr) if opt.self_critical_after != -1 and epoch >= opt.self_critical_after: sc_flag = True init_scorer(None) else: sc_flag = False update_lr_flag = False flag_training_what = flag_training_whats[epoch] if opt.training_mode == "alter2": flag_training_what = flag_training_whats[iteration] # get data data = loader.get_batch('train') if opt.debug: print('vid:', data['vid']) print('info:', data['infos']) torch.cuda.synchronize() if (data["proposal_num"] <= 0) or (data['fc_feats'].shape[0] <= 1): bad_video_num += 1 # print('vid:{} has no good proposal.'.format(data['vid'])) continue ind_select_list, soi_select_list, cg_select_list, sampled_ids, = data[ 'ind_select_list'], data['soi_select_list'], data[ 'cg_select_list'], data['sampled_ids'] if flag_training_what == 'cg' or flag_training_what == 'gt_tap_cg': ind_select_list = data['gts_ind_select_list'] soi_select_list = data['gts_soi_select_list'] cg_select_list = data['gts_cg_select_list'] tmp = [ data['fc_feats'], data['att_feats'], data['lda_feats'], data['tap_labels'], data['tap_masks_for_loss'], data['cg_labels'][cg_select_list], data['cg_masks'][cg_select_list], data['w1'] ] tmp = [ Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp ] c3d_feats, att_feats, lda_feats, tap_labels, tap_masks_for_loss, cg_labels, cg_masks, w1 = tmp if (iteration - 1) % opt.m_batch == 0: tap_optimizer.zero_grad() cg_optimizer.zero_grad() tap_feats, pred_proposals = tap_model(c3d_feats) tap_loss = tap_crit(pred_proposals, tap_masks_for_loss, tap_labels, w1) loss_sum[0] = loss_sum[0] + tap_loss.item() # Backward Propagation if flag_training_what == 'tap': tap_loss.backward() 
utils.clip_gradient(tap_optimizer, opt.grad_clip) if iteration % opt.m_batch == 0: tap_optimizer.step() else: if not sc_flag: pred_captions = cg_model(tap_feats, c3d_feats, lda_feats, cg_labels, ind_select_list, soi_select_list, mode='train') cg_loss = cg_crit(pred_captions, cg_labels[:, 1:], cg_masks[:, 1:]) else: gen_result, sample_logprobs, greedy_res = cg_model( tap_feats, c3d_feats, lda_feats, cg_labels, ind_select_list, soi_select_list, mode='train_rl') sentence_info = data['sentences_batch'] if ( flag_training_what != 'cg' and flag_training_what != 'gt_tap_cg' ) else data['gts_sentences_batch'] reward = get_self_critical_reward2( greedy_res, (data['vid'], sentence_info), gen_result, vocab=loader.get_vocab(), opt=opt) cg_loss = rl_crit(sample_logprobs, gen_result, torch.from_numpy(reward).float().cuda()) loss_sum[1] = loss_sum[1] + cg_loss.item() if flag_training_what == 'cg' or flag_training_what == 'gt_tap_cg' or flag_training_what == 'LP_cg': cg_loss.backward() utils.clip_gradient(cg_optimizer, opt.grad_clip) if iteration % opt.m_batch == 0: cg_optimizer.step() if flag_training_what == 'gt_tap_cg': utils.clip_gradient(tap_optimizer, opt.grad_clip) if iteration % opt.m_batch == 0: tap_optimizer.step() elif flag_training_what == 'tap_cg': total_loss = opt.lambda1 * tap_loss + opt.lambda2 * cg_loss total_loss.backward() utils.clip_gradient(tap_optimizer, opt.grad_clip) utils.clip_gradient(cg_optimizer, opt.grad_clip) if iteration % opt.m_batch == 0: tap_optimizer.step() cg_optimizer.step() loss_sum[2] = loss_sum[2] + total_loss.item() torch.cuda.synchronize() # Updating epoch num iteration += 1 if data['bounds']['wrapped']: epoch += 1 update_lr_flag = True # Print losses, Add to summary if iteration % opt.losses_log_every == 0: end = time.time() losses = np.round(loss_sum / opt.losses_log_every, 3) logger.info( "iter {} (epoch {}, lr {}), avg_iter_loss({}) = {}, time/batch = {:.3f}, bad_vid = {:.3f}" \ .format(iteration, epoch, opt.current_lr, 
flag_training_what, losses, (end - start) / opt.losses_log_every, bad_video_num)) tf_writer.add_scalar('lr', opt.current_lr, iteration) tf_writer.add_scalar('train_tap_loss', losses[0], iteration) tf_writer.add_scalar('train_tap_prop_loss', losses[3], iteration) tf_writer.add_scalar('train_tap_bound_loss', losses[4], iteration) tf_writer.add_scalar('train_cg_loss', losses[1], iteration) tf_writer.add_scalar('train_total_loss', losses[2], iteration) if sc_flag and (not flag_training_what == 'tap'): tf_writer.add_scalar('avg_reward', np.mean(reward[:, 0]), iteration) loss_history[iteration] = losses lr_history[iteration] = opt.current_lr loss_sum = np.zeros(5) start = time.time() bad_video_num = 0 # Evaluation, and save model if (iteration % opt.save_checkpoint_every == 0) and (epoch >= opt.min_epoch_when_save): eval_kwargs = { 'split': 'val', 'val_all_metrics': 0, 'topN': 100, } eval_kwargs.update(vars(opt)) # eval_kwargs['num_vids_eval'] = int(491) eval_kwargs['topN'] = 100 eval_kwargs2 = { 'split': 'val', 'val_all_metrics': 1, 'num_vids_eval': 4917, } eval_kwargs2.update(vars(opt)) if not opt.num_vids_eval: eval_kwargs['num_vids_eval'] = int(4917.) 
eval_kwargs2['num_vids_eval'] = 4917 crits = [tap_crit, cg_crit] pred_json_path_T = os.path.join(save_folder, 'pred_sent', 'pred_num{}_iter{}.json') # if 'alter' in opt.training_mode: if flag_training_what == 'tap': eval_kwargs['topN'] = 1000 predictions, eval_score, val_loss = eval_utils.eval_split( allmodels, crits, loader, pred_json_path_T.format(eval_kwargs['num_vids_eval'], iteration), eval_kwargs, flag_eval_what='tap') else: if vars(opt).get('fast_eval_cg', False) == False: predictions, eval_score, val_loss = eval_utils.eval_split( allmodels, crits, loader, pred_json_path_T.format(eval_kwargs['num_vids_eval'], iteration), eval_kwargs, flag_eval_what='tap_cg') predictions2, eval_score2, val_loss2 = eval_utils.eval_split( allmodels, crits, loader, pred_json_path_T.format(eval_kwargs2['num_vids_eval'], iteration), eval_kwargs2, flag_eval_what='cg') if (not vars(opt).get('fast_eval_cg', False) == False) or (not vars(opt).get( 'fast_eval_cg_top10', False) == False): eval_score = eval_score2 val_loss = val_loss2 predictions = predictions2 # else: # predictions, eval_score, val_loss = eval_utils.eval_split(allmodels, crits, loader, pred_json_path, # eval_kwargs, # flag_eval_what=flag_training_what) f_f1 = lambda x, y: 2 * x * y / (x + y) f1 = f_f1(eval_score['Recall'], eval_score['Precision']).mean() if flag_training_what != 'tap': # if only train tap, use the mean of precision and recall as final score current_score = np.array(eval_score['METEOR']).mean() * 100 else: # if train tap_cg, use avg_meteor as final score current_score = f1 for model in allmodels: for name, param in model.named_parameters(): tf_writer.add_histogram(name, param.clone().cpu().data.numpy(), iteration, bins=10) if param.grad is not None: tf_writer.add_histogram( name + '_grad', param.grad.clone().cpu().data.numpy(), iteration, bins=10) tf_writer.add_scalar('val_tap_loss', val_loss[0], iteration) tf_writer.add_scalar('val_cg_loss', val_loss[1], iteration) 
tf_writer.add_scalar('val_tap_prop_loss', val_loss[3], iteration) tf_writer.add_scalar('val_tap_bound_loss', val_loss[4], iteration) tf_writer.add_scalar('val_total_loss', val_loss[2], iteration) tf_writer.add_scalar('val_score', current_score, iteration) if flag_training_what != 'tap': tf_writer.add_scalar('val_score_gt_METEOR', np.array(eval_score2['METEOR']).mean(), iteration) tf_writer.add_scalar('val_score_gt_Bleu_4', np.array(eval_score2['Bleu_4']).mean(), iteration) tf_writer.add_scalar('val_score_gt_CIDEr', np.array(eval_score2['CIDEr']).mean(), iteration) tf_writer.add_scalar('val_recall', eval_score['Recall'].mean(), iteration) tf_writer.add_scalar('val_precision', eval_score['Precision'].mean(), iteration) tf_writer.add_scalar('f1', f1, iteration) val_result_history[iteration] = { 'val_loss': val_loss, 'eval_score': eval_score } if flag_training_what == 'tap': logger.info( 'Validation the result of iter {}, score(f1/meteor):{},\n all:{}' .format(iteration, current_score, eval_score)) else: mean_score = { k: np.array(v).mean() for k, v in eval_score.items() } gt_mean_score = { k: np.array(v).mean() for k, v in eval_score2.items() } metrics = ['Bleu_4', 'CIDEr', 'METEOR', 'ROUGE_L'] gt_avg_score = np.array([ v for metric, v in gt_mean_score.items() if metric in metrics ]).sum() logger.info( 'Validation the result of iter {}, score(f1/meteor):{},\n all:{}\n mean:{} \n\n gt:{} \n mean:{}\n avg_score: {}' .format(iteration, current_score, eval_score, mean_score, eval_score2, gt_mean_score, gt_avg_score)) # Save model .pth saved_pth = { 'iteration': iteration, 'cg_model': cg_model.state_dict(), 'tap_model': tap_model.state_dict(), 'cg_optimizer': cg_optimizer.state_dict(), 'tap_optimizer': tap_optimizer.state_dict(), } if opt.save_all_checkpoint: checkpoint_path = os.path.join( save_folder, 'model_iter_{}.pth'.format(iteration)) else: checkpoint_path = os.path.join(save_folder, 'model.pth') torch.save(saved_pth, checkpoint_path) logger.info('Save model at iter 
{} to checkpoint file {}.'.format( iteration, checkpoint_path)) # save info.pkl if current_score > best_val_score: best_val_score = current_score best_epoch = epoch saved_info['best'] = { 'opt': opt, 'iter': iteration, 'epoch': epoch, 'iterators': loader.iterators, 'flag_training_what': flag_training_what, 'split_ix': loader.split_ix, 'best_val_score': best_val_score, 'vocab': loader.get_vocab(), } best_checkpoint_path = os.path.join(save_folder, 'model-best.pth') torch.save(saved_pth, best_checkpoint_path) logger.info( 'Save Best-model at iter {} to checkpoint file.'.format( iteration)) saved_info['last'] = { 'opt': opt, 'iter': iteration, 'epoch': epoch, 'iterators': loader.iterators, 'flag_training_what': flag_training_what, 'split_ix': loader.split_ix, 'best_val_score': best_val_score, 'vocab': loader.get_vocab(), } saved_info['history'] = { 'val_result_history': val_result_history, 'loss_history': loss_history, 'lr_history': lr_history, } with open(os.path.join(save_folder, 'info.pkl'), 'w') as f: cPickle.dump(saved_info, f) logger.info('Save info to info.pkl') # Stop criterion if epoch >= len(flag_training_whats): tf_writer.close() break
def train(opt):
    """Train a captioning model with cross-entropy, switching to self-critical
    RL after ``opt.self_critical_after`` epochs.

    Runs an endless batch loop that periodically logs to tensorboard,
    evaluates on the ``val`` split, and checkpoints ``model.pth`` /
    ``optimizer.pth`` (plus ``model-best.pth`` on improvement) under
    ``opt.checkpoint_path``.  Training stops when ``opt.max_epochs`` is
    reached (unless it is -1).

    Args:
        opt: argparse-style namespace of all training options; mutated in
            place (``vocab_size``, ``seq_length``, ``current_lr``, ...).
    """
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box:
        opt.att_feat_size = opt.att_feat_size + 5

    torch.cuda.set_device(opt.device_num)  #AK
    print(torch.cuda.current_device())  #AK

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            # need_be_same=["caption_model", "rnn_type", "rnn_size", "num_layers"] #for now removed caption_model
            need_be_same = ["rnn_type", "rnn_size", "num_layers"]  #for now removed caption_model
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # FIX: best_val_score must exist even when opt.load_best_score != 1,
    # otherwise the first checkpoint evaluation raises NameError below.
    best_val_score = None
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    # pdb.set_trace()
    model = models.setup(opt).cuda(opt.device_num)
    dp_model = model  #torch.nn.DataParallel(model) variable length in RNN, unable to use parallelism

    update_lr_flag = True
    # Assure in training mode
    dp_model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    # Build the optimizer once; optionally wrap it with the plateau scheduler.
    optimizer = utils.build_optimizer(model.parameters(), opt)
    if opt.reduce_on_plateau:
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth'),
                       map_location="cuda:" + str(opt.device_num)))
        # optimizer.to(opt.device_num)

    # FIX: opt.current_lr is printed every iteration, but with
    # reduce_on_plateau the decay branch below never assigns it; give it a
    # sane initial value so the per-iteration print cannot fail.
    opt.current_lr = opt.learning_rate

    while True:
        if update_lr_flag:
            if not opt.reduce_on_plateau:
                # Assign the learning rate
                if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                    frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                    decay_factor = opt.learning_rate_decay_rate ** frac
                    opt.current_lr = opt.learning_rate * decay_factor
                else:
                    opt.current_lr = opt.learning_rate
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['fc_feats'], data['att_feats'], data['labels'],
               data['masks'], data['att_masks']]
        # att_masks may be None for fixed-size features; only move real arrays.
        tmp = [_ if _ is None else torch.from_numpy(_).cuda(opt.device_num) for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp

        optimizer.zero_grad()
        if not sc_flag:
            # Cross-entropy: predict labels[:,1:] given teacher-forced labels.
            loss = crit(dp_model(fc_feats, att_feats, labels, att_masks),
                        labels[:, 1:], masks[:, 1:])
        else:
            # Self-critical: sampled captions rewarded against greedy baseline.
            gen_result, sample_logprobs = dp_model(fc_feats, att_feats, att_masks,
                                                   opt={'sample_max': 0}, mode='sample')
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                              att_masks, data, gen_result, opt)
            loss = rl_crit(sample_logprobs, gen_result.data,
                           torch.from_numpy(reward).float().cuda(opt.device_num))

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag:
            print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        print("Iter: {}, Loss: {}, Epoc: {}, LR: {}".format(
            iteration, train_loss, epoch, opt.current_lr))

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss, iteration)
            if opt.reduce_on_plateau:
                opt.current_lr = optimizer.current_lr
            add_summary_value(tb_summary_writer, 'learning_rate', opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob', model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward', np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)

            if opt.reduce_on_plateau:
                # Step the scheduler on -CIDEr when available, else val loss.
                if 'CIDEr' in lang_stats:
                    optimizer.scheduler_step(-lang_stats['CIDEr'])
                else:
                    optimizer.scheduler_step(val_loss)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss, iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {'loss': val_loss,
                                             'lang_stats': lang_stats,
                                             'predictions': predictions}

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

            # Dump miscalleous informations
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()

            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['lr_history'] = lr_history
            histories['ss_prob_history'] = ss_prob_history
            with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(os.path.join(opt.checkpoint_path, 'histories_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)

            if best_flag:
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '-best.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    """Train the joint captioning + visual-semantic-embedding (VSE) model.

    Endless batch loop over the ``train`` split; logs per-loss scalars to
    tensorboard, periodically evaluates on ``val``, and checkpoints both the
    best captioning model (``model-best.pth``) and the best retrieval model
    (``model_vse-best.pth``) under ``opt.checkpoint_path``.  Stops when
    ``opt.max_epochs`` is reached (unless it is -1).

    Args:
        opt: argparse-style namespace of all training options; mutated in
            place (``vocab_size``, ``seq_length``, ``current_lr``, ...).
    """
    opt.use_att = utils.if_use_att(opt)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    tf_summary_writer = tf and SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # FIX: both best scores must exist even when opt.load_best_score != 1,
    # otherwise the first checkpoint evaluation raises NameError below.
    best_val_score = None
    best_val_score_vse = None
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
        best_val_score_vse = infos.get('best_val_score_vse', None)

    model = models.JointModel(opt)
    model.cuda()

    update_lr_flag = True
    # Assure in training mode
    model.train()

    # Only optimize parameters that require grad (frozen submodules skipped).
    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)

    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, 'optimizer.pth')):
        state_dict = torch.load(os.path.join(opt.start_from, 'optimizer.pth'))
        if len(state_dict['state']) == len(optimizer.state_dict()['state']):
            optimizer.load_state_dict(state_dict)
        else:
            print('Optimizer param group number not matched? There must be new parameters. Reinit the optimizer.')

    init_scorer(opt.cached_tokens)
    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate ** frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.caption_generator.ss_prob = opt.ss_prob
            # Assign retrieval loss weight
            if epoch > opt.retrieval_reward_weight_decay_start and opt.retrieval_reward_weight_decay_start >= 0:
                frac = (epoch - opt.retrieval_reward_weight_decay_start) // opt.retrieval_reward_weight_decay_every
                model.retrieval_reward_weight = opt.retrieval_reward_weight * (
                    opt.retrieval_reward_weight_decay_rate ** frac)
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        tmp = [data['fc_feats'], data['att_feats'], data['att_masks'],
               data['labels'], data['masks']]
        tmp = utils.var_wrapper(tmp)
        fc_feats, att_feats, att_masks, labels, masks = tmp

        optimizer.zero_grad()

        # The joint model computes its own combined loss internally.
        loss = model(fc_feats, att_feats, att_masks, labels, masks, data)
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, end - start))
        prt_str = ""
        for k, v in model.loss().items():
            prt_str += "{} = {:.3f} ".format(k, v)
        print(prt_str)

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                tf_summary_writer.add_scalar('train_loss', train_loss, iteration)
                for k, v in model.loss().items():
                    tf_summary_writer.add_scalar(k, v, iteration)
                tf_summary_writer.add_scalar('learning_rate', opt.current_lr, iteration)
                tf_summary_writer.add_scalar('scheduled_sampling_prob',
                                             model.caption_generator.ss_prob, iteration)
                tf_summary_writer.add_scalar('retrieval_reward_weight',
                                             model.retrieval_reward_weight, iteration)
                tf_summary_writer.file_writer.flush()

            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.caption_generator.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            # Load the retrieval model for evaluation
            val_loss, predictions, lang_stats = eval_utils.eval_split(model, loader, eval_kwargs)

            # Write validation result into summary
            if tf is not None:
                for k, v in val_loss.items():
                    tf_summary_writer.add_scalar('validation ' + k, v, iteration)
                for k, v in lang_stats.items():
                    tf_summary_writer.add_scalar(k, v, iteration)
                tf_summary_writer.add_text(
                    'Captions',
                    '.\n\n'.join([_['caption'] for _ in predictions[:100]]),
                    iteration)
                #tf_summary_writer.add_image('images', utils.make_summary_image(), iteration)
                #utils.make_html(opt.id, iteration)
                tf_summary_writer.file_writer.flush()

            val_result_history[iteration] = {'loss': val_loss,
                                             'lang_stats': lang_stats,
                                             'predictions': predictions}

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['SPICE'] * 100
            else:
                current_score = -val_loss['loss_cap']
            current_score_vse = val_loss.get(opt.vse_eval_criterion, 0) * 100

            best_flag = False
            best_flag_vse = False
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
            if best_val_score_vse is None or current_score_vse > best_val_score_vse:
                best_val_score_vse = current_score_vse
                best_flag_vse = True
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model.pth')
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            checkpoint_path = os.path.join(opt.checkpoint_path, 'model-%d.pth' % (iteration))
            torch.save(model.state_dict(), checkpoint_path)
            print("model saved to {}".format(checkpoint_path))
            optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer.pth')
            torch.save(optimizer.state_dict(), optimizer_path)

            # Dump miscalleous informations
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            infos['best_val_score'] = best_val_score
            infos['best_val_score_vse'] = best_val_score_vse
            infos['opt'] = opt
            infos['vocab'] = loader.get_vocab()

            histories['val_result_history'] = val_result_history
            histories['loss_history'] = loss_history
            histories['lr_history'] = lr_history
            histories['ss_prob_history'] = ss_prob_history
            with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
            with open(os.path.join(opt.checkpoint_path,
                                   'infos_' + opt.id + '-%d.pkl' % (iteration)), 'wb') as f:
                cPickle.dump(infos, f)
            with open(os.path.join(opt.checkpoint_path, 'histories_' + opt.id + '.pkl'), 'wb') as f:
                cPickle.dump(histories, f)

            if best_flag:
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '-best.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
            if best_flag_vse:
                checkpoint_path = os.path.join(opt.checkpoint_path, 'model_vse-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                with open(os.path.join(opt.checkpoint_path, 'infos_vse_' + opt.id + '-best.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(rank, model, opt, optimizer=None):
    """Per-process training loop for show-tell / review-net / recurrent-fusion
    captioning models (one worker per ``rank``).

    Seeds RNGs with ``opt.seed + rank``, optionally resumes from
    ``opt.start_from``, then loops forever over the ``train`` split:
    forward/backward/step, periodic validation via ``eval_utils.eval_split``,
    per-rank checkpointing of the best model, and early exit (``sys.exit``)
    after ``opt.num_eval_no_improve`` evaluations without improvement.
    Stops normally when ``opt.max_epochs`` is reached (unless it is -1).

    Args:
        rank: integer worker id; used for seeding and checkpoint filenames.
        model: the captioning model to train (already constructed by caller).
        opt: argparse-style namespace of all options; mutated in place.
        optimizer: optional pre-built optimizer; if None one is created from
            ``opt.optim`` and, when resuming, its state is loaded from disk.
    """
    # Per-rank deterministic seeding.
    torch.manual_seed(opt.seed + rank)
    if opt.use_cuda:
        torch.cuda.manual_seed(opt.seed + rank)

    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    infos = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.load_model_id + '.pkl'), 'rb') as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme], \
                    "Command line argument and saved model disagree on '%s' " % checkme

    # Resume bookkeeping state (histories live inside infos in this variant).
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    ss_prob_history = infos.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_image_id = infos.get('split_image_id', loader.split_image_id)
    best_val_score = 0
    if opt.load_best_score == 1:
        # NOTE(review): infos.get(..., None) can reset best_val_score to None;
        # the "{:.3f}" print at the bottom would then fail — confirm resumed
        # infos always carry a numeric 'best_val_score'.
        best_val_score = infos.get('best_val_score', None)

    update_lr_flag = True

    # Pick the criterion matching the architecture.
    if opt.caption_model == 'show_tell':
        crit = utils.LanguageModelCriterion(opt)
    elif opt.caption_model == 'review_net':
        crit = utils.ReviewNetCriterion(opt)
    elif opt.caption_model == 'recurrent_fusion_model':
        crit = utils.ReviewNetEnsembleCriterion(opt)
    else:
        raise Exception("caption_model not supported: {}".format(opt.caption_model))

    if optimizer is None:
        if opt.optim == 'adam':
            optimizer = optim.Adam(model.parameters(),
                                   lr=opt.optim_lr,
                                   betas=(opt.optim_adam_beta1, opt.optim_adam_beta2),
                                   weight_decay=opt.optim_weight_decay)
        elif opt.optim == 'rmsprop':
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=opt.optim_lr,
                                      momentum=opt.optim_momentum,
                                      alpha=opt.optim_rmsprop_alpha,
                                      weight_decay=opt.weight_decay)
        elif opt.optim == 'sgd':
            optimizer = optim.SGD(model.parameters(),
                                  lr=opt.optim_lr,
                                  momentum=opt.optim_momentum,
                                  weight_decay=opt.optim_weight_decay)
        elif opt.optim == 'adagrad':
            optimizer = optim.Adagrad(model.parameters(),
                                      lr=opt.optim_lr,
                                      lr_decay=opt.optim_lr_decay,
                                      weight_decay=opt.optim_weight_decay)
        elif opt.optim == 'adadelta':
            optimizer = optim.Adadelta(model.parameters(),
                                       rho=opt.optim_rho,
                                       eps=opt.optim_epsilon,
                                       lr=opt.optim_lr,
                                       weight_decay=opt.optim_weight_decay)
        else:
            # NOTE(review): error message interpolates opt.feature_type, not
            # opt.optim — looks like a copy-paste slip; confirm before changing.
            raise Exception("optim not supported: {}".format(opt.feature_type))

        # Load the optimizer
        if vars(opt).get('start_from', None) is not None:
            optimizer.load_state_dict(
                torch.load(os.path.join(opt.start_from,
                                        'optimizer_' + opt.load_model_id + '.pth')))

    num_period_best = 0
    current_score = 0
    while True:
        if update_lr_flag:
            # Assign the learning rate
            if epoch > opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate ** frac
                opt.current_lr = opt.optim_lr * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.optim_lr
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        if opt.use_cuda:
            torch.cuda.synchronize()

        # 'feat_array' feeds a list of feature sets (one per CNN) to the
        # fusion model; the plain path feeds single fc/att feature tensors.
        if opt.feature_type == 'feat_array':
            fc_feat_array = data['fc_feats_array']
            att_feat_array = data['att_feats_array']
            assert (len(fc_feat_array) == len(att_feat_array))
            for feat_id in range(len(fc_feat_array)):
                if opt.use_cuda:
                    fc_feat_array[feat_id] = Variable(
                        torch.from_numpy(fc_feat_array[feat_id]),
                        requires_grad=False).cuda()
                    att_feat_array[feat_id] = Variable(
                        torch.from_numpy(att_feat_array[feat_id]),
                        requires_grad=False).cuda()
                else:
                    fc_feat_array[feat_id] = Variable(torch.from_numpy(
                        fc_feat_array[feat_id]), requires_grad=False)
                    att_feat_array[feat_id] = Variable(torch.from_numpy(
                        att_feat_array[feat_id]), requires_grad=False)

            tmp = [data['labels'], data['masks'], data['top_words']]
            if opt.use_cuda:
                tmp = [Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp]
            else:
                tmp = [Variable(torch.from_numpy(_), requires_grad=False) for _ in tmp]
            labels, masks, top_words = tmp
        else:
            tmp = [data['fc_feats'], data['att_feats'], data['labels'],
                   data['masks'], data['top_words']]
            if opt.use_cuda:
                tmp = [Variable(torch.from_numpy(_), requires_grad=False).cuda() for _ in tmp]
            else:
                tmp = [Variable(torch.from_numpy(_), requires_grad=False) for _ in tmp]
            fc_feats, att_feats, labels, masks, top_words = tmp

        optimizer.zero_grad()
        if opt.caption_model == 'show_tell':
            log_prob = model(fc_feats, att_feats, labels)  # (80L, 16L, 9488L)
            loss = crit(log_prob, labels[:, 1:], masks[:, 1:])
        elif opt.caption_model == 'review_net':
            # Review-net additionally predicts discriminative "top words".
            log_prob, top_pred = model(fc_feats, att_feats, labels)  # (80L, 16L, 9488L)
            loss = crit(log_prob, labels[:, 1:], masks[:, 1:], top_pred,
                        top_words, opt.reason_weight)
        elif opt.caption_model == 'recurrent_fusion_model':
            log_prob, top_pred = model(fc_feat_array, att_feat_array, labels)  # (80L, 16L, 9488L)
            loss = crit(log_prob, labels[:, 1:], masks[:, 1:], top_pred,
                        top_words, opt.reason_weight)
        else:
            raise Exception("caption_model not supported: {}".format(opt.caption_model))

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        # Old (pre-0.4) PyTorch API: .data[0] extracts the scalar loss.
        train_loss = loss.data[0]
        if opt.use_cuda:
            torch.cuda.synchronize()
        end = time.time()

        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if iteration % opt.losses_log_every == 0:
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if iteration % opt.save_checkpoint_every == 0:
            # eval model
            eval_kwargs = {
                'eval_split': 'val',
                'dataset': opt.input_json,
                'caption_model': opt.caption_model,
                'reason_weight': opt.reason_weight,
                'guiding_l1_penality': opt.guiding_l1_penality,
                'use_cuda': opt.use_cuda,
                'feature_type': opt.feature_type,
                'rank': rank,
                'val_images_use': opt.val_images_use,
                'language_eval': 1
            }
            # vars(opt) overwrites the explicit keys above, so re-pin the split.
            eval_kwargs.update(vars(opt))
            eval_kwargs['eval_split'] = 'val'
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, crit, loader, eval_kwargs)

            val_result_history[iteration] = {'loss': val_loss,
                                             'lang_stats': lang_stats,
                                             'predictions': predictions}

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                best_flag = True
                num_period_best = 1
            else:
                num_period_best = num_period_best + 1

            # Dump miscalleous informations
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_image_id'] = loader.split_image_id
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['val_result_history'] = val_result_history
            infos['loss_history'] = loss_history
            infos['lr_history'] = lr_history
            infos['ss_prob_history'] = ss_prob_history
            infos['vocab'] = loader.get_vocab()
            with open(os.path.join(opt.checkpoint_path,
                                   'infos_' + opt.id + '_' + str(rank) + '.pkl'), 'wb') as f:
                cPickle.dump(infos, f)

            if best_flag:
                checkpoint_path = os.path.join(
                    opt.checkpoint_path,
                    'model_' + opt.id + '_' + str(rank) + '-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                optimizer_path = os.path.join(
                    opt.checkpoint_path,
                    'optimizer_' + opt.id + '_' + str(rank) + '-best.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                print("model saved to {}".format(checkpoint_path))
                with open(os.path.join(
                        opt.checkpoint_path,
                        'infos_' + opt.id + '_' + str(rank) + '-best.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)

            # Early stopping: hard-exit the process after too many
            # evaluations without improvement.
            if num_period_best >= opt.num_eval_no_improve:
                print('no improvement, exit')
                sys.exit()

        print(
            "rank {}, iter {}, (epoch {}), train loss: {}, learning rate: {}, current cider: {:.3f}, best cider: {:.3f}, time: {:.3f}"
            .format(rank, iteration, epoch, train_loss, opt.current_lr,
                    current_score, best_val_score, (end - start)))
        iteration += 1
        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    """Jointly train a ResNet-152 feature CNN and a MIXER caption model.

    Alternates between cross-entropy training and REINFORCE once
    ``opt.reinforce_start`` epochs are reached; the CNN is frozen until
    ``opt.finetune_cnn_after`` epochs have elapsed.

    Args:
        opt: argparse.Namespace of training options (presumably produced by
            the project's opts module — TODO confirm). Mutated in place with
            ``vocab_size``, ``seq_length`` and ``current_lr``.
    """
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    infos = {}
    if opt.start_from is not None and len(opt.start_from) > 0:
        print("start from %s" % (opt.start_from))
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
    # Restore training progress from the checkpoint (or start fresh).
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    if opt.load_best_score == 1:
        # NOTE(review): if load_best_score != 1, best_val_score is never
        # bound here and the comparison at checkpoint time would raise
        # NameError — verify the intended default.
        best_val_score = infos.get('best_val_score', None)
    model_cnn = models.setup_resnet152(opt)
    model_cnn.cuda()
    model = models.setup_mixer(opt)
    model.cuda()
    update_lr_flag = True
    is_reinforce = False
    model_cnn.train()
    model.train()
    # Expanders replicate each image feature 5 times (presumably one copy
    # per ground-truth caption — TODO confirm against FeatExpander).
    fc_expander = utils.FeatExpander(5)
    att_expander = utils.FeatExpander(5)
    crit = Criterion.LanguageModelCriterion()
    crit_reinforce = ReinforceCriterion.ReinforceCriterion()
    # Separate optimizers: the CNN gets its own lr and weight decay.
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    optimizer_cnn = optim.Adam(model_cnn.parameters(),
                               lr=opt.cnn_learning_rate,
                               weight_decay=opt.cnn_weight_decay)
    # Load the optimizer
    if opt.start_from is not None and len(opt.start_from) > 0:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        optimizer_cnn.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer_cnn.pth')))
    while True:
        if update_lr_flag:
            # Assign the learning rate (stepwise exponential decay).
            if opt.learning_rate_decay_start >= 0 and epoch >= opt.learning_rate_decay_start:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Freeze/unfreeze the CNN depending on the finetune schedule.
            if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
                for p in model_cnn.parameters():
                    p.requires_grad = True
                model_cnn.train()
            else:
                for p in model_cnn.parameters():
                    p.requires_grad = False
                model_cnn.eval()
            update_lr_flag = False
        # Switch to REINFORCE once the configured epoch is reached.
        if opt.reinforce_start >= 0 and epoch >= opt.reinforce_start:
            is_reinforce = True
        start_total = time.time()
        data = loader.get_batch('train')
        tmp = [data['images'], data['labels']]
        # Variable(...) wrapping: pre-0.4 PyTorch idiom.
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        images, labels = tmp
        images = utils.prepro(images, True)
        fc_feats, att_feats = model_cnn(images)
        # Replicate features so each of the 5 labels per image sees them.
        fc_feats_ext = fc_expander(fc_feats)
        att_feats_ext = att_expander(att_feats)
        optimizer.zero_grad()
        if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
            optimizer_cnn.zero_grad()
        output = model(fc_feats_ext, att_feats_ext, labels, is_reinforce)
        if is_reinforce:
            loss = crit_reinforce(output, labels)
        else:
            loss = crit(output, labels)
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
            utils.clip_gradient(optimizer_cnn, opt.grad_clip)
            optimizer_cnn.step()
        # `.data[0]` is pre-0.4 PyTorch scalar access (would be `.item()` now).
        train_loss = loss.data[0]
        print("iter {} (epoch {}), train_loss = {:.3f}, reinforce = {} time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, is_reinforce, time.time() - start_total))
        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats, str_stats = eval_utils.eval_split(
                model_cnn, model, crit, loader, eval_kwargs)
            if not os.path.exists(opt.eval_result_path):
                os.makedirs(opt.eval_result_path)
            # Append per-eval stats to a CSV and overwrite the predictions JSON.
            eval_result_file = os.path.join(opt.eval_result_path,
                                            opt.id + ".csv")
            with open(eval_result_file, 'a') as f:
                f.write(str_stats + "\n")
            predictions_file = os.path.join(opt.eval_result_path,
                                            opt.id + ".json")
            with open(predictions_file, 'w') as f:
                json.dump(predictions, f)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }
            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss
            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                if not os.path.exists(opt.checkpoint_path):
                    os.makedirs(opt.checkpoint_path)
                # Always save the "latest" model/optimizer pair...
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                checkpoint_path_cnn = os.path.join(opt.checkpoint_path,
                                                   'model_cnn.pth')
                torch.save(model_cnn.state_dict(), checkpoint_path_cnn)
                print("model cnn saved to {}".format(checkpoint_path_cnn))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                print("optimizer saved to {}".format(optimizer_path))
                optimizer_path_cnn = os.path.join(opt.checkpoint_path,
                                                  'optimizer_cnn.pth')
                torch.save(optimizer_cnn.state_dict(), optimizer_path_cnn)
                print("optimizer cnn saved to {}".format(optimizer_path_cnn))
                # ...and refresh the resumable training metadata.
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['val_result_history'] = val_result_history
                infos['loss_history'] = loss_history
                infos['lr_history'] = lr_history
                infos['vocab'] = loader.get_vocab()
                info_path = os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '.pkl')
                with open(info_path, 'wb') as f:
                    cPickle.dump(infos, f)
                # ...plus a "-best" copy when validation improved.
                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model_best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    checkpoint_path_cnn = os.path.join(
                        opt.checkpoint_path, 'model_cnn_best.pth')
                    torch.save(model_cnn.state_dict(), checkpoint_path_cnn)
                    print("model cnn saved to {}".format(checkpoint_path_cnn))
                    info_path = os.path.join(opt.checkpoint_path,
                                             'infos_' + opt.id + '_best.pkl')
                    with open(info_path, 'wb') as f:
                        cPickle.dump(infos, f)
        # Stop if reaching max epochs (-1 means train forever).
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def train(opt):
    """Train a captioning model with optional self-critical (SCST) phase.

    Supports gradient accumulation (``opt.acc_steps``), Noam or
    reduce-on-plateau learning-rate schedules, scheduled sampling, AAT
    diagnostics, and checkpoint-on-crash via the try/except wrapper.

    Args:
        opt: argparse.Namespace of training options; mutated in place
            (``vocab_size``, ``seq_length``, ``current_lr``, ``ss_prob``).
    """
    # Deal with feature things before anything
    opt.use_fc, opt.use_att = utils.if_use_feat(opt.caption_model)
    if opt.use_box:
        # 5 extra channels — presumably box geometry appended to each
        # attention feature; TODO confirm against the loader.
        opt.att_feat_size = opt.att_feat_size + 5
    acc_steps = getattr(opt, 'acc_steps', 1)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    # `tb and ...` keeps the writer None when tensorboard is unavailable.
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)
    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'),
                  'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl'), 'rb') as f:
                histories = utils.pickle_load(f)
    else:
        # Fresh run: seed infos with the initial training state.
        infos['iter'] = 0
        infos['epoch'] = 0
        infos['iterators'] = loader.iterators
        infos['split_ix'] = loader.split_ix
        infos['vocab'] = loader.get_vocab()
    infos['opt'] = opt
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    if opt.load_best_score == 1:
        # NOTE(review): best_val_score is undefined when load_best_score != 1;
        # the comparison below would then raise NameError — verify intent.
        best_val_score = infos.get('best_val_score', None)
    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    # vocab is only needed during model construction.
    del opt.vocab
    dp_model = torch.nn.DataParallel(model)
    # LossWrapper computes the loss inside the module so DataParallel
    # evaluates it per-GPU instead of gathering logits first.
    lw_model = LossWrapper(model, opt)
    dp_lw_model = torch.nn.DataParallel(lw_model)
    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()
    if opt.noamopt:
        assert opt.caption_model in [
            'transformer', 'aoa'
        ], 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
        # Resume the Noam schedule from the restored iteration count.
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    def save_checkpoint(model, infos, optimizer, histories=None, append=''):
        # Save model + optimizer + infos (+ histories) under
        # checkpoint_path, optionally suffixed with '-<append>'.
        if len(append) > 0:
            append = '-' + append
        # if checkpoint_path doesn't exist
        if not os.path.isdir(opt.checkpoint_path):
            os.makedirs(opt.checkpoint_path)
        checkpoint_path = os.path.join(opt.checkpoint_path,
                                       'model%s.pth' % (append))
        torch.save(model.state_dict(), checkpoint_path)
        print("model saved to {}".format(checkpoint_path))
        optimizer_path = os.path.join(opt.checkpoint_path,
                                      'optimizer%s.pth' % (append))
        torch.save(optimizer.state_dict(), optimizer_path)
        with open(
                os.path.join(opt.checkpoint_path,
                             'infos_' + opt.id + '%s.pkl' % (append)),
                'wb') as f:
            utils.pickle_dump(infos, f)
        if histories:
            with open(
                    os.path.join(opt.checkpoint_path,
                                 'histories_' + opt.id + '%s.pkl' % (append)),
                    'wb') as f:
                utils.pickle_dump(histories, f)

    try:
        while True:
            if epoch_done:
                # Per-epoch bookkeeping: lr decay, scheduled sampling, SCST.
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start
                                ) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer,
                                 opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start
                            ) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(
                        opt.scheduled_sampling_increase_prob * frac,
                        opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob
                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False
                epoch_done = False
            start = time.time()
            # Linear warmup of the lr during the first noamopt_warmup steps.
            if (opt.use_warmup == 1) and (iteration < opt.noamopt_warmup):
                opt.current_lr = opt.learning_rate * (iteration +
                                                      1) / opt.noamopt_warmup
                utils.set_lr(optimizer, opt.current_lr)
            # Load data from train split (0)
            data = loader.get_batch('train')
            print('Read data:', time.time() - start)
            # Gradient accumulation: only clear grads every acc_steps batches.
            if (iteration % acc_steps == 0):
                optimizer.zero_grad()
            torch.cuda.synchronize()
            start = time.time()
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks,
                                    att_masks, data['gts'],
                                    torch.arange(0, len(data['gts'])), sc_flag)
            loss = model_out['loss'].mean()
            # Scale so the accumulated gradient equals the mean over acc_steps.
            loss_sp = loss / acc_steps
            loss_sp.backward()
            if ((iteration + 1) % acc_steps == 0):
                utils.clip_gradient(optimizer, opt.grad_clip)
                optimizer.step()
            torch.cuda.synchronize()
            train_loss = loss.item()
            end = time.time()
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, model_out['reward'].mean(), end - start))
            # Extra diagnostics for the Adaptive Attention Time model.
            if opt.caption_model == 'aat':
                attention_steps = np.array(model_out['att_step']).transpose()
                avg_att_time = model_out['avg_att_time']
                loss_ = model_out['loss_'].mean().item()
                aat_loss = model_out['aat_loss'].mean().item()
                print("AAT: loss_ = {:.3f}, att_loss = {:.3f}, avg_att_time = {:.3f}" \
                    .format(loss_, aat_loss, avg_att_time))
                print(attention_steps[0])
            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True
            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                                  iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                add_summary_value(tb_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                                  model.ss_prob, iteration)
                if sc_flag:
                    add_summary_value(tb_summary_writer, 'avg_reward',
                                      model_out['reward'].mean(), iteration)
                if opt.caption_model == 'aat':
                    add_summary_value(tb_summary_writer, 'loss_', loss_,
                                      iteration)
                    add_summary_value(tb_summary_writer, 'aat_loss', aat_loss,
                                      iteration)
                    add_summary_value(tb_summary_writer, 'avg_att_time',
                                      avg_att_time, iteration)
                # In SCST mode the "loss" history records the mean reward.
                loss_history[
                    iteration] = train_loss if not sc_flag else model_out[
                        'reward'].mean()
                lr_history[iteration] = opt.current_lr
                ss_prob_history[iteration] = model.ss_prob
            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['split_ix'] = loader.split_ix
            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    dp_model, lw_model.crit, loader, eval_kwargs)
                if opt.reduce_on_plateau:
                    # Scheduler minimizes, so pass negated CIDEr.
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                add_summary_value(tb_summary_writer, 'validation loss',
                                  val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        add_summary_value(tb_summary_writer, k, v, iteration)
                val_result_history[iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }
                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss
                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                save_checkpoint(model, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    save_checkpoint(model,
                                    infos,
                                    optimizer,
                                    append=str(iteration))
                if best_flag:
                    save_checkpoint(model, infos, optimizer, append='best')
            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        # Best-effort checkpoint on crash/interrupt, then log the traceback.
        print('Save ckpt on exception ...')
        save_checkpoint(model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
def train(opt):
    """Pretrain the FillInCharacter generator.

    Trains one epoch at a time via ``train_generator`` until
    ``opt.pre_nepoch`` epochs, evaluating and checkpointing whenever the
    loader wraps around the training split.

    Args:
        opt: argparse.Namespace of training options; mutated in place
            with loader-derived fields (vocab, seq_length, etc.).
    """
    # tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)
    if not os.path.exists(opt.checkpoint_path):
        os.makedirs(opt.checkpoint_path)
    # Persist the full configuration alongside the checkpoints.
    with open(os.path.join(opt.checkpoint_path, 'config.json'), 'w') as f:
        json.dump(vars(opt), f, indent=4)
    writer = None
    if tb is not None:
        import shutil
        now = datetime.now()
        # Optionally wipe previous tensorboard runs under checkpoint_path.
        if opt.reset_tensorboard:
            for d in os.listdir(opt.checkpoint_path):
                d = os.path.join(opt.checkpoint_path, d)
                if os.path.isdir(d) and 'tb_' in d:
                    shutil.rmtree(d)
                    print('remove', d)
        # One timestamped log dir per run.
        logdir = os.path.join(opt.checkpoint_path,
                              'tb_' + now.strftime("%Y%m%d-%H%M%S") + "/")
        writer = tb.SummaryWriter(logdir)
    # Load iterators
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.vocab = loader.get_vocab()
    opt.blank_token = loader.get_blank_token()
    opt.seq_length = loader.seq_length
    opt.unique_characters = loader.unique_characters
    opt.max_characters = loader.max_characters
    if opt.glove is not None:
        opt.glove_npy = loader.build_glove(opt.glove)
    else:
        opt.glove_npy = None
    # set up models
    gen_model = FillInCharacter(opt)
    gen_model = gen_model.cuda()
    if torch.cuda.device_count() > 1:
        gen_model = nn.DataParallel(gen_model)
    gen_model.train()
    gen_optimizer = utils.build_optimizer(gen_model.parameters(), opt)
    # keep track of iteration
    g_iter = 0
    g_epoch = 0
    update_lr_flag = True
    # Load from checkpoint path
    infos = {'opt': opt}
    histories = {}
    infos['vocab'] = loader.get_vocab()
    if opt.start_from is not None:
        # Open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos.pkl'), 'rb') as f:
            infos = pickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        # Load train/val histories
        with open(os.path.join(opt.start_from, 'histories.pkl'), 'rb') as f:
            histories = pickle.load(f)
        # Load generator. start_epoch may be a number, "latest", or "best".
        start_epoch = opt.start_epoch
        g_model_path = os.path.join(opt.start_from, "gen_%s.pth" % start_epoch)
        g_optimizer_path = os.path.join(opt.start_from,
                                        "gen_optimizer_%s.pth" % start_epoch)
        assert os.path.isfile(g_model_path) and os.path.isfile(
            g_optimizer_path)
        gen_model.load_state_dict(torch.load(g_model_path))
        gen_optimizer.load_state_dict(torch.load(g_optimizer_path))
        if "latest" not in start_epoch and "best" != start_epoch:
            # Numeric epoch: derive the iteration count from the split size.
            g_epoch = int(start_epoch) + 1
            g_iter = (g_epoch) * loader.split_size['train'] // opt.batch_size
        elif start_epoch == "best":
            g_epoch = infos['g_epoch_' + start_epoch] + 1
            g_iter = (g_epoch) * loader.split_size['train'] // opt.batch_size
        else:
            # "latest": both epoch and iter come from infos.
            g_epoch = infos['g_epoch_' + start_epoch] + 1
            g_iter = infos['g_iter_' + start_epoch]
        print('loaded %s (epoch: %d iter: %d)' %
              (g_model_path, g_epoch, g_iter))
        infos['opt'] = opt
        loader.iterators = infos.get('g_iterators', loader.iterators)
    # misc
    best_val_score = infos.get('g_best_score', None)
    opt.seq_length = loader.seq_length
    opt.video = 1
    g_val_result_history = histories.get('g_val_result_history', {})
    g_loss_history = histories.get('g_loss_history', {})
    """ START TRAINING """
    while g_epoch < opt.pre_nepoch:
        # gc.collect()
        # set every epoch
        if update_lr_flag:
            # Assign the learning rate for generator
            if g_epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (g_epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
            else:
                opt.current_lr = opt.learning_rate
            utils.set_lr(gen_optimizer, opt.current_lr)
            # Assign the scheduled sampling prob
            if g_epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (g_epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                gen_model.ss_prob = opt.ss_prob
            update_lr_flag = False
        """ TRAIN GENERATOR """
        gen_model.train()
        start = time.time()
        # One optimization step; `wrapped` flags the end of the epoch.
        gen_loss, wrapped, sent_num = train_generator(gen_model, gen_optimizer,
                                                      loader, opt.grad_clip)
        end = time.time()
        # Print Info
        if g_iter % opt.losses_print_every == 0:
            print("g_iter {} (g_epoch {}), gen_loss = {:.3f}, time/batch = {:.3f}" \
                .format(g_iter, g_epoch, gen_loss, end - start))
        # Log Losses
        if g_iter % opt.losses_log_every == 0:
            g_loss = gen_loss
            loss_history = {'g_loss': g_loss, 'g_epoch': g_epoch}
            g_loss_history[g_iter] = loss_history
            log_metrics(writer, g_iter, loss_history)
        # Update the iteration
        g_iter += 1
        #########################
        # Evaluate & Save Model #
        #########################
        if wrapped:
            # evaluate model on dev set
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.input_json,
                'sample_max': 1,
                'eval_accuracy': opt.eval_accuracy,
                'id': opt.val_id,
                'val_videos_use': opt.val_videos_use,
                'remove': 1
            }  # remove generated caption
            val_loss, predictions, accuracy = eval_split(
                gen_model, loader, eval_kwargs=eval_kwargs)
            # Prefer class accuracy when available, else instance accuracy.
            if opt.eval_accuracy == 1:
                current_score = accuracy[
                    'Class Accuracy'] if 'Class Accuracy' in accuracy else accuracy[
                        'Instance Accuracy']
            else:
                current_score = -val_loss
            g_val_result_history[g_epoch] = {
                'g_val_loss': val_loss,
                'g_val_score': current_score
            }
            print('validation:', g_val_result_history[g_epoch])
            # Save the best generator model
            if best_val_score is None or current_score > best_val_score:
                best_val_score = current_score
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'gen_best.pth')
                torch.save(
                    gen_optimizer.state_dict(),
                    os.path.join(opt.checkpoint_path,
                                 'gen_optimizer_best.pth'))
                infos['g_epoch_best'] = g_epoch
                infos['g_iter_best'] = g_iter
                infos['g_best_score'] = best_val_score
                torch.save(gen_model.state_dict(), checkpoint_path)
                print("best fill in model saved to {} with score {}".format(
                    checkpoint_path, current_score))
            # Dump miscalleous informations and save
            infos['g_epoch_latest'] = g_epoch
            infos['g_iter_latest'] = g_iter
            infos['g_iterators'] = loader.iterators
            histories['g_val_result_history'] = g_val_result_history
            histories['g_loss_history'] = g_loss_history
            with open(os.path.join(opt.checkpoint_path, 'infos.pkl'),
                      'wb') as f:
                pickle.dump(infos, f)
            with open(os.path.join(opt.checkpoint_path, 'histories.pkl'),
                      'wb') as f:
                pickle.dump(histories, f)
            log_metrics(writer, g_iter, g_val_result_history[g_epoch])
            # save the latest model
            if opt.save_checkpoint_every > 0 and g_epoch % opt.save_checkpoint_every == 0:
                # Epoch-numbered snapshot plus a rolling "latest" copy.
                torch.save(
                    gen_model.state_dict(),
                    os.path.join(opt.checkpoint_path, 'gen_%d.pth' % g_epoch))
                torch.save(gen_model.state_dict(),
                           os.path.join(opt.checkpoint_path,
                                        'gen_latest.pth'))
                torch.save(
                    gen_optimizer.state_dict(),
                    os.path.join(opt.checkpoint_path,
                                 'gen_optimizer_%d.pth' % g_epoch))
                torch.save(
                    gen_optimizer.state_dict(),
                    os.path.join(opt.checkpoint_path,
                                 'gen_optimizer_latest.pth'))
                print("fill in model saved to {} at epoch {}".format(
                    opt.checkpoint_path, g_epoch))
            # update epoch and lr
            g_epoch += 1
            update_lr_flag = True
def train(opt):
    """Train a captioning model (cross-entropy, self-critical, or structure loss).

    Modernized trainer: loader state is serialized via
    ``loader.state_dict()``; checkpointing is delegated to
    ``utils.save_checkpoint``; a crash or Ctrl-C triggers a best-effort
    checkpoint before the traceback is printed.

    Args:
        opt: argparse.Namespace of training options; mutated in place
            (``vocab_size``, ``seq_length``, ``current_lr``, ``ss_prob``).
    """
    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    infos = {
        'iter': 0,
        'epoch': 0,
        'loader_state_dict': None,
        'vocab': loader.get_vocab(),
    }
    # Load old infos(if there is) and check if models are compatible
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'),
                  'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert getattr(saved_model_opt, checkme) == getattr(
                    opt, checkme
                ), "Command line argument and saved model disagree on '%s' " % checkme
    infos['opt'] = opt

    #########################
    # Build logger
    #########################
    # naive dict logger
    histories = defaultdict(dict)
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'),
                  'rb') as f:
            histories.update(utils.pickle_load(f))
    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    # vocab only needed during construction.
    del opt.vocab
    # Load pretrained weights:
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'model.pth')):
        model.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'model.pth')))
    # Wrap generation model with loss function(used for training)
    # This allows loss function computed separately on each machine
    lw_model = LossWrapper(model, opt)
    # Wrap with dataparallel
    dp_model = torch.nn.DataParallel(model)
    dp_lw_model = torch.nn.DataParallel(lw_model)

    ##########################
    # Build optimizer
    ##########################
    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    # For back compatibility: rebuild loader_state_dict from the old
    # split_ix/iterators format.
    if 'iterators' in infos:
        infos['loader_state_dict'] = {
            split: {
                'index_list': infos['split_ix'][split],
                'iter_counter': infos['iterators'][split]
            }
            for split in ['train', 'val', 'test']
        }
    loader.load_state_dict(infos['loader_state_dict'])
    if opt.load_best_score == 1:
        # NOTE(review): best_val_score is unbound when load_best_score != 1;
        # the comparison at checkpoint time would then raise NameError.
        best_val_score = infos.get('best_val_score', None)
    if opt.noamopt:
        # Resume the Noam schedule at the restored iteration.
        optimizer._step = iteration
    # flag indicating finish of an epoch
    # Always set to True at the beginning to initialize the lr or etc.
    epoch_done = True
    # Assure in training mode
    dp_lw_model.train()

    # Start training
    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start
                                ) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate**frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer,
                                 opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start
                            ) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(
                        opt.scheduled_sampling_increase_prob * frac,
                        opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob
                # If start self critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False
                # If start structure loss training
                if opt.structure_after != -1 and epoch >= opt.structure_after:
                    struc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    struc_flag = False
                epoch_done = False

            start = time.time()
            # Load data from train split (0)
            data = loader.get_batch('train')
            print('Read data:', time.time() - start)
            torch.cuda.synchronize()
            start = time.time()
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp
            optimizer.zero_grad()
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks,
                                    att_masks, data['gts'],
                                    torch.arange(0, len(data['gts'])), sc_flag,
                                    struc_flag)
            loss = model_out['loss'].mean()
            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()
            if struc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, model_out['lm_loss'].mean().item(), model_out['struc_loss'].mean().item(), end - start))
            elif not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, model_out['reward'].mean(), end - start))

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                tb_summary_writer.add_scalar('train_loss', train_loss,
                                             iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr,
                                             iteration)
                tb_summary_writer.add_scalar('scheduled_sampling_prob',
                                             model.ss_prob, iteration)
                if sc_flag:
                    tb_summary_writer.add_scalar('avg_reward',
                                                 model_out['reward'].mean(),
                                                 iteration)
                elif struc_flag:
                    tb_summary_writer.add_scalar(
                        'lm_loss', model_out['lm_loss'].mean().item(),
                        iteration)
                    tb_summary_writer.add_scalar(
                        'struc_loss', model_out['struc_loss'].mean().item(),
                        iteration)
                    tb_summary_writer.add_scalar(
                        'reward', model_out['reward'].mean().item(), iteration)
                # In SCST mode the "loss" history records the mean reward.
                histories['loss_history'][
                    iteration] = train_loss if not sc_flag else model_out[
                        'reward'].mean()
                histories['lr_history'][iteration] = opt.current_lr
                histories['ss_prob_history'][iteration] = model.ss_prob

            # update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['loader_state_dict'] = loader.state_dict()

            # make evaluation on validation set, and save model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    dp_model, lw_model.crit, loader, eval_kwargs)
                if opt.reduce_on_plateau:
                    # Scheduler minimizes, so pass negated CIDEr.
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)
                # Write validation result into summary
                tb_summary_writer.add_scalar('validation loss', val_loss,
                                             iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)
                histories['val_result_history'][iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }

                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss

                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscalleous informations
                infos['best_val_score'] = best_val_score

                utils.save_checkpoint(opt, model, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    utils.save_checkpoint(opt,
                                          model,
                                          infos,
                                          optimizer,
                                          append=str(iteration))
                if best_flag:
                    utils.save_checkpoint(opt,
                                          model,
                                          infos,
                                          optimizer,
                                          append='best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        # Best-effort checkpoint on crash/interrupt, then log the traceback.
        print('Save ckpt on exception ...')
        utils.save_checkpoint(opt, model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
else: raise Exception("caption_model not supported: {}".format(opt.caption_model)) loader = DataLoader(opt) eval_kwargs = {'eval_split': opt.eval_split, 'beam_size': opt.beam_size, 'dataset': opt.input_json, 'caption_model': opt.caption_model, 'reason_weight': opt.reason_weight, ## 'guiding_l1_penality': opt.guiding_l1_penality, 'use_cuda': opt.use_cuda, 'feature_type': opt.feature_type, 'language_eval': opt.language_eval, 'val_images_use': opt.val_images_use, 'verbose': opt.verbose, 'sample_max': opt.sample_max, 'print_beam_candidate': opt.print_beam_candidate, 'id': opt.id, 'print_top_words': 0 } # Set sample options # loss, split_predictions, lang_stats = eval_utils.eval_eval(model, crit, loader, vars(opt)) loss, split_predictions, lang_stats = eval_utils.eval_split(model, crit, loader, eval_kwargs) print('loss: ', loss) if lang_stats: print(lang_stats)
loader = DataLoader(opt) else: loader = DataLoaderRaw({ 'folder_path': opt.image_folder, 'coco_json': opt.coco_json, 'batch_size': opt.batch_size }) # When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json # So make sure to use the vocab in infos file. loader.ix_to_word = infos['vocab'] # Set sample options loss, split_predictions, lang_stats = eval_utils.eval_split( cnn_model, model, crit, loader, vars(opt), return_attention=opt.return_attention) print('loss: ', loss) if lang_stats: print(lang_stats) if opt.dump_json == 1: # dump the json json.dump( split_predictions, open( 'vis/vis_' + vars(infos['opt'])['caption_model'] + '_' + vars(opt)['save_name'] + '.json', 'w'))
vars(opt).update({k: vars(infos['opt'])[k] }) # copy over options from model print(opt) torch.manual_seed(opt.seed) torch.cuda.manual_seed(opt.seed) # load train/valid/test data mytest_dset = test_dataio(opt) # set model model = SAModel(opt) model.load_state_dict(torch.load(opt.model), strict=False) model.cuda() model.eval() crit = LanguageModelCriterion() classify_crit = ClassiferCriterion() print("testing starts ...") test_loss, predictions, lang_stats = eval_utils.eval_split( model, crit, classify_crit, mytest_dset, vars(opt)) test_result = {} test_result['test_loss'] = test_loss test_result['predictions'] = predictions test_result['scores'] = lang_stats #with open(os.path.join(opt.checkpoint_path,'test_result_fuxian.pkl'), 'wb') as f: # cPickle.dump(test_result, f) print("testing finish !\n") print('loss: ', test_loss) if lang_stats: print(lang_stats)
model.load_state_dict(torch.load(opt.model)) model.cuda() model.eval() crit = utils.LanguageModelCriterion() # Create the Data Loader instance if len(opt.image_folder) == 0: loader = DataLoader(opt) else: loader = DataLoaderRaw({ 'folder_path': opt.image_folder, 'coco_json': opt.coco_json, 'batch_size': opt.batch_size, 'cnn_model': opt.cnn_model }) # When eval using provided pretrained model, the vocab may be different from what you have in your cocotalk.json # So make sure to use the vocab in infos file. loader.ix_to_word = infos['vocab'] # Set sample options loss, split_predictions, lang_stats = eval_utils.eval_split( model, crit, loader, vars(opt)) print('loss: ', loss) if lang_stats: print(lang_stats) if opt.dump_json == 1: # dump the json json.dump(split_predictions, open('vis/vis.json', 'w'))
def train(opt):
    """Train a captioning model jointly with a reversed "twin" model.

    The twin network (``back_model``) reads each caption in reverse; an L2
    penalty ties its (re-inverted) hidden states to the forward model's
    states, and both models are trained with the language-model criterion.
    Checkpoints, optimizer state and training histories are written to
    ``opt.checkpoint_path``; the loop runs until ``opt.max_epochs``.
    """
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    # The loader owns the vocabulary; mirror its sizes into opt so that
    # models.setup() builds matching embedding/output layers.
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    # tf may be None when tensorflow is unavailable; summaries are skipped then.
    tf_summary_writer = tf and tf.summary.FileWriter(opt.checkpoint_path)

    # log information
    # NOTE(review): log_file is opened here and never explicitly closed; it
    # lives for the whole training run.
    folder_id = 'log_result'
    file_id = 'twin_show_attend_tell'
    log_file_name = os.path.join(folder_id, file_id + '.txt')
    log_file = open(log_file_name, 'w')

    infos = {}
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    # Restore bookkeeping from the checkpoint (or start fresh).
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # NOTE(review): best_val_score stays unbound when load_best_score != 1,
    # yet the checkpoint block below reads it unconditionally.
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt)
    model.cuda()
    back_model = models.setup(opt, reverse=True)  # True for twin-net
    back_model.cuda()
    update_lr_flag = True
    # Assure in training mode
    model.train()
    back_model.train()
    crit = utils.LanguageModelCriterion()  # define the loss criterion
    # A single Adam optimizer updates both the forward and the twin model.
    all_param = chain(model.parameters(), back_model.parameters())
    optimizer = optim.Adam(all_param,
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    # Load the optimizer
    if vars(opt).get('start_from', None) is not None:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    while True:
        if update_lr_flag:
            # Assign the learning rate (step decay once past the start epoch)
            if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob
            update_lr_flag = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)

        torch.cuda.synchronize()
        start = time.time()

        # flip the masks and labels for twin-net (reverse the time axis; the
        # .copy() gives np.flip's view a contiguous buffer for from_numpy)
        reverse_labels = np.flip(data['labels'], 1).copy()
        reverse_masks = np.flip(data['masks'], 1).copy()
        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'],
            reverse_labels, data['masks'], reverse_masks
        ]
        tmp = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        fc_feats, att_feats, labels, reverse_labels, masks, reverse_masks = tmp

        optimizer.zero_grad()
        out, states = model(fc_feats, att_feats, labels)
        back_out, back_states = back_model(fc_feats, att_feats, reverse_labels)

        # Re-invert the backward states along dim 1 (time) so they align
        # step-for-step with the forward states.
        idx = [i for i in range(back_states.size()[1] - 1, -1, -1)]
        idx = torch.LongTensor(idx)
        idx = Variable(idx).cuda()
        invert_backstates = back_states.index_select(1, idx)

        loss = crit(out, labels[:, 1:],
                    masks[:, 1:])  # compute using the defined criterion
        back_loss = crit(back_out, reverse_labels[:, :-1],
                         reverse_masks[:, :-1])

        # Detach so the L2 state-matching term only pushes the forward model
        # toward the twin's states, not vice versa.
        invert_backstates = invert_backstates.detach()
        l2_loss = ((states - invert_backstates)**2).mean()

        # Total objective: forward NLL + weighted twin penalty + backward NLL.
        all_loss = loss + 1.5 * l2_loss + back_loss
        all_loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()

        # store the relevant values (pre-0.4 PyTorch `.data[0]` scalar access)
        train_l2_loss = l2_loss.data[0]
        train_loss = loss.data[0]
        train_all_loss = all_loss.data[0]
        train_back_loss = back_loss.data[0]
        torch.cuda.synchronize()
        end = time.time()
        print("iter {} (epoch {}), train_loss = {:.3f}, l2_loss = {:.3f}, back_loss = {:.3f}, all_loss = {:.3f}, time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, train_l2_loss, train_back_loss, train_all_loss, end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if tf is not None:
                add_summary_value(tf_summary_writer, 'train_loss', train_loss,
                                  iteration)
                add_summary_value(tf_summary_writer, 'l2_loss', train_l2_loss,
                                  iteration)
                add_summary_value(tf_summary_writer, 'all_loss',
                                  train_all_loss, iteration)
                add_summary_value(tf_summary_writer, 'back_loss',
                                  train_back_loss, iteration)
                add_summary_value(tf_summary_writer, 'learning_rate',
                                  opt.current_lr, iteration)
                add_summary_value(tf_summary_writer, 'scheduled_sampling_prob',
                                  model.ss_prob, iteration)
                tf_summary_writer.flush()
            # NOTE(review): time.clock() is deprecated and removed in
            # Python 3.8 — fine on the Python 2 runtime this file targets.
            log_line = 'Epoch [%d], Step [%d], all loss: %f,back_loss %f,train_l2_loss %f, train_loss %f, time %f ' % (
                epoch, iteration, train_all_loss, train_back_loss,
                train_l2_loss, train_loss, time.clock())
            log_file.write(log_line + '\n')
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, crit, loader, eval_kwargs)

            # Write validation result into summary
            if tf is not None:
                add_summary_value(tf_summary_writer, 'validation loss',
                                  val_loss, iteration)
                for k, v in lang_stats.items():
                    add_summary_value(tf_summary_writer, k, v, iteration)
                tf_summary_writer.flush()
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                # The "latest" checkpoint is written every time, best or not.
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    # Extra copy of weights/infos under the "-best" names.
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
# Instantiate the captioning model, load the checkpoint weights, and put the
# network in GPU inference mode.
model = models.setup(opt)
model.load_state_dict(torch.load(opt.model))
model.cuda()
model.eval()

crit = utils.LanguageModelCriterion()

# Create the Data Loader instance: the preprocessed dataset by default, or a
# raw image folder when one is supplied on the command line.
use_raw_folder = len(opt.image_folder) != 0
if not use_raw_folder:
    loader = DataLoader(opt)
else:
    loader = DataLoaderRaw({
        'folder_path': opt.image_folder,
        'coco_json': opt.coco_json,
        'batch_size': opt.batch_size,
        'cnn_model': opt.cnn_model
    })

# A pretrained model's vocab may differ from the local cocotalk.json, so use
# the vocab recorded in the infos file.
loader.ix_to_word = infos['vocab']

loss, split_predictions, lang_stats = eval_utils.eval_split(
    model, crit, loader, vars(opt))

print('loss: ', loss)
if lang_stats:
    print(lang_stats)
if opt.dump_json == 1:
    json.dump(split_predictions, open('vis/vis.json', 'w'))
def train(opt):
    """Train a captioning model, optionally with a multi-label object loss.

    Supports cross-entropy training (optionally with label smoothing),
    self-critical RL training after ``opt.self_critical_after`` epochs, and
    auxiliary object-attention / segment-feature streams gated by
    ``opt.use_obj_att`` and ``opt.use_seg_feat``.  Checkpoints and training
    histories are written to ``opt.checkpoint_path``.
    """
    # Deal with feature things before anything
    opt.use_att = utils.if_use_att(opt.caption_model)
    if opt.use_box:
        # Box geometry adds 5 extra values per attention feature.
        opt.att_feat_size = opt.att_feat_size + 5

    loader = DataLoader(opt)
    # The loader owns the vocabulary; mirror its sizes into opt for models.setup().
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    # tb may be None when tensorboard is unavailable; summaries are skipped then.
    tb_summary_writer = tb and tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}
    if opt.start_from_path is not None:
        # open old infos and check if models are compatible
        with open(
                os.path.join(opt.start_from_path,
                             'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

        if os.path.isfile(
                os.path.join(opt.start_from_path,
                             'histories_' + opt.id + '.pkl')):
            with open(
                    os.path.join(opt.start_from_path,
                                 'histories_' + opt.id + '.pkl')) as f:
                histories = cPickle.load(f)

    # Restore bookkeeping from the checkpoint (or start fresh).
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)
    # NOTE(review): best_val_score stays unbound when load_best_score != 1,
    # yet the checkpoint block below reads it unconditionally.
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)

    model = models.setup(opt).cuda()
    # Parameter counts, for logging only.
    no = sum(p.numel() for p in model.parameters())
    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)
    print("Trainable Params:" + str(pytorch_total_params))
    print("Total Params:" + str(no))

    dp_model = torch.nn.DataParallel(model)

    epoch_done = True
    # Assure in training mode
    dp_model.train()

    # Criteria: optional multi-label object head, XE (plain or smoothed), RL.
    if (opt.use_obj_mcl_loss == 1):
        mcl_crit = utils.MultiLabelClassification()
    if opt.label_smoothing > 0:
        crit = utils.LabelSmoothing(smoothing=opt.label_smoothing)
    else:
        crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()

    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model,
                                      factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
        # Resume the Noam schedule at the restored iteration.
        optimizer._step = iteration
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if vars(opt).get('start_from_path', None) is not None and os.path.isfile(
            os.path.join(opt.start_from_path, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from_path, 'optimizer.pth')))

    time_epoch_start = time.time()
    data_time_sum = 0
    batch_time_sum = 0
    while True:
        if epoch_done:
            # Per-epoch timing diagnostics.
            torch.cuda.synchronize()
            time_epoch_end = time.time()
            time_elapsed = (time_epoch_end - time_epoch_start)
            print('[DEBUG] Epoch Time: ' + str(time_elapsed))
            print('[DEBUG] Sum Data Time: ' + str(data_time_sum))
            print('[DEBUG] Sum Batch Time: ' + str(batch_time_sum))

            if not opt.noamopt and not opt.reduce_on_plateau:
                # Assign the learning rate (step decay once past start epoch)
                if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                    frac = (epoch - opt.learning_rate_decay_start
                            ) // opt.learning_rate_decay_every
                    decay_factor = opt.learning_rate_decay_rate**frac
                    opt.current_lr = opt.learning_rate * decay_factor
                else:
                    opt.current_lr = opt.learning_rate
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            # Assign the scheduled sampling prob
            if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                frac = (epoch - opt.scheduled_sampling_start
                        ) // opt.scheduled_sampling_increase_every
                opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                  opt.scheduled_sampling_max_prob)
                model.ss_prob = opt.ss_prob

            # If start self critical training
            if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                sc_flag = True
                init_scorer(opt.cached_tokens)
            else:
                sc_flag = False

            epoch_done = False

        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)
        data_time_sum += time.time() - start

        torch.cuda.synchronize()
        start = time.time()

        # Move the numpy batch to the GPU.  Which tensors exist depends on the
        # enabled auxiliary streams (object attention / segment features).
        if (opt.use_obj_mcl_loss == 0):
            tmp = [
                data['fc_feats'], data['att_feats'], data['labels'],
                data['masks'], data['att_masks']
            ]
            tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp
        else:
            if opt.use_obj_att and opt.use_seg_feat:
                tmp = [
                    data['fc_feats'], data['att_feats'],
                    data['obj_att_feats'], data['seg_feat_feats'],
                    data['labels'], data['masks'], data['obj_labels'],
                    data['att_masks'], data['obj_att_masks'],
                    data['seg_feat_masks']
                ]
                tmp = [
                    _ if _ is None else torch.from_numpy(_).cuda()
                    for _ in tmp
                ]
                fc_feats, att_feats, obj_att_feats, seg_feat_feats, labels, masks, obj_labels, att_masks, obj_att_masks, seg_feat_masks = tmp
            elif not opt.use_obj_att and opt.use_seg_feat:
                tmp = [
                    data['fc_feats'], data['att_feats'],
                    data['seg_feat_feats'], data['labels'], data['masks'],
                    data['obj_labels'], data['att_masks'],
                    data['seg_feat_masks']
                ]
                tmp = [
                    _ if _ is None else torch.from_numpy(_).cuda()
                    for _ in tmp
                ]
                fc_feats, att_feats, seg_feat_feats, labels, masks, obj_labels, att_masks, seg_feat_masks = tmp
            elif not opt.use_obj_att and not opt.use_seg_feat:
                tmp = [
                    data['fc_feats'], data['att_feats'], data['labels'],
                    data['masks'], data['obj_labels'], data['att_masks']
                ]
                tmp = [
                    _ if _ is None else torch.from_numpy(_).cuda()
                    for _ in tmp
                ]
                fc_feats, att_feats, labels, masks, obj_labels, att_masks = tmp
            elif opt.use_obj_att and not opt.use_seg_feat:
                tmp = [
                    data['fc_feats'], data['att_feats'],
                    data['obj_att_feats'], data['labels'], data['masks'],
                    data['obj_labels'], data['att_masks'],
                    data['obj_att_masks']
                ]
                tmp = [
                    _ if _ is None else torch.from_numpy(_).cuda()
                    for _ in tmp
                ]
                fc_feats, att_feats, obj_att_feats, labels, masks, obj_labels, att_masks, obj_att_masks = tmp

        optimizer.zero_grad()
        if (opt.use_obj_mcl_loss == 0):
            if not sc_flag:
                # Standard XE: shift labels/masks by one for teacher forcing.
                loss = crit(dp_model(fc_feats, att_feats, labels, att_masks),
                            labels[:, 1:], masks[:, 1:])
            else:
                # Self-critical: sample, score against the baseline, REINFORCE.
                gen_result, sample_logprobs = dp_model(fc_feats,
                                                       att_feats,
                                                       att_masks,
                                                       opt={'sample_max': 0},
                                                       mode='sample')
                reward = get_self_critical_reward(dp_model, fc_feats,
                                                  att_feats, att_masks, data,
                                                  gen_result, opt)
                loss = rl_crit(sample_logprobs, gen_result.data,
                               torch.from_numpy(reward).float().cuda())
        else:
            if opt.use_obj_att and opt.use_seg_feat:
                if not sc_flag:
                    logits, out = dp_model(
                        fc_feats, [att_feats, obj_att_feats, seg_feat_feats],
                        labels, [att_masks, obj_att_masks, seg_feat_masks])
                    caption_loss = crit(logits, labels[:, 1:], masks[:, 1:])
                    obj_loss = mcl_crit(out, obj_labels)
                    loss = opt.lambda_caption * caption_loss + opt.lambda_obj * obj_loss
                else:
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    reward = get_self_critical_reward(dp_model, fc_feats,
                                                      att_feats, att_masks,
                                                      data, gen_result, opt)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda())
            elif not opt.use_obj_att and opt.use_seg_feat:
                if not sc_flag:
                    logits, out = dp_model(fc_feats,
                                           [att_feats, seg_feat_feats],
                                           labels,
                                           [att_masks, seg_feat_masks])
                    caption_loss = crit(logits, labels[:, 1:], masks[:, 1:])
                    obj_loss = mcl_crit(out, obj_labels)
                    loss = opt.lambda_caption * caption_loss + opt.lambda_obj * obj_loss
                else:
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    reward = get_self_critical_reward(dp_model, fc_feats,
                                                      att_feats, att_masks,
                                                      data, gen_result, opt)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda())
            # NOTE(review): this starts a fresh `if` chain rather than
            # continuing with `elif`; harmless because the four conditions
            # are mutually exclusive, but worth normalizing.
            if not opt.use_obj_att and not opt.use_seg_feat:
                if not sc_flag:
                    logits, out = dp_model(fc_feats, att_feats, labels,
                                           att_masks)
                    caption_loss = crit(logits, labels[:, 1:], masks[:, 1:])
                    obj_loss = mcl_crit(out, obj_labels)
                    loss = opt.lambda_caption * caption_loss + opt.lambda_obj * obj_loss
                else:
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    reward = get_self_critical_reward(dp_model, fc_feats,
                                                      att_feats, att_masks,
                                                      data, gen_result, opt)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda())
            elif opt.use_obj_att and not opt.use_seg_feat:
                if not sc_flag:
                    logits, out = dp_model(fc_feats,
                                           [att_feats, obj_att_feats], labels,
                                           [att_masks, obj_att_masks])
                    caption_loss = crit(logits, labels[:, 1:], masks[:, 1:])
                    obj_loss = mcl_crit(out, obj_labels)
                    # NOTE(review): hard-coded 0.1/1.0 weights here, unlike
                    # the sibling branches which use opt.lambda_caption /
                    # opt.lambda_obj — confirm this asymmetry is intended.
                    loss = 0.1 * caption_loss + obj_loss
                else:
                    gen_result, sample_logprobs = dp_model(
                        fc_feats,
                        att_feats,
                        att_masks,
                        opt={'sample_max': 0},
                        mode='sample')
                    reward = get_self_critical_reward(dp_model, fc_feats,
                                                      att_feats, att_masks,
                                                      data, gen_result, opt)
                    loss = rl_crit(sample_logprobs, gen_result.data,
                                   torch.from_numpy(reward).float().cuda())

        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        train_loss = loss.item()
        torch.cuda.synchronize()
        end = time.time()
        batch_time_sum += end - start

        if not sc_flag:
            if (opt.use_obj_mcl_loss == 1):
                print("iter {} (epoch {}), train_loss = {:.3f}, caption_loss = {:.3f}, object_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, caption_loss.item(), obj_loss.item(), end - start))
            else:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
        else:
            print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, np.mean(reward[:,0]), end - start))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            epoch_done = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            if (opt.use_obj_mcl_loss == 1):
                add_summary_value(tb_summary_writer, 'obj_loss',
                                  obj_loss.item(), iteration)
                add_summary_value(tb_summary_writer, 'caption_loss',
                                  caption_loss.item(), iteration)
            if opt.noamopt:
                opt.current_lr = optimizer.rate()
            elif opt.reduce_on_plateau:
                opt.current_lr = optimizer.current_lr
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            add_summary_value(tb_summary_writer, 'scheduled_sampling_prob',
                              model.ss_prob, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)
            # In RL mode the "loss" history records the mean reward instead.
            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model — validation runs with batch_size 1, then the
            # training batch size is restored afterwards.
            orig_batch_size = opt.batch_size
            opt.batch_size = 1
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            loader.batch_size = eval_kwargs.get('batch_size', 1)
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                dp_model, crit, loader, eval_kwargs)
            opt.batch_size = orig_batch_size
            loader.batch_size = orig_batch_size

            if opt.reduce_on_plateau:
                # Plateau scheduler tracks -CIDEr when available, else val loss.
                if 'CIDEr' in lang_stats:
                    optimizer.scheduler_step(-lang_stats['CIDEr'])
                else:
                    optimizer.scheduler_step(val_loss)

            # Write validation result into summary
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            for k, v in lang_stats.items():
                add_summary_value(tb_summary_writer, k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                # The "latest" checkpoint is written every time, best or not.
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['split_ix'] = loader.split_ix
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.get_vocab()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'infos_' + opt.id + '.pkl'), 'wb') as f:
                    cPickle.dump(infos, f)
                with open(
                        os.path.join(opt.checkpoint_path,
                                     'histories_' + opt.id + '.pkl'),
                        'wb') as f:
                    cPickle.dump(histories, f)

                if best_flag:
                    # Extra copy of weights/infos under the "-best" names.
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        cPickle.dump(infos, f)

        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break