def load_model():
    """Build the saved CNN + caption model pair and return them in eval mode.

    Returns:
        (model_cnn, model, ix_to_word, opt) — both models on GPU, in eval
        mode, with ``opt`` completed from the options stored in the infos.
    """
    opt = parse_args()

    # Load infos saved at training time.
    infos = load_infos(opt)

    # Merge the training-time options into the current ones; options in
    # `ignore` are allowed to differ, everything else must agree.
    ignore = ("id", "batch_size", "beam_size", "start_from_best")
    saved_opts = vars(infos['opt'])
    current_opts = vars(opt)
    for key in saved_opts.keys():
        if key in ignore:
            continue
        if key in current_opts:
            assert current_opts[key] == saved_opts[key], key + ' option not consistent'
        else:
            current_opts[key] = saved_opts[key]  # copy over options from model

    print(opt)

    # Setup the model pair.
    model_cnn = models.setup_cnn(opt)
    model_cnn.cuda()

    model = models.setup(opt)
    model.cuda()

    # Make sure both are in evaluation mode.
    model_cnn.eval()
    model.eval()

    ix_to_word = infos['vocab']

    return model_cnn, model, ix_to_word, opt
def main():
    """Caption every dataset in ``opt.datasets`` with a saved model and write
    one ``captions_<dataset>_<id>_results.json`` file per dataset into
    ``opt.output_dir``.
    """
    opt = parse_args()

    # make dirs
    print(opt.output_dir)
    if not os.path.isdir(opt.output_dir):
        os.makedirs(opt.output_dir)

    # Load infos saved at training time.
    infos = load_infos(opt)

    # Merge training-time options into opt; keys in `ignore` may differ.
    ignore = ["id", "batch_size", "beam_size", "start_from_best"]
    for k in vars(infos['opt']).keys():
        if k not in ignore:
            if k in vars(opt):
                assert vars(opt)[k] == vars(
                    infos['opt'])[k], k + ' option not consistent'
            else:
                vars(opt).update({k: vars(infos['opt'])[k]
                                  })  # copy over options from model

    print(opt)

    # Setup the model
    model_cnn = models.setup_cnn(opt)
    model_cnn.cuda()

    model = models.setup(opt)
    model.cuda()

    # Make sure in the evaluation mode
    model_cnn.eval()
    model.eval()

    # "my_model_id" -> "mymodelid"; used in the result file names.
    str_id = ''.join(opt.id.split('_'))

    for dataset in opt.datasets:
        loader = DataLoaderRaw({
            'folder_path': os.path.join(opt.image_folder, dataset),
            'batch_size': opt.batch_size
        })
        loader.ix_to_word = infos['vocab']

        # Set sample options
        predictions = eval_split(model_cnn, model, loader, vars(opt))

        path_json = opt.output_dir + '/captions_' + dataset + '_' + str_id + '_results.json'
        # Fix: close the output file deterministically instead of leaking
        # the handle returned by a bare open() inside json.dump().
        with open(path_json, 'w') as f:
            json.dump(predictions, f)
def main():
    """Re-save the best checkpoint, caption ``opt.image_folder`` with it, and
    dump the predictions to ``opt.output_dir/result.json``.
    """
    opt = parse_args()

    # make dirs
    print(opt.output_dir)
    if not os.path.isdir(opt.output_dir):
        os.makedirs(opt.output_dir)

    # Load infos saved at training time.
    infos = load_infos(opt)

    # Merge training-time options into opt; keys in `ignore` may differ.
    ignore = [
        "id", "batch_size", "beam_size", "start_from_best",
        "checkpoint_best_path"
    ]
    for k in vars(infos['opt']).keys():
        if k not in ignore:
            if k in vars(opt):
                assert vars(opt)[k] == vars(
                    infos['opt'])[k], k + ' option not consistent'
            else:
                vars(opt).update({k: vars(infos['opt'])[k]
                                  })  # copy over options from model

    print(opt)

    # Setup the model
    model_cnn = models.setup_cnn(opt)
    model_cnn.cuda()

    model = models.setup(opt)
    model.cuda()

    # Make sure in the evaluation mode
    model_cnn.eval()
    model.eval()

    save_model_best(model, model_cnn, infos, opt)

    loader = DataLoaderRaw({
        'folder_path': opt.image_folder,
        'batch_size': opt.batch_size
    })
    loader.ix_to_word = infos['vocab']

    # Set sample options
    predictions = eval_split(model_cnn, model, loader, vars(opt))

    # Fix: close the output file deterministically instead of leaking the
    # handle returned by a bare open() inside json.dump().
    with open(opt.output_dir + '/result.json', 'w') as f:
        json.dump(predictions, f)
def train(opt):
    """SCST training loop: cross-entropy warm-up, then self-critical
    reinforcement once ``opt.reinforce_start`` is reached. Periodically
    evaluates on the val split and checkpoints model/CNN/optimizers.
    """
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    opt.caption_model = "SCST"

    infos = {}
    if opt.start_from is not None and len(opt.start_from) > 0:
        print("start from %s" % (opt.start_from))
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from,
                               'infos_' + opt.id + '.pkl')) as f:
            infos = cPickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)

    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    else:
        # Fix: best_val_score was previously left undefined on this path,
        # causing a NameError at the first checkpoint comparison.
        best_val_score = None

    model_cnn = models.setup_cnn(opt)
    model_cnn.cuda()

    model = models.setup(opt)
    model.cuda()

    model_cnn.train()
    model.train()

    # Replicate per-image features to match the 5 captions per image.
    fc_expander = utils.FeatExpander(5)
    att_expander = utils.FeatExpander(5)

    crit = Criterion.LanguageModelCriterion()
    crit_reinforce = Criterion.SCSTCriterion()

    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    optimizer_cnn = optim.Adam(model_cnn.parameters(),
                               lr=opt.cnn_learning_rate,
                               weight_decay=opt.cnn_weight_decay)

    # Load the optimizer
    if opt.start_from is not None and len(opt.start_from) > 0:
        optimizer.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer.pth')))
        optimizer_cnn.load_state_dict(
            torch.load(os.path.join(opt.start_from, 'optimizer_cnn.pth')))

    finetune_cnn_start = False
    use_reinforce = False
    update_lr_flag = True

    while True:
        if update_lr_flag:
            # Assign the (possibly epoch-decayed) language-model LR.
            if opt.learning_rate_decay_start >= 0 and epoch >= opt.learning_rate_decay_start:
                frac = (epoch - opt.learning_rate_decay_start
                        ) // opt.learning_rate_decay_every
                decay_factor = opt.learning_rate_decay_rate**frac
                opt.current_lr = opt.learning_rate * decay_factor
                utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
            else:
                opt.current_lr = opt.learning_rate

            # Freeze/unfreeze the CNN depending on the finetune schedule.
            if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
                for p in model_cnn.parameters():
                    p.requires_grad = True
                model_cnn.train()
                finetune_cnn_start = True
            else:
                for p in model_cnn.parameters():
                    p.requires_grad = False
                model_cnn.eval()
                finetune_cnn_start = False

            # Assign the (possibly epoch-decayed) CNN LR.
            if opt.cnn_learning_rate_decay_start >= 0 and epoch >= opt.cnn_learning_rate_decay_start:
                frac = (epoch - opt.cnn_learning_rate_decay_start
                        ) // opt.cnn_learning_rate_decay_every
                decay_factor = opt.cnn_learning_rate_decay_rate**frac
                opt.current_cnn_lr = opt.cnn_learning_rate * decay_factor
                utils.set_lr(optimizer_cnn,
                             opt.current_cnn_lr)  # set the decayed rate
            else:
                opt.current_cnn_lr = opt.cnn_learning_rate

            update_lr_flag = False

        start_total = time.time()

        data = loader.get_batch('train')
        vocab = loader.get_vocab()

        images = torch.from_numpy(data['images']).cuda()
        images = utils.prepro(images, False)
        images = Variable(images, requires_grad=False)

        labels = torch.from_numpy(data['labels']).cuda()
        labels = Variable(labels, requires_grad=False)

        fc_feats, att_feats = model_cnn(images)
        fc_feats_ext = fc_expander(fc_feats)
        att_feats_ext = att_expander(att_feats)

        optimizer.zero_grad()
        if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
            optimizer_cnn.zero_grad()

        reward = 0
        reward1 = 0
        if opt.reinforce_start >= 0 and epoch >= opt.reinforce_start:
            # Self-critical phase: greedy ("test") rollout is the baseline
            # for the sampled ("train") rollout.
            use_reinforce = True
            output1, seq1 = model(fc_feats_ext, att_feats_ext, labels, "test")
            output, seq = model(fc_feats_ext, att_feats_ext, labels, "train")
            loss, reward, reward1 = crit_reinforce(output, output1, seq, seq1,
                                                   labels, vocab)
            loss.backward()
        else:
            # Plain cross-entropy phase.
            use_reinforce = False
            output, _ = model(fc_feats_ext, att_feats_ext, labels, "xent")
            loss = crit(output, labels)
            loss.backward()

        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
            utils.clip_gradient(optimizer_cnn, opt.grad_clip)
            optimizer_cnn.step()

        # NOTE(review): loss.data[0] is the pre-0.4 PyTorch scalar accessor,
        # consistent with the Variable usage above — kept for compatibility.
        train_loss = loss.data[0]

        print("iter {} (epoch {}), train_loss = {:.3f}, lr = {} lr_cnn = {} finetune_cnn = {} use_reinforce = {} reward = {} reward1 = {} time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, opt.current_lr,
                    opt.current_cnn_lr, finetune_cnn_start, use_reinforce,
                    reward, reward1, time.time() - start_total))

        # Update the iteration and epoch
        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {
                'split': 'val',
                'dataset': opt.input_json,
                'caption_model': 'SCST'
            }
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats, str_stats = eval_utils.eval_split(
                model_cnn, model, crit, loader, eval_kwargs)

            if not os.path.exists(opt.eval_result_path):
                os.makedirs(opt.eval_result_path)

            eval_result_file = os.path.join(opt.eval_result_path,
                                            opt.id + ".csv")
            with open(eval_result_file, 'a') as f:
                f.write(str_stats + "\n")

            predictions_file = os.path.join(opt.eval_result_path,
                                            opt.id + ".json")
            with open(predictions_file, 'w') as f:
                json.dump(predictions, f)

            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss

            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                if not os.path.exists(opt.checkpoint_path):
                    os.makedirs(opt.checkpoint_path)

                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))

                checkpoint_path_cnn = os.path.join(opt.checkpoint_path,
                                                   'model_cnn.pth')
                torch.save(model_cnn.state_dict(), checkpoint_path_cnn)
                print("model cnn saved to {}".format(checkpoint_path_cnn))

                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                print("optimizer saved to {}".format(optimizer_path))

                optimizer_path_cnn = os.path.join(opt.checkpoint_path,
                                                  'optimizer_cnn.pth')
                torch.save(optimizer_cnn.state_dict(), optimizer_path_cnn)
                print("optimizer cnn saved to {}".format(optimizer_path_cnn))

                # Persist full training state so a restart can resume.
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['iterators'] = loader.iterators
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['val_result_history'] = val_result_history
                infos['loss_history'] = loss_history
                infos['lr_history'] = lr_history
                infos['vocab'] = loader.get_vocab()
                info_path = os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '.pkl')
                with open(info_path, 'wb') as f:
                    cPickle.dump(infos, f)

                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model_best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))

                    checkpoint_path_cnn = os.path.join(opt.checkpoint_path,
                                                       'model_cnn_best.pth')
                    torch.save(model_cnn.state_dict(), checkpoint_path_cnn)
                    print("model cnn saved to {}".format(checkpoint_path_cnn))

                    info_path = os.path.join(opt.checkpoint_path,
                                             'infos_' + opt.id + '_best.pkl')
                    with open(info_path, 'wb') as f:
                        cPickle.dump(infos, f)

        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break
def main():
    """Evaluate a saved model on the val split for beam sizes 1..20,
    appending each beam's stats and predictions via ``save_result``.
    """
    opt = parse_args()

    # make dirs
    print(opt.eval_result_path)
    if not os.path.isdir(opt.eval_result_path):
        os.makedirs(opt.eval_result_path)

    # Load infos saved at training time.
    infos = load_infos(opt)

    # Merge the training-time options into opt; keys in `ignore` may differ.
    ignore = [
        "id", "input_json", "input_h5", "input_anno", "images_root",
        "coco_caption_path", "batch_size", "beam_size", "start_from_best",
        "eval_result_path"
    ]
    saved_opts = vars(infos['opt'])
    current_opts = vars(opt)
    for key in saved_opts.keys():
        if key in ignore:
            continue
        if key in current_opts:
            assert current_opts[key] == saved_opts[key], key + ' option not consistent'
        else:
            current_opts[key] = saved_opts[key]  # copy over options from model

    # print(opt)

    # Setup the model
    model_cnn = models.setup_cnn(opt)
    model_cnn.cuda()

    model = models.setup(opt)
    model.cuda()

    # Make sure in the evaluation mode
    model_cnn.eval()
    model.eval()

    # Bottom-up / sub-region models need the bu-aware loader.
    if models.has_bu(opt.caption_model) or \
            models.has_sub_regions(opt.caption_model) or \
            models.has_sub_region_bu(opt.caption_model):
        loader = DataLoaderThreadBu(opt)
        print("DataLoaderThreadBu")
    else:
        loader = DataLoaderThreadNew(opt)
        print("DataLoaderThreadNew")
    loader.ix_to_word = infos['vocab']

    eval_kwargs = {'split': opt.val_split, 'dataset': opt.input_json}
    eval_kwargs.update(vars(opt))

    # Sweep beam sizes 1..20 (same sequence as range(0, 20) with beam + 1).
    start_beam = 0
    total_beam = 20
    for beam_size in range(start_beam + 1, total_beam + 1):
        opt.beam_size = beam_size
        eval_kwargs.update(vars(opt))
        print("beam_size: " + str(opt.beam_size))
        print("start eval ...")
        crit = None
        val_loss, predictions, lang_stats, str_stats = eval_utils.eval_split(
            model_cnn, model, crit, loader, eval_kwargs)
        print("end eval ...")
        msg = "str_stats = {}".format(str_stats)
        print(msg)
        save_result(str(opt.beam_size) + "," + str_stats, predictions, opt)
def main():
    """Ensemble evaluation: load one model per id in ``opt.ids`` and either
    run a local scored eval (``eval_type == 0``) or caption raw server
    datasets (``eval_type == 1``), dumping JSON results and beam
    visualizations.
    """
    opt = parse_args()

    opt.datasets = opt.datasets.split(',')
    opt.ids = opt.ids.split(',')

    # make dirs
    print(opt.output_dir)
    if not os.path.isdir(opt.output_dir):
        os.makedirs(opt.output_dir)
    print(opt.output_beam_dir)
    if not os.path.isdir(opt.output_beam_dir):
        os.makedirs(opt.output_beam_dir)

    # print(opt)

    all_model_cnns = []
    all_models = []
    for i in range(len(opt.ids)):
        # id
        opt.id = opt.ids[i]

        # Load infos
        infos = load_infos(opt)

        ignore = ["id", "batch_size", "beam_size", "start_from_best",
                  "input_json", "input_h5", "input_anno", "images_root",
                  "aic_caption_path", "input_bu"]
        for k in vars(infos['opt']).keys():
            if k not in ignore:
                vars(opt).update({k: vars(infos['opt'])[k]})

        opt.relu_type = 0

        # Setup the model
        model_cnn = models.setup_cnn(opt)
        # model_cnn.cuda()
        model_cnn = nn.DataParallel(model_cnn.cuda())

        model = models.setup(opt)
        model.cuda()

        # Make sure in the evaluation mode
        model_cnn.eval()
        model.eval()

        all_model_cnns.append(model_cnn)
        all_models.append(model)

    if opt.eval_type == 0:  # local test
        print('eval local')
        if models.has_bu(opt.caption_model):
            loader = DataLoaderThreadBu(opt)
        else:
            loader = DataLoaderThreadNew(opt)
        # Set sample options
        predictions, lang_stats, str_stats, beam_vis = eval_split(
            all_model_cnns, all_models, loader, opt, vars(opt))
        save_result(opt.output_dir, str_stats, predictions)
        save_beam_vis_result(opt.output_beam_dir, "eval_beam_vis.json",
                             beam_vis)
    elif opt.eval_type == 1:  # server
        print('eval server')
        for dataset in opt.datasets:
            print(os.path.join(opt.image_folder, dataset))
            loader = DataLoaderRaw({'folder_path': os.path.join(opt.image_folder, dataset),
                                    'batch_size': opt.batch_size,
                                    'start': opt.start,
                                    'num': opt.num,
                                    'use_bu_att': opt.use_bu_att,
                                    'input_bu': opt.input_bu,
                                    'bu_size': opt.bu_size,
                                    'bu_feat_size': opt.bu_feat_size})
            # NOTE(review): `infos` here is whatever the *last* id in the
            # ensemble loaded — assumes all models share one vocab; verify.
            loader.ix_to_word = infos['vocab']
            # Set sample options
            predictions, lang_stats, str_stats, beam_vis = eval_split(
                all_model_cnns, all_models, loader, opt, vars(opt))
            path_json = opt.output_dir + '/captions_' + dataset + str(opt.start) + '_ensemble_results.json'
            # Fix: close the output file deterministically instead of leaking
            # the handle returned by a bare open() inside json.dump().
            with open(path_json, 'w') as f:
                json.dump(predictions, f)
            save_beam_vis_result(opt.output_beam_dir,
                                 dataset + str(opt.start) + "_beam_size_" +
                                 str(opt.beam_size) + "_beam_type_" +
                                 str(opt.beam_type) + "_eval_beam_vis.json",
                                 beam_vis)
def train(opt):
    """Training loop with optional auto learning-rate scheduling: evaluates
    and checkpoints every ``save_checkpoint_every`` iterations, and — when
    ``use_auto_learning_rate`` is on — advances through a predefined train
    process (rebuilding model + optimizers from the best checkpoint) when
    the score stalls.
    """
    notifier = notify()
    notifier.login()

    # init path
    if not os.path.exists(opt.eval_result_path):
        os.makedirs(opt.eval_result_path)

    config_file = os.path.join(opt.eval_result_path, opt.id + '_config.txt')
    with open(config_file, 'w') as f:
        f.write("{}\n".format(json.dumps(vars(opt), sort_keys=True, indent=2)))

    torch.backends.cudnn.benchmark = True

    # Fix: define `board` unconditionally — it is passed to eval_model /
    # train_model below and previously raised NameError when tensorboard
    # was disabled.
    board = None
    if opt.use_tensorboard:
        if opt.tensorboard_type == 0:
            board = tensorboard.TensorBoard()
            board.start(opt.id, opt.tensorboard_ip, opt.tensorboard_port)
        else:
            board = trans_client.TransClient()
            board.start(opt.id)

    print(opt.cnn_model)

    loader = get_loader()
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    vocab = loader.get_vocab()
    opt.vocab = vocab
    batch_size = loader.batch_size

    infos = get_infos()
    infos['vocab'] = vocab

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    finetune_cnn_history = infos.get('finetune_cnn_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)

    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    else:
        best_val_score = None

    model_cnn = models.setup_cnn(opt)
    model_cnn = model_cnn.cuda()
    model_cnn = nn.DataParallel(model_cnn)

    model = models.setup(opt)
    model = model.cuda()
    # if models.is_transformer(opt.caption_model) or models.is_ctransformer(opt.caption_model):
    #     model = nn.DataParallel(model)

    train_utils.save_model_conf(model_cnn, model, opt)

    update_lr_flag = True

    model_cnn.train()
    model.train()

    fc_expander, att_expander, bu_expander = get_expander()

    optimizer = None
    optimizer_cnn = None

    finetune_cnn_start = False
    early_stop_cnt = 0

    params = {}
    params['model'] = model
    params['vocab'] = vocab
    # crit_pg, crit_rl, crit_ctc, crit_c, crit_ac, crit
    params['crit_pg'] = None
    params['crit_rl'] = None
    params['crit_ctc'] = None
    params['crit_c'] = None
    params['crit_ac'] = None
    params['crit'] = None

    is_eval_start = opt.is_eval_start

    if opt.use_auto_learning_rate == 1:
        train_process = train_utils.init_train_process()
        train_process_index = infos.get('train_process_index', 0)
        train_step = train_process[train_process_index]
        optimizer_cnn = None
        optimizer = None
        opt.learning_rate = train_step.learning_rate
        opt.cnn_learning_rate = train_step.cnn_learning_rate
        opt.finetune_cnn_after = train_step.finetune_cnn_after

    while True:
        current_score = None

        # make evaluation on validation set, and save model
        # Fix: `iteration not in val_result_history` replaces the
        # Python-2-only dict.has_key().
        if (iteration > 0 and iteration % opt.save_checkpoint_every == 0
                and iteration not in val_result_history) or is_eval_start:
            predictions, best_val_score, best_flag, current_score = eval_model(
                model_cnn, model, params, loader, board, iteration, notifier,
                val_result_history, best_val_score)

            infos['best_val_score'] = best_val_score
            infos['val_result_history'] = val_result_history
            train_utils.save_infos(infos, opt)

            if best_flag:
                train_utils.save_best_result(predictions, opt)
                train_utils.save_model_best(model, model_cnn, infos, opt)
                early_stop_cnt = 0
            else:
                early_stop_cnt += 1

            is_eval_start = False

        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            msg = "max epoch"
            logger.info(msg)
            break

        # auto update model
        if opt.use_auto_learning_rate == 1 and current_score is not None:
            if early_stop_cnt > opt.auto_early_stop_cnt or current_score < opt.auto_early_stop_score:
                # Score stalled or collapsed: move to the next train step,
                # rebuilding the models from the best checkpoint options.
                early_stop_cnt = 0
                train_process_index += 1
                msg = opt.id + " early stop " + str(train_process_index)
                logger.info(msg)
                infos['train_process_index'] = train_process_index
                train_utils.save_infos(infos, opt)
                if train_process_index >= len(train_process):
                    notifier.send(opt.id + " early stop", msg)
                    logger.info("break")
                    break
                train_step = train_process[train_process_index]
                optimizer_cnn = None
                optimizer = None
                opt.learning_rate = train_step.learning_rate
                opt.cnn_learning_rate = train_step.cnn_learning_rate
                opt.finetune_cnn_after = train_step.finetune_cnn_after
                opt.start_from_best = opt.auto_start_from_best

                # Free the old models before allocating the new ones.
                del model_cnn
                del model
                torch.cuda.empty_cache()

                model_cnn = models.setup_cnn(opt)
                model_cnn = model_cnn.cuda()
                model_cnn = nn.DataParallel(model_cnn)

                model = models.setup(opt)
                model = model.cuda()

                model_cnn.train()
                model.train()

                update_lr_flag = True

        # start train
        # Update the iteration and epoch
        iteration += 1

        if update_lr_flag:
            if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
                finetune_cnn_start = True
            else:
                finetune_cnn_start = False
            optimizer_cnn = train_utils.get_cnn_optimizer(
                model_cnn, optimizer_cnn, finetune_cnn_start, opt)
            train_utils.update_lr(epoch, optimizer, optimizer_cnn,
                                  finetune_cnn_start, opt)
            update_lr_flag = False

        if opt.reinforce_start >= 0 and epoch >= opt.reinforce_start:
            use_reinforce = True
        else:
            use_reinforce = False

        optimizer = get_optimizer(optimizer, epoch, model, model_cnn)

        start_total = time.time()
        start = time.time()

        optimizer.zero_grad()
        if finetune_cnn_start:
            optimizer_cnn.zero_grad()

        # batch data
        data = loader.get_batch('train', batch_size)
        images = data['images']
        bus = None
        if models.has_bu(opt.caption_model):
            bus = data['bus']
        if opt.verbose:
            print('data {:.3f}'.format(time.time() - start))

        start = time.time()
        fc_feats, att_feats, bu_feats = train_cnn(model_cnn, images, bus,
                                                  fc_expander, att_expander,
                                                  bu_expander, use_reinforce)
        if opt.verbose:
            print('model_cnn {:.3f}'.format(time.time() - start))

        # get input data
        params['fc_feats'] = fc_feats
        params['att_feats'] = att_feats
        params['bu_feats'] = bu_feats
        # get target data
        params['labels'] = data['labels']
        params['masks'] = data['masks']
        params['tokens'] = data['tokens']
        params['gts'] = data['gts']
        params['targets'] = data['targets']

        # crit_pg, crit_rl, crit_ctc, crit_c, crit_ac, crit,
        train_loss, reward_mean, use_reinforce = train_model(
            params, iteration, epoch, board)

        # update the gradient
        update_gradient(optimizer, optimizer_cnn, finetune_cnn_start)

        time_batch = time.time() - start_total
        left_time = (opt.save_checkpoint_every -
                     iteration % opt.save_checkpoint_every) * time_batch
        s_left_time = utils.format_time(left_time)
        msg = "id {} iter {} (epoch {}), train_loss = {:.3f}, lr = {} lr_cnn = {} f_cnn = {} rf = {} r = {:.3f} early_stop_cnt = {} time/batch = {:.3f}s time/eval = {}" \
            .format(opt.id, iteration, epoch, train_loss, opt.current_lr,
                    opt.current_cnn_lr, finetune_cnn_start, use_reinforce,
                    reward_mean, early_stop_cnt, time_batch, s_left_time)
        logger.info(msg)

        if opt.use_tensorboard:
            if iteration % opt.tensorboard_for_train_every == 0:
                board.loss_train(train_loss, iteration)

        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if iteration % opt.losses_log_every == 0:
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr
            finetune_cnn_history[iteration] = finetune_cnn_start

        # update infos
        infos['iter'] = iteration
        infos['epoch'] = epoch
        infos['iterators'] = loader.iterators
        infos['best_val_score'] = best_val_score
        infos['opt'] = opt
        infos['val_result_history'] = val_result_history
        infos['loss_history'] = loss_history
        infos['lr_history'] = lr_history
        infos['finetune_cnn_history'] = finetune_cnn_history
        if opt.use_auto_learning_rate == 1:
            infos['train_process_index'] = train_process_index

        if opt.save_snapshot_every > 0 and iteration % opt.save_snapshot_every == 0:
            train_utils.save_model(model, model_cnn, infos, opt)

    loader.terminate()
def train(opt):
    """Training loop for the region/bottom-up model: cross-entropy with
    probability weighting, switching to reinforcement after
    ``opt.reinforce_start``; periodic eval + checkpointing with email/board
    notifications.
    """
    notifier = notify()
    notifier.login()

    # init path
    if not os.path.exists(opt.eval_result_path):
        os.makedirs(opt.eval_result_path)

    config_file = os.path.join(opt.eval_result_path, opt.id + '_config.txt')
    with open(config_file, 'w') as f:
        f.write("{}\n".format(json.dumps(vars(opt), sort_keys=True, indent=2)))

    torch.backends.cudnn.benchmark = True

    if opt.use_tensorboard:
        board = tensorboard.TensorBoard()
        board.start(opt.id, opt.tensorboard_ip)
        # board = trans_client.TransClient()
        # board.start(opt.id)

    print(opt.cnn_model)

    loader = DataLoaderThreadBu(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    vocab = loader.get_vocab()
    batch_size = loader.batch_size

    try:
        if opt.is_load_infos == 1:
            infos = train_utils.load_infos(opt)
        else:
            infos = {}
    # Fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # best-effort resume only needs to absorb ordinary load failures.
    except Exception:
        infos = {}
        print('load infos error')

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = infos.get('val_result_history', {})
    loss_history = infos.get('loss_history', {})
    lr_history = infos.get('lr_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)

    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    else:
        best_val_score = None

    model_cnn = models.setup_cnn(opt)
    model_cnn = nn.DataParallel(model_cnn.cuda())

    model = models.setup(opt)
    model.cuda()

    train_utils.save_model_conf(model_cnn, model, opt)

    update_lr_flag = True

    model_cnn.train()
    model.train()

    # Replicate per-image features to match seq_per_img captions per image.
    if opt.seq_per_img > 1:
        fc_expander = utils.FeatExpander(opt.seq_per_img)
        att_expander = utils.FeatExpander(opt.seq_per_img)
        bu_expander = utils.FeatExpander(opt.seq_per_img)

    # crit = Criterion.LanguageModelWeightNewCriterion()
    crit = Criterion.LanguageModelWithProbWeightCriterion(
        opt.prob_weight_alpha)
    crit_rl = None
    # print(model_cnn)

    optimizer = optim.Adam(model.parameters(),
                           lr=opt.learning_rate,
                           betas=(opt.optim_alpha, opt.optim_beta),
                           eps=opt.optim_epsilon)
    optimizer_cnn = None

    finetune_cnn_start = False
    early_stop_cnt = 0

    params = {}
    params['model'] = model
    params['crit'] = crit
    params['vocab'] = vocab

    while True:
        # try:
        if update_lr_flag:
            if opt.finetune_cnn_after >= 0 and epoch >= opt.finetune_cnn_after:
                finetune_cnn_start = True
            else:
                finetune_cnn_start = False
            optimizer_cnn = train_utils.finetune_cnn(model_cnn, optimizer_cnn,
                                                     finetune_cnn_start, opt)
            train_utils.update_lr(epoch, optimizer, optimizer_cnn,
                                  finetune_cnn_start, opt)
            update_lr_flag = False

        start_total = time.time()
        start = time.time()

        optimizer.zero_grad()
        if finetune_cnn_start:
            optimizer_cnn.zero_grad()

        # batch data
        data = loader.get_batch('train', batch_size)
        images = data['images']
        labels = data['labels']
        masks = data['masks']
        tokens = data['tokens']
        gts = data['gts']
        if opt.verbose:
            print('data {:.3f}'.format(time.time() - start))

        # train cnn
        fc_feats, att_feats, bu_feats = model_cnn(images)
        if opt.seq_per_img > 1:
            fc_feats = fc_expander(fc_feats)
            att_feats = att_expander(att_feats)
            bu_feats = bu_expander(bu_feats)

        params['fc_feats'] = fc_feats
        params['att_feats'] = att_feats
        params['bu_feats'] = bu_feats
        params['labels'] = labels
        params['masks'] = masks
        params['tokens'] = tokens
        params['gts'] = gts

        if opt.reinforce_start >= 0 and epoch >= opt.reinforce_start:
            # Reinforcement phase; the reward criterion is built lazily on
            # first use.
            use_reinforce = True
            if crit_rl is None:
                if opt.is_aic_data:
                    crit_rl = Criterion.RewardCriterionAIC(opt, vocab)
                else:
                    crit_rl = Criterion.RewardCriterion(opt)
            params['crit_rl'] = crit_rl
            train_loss, reward_mean, sample_mean, greedy_mean = train_utils.train_reinforce(
                params, opt)
            if opt.use_tensorboard:
                board.val("sample_mean", sample_mean, iteration)
                board.val("greedy_mean", greedy_mean, iteration)
        else:
            # Cross-entropy phase with probability weighting.
            use_reinforce = False
            params['crit'] = crit
            train_loss, reward_mean = train_utils.train_with_prob_weight(
                params, opt)

        # update the gradient
        utils.clip_gradient(optimizer, opt.grad_clip)
        optimizer.step()
        if finetune_cnn_start:
            utils.clip_gradient(optimizer_cnn, opt.grad_clip)
            optimizer_cnn.step()

        msg = "iter {} (epoch {}), train_loss = {:.3f}, lr = {} lr_cnn = {} f_cnn = {} rf = {} r = {:.3f} time/batch = {:.3f}" \
            .format(iteration, epoch, train_loss, opt.current_lr,
                    opt.current_cnn_lr, finetune_cnn_start, use_reinforce,
                    reward_mean, time.time() - start_total)
        logger.info(msg)
        if opt.use_tensorboard:
            board.loss_train(train_loss, iteration)

        # Update the iteration and epoch
        if not opt.is_eval_start:
            iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True

        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            loss_history[iteration] = train_loss
            lr_history[iteration] = opt.current_lr

        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            if opt.is_every_eval:
                # eval model
                eval_kwargs = {
                    'split': opt.val_split,
                    'dataset': opt.input_json
                }
                eval_kwargs.update(vars(opt))
                print("start eval ...")
                val_loss, predictions, lang_stats, str_stats = eval_utils.eval_split_with_region_bu(
                    model_cnn, model, crit, loader, eval_kwargs)
                if opt.use_tensorboard:
                    board.accuracy(lang_stats, iteration)
                    board.loss_val(val_loss, iteration)
                print("end eval ...")
                msg = "iteration = {} val_loss = {} str_stats = {}".format(
                    iteration, val_loss, str_stats)
                notifier.send(opt.id + " val result", opt.id + " :\n" + msg)
                logger.info(msg)
                train_utils.save_result(str_stats + ',' + str(val_loss),
                                        predictions, opt)
                val_result_history[iteration] = {
                    'loss': val_loss,
                    'lang_stats': lang_stats,
                    'predictions': predictions
                }
                # Save model if is improving on validation result
                if opt.language_eval == 1:
                    eval_metric = opt.eval_metric
                    current_score = lang_stats[eval_metric]
                else:
                    current_score = -val_loss
                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
            else:
                # No eval requested: every checkpoint counts as "best".
                best_flag = True

            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['iterators'] = loader.iterators
            infos['best_val_score'] = best_val_score
            infos['opt'] = opt
            infos['val_result_history'] = val_result_history
            infos['loss_history'] = loss_history
            infos['lr_history'] = lr_history
            infos['vocab'] = loader.get_vocab()

            train_utils.save_model(model, model_cnn, infos, opt)
            if best_flag:
                train_utils.save_model_best(model, model_cnn, infos, opt)
                early_stop_cnt = 0
            else:
                early_stop_cnt += 1

        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            break

        if opt.is_eval_start:
            iteration += 1

    loader.terminate()