import json
import os

import pandas as pd
import torch
from tqdm import tqdm


def main(opt):
    print("=" * 80)
    print("Generating partial discrete flashing ratchet trajectories")
    data = simulation(2, opt.n_step, opt.potential, seed=0)
    # Keep only the particle position (state mod 3); whether the potential is
    # on or off is hidden, so the trajectories carry partial information.
    trainset = data[0] % 3
    testset = data[1] % 3
    print("Done")
    print("=" * 80)

    use_cuda = not opt.no_cuda and torch.cuda.is_available()
    torch.manual_seed(opt.seed)
    opt.device = torch.device("cuda" if use_cuda else "cpu")

    model = RNEEP(opt)
    model = model.to(opt.device)
    optim = torch.optim.Adam(model.parameters(), opt.lr, weight_decay=opt.wd)

    trajs_t = torch.from_numpy(trainset).to(opt.device).long().view(1, -1)
    test_trajs_t = torch.from_numpy(testset).to(opt.device).long().view(1, -1)
    train_sampler = CartesianSeqSampler(1, opt.n_step, opt.seq_len, opt.batch_size)
    test_sampler = CartesianSeqSampler(
        1, opt.n_step, opt.seq_len, opt.test_batch_size, train=False
    )

    ret_train = []
    ret_test = []
    if not os.path.exists(opt.save):
        os.makedirs(opt.save)

    for i in tqdm(range(1, opt.n_iter + 1)):
        if i % opt.record_freq == 0 or i == 1:
            preds, train_loss = validate(opt, model, trajs_t, train_sampler)
            train_log = logging_rneep(i, train_loss, opt.seq_len, preds)
            preds, test_loss = validate(opt, model, test_trajs_t, test_sampler)
            test_log = logging_rneep(i, test_loss, opt.seq_len, preds, train=False)

            if i == 1:
                best_loss = test_loss
                best_pred_rate = test_log["pred_rate"]
            else:
                is_best = test_loss < best_loss
                if is_best:
                    best_loss = test_loss
                    best_pred_rate = test_log["pred_rate"]
                save_checkpoint(
                    {
                        "iteration": i,
                        "state_dict": model.state_dict(),
                        "best_loss": best_loss,
                        "best_pred_rate": best_pred_rate,
                        "optimizer": optim.state_dict(),
                    },
                    is_best,
                    opt.save,
                )
            test_log["best_loss"] = best_loss
            test_log["best_pred_rate"] = best_pred_rate
            ret_train.append(train_log)
            ret_test.append(test_log)
            # validate() switches the sampler to evaluation mode; restore it.
            train_sampler.train()
        train(opt, model, optim, trajs_t, train_sampler)

    train_df = pd.DataFrame(ret_train)
    test_df = pd.DataFrame(ret_test)
    train_df.to_csv(os.path.join(opt.save, "train_log.csv"), index=False)
    test_df.to_csv(os.path.join(opt.save, "test_log.csv"), index=False)

    # torch.device is not JSON-serializable, so store a plain string instead.
    opt.device = "cuda" if use_cuda else "cpu"
    hparams = json.dumps(vars(opt))
    with open(os.path.join(opt.save, "hparams.json"), "w") as f:
        f.write(hparams)
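
# The `% 3` above is what makes these trajectories "partial": the discrete
# flashing ratchet has six states, (position 0-2) x (potential on/off),
# commonly encoded as the integers 0-5, and taking the state modulo 3 keeps
# the position while hiding whether the potential is on. A toy illustration
# in plain numpy (not the project's `simulation` helper):
import numpy as np

full_states = np.array([0, 4, 2, 5, 1, 3])  # hypothetical 6-state trajectory
observed = full_states % 3                  # -> [0, 1, 2, 2, 1, 0]
print(observed)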
import os
import time
import traceback
from collections import defaultdict

import torch
from tensorboardX import SummaryWriter


def train(opt):
    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    infos = {
        'iter': 0,
        'epoch': 0,
        'loader_state_dict': None,
        'vocab': loader.get_vocab(),
    }
    # Load old infos (if there are any) and check whether the models are compatible
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert getattr(saved_model_opt, checkme) == getattr(opt, checkme), \
                    "Command line argument and saved model disagree on '%s'" % checkme
    infos['opt'] = opt

    #########################
    # Build logger
    #########################
    # naive dict logger
    histories = defaultdict(dict)
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'), 'rb') as f:
            histories.update(utils.pickle_load(f))

    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    del opt.vocab
    # Load pretrained weights:
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, 'model.pth')):
        model.load_state_dict(torch.load(os.path.join(opt.start_from, 'model.pth')))

    # Wrap the generation model with the loss function (used for training).
    # This allows the loss to be computed separately on each machine.
    lw_model = LossWrapper(model, opt)
    # Wrap with DataParallel
    dp_model = torch.nn.DataParallel(model)
    dp_lw_model = torch.nn.DataParallel(lw_model)

    ##########################
    # Build optimizer
    ##########################
    if opt.noamopt:
        assert opt.caption_model == 'transformer', 'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(model, factor=opt.noamopt_factor, warmup=opt.noamopt_warmup)
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(model.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(model.parameters(), opt)
    # Load the optimizer
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    # For backward compatibility
    if 'iterators' in infos:
        infos['loader_state_dict'] = {
            split: {
                'index_list': infos['split_ix'][split],
                'iter_counter': infos['iterators'][split],
            }
            for split in ['train', 'val', 'test']
        }
    loader.load_state_dict(infos['loader_state_dict'])
    # Default to None so the comparison below still works when the best score
    # is not loaded.
    best_val_score = infos.get('best_val_score', None) if opt.load_best_score == 1 else None
    if opt.noamopt:
        optimizer._step = iteration
    # Flag indicating the end of an epoch.
    # Always set to True at the beginning to initialize the lr etc.
    epoch_done = True
    # Make sure we are in training mode
    dp_lw_model.train()

    # Start training
    try:
        while True:
            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate ** frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                      opt.scheduled_sampling_max_prob)
                    model.ss_prob = opt.ss_prob

                # If starting self-critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False

                # If starting structure loss training
                if opt.structure_after != -1 and epoch >= opt.structure_after:
                    struc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    struc_flag = False

                epoch_done = False

            start = time.time()
            # Load data from the train split (0)
            data = loader.get_batch('train')
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()

            tmp = [data['fc_feats'], data['att_feats'], data['labels'],
                   data['masks'], data['att_masks']]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            optimizer.zero_grad()
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks, att_masks,
                                    data['gts'], torch.arange(0, len(data['gts'])),
                                    sc_flag, struc_flag)
            loss = model_out['loss'].mean()
            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()

            if struc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, train_loss,
                              model_out['lm_loss'].mean().item(),
                              model_out['struc_loss'].mean().item(), end - start))
            elif not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, model_out['reward'].mean(), end - start))

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                tb_summary_writer.add_scalar('train_loss', train_loss, iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr, iteration)
                tb_summary_writer.add_scalar('scheduled_sampling_prob', model.ss_prob, iteration)
                if sc_flag:
                    tb_summary_writer.add_scalar('avg_reward', model_out['reward'].mean(), iteration)
                elif struc_flag:
                    tb_summary_writer.add_scalar('lm_loss', model_out['lm_loss'].mean().item(), iteration)
                    tb_summary_writer.add_scalar('struc_loss', model_out['struc_loss'].mean().item(), iteration)
                    tb_summary_writer.add_scalar('reward', model_out['reward'].mean().item(), iteration)

                histories['loss_history'][iteration] = \
                    train_loss if not sc_flag else model_out['reward'].mean()
                histories['lr_history'][iteration] = opt.current_lr
                histories['ss_prob_history'][iteration] = model.ss_prob

            # Update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['loader_state_dict'] = loader.state_dict()

            # Evaluate on the validation set, and save the model
            if (iteration % opt.save_checkpoint_every == 0):
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss, predictions, lang_stats = eval_utils.eval_split(
                    dp_model, lw_model.crit, loader, eval_kwargs)

                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)

                # Write the validation result into the summary
                tb_summary_writer.add_scalar('validation loss', val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)
                histories['val_result_history'][iteration] = {
                    'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

                # Save the model if it is improving on the validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss

                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscellaneous information
                infos['best_val_score'] = best_val_score

                utils.save_checkpoint(opt, model, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    utils.save_checkpoint(opt, model, infos, optimizer, append=str(iteration))
                if best_flag:
                    utils.save_checkpoint(opt, model, infos, optimizer, append='best')

            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break
    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        utils.save_checkpoint(opt, model, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
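
# The epoch-driven schedules above reduce to closed forms that are easy to
# check in isolation. A minimal sketch (function names are illustrative, not
# from the source):
def decayed_lr(base_lr, epoch, start, every, rate):
    """Step decay: multiply by `rate` once per `every` epochs after `start`."""
    if start < 0 or epoch <= start:
        return base_lr
    return base_lr * rate ** ((epoch - start) // every)


def scheduled_ss_prob(epoch, start, every, increase, maximum):
    """Scheduled-sampling probability grows stepwise, capped at `maximum`."""
    if start < 0 or epoch <= start:
        return 0.0
    return min(increase * ((epoch - start) // every), maximum)


assert decayed_lr(5e-4, epoch=6, start=0, every=3, rate=0.8) == 5e-4 * 0.8 ** 2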
import torch
import torch.nn as nn
import torch.optim as optim


def main():
    global opt
    best_prec1 = 0
    # Only used when we resume training from a checkpointed model
    resume_epoch = 0

    # Data loaders (drop_last defaults to False)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.batchSize,
                                               shuffle=True, num_workers=int(opt.workers))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=opt.batchSize,
                                              shuffle=True, num_workers=int(opt.workers))

    # Create the model.
    # For ModelNet40, opt.num_points is 2048 and opt.num_classes is 40.
    model = pointnet.PointNetCls(num_points=opt.num_points, k=opt.num_classes)
    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # Define the loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    if opt.cuda:
        print('shifting model and criterion to GPU ..')
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = optim.SGD(model.parameters(), opt.lr,
                          momentum=opt.momentum, weight_decay=opt.weight_decay)

    if opt.optim_state_from != '':
        print('loading optim_state_from {0}'.format(opt.optim_state_from))
        optim_state = torch.load(opt.optim_state_from)
        resume_epoch = optim_state['epoch']
        best_prec1 = optim_state['best_prec1']
        # Configure the optimizer
        optimizer.load_state_dict(optim_state['optim_state_best'])

    for epoch in range(resume_epoch, opt.max_epochs):
        # Train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, opt)

        # Validate
        prec1 = validate(test_loader, model, criterion, epoch, opt)

        # Save checkpoints
        if best_prec1 < prec1:
            best_prec1 = prec1
            path_checkpoint = '{0}/model_best.pth'.format(opt.checkpoint_folder)
            utils.save_checkpoint(model.state_dict(), path_checkpoint)

            # Save the optimizer state
            path_optim_state = '{0}/optim_state_best.pth'.format(opt.checkpoint_folder)
            optim_state = {}
            optim_state['epoch'] = epoch + 1  # because epoch starts from 0
            optim_state['best_prec1'] = best_prec1
            optim_state['optim_state_best'] = optimizer.state_dict()
            utils.save_checkpoint(optim_state, path_optim_state)
            # Open question: should we also store the latest (not only the
            # best) model and optimizer state? Currently we do not.

    print('best accuracy: ', best_prec1)
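
# `utils.save_checkpoint` is project-specific; for a state dict plus a plain
# metadata dict, a minimal stand-in is just `torch.save`, and resuming mirrors
# the loading branch above. A sketch under that assumption:
import torch

def save_checkpoint_minimal(state, path):
    """Serialize a state dict (or any picklable dict) to disk."""
    torch.save(state, path)

# Resume:
#   optim_state = torch.load('optim_state_best.pth')
#   optimizer.load_state_dict(optim_state['optim_state_best'])
#   resume_epoch = optim_state['epoch']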
import json
import os
import random

import pandas as pd
import torch
from tqdm import tqdm


def main(opt):
    trajs = simulation(opt.n_trj, opt.n_step, opt.n_bead, opt.time_step, seed=0)
    test_trajs = simulation(opt.n_trj, opt.n_step, opt.n_bead, opt.time_step, seed=3)
    mean = trajs.mean(axis=(0, 1)).to(opt.device)
    std = trajs.std(axis=(0, 1)).to(opt.device)
    # Parenthesize the conditional: otherwise the `if/else` becomes part of
    # the first lambda's body and transform(x) returns a function whenever
    # opt.normalize is False.
    transform = (lambda x: (x - mean) / std) if opt.normalize else (lambda x: x)

    opt.n_input = opt.n_bead
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)

    model = NEEP(opt)
    model = model.to(opt.device)
    optim = torch.optim.Adam(model.parameters(), opt.lr, weight_decay=opt.wd)

    train_sampler = CartesianSampler(opt.n_trj, opt.n_step, opt.batch_size)
    test_sampler = CartesianSampler(opt.n_trj, opt.n_step, opt.test_batch_size, train=False)
    ents = tot_entpy(test_trajs)

    ret_train = []
    ret_test = []
    if not os.path.exists(opt.save):
        os.makedirs(opt.save)

    for i in tqdm(range(1, opt.n_iter + 1)):
        if i % opt.record_freq == 0 or i == 1:
            # Evaluate on the training trajectories with the training sampler
            # (the original passed test_sampler here, which mismatches the
            # train_sampler.train() restore below).
            preds, train_loss = validate(opt, model, trajs, train_sampler, transform)
            train_log = logging(i, train_loss, opt.time_step, preds)
            preds, test_loss = validate(opt, model, test_trajs, test_sampler, transform)
            test_log = logging_r(i, test_loss, opt.time_step, ents, preds)
            if i == 1:
                best_loss = test_loss
                best_pred_rate = test_log["pred_rate"]
            else:
                is_best = test_loss < best_loss
                if is_best:
                    best_loss = test_loss
                    best_pred_rate = test_log["pred_rate"]
                save_checkpoint(
                    {
                        "iteration": i,
                        "state_dict": model.state_dict(),
                        "best_loss": best_loss,
                        "best_pred_rate": best_pred_rate,
                        "optimizer": optim.state_dict(),
                    },
                    is_best,
                    opt.save,
                )
            test_log["best_loss"] = best_loss
            test_log["best_pred_rate"] = best_pred_rate
            ret_train.append(train_log)
            ret_test.append(test_log)
            train_sampler.train()
        train(opt, model, optim, trajs, train_sampler, transform)

    train_df = pd.DataFrame(ret_train)
    test_df = pd.DataFrame(ret_test)
    train_df.to_csv(os.path.join(opt.save, "train_log.csv"), index=False)
    test_df.to_csv(os.path.join(opt.save, "test_log.csv"), index=False)

    # torch.device is not JSON-serializable, so store a plain string (the
    # original referenced an undefined `use_cuda` here).
    opt.device = str(opt.device)
    hparams = json.dumps(vars(opt))
    with open(os.path.join(opt.save, "hparams.json"), "w") as f:
        f.write(hparams)
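
# The conditional lambda is the subtle spot in this function: without
# parentheses, the `if/else` is parsed as part of the first lambda's body, so
# the un-normalized branch returns a function instead of x. A standalone
# check of both forms:
normalize = False
mean, std = 1.0, 2.0

buggy = lambda x: (x - mean) / std if normalize else lambda x: x
print(callable(buggy(3.0)))  # True -- returns a lambda, not a number

fixed = (lambda x: (x - mean) / std) if normalize else (lambda x: x)
print(fixed(3.0))            # 3.0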
import torch
import torch.nn as nn
import torch.optim as optim


def main():
    global opt
    best_prec1 = 0
    # Only used when we resume training from a checkpointed model
    resume_epoch = 0

    # Data loaders (drop_last defaults to False)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.batch_size,
                                               shuffle=True, num_workers=int(opt.workers))
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=opt.batch_size,
                                             shuffle=True, num_workers=int(opt.workers))

    # Create the model; sizes come from the dataset.
    opt.num_seg_classes = train_dataset.num_seg_classes
    opt.num_points = train_dataset.num_points
    opt.num_classes = train_dataset.num_classes
    model = pointnet.PointNetPartDenseCls(num_points=opt.num_points, k=opt.num_seg_classes)
    if opt.init_model != '':
        print('loading pretrained model from {0}'.format(opt.init_model))
        model.load_state_dict(torch.load(opt.init_model))

    # Segmentation loss; the dense classifier emits log-probabilities, so
    # NLLLoss is the matching criterion.
    criterion = nn.NLLLoss()
    if opt.cuda:
        print('shifting model and criterion to GPU ..')
        model = model.cuda()
        criterion = criterion.cuda()

    # Optimizer
    optimizer = optim.SGD(model.parameters(), opt.lr,
                          momentum=opt.momentum, weight_decay=opt.weight_decay)

    if opt.optim_state_from != '':
        print('loading optim_state_from {0}'.format(opt.optim_state_from))
        optim_state = torch.load(opt.optim_state_from)
        resume_epoch = optim_state['epoch']
        best_prec1 = optim_state['best_prec1']
        # Configure the optimizer
        optimizer.load_state_dict(optim_state['optim_state_best'])

    for epoch in range(resume_epoch, opt.max_epochs):
        # Train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, opt)

        # Validate
        prec1 = validate(val_loader, model, criterion, epoch, opt)

        # Save checkpoints
        if best_prec1 < prec1:
            best_prec1 = prec1
            path_checkpoint = '{0}/model_best.pth'.format(opt.checkpoint_folder)
            utils.save_checkpoint(model.state_dict(), path_checkpoint)

            # Save the optimizer state
            path_optim_state = '{0}/optim_state_best.pth'.format(opt.checkpoint_folder)
            optim_state = {}
            optim_state['epoch'] = epoch + 1  # because epoch starts from 0
            optim_state['best_prec1'] = best_prec1
            optim_state['optim_state_best'] = optimizer.state_dict()
            utils.save_checkpoint(optim_state, path_optim_state)
            # Open question: should we also store the latest (not only the
            # best) model and optimizer state? Currently we do not.

    print('best accuracy: ', best_prec1)
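
# NLLLoss is used here on the assumption (true of the usual PointNet
# implementations) that the dense classifier ends in log_softmax; NLLLoss on
# log-probabilities and CrossEntropyLoss on raw logits compute the same
# quantity. A standalone illustration:
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)            # (batch, classes)
target = torch.randint(0, 10, (4,))

a = F.nll_loss(F.log_softmax(logits, dim=1), target)
b = F.cross_entropy(logits, target)
assert torch.allclose(a, b)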
if (sum(recall[0]) + sum(recall[1]) > best_rec):
    best_rec = sum(recall[0]) + sum(recall[1])
    is_best = True

state = {
    'epoch': epoch,
    'state_dict': join_emb.state_dict(),
    'best_rec': best_rec,
    'args_dict': args,
    'optimizer': optimizer.state_dict(),
}

log_epoch(logger, epoch, train_loss, val_loss, optimizer.param_groups[0]['lr'],
          batch_train, batch_val, data_train, data_val, recall)

save_checkpoint(state, is_best, args.name, epoch)

# Optimizing the text pipeline after one epoch
if epoch == 1:
    for param in join_emb.cap_emb.parameters():
        param.requires_grad = True

    optimizer.add_param_group({
        'params': join_emb.cap_emb.parameters(),
        'lr': optimizer.param_groups[0]['lr'],
        'initial_lr': args.lr,
    })

    lr_scheduler = MultiStepLR(optimizer, args.lrd[1:], gamma=args.lrd[0])

# Starting the finetuning of the whole model
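
# Unfreezing a submodule mid-training and registering it with a live
# optimizer is a general PyTorch pattern; `optimizer.add_param_group` is the
# key call. A self-contained sketch (module names are illustrative, not the
# joint-embedding model above):
import torch
import torch.nn as nn

branches = nn.ModuleDict({'img': nn.Linear(8, 4), 'cap': nn.Linear(8, 4)})
for p in branches['cap'].parameters():   # text branch starts frozen
    p.requires_grad = False

optimizer = torch.optim.SGD(branches['img'].parameters(), lr=0.01)

# Later: unfreeze and let the optimizer see the new parameters.
for p in branches['cap'].parameters():
    p.requires_grad = True
optimizer.add_param_group({'params': branches['cap'].parameters(), 'lr': 0.01})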
import os
import random
import time
import traceback
from collections import defaultdict

import torch
import torch.nn as nn
from tensorboardX import SummaryWriter


def train(opt):
    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    infos = {
        'iter': 0,
        'epoch': 0,
        'loader_state_dict': None,
        'vocab': loader.get_vocab(),
    }
    # Load old infos (if there are any) and check whether the models are compatible
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
            saved_model_opt = infos['opt']
            need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            for checkme in need_be_same:
                assert getattr(saved_model_opt, checkme) == getattr(opt, checkme), \
                    "Command line argument and saved model disagree on '%s'" % checkme
    infos['opt'] = opt

    #########################
    # Build logger
    #########################
    # naive dict logger
    histories = defaultdict(dict)
    if opt.start_from is not None and os.path.isfile(
            os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl')):
        with open(os.path.join(opt.start_from, 'histories_' + opt.id + '.pkl'), 'rb') as f:
            histories.update(utils.pickle_load(f))

    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    multi_models_list = []
    # The first opt.number_of_models entries are the student models ...
    for order in range(opt.number_of_models):
        multi_models_list.append(models.setup(opt).cuda())
    # ... the second opt.number_of_models entries are their EMA (mean-teacher)
    # counterparts: detached from autograd and initialized from the students.
    for order in range(opt.number_of_models):
        multi_models_list.append(models.setup(opt).cuda())
    for order in range(opt.number_of_models, 2 * opt.number_of_models):
        for param in multi_models_list[order].parameters():
            param.detach_()
    for order in range(opt.number_of_models):
        for param, param_ema in zip(
                multi_models_list[order].parameters(),
                multi_models_list[order + opt.number_of_models].parameters()):
            param_ema.data = param.data.clone()
    # multi_models = MultiModels(multi_models_list)
    # multi_models_list.append(SenEncodeModel(opt).cuda())
    multi_models = nn.ModuleList(multi_models_list)
    del opt.vocab

    # Load pretrained weights:
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, 'model.pth')):
        multi_models.load_state_dict(torch.load(os.path.join(opt.start_from, 'model.pth')))

    # Wrap the generation models with the loss functions (used for training).
    # This allows the loss to be computed separately on each machine.
    lw_models = nn.ModuleList([
        LossWrapper(multi_models[index], opt)
        for index in range(opt.number_of_models)])
    kdlw_models = nn.ModuleList([
        KDLossWrapper(multi_models[index], opt)
        for index in range(opt.number_of_models)])
    lw_models_ema = nn.ModuleList([
        LossWrapper(multi_models[opt.number_of_models + index], opt)
        for index in range(opt.number_of_models)])
    kdlw_models_ema = nn.ModuleList([
        KDLossWrapper(multi_models[opt.number_of_models + index], opt)
        for index in range(opt.number_of_models)])

    # Wrap with DataParallel
    dp_models = nn.ModuleList([
        torch.nn.DataParallel(multi_models[index])
        for index in range(opt.number_of_models)])
    dp_lw_models = nn.ModuleList([
        torch.nn.DataParallel(lw_models[index])
        for index in range(opt.number_of_models)])
    dp_kdlw_models = nn.ModuleList([
        torch.nn.DataParallel(kdlw_models[index])
        for index in range(opt.number_of_models)])
    dp_models_ema = nn.ModuleList([
        torch.nn.DataParallel(multi_models[opt.number_of_models + index])
        for index in range(opt.number_of_models)])
    dp_lw_models_ema = nn.ModuleList([
        torch.nn.DataParallel(lw_models_ema[index])
        for index in range(opt.number_of_models)])
    dp_kdlw_models_ema = nn.ModuleList([
        torch.nn.DataParallel(kdlw_models_ema[index])
        for index in range(opt.number_of_models)])

    ##########################
    # Build optimizer
    ##########################
    if opt.noamopt:
        assert opt.caption_model in ['transformer', 'bert', 'm2transformer'], \
            'noamopt can only work with transformer'
        optimizer = utils.get_std_opt(multi_models, factor=opt.noamopt_factor,
                                      warmup=opt.noamopt_warmup)
    elif opt.reduce_on_plateau:
        optimizer = utils.build_optimizer(multi_models.parameters(), opt)
        optimizer = utils.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        optimizer = utils.build_optimizer(multi_models.parameters(), opt)
    # Load the optimizer
    if opt.start_from is not None and os.path.isfile(os.path.join(opt.start_from, "optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.start_from, 'optimizer.pth')))

    ##########################
    # Build loss
    ##########################
    # triplet_loss = nn.TripletMarginLoss()

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    # For backward compatibility
    if 'iterators' in infos:
        infos['loader_state_dict'] = {
            split: {
                'index_list': infos['split_ix'][split],
                'iter_counter': infos['iterators'][split],
            }
            for split in ['paired_train', 'unpaired_images_train',
                          'unpaired_captions_train', 'train', 'val', 'test']
        }
    loader.load_state_dict(infos['loader_state_dict'])
    # Default to None so the comparison below still works when the best score
    # is not loaded.
    best_val_score = infos.get('best_val_score', None) if opt.load_best_score == 1 else None
    if opt.noamopt:
        optimizer._step = iteration
    # Flag indicating the end of an epoch.
    # Always set to True at the beginning to initialize the lr etc.
    epoch_done = True
    # Make sure we are in training mode
    dp_lw_models.train()
    dp_kdlw_models.train()
    dp_lw_models_ema.train()
    dp_kdlw_models_ema.train()

    # Build the ensemble model over the EMA copies
    model_ensemble = AttEnsemble(
        multi_models_list[opt.number_of_models:2 * opt.number_of_models], weights=None)
    # model_ensemble.seq_length = 20
    model_ensemble.cuda()
    # model_ensemble.eval()
    kd_model_outs_list = []

    # Start training
    try:
        while True:
            # Stop if reaching max epochs
            if epoch >= opt.max_epochs and opt.max_epochs != -1:
                break

            if epoch_done:
                if not opt.noamopt and not opt.reduce_on_plateau:
                    # Assign the learning rate
                    if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
                        frac = (epoch - opt.learning_rate_decay_start) // opt.learning_rate_decay_every
                        decay_factor = opt.learning_rate_decay_rate ** frac
                        opt.current_lr = opt.learning_rate * decay_factor
                    else:
                        opt.current_lr = opt.learning_rate
                    utils.set_lr(optimizer, opt.current_lr)  # set the decayed rate
                # Assign the scheduled sampling prob
                if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
                    frac = (epoch - opt.scheduled_sampling_start) // opt.scheduled_sampling_increase_every
                    opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                                      opt.scheduled_sampling_max_prob)
                    for index in range(opt.number_of_models):
                        multi_models[index].ss_prob = opt.ss_prob

                # If starting self-critical training
                if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
                    sc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    sc_flag = False

                # If starting structure loss training
                if opt.structure_after != -1 and epoch >= opt.structure_after:
                    struc_flag = True
                    init_scorer(opt.cached_tokens)
                else:
                    struc_flag = False

                # Ramp the distillation weights linearly once the unpaired
                # training phase begins.
                if epoch >= opt.paired_train_epoch:
                    opt.current_lambda_x = opt.hyper_parameter_lambda_x * \
                        (epoch - (opt.paired_train_epoch - 1)) / \
                        (opt.max_epochs - opt.paired_train_epoch)
                    opt.current_lambda_y = opt.hyper_parameter_lambda_y * \
                        (epoch - (opt.paired_train_epoch - 1)) / \
                        (opt.max_epochs - opt.paired_train_epoch)

                epoch_done = False

            start = time.time()
            # Load data from the appropriate train split
            if epoch < opt.language_pretrain_epoch:
                data = loader.get_batch('unpaired_captions_train')
            elif epoch < opt.paired_train_epoch:
                data = loader.get_batch('paired_train')
            else:
                data = loader.get_batch('paired_train')
                unpaired_data = loader.get_batch('unpaired_images_train')
                unpaired_caption = loader.get_batch('unpaired_captions_train')
            print('Read data:', time.time() - start)

            torch.cuda.synchronize()
            start = time.time()

            if epoch < opt.language_pretrain_epoch:
                # Zero out the visual features during language pretraining
                tmp = [data['fc_feats'] * 0, data['att_feats'] * 0,
                       data['labels'], data['masks'], data['att_masks']]
            else:
                tmp = [data['fc_feats'], data['att_feats'],
                       data['labels'], data['masks'], data['att_masks']]
            if epoch >= opt.paired_train_epoch:
                unpaired_tmp = [unpaired_data['fc_feats'], unpaired_data['att_feats'],
                                unpaired_data['labels'], unpaired_data['masks'],
                                unpaired_data['att_masks']]
                unpaired_caption_tmp = [unpaired_caption['fc_feats'] * 0,
                                        unpaired_caption['att_feats'] * 0,
                                        unpaired_caption['labels'],
                                        unpaired_caption['masks'],
                                        unpaired_caption['att_masks']]

            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp
            if epoch >= opt.paired_train_epoch:
                unpaired_tmp = [_ if _ is None else _.cuda() for _ in unpaired_tmp]
                unpaired_fc_feats, unpaired_att_feats, unpaired_labels, \
                    unpaired_masks, unpaired_att_masks = unpaired_tmp
                unpaired_caption_tmp = [_ if _ is None else _.cuda() for _ in unpaired_caption_tmp]
                unpaired_caption_fc_feats, unpaired_caption_att_feats, \
                    unpaired_caption_labels, unpaired_caption_masks, \
                    unpaired_caption_att_masks = unpaired_caption_tmp
                # Pseudo visual features for the unpaired captions: Gaussian
                # noise that will be optimized by gradient descent below.
                unpaired_caption_fc_feats = unpaired_caption_fc_feats.repeat(5, 1)
                unpaired_caption_fc_feats = opt.std_pseudo_visual_feature * \
                    torch.randn_like(unpaired_caption_fc_feats)
                unpaired_caption_att_feats = unpaired_caption_att_feats.repeat(5, 1, 1)
                unpaired_caption_fc_feats.requires_grad = True
                unpaired_caption_att_feats.requires_grad = True
                unpaired_caption_labels = unpaired_caption_labels.reshape(
                    unpaired_caption_fc_feats.shape[0], -1)
                unpaired_caption_masks = unpaired_caption_masks.reshape(
                    unpaired_caption_fc_feats.shape[0], -1)

            optimizer.zero_grad()

            # Captioning loss, one term per student model. (The original code
            # repeated this identical block in three branches for the
            # language-pretrain, paired, and unpaired phases.)
            language_loss = 0
            model_outs_list = []
            for index in range(opt.number_of_models):
                model_out = dp_lw_models[index](fc_feats, att_feats, labels, masks,
                                                att_masks, data['gts'],
                                                torch.arange(0, len(data['gts'])),
                                                sc_flag, struc_flag)
                model_outs_list.append(model_out)
                language_loss += model_out['loss'].mean()
            loss = language_loss

            if epoch >= opt.paired_train_epoch:
                # For unpaired images: distil beam-search pseudo-labels from
                # the EMA ensemble into a randomly chosen student.
                model_ensemble.eval()
                eval_kwargs = dict()
                eval_kwargs.update(vars(opt))
                with torch.no_grad():
                    seq, seq_logprobs = model_ensemble(unpaired_fc_feats,
                                                       unpaired_att_feats,
                                                       unpaired_att_masks,
                                                       opt=eval_kwargs,
                                                       mode='sample')
                model_ensemble.train()

                model_ensemble_sudo_labels = labels.new_zeros(
                    (opt.batch_size, opt.beam_size, eval_kwargs['max_length'] + 2))
                model_ensemble_sudo_log_prob = masks.new_zeros(
                    (opt.batch_size, opt.beam_size, eval_kwargs['max_length'] + 2,
                     len(loader.get_vocab()) + 1))
                model_ensemble_sum_log_prob = masks.new_zeros(
                    (opt.batch_size, opt.beam_size))
                for batch_index in range(opt.batch_size):
                    for beam_index in range(opt.beam_size):
                        pred = model_ensemble.done_beams[batch_index][beam_index]['seq']
                        log_prob = model_ensemble.done_beams[batch_index][beam_index]['logps']
                        model_ensemble_sudo_labels[batch_index, beam_index,
                                                   1:pred.shape[0] + 1] = pred
                        model_ensemble_sudo_log_prob[batch_index, beam_index,
                                                     1:pred.shape[0] + 1] = log_prob
                        model_ensemble_sum_log_prob[batch_index][beam_index] = \
                            model_ensemble.done_beams[batch_index][beam_index]['p']
                # model_ensemble_prob = F.softmax(model_ensemble_sum_log_prob)

                data_ensemble_sudo_gts = list()
                for data_ensemble_sudo_gts_index in range(model_ensemble_sudo_labels.shape[0]):
                    data_ensemble_sudo_gts.append(
                        model_ensemble_sudo_labels[data_ensemble_sudo_gts_index, :, 1:-1]
                        .data.cpu().numpy())

                # (A disabled experiment followed here: POS-tag the beam
                # candidates with nltk, match the nouns against detected
                # objects in data['dets'], and mask out the pseudo-labels of
                # beams that mention too few detections via a promising_flag
                # tensor. It is omitted from the active path.)
                # sudo_masks_for_model = sudo_masks_for_model.detach()

                distilling_loss = 0
                # We use the random-study mechanism: one student per batch is
                # picked to learn from the ensemble's pseudo-labels.
                who_to_study = random.randint(0, opt.number_of_models - 1)
                model_out = dp_kdlw_models[who_to_study](
                    unpaired_fc_feats, unpaired_att_feats,
                    model_ensemble_sudo_labels, model_ensemble_sudo_log_prob,
                    att_masks, data_ensemble_sudo_gts,
                    torch.arange(0, len(data_ensemble_sudo_gts)), sc_flag,
                    struc_flag, model_ensemble_sum_log_prob)
                distilling_loss += model_out['loss'].mean()
                loss += opt.number_of_models * opt.current_lambda_x * distilling_loss

                ###################################################################
                # Use the unlabelled captions: optimize the pseudo visual
                # features so that the EMA models explain the captions, then
                # train the chosen student on the result.
                # simple_sgd = utils.gradient_descent(unpaired_caption_fc_feats, stepsize=1e3)
                simple_sgd = utils.gradient_descent_adagrad(unpaired_caption_fc_feats, stepsize=1)
                gts_tmp = unpaired_caption['gts']
                new_gts = []
                for ii in range(len(data['gts'])):
                    for jj in range(gts_tmp[ii].shape[0]):
                        new_gts.append(gts_tmp[ii][jj])
                unpaired_caption['gts'] = new_gts
                for itr in range(opt.inner_iteration):
                    unlabelled_caption_model_out = dp_lw_models_ema[itr % opt.number_of_models](
                        unpaired_caption_fc_feats, unpaired_caption_att_feats,
                        unpaired_caption_labels, unpaired_caption_masks,
                        unpaired_caption_att_masks, unpaired_caption['gts'],
                        torch.arange(0, len(unpaired_caption['gts'])),
                        sc_flag, struc_flag)
                    unlabelled_caption_loss = unlabelled_caption_model_out['loss'].mean()
                    unlabelled_caption_loss.backward()
                    simple_sgd.update(unpaired_caption_fc_feats)
                unpaired_caption_fc_feats.requires_grad = False
                unpaired_caption_att_feats.requires_grad = False
                unlabelled_caption_model_out = dp_lw_models[who_to_study](
                    unpaired_caption_fc_feats, unpaired_caption_att_feats,
                    unpaired_caption_labels, unpaired_caption_masks,
                    unpaired_caption_att_masks, unpaired_caption['gts'],
                    torch.arange(0, len(unpaired_caption['gts'])),
                    sc_flag, struc_flag)
                unlabelled_caption_loss = unlabelled_caption_model_out['loss'].mean()
                loss += opt.number_of_models * opt.current_lambda_y * unlabelled_caption_loss

            loss.backward()
            if opt.grad_clip_value != 0:
                getattr(torch.nn.utils, 'clip_grad_%s_' % (opt.grad_clip_mode))(
                    multi_models.parameters(), opt.grad_clip_value)
            optimizer.step()

            # EMA update: pull every teacher parameter toward its student
            for order in range(opt.number_of_models):
                for param, param_ema in zip(
                        multi_models_list[order].parameters(),
                        multi_models_list[order + opt.number_of_models].parameters()):
                    param_ema.data = opt.alpha * param_ema.data + (1 - opt.alpha) * param.data

            train_loss = loss.item()
            torch.cuda.synchronize()
            end = time.time()

            if struc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch, train_loss / opt.number_of_models,
                              sum([model_outs_list[index]['lm_loss'].mean().item()
                                   for index in range(opt.number_of_models)]) / opt.number_of_models,
                              sum([model_outs_list[index]['struc_loss'].mean().item()
                                   for index in range(opt.number_of_models)]) / opt.number_of_models,
                              end - start))
            elif not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch,
                              language_loss.item() / opt.number_of_models, end - start))
            else:
                print("iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}"
                      .format(iteration, epoch,
                              sum([model_outs_list[index]['reward'].mean().item()
                                   for index in range(opt.number_of_models)]) / opt.number_of_models,
                              end - start))

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1
                epoch_done = True

            # Write the training loss summary
            if (iteration % opt.losses_log_every == 0):
                for index in range(opt.number_of_models):
                    model_id = 'model_{}'.format(index)
                    tb_summary_writer.add_scalars(
                        'language_loss',
                        {model_id: model_outs_list[index]['loss'].mean().item()},
                        iteration)
                if epoch >= opt.paired_train_epoch:
                    tb_summary_writer.add_scalar('distilling_loss',
                                                 distilling_loss.item(), iteration)
                    tb_summary_writer.add_scalar('unlabelled_caption_loss',
                                                 unlabelled_caption_loss.item(), iteration)
                    tb_summary_writer.add_scalar('hyper_parameter_lambda_x',
                                                 opt.current_lambda_x, iteration)
                    tb_summary_writer.add_scalar('hyper_parameter_lambda_y',
                                                 opt.current_lambda_y, iteration)
                if opt.noamopt:
                    opt.current_lr = optimizer.rate()
                elif opt.reduce_on_plateau:
                    opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr, iteration)
                tb_summary_writer.add_scalar('scheduled_sampling_prob',
                                             multi_models[0].ss_prob, iteration)
                if sc_flag:
                    for index in range(opt.number_of_models):
                        model_id = 'model_{}'.format(index)
                        tb_summary_writer.add_scalars(
                            'avg_reward',
                            {model_id: model_outs_list[index]['reward'].mean().item()},
                            iteration)
                elif struc_flag:
                    for index in range(opt.number_of_models):
                        model_id = 'model_{}'.format(index)
                        tb_summary_writer.add_scalars(
                            'lm_loss',
                            {model_id: model_outs_list[index]['lm_loss'].mean().item()},
                            iteration)
                        tb_summary_writer.add_scalars(
                            'struc_loss',
                            {model_id: model_outs_list[index]['struc_loss'].mean().item()},
                            iteration)
                        tb_summary_writer.add_scalars(
                            'reward',
                            {model_id: model_outs_list[index]['reward'].mean().item()},
                            iteration)
                        tb_summary_writer.add_scalars(
                            'reward_var',
                            {model_id: model_outs_list[index]['reward'].var(1).mean()},
                            iteration)

                histories['loss_history'][iteration] = (
                    train_loss if not sc_flag else
                    sum([model_outs_list[index]['reward'].mean().item()
                         for index in range(opt.number_of_models)]) / opt.number_of_models)
                histories['lr_history'][iteration] = opt.current_lr
                histories['ss_prob_history'][iteration] = multi_models[0].ss_prob

            # Update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch
            infos['loader_state_dict'] = loader.state_dict()

            # Evaluate on the validation set, and save the model
            if (iteration % opt.save_checkpoint_every == 0 and not opt.save_every_epoch
                    and epoch >= opt.paired_train_epoch) or \
                    (epoch_done and opt.save_every_epoch and epoch >= opt.paired_train_epoch):
                # Evaluate the ensemble of the EMA models
                model = AttEnsemble(
                    multi_models_list[opt.number_of_models:2 * opt.number_of_models],
                    weights=None)
                model.seq_length = opt.max_length
                model.cuda()
                model.eval()
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                # eval_kwargs['beam_size'] = 5
                # eval_kwargs['verbose_beam'] = 1
                # eval_kwargs['verbose_loss'] = 1
                with torch.no_grad():
                    val_loss, predictions, lang_stats = eval_utils.eval_split(
                        model, lw_models[0].crit, loader, eval_kwargs)
                model.train()

                if opt.reduce_on_plateau:
                    if 'CIDEr' in lang_stats:
                        optimizer.scheduler_step(-lang_stats['CIDEr'])
                    else:
                        optimizer.scheduler_step(val_loss)

                # Write the validation result into the summary
                tb_summary_writer.add_scalar('validation loss', val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)
                histories['val_result_history'][iteration] = {
                    'loss': val_loss, 'lang_stats': lang_stats, 'predictions': predictions}

                # Save the model if it is improving on the validation result
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss

                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscellaneous information
                infos['best_val_score'] = best_val_score

                utils.save_checkpoint(opt, multi_models, infos, optimizer, histories)
                if opt.save_history_ckpt:
                    utils.save_checkpoint(
                        opt, multi_models, infos, optimizer,
                        append=str(epoch) if opt.save_every_epoch else str(iteration))
                if best_flag:
                    utils.save_checkpoint(opt, multi_models, infos, optimizer, append='best')
    except (RuntimeError, KeyboardInterrupt):
        print('Save ckpt on exception ...')
        utils.save_checkpoint(opt, multi_models, infos, optimizer)
        print('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)
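
# The teacher copies above are maintained purely by exponential moving
# average (the mean-teacher scheme): no gradients ever flow into them, and
# after every optimizer step each teacher parameter is pulled toward its
# student with smoothing factor alpha (opt.alpha above). A minimal
# self-contained sketch:
import torch
import torch.nn as nn

student = nn.Linear(4, 4)
teacher = nn.Linear(4, 4)
teacher.load_state_dict(student.state_dict())
for p in teacher.parameters():
    p.detach_()                      # the teacher is never trained directly

def ema_update(student, teacher, alpha=0.999):
    with torch.no_grad():
        for p, p_ema in zip(student.parameters(), teacher.parameters()):
            p_ema.data = alpha * p_ema.data + (1 - alpha) * p.data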
import os
import shutil

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim


def main(opt):
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    cudnn.benchmark = True

    opt.checkpoint_folder += '_' + opt.backbone
    if opt.sketch_finetune:
        opt.checkpoint_folder += '_finetune'
    if not os.path.exists(opt.checkpoint_folder):
        os.makedirs(opt.checkpoint_folder)
    print(opt)

    # Redirect print to both console and log file
    # if not opt.evaluate:
    #     sys.stdout = Logger(os.path.join(opt.logs_dir, opt.log_name))

    # Create data loaders
    if opt.height is None or opt.width is None:
        opt.height, opt.width = (224, 224)
    train_sketch_loader, train_shape_loader, test_sketch_loader, test_shape_loader = get_data(
        opt.train_shape_views_folder, opt.test_shape_views_folder,
        opt.train_shape_flist, opt.test_shape_flist,
        opt.train_sketch_folder, opt.test_sketch_folder,
        opt.train_sketch_flist, opt.test_sketch_flist,
        opt.height, opt.width, opt.batch_size, opt.workers, pk_flag=False)

    # Create model
    # if opt.pool_idx is None:
    #     opt.pool_idx = set_default_pool
    kwargs = {'pool_idx': opt.pool_idx} if opt.pool_idx is not None else {}
    backbone = getattr(models, opt.backbone)  # avoids eval() on a config string
    net_bp = backbone.Net_Prev_Pool(**kwargs)
    net_vp = backbone.View_And_Pool()
    net_ap = backbone.Net_After_Pool(**kwargs)
    if opt.sketch_finetune:
        net_whole = backbone.Net_Whole(nclasses=10, use_finetuned=True)
    else:
        net_whole = backbone.Net_Whole(nclasses=10)
    # For AlexNet or VGG, feat_dim = 4096; for ResNet, feat_dim = 2048.
    net_cls = backbone.Net_Classifier(nclasses=10)

    # Criterion
    # classification loss (no class balancing at the moment)
    crt_cls = nn.CrossEntropyLoss().cuda()
    # triplet center loss
    crt_tlc = custom_loss.TripletCenterLoss(margin=opt.margin).cuda()
    if opt.wn:
        crt_tlc = torch.nn.utils.weight_norm(crt_tlc, name='centers')
    criterion = [crt_cls, crt_tlc, opt.w1, opt.w2]

    # Load from checkpoint
    start_epoch = best_top1 = 0
    if opt.resume:
        checkpoint = torch.load(opt.resume)
        net_bp.load_state_dict(checkpoint['net_bp'])
        net_ap.load_state_dict(checkpoint['net_ap'])
        net_whole.load_state_dict(checkpoint['net_whole'])
        net_cls.load_state_dict(checkpoint['net_cls'])
        crt_tlc.load_state_dict(checkpoint['centers'])
        start_epoch = checkpoint['epoch']
        best_top1 = checkpoint['best_prec']
        # print("=> Start epoch {} best top1 {:.1%}".format(start_epoch, best_top1))

    net_bp = nn.DataParallel(net_bp).cuda()
    net_vp = net_vp.cuda()
    net_ap = nn.DataParallel(net_ap).cuda()
    net_whole = nn.DataParallel(net_whole).cuda()
    net_cls = nn.DataParallel(net_cls).cuda()

    # Wrap multiple models in optimizers
    optim_shape = optim.SGD([{'params': net_ap.parameters()},
                             {'params': net_bp.parameters(), 'lr': 1e-3},
                             {'params': net_cls.parameters()}],
                            lr=0.001, momentum=0.9, weight_decay=opt.weight_decay)

    base_param_ids = set(map(id, net_whole.module.features.parameters()))
    new_params = [p for p in net_whole.parameters() if id(p) not in base_param_ids]
    param_groups = [
        {'params': net_whole.module.features.parameters(), 'lr_mult': 0.1},
        {'params': new_params, 'lr_mult': 1.0}]
    # optim_sketch = optim.SGD(net_whole.module.parameters(), lr=0.01)
    optim_sketch = optim.SGD(param_groups, lr=0.001, momentum=0.9,
                             weight_decay=opt.weight_decay)
    optim_centers = optim.SGD(crt_tlc.parameters(), lr=0.1)

    optimizer = (optim_sketch, optim_shape, optim_centers)
    model = (net_whole, net_bp, net_vp, net_ap, net_cls)

    # Schedule learning rate
    def adjust_lr(epoch, optimizer):
        step_size = 800 if opt.pk_flag else 80  # 40
        lr = opt.lr * (0.1 ** (epoch // step_size))
        for g in optimizer.param_groups:
            g['lr'] = lr * g.get('lr_mult', 1)

    # Start training
    top1 = 0.0
    if opt.evaluate:
        # validate and compute mAP
        _, top1 = validate(test_sketch_loader, test_shape_loader, model, criterion, 0, opt)
        exit()

    best_epoch = -1
    best_metric = None
    # total_epochs = opt.max_epochs * 10 if opt.pk_flag else opt.max_epochs
    for epoch in range(start_epoch, opt.max_epochs):
        # adjust_lr(epoch, optim_sketch)
        # adjust_lr(epoch, optim_shape)
        # adjust_lr(epoch, optim_centers)

        # cls acc top1
        train_top1 = train(train_sketch_loader, train_shape_loader, model,
                           criterion, optimizer, epoch, opt)
        if epoch < opt.start_save and (epoch % opt.interval == 0):
            continue

        if train_top1 > 0.1:
            print("Test:")
            cur_metric = validate(test_sketch_loader, test_shape_loader, model,
                                  criterion, epoch, opt)
            top1 = cur_metric[-1]

            is_best = top1 > best_top1
            if is_best:
                best_epoch = epoch + 1
                best_metric = cur_metric
            best_top1 = max(top1, best_top1)

            checkpoint = {
                'epoch': epoch + 1,
                'current_prec': top1,
                'best_prec': best_top1,
                'net_bp': net_bp.module.state_dict(),
                'net_ap': net_ap.module.state_dict(),
                'net_whole': net_whole.module.state_dict(),
                'net_cls': net_cls.module.state_dict(),
                'centers': crt_tlc.state_dict(),
            }
            path_checkpoint = '{0}/model_latest.pth'.format(opt.checkpoint_folder)
            utils.save_checkpoint(checkpoint, path_checkpoint)

            if is_best:
                # save the best checkpoint
                path_checkpoint = '{0}/model_best.pth'.format(opt.checkpoint_folder)
                utils.save_checkpoint(checkpoint, path_checkpoint)
                if opt.sf:
                    shutil.copyfile(opt.checkpoint_folder + '/test_feat_temp.mat',
                                    opt.checkpoint_folder + '/test_feat_best.mat')

            print('\n * Finished epoch {:3d} top1: {:5.3%} best: {:5.3%}{} @epoch {}\n'
                  .format(epoch, top1, best_top1, ' *' if is_best else '', best_epoch))

    print('Best metric', best_metric)
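
# adjust_lr scales each parameter group by an optional lr_mult, which is how
# the pretrained backbone takes a 10x smaller step than the freshly added
# heads under a single schedule. A standalone sketch of the same pattern:
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))
sgd = torch.optim.SGD([
    {'params': net[0].parameters(), 'lr_mult': 0.1},  # pretrained backbone
    {'params': net[1].parameters(), 'lr_mult': 1.0},  # new head
], lr=0.001, momentum=0.9)

def adjust_lr_sketch(base_lr, epoch, optimizer, step_size=80):
    lr = base_lr * (0.1 ** (epoch // step_size))
    for g in optimizer.param_groups:
        g['lr'] = lr * g.get('lr_mult', 1)

adjust_lr_sketch(0.001, epoch=160, optimizer=sgd)
# backbone lr -> 1e-6, head lr -> 1e-5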
import os
import time

import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter


def main():
    # Setup workspace and backup files
    cfg = options.get_config()
    workspace = utils.setup_workspace(cfg.workspace)
    if cfg.pretrained is not None:
        logger = utils.Logger(os.path.join(workspace.log, 'train_log.txt'), mode='a')
    else:
        logger = utils.Logger(os.path.join(workspace.log, 'train_log.txt'))
    tf_logger = SummaryWriter(workspace.log)
    logger.write('Workspace: {}'.format(cfg.workspace), 'green')
    logger.write('CUDA: {}, Multi-GPU: {}'.format(cfg.cuda, cfg.multi_gpu), 'green')
    logger.write('To-disparity: {}'.format(cfg.to_disparity), 'green')

    # Define dataloader
    logger.write('Dataset: {}'.format(cfg.dataset_name), 'green')
    train_dataset, val_dataset = options.get_dataset(cfg.dataset_name)
    # worker_init_fn ensures different sampling patterns for each
    # data-loading worker
    train_loader = DataLoader(train_dataset,
                              batch_size=cfg.batch_size,
                              shuffle=True,
                              num_workers=cfg.workers,
                              pin_memory=True,
                              sampler=None,
                              worker_init_fn=lambda work_id: np.random.seed(work_id))
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False,
                            pin_memory=True, num_workers=cfg.workers)

    # Define model
    logger.write('Model: {}'.format(cfg.model_name), 'green')
    model = options.get_model(cfg.model_name)
    if cfg.multi_gpu:
        model = nn.DataParallel(model)
    if cfg.cuda:
        model = model.cuda()

    # Define loss function
    criterion = options.get_criterion(cfg.criterion_name)
    if cfg.cuda:
        criterion = criterion.cuda()
    logger.write('Criterion: {}'.format(criterion), 'green')

    # Define optimizer and learning rate scheduler
    optim = options.get_optimizer(cfg.optimizer_name, model.parameters())
    lr_scheduler = options.get_lr_scheduler(cfg.lr_scheduler_name, optim)
    logger.write('Optimizer: {}'.format(optim), 'green')
    if lr_scheduler is not None:
        logger.write('Learning rate scheduler: {}'.format(lr_scheduler), 'green')

    # [Optional] load pretrained model
    start_ep = 0
    global_step = 0
    local_start = 0
    if cfg.pretrained is not None:
        start_ep, global_step = utils.load_checkpoint(model, optim, lr_scheduler,
                                                      cfg.pretrained, cfg.weight_only)
        logger.write('Load pretrained model from {}'.format(cfg.pretrained), 'green')
        # global_step = len(train_dataset) * start_ep
        # NOTE: the global step starts from the beginning of the epoch
        local_start = global_step % len(train_dataset)

    # Start training
    logger.write('Start training...', 'green')
    for ep in range(start_ep, cfg.max_epoch):
        if lr_scheduler is not None:
            logger.write('Update learning rate: {} --> '.format(lr_scheduler.get_lr()[0]),
                         'magenta', end='')
            lr_scheduler.step()
            logger.write('{}'.format(lr_scheduler.get_lr()[0]), 'magenta')

        # Train an epoch
        model.train()
        meters = metric.Metrics(cfg.train_metric_field)
        avg_meters = metric.MovingAverageEstimator(cfg.train_metric_field)
        end = time.time()
        for it, data in enumerate(train_loader, local_start):
            # Pack data
            if cfg.cuda:
                for k in data.keys():
                    data[k] = data[k].cuda()
            inputs = dict()
            inputs['left_rgb'] = data['left_rgb']
            inputs['right_rgb'] = data['right_rgb']
            if cfg.to_disparity:
                inputs['left_sd'] = data['left_sdisp']
                inputs['right_sd'] = data['right_sdisp']
                target = data['left_disp']
            else:
                inputs['left_sd'] = data['left_sd']
                inputs['right_sd'] = data['right_sd']
                target = data['left_d']
            data_time = time.time() - end

            # Inference, compute loss and update model
            end = time.time()
            optim.zero_grad()
            pred = model(inputs)
            if cfg.criterion_name in ['inv_disp_l1']:
                pred_d = utils.disp2depth(pred, data['width'].item())
                loss = criterion(pred_d, data['left_d'])
            else:
                loss = criterion(pred, target)
            loss.backward()
            optim.step()
            update_time = time.time() - end
            end = time.time()

            # Measure performance
            pred_np = pred.data.cpu().numpy()
            target_np = target.data.cpu().numpy()
            results = meters.compute(pred_np, target_np)
            avg_meters.update(results)

            # Print results
            if (it % cfg.print_step) == 0:
                logger.write('[{:2d}/{:2d}][{:5d}/{:5d}] data time: {:4.3f}, update time: {:4.3f}, loss: {:.4f}'
                             .format(ep, cfg.max_epoch, it, len(train_loader),
                                     data_time, update_time, loss.item()))
                avg_results = avg_meters.compute()
                logger.write(' [Average results] ', end='')
                for key, val in avg_results.items():
                    logger.write('{}: {:5.3f} '.format(key, val), end='')
                logger.write('')
                avg_meters.reset()

            # Log to tensorboard
            if (it % cfg.tflog_step) == 0:
                tf_logger.add_scalar('A-Loss/loss', loss.data, global_step)
                for key, val in results.items():
                    tf_logger.add_scalar('B-Train-Dense-Metric/{}'.format(key), val, global_step)
                if cfg.lr_scheduler_name is not None:
                    tf_logger.add_scalar('C-Learning-Rate', lr_scheduler.get_lr()[0], global_step)
                tf_logger.add_image('A-RGB/left', inputs['left_rgb'].data, global_step)
                tf_logger.add_image('A-RGB/right', inputs['right_rgb'].data, global_step)
                norm_factor = target.data.max(-1)[0].max(-1)[0].max(-1)[0][:, None, None, None]
                tf_logger.add_image('B-sD', inputs['left_sd'].data / norm_factor, global_step)
                tf_logger.add_image('C-Pred', pred.data / norm_factor, global_step)
                tf_logger.add_image('C-Ground-Truth', target.data / norm_factor, global_step)
                if cfg.dump_all_param:
                    # NOTE: this will require a lot of disk space
                    for name, param in model.named_parameters():
                        tf_logger.add_histogram(name + '/vars',
                                                param.data.clone().cpu().numpy(),
                                                global_step)
                        if param.requires_grad:
                            tf_logger.add_histogram(name + '/grads',
                                                    param.grad.clone().cpu().numpy(),
                                                    global_step)

            # On-the-fly validation
            if (it % cfg.val_step) == 0:  # and not (ep == 0 and it == 0):
                validate(global_step, val_loader, model, logger, tf_logger, cfg)

            # Save model
            if (it % cfg.save_step) == 0:
                ckpt_path = utils.save_checkpoint(workspace.ckpt, model, optim,
                                                  lr_scheduler, ep, global_step)
                logger.write('Save checkpoint to {}'.format(ckpt_path), 'magenta')

            # Update global step
            global_step += 1
            if it >= len(train_dataset):
                local_start = 0
                break
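
# metric.MovingAverageEstimator is project-specific, but the usual shape of
# such a running estimator is small enough to sketch (illustrative, not the
# project's class): accumulate per-batch metric dicts, report their mean,
# reset after printing.
class MovingAverageSketch:
    def __init__(self, fields):
        self.fields = fields
        self.reset()

    def reset(self):
        self.sums = {f: 0.0 for f in self.fields}
        self.count = 0

    def update(self, results):
        for f in self.fields:
            self.sums[f] += results[f]
        self.count += 1

    def compute(self):
        return {f: self.sums[f] / max(self.count, 1) for f in self.fields}

avg = MovingAverageSketch(['epe'])
avg.update({'epe': 1.0})
avg.update({'epe': 3.0})
assert avg.compute() == {'epe': 2.0}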
import os

import torch
import torch.optim as optim
from tensorboardX import SummaryWriter


def train(opt):
    ################################
    # Build dataloader
    ################################
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    ##########################
    # Initialize infos
    ##########################
    infos = {
        'iter': 0,
        'epoch': 0,
        'vocab': loader.get_vocab(),
    }
    # Load old infos (if there are any) and check whether the models are compatible
    if opt.checkpoint_path is not None and os.path.isfile(
            os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '.pkl')):
        with open(os.path.join(opt.checkpoint_path, 'infos_' + opt.id + '.pkl'), 'rb') as f:
            infos = utils.pickle_load(f)
            print('infos load success')
    infos['opt'] = opt

    # tensorboard logger
    tb_summary_writer = SummaryWriter(opt.checkpoint_path)

    ##########################
    # Build model
    ##########################
    opt.vocab = loader.get_vocab()
    model = models.setup(opt).cuda()
    del opt.vocab
    # Load pretrained weights:
    if opt.checkpoint_path is not None and os.path.isfile(
            os.path.join(opt.checkpoint_path, 'model.pth')):
        model.load_state_dict(torch.load(os.path.join(opt.checkpoint_path, 'model.pth')))
        print('model load success')

    # Wrap the generation model with the loss function (used for training).
    # This allows the loss to be computed separately on each machine.
    lw_model = LossWrapper(model, opt)
    # Wrap with DataParallel
    dp_model = torch.nn.DataParallel(model)
    dp_lw_model = torch.nn.DataParallel(lw_model)

    ##########################
    # Build optimizer
    ##########################
    optimizer = utils.ReduceLROnPlateau(
        optim.Adam(model.parameters(), opt.learning_rate), factor=0.5, patience=3)
    # Load the optimizer
    if opt.checkpoint_path is not None and os.path.isfile(
            os.path.join(opt.checkpoint_path, "optimizer.pth")):
        optimizer.load_state_dict(torch.load(os.path.join(opt.checkpoint_path, 'optimizer.pth')))

    #########################
    # Get ready to start
    #########################
    iteration = infos['iter']
    epoch = infos['epoch']
    best_val_score = infos.get('best_val_score', None)
    print('iter {}, epoch {}, best_val_score {}'.format(iteration, epoch, best_val_score))
    print(sorted(dict(set(vars(opt).items())).items(), key=lambda x: x[0]))

    # Start training
    if opt.self_critical:
        init_scorer(opt.cached_tokens)
    # Make sure we are in training mode
    dp_lw_model.train()

    try:
        while True:
            # Stop if reaching max_epochs
            if epoch >= opt.max_epochs:
                break

            # Load data from the train split (0)
            data = loader.get_batch('train')
            torch.cuda.synchronize()

            tmp = [data['fc_feats'], data['att_feats'], data['labels'],
                   data['masks'], data['att_masks']]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp

            optimizer.zero_grad()
            model_out = dp_lw_model(fc_feats, att_feats, labels, masks, att_masks,
                                    data['gts'], torch.arange(0, len(data['gts'])))
            loss = model_out['loss'].mean()
            loss.backward()
            torch.nn.utils.clip_grad_value_(model.parameters(), 0.1)
            optimizer.step()
            train_loss = loss.item()
            torch.cuda.synchronize()

            # Update the iteration and epoch
            iteration += 1
            if data['bounds']['wrapped']:
                epoch += 1

            # Write the training loss summary
            if iteration % opt.losses_log_every == 0:
                tb_summary_writer.add_scalar('train_loss', train_loss, iteration)
                opt.current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar('learning_rate', opt.current_lr, iteration)
                if opt.self_critical:
                    tb_summary_writer.add_scalar('avg_reward', model_out['reward'].mean(), iteration)

            # Update infos
            infos['iter'] = iteration
            infos['epoch'] = epoch

            # Evaluate on the validation set, and save the model
            if iteration % opt.save_checkpoint_every == 0:
                tb_summary_writer.add_scalar('epoch', epoch, iteration)
                # eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                _, _, lang_stats = eval_utils.eval_split(dp_model, loader, eval_kwargs)
                optimizer.scheduler_step(-lang_stats['CIDEr'])

                # Write the validation result into the summary
                for k, v in lang_stats.items():
                    tb_summary_writer.add_scalar(k, v, iteration)

                # Save the model if it is improving on the validation result
                current_score = lang_stats['CIDEr']
                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True

                # Dump miscellaneous information
                infos['best_val_score'] = best_val_score
                utils.save_checkpoint(opt, model, infos, optimizer)
                if best_flag:
                    utils.save_checkpoint(opt, model, infos, optimizer, append='best')
    except (RuntimeError, KeyboardInterrupt):
        pass
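
# Stepping the plateau scheduler on -CIDEr turns "maximize CIDEr" into
# "minimize its negative". With the stock torch scheduler the same effect is
# written with mode='max' (a sketch of the idea, not the utils wrapper used
# above):
import torch

params = [torch.nn.Parameter(torch.zeros(1))]
adam = torch.optim.Adam(params, lr=5e-4)
plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
    adam, mode='max', factor=0.5, patience=3)

for cider in [0.9, 1.0, 1.0, 1.0, 1.0, 1.0]:  # stagnating validation score
    plateau.step(cider)      # pass the raw score; no negation needed
print(adam.param_groups[0]['lr'])  # halved after `patience` flat evals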