import logging
import os

import torch

logging.info("Training starts now!")
optimizer, scheduler = get_optim(conf.lr * conf.num_gpus * conf.batch_size)
for epoch in range(start_epoch + 1, start_epoch + 1 + conf.num_epochs):
    rez = train_epoch(epoch)
    logging.info("overall{:2d}: ({:.3f})\n{}".format(
        epoch, rez.mean(1)['total'], rez.mean(1)))

    if conf.save_dir is not None:
        torch.save({
            'epoch': epoch,
            'state_dict': detector.state_dict(),
            # 'optimizer': optimizer.state_dict(),
        }, os.path.join(conf.save_dir, '{}-{}.tar'.format('vgrel', epoch)))

    mAp = val_epoch()
    scheduler.step(mAp)

    # Stop once any parameter group's lr has decayed below base_lr / 99,
    # i.e. after two ReduceLROnPlateau reductions at factor 0.1.
    if any(pg['lr'] <= (conf.lr * conf.num_gpus * conf.batch_size) / 99.0
           for pg in optimizer.param_groups):
        logging.info("exiting training early")
        break
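
# get_optim is referenced above but not defined in this excerpt. Below is a
# minimal sketch of what it could look like, given how it is used: called with
# the scaled base lr, returns an (optimizer, scheduler) pair, and
# scheduler.step() is driven by the validation mAp. The SGD hyperparameters
# and the factor/patience values are assumptions, not the repository's actual
# settings.
def get_optim(lr):
    # Only the unfrozen (relation-head) parameters are optimized; the object
    # detector's parameters had requires_grad set to False.
    params = [p for p in detector.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9, weight_decay=1e-4)
    # 'max' mode: reduce the lr when the validation mAp plateaus. After two
    # reductions at factor 0.1, lr <= base_lr / 99 and the loop above exits.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.1, patience=3)
    return optimizer, scheduler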

def main():
    fname = os.path.join(conf.save_dir, 'train_losses.csv')
    train_f = open(fname, 'w')
    train_f.write('iter,class_loss,rel_loss,total,recall20,recall50,recall100,'
                  'recall20_con,recall50_con,recall100_con\n')
    train_f.flush()

    fname = os.path.join(conf.save_dir, 'val_losses.csv')
    val_f = open(fname, 'w')
    val_f.write('recall20,recall50,recall100,'
                'recall20_con,recall50_con,recall100_con\n')
    val_f.flush()

    train, val, _ = VG.splits(num_val_im=conf.val_size,
                              filter_duplicate_rels=True,
                              use_proposals=conf.use_proposals,
                              filter_non_overlap=conf.mode == 'sgdet')
    train_loader, val_loader = VGDataLoader.splits(
        train, val,
        mode='rel',
        batch_size=conf.batch_size,
        num_workers=conf.num_workers,
        num_gpus=conf.num_gpus)

    detector = RelModel(
        classes=train.ind_to_classes,
        rel_classes=train.ind_to_predicates,
        num_gpus=conf.num_gpus,
        mode=conf.mode,
        require_overlap_det=True,
        use_resnet=conf.use_resnet,
        order=conf.order,
        nl_edge=conf.nl_edge,
        nl_obj=conf.nl_obj,
        hidden_dim=conf.hidden_dim,
        use_proposals=conf.use_proposals,
        pass_in_obj_feats_to_decoder=conf.pass_in_obj_feats_to_decoder,
        pass_in_obj_feats_to_edge=conf.pass_in_obj_feats_to_edge,
        pooling_dim=conf.pooling_dim,
        rec_dropout=conf.rec_dropout,
        use_bias=conf.use_bias,
        use_tanh=conf.use_tanh,
        limit_vision=conf.limit_vision,
        lml_topk=conf.lml_topk,
        lml_softmax=conf.lml_softmax,
        entr_topk=conf.entr_topk,
        ml_loss=conf.ml_loss)

    # Freeze the object detector; only the relation head is trained.
    for n, param in detector.detector.named_parameters():
        param.requires_grad = False

    print(print_para(detector), flush=True)

    ckpt = torch.load(conf.ckpt)
    if conf.ckpt.split('-')[-2].split('/')[-1] == 'vgrel':
        # Resuming from a relation-model checkpoint: restore everything.
        print("Loading EVERYTHING")
        start_epoch = ckpt['epoch']
        if not optimistic_restore(detector, ckpt['state_dict']):
            start_epoch = -1
    else:
        # Starting from a detector-only checkpoint: restore the backbone and
        # copy its ROI head weights into both ROI feature pathways.
        start_epoch = -1
        optimistic_restore(detector.detector, ckpt['state_dict'])

        detector.roi_fmap[1][0].weight.data.copy_(ckpt['state_dict']['roi_fmap.0.weight'])
        detector.roi_fmap[1][3].weight.data.copy_(ckpt['state_dict']['roi_fmap.3.weight'])
        detector.roi_fmap[1][0].bias.data.copy_(ckpt['state_dict']['roi_fmap.0.bias'])
        detector.roi_fmap[1][3].bias.data.copy_(ckpt['state_dict']['roi_fmap.3.bias'])
        detector.roi_fmap_obj[0].weight.data.copy_(ckpt['state_dict']['roi_fmap.0.weight'])
        detector.roi_fmap_obj[3].weight.data.copy_(ckpt['state_dict']['roi_fmap.3.weight'])
        detector.roi_fmap_obj[0].bias.data.copy_(ckpt['state_dict']['roi_fmap.0.bias'])
        detector.roi_fmap_obj[3].bias.data.copy_(ckpt['state_dict']['roi_fmap.3.bias'])

    detector.cuda()

    print("Training starts now!")
    optimizer, scheduler = get_optim(detector,
                                     conf.lr * conf.num_gpus * conf.batch_size)
    best_eval = None
    for epoch in range(start_epoch + 1, start_epoch + 1 + conf.num_epochs):
        rez = train_epoch(epoch, detector, train, train_loader, optimizer,
                          conf, train_f)
        print("overall{:2d}: ({:.3f})\n{}".format(
            epoch, rez.mean(1)['total'], rez.mean(1)), flush=True)
        mAp = val_epoch(detector, val, val_loader, val_f)
        scheduler.step(mAp)

        # Keep only the best checkpoint, ranked by validation score.
        if conf.save_dir is not None:
            if best_eval is None or mAp > best_eval:
                torch.save({
                    'epoch': epoch,
                    'state_dict': detector.state_dict(),
                    # 'optimizer': optimizer.state_dict(),
                }, os.path.join(conf.save_dir, 'best-val.tar'))
                best_eval = mAp
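
# val_epoch is called above but defined elsewhere in the script. A hedged
# sketch of its likely shape, assuming a BasicSceneGraphEvaluator-style
# evaluator (consistent with the evaluate_scene_graph_entry fragment below);
# the evaluator API, the val_batch helper, and the choice of recall@100 as
# the scheduler metric are assumptions, not confirmed by this excerpt.
def val_epoch(detector, val, val_loader, val_f):
    detector.eval()
    evaluator = BasicSceneGraphEvaluator.all_modes()
    for val_b, batch in enumerate(val_loader):
        # val_batch runs the detector on one batch and feeds gt/pred entries
        # to the evaluator, as in the fragment below.
        val_batch(conf.num_gpus * val_b, batch, evaluator)
    # (Writing the recall row to val_f is omitted in this sketch.)
    rec100 = evaluator[conf.mode].result_dict[conf.mode + '_recall'][100]
    return sum(rec100) / len(rec100)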

pred_entry = {
    'pred_rel_inds': rels_i,
    'obj_scores': obj_scores_i,
    'rel_scores': pred_scores_i,  # hack for now.
}

evaluator[conf.mode].evaluate_scene_graph_entry(
    gt_entry,
    pred_entry,
)
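
# gt_entry, passed to the evaluator above, is not shown in this excerpt. A
# hedged sketch of how such an entry is typically built from the ground-truth
# annotations of the same validation image; the attribute names on `val` and
# the `batch_num` index are assumptions about the surrounding code.
gt_entry = {
    'gt_classes': val.gt_classes[batch_num].copy(),
    'gt_relations': val.relationships[batch_num].copy(),
    'gt_boxes': val.gt_boxes[batch_num].copy(),
}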