def train_folds():
    """Train one network per CV fold, warm-starting from fold-4 fix2 weights.

    Folds not in the whitelist are skipped.  Prints the mean of the
    per-fold best IOUs when done.
    """
    fold_whitelist = [0, 1, 2, 3, 4]
    best_ious = []
    for fold in range(args.num_folds):
        if fold not in fold_whitelist:
            print('skip fold', fold)
            continue
        # checkpoint path where this fold's best weights will be written
        ckpt_path = '../model_weights/best/best_fix3_{}_{}_fold-{}.pth'.format(
            args.model_name, args.exp_name, fold)
        net = ResUNet(use_bool=True)
        # warm start: every fold begins from the fold-4 "fix2" checkpoint
        warm_start_path = '../model_weights/best/best_fix2_{}_{}_fold-{}.pth'.format(
            args.model_name, args.exp_name, 4)
        net.load_state_dict(torch.load(warm_start_path))
        net.to(device)
        print('loaded', warm_start_path)
        print(net.parameters)
        print('Starting fold {} ...'.format(fold))
        best_ious.append(train_network(net, fold, model_ckpt=ckpt_path))
    print('Average IOU:', np.mean(best_ious))
def train_folds():
    """Run training across every CV fold and report the mean best IOU."""
    fold_ious = []
    for fold in range(args.num_folds):
        # per-fold path for the best checkpoint
        ckpt_path = '../model_weights/best_{}_{}_fold-{}.pth'.format(
            args.model_name, args.exp_name, fold)
        net = ResUNet(use_bool=True)
        # gpu == 99 is the sentinel for multi-GPU data-parallel training
        if args.gpu == 99:
            net = nn.parallel.DataParallel(net)
        net.to(device)
        print(net.parameters)
        print('Starting fold {} ...'.format(fold))
        fold_ious.append(train_network(net, fold, model_ckpt=ckpt_path))
    print('Average IOU:', np.mean(fold_ious))
def train_folds():
    """Train only fold 0 (the loop early-exits) and print its best IOU."""
    ious = []
    for fold in range(args.num_folds):
        # only the first fold is trained; the rest are skipped
        if fold > 0:
            break
        ckpt_path = '../model_weights/best_{}_{}_fold-{}.pth'.format(
            args.model_name, args.exp_name, fold)
        net = ResUNet(use_bool=True)
        if args.gpu == 99:
            # sentinel value 99 -> two-GPU data parallel
            net = nn.DataParallel(net, device_ids=[0, 1]).cuda()
        else:
            torch.cuda.set_device(args.gpu)
            cudnn.benchmark = True
            net.cuda()
        print('Starting fold {} ...'.format(fold))
        ious.append(train_network(net, fold, model_ckpt=ckpt_path))
    print('Average IOU:', np.mean(ious))
def train_folds():
    """Train a mean-teacher student/teacher pair for every CV fold.

    The teacher's parameters are detached here (presumably updated by EMA
    inside ``train_network`` rather than by the optimizer — confirm against
    that implementation).  Prints the mean of the per-fold best IOUs.
    """
    best_ious = []
    for fold in range(args.num_folds):
        # per-fold checkpoint path for the best mean-teacher weights
        MODEL_CKPT = '../model_weights/best_mt_{}_{}_fold-{}.pth'.format(
            args.model_name, args.exp_name, fold)

        student = ResUNet(use_bool=True)
        teacher = ResUNet(use_bool=True)
        # teacher must not receive gradients
        for param in teacher.parameters():
            param.detach_()

        # BUG FIX: the original called net.load_state_dict(...) before any
        # model existed — `net` is undefined in this function, so
        # args.load_best always raised NameError.  Resume from the fold
        # checkpoint after the models are constructed instead, restoring
        # both student and teacher to the same saved state.
        if args.load_best:
            state = torch.load(MODEL_CKPT,
                               map_location=lambda storage, loc: storage)
            student.load_state_dict(state)
            teacher.load_state_dict(state)

        if args.gpu == 99:
            # sentinel: 99 means use both GPUs via DataParallel
            student = nn.DataParallel(student, device_ids=[0, 1]).cuda()
            teacher = nn.DataParallel(teacher, device_ids=[0, 1]).cuda()
        else:
            torch.cuda.set_device(args.gpu)
            cudnn.benchmark = True
            student.cuda()
            teacher.cuda()

        print('Starting fold {} ...'.format(fold))
        best_ious.append(
            train_network(student, teacher, fold, model_ckpt=MODEL_CKPT))
    print('Average IOU:', np.mean(best_ious))
action='store_true', help='whether to use output from deeply supervised parts') parser.add_argument('--mask_zeros', action='store_true', help='whether to set full masks as all 0') args = parser.parse_args() weights = glob.glob(args.weight_folder + '*.pth') print('Found {} models'.format(len(weights))) OUT_FILE = '../subm/averaged_{}_resunet_models.csv'.format(len(weights)) # get the loaders test_loader = get_test_loader(imsize=args.imsize, batch_size=args.batch_size) net = ResUNet(use_bool=args.use_bool) if args.gpu == 99: net = nn.DataParallel(net, device_ids=[0, 1]).cuda() else: torch.cuda.set_device(args.gpu) cudnn.benchmark = True net.cuda() def RLenc(img, order='F', format=True): """ img is binary mask image, shape (r,c) order is down-then-right, i.e. Fortran format determines if the order needs to be preformatted (according to submission rules) or not
def train_network(net, fold=0, model_ckpt=None):
    """Train `net` on one cross-validation fold; return the best val IOU.

    "fix2/fix3" schedule: SGD starting at lr_max / 4; when validation IOU
    stalls for more than 15 epochs a new cycle begins with a cosine-rampdown
    LR restart; 5 stale epochs halve the LR.  KeyboardInterrupt stops
    training early but still falls through to the optional SWA averaging
    and CSV logging below.

    NOTE(review): the locals used after the except block (train_losses,
    best_val_iou, ...) are only bound inside the try — an interrupt before
    they are assigned would raise NameError.
    """
    # train the network, allow for keyboard interrupt
    try:
        # define optimizer (lr deliberately starts at a quarter of lr_max)
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr_max / 4,
                              momentum=0.9,
                              weight_decay=args.l2)

        # get the loaders for this fold
        train_loader, valid_loader = get_data_loaders(
            imsize=args.imsize,
            batch_size=args.batch_size,
            num_folds=args.num_folds,
            fold=fold)

        # training flags / bookkeeping
        swa = False
        use_lovasz = True           # lovasz loss is on from the start here
        freeze_bn = False
        save_imgs = False
        train_losses = []
        valid_losses = []
        valid_ious = []
        lr_patience = 0             # stale epochs since improvement (LR halving)
        valid_patience = 0          # stale epochs since improvement (cycling)
        best_val_metric = 1000.0
        best_val_iou = 0.0
        cycle = 1
        swa_n = 0
        t_ = 0                      # epoch counter driving the cosine rampdown

        print('args.swa', args.swa)
        print('args.lr_rampdown', args.lr_rampdown)
        print('args.use_lovasz', args.use_lovasz)
        print('Training ...')
        for e in range(args.epochs):
            print('\n' + 'Epoch {}/{}'.format(e, args.epochs))
            start = time.time()

            # log the current learning rate (once per param group)
            for params in optimizer.param_groups:
                print('Learning rate set to {:.4}'.format(
                    optimizer.param_groups[0]['lr']))

            t_l = train(net, optimizer, train_loader, freeze_bn, use_lovasz)
            v_l, viou = valid(net, optimizer, valid_loader, use_lovasz,
                              save_imgs, fold)

            # save the model on best validation IOU
            if viou >= best_val_iou:
                print('new best_val_iou', viou)
                net.eval()
                torch.save(net.state_dict(), model_ckpt)
                best_val_metric = v_l
                best_val_iou = viou
                valid_patience = 0
                lr_patience = 0
            else:
                print('patience', valid_patience)
                valid_patience += 1
                lr_patience += 1

            train_losses.append(t_l)
            valid_losses.append(v_l)
            valid_ious.append(viou)

            t_ += 1
            print('Time: {}'.format(time.time() - start))

            # if IOU stops improving for >15 epochs, start a new cycle
            if valid_patience > 15:
                cycle += 1
                checkpath = '../model_weights/fix2_{}_{}_cycle-{}_fold-{}.pth'.format(
                    args.model_name, args.exp_name, cycle, fold)
                print('save model', checkpath)
                torch.save(net.state_dict(), checkpath)
                print('cycle', cycle, 'num_cycles', args.num_cycles)
                if cycle >= args.num_cycles:
                    print('all over')
                    break
                # restart the LR on a cosine rampdown for the new cycle
                print('rampdown')
                for params in optimizer.param_groups:
                    params['lr'] = (
                        args.lr_min + 0.5 * (args.lr_max - args.lr_min) *
                        (1 + np.cos(np.pi * t_ / args.lr_rampdown)))
                    print('Learning rate set to {:.4}'.format(
                        optimizer.param_groups[0]['lr']))
            elif lr_patience > 5:
                # halve the learning rate after 5 stale epochs
                print('Reducing learning rate by {}'.format(0.5))
                for params in optimizer.param_groups:
                    params['lr'] *= 0.5
                lr_patience = 0

            # cycle starts at 1, so this re-enables lovasz every epoch
            # (use_lovasz is already True here — effectively a no-op)
            if cycle >= 1:
                print('switching to lovasz')
                use_lovasz = True
    except KeyboardInterrupt:
        pass

    if args.swa:
        # stochastic weight averaging over the per-cycle checkpoints
        for i in range(cycle):
            if i == 0:
                net.load_state_dict(
                    torch.load('../swa/fix2_cycle_{}.pth'.format(i),
                               map_location=lambda storage, loc: storage))
            else:
                alpha = 1. / (i + 1.)
                prev = ResUNet()
                prev.load_state_dict(
                    torch.load('../swa/fix2_cycle_{}.pth'.format(i),
                               map_location=lambda storage, loc: storage))
                # running average of the weights
                for param_c, param_p in zip(net.parameters(),
                                            prev.parameters()):
                    param_c.data *= (1.0 - alpha)
                    param_c.data += param_p.data.to(device) * alpha
        # recompute batch-norm statistics for the averaged weights
        bn_update(train_loader, net, args.gpu)
        net.eval()
        torch.save(
            net.state_dict(),
            '../model_weights/fix2_swa_{}_{}_fold-{}.pth'.format(
                args.model_name, args.exp_name, fold))

    # dump the per-epoch history to a CSV log
    import pandas as pd
    out_dict = {
        'train_losses': train_losses,
        'valid_losses': valid_losses,
        'valid_ious': valid_ious
    }
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/fix3_resunet_fold-{}.csv'.format(fold),
                   index=False)

    return best_val_iou
def train_network(net, fold=0, model_ckpt=None):
    """Train `net` on one cross-validation fold; return the best val IOU.

    Base variant: SGD at lr_max with a linear warm-up, optional cosine
    annealing, and fixed-length cycles of args.lr_rampdown epochs.
    KeyboardInterrupt stops training early but still falls through to the
    optional SWA averaging and CSV logging below.

    NOTE(review): the locals used after the except block (train_losses,
    best_val_iou, ...) are only bound inside the try — an interrupt before
    they are assigned would raise NameError.
    """
    # train the network, allow for keyboard interrupt
    try:
        # define optimizer
        optimizer = optim.SGD(net.parameters(),
                              lr=args.lr_max,
                              momentum=0.9,
                              weight_decay=args.l2)

        # get the loaders for this fold
        train_loader, valid_loader = get_data_loaders(
            imsize=args.imsize,
            batch_size=args.batch_size,
            num_folds=args.num_folds,
            fold=fold)

        # training flags / bookkeeping
        swa = False
        use_lovasz = False
        freeze_bn = False
        save_imgs = False
        train_losses = []
        valid_losses = []
        valid_ious = []
        valid_patience = 0          # stale epochs since last improvement
        best_val_metric = 1000.0
        best_val_iou = 0.0
        cycle = 0
        swa_n = 0
        t_ = 0                      # epochs into the current cycle

        print('Training ...')
        for e in range(args.epochs):
            print('\n' + 'Epoch {}/{}'.format(e, args.epochs))
            start = time.time()

            # end of an LR period: roll over to a new cycle
            if t_ >= args.lr_rampdown:
                # if we are using swa, save the current weights first
                if args.swa:
                    torch.save(net.state_dict(),
                               '../swa/cycle_{}.pth'.format(cycle))
                # reset the counter
                t_ = 0
                cycle += 1
                torch.save(
                    net.state_dict(),
                    '../model_weights/{}_{}_cycle-{}_fold-{}.pth'.format(
                        args.model_name, args.exp_name, cycle, fold))
                save_imgs = True    # dump validation images this epoch
            else:
                save_imgs = False

            # LR schedule: cosine anneal after warm-up, linear ramp before it
            for params in optimizer.param_groups:
                if args.cos_anneal and e > args.lr_rampup:
                    params['lr'] = (
                        args.lr_min + 0.5 * (args.lr_max - args.lr_min) *
                        (1 + np.cos(np.pi * t_ / args.lr_rampdown)))
                elif e < args.lr_rampup:
                    params['lr'] = args.lr_max * (min(t_ + 1, args.lr_rampup) /
                                                  args.lr_rampup)
                print('Learning rate set to {:.4}'.format(
                    optimizer.param_groups[0]['lr']))

            t_l = train(net, optimizer, train_loader, freeze_bn, use_lovasz)
            v_l, viou = valid(net, optimizer, valid_loader, use_lovasz,
                              save_imgs, fold)

            # save the model on best validation IOU
            if viou > best_val_iou:
                net.eval()
                torch.save(net.state_dict(), model_ckpt)
                best_val_metric = v_l
                best_val_iou = viou
                valid_patience = 0
            else:
                valid_patience += 1

            # stop once the requested number of cycles is done
            if cycle >= args.num_cycles:
                break

            # from the second cycle on: optionally switch to lovasz loss and,
            # when cosine annealing is off, decay the learning rate each epoch
            if cycle >= 1:
                if args.use_lovasz:
                    print('switching to lovasz')
                    use_lovasz = True
                if not args.cos_anneal:
                    print('Reducing learning rate by {}'.format(args.lr_scale))
                    for params in optimizer.param_groups:
                        params['lr'] *= args.lr_scale

            train_losses.append(t_l)
            valid_losses.append(v_l)
            valid_ious.append(viou)

            t_ += 1
            print('Time: {}'.format(time.time() - start))
    except KeyboardInterrupt:
        pass

    if args.swa:
        # stochastic weight averaging over the per-cycle checkpoints
        for i in range(cycle):
            if i == 0:
                net.load_state_dict(
                    torch.load('../swa/cycle_{}.pth'.format(i),
                               map_location=lambda storage, loc: storage))
            else:
                alpha = 1. / (i + 1.)
                prev = ResUNet()
                prev.load_state_dict(
                    torch.load('../swa/cycle_{}.pth'.format(i),
                               map_location=lambda storage, loc: storage))
                # running average of the weights
                for param_c, param_p in zip(net.parameters(),
                                            prev.parameters()):
                    param_c.data *= (1.0 - alpha)
                    param_c.data += param_p.data.to(device) * alpha
        # recompute batch-norm statistics for the averaged weights
        bn_update(train_loader, net, args.gpu)
        net.eval()
        torch.save(
            net.state_dict(),
            '../model_weights/swa_{}_{}_fold-{}.pth'.format(
                args.model_name, args.exp_name, fold))

    # dump the per-epoch history to a CSV log
    import pandas as pd
    out_dict = {
        'train_losses': train_losses,
        'valid_losses': valid_losses,
        'valid_ious': valid_ious
    }
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/resunet_fold-{}.csv'.format(fold), index=False)

    return best_val_iou