def validate(val_loader, net, criterion, optim, curr_epoch, writer):
    """
    Runs the validation loop after each training epoch
    val_loader: Data loader for validation
    net: the network
    criterion: loss fn
    optim: optimizer
    curr_epoch: current epoch
    writer: tensorboard writer
    return: val_avg for step function if required
    """
    net.eval()
    val_loss = AverageMeter()
    iou_acc = 0
    dump_images = []

    for val_idx, data in enumerate(val_loader):
        inputs, gt_image, img_names = data
        assert len(inputs.size()) == 4 and len(gt_image.size()) == 3
        assert inputs.size()[2:] == gt_image.size()[1:]
        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gt_cuda = inputs.cuda(), gt_image.cuda()

        with torch.no_grad():
            output = net(inputs)  # output = (1, 19, 713, 713)

        assert output.size()[2:] == gt_image.size()[1:]
        assert output.size()[1] == args.dataset_cls.num_classes

        val_loss.update(criterion(output, gt_cuda).item(), batch_pixel_size)
        predictions = output.data.max(1)[1].cpu()

        # Logging
        if val_idx % 20 == 0:
            if args.local_rank == 0:
                logging.info("validating: %d / %d", val_idx + 1, len(val_loader))
        if val_idx > 10 and args.test_mode:
            break

        # Image Dumps
        if val_idx < 10:
            dump_images.append([gt_image, predictions, img_names])

        iou_acc += fast_hist(predictions.numpy().flatten(),
                             gt_image.numpy().flatten(),
                             args.dataset_cls.num_classes)
        del output, val_idx, data

    if args.apex:
        iou_acc_tensor = torch.cuda.FloatTensor(iou_acc)
        torch.distributed.all_reduce(iou_acc_tensor, op=torch.distributed.ReduceOp.SUM)
        iou_acc = iou_acc_tensor.cpu().numpy()

    if args.local_rank == 0:
        evaluate_eval(args, net, optim, val_loss, iou_acc, dump_images,
                      writer, curr_epoch, args.dataset_cls)

    return val_loss.avg
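# The validation loops in this file lean on two small helpers, AverageMeter and
# fast_hist, whose implementations are not shown here. Below is a minimal sketch
# of what such helpers typically look like in segmentation codebases; the exact
# versions used by this repository may differ.
import numpy as np

class AverageMeter(object):
    """Tracks a running weighted average, e.g. of the per-pixel loss."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # 'val' is the mean over 'n' samples (here: pixels), so weight it by n.
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def fast_hist(pred, gt, num_classes):
    """Accumulates a num_classes x num_classes confusion matrix from flat label arrays."""
    # Ignore ground-truth labels outside [0, num_classes), e.g. a 255 'ignore' id.
    mask = (gt >= 0) & (gt < num_classes)
    hist = np.bincount(num_classes * gt[mask].astype(int) + pred[mask],
                       minlength=num_classes ** 2).reshape(num_classes, num_classes)
    return hist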
def validate(val_loader, net, criterion, optim, scheduler, curr_epoch, curr_iter):
    """
    Runs the validation loop after each training epoch
    val_loader: Data loader for validation
    net: the network
    criterion: loss fn
    optim: optimizer
    scheduler: learning rate scheduler
    curr_epoch: current epoch
    curr_iter: current iteration
    return: val_avg for step function if required
    """
    net.eval()
    val_loss = AverageMeter()
    iou_acc = 0
    error_acc = 0

    for val_idx, data in enumerate(val_loader):
        inputs, gts = data
        assert len(inputs.size()) == 4 and len(gts.size()) == 3
        assert inputs.size()[2:] == gts.size()[1:]
        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gts = inputs.cuda(), gts.cuda()

        with torch.no_grad():
            output = net(inputs)
        del inputs

        assert output.size()[2:] == gts.size()[1:]
        assert output.size()[1] == args.num_classes

        val_loss.update(criterion(output, gts).item(), batch_pixel_size)
        predictions = output.data.max(1)[1].cpu()

        # Logging
        if val_idx % 20 == 0:
            logging.info("validating: %d / %d", val_idx + 1, len(val_loader))

        iou_acc += fast_hist(predictions.numpy().flatten(),
                             gts.cpu().numpy().flatten(),
                             args.num_classes)
        del gts, output, val_idx, data

    per_cls_iou = evaluate_eval(args, net, optim, scheduler, val_loss, iou_acc,
                                curr_epoch, args.dataset, curr_iter)

    return val_loss.avg, per_cls_iou
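# evaluate_eval is not shown in this file. Assuming iou_acc is the confusion
# matrix accumulated by fast_hist above, the per-class IoU that the second
# validate() returns can be derived along these lines (a sketch, not the
# repository's actual implementation):
import numpy as np

def per_class_iou_from_hist(hist):
    """IoU per class = TP / (TP + FP + FN), read off a confusion matrix."""
    tp = np.diag(hist)
    denom = hist.sum(axis=1) + hist.sum(axis=0) - tp
    # Guard against classes that appear in neither prediction nor ground truth.
    return tp / np.maximum(denom, 1)

# Example usage (hypothetical):
# per_cls_iou = per_class_iou_from_hist(iou_acc)
# mean_iu = per_cls_iou.mean()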
def validate(val_loader, dataset, net, criterion, optim, scheduler, curr_epoch,
             writer, curr_iter, save_pth=True):
    """
    Runs the validation loop after each training epoch
    val_loader: Data loader for validation
    dataset: dataset name (str)
    net: the network
    criterion: loss fn
    optim: optimizer
    scheduler: learning rate scheduler
    curr_epoch: current epoch
    writer: tensorboard writer
    return: val_avg for step function if required
    """
    net.eval()
    val_loss = AverageMeter()
    iou_acc = 0
    error_acc = 0
    dump_images = []

    for val_idx, data in enumerate(val_loader):
        # inputs   = torch.Size([1, 3, 713, 713])
        # gt_image = torch.Size([1, 713, 713])
        inputs, gt_image, img_names, _ = data

        if len(inputs.shape) == 5:
            B, D, C, H, W = inputs.shape
            inputs = inputs.view(-1, C, H, W)
            gt_image = gt_image.view(-1, 1, H, W)

        assert len(inputs.size()) == 4 and len(gt_image.size()) == 3
        assert inputs.size()[2:] == gt_image.size()[1:]
        batch_pixel_size = inputs.size(0) * inputs.size(2) * inputs.size(3)
        inputs, gt_cuda = inputs.cuda(), gt_image.cuda()

        with torch.no_grad():
            if args.use_wtloss:
                output, f_cor_arr = net(inputs, visualize=True)
            else:
                output = net(inputs)
        del inputs

        assert output.size()[2:] == gt_image.size()[1:]
        assert output.size()[1] == datasets.num_classes

        val_loss.update(criterion(output, gt_cuda).item(), batch_pixel_size)
        del gt_cuda

        # Collect data from the different GPUs on a single GPU, since the
        # encoding.parallel.criterionparallel function calculates the
        # distributed loss functions
        predictions = output.data.max(1)[1].cpu()

        # Logging
        if val_idx % 20 == 0:
            if args.local_rank == 0:
                logging.info("validating: %d / %d", val_idx + 1, len(val_loader))
        if val_idx > 10 and args.test_mode:
            break

        # Image Dumps
        if val_idx < 10:
            dump_images.append([gt_image, predictions, img_names])

        iou_acc += fast_hist(predictions.numpy().flatten(),
                             gt_image.numpy().flatten(),
                             datasets.num_classes)
        del output, val_idx, data

    iou_acc_tensor = torch.cuda.FloatTensor(iou_acc)
    torch.distributed.all_reduce(iou_acc_tensor, op=torch.distributed.ReduceOp.SUM)
    iou_acc = iou_acc_tensor.cpu().numpy()

    if args.local_rank == 0:
        evaluate_eval(args, net, optim, scheduler, val_loss, iou_acc, dump_images,
                      writer, curr_epoch, dataset, None, curr_iter, save_pth=save_pth)

    return val_loss.avg
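# The unconditional torch.distributed.all_reduce in the validate() above
# presupposes that a process group has already been initialized and that each
# process has pinned its GPU. A minimal sketch of the usual setup, assuming a
# standard NCCL / env:// launch (e.g. via torch.distributed.launch or torchrun);
# the actual initialization lives elsewhere in the repository:
import torch

torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')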
def validate(val_loader, net, criterion, optimizer, curr_epoch, writer):
    '''
    Runs the validation loop after each training epoch
    val_loader: Data loader for validation
    net: the network
    criterion: loss fn
    optimizer: optimizer
    curr_epoch: current epoch
    writer: tensorboard writer
    return:
    '''
    net.eval()
    val_loss = AverageMeter()
    mf_score = AverageMeter()
    IOU_acc = 0
    dump_images = []
    heatmap_images = []

    for vi, data in enumerate(val_loader):
        input, mask, edge, img_names = data
        assert len(input.size()) == 4 and len(mask.size()) == 3
        assert input.size()[2:] == mask.size()[1:]
        h, w = mask.size()[1:]
        batch_pixel_size = input.size(0) * input.size(2) * input.size(3)
        input, mask_cuda, edge_cuda = input.cuda(), mask.cuda(), edge.cuda()

        with torch.no_grad():
            seg_out, edge_out = net(input)  # output = (1, 19, 713, 713)

        if args.joint_edgeseg_loss:
            loss_dict = criterion((seg_out, edge_out), (mask_cuda, edge_cuda))
            val_loss.update(sum(loss_dict.values()).item(), batch_pixel_size)
        else:
            val_loss.update(criterion(seg_out, mask_cuda).item(), batch_pixel_size)

        # Collect data from the different GPUs on a single GPU, since the
        # encoding.parallel.criterionparallel function calculates the
        # distributed loss functions
        seg_predictions = seg_out.data.max(1)[1].cpu()
        edge_predictions = edge_out.max(1)[0].cpu()

        # Logging
        if vi % 20 == 0:
            if args.local_rank == 0:
                logging.info('validating: %d / %d' % (vi + 1, len(val_loader)))
        if vi > 10 and args.test_mode:
            break
        _edge = edge.max(1)[0]

        # Image Dumps
        if vi < 10:
            dump_images.append([mask, seg_predictions, img_names])
            heatmap_images.append([_edge, edge_predictions, img_names])

        IOU_acc += fast_hist(seg_predictions.numpy().flatten(),
                             mask.numpy().flatten(),
                             args.dataset_cls.num_classes)
        del seg_out, edge_out, vi, data

    if args.local_rank == 0:
        evaluate_eval(args, net, optimizer, val_loss, mf_score, IOU_acc,
                      dump_images, heatmap_images, writer, curr_epoch,
                      args.dataset_cls)

    return val_loss.avg
def main():
    """
    Main Function
    """
    # Set up the Arguments, Tensorboard Writer, Dataloader, Loss Fn, Optimizer
    assert_and_infer_cfg(args)
    writer = prep_experiment(args, parser)

    train_loader, val_loaders, train_obj, extra_val_loaders, covstat_val_loaders = \
        datasets.setup_loaders(args)

    criterion, criterion_val = loss.get_loss(args)
    criterion_aux = loss.get_loss_aux(args)
    net = network.get_net(args, criterion, criterion_aux)

    optim, scheduler = optimizer.get_optimizer(args, net)

    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = network.warp_network_in_dataparallel(net, args.local_rank)
    epoch = 0
    i = 0

    if args.snapshot:
        epoch, mean_iu = optimizer.load_weights(net, optim, scheduler,
                                                args.snapshot, args.restore_optimizer)
        if args.restore_optimizer is True:
            iter_per_epoch = len(train_loader)
            i = iter_per_epoch * epoch
        else:
            epoch = 0

    print("#### iteration", i)
    torch.cuda.empty_cache()

    # Main Loop
    # for epoch in range(args.start_epoch, args.max_epoch):
    while i < args.max_iter:
        # Update EPOCH CTR
        cfg.immutable(False)
        cfg.ITER = i
        cfg.immutable(True)

        i = train(train_loader, net, optim, epoch, writer, scheduler, args.max_iter)
        train_loader.sampler.set_epoch(epoch + 1)

        if (args.dynamic and args.use_isw and epoch % (args.cov_stat_epoch + 1) == args.cov_stat_epoch) \
           or (args.dynamic is False and args.use_isw and epoch == args.cov_stat_epoch):
            net.module.reset_mask_matrix()
            for trial in range(args.trials):
                for dataset, val_loader in covstat_val_loaders.items():  # To get the statistics of covariance
                    validate_for_cov_stat(val_loader, dataset, net, criterion_val, optim,
                                          scheduler, epoch, writer, i, save_pth=False)
                    net.module.set_mask_matrix()

        if args.local_rank == 0:
            print("Saving pth file...")
            evaluate_eval(args, net, optim, scheduler, None, None, [], writer,
                          epoch, "None", None, i, save_pth=True)

        if args.class_uniform_pct:
            if epoch >= args.max_cu_epoch:
                train_obj.build_epoch(cut=True)
                train_loader.sampler.set_num_samples()
            else:
                train_obj.build_epoch()

        epoch += 1

    # Validation after epochs
    if len(val_loaders) == 1:
        # Run validation only one time - To save models
        for dataset, val_loader in val_loaders.items():
            validate(val_loader, dataset, net, criterion_val, optim, scheduler, epoch, writer, i)
    else:
        if args.local_rank == 0:
            print("Saving pth file...")
            evaluate_eval(args, net, optim, scheduler, None, None, [], writer,
                          epoch, "None", None, i, save_pth=True)

    for dataset, val_loader in extra_val_loaders.items():
        print("Extra validating... This won't save pth file")
        validate(val_loader, dataset, net, criterion_val, optim, scheduler, epoch, writer, i, save_pth=False)
def validate(val_loader, net, criterion1, criterion2, optim, curr_epoch, writer):
    """
    Runs the validation loop after each training epoch
    val_loader: Data loader for validation
    net: the network
    criterion1, criterion2: loss fns for the two tasks
    optim: optimizer
    curr_epoch: current epoch
    writer: tensorboard writer
    return: val_avg for step function if required
    """
    net.eval()
    val_loss1 = AverageMeter()
    val_loss2 = AverageMeter()
    iou_acc1 = 0
    iou_acc2 = 0
    dump_images = []

    for val_idx, data in enumerate(val_loader):
        inputs1, gt_image1, img_names1, inputs2, gt_image2, img_names2 = data
        assert len(inputs1.size()) == 4 and len(gt_image1.size()) == 3
        assert inputs1.size()[2:] == gt_image1.size()[1:]
        assert len(inputs2.size()) == 4 and len(gt_image2.size()) == 3
        assert inputs2.size()[2:] == gt_image2.size()[1:]
        batch_pixel_size1 = inputs1.size(0) * inputs1.size(2) * inputs1.size(3)
        batch_pixel_size2 = inputs2.size(0) * inputs2.size(2) * inputs2.size(3)
        inputs1, gt_cuda1 = inputs1.cuda(), gt_image1.cuda()
        inputs2, gt_cuda2 = inputs2.cuda(), gt_image2.cuda()

        with torch.no_grad():
            output1 = net(inputs1, task='semantic')        # output = (1, 19, 713, 713)
            output2 = net(inputs2, task='traversability')  # output = (1, 19, 713, 713)

        assert output1.size()[2:] == gt_image1.size()[1:]
        assert output1.size()[1] == args.dataset_cls.num_classes1
        assert output2.size()[2:] == gt_image2.size()[1:]
        assert output2.size()[1] == args.dataset_cls.num_classes2

        val_loss1.update(criterion1(output1, gt_cuda1).item(), batch_pixel_size1)
        val_loss2.update(criterion2(output2, gt_cuda2).item(), batch_pixel_size2)

        predictions1 = output1.data.max(1)[1].cpu()
        predictions2 = output2.data.max(1)[1].cpu()

        # Logging
        if val_idx % 20 == 0:
            if args.local_rank == 0:
                logging.info("validating: %d / %d", val_idx + 1, len(val_loader))
        if val_idx > 10 and args.test_mode:
            break

        # Image Dumps
        # if val_idx < 30:
        #     dump_images.append([gt_image, predictions1, predictions2, img_names])

        iou_acc1 += fast_hist(predictions1.numpy().flatten(),
                              gt_image1.numpy().flatten(),
                              args.dataset_cls.num_classes1)
        iou_acc2 += fast_hist(predictions2.numpy().flatten(),
                              gt_image2.numpy().flatten(),
                              args.dataset_cls.num_classes2)
        del output1, output2, val_idx, data

    if args.apex:
        iou_acc_tensor1 = torch.cuda.FloatTensor(iou_acc1)
        torch.distributed.all_reduce(iou_acc_tensor1, op=torch.distributed.ReduceOp.SUM)
        iou_acc1 = iou_acc_tensor1.cpu().numpy()
        iou_acc_tensor2 = torch.cuda.FloatTensor(iou_acc2)
        torch.distributed.all_reduce(iou_acc_tensor2, op=torch.distributed.ReduceOp.SUM)
        iou_acc2 = iou_acc_tensor2.cpu().numpy()

    if args.local_rank == 0:
        evaluate_eval(args, net, optim, val_loss1, val_loss2, iou_acc1, iou_acc2,
                      dump_images, writer, curr_epoch, args.dataset_cls)

    return val_loss1.avg