def validate(val_loader, model, criterion):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, data in enumerate(val_loader):
        input = data[0]["data"]
        target = data[0]["label"].squeeze().cuda().long()
        val_loader_len = int(val_loader._size / args.batch_size)

        # compute output
        with torch.no_grad():
            output = model(input)
            loss = criterion(output, target)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

        if args.distributed:
            reduced_loss = reduce_tensor(loss.data)
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
        else:
            reduced_loss = loss.data

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # TODO: Change timings to mirror train().
        if args.local_rank == 0 and i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Speed {2:.3f} ({3:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      i, val_loader_len,
                      args.world_size * args.batch_size / batch_time.val,
                      args.world_size * args.batch_size / batch_time.avg,
                      batch_time=batch_time, loss=losses,
                      top1=top1, top5=top5))

    dist_print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(
        top1=top1, top5=top5))

    return [top1.avg, top5.avg]
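# `reduce_tensor` and `to_python_float` are used above but not defined in this
# section. A minimal sketch, assuming the helpers follow the usual pattern in
# NVIDIA's apex ImageNet example, which this code appears to track (averaging
# a metric across distributed ranks):
import torch.distributed as dist

def reduce_tensor(tensor):
    # Sum the tensor across all processes, then divide by world size
    # so every rank ends up holding the average.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= args.world_size
    return rt

def to_python_float(t):
    # Convert a 0-dim tensor (or plain number) to a Python float;
    # note this forces a host<->device sync for CUDA tensors.
    return t.item() if hasattr(t, 'item') else float(t)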
def cp_projects(auto_backup, to_path):
    if is_main_process() and auto_backup:
        with open('./.gitignore', 'r') as fp:
            ign = fp.read()
        ign += '\n.git'
        spec = pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, ign.splitlines())
        all_files = {
            os.path.join(root, name)
            for root, dirs, files in os.walk('./') for name in files
        }
        matches = spec.match_files(all_files)
        matches = set(matches)
        to_cp_files = all_files - matches
        dist_print('Copying projects to ' + to_path + ' for backup')
        t0 = time.time()
        warning_flag = True
        for f in to_cp_files:
            dirs = os.path.join(to_path, 'code', os.path.split(f[2:])[0])
            if not os.path.exists(dirs):
                os.makedirs(dirs)
            os.system('cp %s %s' % (f, os.path.join(to_path, 'code', f[2:])))
            elapsed_time = time.time() - t0
            if elapsed_time > 5 and warning_flag:
                dist_print('If the program appears stuck here, it is probably '
                           'copying large files in this directory. Either do '
                           'not set --auto_backup, or keep the working '
                           'directory clean, i.e. do not place large files '
                           'such as datasets or logged results under it.')
                warning_flag = False
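# Usage sketch (hypothetical call site): back up the source tree into the
# run's work directory, excluding everything matched by .gitignore plus the
# .git directory itself.
cp_projects(auto_backup=True, to_path=work_dir)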
def merge_config():
    args = get_args().parse_args()
    cfg = Config.fromfile(args.config)

    items = [
        'dataset', 'data_root', 'epoch', 'batch_size', 'optimizer',
        'learning_rate', 'weight_decay', 'momentum', 'scheduler', 'steps',
        'gamma', 'warmup', 'warmup_iters', 'use_aux', 'griding_num',
        'backbone', 'sim_loss_w', 'shp_loss_w', 'note', 'log_path',
        'finetune', 'resume', 'test_model', 'test_work_dir', 'num_lanes',
        'save_prefix', 'distributed', 'width', 'height'
    ]
    for item in items:
        if getattr(args, item) is not None:
            dist_print('merge ', item, ' config')
            setattr(cfg, item, getattr(args, item))

    if cfg.val == "True":
        if "val_batch_size" not in cfg:
            cfg.val_batch_size = cfg.batch_size
        if "val_data_root" not in cfg:
            cfg.val_data_root = cfg.data_root
        if "val_dataset" not in cfg:
            cfg.val_dataset = cfg.dataset
    return args, cfg
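# A hypothetical config file consumed by Config.fromfile above (the project
# appears to use mmcv-style Python configs); field names follow the `items`
# list, values are illustrative only:
# --- configs/culane_example.py ---
dataset = 'CULane'
data_root = '/path/to/CULane'   # overridable from the CLI via --data_root
epoch = 50
batch_size = 32
backbone = '18'
griding_num = 200
use_aux = True
num_lanes = 4
val = "True"   # note: merge_config() compares this as the *string* "True"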
def merge_config():
    args = get_args().parse_args()
    cfg = Config.fromfile(args.config)

    items = [
        'dataset', 'data_root', 'epoch', 'batch_size', 'optimizer',
        'learning_rate', 'weight_decay', 'momentum', 'scheduler', 'steps',
        'gamma', 'warmup', 'warmup_iters', 'use_aux', 'griding_num',
        'backbone', 'sim_loss_w', 'shp_loss_w', 'note', 'log_path',
        'finetune', 'resume', 'test_model', 'test_work_dir'
    ]
    for item in items:
        if getattr(args, item) is not None:
            dist_print('merge ', item, ' config')
            setattr(cfg, item, getattr(args, item))
    return args, cfg
def __init__(self, batch_size, num_threads, device_id, data_dir, crop,
             shard_id, num_shards, dali_cpu=False):
    super(HybridTrainPipe, self).__init__(batch_size, num_threads, device_id,
                                          seed=12 + device_id)
    # use the constructor's shard arguments rather than reaching for globals
    self.input = ops.FileReader(file_root=data_dir,
                                shard_id=shard_id,
                                num_shards=num_shards,
                                random_shuffle=True,
                                pad_last_batch=True)
    # let the user decide which pipeline works best for the ResNet variant they run
    dali_device = 'cpu' if dali_cpu else 'gpu'
    decoder_device = 'cpu' if dali_cpu else 'mixed'
    # This padding sets the size of the internal nvJPEG buffers so that all
    # images from full-sized ImageNet can be handled without additional
    # reallocations.
    device_memory_padding = 211025920 if decoder_device == 'mixed' else 0
    host_memory_padding = 140544512 if decoder_device == 'mixed' else 0
    self.decode = ops.ImageDecoderRandomCrop(
        device=decoder_device,
        output_type=types.RGB,
        device_memory_padding=device_memory_padding,
        host_memory_padding=host_memory_padding,
        random_aspect_ratio=[0.8, 1.25],
        random_area=[0.1, 1.0],
        num_attempts=100)
    self.res = ops.Resize(device=dali_device,
                          resize_x=crop,
                          resize_y=crop,
                          interp_type=types.INTERP_TRIANGULAR)
    self.cmnp = ops.CropMirrorNormalize(
        device="gpu",
        dtype=types.FLOAT,
        output_layout=types.NCHW,
        crop=(crop, crop),
        mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
        std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
    self.coin = ops.CoinFlip(probability=0.5)
    dist_print('DALI "{0}" variant'.format(dali_device))
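# `HybridValPipe` is constructed in the main script below but not defined in
# this section. A minimal sketch assuming it mirrors the train pipeline with a
# deterministic resize-shorter-then-center-crop eval path (class body and op
# choices are assumptions; `Pipeline`, `ops`, and `types` come from the
# nvidia.dali imports the file already relies on):
class HybridValPipe(Pipeline):
    def __init__(self, batch_size, num_threads, device_id, data_dir, crop,
                 size, shard_id, num_shards, dali_cpu=False):
        super(HybridValPipe, self).__init__(batch_size, num_threads,
                                            device_id, seed=12 + device_id)
        self.input = ops.FileReader(file_root=data_dir,
                                    shard_id=shard_id,
                                    num_shards=num_shards,
                                    random_shuffle=False,
                                    pad_last_batch=True)
        self.decode = ops.ImageDecoder(device='cpu' if dali_cpu else 'mixed',
                                       output_type=types.RGB)
        self.res = ops.Resize(device='cpu' if dali_cpu else 'gpu',
                              resize_shorter=size,
                              interp_type=types.INTERP_TRIANGULAR)
        self.cmnp = ops.CropMirrorNormalize(
            device="gpu",
            dtype=types.FLOAT,
            output_layout=types.NCHW,
            crop=(crop, crop),
            mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
            std=[0.229 * 255, 0.224 * 255, 0.225 * 255])

    def define_graph(self):
        # eval path: decode -> resize shorter side -> center crop + normalize
        self.jpegs, self.labels = self.input(name="Reader")
        images = self.decode(self.jpegs)
        images = self.res(images)
        output = self.cmnp(images.gpu())
        return [output, self.labels]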
def eval_lane(net, dataset, data_root, work_dir, griding_num, use_aux,
              distributed):
    net.eval()
    if dataset == 'CULane':
        run_test_culane(net, data_root, 'culane_eval_tmp', work_dir,
                        griding_num, use_aux, distributed)
        synchronize()  # wait for all results
        if is_main_process():
            res = call_culane_eval(data_root, 'culane_eval_tmp', work_dir)
            TP, FP, FN = 0, 0, 0
            for k, v in res.items():
                val = float(v['Fmeasure']) if 'nan' not in v['Fmeasure'] else 0
                val_tp, val_fp, val_fn = int(v['tp']), int(v['fp']), int(v['fn'])
                TP += val_tp
                FP += val_fp
                FN += val_fn
                dist_print(k, val)
            P = TP * 1.0 / (TP + FP)
            R = TP * 1.0 / (TP + FN)
            F = 2 * P * R / (P + R)
            dist_print(F)
        synchronize()
    elif dataset == 'Tusimple':
        exp_name = 'tusimple_eval_tmp'
        run_test(net, dataset, data_root, work_dir, exp_name, griding_num,
                 use_aux, distributed)
        synchronize()  # wait for all results
        if is_main_process():
            combine_tusimple_test(work_dir, exp_name)
            res = LaneEval.bench_one_submit(
                os.path.join(work_dir, exp_name + '.txt'),
                os.path.join(data_root, 'test_label.json'))
            res = json.loads(res)
            for r in res:
                dist_print(r['name'], r['value'])
        synchronize()
    elif dataset == 'Neolix':
        exp_name = 'neolix_eval_tmp'
        run_test(net, dataset, data_root, work_dir, exp_name, griding_num,
                 use_aux, distributed)
        synchronize()  # wait for all results
        if is_main_process():
            combine_neolix_test(work_dir, exp_name)
            res = LaneEval.bench_one_submit(
                os.path.join(work_dir, exp_name + '.txt'),
                os.path.join(data_root, 'test_label.json'))
            res = json.loads(res)
            for r in res:
                dist_print(r['name'], r['value'])
        synchronize()
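# The precision/recall/F-measure aggregation above divides by TP+FP and TP+FN
# without a guard. A small sketch of the same arithmetic that tolerates empty
# result sets (the helper name is illustrative, not from the project):
def f_measure(TP, FP, FN):
    # F1 = 2PR / (P + R), defined as 0 when there are no detections or labels.
    P = TP / (TP + FP) if TP + FP > 0 else 0.0
    R = TP / (TP + FN) if TP + FN > 0 else 0.0
    return 2 * P * R / (P + R) if P + R > 0 else 0.0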
def eval_lane(net, dataset, data_root, work_dir, distributed, cfg):
    net.eval()
    run_test(net, data_root, 'culane_eval_tmp', work_dir, distributed, cfg)
    synchronize()  # wait for all results
    if is_main_process():
        res = call_culane_eval(data_root, 'culane_eval_tmp', work_dir)
        TP, FP, FN = 0, 0, 0
        for k, v in res.items():
            val = float(v['Fmeasure']) if 'nan' not in v['Fmeasure'] else 0
            val_tp, val_fp, val_fn = int(v['tp']), int(v['fp']), int(v['fn'])
            TP += val_tp
            FP += val_fp
            FN += val_fn
            dist_print(k, val)
        P = TP * 1.0 / (TP + FP)
        R = TP * 1.0 / (TP + FN)
        F = 2 * P * R / (P + R)
        dist_print(F)
    synchronize()
def resume():
    # declared global so the restored best accuracy survives this scope;
    # without it, the assignment below would only bind a local
    global best_prec1
    if os.path.isfile(args.resume):
        dist_print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(
            args.resume,
            map_location=lambda storage, loc: storage.cuda(args.gpu))
        args.start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        dist_print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
    else:
        dist_print("=> no checkpoint found at '{}'".format(args.resume))
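# `save_checkpoint` (called from the training loop below) is not defined in
# this section; a minimal sketch of the conventional pattern it likely
# follows (the file names are assumptions):
import shutil

def save_checkpoint(state, is_best, work_dir='.'):
    # Always write the latest checkpoint; copy it aside when it is the best
    # seen so far.
    filename = os.path.join(work_dir, 'checkpoint.pth.tar')
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename,
                        os.path.join(work_dir, 'model_best.pth.tar'))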
import scipy.special
import tqdm
import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from data.dataset import LaneTestDataset
from sklearn.linear_model import RANSACRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True

    args, cfg = merge_config()
    dist_print('start testing...')

    net = E2ENet(Channels=96,
                 nums_lane=4,
                 culomn_channels=cfg.griding_num,
                 row_channels=cfg.row_num,
                 initialed=True).cuda()

    state_dict = torch.load(cfg.test_model, map_location='cpu')['model']
    compatible_state_dict = {}
    for k, v in state_dict.items():
        # strip the 'module.' prefix added by DistributedDataParallel
        if 'module.' in k:
            compatible_state_dict[k[7:]] = v
        else:
            compatible_state_dict[k] = v
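# The snippet ends after remapping the keys; the usual next steps (not shown
# in the original) load the remapped weights and switch to inference mode:
net.load_state_dict(compatible_state_dict, strict=False)
net.eval()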
def train(train_loader, model, criterion, optimizer, epoch, logger, scheduler):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    for i, data in enumerate(train_loader):
        input = data[0]["data"]
        target = data[0]["label"].squeeze().cuda().long()
        train_loader_len = int(math.ceil(train_loader._size / args.batch_size))

        if args.prof >= 0 and i == args.prof:
            dist_print("Profiling begun at iteration {}".format(i))
            torch.cuda.cudart().cudaProfilerStart()

        if args.prof >= 0:
            torch.cuda.nvtx.range_push("Body of iteration {}".format(i))

        scheduler.step()

        # compute output
        if args.prof >= 0:
            torch.cuda.nvtx.range_push("forward")
        output = model(input)
        if args.prof >= 0:
            torch.cuda.nvtx.range_pop()
        loss = criterion(output, target)

        # compute gradient and do SGD step
        optimizer.zero_grad()

        if args.prof >= 0:
            torch.cuda.nvtx.range_push("backward")
        if args.opt_level is not None:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if args.prof >= 0:
            torch.cuda.nvtx.range_pop()

        if args.prof >= 0:
            torch.cuda.nvtx.range_push("optimizer.step()")
        optimizer.step()
        if args.prof >= 0:
            torch.cuda.nvtx.range_pop()

        if i % args.print_freq == 0:
            # Every print_freq iterations, check the loss, accuracy, and speed.
            # For best performance, it doesn't make sense to print these
            # metrics every iteration, since they incur an allreduce and some
            # host<->device syncs.

            # Measure accuracy
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

            # Average loss and accuracy across processes for logging
            if args.distributed:
                reduced_loss = reduce_tensor(loss.data)
                prec1 = reduce_tensor(prec1)
                prec5 = reduce_tensor(prec5)
            else:
                reduced_loss = loss.data

            # to_python_float incurs a host<->device sync
            losses.update(to_python_float(reduced_loss), input.size(0))
            top1.update(to_python_float(prec1), input.size(0))
            top5.update(to_python_float(prec5), input.size(0))

            torch.cuda.synchronize()
            batch_time.update((time.time() - end) / args.print_freq)
            end = time.time()

            if args.local_rank == 0:
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Speed {3:.3f} ({4:.3f})\t'
                      'Loss {loss.val:.5f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch, i, train_loader_len,
                          args.world_size * args.batch_size / batch_time.val,
                          args.world_size * args.batch_size / batch_time.avg,
                          batch_time=batch_time, loss=losses,
                          top1=top1, top5=top5))
                logger.add_scalar('Train/loss', losses.val,
                                  global_step=epoch * train_loader_len + i)
                logger.add_scalar('Train/top1', top1.val,
                                  global_step=epoch * train_loader_len + i)
                logger.add_scalar('Train/top5', top5.val,
                                  global_step=epoch * train_loader_len + i)
                logger.add_scalar('Meta/lr', optimizer.param_groups[0]['lr'],
                                  global_step=epoch * train_loader_len + i)

        # Pop range "Body of iteration {}".format(i)
        if args.prof >= 0:
            torch.cuda.nvtx.range_pop()

        if args.prof >= 0 and i == args.prof + 10:
            print("Profiling ended at iteration {}".format(i))
            torch.cuda.cudart().cudaProfilerStop()
            quit()

    return batch_time.avg
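# `AverageMeter` and `accuracy` are used throughout but not defined in this
# section; a sketch assuming the standard PyTorch ImageNet-example versions:
class AverageMeter(object):
    """Tracks the most recent value and a running average."""
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Top-k precision: percent of samples whose label is among the k highest logits."""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res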
def main():
    time_stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    global best_prec1, args
    best_prec1 = 0

    args = parse()
    if not len(args.data):
        raise Exception("error: No data set provided")

    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    # make apex optional
    if args.opt_level is not None or args.sync_bn:
        try:
            global DDP, amp, optimizers, parallel
            from apex.parallel import DistributedDataParallel as DDP
            from apex import amp, optimizers, parallel
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to run this example.")

    if args.opt_level is None and args.distributed:
        from torch.nn.parallel import DistributedDataParallel as DDP

    dist_print("opt_level = {}".format(args.opt_level))
    dist_print("keep_batchnorm_fp32 = {}".format(args.keep_batchnorm_fp32),
               type(args.keep_batchnorm_fp32))
    dist_print("loss_scale = {}".format(args.loss_scale),
               type(args.loss_scale))
    dist_print("\nCUDNN VERSION: {}\n".format(torch.backends.cudnn.version()))

    torch.backends.cudnn.benchmark = True
    best_prec1 = 0
    if args.deterministic:
        # cudnn.benchmark = False
        # cudnn.deterministic = True
        # torch.manual_seed(args.local_rank)
        torch.set_printoptions(precision=10)
        setup_seed(0)

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size
    assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled."

    args.work_dir = os.path.join(args.work_dir,
                                 time_stamp + args.arch + args.note)
    if not args.evaluate:
        if args.local_rank == 0:
            os.makedirs(args.work_dir)
        logger = DistSummaryWriter(args.work_dir)

    # create model
    if args.pretrained:
        dist_print("=> using pre-trained model '{}'".format(args.arch))
        if args.arch == 'fcanet34':
            model = fcanet34(pretrained=True)
        elif args.arch == 'fcanet50':
            model = fcanet50(pretrained=True)
        elif args.arch == 'fcanet101':
            model = fcanet101(pretrained=True)
        elif args.arch == 'fcanet152':
            model = fcanet152(pretrained=True)
        else:
            model = models.__dict__[args.arch](pretrained=True)
    else:
        dist_print("=> creating model '{}'".format(args.arch))
        if args.arch == 'fcanet34':
            model = fcanet34()
        elif args.arch == 'fcanet50':
            model = fcanet50()
        elif args.arch == 'fcanet101':
            model = fcanet101()
        elif args.arch == 'fcanet152':
            model = fcanet152()
        else:
            model = models.__dict__[args.arch]()

    if args.sync_bn:
        dist_print("using apex synced BN")
        model = parallel.convert_syncbn_model(model)

    if hasattr(torch, 'channels_last') and hasattr(torch, 'contiguous_format'):
        if args.channels_last:
            memory_format = torch.channels_last
        else:
            memory_format = torch.contiguous_format
        model = model.cuda().to(memory_format=memory_format)
    else:
        model = model.cuda()

    # Scale learning rate based on global batch size
    args.lr = args.lr * float(args.batch_size * args.world_size) / 256.
    optimizer = torch.optim.SGD(
        filter(lambda p: p.requires_grad, model.parameters()),
        args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    # Initialize Amp. Amp accepts either values or strings for the optional
    # override arguments, for convenient interoperation with argparse.
    if args.opt_level is not None:
        model, optimizer = amp.initialize(
            model, optimizer,
            opt_level=args.opt_level,
            keep_batchnorm_fp32=args.keep_batchnorm_fp32,
            loss_scale=args.loss_scale)

    # For distributed training, wrap the model with
    # apex.parallel.DistributedDataParallel. This must be done AFTER the call
    # to amp.initialize. If model = DDP(model) is called before
    # model, ... = amp.initialize(model, ...), the call to amp.initialize may
    # alter the types of model's parameters in a way that disrupts or destroys
    # DDP's allreduce hooks.
    if args.distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps
        # communication with computation in the backward pass.
        # model = DDP(model)
        # delay_allreduce delays all communication to the end of the backward pass.
        if args.opt_level is not None:
            model = DDP(model, delay_allreduce=True)
        else:
            model = DDP(model,
                        device_ids=[args.local_rank],
                        output_device=args.local_rank)

    # Optionally resume from a checkpoint
    if args.resume:
        # Use a local scope to avoid dangling references
        def resume():
            # declared global so the restored best accuracy survives this
            # scope; a plain assignment would only bind a local
            global best_prec1
            if os.path.isfile(args.resume):
                dist_print("=> loading checkpoint '{}'".format(args.resume))
                checkpoint = torch.load(
                    args.resume,
                    map_location=lambda storage, loc: storage.cuda(args.gpu))
                args.start_epoch = checkpoint['epoch']
                best_prec1 = checkpoint['best_prec1']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                dist_print("=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint['epoch']))
            else:
                dist_print("=> no checkpoint found at '{}'".format(args.resume))

        resume()

    if args.evaluate:
        assert args.evaluate_model is not None
        dist_print("=> loading checkpoint '{}' for eval".format(
            args.evaluate_model))
        checkpoint = torch.load(
            args.evaluate_model,
            map_location=lambda storage, loc: storage.cuda(args.gpu))
        if 'state_dict' in checkpoint.keys():
            model.load_state_dict(checkpoint['state_dict'])
        else:
            state_dict_with_module = {}
            for k, v in checkpoint.items():
                state_dict_with_module['module.' + k] = v
            model.load_state_dict(state_dict_with_module)

    # Data loading code
    if len(args.data) == 1:
        traindir = os.path.join(args.data[0], 'train')
        valdir = os.path.join(args.data[0], 'val')
    else:
        traindir = args.data[0]
        valdir = args.data[1]

    if args.arch == "inception_v3":
        raise RuntimeError(
            "Currently, inception_v3 is not supported by this example.")
        # crop_size = 299
        # val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    pipe = HybridTrainPipe(batch_size=args.batch_size,
                           num_threads=args.workers,
                           device_id=args.local_rank,
                           data_dir=traindir,
                           crop=crop_size,
                           dali_cpu=args.dali_cpu,
                           shard_id=args.local_rank,
                           num_shards=args.world_size)
    pipe.build()
    train_loader = DALIClassificationIterator(pipe,
                                              reader_name="Reader",
                                              fill_last_batch=False)

    pipe = HybridValPipe(batch_size=args.batch_size,
                         num_threads=args.workers,
                         device_id=args.local_rank,
                         data_dir=valdir,
                         crop=crop_size,
                         size=val_size,
                         shard_id=args.local_rank,
                         num_shards=args.world_size)
    pipe.build()
    val_loader = DALIClassificationIterator(pipe,
                                            reader_name="Reader",
                                            fill_last_batch=False)

    # criterion = nn.CrossEntropyLoss().cuda()
    criterion = CrossEntropyLabelSmooth().cuda()

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    len_epoch = int(math.ceil(train_loader._size / args.batch_size))
    T_max = 95 * len_epoch
    warmup_iters = 5 * len_epoch
    scheduler = CosineAnnealingLR(optimizer,
                                  T_max,
                                  warmup='linear',
                                  warmup_iters=warmup_iters)

    total_time = AverageMeter()
    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        avg_train_time = train(train_loader, model, criterion, optimizer,
                               epoch, logger, scheduler)
        total_time.update(avg_train_time)
        torch.cuda.empty_cache()

        # evaluate on validation set
        [prec1, prec5] = validate(val_loader, model, criterion)
        logger.add_scalar('Val/prec1', prec1, global_step=epoch)
        logger.add_scalar('Val/prec5', prec5, global_step=epoch)

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, work_dir=args.work_dir)
            if epoch == args.epochs - 1:
                dist_print('##Best Top-1 {0}\n'
                           '##Perf {1}'.format(
                               best_prec1,
                               args.total_batch_size / total_time.avg))
                with open(os.path.join(args.work_dir, 'res.txt'), 'w') as f:
                    f.write('arch: {0} \n best_prec1 {1}'.format(
                        args.arch + args.note, best_prec1))

        train_loader.reset()
        val_loader.reset()
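# torch.optim.lr_scheduler.CosineAnnealingLR accepts no `warmup` arguments, so
# the scheduler above is presumably a project-local class. A minimal sketch of
# a per-iteration cosine schedule with linear warmup matching the call
# signature (an assumption, not the project's actual implementation):
import math

class CosineAnnealingLR:
    def __init__(self, optimizer, T_max, eta_min=0.0,
                 warmup=None, warmup_iters=0):
        self.optimizer = optimizer
        self.T_max = T_max
        self.eta_min = eta_min
        self.warmup = warmup
        self.warmup_iters = warmup_iters
        self.base_lrs = [g['lr'] for g in optimizer.param_groups]
        self._step = 0

    def step(self, step=None):
        # Linear warmup for the first warmup_iters steps, cosine decay after.
        if step is None:
            step = self._step
        self._step = step + 1
        for g, base_lr in zip(self.optimizer.param_groups, self.base_lrs):
            if self.warmup == 'linear' and step < self.warmup_iters:
                g['lr'] = base_lr * (step + 1) / self.warmup_iters
            else:
                t = step - self.warmup_iters
                T = max(self.T_max - self.warmup_iters, 1)
                g['lr'] = self.eta_min + (base_lr - self.eta_min) * 0.5 * (
                    1 + math.cos(math.pi * min(t, T) / T))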
if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True

    args, cfg = merge_config()
    work_dir = get_work_dir(cfg)

    distributed = False
    if 'WORLD_SIZE' in os.environ:
        distributed = int(os.environ['WORLD_SIZE']) > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
    dist_print(datetime.datetime.now().strftime('[%Y/%m/%d %H:%M:%S]') +
               ' start training...')
    dist_print(cfg)
    assert cfg.backbone in [
        '18', '34', '50', '101', '152', '50next', '101next', '50wide',
        '101wide'
    ]

    train_loader, cls_num_per_lane = get_train_loader(
        cfg.batch_size, cfg.data_root, cfg.griding_num, cfg.dataset,
        cfg.use_aux, distributed, cfg.num_lanes)

    net = parsingNet(pretrained=True,
                     backbone=cfg.backbone,
                     cls_dim=(cfg.griding_num + 1, cls_num_per_lane,
                              cfg.num_lanes),
                     use_aux=cfg.use_aux).cuda()
def train(net, data_loader, loss_dict, optimizer, scheduler, logger, epoch,
          metric_dict, cfg):
    net.train()
    progress_bar = dist_tqdm(data_loader)
    t_data_0 = time.time()
    # Pyten-20201019-FixBug
    reset_metrics(metric_dict)
    total_loss = 0
    for b_idx, data_label in enumerate(progress_bar):
        t_data_1 = time.time()
        global_step = epoch * len(data_loader) + b_idx

        t_net_0 = time.time()
        results = inference(net, data_label, cfg.use_aux)
        loss = calc_loss(loss_dict, results, logger, global_step, "train")
        optimizer.zero_grad()
        loss.backward()
        # Pyten-20210201-ClipGrad
        clip_grad_norm_(net.parameters(), max_norm=10.0)
        optimizer.step()
        total_loss = total_loss + loss.detach()
        scheduler.step(global_step)
        t_net_1 = time.time()

        results = resolve_val_data(results, cfg.use_aux)
        update_metrics(metric_dict, results)
        if global_step % 20 == 0:
            # Pyten-20210201-TransformImg
            img = img_detrans(data_label[0][0])
            logger.add_image("train_image/org", img, global_step=global_step)
            logger.add_image("train_image/std", data_label[0][0],
                             global_step=global_step)
            if cfg.use_aux:
                seg_color_out = decode_seg_color_map(results["seg_out"][0])
                seg_color_label = decode_seg_color_map(data_label[2][0])
                logger.add_image("train_seg/predict", seg_color_out,
                                 global_step=global_step, dataformats='HWC')
                logger.add_image("train_seg/label", seg_color_label,
                                 global_step=global_step, dataformats='HWC')
            cls_color_out = decode_cls_color_map(data_label[0][0],
                                                 results["cls_out"][0], cfg)
            cls_color_label = decode_cls_color_map(data_label[0][0],
                                                   data_label[1][0], cfg)
            logger.add_image("train_cls/predict", cls_color_out,
                             global_step=global_step, dataformats='HWC')
            logger.add_image("train_cls/label", cls_color_label,
                             global_step=global_step, dataformats='HWC')
            for me_name, me_op in zip(metric_dict['name'],
                                      metric_dict['op']):
                logger.add_scalar('train_metric/' + me_name, me_op.get(),
                                  global_step=global_step)
        logger.add_scalar('train/meta/lr', optimizer.param_groups[0]['lr'],
                          global_step=global_step)

        if hasattr(progress_bar, 'set_postfix'):
            kwargs = {
                me_name: '%.4f' % me_op.get()
                for me_name, me_op in zip(metric_dict['name'],
                                          metric_dict['op'])
            }
            progress_bar.set_postfix(
                loss='%.3f' % float(loss),
                avg_loss='%.3f' % float(total_loss / (b_idx + 1)),
                # data_time='%.3f' % float(t_data_1 - t_data_0),
                net_time='%.3f' % float(t_net_1 - t_net_0),
                **kwargs)
        t_data_0 = time.time()
    dist_print("avg_loss_over_epoch", total_loss / len(data_loader))
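# `img_detrans` (used above to undo the input normalization before logging
# the image) is not defined in this section. A plausible sketch, assuming the
# inputs were normalized with the usual ImageNet mean/std (the helper name is
# from the code; its body here is an assumption):
import torchvision.transforms as transforms

# Inverse of Normalize(mean=m, std=s): y -> y * s + m, expressed as another
# Normalize with mean=-m/s and std=1/s.
img_detrans = transforms.Normalize(
    mean=[-0.485 / 0.229, -0.456 / 0.224, -0.406 / 0.225],
    std=[1 / 0.229, 1 / 0.224, 1 / 0.225])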
def val(net, data_loader, loss_dict, scheduler, logger, epoch, metric_dict,
        cfg):
    net.eval()
    progress_bar = dist_tqdm(data_loader)
    t_data_0 = time.time()
    reset_metrics(metric_dict)
    total_loss = 0
    with torch.no_grad():
        for b_idx, data_label in enumerate(progress_bar):
            t_data_1 = time.time()
            # reset_metrics(metric_dict)
            global_step = epoch * len(data_loader) + b_idx

            t_net_0 = time.time()
            # pdb.set_trace()
            results = inference(net, data_label, cfg.use_aux)
            loss = calc_loss(loss_dict, results, logger, global_step, "val")
            total_loss = total_loss + loss.detach()
            t_net_1 = time.time()

            results = resolve_val_data(results, cfg.use_aux)
            update_metrics(metric_dict, results)
            if global_step % 20 == 0:
                # Pyten-20210201-TransformImg
                img = img_detrans(data_label[0][0])
                logger.add_image("val_image/org", img,
                                 global_step=global_step)
                logger.add_image("val_image/std", data_label[0][0],
                                 global_step=global_step)
                if cfg.use_aux:
                    # import pdb; pdb.set_trace()
                    seg_color_out = decode_seg_color_map(results["seg_out"][0])
                    seg_color_label = decode_seg_color_map(data_label[2][0])
                    logger.add_image("val_seg/predict", seg_color_out,
                                     global_step=global_step,
                                     dataformats='HWC')
                    logger.add_image("val_seg/label", seg_color_label,
                                     global_step=global_step,
                                     dataformats='HWC')
                cls_color_out = decode_cls_color_map(data_label[0][0],
                                                     results["cls_out"][0],
                                                     cfg)
                cls_color_label = decode_cls_color_map(data_label[0][0],
                                                       data_label[1][0], cfg)
                logger.add_image("val_cls/predict", cls_color_out,
                                 global_step=global_step, dataformats='HWC')
                logger.add_image("val_cls/label", cls_color_label,
                                 global_step=global_step, dataformats='HWC')

            if hasattr(progress_bar, 'set_postfix'):
                kwargs = {
                    me_name: '%.4f' % me_op.get()
                    for me_name, me_op in zip(metric_dict['name'],
                                              metric_dict['op'])
                }
                progress_bar.set_postfix(
                    loss='%.3f' % float(loss),
                    avg_loss='%.3f' % float(total_loss / (b_idx + 1)),
                    # data_time='%.3f' % float(t_data_1 - t_data_0),
                    net_time='%.3f' % float(t_net_1 - t_net_0),
                    **kwargs)
            t_data_0 = time.time()

    dist_print("avg_loss_over_epoch", total_loss / len(data_loader))
    for me_name, me_op in zip(metric_dict['name'], metric_dict['op']):
        logger.add_scalar('val_metric/' + me_name, me_op.get(),
                          global_step=epoch)

    # Pyten-20201019-SaveBestMetric
    update_best_metric = True
    for me_name, me_op in zip(metric_dict['name'], metric_dict['op']):
        if me_name == "iou":
            continue
        cur_metric = me_op.get()
        if cur_metric < metric_dict["best_metric"][me_name]:
            update_best_metric = False
    if update_best_metric:
        for me_name, me_op in zip(metric_dict['name'], metric_dict['op']):
            metric_dict["best_metric"][me_name] = me_op.get()
        cfg.best_epoch = epoch
        dist_print("best metric updated! (epoch %d)" % epoch)
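# The `metric_dict` consumed by train()/val() is not defined in this section;
# a hedged sketch of the structure the code above implies: parallel
# 'name'/'op' lists plus a 'best_metric' map keyed by metric name. The metric
# classes here are hypothetical stand-ins for objects exposing .get() (and,
# per reset_metrics/update_metrics, presumably .reset()/.update()):
metric_dict = {
    'name': ['top1', 'iou'],
    'op': [Top1Metric(), IoUMetric()],  # hypothetical metric objects
    'best_metric': {'top1': 0.0, 'iou': 0.0},
}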
if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True

    args, cfg = merge_config()
    work_dir = get_work_dir(cfg)

    distributed = cfg.distributed if "distributed" in cfg else False
    if 'WORLD_SIZE' in os.environ:
        distributed = int(os.environ['WORLD_SIZE']) > 1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
    dist_print(datetime.datetime.now().strftime('[%Y/%m/%d %H:%M:%S]') +
               ' start training...')
    dist_print(cfg)
    assert cfg.backbone in [
        '18', '34', '50', '101', '152', '50next', '101next', '50wide',
        '101wide'
    ]

    train_loader, cls_num_per_lane = get_train_loader(
        cfg.batch_size, cfg.data_root, cfg.griding_num, cfg.dataset,
        cfg.use_aux, distributed, cfg.num_lanes, cfg)
    if cfg.val:
        val_loader = get_val_loader(cfg.val_batch_size, cfg.val_data_root,
                                    cfg.griding_num, cfg.val_dataset,
                                    cfg.use_aux, distributed, cfg.num_lanes,
                                    cfg)