def eval_ssd300_mlperf_coco(args):
    """Evaluate an SSD300 checkpoint on the COCO val2017 split.

    Attributes read from ``args``:
        no_cuda: when true, never move the model to a GPU.
        data: dataset root; ``annotations/instances_val2017.json`` and
            ``val2017`` are resolved relative to it.
        checkpoint: path passed to ``torch.load``; the loaded object must
            hold the weights under the ``"model"`` key.
        device: forwarded to ``Module.cuda`` and to ``coco_eval``.
        threshold: forwarded to ``coco_eval``.

    Returns:
        The value returned by ``coco_eval`` (returned for parity with
        ``eval_ssd_r34_mlperf_coco``; previously it was discarded).
    """
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    val_trans = SSDTransformer(dboxes, (300, 300), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    # label_map is inverted so the evaluator can translate its values back
    # into the keys.
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    ssd300 = SSD300(val_coco.labelnum)
    print("loading model checkpoint", args.checkpoint)
    # Keep every storage where it was deserialized (CPU); the model is moved
    # to the requested device below.
    od = torch.load(args.checkpoint,
                    map_location=lambda storage, loc: storage)
    ssd300.load_state_dict(od["model"])

    if use_cuda:
        ssd300.cuda(args.device)

    # No Loss module is built here: evaluation never used it, so creating it
    # (and copying it to the GPU) was pure overhead.
    return coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                     args.threshold, args.device)
def eval_ssd_r34_mlperf_coco(args):
    """Evaluate an SSD-ResNet34 checkpoint on COCO val2017.

    Attributes read from ``args``:
        no_cuda, data, checkpoint, device, threshold: as for the SSD300
            evaluator (GPU opt-out, dataset root, checkpoint path, CUDA
            device, value forwarded to the evaluator).
        image_size: indexable; elements 0 and 1 give the transform size and
            the whole value is passed to ``dboxes_R34_coco``.
        strides: passed to ``dboxes_R34_coco`` and ``SSD_R34``.
        onnx: ``'export'`` dispatches to ``coco_eval_export``, ``'eval'`` to
            ``coco_eval_onnx``; any other value (including falsy ones) runs
            the plain ``coco_eval``.

    Returns:
        The value returned by the selected evaluation function.
    """
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes_R34_coco(args.image_size, args.strides)
    encoder = Encoder(dboxes)
    val_trans = SSDTransformer(dboxes,
                               (args.image_size[0], args.image_size[1]),
                               val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    print('ssd r34')
    ssd_r34 = SSD_R34(val_coco.labelnum, strides=args.strides)
    print("loading model checkpoint", args.checkpoint)
    # Deserialize onto the CPU; the model is moved to the GPU afterwards.
    od = torch.load(args.checkpoint,
                    map_location=lambda storage, loc: storage)
    ssd_r34.load_state_dict(od["model"])

    if use_cuda:
        ssd_r34.cuda(args.device)

    # The Loss module that used to be created here was never used by any of
    # the evaluation paths, so it is no longer allocated.
    if args.onnx == 'export':
        return coco_eval_export(ssd_r34, val_coco, cocoGt, encoder, inv_map,
                                args.threshold, args.device, use_cuda)
    if args.onnx == 'eval':
        return coco_eval_onnx(ssd_r34, val_coco, cocoGt, encoder, inv_map,
                              args.threshold, args.device, use_cuda)
    return coco_eval(ssd_r34, val_coco, cocoGt, encoder, inv_map,
                     args.threshold, args.device, use_cuda)
def train300_mlperf_coco(exp, args):
    """Run a benchmark-style SSD300 training loop driven by ``exp``.

    ``exp`` supplies the device (``get_device``), a timer (``chrono``) and
    the reporting hooks (``log_batch_loss``, ``show_eta``, ``report``).

    Attributes read from ``args``:
        data: dataset root holding ``annotations/`` plus ``train2017`` and
            ``val2017``.
        batch_size: DataLoader batch size.
        checkpoint: optional path of a checkpoint whose ``"model"`` entry is
            loaded before training.
        iteration: starting value of the iteration counter.
        repeat: number of timed passes over the dataloader.
        number: a pass stops once the batch index exceeds this value.

    Returns:
        Always ``False``.
    """
    from coco import COCO

    device = exp.get_device()
    chrono = exp.chrono()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        # Deserialize onto the CPU: the model still lives there at this point
        # and is moved to `device` right after, so the checkpoint loads even
        # when the device it was saved from does not exist on this host.
        od = torch.load(args.checkpoint, map_location="cpu")
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    ssd300 = ssd300.to(device)
    loss_func = Loss(dboxes).to(device)

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    for epoch in range(args.repeat):
        with chrono.time('train') as t:
            for nbatch, (img, img_size, bbox,
                         label) in enumerate(train_dataloader):
                if nbatch > args.number:
                    break

                img = Variable(img.to(device), requires_grad=True)
                ploc, plabel = ssd300(img)

                trans_bbox = bbox.transpose(1, 2).contiguous()
                trans_bbox = trans_bbox.to(device)
                label = label.to(device)
                gloc = Variable(trans_bbox, requires_grad=False)
                glabel = Variable(label, requires_grad=False)

                loss = loss_func(ploc, plabel, gloc, glabel)

                # Infinite losses are left out of the running average.
                if not np.isinf(loss.item()):
                    avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

                exp.log_batch_loss(loss)

                optim.zero_grad()
                loss.backward()
                optim.step()

                iter_num += 1

        exp.show_eta(epoch, t)

    exp.report()
    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO, optionally with DistributedDataParallel.

    Attributes read from ``args``:
        world_size: distributed training is enabled when this is > 1
            (the result is stored back on ``args.distributed``).
        dist_backend, dist_url: forwarded to ``init_process_group``.
        no_cuda, data, batch_size, checkpoint, iteration, epochs: device
            opt-out, dataset root, batch size, optional checkpoint to resume
            from, starting iteration counter and number of epochs.
        evaluation: collection of iteration numbers at which the model is
            saved (unless ``no_save``) and evaluated with ``coco_eval``.
        threshold: forwarded to ``coco_eval``.

    Returns:
        ``None``. Training stops early as soon as ``coco_eval`` returns a
        truthy value.
    """
    args.distributed = args.world_size > 1

    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    train_trans = SSDTransformer(dboxes, (300, 300), val=False)
    val_trans = SSDTransformer(dboxes, (300, 300), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    if args.distributed:
        # The process group has to exist before DistributedSampler (and later
        # DistributedDataParallel) ask it for the rank and world size.
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None

    # `shuffle` and `sampler` are mutually exclusive in DataLoader; when a
    # sampler is supplied it is responsible for shuffling.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  num_workers=4,
                                  sampler=train_sampler)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    if args.distributed:
        ssd300 = DistributedDataParallel(ssd300)
    else:
        ssd300 = torch.nn.DataParallel(ssd300)

    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=1e-3,
                            momentum=0.9,
                            weight_decay=5e-4)
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    for epoch in range(args.epochs):
        if train_sampler is not None:
            # Without this every epoch would replay the same permutation.
            train_sampler.set_epoch(epoch)

        for nbatch, (img, img_size, bbox,
                     label) in enumerate(train_dataloader):
            start = time.time()

            if iter_num == 160000:
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-4

            if iter_num == 200000:
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-5

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)

            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc = Variable(trans_bbox, requires_grad=False)
            glabel = Variable(label, requires_grad=False)

            loss = loss_func(ploc, plabel, gloc, glabel)

            # Infinite losses are left out of the running average.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            optim.zero_grad()
            loss.backward()
            optim.step()
            end = time.time()

            if nbatch % 10 == 0:
                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Average time: {:.3f} secs"\
                            .format(iter_num, loss.item(), avg_loss, end - start))

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # NOTE(review): the state dict is taken from the wrapped
                    # model, while the resume path above loads into the bare
                    # SSD300 -- confirm the saved keys match what it expects.
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold):
                    return

            iter_num += 1
def train300_coco():
    """Train SSD300 on COCO train2017 using hard-coded relative paths.

    Runs on CUDA unconditionally. The learning rate drops to 1e-4 at
    iteration 160000 and to 1e-5 at iteration 200000; a checkpoint is written
    to ``./models/crowd_iter_<n>.pt`` every 5000 iterations. No new epoch is
    started once the iteration counter has reached 240000.
    """
    # Iteration -> (banner to print, new learning rate).
    lr_milestones = {
        160000: ("lr decay step #1", 1e-4),
        200000: ("lr decay step #2", 1e-5),
    }
    max_epochs = 800
    max_iters = 240000
    save_every = 5000

    default_boxes = dboxes300_coco()
    transform = SSDTransformer(default_boxes, (300, 300), val=False)

    dataset = COCODetection("../../coco_data/train2017",
                            "../../coco_ssd/instances_train2017.json",
                            transform)
    print("Number of labels: {}".format(dataset.labelnum))
    print("Number of images: {}".format(len(dataset)))

    loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

    model = SSD300(dataset.labelnum)
    model.train()
    model.cuda()

    criterion = Loss(default_boxes)
    criterion.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=1e-3,
                                momentum=0.9,
                                weight_decay=5e-4)

    print("epoch", "nbatch", "loss")

    step = 0
    running_loss = 0.0

    for _ in range(max_epochs):
        # The cap is only checked between epochs.
        if step >= max_iters:
            break

        for images, _, boxes, labels in loader:
            step += 1

            milestone = lr_milestones.get(step)
            if milestone is not None:
                banner, new_lr = milestone
                print("")
                print(banner)
                for group in optimizer.param_groups:
                    group['lr'] = new_lr

            images = Variable(images.cuda(), requires_grad=True)
            pred_loc, pred_label = model(images)

            target_loc = Variable(boxes.transpose(1, 2).contiguous().cuda(),
                                  requires_grad=False)
            target_label = Variable(labels.cuda(), requires_grad=False)

            loss = criterion(pred_loc, pred_label, target_loc, target_label)
            loss_value = loss.item()

            # Infinite losses are left out of the running average.
            if not np.isinf(loss_value):
                running_loss = 0.999 * running_loss + 0.001 * loss_value

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(step, loss_value, running_loss), end="\r")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % save_every == 0:
                print("")
                print("saving model...")
                checkpoint = {
                    "model": model.state_dict(),
                    "label_map": dataset.label_info,
                }
                torch.save(checkpoint,
                           "./models/crowd_iter_{}.pt".format(step))
def train512():
    """Train SSD512 on the VOC2007 trainval split using hard-coded paths.

    Runs on CUDA unconditionally. The learning rate drops to 1e-4 at
    iteration 40000 and to 1e-5 at iteration 50000; a checkpoint is written
    to ``./models/iter_<n>.pt`` every 5000 iterations. No new epoch is
    started once the iteration counter has reached 60000.
    """
    label_map = {}
    dboxes = dboxes512()
    trans = SSDTransformer(dboxes, (512, 512), val=False)

    img_folder = "../../VOCdevkit/VOC2007/JPEGImages"
    ann_folder = "../../VOCdevkit/VOC2007/Annotations"
    tgt_folder = "../../VOCdevkit/VOC2007/ImageSets/Main/trainval.txt"
    vd = VOCDetection(img_folder, ann_folder, tgt_folder,
                      label_map=label_map, transform=trans)
    dataloader = DataLoader(vd, batch_size=32, shuffle=True, num_workers=4)

    nepochs = 800
    ssd512 = SSD512(21)
    ssd512.train()
    ssd512.cuda()

    loss_func = Loss(dboxes)
    loss_func.cuda()

    optim = torch.optim.SGD(ssd512.parameters(),
                            lr=1e-3,
                            momentum=0.9,
                            weight_decay=5e-4)
    print("epoch", "nbatch", "loss")

    iter_num = 0
    avg_loss = 0.0

    for epoch in range(nepochs):
        # The cap is only checked between epochs.
        if iter_num >= 60000:
            break

        for nbatch, (img, img_size, bbox, label) in enumerate(dataloader):
            iter_num += 1

            if iter_num == 40000:
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-4

            if iter_num == 50000:
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-5

            img = Variable(img.cuda(), requires_grad=True)
            ploc, plabel = ssd512(img)

            gloc = Variable(bbox.transpose(1, 2).contiguous().cuda(),
                            requires_grad=False)
            glabel = Variable(label.cuda(), requires_grad=False)

            loss = loss_func(ploc, plabel, gloc, glabel)

            # Same guard as the SSD300 training loops: one infinite loss
            # would otherwise pin the running average at inf for the rest of
            # the run.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(iter_num, loss.item(), avg_loss), end="\r")

            optim.zero_grad()
            loss.backward()
            optim.step()

            # Drop the per-batch tensors before the next batch is loaded.
            del img, ploc, plabel, gloc, glabel, loss

            if iter_num % 5000 == 0:
                print("")
                print("saving model...")
                torch.save(
                    {
                        "model": ssd512.state_dict(),
                        "label_map": vd.label_map
                    }, "./models/iter_{}.pt".format(iter_num))
def train300_mlperf_coco(args):
    """Train SSD300 on COCO with mllog logging, optional DDP and IPEX.

    The function sets up (optionally distributed) training, seeds the RNGs,
    builds datasets/model/optimizer, then trains for ``args.epochs`` epochs.
    On the epochs selected by ``args.val_epochs`` / ``args.val_interval`` it
    optionally saves a checkpoint and runs ``coco_eval``.

    Notable attributes read from ``args``: ``no_cuda``, ``seed``,
    ``world_size``, ``rank``, ``local_rank``, ``master_addr``, ``port``,
    ``backend``, ``data``, ``batch_size``, ``val_batch_size``,
    ``batch_splits``, ``lr``, ``weight_decay``, ``warmup``,
    ``warmup_factor``, ``lr_decay_schedule``, ``epochs``, ``iteration``,
    ``checkpoint``, ``pretrained_backbone``, ``autocast``, ``profile``,
    ``performance_only``, ``warmup_iterations``, ``train_iteration``,
    ``print_freq``, ``log_interval``, ``no_save``, ``threshold`` and
    ``nms_valid_thresh``. ``args.distributed``, ``args.rank`` and (on the
    CUDA distributed path) ``args.seed`` are written back.

    Returns:
        ``True`` as soon as ``coco_eval`` reports success, otherwise
        ``False`` after the last epoch (or after ``args.train_iteration``
        iterations in performance-only mode).

    Raises:
        ImportError: CUDA is in use but ``apex`` cannot be imported.
    """
    # `import torch.utils.data.distributed` below rebinds the name `torch`;
    # declaring it global keeps that binding at module level.
    global torch
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        # Only a failed import is reported as "install APEX"; a malformed
        # WORLD_SIZE below now surfaces as its own ValueError instead of
        # being swallowed by a bare `except`.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    local_seed = args.seed
    os.environ['USE_CUDA'] = str(use_cuda)
    if args.world_size > 1:
        args.distributed = True

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        print('Distributed training with DDP')
        if args.no_cuda:
            device = torch.device('cpu')
            os.environ['RANK'] = str(os.environ.get('PMI_RANK', args.rank))
            os.environ['WORLD_SIZE'] = str(
                os.environ.get('PMI_SIZE', args.world_size))
            os.environ['MASTER_ADDR'] = args.master_addr
            os.environ['MASTER_PORT'] = args.port

            # Initialize the process group with ccl backend
            if args.backend == 'ccl':
                import torch_ccl
            dist.init_process_group(backend=args.backend)
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly
            # NOTE(review): these two lines are kept on the CUDA branch, with
            # `local_seed = args.seed` above as the CPU fallback -- confirm.
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32

    mllogger.event(key=mllog_const.SEED, value=local_seed)
    # Refer to https://pytorch.org/docs/stable/notes/randomness.html#dataloader
    torch.manual_seed(local_seed)  # Set PyTorch seed
    np.random.seed(seed=local_seed)  # Set Numpy seed
    random.seed(local_seed)  # Set the Python seed

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(
        dboxes, (input_size, input_size),
        val=False,
        num_cropping_iterations=args.num_cropping_iterations)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # The sampler shuffles when present; otherwise DataLoader does.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=0)
    # Workaround: dist.broadcast fails on the other ranks, so evaluation is
    # run on every rank and the validation loader is not sharded.
    val_dataloader = DataLoader(val_coco,
                                batch_size=args.val_batch_size
                                or args.batch_size,
                                shuffle=False,
                                sampler=None,
                                num_workers=0)

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    # Learning rate is scaled linearly relative to a global batch of 32.
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        print("using gradient accumulation with fragments of size {}".format(
            fragment_size))

    # Model to NHWC
    ssd300 = ssd300.to(memory_format=torch.channels_last)

    current_momentum = 0.9
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=args.weight_decay)
    ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr)
    ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay)

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb)
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
    ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_schedule)
    mllogger.start(key=mllog_const.BLOCK_START,
                   metadata={
                       mllog_const.FIRST_EPOCH_NUM: 1,
                       mllog_const.EPOCH_COUNT: args.epochs
                   })

    if args.performance_only:
        train_time = AverageMeter('TrainTime', ':6.3f')
        progress = ProgressMeter(args.train_iteration, [train_time],
                                 prefix='Train: ')

    # Restore the model and optim from checkpoint
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
        optim.load_state_dict(od['optim'])

    # Model Prepack
    if use_ipex:
        if args.autocast:
            ssd300, optim = ipex.optimize(ssd300,
                                          dtype=torch.bfloat16,
                                          optimizer=optim)
        else:
            ssd300, optim = ipex.optimize(ssd300,
                                          dtype=torch.float32,
                                          optimizer=optim)

    # parallelize
    if args.distributed:
        device_ids = None
        ssd300 = torch.nn.parallel.DistributedDataParallel(
            ssd300, device_ids=device_ids)

    optim.zero_grad(set_to_none=True)
    for epoch in range(args.epochs):
        mllogger.start(key=mllog_const.EPOCH_START,
                       metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        for nbatch, (img, img_id, img_size, bbox,
                     label) in enumerate(train_dataloader):
            # Gradient accumulation over batch fragments is not implemented,
            # so every batch takes the single-pass path.
            naive_train_case = True  # img.shape[0] == fragment_size
            if naive_train_case:
                # Naive train case
                fimg, gloc, glabel, mask, pos_num, neg_num, num_mask = data_preprocess(
                    img, bbox, label, loss_func, args.autocast)

                if args.performance_only and iter_num >= args.warmup_iterations:
                    start_time = time.time()

                if args.profile and args.performance_only and iter_num == 30:
                    # Profile Mode: only iteration 30 is traced.
                    with torch.profiler.profile(
                            on_trace_ready=trace_handler) as prof:
                        with torch.cpu.amp.autocast(enabled=args.autocast):
                            ploc, plabel = ssd300(fimg)
                            loss = loss_func(ploc, plabel, gloc, glabel, mask,
                                             pos_num, neg_num, num_mask,
                                             args.autocast)
                        loss.backward()
                        warmup_step(iter_num, current_lr)
                        optim.step()
                        optim.zero_grad(set_to_none=True)
                else:
                    # Non Profile Mode
                    with torch.cpu.amp.autocast(enabled=args.autocast):
                        ploc, plabel = ssd300(fimg)
                        loss = loss_func(ploc, plabel, gloc, glabel, mask,
                                         pos_num, neg_num, num_mask,
                                         args.autocast)
                    loss.backward()
                    warmup_step(iter_num, current_lr)
                    optim.step()
                    optim.zero_grad(set_to_none=True)
            else:
                # Train case: when split input to several fragment size
                print("Not support input with several fragment size yet.")
                exit(-1)

            if args.performance_only and iter_num >= args.warmup_iterations:
                train_time.update(time.time() - start_time)
            if args.performance_only and iter_num % args.print_freq == 0:
                progress.display(iter_num)

            # Infinite losses are left out of the running average.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            if args.log_interval and not iter_num % args.log_interval:
                print("Iteration: {:6d}, Loss function: {:5.8f}, Average Loss: {:.8f}"\
                    .format(iter_num, loss.item(), avg_loss))
            iter_num += 1

            if args.performance_only and iter_num >= args.train_iteration:
                break

        # Leave the epoch loop as well once the iteration budget is used up.
        if args.performance_only and iter_num >= args.train_iteration:
            break

        if (args.val_epochs and (epoch+1) in args.val_epochs) or \
           (args.val_interval and not (epoch+1) % args.val_interval):
            if args.distributed:
                # Average the batch-norm running statistics across ranks.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        ssd_print(key=mllog_const.MODEL_BN_SPAN,
                                  value=bn_buf.cpu().detach().numpy())

            # Workaround: dist.broadcast fails on the other ranks, so the
            # rank check is short-circuited and every rank saves/evaluates.
            if args.rank == 0 or True:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info,
                            "optim": optim.state_dict()
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300,
                             val_dataloader,
                             cocoGt,
                             encoder,
                             inv_map,
                             args.threshold,
                             epoch + 1,
                             iter_num,
                             log_interval=args.log_interval,
                             nms_valid_thresh=args.nms_valid_thresh,
                             use_autocast=args.autocast):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()

            # Same workaround: evaluation runs on all ranks, so the result
            # does not need to be broadcast.
            if success[0]:
                return True

        mllogger.end(key=mllog_const.EPOCH_STOP,
                     metadata={mllog_const.EPOCH_NUM: epoch})

    mllogger.end(key=mllog_const.BLOCK_STOP,
                 metadata={
                     mllog_const.FIRST_EPOCH_NUM: 1,
                     mllog_const.EPOCH_COUNT: args.epochs
                 })

    if args.performance_only:
        batch_size = args.batch_size
        latency = train_time.avg / batch_size * 1000
        perf = batch_size / train_time.avg
        print('train latency %.2f ms' % latency)
        print('train performance %.2f fps' % perf)
        print("Throughput: {:.3f} fps".format(perf))

    return False
def train_mlperf_coco(args):
    """Train SSD-ResNet34 on COCO, optionally with ``nn.DataParallel``.

    Attributes read from ``args``:
        no_cuda: when true (or when CUDA is unavailable) training stays on
            the CPU.
        image_size, strides: passed to ``dboxes_coco``; ``strides`` also goes
            to ``SSD_R34`` and ``tuple(image_size)`` to the transforms.
        data, batch_size, checkpoint, iteration, epochs: dataset root, batch
            size, optional checkpoint to resume from, starting iteration
            counter and number of epochs.
        device_ids: GPU ids for ``nn.DataParallel``; the model is wrapped
            only on CUDA with more than one id.
        evaluation: collection of iteration numbers at which the model is
            saved (unless ``no_save``) and evaluated.
        save_path: directory receiving ``iter_<n>.pt`` checkpoints.
        threshold: forwarded to ``coco_eval``.

    Returns:
        ``None``. Training stops early as soon as ``coco_eval`` returns a
        truthy value.
    """
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    # Honor the CPU fallback everywhere instead of hard-coding 'cuda'.
    device = 'cuda' if use_cuda else 'cpu'

    dboxes = dboxes_coco(args.image_size, args.strides)
    encoder = Encoder(dboxes)
    train_trans = SSDTransformer(dboxes, tuple(args.image_size), val=False)
    val_trans = SSDTransformer(dboxes, tuple(args.image_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=4)

    # The model is built once, sized from the dataset's label count (an
    # earlier throwaway 81-class instance was constructed and discarded).
    ssd_r34 = SSD_R34(train_coco.labelnum, strides=args.strides)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd_r34.load_state_dict(od["model"])
    ssd_r34.train()
    ssd_r34.to(device)
    if use_cuda:
        if args.device_ids and len(args.device_ids) > 1:
            ssd_r34 = nn.DataParallel(ssd_r34, args.device_ids)

    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.to('cuda')
        loss_func = nn.DataParallel(loss_func, args.device_ids)

    optim = torch.optim.SGD(ssd_r34.parameters(),
                            lr=1e-3,
                            momentum=0.9,
                            weight_decay=5e-4)
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    # Sliding window over the 10 most recent losses, newest first.
    last_loss = [0.0] * 10
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    for epoch in range(args.epochs):
        for nbatch, (img, img_size, bbox,
                     label) in enumerate(train_dataloader):
            if iter_num == 160000:
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-4

            if iter_num == 200000:
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = 1e-5

            img = Variable(img, requires_grad=True)
            ploc, plabel, _ = ssd_r34(img.to(device))

            trans_bbox = bbox.transpose(1, 2).contiguous()
            gloc = Variable(trans_bbox, requires_grad=False)
            glabel = Variable(label, requires_grad=False)

            # .mean() collapses whatever loss_func returns to a scalar.
            loss = loss_func(ploc, plabel, gloc, glabel).mean()

            # Infinite losses are left out of the running average.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            last_loss.pop()
            last_loss = [loss.item()] + last_loss
            avg_last_loss = sum(last_loss) / len(last_loss)
            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Average Last 10 Loss: {:.3f}"\
                        .format(iter_num, loss.item(), avg_loss,avg_last_loss), end="\r")

            optim.zero_grad()
            loss.backward()
            optim.step()
            # Drop the reference so the graph is not kept alive during eval.
            loss = None

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # Unwrap based on what the model actually is; inspecting
                    # args.device_ids crashed when it was None or when the
                    # model had not been wrapped (CPU run).
                    if isinstance(ssd_r34, nn.DataParallel):
                        module = ssd_r34.module
                    else:
                        module = ssd_r34
                    torch.save(
                        {
                            "model": module.state_dict(),
                            "label_map": train_coco.label_info
                        }, args.save_path + "/iter_{}.pt".format(iter_num))

                if coco_eval(ssd_r34, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, args.device_ids):
                    return

            iter_num += 1
def train300_mlperf_coco(args):
    """Single-process SSD300 training on COCO with ``mlperf_log`` output.

    Attributes read from ``args``: ``no_cuda``, ``data``, ``batch_size``,
    ``checkpoint``, ``iteration``, ``epochs``, ``evaluation``, ``no_save``
    and ``threshold``.

    The learning rate starts at 1e-3 and is lowered to 1e-4 at iteration
    160000 and to 1e-5 at iteration 200000. At every iteration listed in
    ``args.evaluation`` the model is saved to ``./models/iter_<n>.pt``
    (unless ``args.no_save``) and evaluated with ``coco_eval``.

    Returns:
        ``True`` as soon as ``coco_eval`` returns a truthy value, ``False``
        if all epochs finish without that happening.
    """
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    default_boxes = dboxes300_coco()
    encoder = Encoder(default_boxes)

    input_size = 300
    image_shape = (input_size, input_size)
    train_trans = SSDTransformer(default_boxes, image_shape, val=False)
    val_trans = SSDTransformer(default_boxes, image_shape, val=True)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_loader = DataLoader(train_coco,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=4)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)

    model = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        state = torch.load(args.checkpoint)
        model.load_state_dict(state["model"])
    model.train()
    if use_cuda:
        model.cuda()

    criterion = Loss(default_boxes)
    if use_cuda:
        criterion.cuda()

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=current_lr,
                                momentum=current_momentum,
                                weight_decay=current_weight_decay)
    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=current_weight_decay)

    print("epoch", "nbatch", "loss")

    # Iteration -> (new learning rate, banner to print).
    lr_milestones = {
        160000: (1e-4, "lr decay step #1"),
        200000: (1e-5, "lr decay step #2"),
    }

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)

    for epoch in range(args.epochs):
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)

        for img, _, bbox, label in train_loader:
            milestone = lr_milestones.get(iter_num)
            if milestone is not None:
                current_lr, banner = milestone
                print("")
                print(banner)
                for group in optimizer.param_groups:
                    group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = model(img)

            target_boxes = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                target_boxes = target_boxes.cuda()
                label = label.cuda()
            gloc = Variable(target_boxes, requires_grad=False)
            glabel = Variable(label, requires_grad=False)

            loss = criterion(ploc, plabel, gloc, glabel)
            loss_value = loss.item()

            # Infinite losses are left out of the running average.
            if not np.isinf(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                        .format(iter_num, loss_value, avg_loss), end="\r")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    snapshot = {
                        "model": model.state_dict(),
                        "label_map": train_coco.label_info,
                    }
                    torch.save(snapshot,
                               "./models/iter_{}.pt".format(iter_num))

                reached_target = coco_eval(model, val_coco, cocoGt, encoder,
                                           inv_map, args.threshold, epoch,
                                           iter_num)
                if reached_target:
                    return True

            iter_num += 1

    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017, optionally data-parallel through apex DDP.

    Distributed mode is switched on when CUDA is usable and the ``WORLD_SIZE``
    environment variable is greater than 1.

    Args:
        args: Parsed options. Fields read here: ``no_cuda``, ``local_rank``,
            ``seed``, ``data``, ``batch_size``, ``checkpoint``, ``lr``,
            ``evaluation``, ``iteration``, ``warmup``, ``epochs``,
            ``lr_decay_schedule``, ``no_save`` and ``threshold``.
            ``args.distributed`` is written, and ``args.seed`` is overwritten
            in distributed mode.

    Returns:
        True once ``coco_eval`` (run on rank 0) returns a truthy value at one
        of the epochs in ``args.evaluation``; False if all epochs complete
        first.

    Raises:
        ImportError: CUDA is requested but apex cannot be imported.
    """
    # ``import torch.<submodule>`` below would otherwise make ``torch`` a
    # function-local name and break the earlier uses in this function.
    global torch
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        # Only the import is guarded, so a malformed WORLD_SIZE surfaces as
        # its own ValueError rather than a misleading "install APEX" error.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist

        if args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
            print(dist.get_rank(), "Using seed = {}".format(local_seed))
            torch.manual_seed(local_seed)
            np.random.seed(seed=local_seed)

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # Shuffle in the loader only when no sampler is supplied.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=4)
    ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    ssd_print(key=mlperf_log.INPUT_ORDER)
    ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        # Load onto the CPU first so the checkpoint restores on CPU-only runs;
        # the model is moved to the GPU below.
        od = torch.load(args.checkpoint,
                        map_location=lambda storage, loc: storage)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    N_gpu = torch.distributed.get_world_size() if args.distributed else 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    global_batch_size = N_gpu * args.batch_size
    # args.lr is specified for a global batch of 32 and scaled linearly.
    current_lr = args.lr * (global_batch_size / 32)
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay)

    eval_points = args.evaluation
    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    for epoch in range(args.epochs):
        ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr
            ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

        for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):
            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)

            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc = Variable(trans_bbox, requires_grad=False)
            glabel = Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Read the scalar once; keep Inf *and* NaN out of the running
            # average, otherwise a single NaN would stick forever.
            loss_value = loss.item()
            if np.isfinite(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"
                  .format(iter_num, loss_value, avg_loss), end="\r")

            optim.zero_grad()
            loss.backward()
            warmup_step(iter_num, current_lr)
            optim.step()

            iter_num += 1

        if epoch + 1 in eval_points:
            rank = dist.get_rank() if args.distributed else args.local_rank
            if args.distributed:
                # Average the batch-norm running statistics over all ranks
                # before the model is saved / evaluated.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var' in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size

            if rank == 0:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # The checkpoint directory may not exist on a fresh run.
                    os.makedirs("./models", exist_ok=True)
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, epoch + 1, iter_num):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()

            # Rank 0 is the only evaluator; share its verdict with the others.
            if args.distributed:
                dist.broadcast(success, 0)
            if success[0]:
                return True

    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 with optional fp16, apex DDP, JIT and DALI.

    The function builds and warms up the model first (everything up to the
    ``INIT_STOP`` log), and only then touches the datasets.

    Args:
        args: Parsed options. Among the fields read here: ``local_rank``,
            ``bn_group``, ``data``, ``backbone``, ``nhwc``, ``pad_input``,
            ``num_classes``, ``checkpoint``, ``opt_loss``, ``use_fp16``,
            ``delay_allreduce``, ``batch_size``, ``lr``, ``wd``, ``warmup``,
            ``warmup_factor``, ``evaluation``, ``iteration``, ``jit``,
            ``no_dali``, ``input_batch_multiplier``, ``num_workers``,
            ``eval_batch_size``, ``epochs``, ``decay1``, ``decay2``,
            ``print_interval``, ``max_iter``, ``no_save``, ``threshold``,
            ``allreduce_running_stats`` and the ``profile*`` / DALI options.
            ``args.distributed`` is written.

    Returns:
        True when ``coco_eval`` succeeds at one of the evaluation points,
        False when training ends without success, and None when the run stops
        at the ``args.profile`` iteration.
    """
    from pycocotools.coco import COCO

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    local_seed = set_seeds(args)

    N_gpu = torch.distributed.get_world_size() if args.distributed else 1

    validate_group_bn(args.bn_group)

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    input_size = 300
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    # Build the model
    model_options = {
        'backbone': args.backbone,
        'use_nhwc': args.nhwc,
        'pad_input': args.pad_input,
        'bn_group': args.bn_group,
    }

    ssd300 = SSD300(args.num_classes, **model_options)
    if args.checkpoint is not None:
        load_checkpoint(ssd300, args.checkpoint)

    ssd300.train()
    ssd300.cuda()
    if args.opt_loss:
        loss_func = OptLoss(dboxes)
    else:
        loss_func = Loss(dboxes)
    loss_func.cuda()

    if args.use_fp16:
        ssd300 = network_to_half(ssd300)

    # Parallelize.  Need to do this after network_to_half.
    if args.distributed:
        if args.delay_allreduce:
            print_message(args.local_rank,
                          "Delaying allreduces to the end of backward()")
        ssd300 = DDP(ssd300,
                     gradient_predivide_factor=N_gpu / 8.0,
                     delay_allreduce=args.delay_allreduce,
                     retain_allreduce_buffers=args.use_fp16)

    # Create optimizer.  This must also be done after network_to_half.
    global_batch_size = (N_gpu * args.batch_size)
    mlperf_print(key=mlperf_compliance.constants.MODEL_BN_SPAN,
                 value=args.bn_group * args.batch_size)
    mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE,
                 value=global_batch_size)

    # mlperf only allows base_lr scaled by an integer
    base_lr = 2.5e-3
    requested_lr_multiplier = args.lr / base_lr
    adjusted_multiplier = max(
        1, round(requested_lr_multiplier * global_batch_size / 32))

    current_lr = base_lr * adjusted_multiplier
    current_momentum = 0.9
    current_weight_decay = args.wd
    static_loss_scale = 128.
    if args.use_fp16:
        if args.distributed and not args.delay_allreduce:
            # We can't create the flat master params yet, because we need to
            # imitate the flattened bucket structure that DDP produces.
            optimizer_created = False
        else:
            model_buckets = [
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.HalfTensor"
                ],
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.FloatTensor"
                ],
            ]
            flat_master_buckets = create_flat_master(model_buckets)
            optim = torch.optim.SGD(flat_master_buckets,
                                    lr=current_lr,
                                    momentum=current_momentum,
                                    weight_decay=current_weight_decay)
            optimizer_created = True
    else:
        optim = torch.optim.SGD(ssd300.parameters(),
                                lr=current_lr,
                                momentum=current_momentum,
                                weight_decay=current_weight_decay)
        optimizer_created = True

    mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR,
                 value=current_lr)
    mlperf_print(key=mlperf_compliance.constants.OPT_WEIGHT_DECAY,
                 value=current_weight_decay)
    if args.warmup is not None:
        mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_STEPS,
                     value=args.warmup)
        mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_FACTOR,
                     value=args.warmup_factor)

    # Model is completely finished -- need to create separate copies,
    # preserve parameters across them, and jit
    ssd300_eval = SSD300(args.num_classes,
                         backbone=args.backbone,
                         use_nhwc=args.nhwc,
                         pad_input=args.pad_input).cuda()
    if args.use_fp16:
        ssd300_eval = network_to_half(ssd300_eval)

    # Get the existing state from the train model
    # * if we use distributed, then we want .module
    train_model = ssd300.module if args.distributed else ssd300

    ssd300_eval.load_state_dict(train_model.state_dict())
    ssd300_eval.eval()

    print_message(args.local_rank, "epoch", "nbatch", "loss")
    # args.evaluation is rescaled from a global batch of 32 to the actual
    # global batch size.
    eval_points = np.array(args.evaluation) * 32 / global_batch_size
    eval_points = list(map(int, list(eval_points)))

    iter_num = args.iteration
    avg_loss = 0.0

    start_elapsed_time = time.time()
    num_elapsed_samples = 0

    # Generate normalization tensors
    mean, std = generate_mean_std(args)

    dummy_overflow_buf = torch.cuda.IntTensor([0])

    def step_maybe_fp16_maybe_distributed(optim):
        """Run one optimizer step, routing through fp32 master params for fp16.

        Reads ``flat_master_buckets``, ``model_buckets`` and ``ssd300`` from
        the enclosing scope at call time, so later rebinding there is seen.
        """
        if args.use_fp16:
            if args.distributed:
                for flat_master, allreduce_buffer in zip(
                        flat_master_buckets, ssd300.allreduce_buffers):
                    if allreduce_buffer is None:
                        raise RuntimeError("allreduce_buffer is None")
                    flat_master.grad = allreduce_buffer.float()
                    # undo the static loss scaling applied before backward()
                    flat_master.grad.data.mul_(1. / static_loss_scale)
            else:
                for flat_master, model_bucket in zip(flat_master_buckets,
                                                     model_buckets):
                    flat_grad = apex_C.flatten(
                        [m.grad.data for m in model_bucket])
                    flat_master.grad = flat_grad.float()
                    flat_master.grad.data.mul_(1. / static_loss_scale)
        optim.step()
        if args.use_fp16:
            # Use multi-tensor scale instead of loop & individual parameter
            # copies
            for model_bucket, flat_master in zip(model_buckets,
                                                 flat_master_buckets):
                multi_tensor_applier(
                    amp_C.multi_tensor_scale, dummy_overflow_buf, [
                        apex_C.unflatten(flat_master.data, model_bucket),
                        model_bucket
                    ], 1.0)

    input_c = 4 if args.pad_input else 3
    if args.nhwc:
        example_shape = [args.batch_size, 300, 300, input_c]
    else:
        example_shape = [args.batch_size, input_c, 300, 300]
    example_input = torch.randn(*example_shape).cuda()

    if args.use_fp16:
        example_input = example_input.half()
    if args.jit:
        # DDP has some Python-side control flow.  If we JIT the entire
        # DDP-wrapped module, the resulting ScriptModule will elide this
        # control flow, resulting in allreduce hooks not being called.  If
        # we're running distributed, we need to extract and JIT the wrapped
        # .module.
        # Replacing a DDP-ed ssd300 with a script_module might also cause the
        # AccumulateGrad hooks to go out of scope, and therefore silently
        # disappear.
        module_to_jit = ssd300.module if args.distributed else ssd300
        if args.distributed:
            ssd300.module = torch.jit.trace(module_to_jit, example_input)
        else:
            ssd300 = torch.jit.trace(module_to_jit, example_input)
        # JIT the eval model too
        ssd300_eval = torch.jit.trace(ssd300_eval, example_input)

    # do a dummy fprop & bprop to make sure cudnnFind etc. are timed here
    ploc, plabel = ssd300(example_input)

    # produce a single dummy "loss" to make things easier
    loss = ploc[0, 0, 0] + plabel[0, 0, 0]
    dloss = torch.randn_like(loss)
    # Cause cudnnFind for dgrad, wgrad to run
    loss.backward(dloss)

    mlperf_print(key=mlperf_compliance.constants.INIT_STOP, sync=True)
    ##### END INIT

    # This is the first place we touch anything related to data
    ##### START DATA TOUCHING
    mlperf_print(key=mlperf_compliance.constants.RUN_START, sync=True)
    barrier()

    cocoGt = COCO(annotation_file=val_annotate, use_ext=True)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)

    if args.distributed:
        val_sampler = GeneralDistributedSampler(val_coco, pad=False)
    else:
        val_sampler = None

    if args.no_dali:
        train_trans = SSDTransformer(dboxes, (input_size, input_size),
                                     val=False)
        train_coco = COCODetection(train_coco_root, train_annotate,
                                   train_trans)
        # There is no DALI pipeline on this path, so the epoch size used for
        # the BLOCK_START metadata comes from the dataset itself.
        # NOTE(review): assumes len(train_coco) matches what the DALI reader
        # would report for the same data -- confirm.
        train_epoch_size = len(train_coco)

        if args.distributed:
            train_sampler = GeneralDistributedSampler(train_coco, pad=False)
        else:
            train_sampler = None

        train_loader = DataLoader(
            train_coco,
            batch_size=args.batch_size * args.input_batch_multiplier,
            shuffle=(train_sampler is None),
            sampler=train_sampler,
            num_workers=args.num_workers,
            collate_fn=partial(my_collate, is_training=True))
    else:
        train_pipe = COCOPipeline(
            args.batch_size * args.input_batch_multiplier,
            args.local_rank,
            train_coco_root,
            train_annotate,
            N_gpu,
            num_threads=args.num_workers,
            output_fp16=args.use_fp16,
            output_nhwc=args.nhwc,
            pad_output=args.pad_input,
            seed=local_seed - 2**31,
            use_nvjpeg=args.use_nvjpeg,
            use_roi=args.use_roi_decode,
            dali_cache=args.dali_cache,
            dali_async=(not args.dali_sync))

        print_message(args.local_rank,
                      "time_check a: {secs:.9f}".format(secs=time.time()))
        train_pipe.build()
        print_message(args.local_rank,
                      "time_check b: {secs:.9f}".format(secs=time.time()))
        # Run the pipeline once before handing it to the iterator; the
        # returned batch is not used.
        train_pipe.run()
        train_epoch_size = train_pipe.epoch_size()['train_reader']
        train_loader = SingleDaliIterator(
            train_pipe, [
                'images',
                DALIOutput('bboxes', False, True),
                DALIOutput('labels', True, True)
            ],
            train_epoch_size,
            ngpu=N_gpu)

    train_loader = EncodingInputIterator(train_loader,
                                         dboxes=encoder.dboxes.cuda(),
                                         nhwc=args.nhwc,
                                         fake_input=args.fake_input,
                                         no_dali=args.no_dali)
    if args.input_batch_multiplier > 1:
        train_loader = RateMatcher(input_it=train_loader,
                                   output_size=args.batch_size)

    val_dataloader = DataLoader(
        val_coco,
        batch_size=args.eval_batch_size,
        shuffle=False,  # Note: distributed sampler is shuffled :(
        sampler=val_sampler,
        num_workers=args.num_workers)

    inv_map = {v: k for k, v in val_coco.label_map.items()}

    ##### END DATA TOUCHING
    i_eval = 0
    first_epoch = 1
    mlperf_print(key=mlperf_compliance.constants.BLOCK_START,
                 metadata={
                     'first_epoch_num': first_epoch,
                     'epoch_count':
                         args.evaluation[i_eval] * 32 / train_epoch_size
                 },
                 sync=True)
    for epoch in range(args.epochs):
        mlperf_print(key=mlperf_compliance.constants.EPOCH_START,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)
        for p in ssd300.parameters():
            p.grad = None

        for i, (img, bbox, label) in enumerate(train_loader):

            if args.profile_start is not None and iter_num == args.profile_start:
                torch.cuda.profiler.start()
                torch.cuda.synchronize()
                if args.profile_nvtx:
                    torch.autograd._enable_profiler(
                        torch.autograd.ProfilerState.NVTX)

            if args.profile is not None and iter_num == args.profile:
                if args.profile_start is not None and iter_num >= args.profile_start:
                    # we turned cuda and nvtx profiling on, better turn it off
                    # too
                    if args.profile_nvtx:
                        torch.autograd._disable_profiler()
                    torch.cuda.profiler.stop()
                return

            if args.warmup is not None and optimizer_created:
                lr_warmup(optim, args.warmup, iter_num, epoch, current_lr,
                          args)
            if iter_num == ((args.decay1 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #1")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr

            if iter_num == ((args.decay2 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #2")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr

            if (img is None) or (bbox is None) or (label is None):
                print("No labels in batch")
                continue

            ploc, plabel = ssd300(img)
            ploc, plabel = ploc.float(), plabel.float()

            N = img.shape[0]
            gloc = Variable(bbox, requires_grad=False)
            glabel = Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            if np.isfinite(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            else:
                print("model exploded (corrupted by Inf or Nan)")
                sys.exit()

            num_elapsed_samples += N
            if args.local_rank == 0 and iter_num % args.print_interval == 0:
                end_elapsed_time = time.time()
                elapsed_time = end_elapsed_time - start_elapsed_time

                avg_samples_per_sec = num_elapsed_samples * N_gpu / elapsed_time

                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"
                      .format(iter_num, loss.item(), avg_loss,
                              avg_samples_per_sec), end="\n")

                start_elapsed_time = time.time()
                num_elapsed_samples = 0

            # loss scaling
            if args.use_fp16:
                loss = loss * static_loss_scale
            loss.backward()

            if not optimizer_created:
                # Imitate the model bucket structure created by DDP.
                # These will already be split by type (float or half).
                model_buckets = [
                    [ssd300.active_params[active_i] for active_i in bucket]
                    for bucket in ssd300.active_i_buckets
                ]
                flat_master_buckets = create_flat_master(model_buckets)
                optim = torch.optim.SGD(flat_master_buckets,
                                        lr=current_lr,
                                        momentum=current_momentum,
                                        weight_decay=current_weight_decay)
                optimizer_created = True
                # Skip the step on this first iteration because flattened
                # allreduce buffers are not yet created.
            else:
                step_maybe_fp16_maybe_distributed(optim)

            # Likely a decent skew here, let's take this opportunity to set
            # the gradients to None.  After DALI integration, playing with the
            # placement of this is worth trying.
            for p in ssd300.parameters():
                p.grad = None

            if iter_num in eval_points:
                # Get the existing state from the train model
                # * if we use distributed, then we want .module
                train_model = ssd300.module if args.distributed else ssd300

                if args.distributed and args.allreduce_running_stats:
                    if get_rank() == 0:
                        print("averaging bn running means and vars")
                    # make sure every node has the same running bn stats
                    # before using them to evaluate, or saving the model for
                    # inference
                    world_size = float(torch.distributed.get_world_size())
                    for bn_name, bn_buf in train_model.named_buffers(
                            recurse=True):
                        if ('running_mean' in bn_name) or ('running_var' in bn_name):
                            # Spelled via torch.distributed like every other
                            # collective here; no ``dist`` alias is bound in
                            # this function.
                            torch.distributed.all_reduce(
                                bn_buf, op=torch.distributed.ReduceOp.SUM)
                            bn_buf /= world_size

                if get_rank() == 0:
                    if not args.no_save:
                        print("saving model...")
                        # The checkpoint directory may not exist yet.
                        os.makedirs("./models", exist_ok=True)
                        torch.save(
                            {
                                "model": ssd300.state_dict(),
                                "label_map": val_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))

                ssd300_eval.load_state_dict(train_model.state_dict())
                succ = coco_eval(
                    ssd300_eval,
                    val_dataloader,
                    cocoGt,
                    encoder,
                    inv_map,
                    args.threshold,
                    epoch,
                    iter_num,
                    args.eval_batch_size,
                    use_fp16=args.use_fp16,
                    local_rank=args.local_rank if args.distributed else -1,
                    N_gpu=N_gpu,
                    use_nhwc=args.nhwc,
                    pad_input=args.pad_input)
                mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP,
                             metadata={'first_epoch_num': first_epoch},
                             sync=True)
                if succ:
                    return True
                if iter_num != max(eval_points):
                    # Open the next block, spanning up to the next eval point.
                    i_eval += 1
                    first_epoch = epoch + 1
                    mlperf_print(
                        key=mlperf_compliance.constants.BLOCK_START,
                        metadata={
                            'first_epoch_num': first_epoch,
                            'epoch_count':
                                (args.evaluation[i_eval] -
                                 args.evaluation[i_eval - 1]) * 32 /
                                train_epoch_size
                        },
                        sync=True)
            iter_num += 1
            if args.max_iter > 0:
                if iter_num > args.max_iter:
                    break

        train_loader.reset()
        mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP,
                     metadata={'epoch_num': epoch + 1},
                     sync=True)
    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 with a DALI input pipeline (GPU only).

    Besides training, the function tracks data-loading and whole-batch
    timings and prints them every iteration and once at the end.

    Args:
        args: Parsed options. Fields read here: ``data``, ``batch_size``,
            ``seed``, ``checkpoint``, ``iteration``, ``epochs``,
            ``evaluation``, ``no_save`` and ``threshold``.

    Returns:
        True as soon as ``coco_eval`` returns a truthy value at one of the
        iterations in ``args.evaluation``; False when no GPU is available or
        when all epochs complete first.
    """
    from coco import COCO

    # Check that GPUs are actually available
    if not torch.cuda.is_available():
        print("Error. No GPU available.")
        return False

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)

    train_pipe = COCOPipeline(args.batch_size, train_coco_root,
                              train_annotate, dboxes, args.seed)
    train_pipe.build()
    train_loader = DALIGenericIterator(train_pipe,
                                       ["images", "boxes", "labels"],
                                       train_pipe.epoch_size("Reader"))

    mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE,
                         value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    ssd300.cuda()
    loss_func = Loss(dboxes)
    loss_func.cuda()

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                         value=current_weight_decay)

    # iteration -> (decay step number, learning rate used from then on)
    lr_decay_steps = {160000: (1, 1e-4), 200000: (2, 1e-5)}

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    mean, std = generate_mean_std()

    data_perf = AverageMeter()
    batch_perf = AverageMeter()
    end = time.time()
    train_start = end

    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)

        for nbatch, data in enumerate(train_loader):
            img = data[0]["images"]
            bbox = data[0]["boxes"]
            label = data[0]["labels"]

            boxes_in_batch = len(label.nonzero())
            if boxes_in_batch == 0:
                print("No labels in batch")
                continue

            label = label.type(torch.cuda.LongTensor)

            img = Variable(img, requires_grad=True)
            trans_bbox = bbox.transpose(1, 2).contiguous()

            gloc = Variable(trans_bbox, requires_grad=False)
            glabel = Variable(label, requires_grad=False)

            # time spent obtaining and preparing this batch
            data_perf.update(time.time() - end)

            if iter_num in lr_decay_steps:
                step_no, current_lr = lr_decay_steps[iter_num]
                print("")
                print("lr decay step #{}".format(step_no))
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            ploc, plabel = ssd300(img)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Read the scalar once; keep Inf *and* NaN out of the running
            # average, otherwise a single NaN would stick forever.
            loss_value = loss.item()
            if np.isfinite(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value

            optim.zero_grad()
            loss.backward()
            optim.step()

            # data time plus forward/backward/step
            batch_perf.update(time.time() - end)

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # The checkpoint directory may not exist on a fresh run.
                    os.makedirs("./models", exist_ok=True)
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                # Evaluation stays best-effort (training continues on error),
                # but Ctrl-C / SystemExit are no longer swallowed and the
                # cause of the failure is reported.
                try:
                    if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                                 args.threshold, epoch, iter_num):
                        return True
                except Exception as err:
                    print("Eval error on iteration {0}".format(iter_num))
                    print(repr(err))

            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Data perf: {:3f} img/sec, Batch perf: {:3f} img/sec, Avg Data perf: {:3f} img/sec, Avg Batch perf: {:3f} img/sec"
                  .format(iter_num, loss_value, avg_loss,
                          args.batch_size / data_perf.val,
                          args.batch_size / batch_perf.val,
                          args.batch_size / data_perf.avg,
                          args.batch_size / batch_perf.avg), end="\r")

            end = time.time()
            iter_num += 1
            # Drop the first iterations from the timing averages.
            if iter_num == 10 and epoch == 0:
                data_perf.reset()
                batch_perf.reset()

        train_loader.reset()

    print("\n\n")
    print("Training end: Data perf: {:3f} img/sec, Batch perf: {:3f} img/sec, Total time: {:3f} sec"
          .format(args.batch_size / data_perf.avg,
                  args.batch_size / batch_perf.avg,
                  time.time() - train_start))
    return False
def test_coco(args):
    """Measure SSD300 forward + loss throughput on COCO via a DALI pipeline.

    The data is iterated twice: the first pass is untimed, the second pass is
    timed and its images/second figure is printed by local rank 0. CUDA is
    required.

    Args:
        args: Parsed options. Fields read here: ``local_rank``, ``data``,
            ``use_train_dataset``, ``batch_size``, ``num_workers``,
            ``backbone``, ``use_fp16`` and ``fbu`` (forward passes per batch).
            ``args.distributed`` is written.
    """
    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    if args.use_train_dataset:
        annotate = os.path.join(args.data,
                                "annotations/instances_train2017.json")
        coco_root = os.path.join(args.data, "train2017")
        img_number = 118287
    else:
        annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
        coco_root = os.path.join(args.data, "val2017")
        img_number = 5000

    pipe = COCOPipeline(args.batch_size,
                        args.local_rank,
                        coco_root,
                        annotate,
                        N_gpu,
                        num_threads=args.num_workers)
    pipe.build()
    # Run the pipeline once before handing it to the iterator; the returned
    # batch is not used.
    pipe.run()
    dataloader = DALICOCOIterator(pipe, img_number / N_gpu)

    # Build the model
    # NOTE: no checkpoint is loaded here, so the model runs with whatever
    # weights SSD300(...) constructs.
    ssd300 = SSD300(81, backbone=args.backbone, model_path='', dilation=False)

    ssd300.cuda()
    ssd300.eval()
    loss_func = Loss(dboxes)
    loss_func.cuda()

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    if args.use_fp16:
        ssd300 = network_to_half(ssd300)

    if args.use_train_dataset and args.local_rank == 0:
        print(
            'Image 000000320612.jpg is in fact PNG and it will cause fail if ' +
            'used with nvJPEGDecoder in coco_pipeline')

    for epoch in range(2):
        if epoch == 1 and args.local_rank == 0:
            print("Performance computation starts")
            # CUDA kernels run asynchronously: drain the first pass before
            # starting the clock so it is not billed to the timed pass.
            torch.cuda.synchronize()
            s = time.time()

        for data in dataloader:
            with torch.no_grad():
                # Get data from pipeline
                img = data[0][0][0]
                bbox = data[0][1][0]
                label = data[0][2][0]
                label = label.type(torch.cuda.LongTensor)
                bbox_offsets = data[0][3][0]
                bbox_offsets = bbox_offsets.cuda()

                # Encode labels
                N = img.shape[0]
                if bbox_offsets[-1].item() == 0:
                    print("No labels in batch")
                    continue
                bbox, label = C.box_encoder(N, bbox, bbox_offsets, label,
                                            encoder.dboxes.cuda(), 0.5)

                # Prepare tensors for computing loss
                M = bbox.shape[0] // N
                bbox = bbox.view(N, M, 4)
                label = label.view(N, M)
                trans_bbox = bbox.transpose(1, 2).contiguous()
                gloc = Variable(trans_bbox, requires_grad=False)
                glabel = Variable(label, requires_grad=False)

                if args.use_fp16:
                    img = img.half()

                for _ in range(args.fbu):
                    ploc, plabel = ssd300(img)
                    ploc, plabel = ploc.float(), plabel.float()
                    loss = loss_func(ploc, plabel, gloc, glabel)

        if epoch == 1 and args.local_rank == 0:
            # Wait for all queued GPU work so the elapsed time covers the
            # computation, not just the kernel launches.
            torch.cuda.synchronize()
            e = time.time()
            print("Performance achieved: {:.2f} img/sec".format(img_number /
                                                                (e - s)))

        dataloader.reset()
def train300_mlperf_coco(args):
    """Train SSD300 on COCO 2017 with mllog logging and gradient accumulation.

    Each loaded batch is split into ``args.batch_splits`` fragments whose
    weighted losses are back-propagated before a single optimizer step.
    Distributed mode (apex DDP) is switched on when CUDA is usable and the
    ``WORLD_SIZE`` environment variable is greater than 1.

    Args:
        args: Parsed options. Fields read here: ``no_cuda``, ``local_rank``,
            ``seed``, ``data``, ``num_cropping_iterations``, ``batch_size``,
            ``val_batch_size``, ``pretrained_backbone``, ``checkpoint``,
            ``lr``, ``batch_splits``, ``weight_decay``, ``iteration``,
            ``warmup``, ``warmup_factor``, ``lr_decay_schedule``, ``epochs``,
            ``log_interval``, ``val_epochs``, ``val_interval``, ``no_save``,
            ``threshold`` and ``nms_valid_thresh``. ``args.distributed`` and
            ``args.rank`` are written, and ``args.seed`` is overwritten in
            distributed mode.

    Returns:
        True once ``coco_eval`` (run on rank 0) returns a truthy value at a
        validation epoch; False if all epochs complete first.

    Raises:
        ImportError: CUDA is requested but apex cannot be imported.
    """
    # ``import torch.<submodule>`` below would otherwise make ``torch`` a
    # function-local name and break the earlier uses in this function.
    global torch
    from coco import COCO

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if use_cuda:
        # Only the import is guarded, so a malformed WORLD_SIZE surfaces as
        # its own ValueError rather than a misleading "install APEX" error.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    local_seed = args.seed
    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist

        if args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
    mllogger.event(key=mllog_const.SEED, value=local_seed)
    torch.manual_seed(local_seed)
    np.random.seed(seed=local_seed)

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(
        dboxes, (input_size, input_size),
        val=False,
        num_cropping_iterations=args.num_cropping_iterations)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    # Shuffle in the loader only when no sampler is supplied.
    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=(train_sampler is None),
                                  sampler=train_sampler,
                                  num_workers=4)
    # Only rank 0 evaluates, so only rank 0 needs a validation loader.
    if args.rank == 0:
        val_dataloader = DataLoader(
            val_coco,
            batch_size=args.val_batch_size or args.batch_size,
            shuffle=False,
            sampler=None,
            num_workers=4)
    else:
        val_dataloader = None

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        # Load onto the CPU first so the checkpoint restores on CPU-only runs;
        # the model is moved to the GPU below.
        od = torch.load(args.checkpoint,
                        map_location=lambda storage, loc: storage)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    N_gpu = torch.distributed.get_world_size() if args.distributed else 1

    # parallelize
    if args.distributed:
        ssd300 = DDP(ssd300)

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    # args.lr is specified for a global batch of 32 and scaled linearly.
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        print("using gradient accumulation with fragments of size {}".format(
            fragment_size))

    current_momentum = 0.9
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=args.weight_decay)
    ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr)
    ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay)

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb)
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor)
    ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_schedule)
    mllogger.start(key=mllog_const.BLOCK_START,
                   metadata={
                       mllog_const.FIRST_EPOCH_NUM: 1,
                       mllog_const.EPOCH_COUNT: args.epochs
                   })

    optim.zero_grad()
    for epoch in range(args.epochs):
        mllogger.start(key=mllog_const.EPOCH_START,
                       metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        for nbatch, (img, img_id, img_size, bbox,
                     label) in enumerate(train_dataloader):
            current_batch_size = img.shape[0]
            # Split batch for gradient accumulation
            img = torch.split(img, fragment_size)
            bbox = torch.split(bbox, fragment_size)
            label = torch.split(label, fragment_size)

            # Sum of the weighted fragment losses, i.e. the mean loss of the
            # whole batch.  Reporting only the last fragment's weighted loss
            # would understate it whenever batch_splits > 1.
            batch_loss = 0.0
            for (fimg, fbbox, flabel) in zip(img, bbox, label):
                current_fragment_size = fimg.shape[0]
                trans_bbox = fbbox.transpose(1, 2).contiguous()
                if use_cuda:
                    fimg = fimg.cuda()
                    trans_bbox = trans_bbox.cuda()
                    flabel = flabel.cuda()
                fimg = Variable(fimg, requires_grad=True)
                ploc, plabel = ssd300(fimg)
                gloc = Variable(trans_bbox, requires_grad=False)
                glabel = Variable(flabel, requires_grad=False)
                loss = loss_func(ploc, plabel, gloc, glabel)
                # weighted mean
                loss = loss * (current_fragment_size / current_batch_size)
                loss.backward()
                # Accumulate on-device; the scalar is read once per batch.
                batch_loss = batch_loss + loss.detach()

            warmup_step(iter_num, current_lr)
            optim.step()
            optim.zero_grad()

            # Keep Inf *and* NaN out of the running average, otherwise a
            # single NaN would stick forever.
            loss_value = batch_loss.item()
            if np.isfinite(loss_value):
                avg_loss = 0.999 * avg_loss + 0.001 * loss_value
            if args.rank == 0 and args.log_interval and not iter_num % args.log_interval:
                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"
                      .format(iter_num, loss_value, avg_loss))
            iter_num += 1

        if (args.val_epochs and (epoch + 1) in args.val_epochs) or \
           (args.val_interval and not (epoch + 1) % args.val_interval):
            if args.distributed:
                # Average the batch-norm running statistics over all ranks
                # before the model is saved / evaluated.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var' in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        # NOTE(review): this logs the averaged buffer itself
                        # under MODEL_BN_SPAN, which was already logged as
                        # the batch size above -- confirm this is intended.
                        ssd_print(key=mllog_const.MODEL_BN_SPAN, value=bn_buf)
            if args.rank == 0:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    # The checkpoint directory may not exist on a fresh run.
                    os.makedirs("./models", exist_ok=True)
                    torch.save(
                        {
                            "model": ssd300.state_dict(),
                            "label_map": train_coco.label_info
                        }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300,
                             val_dataloader,
                             cocoGt,
                             encoder,
                             inv_map,
                             args.threshold,
                             epoch + 1,
                             iter_num,
                             log_interval=args.log_interval,
                             nms_valid_thresh=args.nms_valid_thresh):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
            # Rank 0 is the only evaluator; share its verdict with the others.
            if args.distributed:
                dist.broadcast(success, 0)
            if success[0]:
                return True

        # Close the epoch opened by EPOCH_START at the top of this iteration.
        mllogger.end(key=mllog_const.EPOCH_STOP,
                     metadata={mllog_const.EPOCH_NUM: epoch})
    mllogger.end(key=mllog_const.BLOCK_STOP,
                 metadata={
                     mllog_const.FIRST_EPOCH_NUM: 1,
                     mllog_const.EPOCH_COUNT: args.epochs
                 })

    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO with a DALI input pipeline (optional fp16, DDP, JIT).

    Builds the COCO validation dataset and the DALI training pipeline,
    constructs the model, loss and SGD optimizer, then runs the training
    loop, evaluating at the iterations derived from ``args.evaluation``.

    Args:
        args: Namespace of run options. Attributes read here: ``no_cuda``,
            ``local_rank``, ``data``, ``batch_size``, ``eval_batch_size``,
            ``num_workers``, ``use_fp16``, ``nhwc``, ``pad_input``,
            ``backbone``, ``checkpoint``, ``delay_allreduce``, ``lr``,
            ``use_larc``, ``warmup``, ``jit``, ``evaluation``, ``iteration``,
            ``epochs``, ``profile``, ``decay1``, ``decay2``,
            ``print_interval``, ``no_save`` and ``threshold``.
            ``args.distributed`` is overwritten here from the ``WORLD_SIZE``
            environment variable.

    Returns:
        ``True`` as soon as ``coco_eval`` returns a truthy value at an
        evaluation point, ``False`` once all epochs finish without that, and
        ``None`` if the ``args.profile`` iteration is reached first.
    """
    from coco import COCO
    # Check that GPUs are actually available
    # NOTE(review): only the --no-cuda flag is consulted here; the model and
    # loss below are moved to CUDA unconditionally regardless of this value.
    use_cuda = not args.no_cuda

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    local_seed = set_seeds(args)

    # start timing here
    ssd_print(key=mlperf_log.RUN_START)

    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    # Setup data, defaults
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)

    if args.distributed:
        val_sampler = GeneralDistributedSampler(val_coco, pad=False)
    else:
        val_sampler = None

    train_pipe = COCOPipeline(args.batch_size,
                              args.local_rank,
                              train_coco_root,
                              train_annotate,
                              N_gpu,
                              num_threads=args.num_workers,
                              output_fp16=args.use_fp16,
                              output_nhwc=args.nhwc,
                              pad_output=args.pad_input,
                              seed=local_seed - 2**31)
    print_message(args.local_rank,
                  "time_check a: {secs:.9f}".format(secs=time.time()))
    train_pipe.build()
    print_message(args.local_rank,
                  "time_check b: {secs:.9f}".format(secs=time.time()))
    # The result is never read; the pipeline is run once before the iterator
    # is created.
    test_run = train_pipe.run()
    # NOTE(review): 118287 is presumably the train2017 image count, and true
    # division passes a float size to the iterator -- confirm both.
    train_loader = DALICOCOIterator(train_pipe, 118287 / N_gpu)

    val_dataloader = DataLoader(
        val_coco,
        batch_size=args.eval_batch_size,
        shuffle=False,  # Note: distributed sampler is shuffled :(
        sampler=val_sampler,
        num_workers=args.num_workers)
    ssd_print(key=mlperf_log.INPUT_ORDER)
    ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    # Build the model
    ssd300 = SSD300(val_coco.labelnum,
                    backbone=args.backbone,
                    use_nhwc=args.nhwc,
                    pad_input=args.pad_input)
    if args.checkpoint is not None:
        load_checkpoint(ssd300, args.checkpoint)

    ssd300.train()
    ssd300.cuda()
    loss_func = Loss(dboxes)
    loss_func.cuda()

    # Same computation as above, repeated; N_gpu ends up with the same value.
    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    if args.use_fp16:
        ssd300 = network_to_half(ssd300)

    # Parallelize. Need to do this after network_to_half.
    if args.distributed:
        if args.delay_allreduce:
            print_message(args.local_rank,
                          "Delaying allreduces to the end of backward()")
        ssd300 = DDP(ssd300,
                     delay_allreduce=args.delay_allreduce,
                     retain_allreduce_buffers=args.use_fp16)

    # Create optimizer. This must also be done after network_to_half.
    global_batch_size = (N_gpu * args.batch_size)

    # mlperf only allows base_lr scaled by an integer
    base_lr = 1e-3
    requested_lr_multiplier = args.lr / base_lr
    adjusted_multiplier = max(
        1, round(requested_lr_multiplier * global_batch_size / 32))

    current_lr = base_lr * adjusted_multiplier
    current_momentum = 0.9
    current_weight_decay = 5e-4
    static_loss_scale = 128.
    if args.use_fp16:
        if args.distributed and not args.delay_allreduce:
            # We can't create the flat master params yet, because we need to
            # imitate the flattened bucket structure that DDP produces.
            optimizer_created = False
        else:
            # One bucket of half-precision parameters and one bucket of
            # single-precision parameters.
            model_buckets = [
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.HalfTensor"
                ],
                [
                    p for p in ssd300.parameters()
                    if p.requires_grad and p.type() == "torch.cuda.FloatTensor"
                ]
            ]
            flat_master_buckets = create_flat_master(model_buckets)
            optim = torch.optim.SGD(flat_master_buckets,
                                    lr=current_lr,
                                    momentum=current_momentum,
                                    weight_decay=current_weight_decay)
            optimizer_created = True
    else:
        optim = torch.optim.SGD(ssd300.parameters(),
                                lr=current_lr,
                                momentum=current_momentum,
                                weight_decay=current_weight_decay)
        optimizer_created = True

    # Add LARC if desired
    # NOTE(review): when optimizer_created is False, `optim` has not been
    # assigned yet at this point -- confirm use_larc is never combined with
    # fp16 + distributed without delay_allreduce.
    if args.use_larc:
        optim = LARC(optim)

    ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay)
    if args.warmup is not None:
        ssd_print(key=mlperf_log.OPT_LR_WARMUP_STEPS, value=args.warmup)

    # Model is completely finished -- need to create separate copies, preserve parameters across
    # them, and jit
    ssd300_eval = SSD300(val_coco.labelnum,
                         backbone=args.backbone,
                         use_nhwc=args.nhwc,
                         pad_input=args.pad_input).cuda()
    if args.use_fp16:
        ssd300_eval = network_to_half(ssd300_eval)

    # Get the existing state from the train model
    # * if we use distributed, then we want .module
    train_model = ssd300.module if args.distributed else ssd300

    ssd300_eval.load_state_dict(train_model.state_dict())

    ssd300_eval.eval()

    if args.jit:
        # Example input used for tracing: 4 channels when the input is
        # padded, otherwise 3; channels-last layout when args.nhwc is set.
        input_c = 4 if args.pad_input else 3
        example_shape = [
            args.batch_size, 300, 300, input_c
        ] if args.nhwc else [args.batch_size, input_c, 300, 300]
        example_input = torch.randn(*example_shape).cuda()
        if args.use_fp16:
            example_input = example_input.half()

        # DDP has some Python-side control flow. If we JIT the entire DDP-wrapped module,
        # the resulting ScriptModule will elide this control flow, resulting in allreduce
        # hooks not being called. If we're running distributed, we need to extract and JIT
        # the wrapped .module.
        # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks
        # to go out of scope, and therefore silently disappear.
        module_to_jit = ssd300.module if args.distributed else ssd300
        if args.distributed:
            ssd300.module = torch.jit.trace(module_to_jit, example_input)
        else:
            ssd300 = torch.jit.trace(module_to_jit, example_input)

    print_message(args.local_rank, "epoch", "nbatch", "loss")
    # args.evaluation is rescaled by 32 / global_batch_size and truncated to
    # integer iteration numbers.
    eval_points = np.array(args.evaluation) * 32 / global_batch_size
    eval_points = list(map(int, list(eval_points)))

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    start_elapsed_time = time.time()
    last_printed_iter = args.iteration  # written below but never read
    num_elapsed_samples = 0

    # Generate normalization tensors
    mean, std = generate_mean_std(args)

    def step_maybe_fp16_maybe_distributed(optim):
        """Run one optimizer step, handling fp16 master parameters if enabled.

        With fp16, gradients are first copied into the flat fp32 master
        buckets (from the DDP allreduce buffers when distributed, otherwise
        flattened from the model gradients) and divided by the static loss
        scale; after the step the master values are copied back into the
        model parameters.
        """
        if args.use_fp16:
            if args.distributed:
                for flat_master, allreduce_buffer in zip(
                        flat_master_buckets, ssd300.allreduce_buffers):
                    if allreduce_buffer is None:
                        raise RuntimeError("allreduce_buffer is None")
                    flat_master.grad = allreduce_buffer.float()
                    flat_master.grad.data.mul_(1. / static_loss_scale)
            else:
                for flat_master, model_bucket in zip(flat_master_buckets,
                                                     model_buckets):
                    flat_grad = apex_C.flatten(
                        [m.grad.data for m in model_bucket])
                    flat_master.grad = flat_grad.float()
                    flat_master.grad.data.mul_(1. / static_loss_scale)
        optim.step()
        if args.use_fp16:
            # Copy the updated master weights back into the model parameters.
            for model_bucket, flat_master in zip(model_buckets,
                                                 flat_master_buckets):
                for model, master in zip(
                        model_bucket,
                        apex_C.unflatten(flat_master.data, model_bucket)):
                    model.data.copy_(master.data)

    ssd_print(key=mlperf_log.TRAIN_LOOP)

    for epoch in range(args.epochs):
        ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)

        # Drop gradients by setting them to None.
        for p in ssd300.parameters():
            p.grad = None

        for i, data in enumerate(train_loader):
            # Unpack the DALI output: image, boxes, labels, box offsets.
            img = data[0][0][0]
            bbox = data[0][1][0]
            label = data[0][2][0]
            label = label.type(torch.cuda.LongTensor)
            bbox_offsets = data[0][3][0]

            # handle random flipping outside of DALI for now
            bbox_offsets = bbox_offsets.cuda()
            img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5,
                                            args.nhwc)
            img.sub_(mean).div_(std)

            # Profiling runs stop at the requested iteration; this path
            # returns None rather than a bool.
            if args.profile is not None and iter_num == args.profile:
                return
            if args.warmup is not None and optimizer_created:
                lr_warmup(optim, args.warmup, iter_num, epoch, current_lr,
                          args)
            # The two decay points are given in thousands of iterations at
            # batch size 32 and rescaled to the global batch size.
            if iter_num == ((args.decay1 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #1")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
            if iter_num == ((args.decay2 * 1000 * 32) // global_batch_size):
                print_message(args.local_rank, "lr decay step #2")
                current_lr *= 0.1
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if use_cuda:
                img = img.cuda()  # NHWC direct from DALI now if necessary
                bbox = bbox.cuda()
                label = label.cuda()
                bbox_offsets = bbox_offsets.cuda()

            # Now run the batched box encoder
            N = img.shape[0]
            # A zero final offset is treated as a batch with no boxes; the
            # batch is skipped and iter_num is not advanced.
            if bbox_offsets[-1].item() == 0:
                print("No labels in batch")
                continue
            bbox, label = C.box_encoder(N, bbox, bbox_offsets, label,
                                        encoder.dboxes.cuda(), 0.5)

            # output is ([N*8732, 4], [N*8732], need [N, 8732, 4], [N, 8732] respectively
            M = bbox.shape[0] // N
            bbox = bbox.view(N, M, 4)
            label = label.view(N, M)
            # print(img.shape, bbox.shape, label.shape)

            ploc, plabel = ssd300(img)
            ploc, plabel = ploc.float(), plabel.float()

            trans_bbox = bbox.transpose(1, 2).contiguous().cuda()
            label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            # Exponential moving average of the loss; infinite values are
            # left out.
            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()

            num_elapsed_samples += N
            if args.local_rank == 0 and iter_num % args.print_interval == 0:
                end_elapsed_time = time.time()
                elapsed_time = end_elapsed_time - start_elapsed_time

                # Local sample count scaled by the number of GPUs.
                avg_samples_per_sec = num_elapsed_samples * N_gpu / elapsed_time

                print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\
                            .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n")

                last_printed_iter = iter_num
                start_elapsed_time = time.time()
                num_elapsed_samples = 0

            # loss scaling
            if args.use_fp16:
                loss = loss * static_loss_scale
            loss.backward()

            if not optimizer_created:
                # Imitate the model bucket structure created by DDP.
                # These will already be split by type (float or half).
                model_buckets = []
                for bucket in ssd300.active_i_buckets:
                    model_buckets.append([])
                    for active_i in bucket:
                        model_buckets[-1].append(
                            ssd300.active_params[active_i])
                flat_master_buckets = create_flat_master(model_buckets)
                optim = torch.optim.SGD(flat_master_buckets,
                                        lr=current_lr,
                                        momentum=current_momentum,
                                        weight_decay=current_weight_decay)
                optimizer_created = True
                # Skip this first iteration because flattened allreduce buffers are not yet created.
                # step_maybe_fp16_maybe_distributed(optim)
            else:
                step_maybe_fp16_maybe_distributed(optim)

            # Likely a decent skew here, let's take this opportunity to set the gradients to None.
            # After DALI integration, playing with the placement of this is worth trying.
            for p in ssd300.parameters():
                p.grad = None

            if iter_num in eval_points:
                if args.local_rank == 0:
                    if not args.no_save:
                        print("saving model...")
                        torch.save(
                            {
                                "model": ssd300.state_dict(),
                                "label_map": val_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))

                # Get the existing state from the train model
                # * if we use distributed, then we want .module
                train_model = ssd300.module if args.distributed else ssd300

                ssd300_eval.load_state_dict(train_model.state_dict())

                # A truthy result from coco_eval ends training.
                if coco_eval(
                        ssd300_eval,
                        val_dataloader,
                        cocoGt,
                        encoder,
                        inv_map,
                        args.threshold,
                        epoch,
                        iter_num,
                        args.eval_batch_size,
                        use_fp16=args.use_fp16,
                        local_rank=args.local_rank if args.distributed else -1,
                        N_gpu=N_gpu,
                        use_nhwc=args.nhwc,
                        pad_input=args.pad_input):
                    return True

            iter_num += 1

        train_loader.reset()
    return False
def train300_mlperf_coco(args):
    """Train SSD300 on COCO on CPU, CUDA (apex DDP) or Habana HPU.

    Sets up the device and the optional distributed process group, builds the
    COCO datasets and loaders, the model, loss and SGD optimizer, then runs
    the epoch loop with gradient accumulation over batch fragments.
    Validation, and checkpointing unless ``args.no_save`` is set, happen on
    rank 0 at the epochs selected by ``args.val_epochs`` /
    ``args.val_interval``.

    Args:
        args: Namespace of run options. Attributes read here: ``no_cuda``,
            ``use_hpu``, ``hpu_channels_last``, ``hpu_lazy_mode``,
            ``is_hmp``, ``hmp_bf16``, ``hmp_fp32``, ``hmp_opt_level``,
            ``hmp_verbose``, ``seed``, ``local_rank``, ``data``,
            ``num_cropping_iterations``, ``batch_size``, ``val_batch_size``,
            ``num_workers``, ``pretrained_backbone``, ``checkpoint``, ``lr``,
            ``batch_splits``, ``weight_decay``, ``iteration``,
            ``end_iteration``, ``warmup``, ``warmup_factor``,
            ``lr_decay_schedule``, ``epochs``, ``lowp``, ``log_interval``,
            ``val_epochs``, ``val_interval``, ``no_save``, ``threshold`` and
            ``nms_valid_thresh``. ``args.distributed`` and ``args.rank`` are
            always written; ``args.world_size``, ``args.dist_backend`` and
            ``args.seed`` are written on some paths.

    Returns:
        ``True`` once the success flag is set (rank 0's ``coco_eval`` returned
        a truthy value; the flag is broadcast when distributed), ``False``
        after all epochs complete without that.

    Raises:
        ImportError: CUDA is in use and apex cannot be imported.
        IOError: HMP is enabled without the BF16 / FP32 op list files.
        AssertionError: ``--batch-size`` is not divisible by
            ``--batch-splits``, or ``--end-iteration`` is not greater than
            ``--iteration``.
    """
    # The function-level `import torch.utils.data.distributed` below would
    # otherwise make `torch` a local name for the whole function.
    global torch
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    args.distributed = False
    if args.use_hpu:
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1
            args.world_size = int(os.environ['WORLD_SIZE'])
            print("world_size = {}".format(args.world_size))
            print("distributed={}".format(args.distributed))
    if use_cuda:
        # Only the import is guarded, so a malformed WORLD_SIZE is no longer
        # reported as a missing APEX install, and the original import error
        # is kept as the cause.
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError as err:
            raise ImportError(
                "Please install APEX from https://github.com/nvidia/apex"
            ) from err
        if 'WORLD_SIZE' in os.environ:
            args.distributed = int(os.environ['WORLD_SIZE']) > 1

    use_hpu = args.use_hpu
    hpu_channels_last = args.hpu_channels_last
    hpu_lazy_mode = args.hpu_lazy_mode
    is_hmp = args.is_hmp
    device = torch.device('cpu')
    data_loader_type = DataLoader
    if use_hpu:
        device = torch.device('hpu')
        if args.distributed:
            os.environ["MAX_WAIT_ATTEMPTS"] = "90"
        if hpu_lazy_mode:
            os.environ["PT_HPU_LAZY_MODE"] = "1"
        else:
            os.environ["PT_HPU_LAZY_MODE"] = "2"
        if is_hmp:
            if not args.hmp_bf16:
                raise IOError("Please provide list of BF16 ops")
            if not args.hmp_fp32:
                raise IOError("Please provide list of FP32 ops")
            from habana_frameworks.torch.hpex import hmp
            hmp.convert(opt_level=args.hmp_opt_level,
                        bf16_file_path=args.hmp_bf16,
                        fp32_file_path=args.hmp_fp32,
                        isVerbose=args.hmp_verbose)
        from habana_frameworks.torch.utils.library_loader import load_habana_module
        load_habana_module()
        # TODO - add dataloader

    local_seed = args.seed
    if args.distributed:
        # necessary pytorch imports
        import torch.utils.data.distributed
        import torch.distributed as dist
        if use_hpu:
            args.dist_backend = 'hccl'
            import habana_frameworks.torch.core.hccl
            os.environ["ID"] = os.environ["RANK"]
            dist.init_process_group(args.dist_backend, init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device, use_hpu=True)
            local_seed = (args.seed + dist.get_rank()) % 2**32
        elif args.no_cuda:
            device = torch.device('cpu')
        else:
            torch.cuda.set_device(args.local_rank)
            device = torch.device('cuda')
            dist.init_process_group(backend='nccl', init_method='env://')
            # set seeds properly
            args.seed = broadcast_seeds(args.seed, device)
            local_seed = (args.seed + dist.get_rank()) % 2**32
    mllogger.event(key=mllog_const.SEED, value=local_seed)
    torch.manual_seed(local_seed)
    np.random.seed(seed=local_seed)
    random.seed(local_seed)  # amorgenstern
    torch.cuda.manual_seed(local_seed)  # amorgenstern

    args.rank = dist.get_rank() if args.distributed else args.local_rank
    print("args.rank = {}".format(args.rank))
    print("local rank = {}".format(args.local_rank))
    print("distributed={}".format(args.distributed))

    # With HMP on HPU, the setup objects below are built inside
    # hmp.disable_casts(); otherwise the same calls run directly.
    if use_hpu and is_hmp:
        with hmp.disable_casts():
            dboxes = dboxes300_coco()
            encoder = Encoder(dboxes)
    else:
        dboxes = dboxes300_coco()
        encoder = Encoder(dboxes)

    input_size = 300
    if use_hpu and is_hmp:
        with hmp.disable_casts():
            train_trans = SSDTransformer(
                dboxes, (input_size, input_size),
                val=False,
                num_cropping_iterations=args.num_cropping_iterations)
            val_trans = SSDTransformer(dboxes, (input_size, input_size),
                                       val=True)
    else:
        train_trans = SSDTransformer(
            dboxes, (input_size, input_size),
            val=False,
            num_cropping_iterations=args.num_cropping_iterations)
        val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data,
                                "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data,
                                  "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    if use_hpu and is_hmp:
        with hmp.disable_casts():
            cocoGt = COCO(annotation_file=val_annotate)
            train_coco = COCODetection(train_coco_root, train_annotate,
                                       train_trans)
            val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    else:
        cocoGt = COCO(annotation_file=val_annotate)
        train_coco = COCODetection(train_coco_root, train_annotate,
                                   train_trans)
        val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco))
    mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_coco)
    else:
        train_sampler = None
    if use_hpu:
        # patch torch cuda functions that are being unconditionally invoked
        # in the multiprocessing data loader
        torch.cuda.current_device = lambda: None
        torch.cuda.set_device = lambda x: None
    train_dataloader = data_loader_type(train_coco,
                                        batch_size=args.batch_size,
                                        shuffle=(train_sampler is None),
                                        sampler=train_sampler,
                                        num_workers=args.num_workers)
    # set shuffle=True in DataLoader
    # Only rank 0 evaluates, so only rank 0 gets a validation loader.
    if args.rank == 0:
        val_dataloader = data_loader_type(
            val_coco,
            batch_size=args.val_batch_size or args.batch_size,
            shuffle=False,
            sampler=None,
            num_workers=args.num_workers)
    else:
        val_dataloader = None

    ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint, map_location=torch.device('cpu'))
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    if use_hpu and is_hmp:
        with hmp.disable_casts():
            loss_func = Loss(dboxes, use_hpu=use_hpu, hpu_device=device)
    else:
        loss_func = Loss(dboxes, use_hpu=use_hpu, hpu_device=device)
    if use_cuda:
        loss_func.cuda()

    if use_hpu:
        ssd300.to(device)
        loss_func.to(device)

    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    global_batch_size = N_gpu * args.batch_size
    mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size)
    # Reference doesn't support group batch norm, so bn_span==local_batch_size
    mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size)
    # The learning rate is scaled linearly with the global batch size,
    # relative to a batch of 32.
    current_lr = args.lr * (global_batch_size / 32)

    assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits"
    fragment_size = args.batch_size // args.batch_splits
    if args.batch_splits != 1:
        print("using gradient accumulation with fragments of size {}".format(
            fragment_size))

    current_momentum = 0.9
    sgd_optimizer = torch.optim.SGD
    if use_hpu and hpu_lazy_mode:
        from habana_frameworks.torch.hpex.optimizers import FusedSGD
        sgd_optimizer = FusedSGD
    optim = sgd_optimizer(ssd300.parameters(),
                          lr=current_lr,
                          momentum=current_momentum,
                          weight_decay=args.weight_decay)
    if use_hpu:
        permute_params(model=ssd300,
                       to_filters_last=True,
                       lazy_mode=hpu_lazy_mode)
        permute_momentum(optimizer=optim,
                         to_filters_last=True,
                         lazy_mode=hpu_lazy_mode)

    ssd_print(device=device,
              use_hpu=use_hpu,
              key=mllog_const.OPT_BASE_LR,
              value=current_lr)
    ssd_print(device=device,
              use_hpu=use_hpu,
              key=mllog_const.OPT_WEIGHT_DECAY,
              value=args.weight_decay)

    # parallelize
    if args.distributed:
        if use_hpu:
            ssd300 = torch.nn.parallel.DistributedDataParallel(
                ssd300,
                bucket_cap_mb=100,
                broadcast_buffers=False,
                gradient_as_bucket_view=True)
        else:
            ssd300 = DDP(ssd300)

    iter_num = args.iteration
    end_iter_num = args.end_iteration
    if end_iter_num:
        print("--end-iteration set to: {}".format(end_iter_num))
        assert end_iter_num > iter_num, "--end-iteration must have a value > --iteration"
    avg_loss = 0.0
    if use_hpu:
        # Per-iteration losses are kept as tensors and only read back at
        # logging time.
        loss_iter = list()
    inv_map = {v: k for k, v in val_coco.label_map.items()}
    # One-element flag tensor so the result can be broadcast from rank 0.
    success = torch.zeros(1)
    if use_cuda:
        success = success.cuda()
    if use_hpu:
        success = success.to(device)

    if args.warmup:
        nonempty_imgs = len(train_coco)
        wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size))
        ssd_print(device=device,
                  use_hpu=use_hpu,
                  key=mllog_const.OPT_LR_WARMUP_STEPS,
                  value=wb)
        warmup_step = lambda iter_num, current_lr: lr_warmup(
            optim, wb, iter_num, current_lr, args)
    else:
        warmup_step = lambda iter_num, current_lr: None

    ssd_print(device=device,
              use_hpu=use_hpu,
              key=mllog_const.OPT_LR_WARMUP_FACTOR,
              value=args.warmup_factor)
    ssd_print(device=device,
              use_hpu=use_hpu,
              key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=args.lr_decay_schedule)
    mllogger.start(key=mllog_const.BLOCK_START,
                   metadata={
                       mllog_const.FIRST_EPOCH_NUM: 1,
                       mllog_const.EPOCH_COUNT: args.epochs
                   })

    optim.zero_grad(set_to_none=True)
    if use_hpu:
        start = time.time()
    for epoch in range(args.epochs):
        mllogger.start(key=mllog_const.EPOCH_START,
                       metadata={mllog_const.EPOCH_NUM: epoch})
        # set the epoch for the sampler
        if args.distributed:
            train_sampler.set_epoch(epoch)

        if epoch in args.lr_decay_schedule:
            current_lr *= 0.1
            print("")
            print("lr decay step #{num}".format(
                num=args.lr_decay_schedule.index(epoch) + 1))
            for param_group in optim.param_groups:
                param_group['lr'] = current_lr

        for nbatch, (img, img_id, img_size, bbox,
                     label) in enumerate(train_dataloader):
            current_batch_size = img.shape[0]
            # Split batch for gradient accumulation
            img = torch.split(img, fragment_size)
            bbox = torch.split(bbox, fragment_size)
            label = torch.split(label, fragment_size)

            for (fimg, fbbox, flabel) in zip(img, bbox, label):
                current_fragment_size = fimg.shape[0]
                if not use_hpu:
                    trans_bbox = fbbox.transpose(1, 2).contiguous()
                if use_cuda:
                    fimg = fimg.cuda()
                    trans_bbox = trans_bbox.cuda()
                    flabel = flabel.cuda()
                if use_hpu:
                    fimg = fimg.to(device)
                    if hpu_channels_last:
                        fimg = fimg.contiguous(
                            memory_format=torch.channels_last)
                        if hpu_lazy_mode:
                            mark_step()
                    if is_hmp:
                        with hmp.disable_casts():
                            #TODO revert after SW-58188 is fixed
                            trans_bbox = fbbox.to(device).transpose(
                                1, 2).contiguous()
                            flabel = flabel.to(device)
                    else:
                        #TODO revert after SW-58188 is fixed
                        trans_bbox = fbbox.to(device).transpose(
                            1, 2).contiguous()
                        flabel = flabel.to(device)
                fimg = Variable(fimg, requires_grad=True)
                if args.lowp:  # amorgenstern
                    import lowp
                    with lowp.Lowp(mode='BF16',
                                   warn_patched=True,
                                   warn_not_patched=True):
                        ploc, plabel = ssd300(fimg)
                        gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                                       Variable(flabel, requires_grad=False)
                        loss = loss_func(ploc, plabel, gloc, glabel)
                else:
                    ploc, plabel = ssd300(fimg)
                    if use_hpu and is_hmp:
                        with hmp.disable_casts():
                            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                                           Variable(flabel, requires_grad=False)
                            loss = loss_func(ploc.float(), plabel.float(),
                                             gloc, glabel)
                    else:
                        gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                                       Variable(flabel, requires_grad=False)
                        loss = loss_func(ploc, plabel, gloc, glabel)
                # Weight each fragment by its share of the batch so the
                # accumulated gradient is the weighted mean.
                loss = loss * (current_fragment_size / current_batch_size)
                if use_hpu and hpu_lazy_mode and args.distributed:
                    mark_step()
                loss.backward()
                if use_hpu and hpu_lazy_mode:
                    mark_step()

            warmup_step(iter_num, current_lr)
            if use_hpu and is_hmp:
                with hmp.disable_casts():
                    optim.step()
            else:
                optim.step()
            optim.zero_grad(set_to_none=True)
            if use_hpu:
                loss_iter.append(loss.clone().detach())
            else:
                if not np.isinf(loss.item()):
                    avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            if use_hpu and hpu_lazy_mode:
                mark_step()
            if use_hpu:
                if args.log_interval and not iter_num % args.log_interval:
                    # Fold every loss stored since the last log into the
                    # running average; cur_loss ends as the latest one.
                    cur_loss = 0.0
                    for x in loss_iter:
                        cur_loss = x.cpu().item()
                        if not np.isinf(cur_loss):
                            avg_loss = 0.999 * avg_loss + 0.001 * cur_loss
                    if args.rank == 0:
                        print("Rank: {:6d}, Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                                    .format(args.rank, iter_num, cur_loss, avg_loss))
                    loss_iter = list()
            else:
                if args.rank == 0 and args.log_interval and not iter_num % args.log_interval:
                    print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                                .format(iter_num, loss.item(), avg_loss))
            iter_num += 1
            # The HPU timer is restarted once iter_num reaches 50.
            if use_hpu and iter_num == 50:
                start = time.time()
            # NOTE(review): this break leaves only the batch loop; the
            # validation check and later epochs still run -- confirm that
            # is the intended --end-iteration behaviour.
            if end_iter_num and iter_num >= end_iter_num:
                if use_hpu:
                    print("Training Ended, total time: {:.2f} s".format(
                        time.time() - start))
                break

        if (args.val_epochs and (epoch+1) in args.val_epochs) or \
           (args.val_interval and not (epoch+1) % args.val_interval):
            if args.distributed:
                # Average the batch-norm running statistics across ranks.
                world_size = float(dist.get_world_size())
                for bn_name, bn_buf in ssd300.module.named_buffers(
                        recurse=True):
                    if ('running_mean' in bn_name) or ('running_var'
                                                       in bn_name):
                        dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
                        bn_buf /= world_size
                        ssd_print(device=device,
                                  use_hpu=use_hpu,
                                  key=mllog_const.MODEL_BN_SPAN,
                                  value=bn_buf)
            if args.rank == 0:
                if use_hpu:
                    print("Training Ended, total time: {:.2f} s".format(
                        time.time() - start))
                if not args.no_save:
                    print("")
                    print("saving model...")
                    if use_hpu:
                        # Permute parameters out of filters-last layout,
                        # snapshot them into a fresh model for saving, then
                        # permute back to continue training.
                        permute_params(model=ssd300,
                                       to_filters_last=False,
                                       lazy_mode=hpu_lazy_mode)
                        ssd300_copy = SSD300(
                            train_coco.labelnum,
                            model_path=args.pretrained_backbone)
                        if args.distributed:
                            ssd300_copy.load_state_dict(
                                ssd300.module.state_dict())
                        else:
                            ssd300_copy.load_state_dict(ssd300.state_dict())
                        torch.save(
                            {
                                "model": ssd300_copy.state_dict(),
                                "label_map": train_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))
                        permute_params(model=ssd300,
                                       to_filters_last=True,
                                       lazy_mode=hpu_lazy_mode)
                    else:
                        torch.save(
                            {
                                "model": ssd300.state_dict(),
                                "label_map": train_coco.label_info
                            }, "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300,
                             val_dataloader,
                             cocoGt,
                             encoder,
                             inv_map,
                             args.threshold,
                             epoch + 1,
                             iter_num,
                             log_interval=args.log_interval,
                             use_cuda=use_cuda,
                             use_hpu=use_hpu,
                             hpu_device=device,
                             is_hmp=is_hmp,
                             hpu_channels_last=hpu_channels_last,
                             hpu_lazy_mode=hpu_lazy_mode,
                             nms_valid_thresh=args.nms_valid_thresh):
                    success = torch.ones(1)
                    if use_cuda:
                        success = success.cuda()
                    if use_hpu:
                        success = success.to(device)
            # Every rank receives rank 0's flag so all of them stop together.
            if args.distributed:
                dist.broadcast(success, 0)
            if success[0]:
                return True
        mllogger.end(key=mllog_const.EPOCH_STOP,
                     metadata={mllog_const.EPOCH_NUM: epoch})
    mllogger.end(key=mllog_const.BLOCK_STOP,
                 metadata={
                     mllog_const.FIRST_EPOCH_NUM: 1,
                     mllog_const.EPOCH_COUNT: args.epochs
                 })

    return False