def eval_ssd300_mlperf_coco(args):
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    val_trans = SSDTransformer(dboxes, (300, 300), val=True)
    val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    ssd300 = SSD300(val_coco.labelnum)
    print("loading model checkpoint", args.checkpoint)
    od = torch.load(args.checkpoint, map_location=lambda storage, loc: storage)
    ssd300.load_state_dict(od["model"])

    if use_cuda:
        ssd300.cuda(args.device)
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda(args.device)

    coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
              args.threshold, args.device)
def test300():
    figsize = 300
    feat_size = [38, 19, 10, 5, 3, 1]
    steps = [8, 16, 32, 64, 100, 300]
    scales = [30, 60, 111, 162, 213, 264, 315]
    aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
    dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
    print(dboxes().shape)

    img = torch.randn(1, 3, 300, 300)
    model = SSD300(21)
    loc, conf = model(img)
    print(loc.shape, conf.shape)
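# The 8732-anchor count that dboxes().shape reports above follows directly from
# the feature-map sizes and aspect-ratio lists: maps with one extra aspect ratio
# get 4 boxes per cell, maps with two extras get 6. A small sanity-check sketch
# (the helper name below is illustrative, not part of the repository):
def count_default_boxes(feat_size, aspect_ratios):
    total = 0
    for fs, ars in zip(feat_size, aspect_ratios):
        boxes_per_cell = 2 + 2 * len(ars)  # 2 "square" boxes + one ratio/reciprocal pair per extra ratio
        total += fs * fs * boxes_per_cell
    return total

# 38^2*4 + 19^2*6 + 10^2*6 + 5^2*6 + 3^2*4 + 1^2*4 = 8732
print(count_default_boxes([38, 19, 10, 5, 3, 1],
                          [[2], [2, 3], [2, 3], [2, 3], [2], [2]]))  # -> 8732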
def main():
    # Parse arguments
    args = parse_args()

    # Get category names
    with open(args.annotations, 'r') as anno:
        js = json.loads(anno.read())
    coco_names = js['categories']

    # Prepare map of COCO labels to COCO names
    name_map = {}
    for name in coco_names:
        name_map[name['id']] = name['name']

    # Prepare map of SSD to COCO labels
    deleted = [12, 26, 29, 30, 45, 66, 68, 69, 71, 83]
    inv_map = {}
    cnt = 0
    for i in range(1, 81):
        while i + cnt in deleted:
            cnt += 1
        inv_map[i] = i + cnt

    # Prepare colors for categories
    category_id_to_color = dict([
        (cat_id, [random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)])
        for cat_id in range(1, 91)
    ])

    # Set matplotlib figure size
    plt.rcParams["figure.figsize"] = (12, 8)

    # Build and load SSD model
    ssd300 = SSD300(81, backbone="resnet34", model_path=None, dilation=None)
    load_checkpoint(ssd300, args.model)
    ssd300.eval()

    # Prepare encoder
    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    # Print images
    for image in args.images:
        print_image(image, ssd300, encoder, inv_map, name_map,
                    category_id_to_color, args.threshold)
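# The `deleted` list above holds the ten COCO category IDs that carry no detection
# annotations, so the 80 contiguous SSD labels are spread over the 1-90 ID range.
# A quick standalone check of that construction (sketch, no repository code needed):
deleted = [12, 26, 29, 30, 45, 66, 68, 69, 71, 83]
inv_map = {}
cnt = 0
for i in range(1, 81):
    while i + cnt in deleted:
        cnt += 1
    inv_map[i] = i + cnt

# 80 distinct COCO IDs, none of them deleted, and the largest mapped ID is 90.
assert len(set(inv_map.values())) == 80
assert not set(inv_map.values()) & set(deleted)
assert max(inv_map.values()) == 90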
def run_eval(args):
    args = setup_distributed(args)

    from pycocotools.coco import COCO
    local_seed = args.local_seed

    encoder = build_ssd300_coder()

    val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    cocoGt = COCO(annotation_file=val_annotate)

    val_loader, inv_map = build_pipeline(args, training=False)

    model_options = {
        'use_nhwc': args.nhwc,
        'pad_input': args.pad_input,
        'bn_group': args.bn_group,
        'pretrained': False,
    }

    ssd300_eval = SSD300(args, args.num_classes, **model_options).cuda()

    if args.use_fp16:
        convert_network(ssd300_eval, torch.half)
    ssd300_eval.eval()

    if args.checkpoint is not None:
        load_checkpoint(ssd300_eval, args.checkpoint)

    evaluator = AsyncEvaluator(num_threads=1)
    coco_eval(args,
              ssd300_eval,
              val_loader,
              cocoGt,
              encoder,
              inv_map,
              0,  # epoch
              0,  # iter_num
              evaluator=evaluator)
    res = evaluator.task_result(0)
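# AsyncEvaluator above lets the (slow) pycocotools scoring run off the critical
# path and be collected later via task_result(). A minimal sketch of that idea
# built on concurrent.futures; the repository's actual AsyncEvaluator is more
# featureful and its interface may differ.
from concurrent.futures import ThreadPoolExecutor

class SimpleAsyncEvaluator:
    """Run evaluation callables in a background thread, keyed by a tag."""
    def __init__(self, num_threads=1):
        self._pool = ThreadPoolExecutor(max_workers=num_threads)
        self._futures = {}

    def submit_task(self, tag, fn, *args, **kwargs):
        self._futures[tag] = self._pool.submit(fn, *args, **kwargs)

    def task_result(self, tag):
        # Blocks until the tagged task finishes, then returns its value.
        return self._futures.pop(tag).result()

# Usage sketch (run_coco_scoring and detections are hypothetical placeholders):
# evaluator = SimpleAsyncEvaluator(num_threads=1)
# evaluator.submit_task(0, run_coco_scoring, detections)
# ...keep training or running inference...
# mAP = evaluator.task_result(0)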
def train300_mlperf_coco(args): global torch from coco import COCO # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() args.distributed = False if use_cuda: try: from apex.parallel import DistributedDataParallel as DDP if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 except: raise ImportError( "Please install APEX from https://github.com/nvidia/apex") local_seed = args.seed os.environ['USE_CUDA'] = str(use_cuda) if args.world_size > 1: args.distributed = True if args.distributed: # necessary pytorch imports import torch.utils.data.distributed import torch.distributed as dist print('Distributed training with DDP') if args.no_cuda: device = torch.device('cpu') os.environ['RANK'] = str(os.environ.get('PMI_RANK', args.rank)) os.environ['WORLD_SIZE'] = str( os.environ.get('PMI_SIZE', args.world_size)) os.environ['MASTER_ADDR'] = args.master_addr os.environ['MASTER_PORT'] = args.port # Initialize the process group with ccl backend if args.backend == 'ccl': import torch_ccl dist.init_process_group(backend=args.backend) else: torch.cuda.set_device(args.local_rank) device = torch.device('cuda') dist.init_process_group(backend='nccl', init_method='env://') # set seeds properly args.seed = broadcast_seeds(args.seed, device) local_seed = (args.seed + dist.get_rank()) % 2**32 mllogger.event(key=mllog_const.SEED, value=local_seed) # Refer to https://pytorch.org/docs/stable/notes/randomness.html#dataloader torch.manual_seed(local_seed) # Set PyTorch seed np.random.seed(seed=local_seed) # Set Numpy seed random.seed(local_seed) # Set the Python seed args.rank = dist.get_rank() if args.distributed else args.local_rank print("args.rank = {}".format(args.rank)) print("local rank = {}".format(args.local_rank)) print("distributed={}".format(args.distributed)) dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 train_trans = SSDTransformer( dboxes, (input_size, input_size), val=False, num_cropping_iterations=args.num_cropping_iterations) val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") cocoGt = COCO(annotation_file=val_annotate) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco)) mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco)) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_coco) else: train_sampler = None train_dataloader = DataLoader(train_coco, batch_size=args.batch_size, shuffle=(train_sampler is None), sampler=train_sampler, num_workers=0) # set shuffle=True in DataLoader # Leslie: here is the workaround: dist.broadcast will fail on other rank. 
we will run evalution on all the ranks val_dataloader = DataLoader(val_coco, batch_size=args.val_batch_size or args.batch_size, shuffle=False, sampler=None, num_workers=0) ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone) ssd300.train() if use_cuda: ssd300.cuda() loss_func = Loss(dboxes) if use_cuda: loss_func.cuda() if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 global_batch_size = N_gpu * args.batch_size mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size) # Reference doesn't support group batch norm, so bn_span==local_batch_size mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size) current_lr = args.lr * (global_batch_size / 32) assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits" fragment_size = args.batch_size // args.batch_splits if args.batch_splits != 1: print("using gradient accumulation with fragments of size {}".format( fragment_size)) # Model to NHWC ssd300 = ssd300.to(memory_format=torch.channels_last) current_momentum = 0.9 optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=args.weight_decay) ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr) ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay) iter_num = args.iteration avg_loss = 0.0 inv_map = {v: k for k, v in val_coco.label_map.items()} success = torch.zeros(1) if use_cuda: success = success.cuda() if args.warmup: nonempty_imgs = len(train_coco) wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size)) ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb) warmup_step = lambda iter_num, current_lr: lr_warmup( optim, wb, iter_num, current_lr, args) else: warmup_step = lambda iter_num, current_lr: None ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor) ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=args.lr_decay_schedule) mllogger.start(key=mllog_const.BLOCK_START, metadata={ mllog_const.FIRST_EPOCH_NUM: 1, mllog_const.EPOCH_COUNT: args.epochs }) if args.performance_only: train_time = AverageMeter('TrainTime', ':6.3f') progress = ProgressMeter(args.train_iteration, [train_time], prefix='Train: ') # Restore the model and optim from checkpoint if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint) ssd300.load_state_dict(od["model"]) optim.load_state_dict(od['optim']) # Model Prepack if use_ipex: if args.autocast: ssd300, optim = ipex.optimize(ssd300, dtype=torch.bfloat16, optimizer=optim) else: ssd300, optim = ipex.optimize(ssd300, dtype=torch.float32, optimizer=optim) # parallelize if args.distributed: device_ids = None ssd300 = torch.nn.parallel.DistributedDataParallel( ssd300, device_ids=device_ids) optim.zero_grad(set_to_none=True) for epoch in range(args.epochs): mllogger.start(key=mllog_const.EPOCH_START, metadata={mllog_const.EPOCH_NUM: epoch}) # set the epoch for the sampler if args.distributed: train_sampler.set_epoch(epoch) if epoch in args.lr_decay_schedule: current_lr *= 0.1 print("") print("lr decay step #{num}".format( num=args.lr_decay_schedule.index(epoch) + 1)) for param_group in optim.param_groups: param_group['lr'] = current_lr for nbatch, (img, img_id, img_size, bbox, label) in enumerate(train_dataloader): naive_train_case = True # img.shape[0] == fragment_size if naive_train_case: # Naive train case fimg, gloc, glabel, mask, pos_num, neg_num, num_mask = data_preprocess( img, bbox, label, 
loss_func, args.autocast) if args.performance_only and iter_num >= args.warmup_iterations: start_time = time.time() if args.profile and args.performance_only and iter_num == 30: # Profile Mode with torch.profiler.profile( on_trace_ready=trace_handler) as prof: with torch.cpu.amp.autocast(enabled=args.autocast): ploc, plabel = ssd300(fimg) loss = loss_func(ploc, plabel, gloc, glabel, mask, pos_num, neg_num, num_mask, args.autocast) loss.backward() warmup_step(iter_num, current_lr) optim.step() optim.zero_grad(set_to_none=True) else: # Non Profile Mode with torch.cpu.amp.autocast(enabled=args.autocast): ploc, plabel = ssd300(fimg) loss = loss_func(ploc, plabel, gloc, glabel, mask, pos_num, neg_num, num_mask, args.autocast) loss.backward() warmup_step(iter_num, current_lr) optim.step() optim.zero_grad(set_to_none=True) else: # Train case: when split input to several fragment size print("Not support input with several fragment size yet.") exit(-1) # current_batch_size = img.shape[0] # # Split batch for gradient accumulation # img = torch.split(img, fragment_size) # bbox = torch.split(bbox, fragment_size) # label = torch.split(label, fragment_size) # if args.performance_only and iter_num >= args.warmup_iterations: # start_time=time.time() # for (fimg, fbbox, flabel) in zip(img, bbox, label): # current_fragment_size = fimg.shape[0] # trans_bbox = fbbox.transpose(1,2).contiguous() # if use_cuda: # fimg = fimg.cuda() # trans_bbox = trans_bbox.cuda() # flabel = flabel.cuda() # fimg = Variable(fimg, requires_grad=True) # gloc, glabel = Variable(trans_bbox, requires_grad=False), \ # Variable(flabel, requires_grad=False) # gloc = loss_func._loc_vec(gloc) # mask = glabel > 0 # pos_num = mask.sum(dim=1) # neg_num = torch.clamp(3*pos_num, max=mask.size(1)).unsqueeze(-1) # num_mask = (pos_num > 0).float() # # image to NHWC # fimg = fimg.contiguous(memory_format=torch.channels_last) # if use_ipex: # with ipex.amp.autocast(enabled=args.autocast, configure=ipex.conf.AmpConf(torch.bfloat16)): # ploc, plabel = ssd300(fimg) # loss = loss_func(ploc, plabel, gloc, glabel, mask, pos_num, neg_num, num_mask) # else: # ploc, plabel = ssd300(fimg) # loss = loss_func(ploc, plabel, gloc, glabel, mask, pos_num, neg_num, num_mask) # loss = loss * (current_fragment_size / current_batch_size) # weighted mean # loss.backward() # warmup_step(iter_num, current_lr) # optim.step() # optim.zero_grad(set_to_none=True) if args.performance_only and iter_num >= args.warmup_iterations: train_time.update(time.time() - start_time) if args.performance_only and iter_num % args.print_freq == 0: progress.display(iter_num) if not np.isinf(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() if args.log_interval and not iter_num % args.log_interval: print("Iteration: {:6d}, Loss function: {:5.8f}, Average Loss: {:.8f}"\ .format(iter_num, loss.item(), avg_loss)) iter_num += 1 if args.performance_only and iter_num >= args.train_iteration: break if args.performance_only and iter_num >= args.train_iteration: break if (args.val_epochs and (epoch+1) in args.val_epochs) or \ (args.val_interval and not (epoch+1) % args.val_interval): if args.distributed: world_size = float(dist.get_world_size()) for bn_name, bn_buf in ssd300.module.named_buffers( recurse=True): if ('running_mean' in bn_name) or ('running_var' in bn_name): dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM) bn_buf /= world_size ssd_print(key=mllog_const.MODEL_BN_SPAN, value=bn_buf.cpu().detach().numpy()) if args.rank == 0 or True: # Leslie: here is the workaround: dist.broadcast 
will fail on other rank. we will run evalution on all the ranks if not args.no_save: print("") print("saving model...") torch.save( { "model": ssd300.state_dict(), "label_map": train_coco.label_info, "optim": optim.state_dict() }, "./models/iter_{}.pt".format(iter_num)) if coco_eval(ssd300, val_dataloader, cocoGt, encoder, inv_map, args.threshold, epoch + 1, iter_num, log_interval=args.log_interval, nms_valid_thresh=args.nms_valid_thresh, use_autocast=args.autocast): success = torch.ones(1) if use_cuda: success = success.cuda() # Leslie: same Workaround: since we run evalution on all ranks, we don't need to broadcast the evalutation result # if args.distributed: # dist.broadcast(success, 0) if success[0]: return True mllogger.end(key=mllog_const.EPOCH_STOP, metadata={mllog_const.EPOCH_NUM: epoch}) mllogger.end(key=mllog_const.BLOCK_STOP, metadata={ mllog_const.FIRST_EPOCH_NUM: 1, mllog_const.EPOCH_COUNT: args.epochs }) if args.performance_only: batch_size = args.batch_size latency = train_time.avg / batch_size * 1000 perf = batch_size / train_time.avg print('train latency %.2f ms' % latency) print('train performance %.2f fps' % perf) print("Throughput: {:.3f} fps".format(perf)) return False
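# The warmup_step lambda above delegates to the repository's
# lr_warmup(optim, wb, iter_num, current_lr, args). As a rough sketch of the
# idea only: a plain linear ramp over the first wb iterations. The real
# implementation also folds in args.warmup_factor and may differ in detail.
def linear_lr_warmup(optim, warmup_iters, iter_num, base_lr):
    if iter_num < warmup_iters:
        # Ramp the learning rate linearly from ~0 up to base_lr.
        new_lr = base_lr * (iter_num + 1) / warmup_iters
        for param_group in optim.param_groups:
            param_group['lr'] = new_lr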
def train300_mlperf_coco(args):
    from coco import COCO
    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False)
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size)

    val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")
    train_annotate = os.path.join(args.data, "annotations/instances_train2017.json")
    train_coco_root = os.path.join(args.data, "train2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    train_coco = COCODetection(train_coco_root, train_annotate, train_trans)
    #print("Number of labels: {}".format(train_coco.labelnum))

    train_dataloader = DataLoader(train_coco,
                                  batch_size=args.batch_size,
                                  shuffle=True,  # set shuffle=True in DataLoader
                                  num_workers=4)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER)
    mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size)

    ssd300 = SSD300(train_coco.labelnum)
    if args.checkpoint is not None:
        print("loading model checkpoint", args.checkpoint)
        od = torch.load(args.checkpoint)
        ssd300.load_state_dict(od["model"])
    ssd300.train()
    if use_cuda:
        ssd300.cuda()
    loss_func = Loss(dboxes)
    if use_cuda:
        loss_func.cuda()

    current_lr = 1e-3
    current_momentum = 0.9
    current_weight_decay = 5e-4
    optim = torch.optim.SGD(ssd300.parameters(),
                            lr=current_lr,
                            momentum=current_momentum,
                            weight_decay=current_weight_decay)
    mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD")
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)
    mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum)
    mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay)

    print("epoch", "nbatch", "loss")

    iter_num = args.iteration
    avg_loss = 0.0
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP)
    for epoch in range(args.epochs):
        mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch)
        for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader):
            if iter_num == 160000:
                current_lr = 1e-4
                print("")
                print("lr decay step #1")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if iter_num == 200000:
                current_lr = 1e-5
                print("")
                print("lr decay step #2")
                for param_group in optim.param_groups:
                    param_group['lr'] = current_lr
                mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr)

            if use_cuda:
                img = img.cuda()
            img = Variable(img, requires_grad=True)
            ploc, plabel = ssd300(img)
            trans_bbox = bbox.transpose(1, 2).contiguous()
            if use_cuda:
                trans_bbox = trans_bbox.cuda()
                label = label.cuda()
            gloc, glabel = Variable(trans_bbox, requires_grad=False), \
                           Variable(label, requires_grad=False)
            loss = loss_func(ploc, plabel, gloc, glabel)

            if not np.isinf(loss.item()):
                avg_loss = 0.999 * avg_loss + 0.001 * loss.item()
            print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\
                  .format(iter_num, loss.item(), avg_loss), end="\r")

            optim.zero_grad()
            loss.backward()
            optim.step()

            if iter_num in args.evaluation:
                if not args.no_save:
                    print("")
                    print("saving model...")
                    torch.save({"model": ssd300.state_dict(),
                                "label_map": train_coco.label_info},
                               "./models/iter_{}.pt".format(iter_num))

                if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map,
                             args.threshold, epoch, iter_num):
                    return True

            iter_num += 1

    return False
def train300_mlperf_coco(args): global torch from coco import COCO # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() args.distributed = False if use_cuda: try: from apex.parallel import DistributedDataParallel as DDP if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 except: raise ImportError( "Please install APEX from https://github.com/nvidia/apex") if args.distributed: # necessary pytorch imports import torch.utils.data.distributed import torch.distributed as dist # ssd_print(key=mlperf_log.RUN_SET_RANDOM_SEED) if args.no_cuda: device = torch.device('cpu') else: torch.cuda.set_device(args.local_rank) device = torch.device('cuda') dist.init_process_group(backend='nccl', init_method='env://') # set seeds properly args.seed = broadcast_seeds(args.seed, device) local_seed = (args.seed + dist.get_rank()) % 2**32 print(dist.get_rank(), "Using seed = {}".format(local_seed)) torch.manual_seed(local_seed) np.random.seed(seed=local_seed) dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False) val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") cocoGt = COCO(annotation_file=val_annotate) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) #print("Number of labels: {}".format(train_coco.labelnum)) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_coco) else: train_sampler = None train_dataloader = DataLoader(train_coco, batch_size=args.batch_size, shuffle=(train_sampler is None), sampler=train_sampler, num_workers=4) # set shuffle=True in DataLoader ssd_print(key=mlperf_log.INPUT_SHARD, value=None) ssd_print(key=mlperf_log.INPUT_ORDER) ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size) ssd300 = SSD300(train_coco.labelnum) if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint) ssd300.load_state_dict(od["model"]) ssd300.train() if use_cuda: ssd300.cuda() loss_func = Loss(dboxes) if use_cuda: loss_func.cuda() if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 # parallelize if args.distributed: ssd300 = DDP(ssd300) global_batch_size = N_gpu * args.batch_size current_lr = args.lr * (global_batch_size / 32) current_momentum = 0.9 current_weight_decay = 5e-4 optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) ssd_print(key=mlperf_log.OPT_NAME, value="SGD") ssd_print(key=mlperf_log.OPT_LR, value=current_lr) ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum) ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay) eval_points = args.evaluation print("epoch", "nbatch", "loss") iter_num = args.iteration avg_loss = 0.0 inv_map = {v: k for k, v in val_coco.label_map.items()} success = torch.zeros(1) if use_cuda: success = success.cuda() if args.warmup: nonempty_imgs = len(train_coco) wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size)) warmup_step = lambda iter_num, current_lr: 
lr_warmup( optim, wb, iter_num, current_lr, args) else: warmup_step = lambda iter_num, current_lr: None for epoch in range(args.epochs): ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch) # set the epoch for the sampler if args.distributed: train_sampler.set_epoch(epoch) if epoch in args.lr_decay_schedule: current_lr *= 0.1 print("") print("lr decay step #{num}".format( num=args.lr_decay_schedule.index(epoch) + 1)) for param_group in optim.param_groups: param_group['lr'] = current_lr ssd_print(key=mlperf_log.OPT_LR, value=current_lr) for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader): if use_cuda: img = img.cuda() img = Variable(img, requires_grad=True) ploc, plabel = ssd300(img) trans_bbox = bbox.transpose(1, 2).contiguous() if use_cuda: trans_bbox = trans_bbox.cuda() label = label.cuda() gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(label, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) if not np.isinf(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ .format(iter_num, loss.item(), avg_loss), end="\r") optim.zero_grad() loss.backward() warmup_step(iter_num, current_lr) optim.step() iter_num += 1 if epoch + 1 in eval_points: rank = dist.get_rank() if args.distributed else args.local_rank if args.distributed: world_size = float(dist.get_world_size()) for bn_name, bn_buf in ssd300.module.named_buffers( recurse=True): if ('running_mean' in bn_name) or ('running_var' in bn_name): dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM) bn_buf /= world_size if rank == 0: if not args.no_save: print("") print("saving model...") torch.save( { "model": ssd300.state_dict(), "label_map": train_coco.label_info }, "./models/iter_{}.pt".format(iter_num)) if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map, args.threshold, epoch + 1, iter_num): success = torch.ones(1) if use_cuda: success = success.cuda() if args.distributed: dist.broadcast(success, 0) if success[0]: return True return False
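# broadcast_seeds() above makes every rank start from the same base seed before
# the per-rank offset (seed + rank) % 2**32 is applied. A minimal sketch of what
# such a helper typically does; the repository's version may differ.
import torch
import torch.distributed as dist

def broadcast_seeds_sketch(seed, device):
    if dist.is_available() and dist.is_initialized():
        # Rank 0's seed wins; every other rank overwrites its local value.
        seed_tensor = torch.tensor(seed, dtype=torch.int64, device=device)
        dist.broadcast(seed_tensor, src=0)
        seed = seed_tensor.item()
    return seed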
def val300(path):
    ssd300 = SSD300(21)
    dboxes = dboxes300()
    encoder = Encoder(dboxes)
    trans = SSDTransformer(dboxes, (300, 300), val=True)
    valmodel(ssd300, path, dboxes, trans, encoder)
def train300_coco(): dboxes = dboxes300_coco() trans = SSDTransformer(dboxes, (300, 300), val=False) #annotate = "../../coco_ssd/instances_valminusminival2014.json" #coco_root = "../../coco_data/val2014" #annotate = "../../coco_ssd/train.json" #coco_root = "../../coco_data/train2014" annotate = "../../coco_ssd/instances_train2017.json" coco_root = "../../coco_data/train2017" coco = COCODetection(coco_root, annotate, trans) print("Number of labels: {}".format(coco.labelnum)) print("Number of images: {}".format(len(coco))) #train_sampler = torch.utils.data.distributed.DistributedSampler(coco) dataloader = DataLoader(coco, batch_size=32, shuffle=True, num_workers=4) #dataloader = DataLoader(coco, batch_size=8, shuffle=True, num_workers=4, sampler=train_sampler, shuffle=(train_sampler is None)) nepochs = 800 ssd300 = SSD300(coco.labelnum) #ssd300 = DDP(ssd300) ssd300.train() ssd300.cuda() loss_func = Loss(dboxes) loss_func.cuda() optim = torch.optim.SGD(ssd300.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4) print("epoch", "nbatch", "loss") iter_num = 0 avg_loss = 0.0 #od = torch.load("./models/larger_iter_210000.pt") #ssd300.load_state_dict(od["model"]) #iter_num = 210000 #optim = torch.optim.SGD(ssd300.parameters(), lr=1e-5, momentum=0.9, weight_decay=5e-4) for epoch in range(nepochs): #train_sampler.set_epoch(epoch) if iter_num >= 240000: break for nbatch, (img, img_size, bbox, label) in enumerate(dataloader): iter_num += 1 if iter_num == 160000: print("") print("lr decay step #1") for param_group in optim.param_groups: param_group['lr'] = 1e-4 if iter_num == 200000: print("") print("lr decay step #2") for param_group in optim.param_groups: param_group['lr'] = 1e-5 img = Variable(img.cuda(), requires_grad=True) ploc, plabel = ssd300(img) gloc, glabel = Variable(bbox.transpose(1,2).contiguous().cuda(), requires_grad=False), \ Variable(label.cuda(), requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) if not np.isinf(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ .format(iter_num, loss.item(), avg_loss), end="\r") optim.zero_grad() loss.backward() optim.step() if iter_num % 5000 == 0: print("") print("saving model...") torch.save( { "model": ssd300.state_dict(), "label_map": coco.label_info }, "./models/crowd_iter_{}.pt".format(iter_num))
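# Loss(dboxes) used in the training loops here is the standard SSD MultiBox
# criterion: Smooth-L1 on positive anchors plus cross-entropy with 3:1 hard
# negative mining, normalized by the number of positives. A self-contained
# sketch of those two terms (class name illustrative; the repository's Loss
# additionally encodes gloc against the default boxes before the Smooth-L1 term):
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiBoxLossSketch(nn.Module):
    def __init__(self, neg_pos_ratio=3):
        super().__init__()
        self.neg_pos_ratio = neg_pos_ratio

    def forward(self, ploc, plabel, gloc, glabel):
        # ploc: [N, 4, A], plabel: [N, C, A], gloc: [N, 4, A], glabel: [N, A] (long, 0 = background)
        pos_mask = glabel > 0
        pos_num = pos_mask.sum(dim=1)

        loc_loss = F.smooth_l1_loss(ploc, gloc, reduction='none').sum(dim=1)   # [N, A]
        loc_loss = (pos_mask.float() * loc_loss).sum(dim=1)                    # [N]

        con = F.cross_entropy(plabel, glabel, reduction='none')                # [N, A]

        # Hard negative mining: keep only the neg_pos_ratio*pos_num hardest negatives.
        con_neg = con.clone()
        con_neg[pos_mask] = 0
        _, idx = con_neg.sort(dim=1, descending=True)
        _, rank = idx.sort(dim=1)
        neg_num = torch.clamp(self.neg_pos_ratio * pos_num, max=pos_mask.size(1)).unsqueeze(-1)
        neg_mask = rank < neg_num

        con_loss = (con * (pos_mask.float() + neg_mask.float())).sum(dim=1)    # [N]

        total = loc_loss + con_loss
        num_mask = (pos_num > 0).float()
        return (total * num_mask / pos_num.float().clamp(min=1e-6)).mean(dim=0)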
def train300_mlperf_coco(args): global torch from coco import COCO # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() args.distributed = False if use_cuda: try: from apex.parallel import DistributedDataParallel as DDP if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 except: raise ImportError("Please install APEX from https://github.com/nvidia/apex") local_seed = args.seed if args.distributed: # necessary pytorch imports import torch.utils.data.distributed import torch.distributed as dist if args.no_cuda: device = torch.device('cpu') else: torch.cuda.set_device(args.local_rank) device = torch.device('cuda') dist.init_process_group(backend='nccl', init_method='env://') # set seeds properly args.seed = broadcast_seeds(args.seed, device) local_seed = (args.seed + dist.get_rank()) % 2**32 mllogger.event(key=mllog_const.SEED, value=local_seed) torch.manual_seed(local_seed) np.random.seed(seed=local_seed) args.rank = dist.get_rank() if args.distributed else args.local_rank print("args.rank = {}".format(args.rank)) print("local rank = {}".format(args.local_rank)) print("distributed={}".format(args.distributed)) dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False, num_cropping_iterations=args.num_cropping_iterations) val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") cocoGt = COCO(annotation_file=val_annotate) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco)) mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco)) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(train_coco) else: train_sampler = None train_dataloader = DataLoader(train_coco, batch_size=args.batch_size, shuffle=(train_sampler is None), sampler=train_sampler, num_workers=4) # set shuffle=True in DataLoader if args.rank==0: val_dataloader = DataLoader(val_coco, batch_size=args.val_batch_size or args.batch_size, shuffle=False, sampler=None, num_workers=4) else: val_dataloader = None ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone) if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint) ssd300.load_state_dict(od["model"]) ssd300.train() if use_cuda: ssd300.cuda() loss_func = Loss(dboxes) if use_cuda: loss_func.cuda() if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 # parallelize if args.distributed: ssd300 = DDP(ssd300) global_batch_size = N_gpu * args.batch_size mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size) # Reference doesn't support group batch norm, so bn_span==local_batch_size mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size) current_lr = args.lr * (global_batch_size / 32) assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits" fragment_size = args.batch_size // args.batch_splits if args.batch_splits != 1: print("using gradient accumulation with fragments of size 
{}".format(fragment_size)) current_momentum = 0.9 optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=args.weight_decay) ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr) ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay) iter_num = args.iteration avg_loss = 0.0 inv_map = {v:k for k,v in val_coco.label_map.items()} success = torch.zeros(1) if use_cuda: success = success.cuda() if args.warmup: nonempty_imgs = len(train_coco) wb = int(args.warmup * nonempty_imgs / (N_gpu*args.batch_size)) ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb) warmup_step = lambda iter_num, current_lr: lr_warmup(optim, wb, iter_num, current_lr, args) else: warmup_step = lambda iter_num, current_lr: None ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor) ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=args.lr_decay_schedule) mllogger.start( key=mllog_const.BLOCK_START, metadata={mllog_const.FIRST_EPOCH_NUM: 1, mllog_const.EPOCH_COUNT: args.epochs}) optim.zero_grad() for epoch in range(args.epochs): mllogger.start( key=mllog_const.EPOCH_START, metadata={mllog_const.EPOCH_NUM: epoch}) # set the epoch for the sampler if args.distributed: train_sampler.set_epoch(epoch) if epoch in args.lr_decay_schedule: current_lr *= 0.1 print("") print("lr decay step #{num}".format(num=args.lr_decay_schedule.index(epoch) + 1)) for param_group in optim.param_groups: param_group['lr'] = current_lr for nbatch, (img, img_id, img_size, bbox, label) in enumerate(train_dataloader): current_batch_size = img.shape[0] # Split batch for gradient accumulation img = torch.split(img, fragment_size) bbox = torch.split(bbox, fragment_size) label = torch.split(label, fragment_size) for (fimg, fbbox, flabel) in zip(img, bbox, label): current_fragment_size = fimg.shape[0] trans_bbox = fbbox.transpose(1,2).contiguous() if use_cuda: fimg = fimg.cuda() trans_bbox = trans_bbox.cuda() flabel = flabel.cuda() fimg = Variable(fimg, requires_grad=True) ploc, plabel = ssd300(fimg) gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(flabel, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) loss = loss * (current_fragment_size / current_batch_size) # weighted mean loss.backward() warmup_step(iter_num, current_lr) optim.step() optim.zero_grad() if not np.isinf(loss.item()): avg_loss = 0.999*avg_loss + 0.001*loss.item() if args.rank == 0 and args.log_interval and not iter_num % args.log_interval: print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ .format(iter_num, loss.item(), avg_loss)) iter_num += 1 if (args.val_epochs and (epoch+1) in args.val_epochs) or \ (args.val_interval and not (epoch+1) % args.val_interval): if args.distributed: world_size = float(dist.get_world_size()) for bn_name, bn_buf in ssd300.module.named_buffers(recurse=True): if ('running_mean' in bn_name) or ('running_var' in bn_name): dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM) bn_buf /= world_size ssd_print(key=mllog_const.MODEL_BN_SPAN, value=bn_buf) if args.rank == 0: if not args.no_save: print("") print("saving model...") torch.save({"model" : ssd300.state_dict(), "label_map": train_coco.label_info}, "./models/iter_{}.pt".format(iter_num)) if coco_eval(ssd300, val_dataloader, cocoGt, encoder, inv_map, args.threshold, epoch + 1, iter_num, log_interval=args.log_interval, nms_valid_thresh=args.nms_valid_thresh): success = torch.ones(1) if use_cuda: success = success.cuda() if args.distributed: dist.broadcast(success, 0) if 
success[0]: return True mllogger.end( key=mllog_const.EPOCH_STOP, metadata={mllog_const.EPOCH_NUM: epoch}) mllogger.end( key=mllog_const.BLOCK_STOP, metadata={mllog_const.FIRST_EPOCH_NUM: 1, mllog_const.EPOCH_COUNT: args.epochs}) return False
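# The fragment loop above implements gradient accumulation: each fragment's loss
# is scaled by its share of the batch so that, after all backward() calls, the
# summed gradients match a single full-batch step. Distilled into a standalone
# sketch (argument names are illustrative):
def accumulate_fragments(model, loss_func, optim, img, gloc, glabel, fragment_size):
    batch_size = img.shape[0]
    optim.zero_grad()
    for fimg, floc, flabel in zip(img.split(fragment_size),
                                  gloc.split(fragment_size),
                                  glabel.split(fragment_size)):
        ploc, plabel = model(fimg)
        loss = loss_func(ploc, plabel, floc, flabel)
        loss = loss * (fimg.shape[0] / batch_size)  # weighted mean over fragments
        loss.backward()
    optim.step()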
def run_eval(args):
    from coco import COCO

    # Setup multi-GPU if necessary
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1

    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    local_seed = set_seeds(args)
    # start timing here

    if args.distributed:
        N_gpu = torch.distributed.get_world_size()
    else:
        N_gpu = 1

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)

    input_size = 300
    val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True)

    val_annotate = os.path.join(args.data, "annotations/instances_val2017.json")
    val_coco_root = os.path.join(args.data, "val2017")

    cocoGt = COCO(annotation_file=val_annotate)
    val_coco = COCODetection(val_coco_root, val_annotate, val_trans)
    inv_map = {v: k for k, v in val_coco.label_map.items()}

    if args.distributed:
        val_sampler = GeneralDistributedSampler(val_coco, pad=False)
    else:
        val_sampler = None

    val_dataloader = DataLoader(val_coco,
                                batch_size=args.eval_batch_size,
                                shuffle=False,  # Note: distributed sampler is shuffled :(
                                sampler=val_sampler,
                                num_workers=args.num_workers)

    ssd300_eval = SSD300(val_coco.labelnum,
                         backbone=args.backbone,
                         use_nhwc=args.nhwc,
                         pad_input=args.pad_input).cuda()

    if args.use_fp16:
        ssd300_eval = network_to_half(ssd300_eval)
    ssd300_eval.eval()

    if args.checkpoint is not None:
        load_checkpoint(ssd300_eval, args.checkpoint)

    coco_eval(ssd300_eval,
              val_dataloader,
              cocoGt,
              encoder,
              inv_map,
              args.threshold,
              0,  # epoch
              0,  # iter_num
              args.eval_batch_size,
              use_fp16=args.use_fp16,
              local_rank=args.local_rank if args.distributed else -1,
              N_gpu=N_gpu,
              use_nhwc=args.nhwc,
              pad_input=args.pad_input)
from keras.optimizers import Adam, SGD
from keras import backend as K

from ssd300 import SSD300
from keras_loss_function.keras_ssd_loss import SSDLoss

img_height = 300
img_width = 300
img_channels = 3
n_classes = 20

K.clear_session()

# 1: Build the model
model = SSD300(image_size=(img_height, img_width, img_channels),
               n_classes=n_classes,
               mode='training')

# 2: Load the VGG16 backbone weights
weights_path = 'vgg/VGG_ILSVRC_16_layers_fc_reduced.h5'
model.load_weights(weights_path, by_name=True)

# 3: Optimizer and loss
#adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
sgd = SGD(lr=0.001, momentum=0.9, decay=0.0, nesterov=False)
ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
model.compile(optimizer=sgd, loss=ssd_loss.compute_loss)
def train300_mlperf_coco(args): from coco import COCO # Check that GPUs are actually available use_cuda = not args.no_cuda # Setup multi-GPU if necessary args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') local_seed = set_seeds(args) # start timing here ssd_print(key=mlperf_log.RUN_START) if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 # Setup data, defaults dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") cocoGt = COCO(annotation_file=val_annotate) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) if args.distributed: val_sampler = GeneralDistributedSampler(val_coco, pad=False) else: val_sampler = None train_pipe = COCOPipeline(args.batch_size, args.local_rank, train_coco_root, train_annotate, N_gpu, num_threads=args.num_workers, output_fp16=args.use_fp16, output_nhwc=args.nhwc, pad_output=args.pad_input, seed=local_seed - 2**31) print_message(args.local_rank, "time_check a: {secs:.9f}".format(secs=time.time())) train_pipe.build() print_message(args.local_rank, "time_check b: {secs:.9f}".format(secs=time.time())) test_run = train_pipe.run() train_loader = DALICOCOIterator(train_pipe, 118287 / N_gpu) val_dataloader = DataLoader( val_coco, batch_size=args.eval_batch_size, shuffle=False, # Note: distributed sampler is shuffled :( sampler=val_sampler, num_workers=args.num_workers) ssd_print(key=mlperf_log.INPUT_ORDER) ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size) # Build the model ssd300 = SSD300(val_coco.labelnum, backbone=args.backbone, use_nhwc=args.nhwc, pad_input=args.pad_input) if args.checkpoint is not None: load_checkpoint(ssd300, args.checkpoint) ssd300.train() ssd300.cuda() loss_func = Loss(dboxes) loss_func.cuda() if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 if args.use_fp16: ssd300 = network_to_half(ssd300) # Parallelize. Need to do this after network_to_half. if args.distributed: if args.delay_allreduce: print_message(args.local_rank, "Delaying allreduces to the end of backward()") ssd300 = DDP(ssd300, delay_allreduce=args.delay_allreduce, retain_allreduce_buffers=args.use_fp16) # Create optimizer. This must also be done after network_to_half. global_batch_size = (N_gpu * args.batch_size) # mlperf only allows base_lr scaled by an integer base_lr = 1e-3 requested_lr_multiplier = args.lr / base_lr adjusted_multiplier = max( 1, round(requested_lr_multiplier * global_batch_size / 32)) current_lr = base_lr * adjusted_multiplier current_momentum = 0.9 current_weight_decay = 5e-4 static_loss_scale = 128. if args.use_fp16: if args.distributed and not args.delay_allreduce: # We can't create the flat master params yet, because we need to # imitate the flattened bucket structure that DDP produces. 
optimizer_created = False else: model_buckets = [ [ p for p in ssd300.parameters() if p.requires_grad and p.type() == "torch.cuda.HalfTensor" ], [ p for p in ssd300.parameters() if p.requires_grad and p.type() == "torch.cuda.FloatTensor" ] ] flat_master_buckets = create_flat_master(model_buckets) optim = torch.optim.SGD(flat_master_buckets, lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) optimizer_created = True else: optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) optimizer_created = True # Add LARC if desired if args.use_larc: optim = LARC(optim) ssd_print(key=mlperf_log.OPT_NAME, value="SGD") ssd_print(key=mlperf_log.OPT_LR, value=current_lr) ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum) ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay) if args.warmup is not None: ssd_print(key=mlperf_log.OPT_LR_WARMUP_STEPS, value=args.warmup) # Model is completely finished -- need to create separate copies, preserve parameters across # them, and jit ssd300_eval = SSD300(val_coco.labelnum, backbone=args.backbone, use_nhwc=args.nhwc, pad_input=args.pad_input).cuda() if args.use_fp16: ssd300_eval = network_to_half(ssd300_eval) # Get the existant state from the train model # * if we use distributed, then we want .module train_model = ssd300.module if args.distributed else ssd300 ssd300_eval.load_state_dict(train_model.state_dict()) ssd300_eval.eval() if args.jit: input_c = 4 if args.pad_input else 3 example_shape = [ args.batch_size, 300, 300, input_c ] if args.nhwc else [args.batch_size, input_c, 300, 300] example_input = torch.randn(*example_shape).cuda() if args.use_fp16: example_input = example_input.half() # DDP has some Python-side control flow. If we JIT the entire DDP-wrapped module, # the resulting ScriptModule will elide this control flow, resulting in allreduce # hooks not being called. If we're running distributed, we need to extract and JIT # the wrapped .module. # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks # to go out of scope, and therefore silently disappear. module_to_jit = ssd300.module if args.distributed else ssd300 if args.distributed: ssd300.module = torch.jit.trace(module_to_jit, example_input) else: ssd300 = torch.jit.trace(module_to_jit, example_input) print_message(args.local_rank, "epoch", "nbatch", "loss") eval_points = np.array(args.evaluation) * 32 / global_batch_size eval_points = list(map(int, list(eval_points))) iter_num = args.iteration avg_loss = 0.0 inv_map = {v: k for k, v in val_coco.label_map.items()} start_elapsed_time = time.time() last_printed_iter = args.iteration num_elapsed_samples = 0 # Generate normalization tensors mean, std = generate_mean_std(args) def step_maybe_fp16_maybe_distributed(optim): if args.use_fp16: if args.distributed: for flat_master, allreduce_buffer in zip( flat_master_buckets, ssd300.allreduce_buffers): if allreduce_buffer is None: raise RuntimeError("allreduce_buffer is None") flat_master.grad = allreduce_buffer.float() flat_master.grad.data.mul_(1. / static_loss_scale) else: for flat_master, model_bucket in zip(flat_master_buckets, model_buckets): flat_grad = apex_C.flatten( [m.grad.data for m in model_bucket]) flat_master.grad = flat_grad.float() flat_master.grad.data.mul_(1. 
/ static_loss_scale) optim.step() if args.use_fp16: for model_bucket, flat_master in zip(model_buckets, flat_master_buckets): for model, master in zip( model_bucket, apex_C.unflatten(flat_master.data, model_bucket)): model.data.copy_(master.data) ssd_print(key=mlperf_log.TRAIN_LOOP) for epoch in range(args.epochs): ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch) for p in ssd300.parameters(): p.grad = None for i, data in enumerate(train_loader): img = data[0][0][0] bbox = data[0][1][0] label = data[0][2][0] label = label.type(torch.cuda.LongTensor) bbox_offsets = data[0][3][0] # handle random flipping outside of DALI for now bbox_offsets = bbox_offsets.cuda() img, bbox = C.random_horiz_flip(img, bbox, bbox_offsets, 0.5, args.nhwc) img.sub_(mean).div_(std) if args.profile is not None and iter_num == args.profile: return if args.warmup is not None and optimizer_created: lr_warmup(optim, args.warmup, iter_num, epoch, current_lr, args) if iter_num == ((args.decay1 * 1000 * 32) // global_batch_size): print_message(args.local_rank, "lr decay step #1") current_lr *= 0.1 for param_group in optim.param_groups: param_group['lr'] = current_lr ssd_print(key=mlperf_log.OPT_LR, value=current_lr) if iter_num == ((args.decay2 * 1000 * 32) // global_batch_size): print_message(args.local_rank, "lr decay step #2") current_lr *= 0.1 for param_group in optim.param_groups: param_group['lr'] = current_lr ssd_print(key=mlperf_log.OPT_LR, value=current_lr) if use_cuda: img = img.cuda() # NHWC direct from DALI now if necessary bbox = bbox.cuda() label = label.cuda() bbox_offsets = bbox_offsets.cuda() # Now run the batched box encoder N = img.shape[0] if bbox_offsets[-1].item() == 0: print("No labels in batch") continue bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5) # output is ([N*8732, 4], [N*8732], need [N, 8732, 4], [N, 8732] respectively M = bbox.shape[0] // N bbox = bbox.view(N, M, 4) label = label.view(N, M) # print(img.shape, bbox.shape, label.shape) ploc, plabel = ssd300(img) ploc, plabel = ploc.float(), plabel.float() trans_bbox = bbox.transpose(1, 2).contiguous().cuda() label = label.cuda() gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(label, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) if not np.isinf(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() num_elapsed_samples += N if args.local_rank == 0 and iter_num % args.print_interval == 0: end_elapsed_time = time.time() elapsed_time = end_elapsed_time - start_elapsed_time avg_samples_per_sec = num_elapsed_samples * N_gpu / elapsed_time print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\ .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n") last_printed_iter = iter_num start_elapsed_time = time.time() num_elapsed_samples = 0 # loss scaling if args.use_fp16: loss = loss * static_loss_scale loss.backward() if not optimizer_created: # Imitate the model bucket structure created by DDP. # These will already be split by type (float or half). model_buckets = [] for bucket in ssd300.active_i_buckets: model_buckets.append([]) for active_i in bucket: model_buckets[-1].append( ssd300.active_params[active_i]) flat_master_buckets = create_flat_master(model_buckets) optim = torch.optim.SGD(flat_master_buckets, lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) optimizer_created = True # Skip this first iteration because flattened allreduce buffers are not yet created. 
# step_maybe_fp16_maybe_distributed(optim) else: step_maybe_fp16_maybe_distributed(optim) # Likely a decent skew here, let's take this opportunity to set the gradients to None. # After DALI integration, playing with the placement of this is worth trying. for p in ssd300.parameters(): p.grad = None if iter_num in eval_points: if args.local_rank == 0: if not args.no_save: print("saving model...") torch.save( { "model": ssd300.state_dict(), "label_map": val_coco.label_info }, "./models/iter_{}.pt".format(iter_num)) # Get the existant state from the train model # * if we use distributed, then we want .module train_model = ssd300.module if args.distributed else ssd300 ssd300_eval.load_state_dict(train_model.state_dict()) if coco_eval( ssd300_eval, val_dataloader, cocoGt, encoder, inv_map, args.threshold, epoch, iter_num, args.eval_batch_size, use_fp16=args.use_fp16, local_rank=args.local_rank if args.distributed else -1, N_gpu=N_gpu, use_nhwc=args.nhwc, pad_input=args.pad_input): return True iter_num += 1 train_loader.reset() return False
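# The fp16 path above keeps fp32 "master" copies of the parameters, multiplies
# the loss by static_loss_scale = 128 before backward(), and divides the scale
# back out while moving gradients into the master weights. The same idea,
# without the flattened-bucket bookkeeping, looks roughly like this (sketch):
import torch

STATIC_LOSS_SCALE = 128.0

def scaled_backward_and_step(loss, model_params, master_params, optim):
    # Scale the loss so that small fp16 gradients do not underflow to zero.
    (loss * STATIC_LOSS_SCALE).backward()
    # Unscale into the fp32 master gradients before the optimizer update.
    for model_p, master_p in zip(model_params, master_params):
        if master_p.grad is None:
            master_p.grad = torch.empty_like(master_p)
        master_p.grad.copy_(model_p.grad.float()).mul_(1.0 / STATIC_LOSS_SCALE)
    optim.step()
    # Copy the updated fp32 weights back into the fp16 model parameters.
    for model_p, master_p in zip(model_params, master_params):
        model_p.data.copy_(master_p.data)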
import tools

# Set the default tensor type
torch.set_default_tensor_type('torch.cuda.FloatTensor')

test_img_index = 113
conf_threshold = 0.2
nms_threshold = 0.45

test_img, norm_img, label = test_ds.__getitem__(test_img_index)
width, height = test_img.size

# Instantiate the network
net = torch.nn.DataParallel(SSD300())
# Load the trained weights
net.load_state_dict(torch.load('./net179-0.645.pth'))

# pred_conf: predicted class scores -- shape (1, 8732, 21)
# pred_loc:  predicted box offsets  -- shape (1, 8732, 4)
norm_img = norm_img.cuda()
pred_conf, pred_loc = net(norm_img)
pred_loc = torch.squeeze(pred_loc)
pred_conf = torch.squeeze(pred_conf)

# Turn pred_conf into per-class probabilities
pred_conf = F.softmax(pred_conf, dim=1)
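# What typically follows the softmax above is per-class confidence filtering at
# conf_threshold plus non-maximum suppression at nms_threshold (handled by
# `tools` in the original script). A hedged sketch of that post-processing,
# assuming pred_loc has already been decoded from anchor offsets into
# corner-format (x1, y1, x2, y2) boxes:
import torchvision

def filter_and_nms(boxes_xyxy, class_probs, conf_threshold=0.2, nms_threshold=0.45):
    # boxes_xyxy:  [num_anchors, 4], class_probs: [num_anchors, num_classes]
    detections = []
    for cls in range(1, class_probs.shape[1]):      # skip background class 0
        scores = class_probs[:, cls]
        keep = scores > conf_threshold
        if keep.sum() == 0:
            continue
        cls_boxes, cls_scores = boxes_xyxy[keep], scores[keep]
        kept = torchvision.ops.nms(cls_boxes, cls_scores, nms_threshold)
        for i in kept:
            detections.append((cls, cls_boxes[i], cls_scores[i].item()))
    return detections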
def train300(): label_map = {} dboxes = dboxes300() trans = SSDTransformer(dboxes, (300, 300), val=False) img_folder = "../../VOCdevkit/VOC2007/JPEGImages" ann_folder = "../../VOCdevkit/VOC2007/Annotations" tgt_folder = "../../VOCdevkit/VOC2007/ImageSets/Main/trainval.txt" vd = VOCDetection(img_folder, ann_folder, tgt_folder, label_map=label_map, \ transform = trans) dataloader = DataLoader(vd, batch_size=32, shuffle=True, num_workers=8) nepochs = 800 ssd300 = SSD300(21) ssd300.train() ssd300.cuda() loss_func = Loss(dboxes) loss_func.cuda() optim = torch.optim.SGD(ssd300.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4) print("epoch", "nbatch", "loss") iter_num = 0 avg_loss = 0.0 for epoch in range(nepochs): if iter_num >= 60000: break for nbatch, (img, img_size, bbox, label) in enumerate(dataloader): #gc.collect() iter_num += 1 if iter_num == 40000: print("") print("lr decay step #1") for param_group in optim.param_groups: param_group['lr'] = 1e-4 if iter_num == 50000: print("") print("lr decay step #2") for param_group in optim.param_groups: param_group['lr'] = 1e-5 img = Variable(img.cuda(), requires_grad=True) ploc, plabel = ssd300(img) #torch.cuda.synchronize() #show_memusage() gloc, glabel = Variable(bbox.transpose(1,2).contiguous().cuda(), requires_grad=False), \ Variable(label.cuda(), requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) #torch.cuda.synchronize() #show_memusage() avg_loss = 0.999 * avg_loss + 0.001 * loss.item() print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ .format(iter_num, loss.item(), avg_loss), end="\r") optim.zero_grad() loss.backward() #torch.cuda.synchronize() #show_memusage() optim.step() if iter_num % 5000 == 0: print("") print("saving model...") torch.save( { "model": ssd300.state_dict(), "label_map": vd.label_map }, "./models/iter_{}.pt".format(iter_num))
def test_coco(args): # For testing purposes we have to use CUDA use_cuda = True # Setup multi-GPU if necessary args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 # Setup data, defaults dboxes = dboxes300_coco() encoder = Encoder(dboxes) if args.use_train_dataset: annotate = os.path.join(args.data, "annotations/instances_train2017.json") coco_root = os.path.join(args.data, "train2017") img_number = 118287 else: annotate = os.path.join(args.data, "annotations/instances_val2017.json") coco_root = os.path.join(args.data, "val2017") img_number = 5000 pipe = COCOPipeline(args.batch_size, args.local_rank, coco_root, annotate, N_gpu, num_threads=args.num_workers) pipe.build() test_run = pipe.run() dataloader = DALICOCOIterator(pipe, img_number / N_gpu) # Build the model ssd300 = SSD300(81, backbone=args.backbone, model_path='', dilation=False) """ # Note: args.checkpoint is required, so this can never be false if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint) # remove proceeding 'module' from checkpoint model = od["model"] for k in list(model.keys()): if k.startswith('module.'): model[k[7:]] = model.pop(k) ssd300.load_state_dict(model) """ ssd300.cuda() ssd300.eval() loss_func = Loss(dboxes) loss_func.cuda() # parallelize if args.distributed: ssd300 = DDP(ssd300) if args.use_fp16: ssd300 = network_to_half(ssd300) if args.use_train_dataset and args.local_rank == 0: print( 'Image 000000320612.jpg is in fact PNG and it will cause fail if ' + 'used with nvJPEGDecoder in coco_pipeline') for epoch in range(2): if epoch == 1 and args.local_rank == 0: print("Performance computation starts") s = time.time() for i, data in enumerate(dataloader): with torch.no_grad(): # Get data from pipeline img = data[0][0][0] bbox = data[0][1][0] label = data[0][2][0] label = label.type(torch.cuda.LongTensor) bbox_offsets = data[0][3][0] bbox_offsets = bbox_offsets.cuda() # Encode labels N = img.shape[0] if bbox_offsets[-1].item() == 0: print("No labels in batch") continue bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, encoder.dboxes.cuda(), 0.5) # Prepare tensors for computing loss M = bbox.shape[0] // N bbox = bbox.view(N, M, 4) label = label.view(N, M) trans_bbox = bbox.transpose(1, 2).contiguous() gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(label, requires_grad=False) if args.use_fp16: img = img.half() for _ in range(args.fbu): ploc, plabel = ssd300(img) ploc, plabel = ploc.float(), plabel.float() loss = loss_func(ploc, plabel, gloc, glabel) if epoch == 1 and args.local_rank == 0: e = time.time() print("Performance achieved: {:.2f} img/sec".format(img_number / (e - s))) dataloader.reset()
def train300_mlperf_coco(args): from coco import COCO # Check that GPUs are actually available if not torch.cuda.is_available(): print("Error. No GPU available.") return False dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False) val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE, value=input_size) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") cocoGt = COCO(annotation_file=val_annotate) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) train_pipe = COCOPipeline(args.batch_size, train_coco_root, train_annotate, dboxes, args.seed) train_pipe.build() train_loader = DALIGenericIterator(train_pipe, ["images", "boxes", "labels"], train_pipe.epoch_size("Reader")) mlperf_log.ssd_print(key=mlperf_log.INPUT_SHARD, value=None) mlperf_log.ssd_print(key=mlperf_log.INPUT_ORDER) mlperf_log.ssd_print(key=mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size) ssd300 = SSD300(train_coco.labelnum) if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint) ssd300.load_state_dict(od["model"]) ssd300.train() ssd300.cuda() loss_func = Loss(dboxes) loss_func.cuda() current_lr = 1e-3 current_momentum = 0.9 current_weight_decay = 5e-4 optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) mlperf_log.ssd_print(key=mlperf_log.OPT_NAME, value="SGD") mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr) mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM, value=current_momentum) mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay) print("epoch", "nbatch", "loss") iter_num = args.iteration avg_loss = 0.0 inv_map = {v: k for k, v in val_coco.label_map.items()} mean, std = generate_mean_std() data_perf = AverageMeter() batch_perf = AverageMeter() end = time.time() train_start = end mlperf_log.ssd_print(key=mlperf_log.TRAIN_LOOP) for epoch in range(args.epochs): mlperf_log.ssd_print(key=mlperf_log.TRAIN_EPOCH, value=epoch) for nbatch, data in enumerate(train_loader): img = data[0]["images"] bbox = data[0]["boxes"] label = data[0]["labels"] boxes_in_batch = len(label.nonzero()) if boxes_in_batch == 0: print("No labels in batch") continue label = label.type(torch.cuda.LongTensor) img = Variable(img, requires_grad=True) trans_bbox = bbox.transpose(1, 2).contiguous() gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(label, requires_grad=False) data_perf.update(time.time() - end) if iter_num == 160000: current_lr = 1e-4 print("") print("lr decay step #1") for param_group in optim.param_groups: param_group['lr'] = current_lr mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr) if iter_num == 200000: current_lr = 1e-5 print("") print("lr decay step #2") for param_group in optim.param_groups: param_group['lr'] = current_lr mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=current_lr) ploc, plabel = ssd300(img) loss = loss_func(ploc, plabel, gloc, glabel) if not np.isinf(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() optim.zero_grad() loss.backward() optim.step() 
batch_perf.update(time.time() - end) if iter_num in args.evaluation: if not args.no_save: print("") print("saving model...") torch.save( { "model": ssd300.state_dict(), "label_map": train_coco.label_info }, "./models/iter_{}.pt".format(iter_num)) try: if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map, args.threshold, epoch, iter_num): return True except: print("Eval error on iteration {0}".format(iter_num)) print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Data perf: {:3f} img/sec, Batch perf: {:3f} img/sec, Avg Data perf: {:3f} img/sec, Avg Batch perf: {:3f} img/sec"\ .format(iter_num, loss.item(), avg_loss, args.batch_size / data_perf.val, args.batch_size / batch_perf.val, args.batch_size / data_perf.avg, args.batch_size / batch_perf.avg), end="\r") end = time.time() iter_num += 1 if iter_num == 10 and epoch == 0: data_perf.reset() batch_perf.reset() train_loader.reset() print("\n\n") print("Training end: Data perf: {:3f} img/sec, Batch perf: {:3f} img/sec, Total time: {:3f} sec"\ .format(args.batch_size / data_perf.avg, args.batch_size / batch_perf.avg, time.time() - train_start)) return False
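# data_perf and batch_perf above rely on an AverageMeter exposing .val, .avg,
# .update() and .reset(). A minimal version of such a meter (the repository's
# class also carries a name and a format string):
class AverageMeterSketch:
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count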
def val300_coco(model_path):
    print("loading model at {}".format(model_path))
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    dboxes = dboxes300_coco()
    encoder = Encoder(dboxes)
    trans = SSDTransformer(dboxes, (300, 300), val=True)

    #annotate = "../../coco_ssd/instances_minival2014.json"
    #coco_root = "../../coco_data/val2014"
    #annotate = "../../coco_ssd/image_info_test-dev2015.json"
    #coco_root = "../../coco_data/test2015"
    annotate = "../../coco_ssd/instances_val2017.json"
    coco_root = "../../coco_data/val2017"

    cocoGt = COCO(annotation_file=annotate)
    coco = COCODetection(coco_root, annotate, trans)

    model = SSD300(coco.labelnum)
    od = torch.load(model_path)
    model.load_state_dict(od["model"])
    model.eval()
    model.cuda()

    ret = []
    inv_map = {v: k for k, v in coco.label_map.items()}

    start = time.time()
    for idx, image_id in enumerate(coco.img_keys):
        img, (htot, wtot), _, _ = coco[idx]
        with torch.no_grad():
            print("Parsing image: {}/{}".format(idx + 1, len(coco)), end="\r")
            ploc, plabel = model(img.unsqueeze(0).cuda())
            try:
                result = encoder.decode_batch(ploc, plabel, 0.50, 200)[0]
            except:
                #raise
                print("")
                print("No object detected in idx: {}".format(idx), end="\r")
                continue

            loc, label, prob = [r.cpu().numpy() for r in result]
            for loc_, label_, prob_ in zip(loc, label, prob):
                ret.append([image_id,
                            loc_[0] * wtot,
                            loc_[1] * htot,
                            (loc_[2] - loc_[0]) * wtot,
                            (loc_[3] - loc_[1]) * htot,
                            prob_,
                            inv_map[label_]])
    print("")
    print("Predicting Ended, total time: {:.2f} s".format(time.time() - start))

    cocoDt = cocoGt.loadRes(np.array(ret))

    E = COCOeval(cocoGt, cocoDt, iouType='bbox')
    #E.params.useSegm = 0
    #E.params.recThrs = [0.5]
    #E.params.maxDets = [10, 100, 200]
    E.evaluate()
    E.accumulate()
    E.summarize()
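# The result rows built above convert the decoder's normalized [left, top, right, bottom]
# boxes into the pixel-space [x, y, width, height] rows that cocoGt.loadRes() expects.
# A small standalone sketch of that conversion (the helper name is hypothetical and not
# part of the original sources):
def to_coco_xywh(loc, htot, wtot):
    """loc: normalized [l, t, r, b] in [0, 1]; returns pixel-space [x, y, w, h]."""
    l, t, r, b = loc
    return [l * wtot, t * htot, (r - l) * wtot, (b - t) * htot]

# Example: a box covering the left half of a 480x640 (h x w) image:
# to_coco_xywh([0.0, 0.0, 0.5, 1.0], htot=480, wtot=640) -> [0.0, 0.0, 320.0, 480.0]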
def train300_mlperf_coco(args): args = setup_distributed(args) # Build the model model_options = { 'use_nhwc': args.nhwc, 'pad_input': args.pad_input, 'bn_group': args.bn_group, } ssd300 = SSD300(args, args.num_classes, **model_options) if args.checkpoint is not None: load_checkpoint(ssd300, args.checkpoint) ssd300.train() ssd300.cuda() dboxes = dboxes300_coco() # Note: No reason not to use optimised loss loss_func = OptLoss() loss_func.cuda() # Create optimizer. This must also be done after network_to_half. global_batch_size = (args.N_gpu * args.batch_size) log_event(key=constants.MODEL_BN_SPAN, value=args.bn_group * args.batch_size) log_event(key=constants.GLOBAL_BATCH_SIZE, value=global_batch_size) # mlperf only allows base_lr scaled by an integer base_lr = 2.5e-3 requested_lr_multiplier = args.lr / base_lr adjusted_multiplier = max( 1, round(requested_lr_multiplier * global_batch_size / 32)) current_lr = base_lr * adjusted_multiplier current_momentum = 0.9 current_weight_decay = args.wd static_loss_scale = 128. optim = apex.optimizers.FusedSGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) ssd300, optim = apex.amp.initialize(ssd300, optim, opt_level='O2', loss_scale=static_loss_scale) # Parallelize. Need to do this after network_to_half. if args.distributed: if args.delay_allreduce: print_message(args.local_rank, "Delaying allreduces to the end of backward()") ssd300 = DDP(ssd300, gradient_predivide_factor=args.N_gpu / 8.0, delay_allreduce=args.delay_allreduce, retain_allreduce_buffers=args.use_fp16) log_event(key=constants.OPT_BASE_LR, value=current_lr) log_event(key=constants.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=args.lr_decay_epochs) log_event(key=constants.OPT_LR_DECAY_STEPS, value=args.lr_decay_epochs) log_event(key=constants.OPT_WEIGHT_DECAY, value=current_weight_decay) if args.warmup is not None: log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup) log_event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor) # Model is completely finished -- need to create separate copies, preserve parameters across # them, and jit ssd300_eval = SSD300(args, args.num_classes, **model_options).cuda() if args.use_fp16: convert_network(ssd300_eval, torch.half) # Get the existant state from the train model # * if we use distributed, then we want .module train_model = ssd300.module if args.distributed else ssd300 ssd300_eval.load_state_dict(train_model.state_dict()) ssd300_eval.eval() print_message(args.local_rank, "epoch", "nbatch", "loss") iter_num = args.iteration avg_loss = 0.0 start_elapsed_time = time.time() last_printed_iter = args.iteration num_elapsed_samples = 0 input_c = 4 if args.pad_input else 3 example_shape = [args.batch_size, 300, 300, input_c ] if args.nhwc else [args.batch_size, input_c, 300, 300] example_input = torch.randn(*example_shape).cuda() if args.use_fp16: example_input = example_input.half() if args.jit: # DDP has some Python-side control flow. If we JIT the entire DDP-wrapped module, # the resulting ScriptModule will elide this control flow, resulting in allreduce # hooks not being called. If we're running distributed, we need to extract and JIT # the wrapped .module. # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks # to go out of scope, and therefore silently disappear. 
module_to_jit = ssd300.module if args.distributed else ssd300 if args.distributed: ssd300.module = torch.jit.trace(module_to_jit, example_input, check_trace=False) else: ssd300 = torch.jit.trace(module_to_jit, example_input, check_trace=False) # JIT the eval model too ssd300_eval = torch.jit.trace(ssd300_eval, example_input, check_trace=False) # do a dummy fprop & bprop to make sure cudnnFind etc. are timed here ploc, plabel = ssd300(example_input) # produce a single dummy "loss" to make things easier loss = ploc[0, 0, 0] + plabel[0, 0, 0] dloss = torch.randn_like(loss) # Cause cudnnFind for dgrad, wgrad to run loss.backward(dloss) # Necessary import in init from pycocotools.coco import COCO encoder = build_ssd300_coder() evaluator = AsyncEvaluator(num_threads=1) log_end(key=constants.INIT_STOP) ##### END INIT # This is the first place we touch anything related to data ##### START DATA TOUCHING barrier() log_start(key=constants.RUN_START) barrier() train_pipe = prebuild_pipeline(args) train_loader, epoch_size = build_pipeline(args, training=True, pipe=train_pipe) if args.rank == 0: print("epoch size is: ", epoch_size, " images") val_loader, inv_map, cocoGt = build_pipeline(args, training=False) if args.profile_gc_off: gc.disable() gc.collect() ##### END DATA TOUCHING i_eval = 0 block_start_epoch = 1 log_start(key=constants.BLOCK_START, metadata={ 'first_epoch_num': block_start_epoch, 'epoch_count': args.evaluation[i_eval] }) for epoch in range(args.epochs): for p in ssd300.parameters(): p.grad = None if epoch in args.evaluation: # Get the existant state from the train model # * if we use distributed, then we want .module train_model = ssd300.module if args.distributed else ssd300 if args.distributed and args.allreduce_running_stats: if args.rank == 0: print("averaging bn running means and vars") # make sure every node has the same running bn stats before # using them to evaluate, or saving the model for inference world_size = float(torch.distributed.get_world_size()) for bn_name, bn_buf in train_model.named_buffers(recurse=True): if ('running_mean' in bn_name) or ('running_var' in bn_name): torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) bn_buf /= world_size if args.rank == 0: if args.save: print("saving model...") if not os.path.isdir('./models'): os.mkdir('./models') torch.save({"model": ssd300.state_dict()}, "./models/iter_{}.pt".format(iter_num)) ssd300_eval.load_state_dict(train_model.state_dict()) # Note: No longer returns, evaluation is abstracted away inside evaluator coco_eval(args, ssd300_eval, val_loader, cocoGt, encoder, inv_map, epoch, iter_num, evaluator=evaluator) log_end(key=constants.BLOCK_STOP, metadata={'first_epoch_num': block_start_epoch}) if epoch != max(args.evaluation): i_eval += 1 block_start_epoch = epoch + 1 log_start(key=constants.BLOCK_START, metadata={ 'first_epoch_num': block_start_epoch, 'epoch_count': (args.evaluation[i_eval] - args.evaluation[i_eval - 1]) }) if epoch in args.lr_decay_epochs: current_lr *= args.lr_decay_factor print_message( args.rank, "lr decay step #" + str(bisect(args.lr_decay_epochs, epoch))) for param_group in optim.param_groups: param_group['lr'] = current_lr log_start(key=constants.EPOCH_START, metadata={ 'epoch_num': epoch + 1, 'current_iter_num': iter_num }) for i, (img, bbox, label) in enumerate(train_loader): if args.profile_start is not None and iter_num == args.profile_start: torch.cuda.profiler.start() torch.cuda.synchronize() if args.profile_nvtx: torch.autograd._enable_profiler( torch.autograd.ProfilerState.NVTX) 
if args.profile is not None and iter_num == args.profile: if args.profile_start is not None and iter_num >= args.profile_start: # we turned cuda and nvtx profiling on, better turn it off too if args.profile_nvtx: torch.autograd._disable_profiler() torch.cuda.profiler.stop() return if args.warmup is not None: lr_warmup(optim, args.warmup, iter_num, epoch, current_lr, args) if (img is None) or (bbox is None) or (label is None): print("No labels in batch") continue ploc, plabel = ssd300(img) ploc, plabel = ploc.float(), plabel.float() N = img.shape[0] bbox.requires_grad = False label.requires_grad = False # reshape (N*8732X4 -> Nx8732x4) and transpose (Nx8732x4 -> Nx4x8732) bbox = bbox.view(N, -1, 4).transpose(1, 2).contiguous() # reshape (N*8732 -> Nx8732) and cast to Long label = label.view(N, -1).long() loss = loss_func(ploc, plabel, bbox, label) if np.isfinite(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() else: print("model exploded (corrupted by Inf or Nan)") sys.exit() num_elapsed_samples += N if args.rank == 0 and iter_num % args.print_interval == 0: end_elapsed_time = time.time() elapsed_time = end_elapsed_time - start_elapsed_time avg_samples_per_sec = num_elapsed_samples * args.N_gpu / elapsed_time print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\ .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n") last_printed_iter = iter_num start_elapsed_time = time.time() num_elapsed_samples = 0 with apex.amp.scale_loss(loss, optim) as scaled_loss: scaled_loss.backward() if not args.profile_fake_optim: optim.step() # Likely a decent skew here, let's take this opportunity to set the # gradients to None. After DALI integration, playing with the # placement of this is worth trying. for p in ssd300.parameters(): p.grad = None # Don't check every iteration due to cost of broadcast if iter_num % 20 == 0: finished = check_async_evals(args, evaluator, args.threshold) if finished: return True iter_num += 1 train_loader.reset() log_end(key=constants.EPOCH_STOP, metadata={'epoch_num': epoch + 1}) return False
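# The optimizer setup above scales the base learning rate by an integer multiple of
# base_lr = 2.5e-3, because the MLPerf rules only allow integer scaling of the base LR.
# A standalone sketch of that rule, with example numbers that are illustrative only:
def mlperf_scaled_lr(requested_lr, global_batch_size, base_lr=2.5e-3):
    """Return base_lr times an integer multiplier derived from the requested LR at batch 32."""
    requested_lr_multiplier = requested_lr / base_lr
    adjusted_multiplier = max(1, round(requested_lr_multiplier * global_batch_size / 32))
    return base_lr * adjusted_multiplier

# e.g. requesting lr=2.5e-3 at a global batch size of 1024 gives
# max(1, round(1.0 * 1024 / 32)) = 32, so current_lr = 2.5e-3 * 32 = 0.08
print(mlperf_scaled_lr(2.5e-3, 1024))  # ~0.08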
def train300_mlperf_coco(args): args.distributed = args.world_size > 1 from coco import COCO # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() dboxes = dboxes300_coco() encoder = Encoder(dboxes) train_trans = SSDTransformer(dboxes, (300, 300), val=False) val_trans = SSDTransformer(dboxes, (300, 300), val=True) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") cocoGt = COCO(annotation_file=val_annotate) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_coco) else: train_sampler = None train_dataloader = DataLoader(train_coco, batch_size=args.batch_size, shuffle=True, num_workers=4, sampler=train_sampler) ssd300 = SSD300(train_coco.labelnum) if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint) ssd300.load_state_dict(od["model"]) ssd300.train() if use_cuda: ssd300.cuda() loss_func = Loss(dboxes) if use_cuda: loss_func.cuda() if args.distributed: dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size) ssd300 = DistributedDataParallel(ssd300) else: ssd300 = torch.nn.DataParallel(ssd300) optim = torch.optim.SGD(ssd300.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4) print("epoch", "nbatch", "loss") iter_num = args.iteration avg_loss = 0.0 inv_map = {v: k for k, v in val_coco.label_map.items()} for epoch in range(args.epochs): for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader): start = time.time() if iter_num == 160000: print("") print("lr decay step #1") for param_group in optim.param_groups: param_group['lr'] = 1e-4 if iter_num == 200000: print("") print("lr decay step #2") for param_group in optim.param_groups: param_group['lr'] = 1e-5 if use_cuda: img = img.cuda() img = Variable(img, requires_grad=True) ploc, plabel = ssd300(img) trans_bbox = bbox.transpose(1, 2).contiguous() if use_cuda: trans_bbox = trans_bbox.cuda() label = label.cuda() gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(label, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) if not np.isinf(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() optim.zero_grad() loss.backward() optim.step() end = time.time() if nbatch % 10 == 0: print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, Average time: {:.3f} secs"\ .format(iter_num, loss.item(), avg_loss, end - start)) if iter_num in args.evaluation: if not args.no_save: print("") print("saving model...") torch.save( { "model": ssd300.state_dict(), "label_map": train_coco.label_info }, "./models/iter_{}.pt".format(iter_num)) if coco_eval(ssd300, val_coco, cocoGt, encoder, inv_map, args.threshold): return iter_num += 1
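# The training loops above feed the loss ground-truth boxes in channels-first layout:
# the loader yields bbox as N x 8732 x 4 (one row per default box) and the loop
# transposes it to N x 4 x 8732 before wrapping it for the loss. A minimal sketch of
# that target preparation with dummy tensors (the helper name is hypothetical; the
# shapes follow the code above):
import torch

def prepare_ssd_targets(bbox, label):
    """bbox: N x 8732 x 4 float, label: N x 8732 int -> (N x 4 x 8732 float, N x 8732 long)."""
    gloc = bbox.transpose(1, 2).contiguous()
    glabel = label.long()
    return gloc, glabel

dummy_bbox = torch.rand(2, 8732, 4)
dummy_label = torch.randint(0, 81, (2, 8732))
gloc, glabel = prepare_ssd_targets(dummy_bbox, dummy_label)
print(gloc.shape, glabel.shape)  # torch.Size([2, 4, 8732]) torch.Size([2, 8732])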
def train300_mlperf_coco(args): from pycocotools.coco import COCO # Check that GPUs are actually available use_cuda = not args.no_cuda # Setup multi-GPU if necessary args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 if args.distributed: torch.cuda.set_device(args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') local_seed = set_seeds(args) # start timing here if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 validate_group_bn(args.bn_group) # Setup data, defaults dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") # Build the model model_options = { 'backbone': args.backbone, 'use_nhwc': args.nhwc, 'pad_input': args.pad_input, 'bn_group': args.bn_group, } ssd300 = SSD300(args.num_classes, **model_options) if args.checkpoint is not None: load_checkpoint(ssd300, args.checkpoint) ssd300.train() ssd300.cuda() if args.opt_loss: loss_func = OptLoss(dboxes) else: loss_func = Loss(dboxes) loss_func.cuda() if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 if args.use_fp16: ssd300 = network_to_half(ssd300) # Parallelize. Need to do this after network_to_half. if args.distributed: if args.delay_allreduce: print_message(args.local_rank, "Delaying allreduces to the end of backward()") ssd300 = DDP(ssd300, gradient_predivide_factor=N_gpu / 8.0, delay_allreduce=args.delay_allreduce, retain_allreduce_buffers=args.use_fp16) # Create optimizer. This must also be done after network_to_half. global_batch_size = (N_gpu * args.batch_size) mlperf_print(key=mlperf_compliance.constants.MODEL_BN_SPAN, value=args.bn_group * args.batch_size) mlperf_print(key=mlperf_compliance.constants.GLOBAL_BATCH_SIZE, value=global_batch_size) # mlperf only allows base_lr scaled by an integer base_lr = 2.5e-3 requested_lr_multiplier = args.lr / base_lr adjusted_multiplier = max( 1, round(requested_lr_multiplier * global_batch_size / 32)) current_lr = base_lr * adjusted_multiplier current_momentum = 0.9 current_weight_decay = args.wd static_loss_scale = 128. if args.use_fp16: if args.distributed and not args.delay_allreduce: # We can't create the flat master params yet, because we need to # imitate the flattened bucket structure that DDP produces. 
optimizer_created = False else: model_buckets = [ [ p for p in ssd300.parameters() if p.requires_grad and p.type() == "torch.cuda.HalfTensor" ], [ p for p in ssd300.parameters() if p.requires_grad and p.type() == "torch.cuda.FloatTensor" ] ] flat_master_buckets = create_flat_master(model_buckets) optim = torch.optim.SGD(flat_master_buckets, lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) optimizer_created = True else: optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) optimizer_created = True mlperf_print(key=mlperf_compliance.constants.OPT_BASE_LR, value=current_lr) mlperf_print(key=mlperf_compliance.constants.OPT_WEIGHT_DECAY, value=current_weight_decay) if args.warmup is not None: mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_STEPS, value=args.warmup) mlperf_print(key=mlperf_compliance.constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor) # Model is completely finished -- need to create separate copies, preserve parameters across # them, and jit ssd300_eval = SSD300(args.num_classes, backbone=args.backbone, use_nhwc=args.nhwc, pad_input=args.pad_input).cuda() if args.use_fp16: ssd300_eval = network_to_half(ssd300_eval) # Get the existant state from the train model # * if we use distributed, then we want .module train_model = ssd300.module if args.distributed else ssd300 ssd300_eval.load_state_dict(train_model.state_dict()) ssd300_eval.eval() print_message(args.local_rank, "epoch", "nbatch", "loss") eval_points = np.array(args.evaluation) * 32 / global_batch_size eval_points = list(map(int, list(eval_points))) iter_num = args.iteration avg_loss = 0.0 start_elapsed_time = time.time() last_printed_iter = args.iteration num_elapsed_samples = 0 # Generate normalization tensors mean, std = generate_mean_std(args) dummy_overflow_buf = torch.cuda.IntTensor([0]) def step_maybe_fp16_maybe_distributed(optim): if args.use_fp16: if args.distributed: for flat_master, allreduce_buffer in zip( flat_master_buckets, ssd300.allreduce_buffers): if allreduce_buffer is None: raise RuntimeError("allreduce_buffer is None") flat_master.grad = allreduce_buffer.float() flat_master.grad.data.mul_(1. / static_loss_scale) else: for flat_master, model_bucket in zip(flat_master_buckets, model_buckets): flat_grad = apex_C.flatten( [m.grad.data for m in model_bucket]) flat_master.grad = flat_grad.float() flat_master.grad.data.mul_(1. / static_loss_scale) optim.step() if args.use_fp16: # Use multi-tensor scale instead of loop & individual parameter copies for model_bucket, flat_master in zip(model_buckets, flat_master_buckets): multi_tensor_applier( amp_C.multi_tensor_scale, dummy_overflow_buf, [ apex_C.unflatten(flat_master.data, model_bucket), model_bucket ], 1.0) input_c = 4 if args.pad_input else 3 example_shape = [args.batch_size, 300, 300, input_c ] if args.nhwc else [args.batch_size, input_c, 300, 300] example_input = torch.randn(*example_shape).cuda() if args.use_fp16: example_input = example_input.half() if args.jit: # DDP has some Python-side control flow. If we JIT the entire DDP-wrapped module, # the resulting ScriptModule will elide this control flow, resulting in allreduce # hooks not being called. If we're running distributed, we need to extract and JIT # the wrapped .module. # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks # to go out of scope, and therefore silently disappear. 
module_to_jit = ssd300.module if args.distributed else ssd300 if args.distributed: ssd300.module = torch.jit.trace(module_to_jit, example_input) else: ssd300 = torch.jit.trace(module_to_jit, example_input) # JIT the eval model too ssd300_eval = torch.jit.trace(ssd300_eval, example_input) # do a dummy fprop & bprop to make sure cudnnFind etc. are timed here ploc, plabel = ssd300(example_input) # produce a single dummy "loss" to make things easier loss = ploc[0, 0, 0] + plabel[0, 0, 0] dloss = torch.randn_like(loss) # Cause cudnnFind for dgrad, wgrad to run loss.backward(dloss) mlperf_print(key=mlperf_compliance.constants.INIT_STOP, sync=True) ##### END INIT # This is the first place we touch anything related to data ##### START DATA TOUCHING mlperf_print(key=mlperf_compliance.constants.RUN_START, sync=True) barrier() cocoGt = COCO(annotation_file=val_annotate, use_ext=True) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) if args.distributed: val_sampler = GeneralDistributedSampler(val_coco, pad=False) else: val_sampler = None if args.no_dali: train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) if args.distributed: train_sampler = GeneralDistributedSampler(train_coco, pad=False) else: train_sampler = None train_loader = DataLoader(train_coco, batch_size=args.batch_size * args.input_batch_multiplier, shuffle=(train_sampler is None), sampler=train_sampler, num_workers=args.num_workers, collate_fn=partial(my_collate, is_training=True)) else: train_pipe = COCOPipeline(args.batch_size * args.input_batch_multiplier, args.local_rank, train_coco_root, train_annotate, N_gpu, num_threads=args.num_workers, output_fp16=args.use_fp16, output_nhwc=args.nhwc, pad_output=args.pad_input, seed=local_seed - 2**31, use_nvjpeg=args.use_nvjpeg, use_roi=args.use_roi_decode, dali_cache=args.dali_cache, dali_async=(not args.dali_sync)) print_message(args.local_rank, "time_check a: {secs:.9f}".format(secs=time.time())) train_pipe.build() print_message(args.local_rank, "time_check b: {secs:.9f}".format(secs=time.time())) test_run = train_pipe.run() train_loader = SingleDaliIterator( train_pipe, [ 'images', DALIOutput('bboxes', False, True), DALIOutput('labels', True, True) ], train_pipe.epoch_size()['train_reader'], ngpu=N_gpu) train_loader = EncodingInputIterator(train_loader, dboxes=encoder.dboxes.cuda(), nhwc=args.nhwc, fake_input=args.fake_input, no_dali=args.no_dali) if args.input_batch_multiplier > 1: train_loader = RateMatcher(input_it=train_loader, output_size=args.batch_size) val_dataloader = DataLoader( val_coco, batch_size=args.eval_batch_size, shuffle=False, # Note: distributed sampler is shuffled :( sampler=val_sampler, num_workers=args.num_workers) inv_map = {v: k for k, v in val_coco.label_map.items()} ##### END DATA TOUCHING i_eval = 0 first_epoch = 1 mlperf_print(key=mlperf_compliance.constants.BLOCK_START, metadata={ 'first_epoch_num': first_epoch, 'epoch_count': args.evaluation[i_eval] * 32 / train_pipe.epoch_size()['train_reader'] }, sync=True) for epoch in range(args.epochs): mlperf_print(key=mlperf_compliance.constants.EPOCH_START, metadata={'epoch_num': epoch + 1}, sync=True) for p in ssd300.parameters(): p.grad = None for i, (img, bbox, label) in enumerate(train_loader): if args.profile_start is not None and iter_num == args.profile_start: torch.cuda.profiler.start() torch.cuda.synchronize() if args.profile_nvtx: torch.autograd._enable_profiler( torch.autograd.ProfilerState.NVTX) if 
args.profile is not None and iter_num == args.profile: if args.profile_start is not None and iter_num >= args.profile_start: # we turned cuda and nvtx profiling on, better turn it off too if args.profile_nvtx: torch.autograd._disable_profiler() torch.cuda.profiler.stop() return if args.warmup is not None and optimizer_created: lr_warmup(optim, args.warmup, iter_num, epoch, current_lr, args) if iter_num == ((args.decay1 * 1000 * 32) // global_batch_size): print_message(args.local_rank, "lr decay step #1") current_lr *= 0.1 for param_group in optim.param_groups: param_group['lr'] = current_lr if iter_num == ((args.decay2 * 1000 * 32) // global_batch_size): print_message(args.local_rank, "lr decay step #2") current_lr *= 0.1 for param_group in optim.param_groups: param_group['lr'] = current_lr if (img is None) or (bbox is None) or (label is None): print("No labels in batch") continue ploc, plabel = ssd300(img) ploc, plabel = ploc.float(), plabel.float() N = img.shape[0] gloc, glabel = Variable(bbox, requires_grad=False), \ Variable(label, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) if np.isfinite(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() else: print("model exploded (corrupted by Inf or Nan)") sys.exit() num_elapsed_samples += N if args.local_rank == 0 and iter_num % args.print_interval == 0: end_elapsed_time = time.time() elapsed_time = end_elapsed_time - start_elapsed_time avg_samples_per_sec = num_elapsed_samples * N_gpu / elapsed_time print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f}"\ .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec), end="\n") last_printed_iter = iter_num start_elapsed_time = time.time() num_elapsed_samples = 0 # loss scaling if args.use_fp16: loss = loss * static_loss_scale loss.backward() if not optimizer_created: # Imitate the model bucket structure created by DDP. # These will already be split by type (float or half). model_buckets = [] for bucket in ssd300.active_i_buckets: model_buckets.append([]) for active_i in bucket: model_buckets[-1].append( ssd300.active_params[active_i]) flat_master_buckets = create_flat_master(model_buckets) optim = torch.optim.SGD(flat_master_buckets, lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) optimizer_created = True # Skip this first iteration because flattened allreduce buffers are not yet created. # step_maybe_fp16_maybe_distributed(optim) else: step_maybe_fp16_maybe_distributed(optim) # Likely a decent skew here, let's take this opportunity to set the gradients to None. # After DALI integration, playing with the placement of this is worth trying. 
for p in ssd300.parameters(): p.grad = None if iter_num in eval_points: # Get the existant state from the train model # * if we use distributed, then we want .module train_model = ssd300.module if args.distributed else ssd300 if args.distributed and args.allreduce_running_stats: if get_rank() == 0: print("averaging bn running means and vars") # make sure every node has the same running bn stats before # using them to evaluate, or saving the model for inference world_size = float(torch.distributed.get_world_size()) for bn_name, bn_buf in train_model.named_buffers( recurse=True): if ('running_mean' in bn_name) or ('running_var' in bn_name): torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) bn_buf /= world_size if get_rank() == 0: if not args.no_save: print("saving model...") torch.save( { "model": ssd300.state_dict(), "label_map": val_coco.label_info }, "./models/iter_{}.pt".format(iter_num)) ssd300_eval.load_state_dict(train_model.state_dict()) succ = coco_eval( ssd300_eval, val_dataloader, cocoGt, encoder, inv_map, args.threshold, epoch, iter_num, args.eval_batch_size, use_fp16=args.use_fp16, local_rank=args.local_rank if args.distributed else -1, N_gpu=N_gpu, use_nhwc=args.nhwc, pad_input=args.pad_input) mlperf_print(key=mlperf_compliance.constants.BLOCK_STOP, metadata={'first_epoch_num': first_epoch}, sync=True) if succ: return True if iter_num != max(eval_points): i_eval += 1 first_epoch = epoch + 1 mlperf_print(key=mlperf_compliance.constants.BLOCK_START, metadata={ 'first_epoch_num': first_epoch, 'epoch_count': (args.evaluation[i_eval] - args.evaluation[i_eval - 1]) * 32 / train_pipe.epoch_size()['train_reader'] }, sync=True) iter_num += 1 if args.max_iter > 0: if iter_num > args.max_iter: break train_loader.reset() mlperf_print(key=mlperf_compliance.constants.EPOCH_STOP, metadata={'epoch_num': epoch + 1}, sync=True) return False
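# Before evaluating (or saving) in the distributed paths above, the BatchNorm running
# statistics are averaged across ranks so every worker sees the same model. A minimal
# sketch of that synchronization as a standalone helper, assuming torch.distributed is
# already initialized (the helper name is hypothetical):
import torch.distributed as dist

def sync_bn_running_stats(model):
    """All-reduce BN running_mean/running_var buffers and average them over the ranks."""
    world_size = float(dist.get_world_size())
    for name, buf in model.named_buffers(recurse=True):
        if ('running_mean' in name) or ('running_var' in name):
            dist.all_reduce(buf, op=dist.ReduceOp.SUM)
            buf /= world_size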
def train300_mlperf_coco(exp, args): from coco import COCO device = exp.get_device() chrono = exp.chrono() dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False) val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) # mlperf_log.ssd_print(key=# mlperf_log.INPUT_SIZE, value=input_size) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") cocoGt = COCO(annotation_file=val_annotate) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) #print("Number of labels: {}".format(train_coco.labelnum)) train_dataloader = DataLoader(train_coco, batch_size=args.batch_size, shuffle=True, num_workers=4) # set shuffle=True in DataLoader # mlperf_log.ssd_print(key=# mlperf_log.INPUT_SHARD, value=None) # mlperf_log.ssd_print(key=# mlperf_log.INPUT_ORDER) # mlperf_log.ssd_print(key=# mlperf_log.INPUT_BATCH_SIZE, value=args.batch_size) ssd300 = SSD300(train_coco.labelnum) if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint) ssd300.load_state_dict(od["model"]) ssd300.train() ssd300 = ssd300.to(device) loss_func = Loss(dboxes).to(device) current_lr = 1e-3 current_momentum = 0.9 current_weight_decay = 5e-4 optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=current_weight_decay) # mlperf_log.ssd_print(key=# mlperf_log.OPT_NAME, value="SGD") # mlperf_log.ssd_print(key=# mlperf_log.OPT_LR, value=current_lr) # mlperf_log.ssd_print(key=# mlperf_log.OPT_MOMENTUM, value=current_momentum) # mlperf_log.ssd_print(key=# mlperf_log.OPT_WEIGHT_DECAY, value=current_weight_decay) print("epoch", "nbatch", "loss") iter_num = args.iteration avg_loss = 0.0 inv_map = {v: k for k, v in val_coco.label_map.items()} # mlperf_log.ssd_print(key=# mlperf_log.TRAIN_LOOP) for epoch in range(args.repeat): # mlperf_log.ssd_print(key=# mlperf_log.TRAIN_EPOCH, value=epoch) with chrono.time('train') as t: for nbatch, (img, img_size, bbox, label) in enumerate(train_dataloader): if nbatch > args.number: break img = Variable(img.to(device), requires_grad=True) ploc, plabel = ssd300(img) trans_bbox = bbox.transpose(1, 2).contiguous() trans_bbox = trans_bbox.to(device) label = label.to(device) gloc = Variable(trans_bbox, requires_grad=False) glabel = Variable(label, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) if not np.isinf(loss.item()): avg_loss = 0.999 * avg_loss + 0.001 * loss.item() exp.log_batch_loss(loss) optim.zero_grad() loss.backward() optim.step() iter_num += 1 exp.show_eta(epoch, t) exp.report() return False
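# Every training loop in these scripts reports a smoothed loss computed as an
# exponential moving average with decay 0.999, skipping losses that are not finite
# (the exact check varies between variants). A tiny sketch of that smoothing,
# illustrative only:
import math

def update_avg_loss(avg_loss, loss_value, decay=0.999):
    """Exponential moving average of the loss; non-finite samples leave it unchanged."""
    if not math.isfinite(loss_value):
        return avg_loss
    return decay * avg_loss + (1.0 - decay) * loss_value

# e.g. starting from 0.0 and feeding a constant loss of 5.0 drifts slowly toward 5.0:
avg = 0.0
for _ in range(3):
    avg = update_avg_loss(avg, 5.0)
print(round(avg, 6))  # 0.014985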
def train300_mlperf_coco(args): global torch from coco import COCO # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() args.distributed = False if args.use_hpu: if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.world_size = int(os.environ['WORLD_SIZE']) print("world_size = {}".format(args.world_size)) print("distributed={}".format(args.distributed)) if use_cuda: try: from apex.parallel import DistributedDataParallel as DDP if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 except: raise ImportError( "Please install APEX from https://github.com/nvidia/apex") use_hpu = args.use_hpu hpu_channels_last = args.hpu_channels_last hpu_lazy_mode = args.hpu_lazy_mode is_hmp = args.is_hmp device = torch.device('cpu') data_loader_type = DataLoader if use_hpu: device = torch.device('hpu') if args.distributed: os.environ["MAX_WAIT_ATTEMPTS"] = "90" if hpu_lazy_mode: os.environ["PT_HPU_LAZY_MODE"] = "1" else: os.environ["PT_HPU_LAZY_MODE"] = "2" if is_hmp: if not args.hmp_bf16: raise IOError("Please provide list of BF16 ops") if not args.hmp_fp32: raise IOError("Please provide list of FP32 ops") from habana_frameworks.torch.hpex import hmp hmp.convert(opt_level=args.hmp_opt_level, bf16_file_path=args.hmp_bf16, fp32_file_path=args.hmp_fp32, isVerbose=args.hmp_verbose) from habana_frameworks.torch.utils.library_loader import load_habana_module load_habana_module() # TODO - add dataloader local_seed = args.seed if args.distributed: # necessary pytorch imports import torch.utils.data.distributed import torch.distributed as dist if use_hpu: args.dist_backend = 'hccl' import habana_frameworks.torch.core.hccl os.environ["ID"] = os.environ["RANK"] dist.init_process_group(args.dist_backend, init_method='env://') # set seeds properly args.seed = broadcast_seeds(args.seed, device, use_hpu=True) local_seed = (args.seed + dist.get_rank()) % 2**32 elif args.no_cuda: device = torch.device('cpu') else: torch.cuda.set_device(args.local_rank) device = torch.device('cuda') dist.init_process_group(backend='nccl', init_method='env://') # set seeds properly args.seed = broadcast_seeds(args.seed, device) local_seed = (args.seed + dist.get_rank()) % 2**32 mllogger.event(key=mllog_const.SEED, value=local_seed) torch.manual_seed(local_seed) np.random.seed(seed=local_seed) random.seed(local_seed) # amorgenstern torch.cuda.manual_seed(local_seed) # amorgenstern args.rank = dist.get_rank() if args.distributed else args.local_rank print("args.rank = {}".format(args.rank)) print("local rank = {}".format(args.local_rank)) print("distributed={}".format(args.distributed)) if use_hpu and is_hmp: with hmp.disable_casts(): dboxes = dboxes300_coco() encoder = Encoder(dboxes) else: dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 if use_hpu and is_hmp: with hmp.disable_casts(): train_trans = SSDTransformer( dboxes, (input_size, input_size), val=False, num_cropping_iterations=args.num_cropping_iterations) val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) else: train_trans = SSDTransformer( dboxes, (input_size, input_size), val=False, num_cropping_iterations=args.num_cropping_iterations) val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = 
os.path.join(args.data, "train2017") if use_hpu and is_hmp: with hmp.disable_casts(): cocoGt = COCO(annotation_file=val_annotate) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) else: cocoGt = COCO(annotation_file=val_annotate) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco)) mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco)) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_coco) else: train_sampler = None if use_hpu: # patch torch cuda functions that are being unconditionally invoked # in the multiprocessing data loader torch.cuda.current_device = lambda: None torch.cuda.set_device = lambda x: None train_dataloader = data_loader_type(train_coco, batch_size=args.batch_size, shuffle=(train_sampler is None), sampler=train_sampler, num_workers=args.num_workers) # set shuffle=True in DataLoader if args.rank == 0: val_dataloader = data_loader_type(val_coco, batch_size=args.val_batch_size or args.batch_size, shuffle=False, sampler=None, num_workers=args.num_workers) else: val_dataloader = None ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone) if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint, map_location=torch.device('cpu')) ssd300.load_state_dict(od["model"]) ssd300.train() if use_cuda: ssd300.cuda() if use_hpu and is_hmp: with hmp.disable_casts(): loss_func = Loss(dboxes, use_hpu=use_hpu, hpu_device=device) else: loss_func = Loss(dboxes, use_hpu=use_hpu, hpu_device=device) if use_cuda: loss_func.cuda() if use_hpu: ssd300.to(device) loss_func.to(device) if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 global_batch_size = N_gpu * args.batch_size mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size) # Reference doesn't support group batch norm, so bn_span==local_batch_size mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size) current_lr = args.lr * (global_batch_size / 32) assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits" fragment_size = args.batch_size // args.batch_splits if args.batch_splits != 1: print("using gradient accumulation with fragments of size {}".format( fragment_size)) current_momentum = 0.9 sgd_optimizer = torch.optim.SGD if use_hpu and hpu_lazy_mode: from habana_frameworks.torch.hpex.optimizers import FusedSGD sgd_optimizer = FusedSGD optim = sgd_optimizer(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=args.weight_decay) if use_hpu: permute_params(model=ssd300, to_filters_last=True, lazy_mode=hpu_lazy_mode) permute_momentum(optimizer=optim, to_filters_last=True, lazy_mode=hpu_lazy_mode) ssd_print(device=device, use_hpu=use_hpu, key=mllog_const.OPT_BASE_LR, value=current_lr) ssd_print(device=device, use_hpu=use_hpu, key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay) # parallelize if args.distributed: if use_hpu: ssd300 = torch.nn.parallel.DistributedDataParallel( ssd300, bucket_cap_mb=100, broadcast_buffers=False, gradient_as_bucket_view=True) else: ssd300 = DDP(ssd300) iter_num = args.iteration end_iter_num = args.end_iteration if end_iter_num: print("--end-iteration set to: {}".format(end_iter_num)) assert end_iter_num > iter_num, 
"--end-iteration must have a value > --iteration" avg_loss = 0.0 if use_hpu: loss_iter = list() inv_map = {v: k for k, v in val_coco.label_map.items()} success = torch.zeros(1) if use_cuda: success = success.cuda() if use_hpu: success = success.to(device) if args.warmup: nonempty_imgs = len(train_coco) wb = int(args.warmup * nonempty_imgs / (N_gpu * args.batch_size)) ssd_print(device=device, use_hpu=use_hpu, key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb) warmup_step = lambda iter_num, current_lr: lr_warmup( optim, wb, iter_num, current_lr, args) else: warmup_step = lambda iter_num, current_lr: None ssd_print(device=device, use_hpu=use_hpu, key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor) ssd_print(device=device, use_hpu=use_hpu, key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=args.lr_decay_schedule) mllogger.start(key=mllog_const.BLOCK_START, metadata={ mllog_const.FIRST_EPOCH_NUM: 1, mllog_const.EPOCH_COUNT: args.epochs }) optim.zero_grad(set_to_none=True) if use_hpu: start = time.time() for epoch in range(args.epochs): mllogger.start(key=mllog_const.EPOCH_START, metadata={mllog_const.EPOCH_NUM: epoch}) # set the epoch for the sampler if args.distributed: train_sampler.set_epoch(epoch) if epoch in args.lr_decay_schedule: current_lr *= 0.1 print("") print("lr decay step #{num}".format( num=args.lr_decay_schedule.index(epoch) + 1)) for param_group in optim.param_groups: param_group['lr'] = current_lr for nbatch, (img, img_id, img_size, bbox, label) in enumerate(train_dataloader): current_batch_size = img.shape[0] # Split batch for gradient accumulation img = torch.split(img, fragment_size) bbox = torch.split(bbox, fragment_size) label = torch.split(label, fragment_size) for (fimg, fbbox, flabel) in zip(img, bbox, label): current_fragment_size = fimg.shape[0] if not use_hpu: trans_bbox = fbbox.transpose(1, 2).contiguous() if use_cuda: fimg = fimg.cuda() trans_bbox = trans_bbox.cuda() flabel = flabel.cuda() if use_hpu: fimg = fimg.to(device) if hpu_channels_last: fimg = fimg.contiguous( memory_format=torch.channels_last) if hpu_lazy_mode: mark_step() if is_hmp: with hmp.disable_casts(): #TODO revert after SW-58188 is fixed trans_bbox = fbbox.to(device).transpose( 1, 2).contiguous() flabel = flabel.to(device) else: #TODO revert after SW-58188 is fixed trans_bbox = fbbox.to(device).transpose( 1, 2).contiguous() flabel = flabel.to(device) fimg = Variable(fimg, requires_grad=True) if args.lowp: # amorgenstern import lowp with lowp.Lowp(mode='BF16', warn_patched=True, warn_not_patched=True): ploc, plabel = ssd300(fimg) gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(flabel, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) else: ploc, plabel = ssd300(fimg) if use_hpu and is_hmp: with hmp.disable_casts(): gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(flabel, requires_grad=False) loss = loss_func(ploc.float(), plabel.float(), gloc, glabel) else: gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(flabel, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) loss = loss * (current_fragment_size / current_batch_size ) # weighted mean if use_hpu and hpu_lazy_mode and args.distributed: mark_step() loss.backward() if use_hpu and hpu_lazy_mode: mark_step() warmup_step(iter_num, current_lr) if use_hpu and is_hmp: with hmp.disable_casts(): optim.step() else: optim.step() optim.zero_grad(set_to_none=True) if use_hpu: loss_iter.append(loss.clone().detach()) else: if not np.isinf(loss.item()): avg_loss = 
0.999 * avg_loss + 0.001 * loss.item() if use_hpu and hpu_lazy_mode: mark_step() if use_hpu: if args.log_interval and not iter_num % args.log_interval: cur_loss = 0.0 for i, x in enumerate(loss_iter): cur_loss = x.cpu().item() if not np.isinf(cur_loss): avg_loss = 0.999 * avg_loss + 0.001 * cur_loss if args.rank == 0: print("Rank: {:6d}, Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ .format(args.rank, iter_num, cur_loss, avg_loss)) loss_iter = list() else: if args.rank == 0 and args.log_interval and not iter_num % args.log_interval: print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ .format(iter_num, loss.item(), avg_loss)) iter_num += 1 if use_hpu and iter_num == 50: start = time.time() if end_iter_num and iter_num >= end_iter_num: if use_hpu: print("Training Ended, total time: {:.2f} s".format( time.time() - start)) break if (args.val_epochs and (epoch+1) in args.val_epochs) or \ (args.val_interval and not (epoch+1) % args.val_interval): if args.distributed: world_size = float(dist.get_world_size()) for bn_name, bn_buf in ssd300.module.named_buffers( recurse=True): if ('running_mean' in bn_name) or ('running_var' in bn_name): dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM) bn_buf /= world_size ssd_print(device=device, use_hpu=use_hpu, key=mllog_const.MODEL_BN_SPAN, value=bn_buf) if args.rank == 0: if use_hpu: print("Training Ended, total time: {:.2f} s".format( time.time() - start)) if not args.no_save: print("") print("saving model...") if use_hpu: permute_params(model=ssd300, to_filters_last=False, lazy_mode=hpu_lazy_mode) ssd300_copy = SSD300( train_coco.labelnum, model_path=args.pretrained_backbone) if args.distributed: ssd300_copy.load_state_dict( ssd300.module.state_dict()) else: ssd300_copy.load_state_dict(ssd300.state_dict()) torch.save( { "model": ssd300_copy.state_dict(), "label_map": train_coco.label_info }, "./models/iter_{}.pt".format(iter_num)) permute_params(model=ssd300, to_filters_last=True, lazy_mode=hpu_lazy_mode) else: torch.save( { "model": ssd300.state_dict(), "label_map": train_coco.label_info }, "./models/iter_{}.pt".format(iter_num)) if coco_eval(ssd300, val_dataloader, cocoGt, encoder, inv_map, args.threshold, epoch + 1, iter_num, log_interval=args.log_interval, use_cuda=use_cuda, use_hpu=use_hpu, hpu_device=device, is_hmp=is_hmp, hpu_channels_last=hpu_channels_last, hpu_lazy_mode=hpu_lazy_mode, nms_valid_thresh=args.nms_valid_thresh): success = torch.ones(1) if use_cuda: success = success.cuda() if use_hpu: success = success.to(device) if args.distributed: dist.broadcast(success, 0) if success[0]: return True mllogger.end(key=mllog_const.EPOCH_STOP, metadata={mllog_const.EPOCH_NUM: epoch}) mllogger.end(key=mllog_const.BLOCK_STOP, metadata={ mllog_const.FIRST_EPOCH_NUM: 1, mllog_const.EPOCH_COUNT: args.epochs }) return False
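# The HPU variant above implements gradient accumulation: each batch is split into
# fragments of size batch_size // batch_splits, every fragment's (mean) loss is weighted
# by fragment_size / batch_size so the accumulated gradient matches one full-batch step,
# and optim.step() runs once per full batch. A minimal sketch of that pattern with a
# generic model and loss (names here are illustrative, not the classes used above):
import torch

def accumulated_step(model, loss_fn, optim, imgs, targets, fragment_size):
    """One optimizer step over `imgs`, processed in micro-batches of `fragment_size`."""
    batch_size = imgs.shape[0]
    optim.zero_grad(set_to_none=True)
    for fimg, ftgt in zip(torch.split(imgs, fragment_size),
                          torch.split(targets, fragment_size)):
        out = model(fimg)
        # weighted mean: each fragment contributes in proportion to its share of the batch
        loss = loss_fn(out, ftgt) * (fimg.shape[0] / batch_size)
        loss.backward()
    optim.step()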