def set_seed_distributed(local_seed):
    # single-element tensor with the local seed in it
    rank_0_seed = nd.full((1), local_seed, dtype=np.int32)
    if hvd.size() > 1:
        rank_0_seed = hvd.broadcast_(tensor=rank_0_seed,
                                     root_rank=0,
                                     name="broadcast_the_seed")
        nd.ndarray.waitall()
    local_seed = (rank_0_seed[0].asscalar() + hvd.rank()) % 2**31
    log_event(key=mlperf_constants.SEED, value=local_seed)
    random.seed(local_seed)
    np.random.seed(local_seed)
    mx.random.seed(local_seed)
    return local_seed
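# A minimal illustrative sketch (not part of the code above): given rank 0's
# seed, every rank derives its own seed the same way set_seed_distributed does,
# so the whole job is reproducible from a single seed value. The helper below
# is hypothetical and only mirrors the (seed + rank) % 2**31 rule.
def derive_rank_seeds_example(rank_0_seed, world_size):
    # one deterministic, distinct seed per rank
    return [(rank_0_seed + rank) % 2**31 for rank in range(world_size)]

# derive_rank_seeds_example(12345, 4) -> [12345, 12346, 12347, 12348]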
def sgd_trainer(net, learning_rate, weight_decay, momentum, precision,
                fp16_loss_scale, gradient_predivide_factor, num_groups,
                profile_no_horovod):
    # Trainer
    if not profile_no_horovod:
        trainer = hvd.DistributedTrainer(
            net.collect_params(), 'sgd', {
                'learning_rate': learning_rate,
                'wd': weight_decay,
                'momentum': momentum,
                'multi_precision': precision == 'fp16',
                'rescale_grad': 1.0 / fp16_loss_scale if precision == 'fp16' else 1.0
            },
            gradient_predivide_factor=gradient_predivide_factor,
            num_groups=num_groups)
    else:
        trainer = mx.gluon.Trainer(
            net.collect_params(), 'sgd', {
                'learning_rate': learning_rate,
                'wd': weight_decay,
                'momentum': momentum,
                'multi_precision': precision == 'fp16',
                'rescale_grad': 1.0 / fp16_loss_scale if precision == 'fp16' else 1.0
            },
            kvstore=None)
    log_event(key=mlperf_constants.OPT_WEIGHT_DECAY, value=weight_decay)
    if precision == 'amp':
        amp.init_trainer(trainer)
    return trainer
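# Minimal sketch of the static loss-scaling convention assumed by sgd_trainer
# (the numbers are made up): with precision == 'fp16' the loss is multiplied by
# fp16_loss_scale before backward() elsewhere in the training code, and
# rescale_grad = 1 / fp16_loss_scale divides it back out in the optimizer, so
# the applied update is unchanged while fp16 gradients avoid underflow.
fp16_loss_scale = 128.0
raw_grad = 3.0e-6                                     # hypothetical true gradient
scaled_grad = raw_grad * fp16_loss_scale              # what backward() produces
applied_grad = scaled_grad * (1.0 / fp16_loss_scale)  # == raw_grad again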
def __init__(self,
             learning_rate=_MLPERF_BASE_LR,
             decay_factor=None,
             decay_epochs=None,
             warmup_factor=None,
             warmup_epochs=None,
             epoch_size=None,
             global_batch_size=None):
    if decay_epochs:
        assert decay_factor is not None, \
            "decay_factor can't be None when decay_epochs is given"
    if warmup_epochs:
        assert warmup_factor is not None, \
            "warmup_factor can't be None when warmup_epochs is given"

    self.lr = learning_rate or self._MLPERF_BASE_LR
    self.decay_factor = decay_factor
    self.decay_epochs = decay_epochs
    self.warmup_factor = warmup_factor
    self.warmup_epochs = warmup_epochs
    self.epoch_size = epoch_size
    self.global_batch_size = global_batch_size

    # convert warmup epochs to iterations
    self.warmup_iters = None
    if self.warmup_epochs:
        self.warmup_iters = int(self.warmup_epochs * self.epoch_size /
                                self.global_batch_size)
        log_event(key=mlperf_constants.OPT_LR_WARMUP_STEPS,
                  value=self.warmup_iters)
        log_event(key=mlperf_constants.OPT_LR_WARMUP_FACTOR,
                  value=self.warmup_factor)

    self.lr = self.mlperf_adjusted_lr(requested_lr=self.lr,
                                      global_batch_size=global_batch_size)
    log_event(key=mlperf_constants.OPT_BASE_LR, value=self.lr)
    log_event(key=mlperf_constants.OPT_LR_DECAY_BOUNDARY_EPOCHS,
              value=self.decay_epochs)
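# Worked example of the warmup conversion above (all numbers are assumptions,
# not taken from the scheduler):
# warmup_iters = int(warmup_epochs * epoch_size / global_batch_size)
epoch_size = 117266        # assumed number of training samples per epoch
global_batch_size = 1024   # assumed global batch size
warmup_epochs = 3.5        # assumed warmup length in epochs
warmup_iters = int(warmup_epochs * epoch_size / global_batch_size)  # -> 400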
def main(args):
    if not torch.cuda.is_available():
        raise NotImplementedError('Training on CPU is not supported')
    torch.cuda.set_device(args.device_id)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'transformer.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_start(key=constants.INIT_START, log_all_ranks=True)

    # preinit and warmup streams/groups for allreduce communicators
    allreduce_communicators = None
    if args.distributed_world_size > 1 and args.enable_parallel_backward_allred_opt:
        allreduce_groups = [
            torch.distributed.new_group()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        allreduce_streams = [
            torch.cuda.Stream()
            for _ in range(args.parallel_backward_allred_cuda_nstreams)
        ]
        for group, stream in zip(allreduce_groups, allreduce_streams):
            with torch.cuda.stream(stream):
                torch.distributed.all_reduce(torch.cuda.FloatTensor(1),
                                             group=group)
        allreduce_communicators = (allreduce_groups, allreduce_streams)

    if args.max_tokens is None:
        args.max_tokens = 6000

    print(args)

    log_event(key=constants.GLOBAL_BATCH_SIZE,
              value=args.max_tokens * args.distributed_world_size)
    log_event(key=constants.OPT_NAME, value=args.optimizer)
    assert len(args.lr) == 1
    log_event(key=constants.OPT_BASE_LR,
              value=args.lr[0] if len(args.lr) == 1 else args.lr)
    log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup_updates)
    assert args.max_source_positions == args.max_target_positions
    log_event(key=constants.MAX_SEQUENCE_LENGTH,
              value=args.max_target_positions,
              metadata={'method': 'discard'})
    log_event(key=constants.OPT_ADAM_BETA_1, value=eval(args.adam_betas)[0])
    log_event(key=constants.OPT_ADAM_BETA_2, value=eval(args.adam_betas)[1])
    log_event(key=constants.OPT_ADAM_EPSILON, value=args.adam_eps)
    log_event(key=constants.SEED, value=args.seed)

    # L2 Sector Promotion
    pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
    result = ctypes.CDLL('libcudart.so').cudaDeviceSetLimit(
        ctypes.c_int(0x05), ctypes.c_int(128))
    result = ctypes.CDLL('libcudart.so').cudaDeviceGetLimit(
        pValue, ctypes.c_int(0x05))

    worker_seeds, shuffling_seeds = setup_seeds(
        args.seed,
        args.max_epoch + 1,
        torch.device('cuda'),
        args.distributed_rank,
        args.distributed_world_size,
    )
    worker_seed = worker_seeds[args.distributed_rank]
    print(f'Worker {args.distributed_rank} is using worker seed: {worker_seed}')
    torch.manual_seed(worker_seed)

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print('| model {}, criterion {}'.format(args.arch,
                                            criterion.__class__.__name__))
    print('| num. model params: {}'.format(
        sum(p.numel() for p in model.parameters())))

    # Build trainer
    if args.fp16:
        if args.distributed_weight_update != 0:
            from fairseq.fp16_trainer import DistributedFP16Trainer
            trainer = DistributedFP16Trainer(
                args,
                task,
                model,
                criterion,
                allreduce_communicators=allreduce_communicators)
        else:
            from fairseq.fp16_trainer import FP16Trainer
            trainer = FP16Trainer(
                args,
                task,
                model,
                criterion,
                allreduce_communicators=allreduce_communicators)
    else:
        if torch.cuda.get_device_capability(0)[0] >= 7:
            print('| NOTICE: your device may support faster training with --fp16')
        trainer = Trainer(args,
                          task,
                          model,
                          criterion,
                          allreduce_communicators=None)

    #if (args.online_eval or args.target_bleu) and not args.remove_bpe:
    #    args.remove_bpe = '@@ '

    print('| training on {} GPUs'.format(args.distributed_world_size))
    print('| max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args.max_tokens,
        args.max_sentences,
    ))

    # Initialize dataloader
    max_positions = trainer.get_model().max_positions()

    # Send a dummy batch to warm the caching allocator
    dummy_batch = language_pair_dataset.get_dummy_batch_isolated(
        args.max_tokens, max_positions, 8)
    trainer.dummy_train_step(dummy_batch)

    # Train until the learning rate gets too small or model reaches target score
    max_epoch = args.max_epoch if args.max_epoch >= 0 else math.inf
    max_update = args.max_update or math.inf
    tgt_bleu = args.target_bleu or math.inf
    current_bleu = 0.0

    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_losses = [None]

    # mlperf compliance synchronization
    if args.distributed_world_size > 1:
        assert torch.distributed.is_initialized()
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

    log_end(key=constants.INIT_STOP, sync=False)
    log_start(key=constants.RUN_START, sync=True)
    # second sync after RUN_START tag is printed.
    # this ensures no rank touches data until after RUN_START tag is printed.
    barrier()

    # Load dataset splits
    load_dataset_splits(task, ['train', 'test'])
    log_event(key=constants.TRAIN_SAMPLES,
              value=len(task.dataset(args.train_subset)),
              sync=False)
    log_event(key=constants.EVAL_SAMPLES,
              value=len(task.dataset(args.gen_subset)),
              sync=False)

    ctr = 0

    start = time.time()
    epoch_itr = data.EpochBatchIterator(
        dataset=task.dataset(args.train_subset),
        dataloader_num_workers=args.dataloader_num_workers,
        dataloader_pin_memory=args.enable_dataloader_pin_memory,
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences_valid,
        max_positions=max_positions,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=8,
        seeds=shuffling_seeds,
        num_shards=args.distributed_world_size,
        shard_id=args.distributed_rank,
        epoch=epoch_itr.epoch if ctr != 0 else 0,
        bucket_growth_factor=args.bucket_growth_factor,
        seq_len_multiple=args.seq_len_multiple,
        batching_scheme=args.batching_scheme,
        batch_multiple_strategy=args.batch_multiple_strategy,
    )
    print("got epoch iterator", time.time() - start)

    # Main training loop
    while (lr >= args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update
           and current_bleu < tgt_bleu):
        first_epoch = epoch_itr.epoch + 1
        log_start(key=constants.BLOCK_START,
                  metadata={
                      'first_epoch_num': first_epoch,
                      'epoch_count': 1
                  },
                  sync=False)
        log_start(key=constants.EPOCH_START,
                  metadata={'epoch_num': first_epoch},
                  sync=False)
        gc.disable()

        # Load the latest checkpoint if one is available
        if ctr == 0:
            load_checkpoint(args, trainer, epoch_itr)

        # train for one epoch
        start = time.time()
        #exit(1)
        train(args, trainer, task, epoch_itr, shuffling_seeds)
        print("epoch time ", time.time() - start)

        start = time.time()
        log_end(key=constants.EPOCH_STOP,
                metadata={'epoch_num': first_epoch},
                sync=False)

        # Eval BLEU score
        if args.online_eval or (tgt_bleu is not math.inf):
            current_bleu = score(args, trainer, task, epoch_itr,
                                 args.gen_subset)
            log_event(key=constants.EVAL_ACCURACY,
                      value=float(current_bleu) / 100.0,
                      metadata={'epoch_num': first_epoch})

        gc.enable()

        # Only use first validation loss to update the learning rate
        #lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # Save checkpoint
        #if epoch_itr.epoch % args.save_interval == 0:
        #    save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        ctr = ctr + 1
        print("validation and scoring ", time.time() - start)
        log_end(key=constants.BLOCK_STOP,
                metadata={'first_epoch_num': first_epoch},
                sync=False)

    train_meter.stop()
    status = 'success' if current_bleu >= tgt_bleu else 'aborted'
    log_end(key=constants.RUN_STOP, metadata={'status': status})
    print('| done training in {:.1f} seconds'.format(train_meter.sum))
def validate(self,
             val_iterator,
             epoch=1,
             annotation_file=None,
             cocoapi_threads=1,
             log_interval=None):
    """Test on validation dataset."""
    log_start(key=mlperf_constants.EVAL_START, metadata={'epoch_num': epoch})
    time_ticks = [time.time()]
    time_messages = []

    # save a copy of weights to temp dir
    if self.async_executor and self.save_prefix and hvd.rank() == 0:
        save_fname = os.path.join(tempfile.gettempdir(),
                                  f'temp_ssd_mxnet_epoch{epoch}.params')
        self.net.save_parameters(save_fname)
        time_ticks.append(time.time())
        time_messages.append('save_parameters')

    results = self.infer(data_iterator=val_iterator,
                         log_interval=log_interval)
    time_ticks.append(time.time())
    time_messages.append('inference')

    # all gather results from all ranks
    if hvd.size() > 1:
        results = self.allgather(results)
        time_ticks.append(time.time())
        time_messages.append('allgather')

    # convert to numpy (cocoapi doesn't take mxnet ndarray)
    results = results.asnumpy()
    time_ticks.append(time.time())
    time_messages.append('asnumpy')

    time_ticks = np.array(time_ticks)
    elapsed_time = time_ticks[1:] - time_ticks[:-1]
    validation_log_msg = '[Validation] '
    for msg, t in zip(time_messages, elapsed_time):
        validation_log_msg += f'{msg}: {t*1000.0:.3f} [ms], '
    # TODO(ahmadki): val size is hard coded :(
    validation_log_msg += f'speed: {5000.0/(time_ticks[-1]-time_ticks[0]):.3f} [imgs/sec]'
    # TODO(ahmadki): remove time measurements
    logging.info(validation_log_msg)

    # Evaluate (score) results
    map_score = -1
    if self.async_executor:
        if hvd.rank() == 0:
            self.async_executor.submit(tag=str(epoch),
                                       fn=coco_map_score,
                                       results=results,
                                       annotation_file=annotation_file,
                                       num_threads=cocoapi_threads)

            def log_callback(future):
                log_end(key=mlperf_constants.EVAL_STOP,
                        metadata={'epoch_num': epoch})
                log_event(key=mlperf_constants.EVAL_ACCURACY,
                          value=future.result() / 100,
                          metadata={'epoch_num': epoch})

            self.async_executor.add_done_callback(tag=str(epoch),
                                                  fn=log_callback)
    else:
        if hvd.rank() == 0:
            map_score = coco_map_score(results=results,
                                       annotation_file=annotation_file,
                                       num_threads=cocoapi_threads)
        map_score = comm.bcast(map_score, root=0)
        log_end(key=mlperf_constants.EVAL_STOP,
                metadata={'epoch_num': epoch})
        log_event(key=mlperf_constants.EVAL_ACCURACY,
                  value=map_score / 100,
                  metadata={'epoch_num': epoch})

    return map_score
def train_val(self,
              train_iterator,
              start_epoch=1,
              end_epoch=80,
              val_iterator=None,
              val_interval=None,
              val_epochs=None,
              annotation_file=None,
              target_map=0.23,
              train_log_interval=100,
              val_log_interval=100,
              save_interval=None,
              cocoapi_threads=1,
              profile_start=None,
              profile_stop=None):
    local_train_batch_size = train_iterator.batch_size
    global_train_batch_size = local_train_batch_size * hvd.size()
    log_event(key=mlperf_constants.MODEL_BN_SPAN,
              value=self.bn_group * local_train_batch_size)
    log_event(key=mlperf_constants.GLOBAL_BATCH_SIZE,
              value=global_train_batch_size)
    epoch_size = train_iterator.epoch_size()
    iterations_per_epoch = math.ceil(epoch_size / global_train_batch_size)

    logging.info(f'Training from epoch: {start_epoch}')
    for epoch in range(start_epoch, end_epoch + 1):
        # Train for 1 epoch
        ret = self.train_epoch(
            data_iterator=train_iterator,
            global_train_batch_size=global_train_batch_size,
            iterations_per_epoch=iterations_per_epoch,
            epoch=epoch,
            log_interval=train_log_interval,
            profile_start=profile_start,
            profile_stop=profile_stop)
        if ret > 0:
            return None, epoch

        val_map = None
        val_epoch = epoch
        # Run (or schedule) a validation run
        if (val_interval and not epoch % val_interval) or \
                (val_epochs and epoch in val_epochs):
            self.allreduce_running()  # all reduce the running parameters
            val_map = self.validate(val_iterator=val_iterator,
                                    epoch=epoch,
                                    annotation_file=annotation_file,
                                    cocoapi_threads=cocoapi_threads,
                                    log_interval=val_log_interval)

        # Check if there are completed async validation runs
        if self.async_executor:
            val_epoch, val_map = self.get_async_results(
                waitall=(epoch == end_epoch))

        # Check if target accuracy reached
        if val_map and val_map >= target_map:
            if self.save_prefix and hvd.rank() == 0:
                save_fname = f'{self.save_prefix}_epoch{epoch}_map{val_map:.2f}.params'
                logging.info(f'Saving model weights: {save_fname}')
                if self.async_executor:
                    src_fname = os.path.join(
                        tempfile.gettempdir(),
                        f'temp_ssd_mxnet_epoch{val_epoch}.params')
                    os.rename(src_fname, save_fname)
                else:
                    self.net.save_parameters(save_fname)
            return val_map, val_epoch

        # Save model weights
        if (save_interval and not epoch % save_interval) and \
                hvd.rank() == 0 and self.save_prefix:
            save_fname = f'{self.save_prefix}_epoch{epoch}.params'
            logging.info(f'Saving model weights: {save_fname}')
            self.net.save_parameters(save_fname)

    return None, epoch
def parse_args():
    parser = argparse.ArgumentParser(description='Train SSD networks.')
    # Model arguments
    parser.add_argument('--mode', type=str.lower, choices=VALID_MODES,
                        default='train_val',
                        help='Mode to run; one of %s. currently has no effect'
                        % VALID_MODES)  # TODO(ahmadki)
    parser.add_argument('--backbone', type=str, choices=VALID_BACKBONES,
                        default='resnet34_mlperf',
                        help='Base network name which serves as feature '
                        'extraction base; one of %s' % VALID_BACKBONES)
    parser.add_argument('--bn-group', type=int, default=1,
                        choices=[1, 2, 4, 8, 16],
                        help='Group of processes to collaborate on BatchNorm ops')
    parser.add_argument('--bn-fp16', action='store_true',
                        help='Use FP16 for batchnorm gamma and beta.')
    parser.add_argument('--no-fuse-bn-add-relu', action='store_true',
                        help='Do not fuse batch norm, add and relu layers')
    parser.add_argument('--no-fuse-bn-relu', action='store_true',
                        help='Do not fuse batch norm and relu layers')
    parser.add_argument('--precision', type=str.lower, choices=VALID_PRECISIONS,
                        default='fp16',
                        help='Data format to use; one of %s' % VALID_PRECISIONS)
    parser.add_argument('--fp16-loss-scale', type=float, default=128.0,
                        help='Static FP16 loss scale')

    # Dataset arguments
    parser.add_argument('--dataset', type=str, choices=VALID_DATASETS,
                        default='coco2017',
                        help='Specify the dataset to be used; one of %s'
                        % VALID_DATASETS)
    parser.add_argument('--synthetic', action='store_true',
                        help='Use synthetic input data')
    # TODO(ahmadki):
    # 1) Remove --use-tfrecord
    # 2) make --coco-root and --tfrecord-* mutually exclusive
    # 3) Use tfrecord by default when --tfrecord-* are used, otherwise use raw images with --coco-root
    # 4) Find a way to pass validation annotation json when using --tfrecord-* (needed for cocoapi)
    parser.add_argument('--use-tfrecord', action='store_true',
                        help='Use TFRecord instead of raw images')
    parser.add_argument('--coco-root', type=str, default='/datasets/coco2017',
                        help='Directory where coco dataset (raw images) are located.')
    parser.add_argument('--tfrecord-root', type=str,
                        default='/datasets/coco2017/tfrecord/',
                        help='Directory where TFRecord and dali index files are located.')
    # Note: for MLPerf, --dataset-size needs to be given as an argument in order to comply
    # with the "don't touch data before run_start" rule
    parser.add_argument('--dataset-size', type=int, default=None,
                        help='Training dataset size, if None the size will be '
                        'automatically inferred.')
    parser.add_argument('--eval-dataset-size', type=int, default=None,
                        help='Validation dataset size, if None the size will be '
                        'automatically inferred.')
    parser.add_argument('--input-jpg-decode', type=str, default='gpu',
                        help='Way to decode jpg.')
    parser.add_argument('--hw-decoder-load', type=float, default=0.0,
                        help='Percentage of workload that will be offloaded to '
                        'the hardware decoder if available.')

    # Model data arguments
    parser.add_argument('--batch-size', type=int, default=32,
                        help='The local batch size')
    parser.add_argument('--eval-batch-size', type=int, default=None,
                        help='The evaluation local batch size. If not specified, '
                        '--batch-size will be used')
    parser.add_argument('--data-shape', type=int, default=300,
                        help='Input data shape, use 300, 512.'
                        )  # TODO(ahmadki): support other data shapes
    parser.add_argument('--data-layout', type=str.upper,
                        choices=VALID_DATA_LAYOUTS, default='NHWC',
                        help='Specify the input data layout; one of %s'
                        % VALID_DATA_LAYOUTS)
    parser.add_argument('--input-batch-multiplier', type=int, default=1,
                        help='use larger batches for input pipeline')
    parser.add_argument('--dali-workers', '-j', type=int, default=6,
                        help='Number of DALI data workers, you can use a larger '
                        'number to accelerate data loading, if your CPU and GPUs '
                        'are powerful.')

    # General arguments
    parser.add_argument('--seed', type=int, default=None,
                        help='Random seed to be fixed.')

    # Training arguments
    parser.add_argument('--pretrained-backbone', type=str, default=None,
                        help='path to a pickle file with pretrained backbone weights. '
                        'Mutually exclusive with --resume-from')
    parser.add_argument('--epochs', type=int, default=80,
                        help='Training epochs.')
    parser.add_argument('--gradient-predivide-factor', type=float, default=1,
                        help='Gradient predivide factor before allreduce')
    parser.add_argument('--horovod-num-groups', type=int, default=1,
                        help='num_groups argument to pass to Horovod DistributedTrainer')
    parser.add_argument('--resume-from', type=str, default='',
                        help='Resume from previously saved parameters if not None. '
                        'For example, you can resume from ./ssd_xxx_0123.params. '
                        'Mutually exclusive with --pretrained-backbone.')
    parser.add_argument('--start-epoch', type=int, default=1,
                        help='Starting epoch for resuming, default is 1 for new training. '
                        'You can specify it as 100, for example, to start from epoch 100.')
    parser.add_argument('--lr', type=float, default=None,
                        help='Learning rate, if None will default to '
                        '0.0025 * (batch_size*num_gpus / 32)')
    parser.add_argument('--lr-decay-factor', type=float, default=0.1,
                        help='decay rate of learning rate. default is 0.1.')
    parser.add_argument('--lr-decay-epochs', type=int, nargs='+', default=[44, 55],
                        help='epochs at which learning rate decays. default is [44, 55].')
    parser.add_argument('--lr-warmup-epochs', type=float, default=None,
                        help='how long the learning rate will be warmed up, '
                        'in fraction of epochs')
    parser.add_argument('--lr-warmup-factor', type=float, default=0,
                        help='mlperf rule parameter for controlling warmup curve')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='SGD momentum, default is 0.9')
    parser.add_argument('--weight-decay', type=float, default=0.0005,
                        help='weight decay, default is 5e-4')
    parser.add_argument('--save-interval', type=int, default=None,
                        help='Save model parameters every that many epochs.')
    parser.add_argument('--async-val', action='store_true',
                        help='Execute validation asynchronously (scoring only)')
    parser.add_argument('--val-interval', type=int, default=None,
                        help='epoch interval for validation, increasing the number '
                        'will reduce the training time if validation is slow.')
    parser.add_argument('--val-epochs', nargs='*', type=int,
                        default=[40, 50, 55, 60, 65, 70, 75, 80],
                        help='epochs at which to evaluate in addition to --val-interval')
    parser.add_argument('--target-map', '-t', type=float, default=23,
                        help='stop training early at this threshold')
    parser.add_argument('--cocoapi-threads', type=int, default=1,
                        help='Number of OpenMP threads to use with cocoAPI')
    parser.add_argument('--nms-valid-thresh', type=float, default=0.0,
                        help='filter out results whose scores are less than nms-valid-thresh.')
    parser.add_argument('--nms-overlap-thresh', type=float, default=0.5,
                        help='overlapping (IoU) threshold to suppress objects with smaller score.')
    parser.add_argument('--nms-topk', type=int, default=200,
                        help='Non-Maximal Suppression (NMS) maximum number of detections')
    parser.add_argument('--post-nms', type=int, default=200,
                        help='Only return top post_nms detection results. '
                        'Set to -1 to return all detections.')
    parser.add_argument('--bulk-last-wgrad', action='store_true',
                        help='Include the last wgrad in backward bulk.')

    # logging arguments
    parser.add_argument('--results', type=str, default=None,
                        help='Folder to save results. If not set, logs or weights '
                        'will not be written to disk.')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Logging mini-batch interval.')
    parser.add_argument('--log-level', type=str, choices=VALID_LOGGING_LEVELS,
                        default='INFO',
                        help='logging level; one of %s' % VALID_LOGGING_LEVELS)
    parser.add_argument('--log-local-ranks', type=int, nargs='+', default=[0],
                        help='use --log-level on this list of MPI ranks, '
                        'the rest will have a log level of CRITICAL')

    # profiling arguments
    parser.add_argument('--profile-no-horovod', action='store_true',
                        help='in the single gpu case, use (presumably faster) '
                        'gluon.Trainer instead of horovod.DistributedTrainer')
    parser.add_argument('--profile-start', type=int, default=None,
                        help='Iteration at which to turn on cuda profiling')
    parser.add_argument('--profile-stop', type=int, default=None,
                        help='Iteration at which to early terminate (and turn off cuda profiling)')

    # testing arguments
    parser.add_argument('--test-initialization', action='store_true',
                        help='Print network parameter statistics after initialization')
    parser.add_argument('--test-anchors', action='store_true',
                        help='Overview of normalized xywh anchors.')

    args = parser.parse_args()

    args.eval_batch_size = args.eval_batch_size or args.batch_size
    args.seed = args.seed or random.SystemRandom().randint(0, 2**31 - 1)
    log_event(key=mlperf_constants.SEED, value=args.seed)

    return args
def main(async_executor=None):
    # Setup MLPerf logger
    mllog.config()
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False
    # Start MLPerf benchmark
    log_start(key=mlperf_constants.INIT_START, uniq=False)

    # Parse args
    args = parse_args()

    ############################################################################
    # Initialize various libraries (horovod, logger, amp ...)
    ############################################################################
    # Initialize async executor
    if args.async_val:
        assert async_executor is not None, \
            'Please use ssd_main_async.py to launch with async support'
    else:
        # (Force) disable async validation
        async_executor = None
    # Initialize horovod
    hvd.init()
    # Initialize AMP
    if args.precision == 'amp':
        amp.init(layout_optimization=True)
    # Set MXNET_SAFE_ACCUMULATION=1 if necessary
    if args.precision == 'fp16':
        os.environ["MXNET_SAFE_ACCUMULATION"] = "1"
    # Results folder
    network_name = f'ssd_{args.backbone}_{args.data_layout}_{args.dataset}_{args.data_shape}'
    save_prefix = None
    if args.results:
        save_prefix = os.path.join(args.results, network_name)
    else:
        logging.info("No results folder was provided. The script will not "
                     "write logs or save weights to disk")
    # Initialize logger
    log_file = None
    if args.results:
        log_file = f'{save_prefix}_{args.mode}_{hvd.rank()}.log'
    setup_logger(level=args.log_level
                 if hvd.local_rank() in args.log_local_ranks else 'CRITICAL',
                 log_file=log_file)
    # Set seed
    args.seed = set_seed_distributed(args.seed)
    ############################################################################

    ############################################################################
    # Validate arguments and print some useful information
    ############################################################################
    logging.info(args)

    assert not (args.resume_from and args.pretrained_backbone), (
        "--resume-from and --pretrained-backbone are mutually exclusive.")
    assert args.data_shape == 300, "only data_shape=300 is supported at the moment."
    assert args.input_batch_multiplier >= 1, "input_batch_multiplier must be >= 1"
    assert not (hvd.size() == 1 and args.gradient_predivide_factor > 1), (
        "Gradient predivide factor is not supported with a single GPU")
    if args.data_layout == 'NCHW' or args.precision == 'fp32':
        assert args.bn_group == 1, \
            "Group batch norm doesn't support FP32 data format or NCHW data layout."
        if not args.no_fuse_bn_relu:
            logging.warning(
                "WARNING: fused batch norm relu is only supported with NHWC layout. "
                "A non fused version will be forced.")
            args.no_fuse_bn_relu = True
        if not args.no_fuse_bn_add_relu:
            logging.warning(
                "WARNING: fused batch norm add relu is only supported with NHWC layout. "
                "A non fused version will be forced.")
            args.no_fuse_bn_add_relu = True
    if args.profile_no_horovod and hvd.size() > 1:
        logging.warning(
            "WARNING: hvd.size() > 1, so must IGNORE requested --profile-no-horovod")
        args.profile_no_horovod = False

    logging.info(f'Seed: {args.seed}')
    logging.info(f'precision: {args.precision}')
    if args.precision == 'fp16':
        logging.info(f'loss scaling: {args.fp16_loss_scale}')
    logging.info(f'network name: {network_name}')
    logging.info(f'fuse bn relu: {not args.no_fuse_bn_relu}')
    logging.info(f'fuse bn add relu: {not args.no_fuse_bn_add_relu}')
    logging.info(f'bn group: {args.bn_group}')
    logging.info(f'bn all reduce fp16: {args.bn_fp16}')
    logging.info(f'MPI size: {hvd.size()}')
    logging.info(f'MPI global rank: {hvd.rank()}')
    logging.info(f'MPI local rank: {hvd.local_rank()}')
    logging.info(f'async validation: {args.async_val}')
    ############################################################################

    # TODO(ahmadki): load network and anchors based on args.backbone (JoC)
    # Load network
    net = ssd_300_resnet34_v1_mlperf_coco(
        pretrained_base=False,
        nms_overlap_thresh=args.nms_overlap_thresh,
        nms_topk=args.nms_topk,
        nms_valid_thresh=args.nms_valid_thresh,
        post_nms=args.post_nms,
        layout=args.data_layout,
        fuse_bn_add_relu=not args.no_fuse_bn_add_relu,
        fuse_bn_relu=not args.no_fuse_bn_relu,
        bn_fp16=args.bn_fp16,
        norm_kwargs={'bn_group': args.bn_group})

    # precomputed anchors
    anchors_np = mlperf_xywh_anchors(image_size=args.data_shape,
                                     clip=True,
                                     normalize=True)
    if args.test_anchors and hvd.rank() == 0:
        logging.info(f'Normalized anchors: {anchors_np}')

    # Training mode
    train_net = None
    train_pipeline = None
    trainer_fn = None
    lr_scheduler = None
    if args.mode in ['train', 'train_val']:
        # Training iterator
        num_cropping_iterations = 1
        if args.use_tfrecord:
            tfrecord_files = glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.tfrecord'))
            index_files = glob.glob(
                os.path.join(args.tfrecord_root, 'train.*.idx'))
            tfrecords = [(tfrecord, index) for tfrecord, index in
                         zip(tfrecord_files, index_files)]
        train_pipeline = get_training_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            anchors=anchors_np,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.batch_size * args.input_batch_multiplier,
            dataset_size=args.dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_cropping_iterations=num_cropping_iterations,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16',
            input_jpg_decode=args.input_jpg_decode,
            hw_decoder_load=args.hw_decoder_load,
            decoder_cache_size=min((100 * 1024 + hvd.size() - 1) // hvd.size(),
                                   12 * 1024)
            if args.input_jpg_decode == 'cache' else 0,
            seed=args.seed)
        log_event(key=mlperf_constants.TRAIN_SAMPLES,
                  value=train_pipeline.epoch_size)
        log_event(key=mlperf_constants.MAX_SAMPLES,
                  value=num_cropping_iterations)

        # Training network
        train_net = SSDMultiBoxLoss(net=net,
                                    local_batch_size=args.batch_size,
                                    bulk_last_wgrad=args.bulk_last_wgrad)

        # Trainer function. SSDModel expects a function that takes 1 parameter - HybridBlock
        trainer_fn = functools.partial(
            sgd_trainer,
            learning_rate=args.lr,
            weight_decay=args.weight_decay,
            momentum=args.momentum,
            precision=args.precision,
            fp16_loss_scale=args.fp16_loss_scale,
            gradient_predivide_factor=args.gradient_predivide_factor,
            num_groups=args.horovod_num_groups,
            profile_no_horovod=args.profile_no_horovod)

        # Learning rate scheduler
        lr_scheduler = MLPerfLearningRateScheduler(
            learning_rate=args.lr,
            decay_factor=args.lr_decay_factor,
            decay_epochs=args.lr_decay_epochs,
            warmup_factor=args.lr_warmup_factor,
            warmup_epochs=args.lr_warmup_epochs,
            epoch_size=train_pipeline.epoch_size,
            global_batch_size=args.batch_size * hvd.size())

    # Validation mode
    infer_net = None
    val_iterator = None
    if args.mode in ['infer', 'val', 'train_val']:
        # Validation iterator
        tfrecord_files = glob.glob(
            os.path.join(args.tfrecord_root, 'val.*.tfrecord'))
        index_files = glob.glob(os.path.join(args.tfrecord_root, 'val.*.idx'))
        tfrecords = [(tfrecord, index) for tfrecord, index in
                     zip(tfrecord_files, index_files)]
        val_pipeline = get_inference_pipeline(
            coco_root=args.coco_root if not args.use_tfrecord else None,
            tfrecords=tfrecords if args.use_tfrecord else None,
            num_shards=hvd.size(),
            shard_id=hvd.rank(),
            device_id=hvd.local_rank(),
            batch_size=args.eval_batch_size,
            dataset_size=args.eval_dataset_size,
            data_layout=args.data_layout,
            data_shape=args.data_shape,
            num_workers=args.dali_workers,
            fp16=args.precision == 'fp16')
        log_event(key=mlperf_constants.EVAL_SAMPLES,
                  value=val_pipeline.epoch_size)

        # Inference network
        infer_net = COCOInference(net=net,
                                  ltrb=False,
                                  scale_bboxes=True,
                                  score_threshold=0.0)

        # annotations file
        cocoapi_annotation_file = os.path.join(
            args.coco_root, 'annotations', 'bbox_only_instances_val2017.json')

    # Prepare model
    model = SSDModel(net=net,
                     anchors_np=anchors_np,
                     precision=args.precision,
                     fp16_loss_scale=args.fp16_loss_scale,
                     train_net=train_net,
                     trainer_fn=trainer_fn,
                     lr_scheduler=lr_scheduler,
                     metric=mx.metric.Loss(),
                     infer_net=infer_net,
                     async_executor=async_executor,
                     save_prefix=save_prefix,
                     ctx=mx.gpu(hvd.local_rank()))

    # Do training and validation runs on fake data.
    # This will set layer shapes (needed before loading a pre-trained backbone),
    # allocate tensors and cache the optimized graph.
    # Training dry run:
    logging.info('Running training dry runs')
    dummy_train_pipeline = get_training_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        anchors=anchors_np,
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.batch_size * args.input_batch_multiplier,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16',
        seed=args.seed)
    dummy_train_iterator = get_training_iterator(pipeline=dummy_train_pipeline,
                                                 batch_size=args.batch_size)
    for images, box_targets, cls_targets in dummy_train_iterator:
        model.train_step(images=images,
                         box_targets=box_targets,
                         cls_targets=cls_targets)
    # Freeing memory is disabled due to a bug in CUDA graphs
    # del dummy_train_pipeline
    # del dummy_train_iterator
    mx.ndarray.waitall()
    logging.info('Done')

    # Validation dry run:
    logging.info('Running inference dry runs')
    dummy_val_pipeline = get_inference_pipeline(
        coco_root=None,
        tfrecords=[('dummy.tfrecord', 'dummy.idx')],
        num_shards=1,
        shard_id=0,
        device_id=hvd.local_rank(),
        batch_size=args.eval_batch_size,
        dataset_size=None,
        data_layout=args.data_layout,
        data_shape=args.data_shape,
        num_workers=args.dali_workers,
        fp16=args.precision == 'fp16')
    dummy_val_iterator = get_inference_iterator(pipeline=dummy_val_pipeline)
    model.infer(data_iterator=dummy_val_iterator, log_interval=None)
    # Freeing memory is disabled due to a bug in CUDA graphs
    # del dummy_val_pipeline
    # del dummy_val_iterator
    mx.ndarray.waitall()
    logging.info('Done')

    # re-initialize the model as a precaution in case the dry runs changed the parameters
    model.init_model(force_reinit=True)
    model.zero_grads()
    mx.ndarray.waitall()

    # load saved model or pretrained backbone
    if args.resume_from:
        model.load_parameters(filename=args.resume_from)
    elif args.pretrained_backbone:
        model.load_pretrain_backbone(picklefile_name=args.pretrained_backbone)

    # broadcast parameters
    model.broadcast_params()
    mx.ndarray.waitall()

    if args.test_initialization and hvd.rank() == 0:
        model.print_params_stats(net)

    log_end(key=mlperf_constants.INIT_STOP)

    # Main MLPerf loop (training+validation)
    mpiwrapper.barrier()
    log_start(key=mlperf_constants.RUN_START)
    mpiwrapper.barrier()

    # Real data iterators
    train_iterator = None
    val_iterator = None
    if train_pipeline:
        train_iterator = get_training_iterator(pipeline=train_pipeline,
                                               batch_size=args.batch_size,
                                               synthetic=args.synthetic)
    if val_pipeline:
        val_iterator = get_inference_iterator(pipeline=val_pipeline)
    model_map, epoch = model.train_val(train_iterator=train_iterator,
                                       start_epoch=args.start_epoch,
                                       end_epoch=args.epochs,
                                       val_iterator=val_iterator,
                                       val_interval=args.val_interval,
                                       val_epochs=args.val_epochs,
                                       annotation_file=cocoapi_annotation_file,
                                       target_map=args.target_map,
                                       train_log_interval=args.log_interval,
                                       val_log_interval=args.log_interval,
                                       save_interval=args.save_interval,
                                       cocoapi_threads=args.cocoapi_threads,
                                       profile_start=args.profile_start,
                                       profile_stop=args.profile_stop)
    status = 'success' if (model_map and model_map >= args.target_map) else 'aborted'
    mx.ndarray.waitall()
    log_end(key=mlperf_constants.RUN_STOP, metadata={"status": status})

    logging.info(f'Rank {hvd.rank()} done. map={model_map} @ epoch={epoch}')
    mx.nd.waitall()
    hvd.shutdown()
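# Hypothetical launch example (the script name train.py is an assumption; the
# flags are the ones defined in parse_args above). 8 local GPUs via Horovod:
#
#   horovodrun -np 8 python train.py \
#       --precision fp16 --batch-size 32 --data-layout NHWC \
#       --use-tfrecord --tfrecord-root /datasets/coco2017/tfrecord/ \
#       --coco-root /datasets/coco2017 --target-map 23 \
#       --val-epochs 40 50 55 60 65 70 75 80 --results ./results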