image="%s/t10k-images-idx3-ubyte" % data_dir, label="%s/t10k-labels-idx1-ubyte" % data_dir, input_shape=input_shape, batch_size=batch_size, flat=False, num_parts=hvd.size(), part_index=hvd.rank() ) return train_iter, val_iter # Step 1: initialize Horovod hvd.init() # Horovod: pin context to process context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank()) # Step 2: load data train_iter, val_iter = get_mnist_iterator(hvd.rank()) # Step 3: define network def conv_net(): # placeholder for data data = mx.sym.var('data') # first conv layer conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=10) relu1 = mx.sym.Activation(data=conv1, act_type='relu') pool1 = mx.sym.Pooling(data=relu1, pool_type='max', kernel=(2, 2), stride=(2, 2)) # second conv layer
fh.setLevel(logging.INFO) fh.setFormatter(formatter) console = logging.StreamHandler() console.setLevel(logging.INFO) console.setFormatter(formatter) log.addHandler(console) log.addHandler(fh) log.info(args) if args.comm_backend == 'horovod': import horovod.mxnet as hvd hvd.init() rank = hvd.rank() size = hvd.size() local_rank = hvd.local_rank() else: rank = 0 size = 1 local_rank = 0 if args.dtype == 'float16': from mxnet.contrib import amp amp.init() model_name = args.bert_model dataset_name = args.bert_dataset only_predict = args.only_predict model_parameters = args.model_parameters pretrained_bert_parameters = args.pretrained_bert_parameters if pretrained_bert_parameters and model_parameters:
def _current_context(self):
    """Return the device context this worker should run on.

    Picks the GPU matching the Horovod local rank when GPUs are
    available; otherwise defers to MXNet's current default context.
    """
    if not has_gpu:
        return mx.current_context()
    return mx.gpu(hvd.local_rank())
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01), ),
        eval_end_callback=None, eval_batch_end_callback=None,
        initializer=Uniform(0.01), arg_params=None, aux_params=None,
        allow_missing=False, force_rebind=False, force_init=False,
        begin_epoch=0, num_epoch=None, validation_metric=None,
        monitor=None, sparse_row_id_fn=None, accuracy_target=1.0,
        eval_frequency=1, eval_offset=0, logger=None):
    """Train the module with MLPerf-style block/epoch/eval logging.

    Runs the standard bind / init-params / init-optimizer sequence, then
    iterates epochs ``begin_epoch .. num_epoch - 1`` over ``train_data``.
    Validation runs on epochs where ``epoch % eval_frequency ==
    eval_offset`` (and always on the last epoch). Training stops with
    ``mll.run_stop(status='success')`` once validation accuracy exceeds
    ``accuracy_target``; falling out of the loop ends the run with
    ``status='aborted'``.

    Environment toggles:
      * ``RESNET50_PROFILING`` -- any non-empty value enables the MXNet
        profiler on rank 0.
      * ``RESNET50_STOP_ITERATION`` -- stop after this many iterations of
        the first epoch (debugging aid).

    Parameters mirror ``mxnet.mod.BaseModule.fit``; the extras are
    ``accuracy_target`` (target validation accuracy), ``eval_frequency``
    and ``eval_offset`` (which epochs to evaluate), and ``logger``.
    """
    assert num_epoch is not None, 'please specify number of epochs'

    # Horovod supplies the worker ranks; single-process runs use 0.
    if 'horovod' in kvstore:
        rank = hvd.rank()
        local_rank = hvd.local_rank()
    else:
        rank = 0
        local_rank = 0

    # NOTE(review): os.getenv returns a string, so any non-empty value of
    # RESNET50_PROFILING (even "0") enables profiling on rank 0.
    profiler_on = os.getenv('RESNET50_PROFILING', False) and (rank == 0)
    if profiler_on:
        self.logger.info("Profiling is enabled")

    # Optional early-stop iteration for the first epoch (debug aid).
    stop_iter = int(os.getenv('RESNET50_STOP_ITERATION', '0'))
    if stop_iter > 0:
        self.logger.info(
            "Training will stop at iteration {} of the first epoch".format(
                stop_iter))

    self.bind(data_shapes=train_data.provide_data,
              label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params,
                     aux_params=aux_params, allow_missing=allow_missing,
                     force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer,
                        optimizer_params=optimizer_params)

    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, mx.metric.EvalMetric):
        eval_metric = mx.metric.create(eval_metric)

    # MLPerf "block" bookkeeping: a block is the span of epochs between
    # two evaluations; the first block may be shorter due to eval_offset.
    block_epoch_start = begin_epoch
    block_epoch_count = eval_offset + 1 - (begin_epoch % eval_frequency)
    if block_epoch_count < 0:
        block_epoch_count += eval_frequency
    mll.block_start(block_epoch_start + 1, count=block_epoch_count)

    if profiler_on:
        mx.profiler.set_config(profile_symbolic=True,
                               profile_imperative=True,
                               profile_memory=False,
                               profile_api=True,
                               filename='resnet50_profile.json',
                               aggregate_stats=True)
        mx.profiler.set_state('run')

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        mll.epoch_start(epoch + 1)
        tic = time.time()
        eval_metric.reset()
        nbatch = 0
        early_stop = False
        data_iter = iter(train_data)
        end_of_batch = False
        next_data_batch = next(data_iter)
        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()
            self.forward_backward(data_batch)
            self.update()
            if isinstance(data_batch, list):
                self.update_metric(eval_metric,
                                   [db.label for db in data_batch],
                                   pre_sliced=True)
            else:
                self.update_metric(eval_metric, data_batch.label)
            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch,
                             sparse_row_id_fn=sparse_row_id_fn)
            except StopIteration:
                end_of_batch = True
            if monitor is not None:
                monitor.toc_print()
            if end_of_batch:
                # epoch-global metric values, reported once below
                eval_name_vals = eval_metric.get_global_name_value()
            if batch_end_callback is not None:
                batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                 eval_metric=eval_metric,
                                                 locals=locals())
                for callback in _as_list(batch_end_callback):
                    callback(batch_end_params)
            nbatch += 1
            if stop_iter > 0 and nbatch >= stop_iter:
                early_stop = True
                self.logger.info(
                    "Training stopped at {} iteration. Clear RESNET50_STOP_ITERATION if it's not intended."
                    .format(stop_iter))
                break
        if early_stop:
            # RESNET50_STOP_ITERATION kicked in; abandon remaining epochs.
            break
        mll.epoch_stop(epoch + 1)

        # one epoch of training is finished
        if rank == 0:
            for name, val in eval_name_vals:
                self.logger.info('Rank[%d] Epoch[%d] Train-%s=%f',
                                 rank, epoch, name, val)
        toc = time.time()
        self.logger.info('Rank[%d] Epoch[%d] Time cost=%.3f',
                         rank, epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        #----------------------------------------
        # evaluation on validation set
        if eval_data is not None and ((epoch % eval_frequency == eval_offset)
                                      or (epoch + 1 == num_epoch)):
            mll.eval_start(epoch + 1, sync=True)
            res = self.score(eval_data,
                             [validation_metric, CorrectCount(),
                              TotalCount()],
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback,
                             epoch=epoch)
            #TODO: pull this into default
            if rank == 0:
                for name, val in res:
                    self.logger.info('Epoch[%d] Validation-%s=%f',
                                     epoch, name, val)
            # temporarily add these two metrics for debugging,
            # can be removed before submission
            res = dict(res)
            correct_count = res['correct-count']
            total_count = res['total-count']
            if 'horovod' in kvstore:
                # aggregate counts across workers before computing accuracy
                correct_count = allreduce(correct_count)
                total_count = allreduce(total_count)
            acc = correct_count / total_count
            mll.eval_stop(epoch + 1)
            mll.eval_accuracy(epoch + 1, acc)
            mll.block_stop(block_epoch_start + 1)
            if acc > accuracy_target:
                mll.run_stop(status='success')
                return
            # open the next MLPerf block if more epochs remain
            if epoch < num_epoch - 1:
                block_epoch_start = epoch + 1
                block_epoch_count = num_epoch - epoch - 1
                if block_epoch_count > eval_frequency:
                    block_epoch_count = eval_frequency
                mll.block_start(block_epoch_start + 1,
                                count=block_epoch_count)

        # end of 1 epoch, reset the data-iter for another epoch
        train_data.reset()

    if profiler_on:
        mx.profiler.set_state('stop')
        print(mx.profiler.dumps())

    # reached only when the accuracy target was never met (or early stop)
    mll.run_stop(status='aborted')
def __init__(self, config, logger=None, reporter=None):
    """Build the Mask R-CNN estimator: contexts, network, losses, metrics, data.

    Parameters
    ----------
    config : config object
        Configuration consumed via ``self._cfg`` (populated by the base class).
    logger : logging.Logger, optional
        Passed through to the base estimator.
    reporter : callable, optional
        Passed through to the base estimator.
    """
    super(MaskRCNNEstimator, self).__init__(config, logger, reporter)
    # fix seed for mxnet, numpy and python builtin random generator.
    gutils.random.seed(self._cfg.train.seed)
    if self._cfg.mask_rcnn.amp:
        # automatic mixed precision training
        amp.init()
    # training contexts: one GPU per Horovod worker, otherwise every
    # configured GPU (CPU fallback when none are listed)
    if self._cfg.horovod:
        self.ctx = [mx.gpu(hvd.local_rank())]
    else:
        ctx = [mx.gpu(int(i)) for i in self._cfg.gpus]
        self.ctx = ctx if ctx else [mx.cpu()]
    # network
    kwargs = {}
    module_list = []
    if self._cfg.mask_rcnn.use_fpn:
        module_list.append('fpn')
    if self._cfg.mask_rcnn.norm_layer is not None:
        module_list.append(self._cfg.mask_rcnn.norm_layer)
        if self._cfg.mask_rcnn.norm_layer == 'bn':
            # SyncBatchNorm needs to know how many devices to sync across
            kwargs['num_devices'] = len(self.ctx)
    # total device count: all Horovod workers, or the local context list
    self.num_gpus = hvd.size() if self._cfg.horovod else len(self.ctx)
    net_name = '_'.join(('mask_rcnn', *module_list,
                         self._cfg.mask_rcnn.backbone, self._cfg.dataset))
    if self._cfg.mask_rcnn.custom_model:
        # the custom model path always uses an FPN backbone
        self._cfg.mask_rcnn.use_fpn = True
        net_name = '_'.join(('mask_rcnn_fpn', self._cfg.mask_rcnn.backbone,
                             self._cfg.dataset))
        if self._cfg.mask_rcnn.norm_layer == 'bn':
            norm_layer = gluon.contrib.nn.SyncBatchNorm
            norm_kwargs = {'num_devices': len(self.ctx)}
            # sym_norm_layer = mx.sym.contrib.SyncBatchNorm
            sym_norm_kwargs = {'ndev': len(self.ctx)}
        elif self._cfg.mask_rcnn.norm_layer == 'gn':
            norm_layer = gluon.nn.GroupNorm
            norm_kwargs = {'groups': 8}
            # sym_norm_layer = mx.sym.GroupNorm
            sym_norm_kwargs = {'groups': 8}
        else:
            norm_layer = gluon.nn.BatchNorm
            norm_kwargs = None
            # sym_norm_layer = None
            sym_norm_kwargs = None
        if self._cfg.dataset == 'coco':
            classes = COCODetection.CLASSES
        else:
            # default to VOC
            classes = VOCDetection.CLASSES
        self.net = get_model(
            'custom_mask_rcnn_fpn',
            classes=classes,
            transfer=None,
            dataset=self._cfg.dataset,
            pretrained_base=self._cfg.train.pretrained_base,
            base_network_name=self._cfg.mask_rcnn.backbone,
            norm_layer=norm_layer,
            norm_kwargs=norm_kwargs,
            sym_norm_kwargs=sym_norm_kwargs,
            num_fpn_filters=self._cfg.mask_rcnn.num_fpn_filters,
            num_box_head_conv=self._cfg.mask_rcnn.num_box_head_conv,
            num_box_head_conv_filters=self._cfg.mask_rcnn.
            num_box_head_conv_filters,
            num_box_head_dense_filters=self._cfg.mask_rcnn.
            num_box_head_dense_filters,
            short=self._cfg.mask_rcnn.image_short,
            max_size=self._cfg.mask_rcnn.image_max_size,
            min_stage=2,
            max_stage=6,
            nms_thresh=self._cfg.mask_rcnn.nms_thresh,
            nms_topk=self._cfg.mask_rcnn.nms_topk,
            post_nms=self._cfg.mask_rcnn.post_nms,
            roi_mode=self._cfg.mask_rcnn.roi_mode,
            roi_size=self._cfg.mask_rcnn.roi_size,
            strides=self._cfg.mask_rcnn.strides,
            clip=self._cfg.mask_rcnn.clip,
            rpn_channel=self._cfg.mask_rcnn.rpn_channel,
            base_size=self._cfg.mask_rcnn.anchor_base_size,
            scales=self._cfg.mask_rcnn.anchor_scales,
            ratios=self._cfg.mask_rcnn.anchor_aspect_ratio,
            alloc_size=self._cfg.mask_rcnn.anchor_alloc_size,
            rpn_nms_thresh=self._cfg.mask_rcnn.rpn_nms_thresh,
            rpn_train_pre_nms=self._cfg.train.rpn_train_pre_nms,
            rpn_train_post_nms=self._cfg.train.rpn_train_post_nms,
            rpn_test_pre_nms=self._cfg.valid.rpn_test_pre_nms,
            rpn_test_post_nms=self._cfg.valid.rpn_test_post_nms,
            rpn_min_size=self._cfg.train.rpn_min_size,
            per_device_batch_size=self._cfg.train.batch_size // self.num_gpus,
            num_sample=self._cfg.train.rcnn_num_samples,
            pos_iou_thresh=self._cfg.train.rcnn_pos_iou_thresh,
            pos_ratio=self._cfg.train.rcnn_pos_ratio,
            max_num_gt=self._cfg.mask_rcnn.max_num_gt,
            target_roi_scale=self._cfg.mask_rcnn.target_roi_scale,
            num_fcn_convs=self._cfg.mask_rcnn.num_mask_head_convs)
    else:
        self.net = get_model(
            net_name, pretrained_base=True,
            per_device_batch_size=self._cfg.train.batch_size // self.num_gpus,
            **kwargs)
    self._cfg.save_prefix += net_name
    if self._cfg.resume.strip():
        # resume from a saved checkpoint
        self.net.load_parameters(self._cfg.resume.strip())
    else:
        # initialize only the parameters that do not have data yet
        # (pretrained base weights are left untouched)
        for param in self.net.collect_params().values():
            if param._data is not None:
                continue
            param.initialize()
    self.net.collect_params().reset_ctx(self.ctx)
    if self._cfg.mask_rcnn.amp:
        # Cast both weights and gradients to 'float16'
        self.net.cast('float16')
        # These layers don't support type 'float16'; keep them in float32
        self.net.collect_params('.*batchnorm.*').setattr(
            'dtype', 'float32')
        self.net.collect_params(
            '.*normalizedperclassboxcenterencoder.*').setattr(
                'dtype', 'float32')
    # set up logger (root logger, INFO level, plus a per-run file handler)
    logging.basicConfig()
    self._logger = logging.getLogger()
    self._logger.setLevel(logging.INFO)
    log_file_path = self._cfg.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    self._logger.addHandler(fh)
    if MPI is None and self._cfg.horovod:
        # without mpi4py, cross-worker validation aggregation is unavailable
        self._logger.warning(
            'mpi4py is not installed, validation result may be incorrect.')
    self._logger.info(self._cfg)
    # losses for the RPN and RCNN heads
    self.rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    self.rpn_box_loss = mx.gluon.loss.HuberLoss(
        rho=self._cfg.train.rpn_smoothl1_rho)  # == smoothl1
    self.rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    self.rcnn_box_loss = mx.gluon.loss.HuberLoss(
        rho=self._cfg.train.rcnn_smoothl1_rho)  # == smoothl1
    self.rcnn_mask_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    # running-loss trackers, one per loss term
    self.metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
        mx.metric.Loss('RCNN_Mask')
    ]
    # accuracy / regression-quality metrics for the two stages
    self.rpn_acc_metric = RPNAccMetric()
    self.rpn_bbox_metric = RPNL1LossMetric()
    self.rcnn_acc_metric = RCNNAccMetric()
    self.rcnn_bbox_metric = RCNNL1LossMetric()
    self.rcnn_mask_metric = MaskAccMetric()
    self.rcnn_fgmask_metric = MaskFGAccMetric()
    self.metrics2 = [
        self.rpn_acc_metric, self.rpn_bbox_metric, self.rcnn_acc_metric,
        self.rcnn_bbox_metric, self.rcnn_mask_metric,
        self.rcnn_fgmask_metric
    ]
    self.async_eval_processes = []
    self.best_map = [0]
    self.epoch = 0
    # training data
    self.train_dataset, self.val_dataset, self.eval_metric = _get_dataset(
        self._cfg.dataset, self._cfg)
    # with Horovod each worker handles its per-device share of the batch
    self.batch_size = self._cfg.train.batch_size // self.num_gpus \
        if self._cfg.horovod else self._cfg.train.batch_size
    self._train_data, self._val_data = _get_dataloader(
        self.net, self.train_dataset, self.val_dataset,
        MaskRCNNDefaultTrainTransform, MaskRCNNDefaultValTransform,
        self.batch_size, len(self.ctx), self._cfg)
data_iter.reset() metric = mx.metric.Accuracy() for _, batch in enumerate(data_iter): data = batch.data[0].as_in_context(context) label = batch.label[0].as_in_context(context) output = model(data.astype(args.dtype, copy=False)) metric.update([label], [output]) return metric.get() # Initialize Horovod hvd.init() # Horovod: pin context to local rank context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu( hvd.local_rank()) num_workers = hvd.size() # Load training and validation data train_data, val_data = get_mnist_iterator(hvd.rank()) # Build model model = conv_nets() model.cast(args.dtype) model.hybridize() # Create optimizer optimizer_params = { 'momentum': args.momentum, 'learning_rate': args.lr * hvd.size()