def get_mnist_iterator(rank):
    """Download and extract MNIST into a per-rank directory, then build
    Horovod-sharded train/validation iterators.

    Parameters
    ----------
    rank : int
        Worker rank; used only to give each worker its own data directory.

    Returns
    -------
    (mx.io.MNISTIter, mx.io.MNISTIter)
        Training iterator (shuffled) and validation iterator, both sharded
        by ``hvd.size()`` / ``hvd.rank()``.
    """
    data_dir = "data-%d" % rank
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    archive = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                       dirname=data_dir)
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(data_dir)

    # Settings shared by the train and validation iterators.
    shared_kwargs = dict(
        input_shape=(1, 28, 28),
        batch_size=args.batch_size,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank(),
    )
    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        shuffle=True,
        **shared_kwargs,
    )
    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        **shared_kwargs,
    )
    return train_iter, val_iter
def _get_dataloader(net, train_dataset, val_dataset, train_transform,
                    val_transform, batch_size, num_shards_per_process, args):
    """Get dataloader."""
    # Shard the bucket sampler across horovod workers when enabled;
    # otherwise a single part covers the whole dataset.
    if args.horovod:
        num_parts, part_index = hvd.size(), hvd.rank()
    else:
        num_parts, part_index = 1, 0
    train_sampler = SplitSortedBucketSampler(
        train_dataset.get_im_aspect_ratio(), batch_size,
        num_parts=num_parts, part_index=part_index, shuffle=True)
    transformed_train = train_dataset.transform(
        train_transform(net.short, net.max_size, net, ashape=net.ashape,
                        multi_stage=args.mask_rcnn.use_fpn))
    train_loader = gluon.data.DataLoader(
        transformed_train,
        batch_sampler=train_sampler,
        batchify_fn=batchify.MaskRCNNTrainBatchify(net, num_shards_per_process),
        num_workers=args.num_workers)
    if isinstance(net.short, (tuple, list)):
        short = net.short[-1]
    else:
        short = net.short
    # validation use 1 sample per device
    val_loader = gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)),
        num_shards_per_process, False,
        batchify_fn=batchify.Tuple(*[batchify.Append() for _ in range(2)]),
        last_batch='keep', num_workers=args.num_workers)
    return train_loader, val_loader
def _resume_fit(self, train_data, val_data): if max(self._cfg.train.start_epoch, self.epoch) >= self._cfg.train.epochs: return {'time', self._time_elapsed} if not self.classes or not self.num_class: raise ValueError('Unable to determine classes of dataset') # training dataset train_dataset = train_data.to_mxnet() val_dataset = val_data.to_mxnet() # training dataloader self.batch_size = self._cfg.train.batch_size // hvd.size() if self._cfg.horovod else self._cfg.train.batch_size train_loader, val_loader, train_eval_loader = _get_dataloader( self.async_net, train_dataset, val_dataset, self._cfg.yolo3.data_shape, self.batch_size, self._cfg.num_workers, self._cfg) if self._cfg.train.no_wd: for _, v in self.net.collect_params('.*beta|.*gamma|.*bias').items(): v.wd_mult = 0.0 if self._cfg.train.label_smooth: self.net._target_generator._label_smooth = True return self._train_loop(train_loader, val_loader, train_eval_loader)
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    size = hvd.size()
    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")
    ctx = self._current_context()
    # Second dimension depends on the rank, so shapes disagree across workers.
    tensor = mx.nd.ones(shape=(17, hvd.rank() + 1), ctx=ctx)
    try:
        result = hvd.broadcast(tensor, 0)
        result.wait_to_read()
    except (MXNetError, RuntimeError):
        return
    assert False, 'hvd.broadcast did not throw error'
def _get_dali_dataset(dataset_name, devices, args):
    """Build DALI-backed COCO training pipeline(s) plus the validation
    dataset and metric.

    Parameters
    ----------
    dataset_name : str
        Only ``"coco"`` is supported.
    devices : list
        Local device ids; one DALI pipeline per device in non-horovod mode.
    args : namespace
        Needs ``dataset_root``, ``horovod``, ``logdir``, ``save_prefix`` and
        ``ssd.data_shape``.

    Returns
    -------
    (list, dataset or None, metric or None)
        Training pipelines, validation dataset and metric. Under horovod the
        validation pieces are ``None`` on every rank except 0.

    Raises
    ------
    NotImplementedError
        For any dataset other than COCO.
    """
    if dataset_name.lower() == "coco":
        # training
        expanded_file_root = os.path.expanduser(args.dataset_root)
        coco_root = os.path.join(expanded_file_root, 'coco', 'train2017')
        coco_annotations = os.path.join(expanded_file_root, 'coco',
                                        'annotations', 'instances_train2017.json')
        if args.horovod:
            # One pipeline per process, sharded by horovod rank.
            train_dataset = [
                gdata.COCODetectionDALI(num_shards=hvd.size(), shard_id=hvd.rank(),
                                        file_root=coco_root,
                                        annotations_file=coco_annotations,
                                        device_id=hvd.local_rank())
            ]
        else:
            # One pipeline per local device, sharded by device index.
            train_dataset = [gdata.COCODetectionDALI(num_shards=len(devices), shard_id=i,
                                                     file_root=coco_root,
                                                     annotations_file=coco_annotations,
                                                     device_id=i)
                             for i, _ in enumerate(devices)]
        # validation (rank 0 only under horovod)
        if not args.horovod or hvd.rank() == 0:
            # Fix: use the expanduser-expanded root and a proper join here too.
            # The original passed os.path.join(args.dataset_root + '/coco') —
            # string concatenation inside join, without ~ expansion —
            # inconsistent with the training paths above.
            val_dataset = gdata.COCODetection(
                root=os.path.join(expanded_file_root, 'coco'),
                splits='instances_val2017', skip_empty=False)
            val_metric = COCODetectionMetric(
                val_dataset,
                os.path.join(args.logdir, args.save_prefix + '_eval'),
                cleanup=True,
                data_shape=(args.ssd.data_shape, args.ssd.data_shape))
        else:
            val_dataset = None
            val_metric = None
    else:
        raise NotImplementedError(
            'Dataset: {} not implemented with DALI.'.format(dataset_name))
    return train_dataset, val_dataset, val_metric
def test_horovod_alltoall(self):
    """Test that the alltoall correctly distributes 1D, 2D, and 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()
    # This test does not apply if using gloo controller
    if hvd.gloo_enabled():
        self.skipTest(
            "Alltoall currently does not support Gloo controller.")
    # This test does not apply if NCCL version < 2.7.0
    if hvd.nccl_built() and hvd.nccl_built() < 2700:
        self.skipTest(
            "NCCL-based Alltoall requires NCCL version >= 2.7.0.")
    dtypes = ['int32', 'int64', 'float32', 'float64']
    dims = [1, 2, 3]
    ctx = self._current_context()
    for dtype, dim in itertools.product(dtypes, dims):
        # Each worker sends (rank + 1) copies of value i to worker i, so
        # after the exchange worker r holds only the value r.
        vals = []
        for i in range(size):
            vals += [i] * (rank + 1)
        tensor = mx.ndarray.array(vals, dtype=dtype, ctx=ctx)
        # Grow to `dim` dimensions by repeatedly stacking the tensor with
        # itself along a new axis (each step doubles the element count).
        for _ in range(dim - 1):
            tensor = mx.ndarray.expand_dims(tensor, axis=1)
            tensor = mx.ndarray.concat(tensor, tensor, dim=1)
        # Row counts per destination: (rank + 1) rows to every peer.
        splits = mx.ndarray.array([rank + 1] * size, dtype='int32', ctx=ctx)
        collected = hvd.alltoall(tensor, splits)
        # Every received element must equal this worker's rank.
        assert collected.min() == rank, 'hvd.alltoall produces incorrect collected tensor'
        assert collected.max() == rank, 'hvd.alltoall produces incorrect collected tensor'
        # Worker r receives (s + 1) rows from each sender s... summed over all
        # senders that is size*(size+1)/2 rows, times 2**(dim-1) elems per row.
        assert collected.size == size * (size + 1) // 2 * 2**(dim - 1), \
            'hvd.alltoall collected wrong number of values'
def test_horovod_grouped_allreduce_inplace(self):
    """Test that the in-place grouped allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    dtypes = self.filter_supported_types(
        ['int32', 'int64', 'float32', 'float64'])
    dims = [1, 2, 3]
    ctx = self._current_context()
    count = 1  # unique op name per iteration
    # NOTE(review): shapes[1] is (17), i.e. the int 17, not a 1-tuple; MXNet
    # accepts an int shape, so behavior is the same as (17,).
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    for dtype, dim in itertools.product(dtypes, dims):
        # Same seed on every worker so all ranks generate identical tensors.
        mx.random.seed(1234, ctx=ctx)
        tensors = [
            mx.nd.random.uniform(-100, 100, shape=shapes[dim], ctx=ctx)
            for _ in range(5)
        ]
        tensors = [tensor.astype(dtype) for tensor in tensors]
        # Expected result: tensors are identical across ranks, so the sum
        # over `size` ranks equals tensor * size.
        multiplied = [tensor * size for tensor in tensors]
        hvd.grouped_allreduce_(tensors, average=False, name=str(count))
        count += 1
        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in ['int32', 'int64']:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            # Too many ranks for a meaningful float comparison: stop testing.
            break
        assert all([almost_equal(t1.asnumpy(), t2.asnumpy(), atol=threshold)
                    for t1, t2 in zip(tensors, multiplied)]), \
            f'hvd.grouped_allreduce_ produces incorrect results: {hvd.rank()} {count} {dtype} {dim}'
def test_horovod_allreduce(self):
    """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    dtypes = self.filter_supported_types(['int32', 'int64', 'float32', 'float64'])
    dims = [1, 2, 3]
    ctx = self._current_context()
    count = 0  # unique op name per allreduce call
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    for dtype, dim in itertools.product(dtypes, dims):
        # MXNet uses gpu_id as part of the seed, so to get identical seeds
        # we must set a context.
        mx.random.seed(1234, ctx=ctx)
        tensor = mx.nd.random.uniform(-100, 100, shape=shapes[dim], ctx=ctx)
        tensor = tensor.astype(dtype)
        summed = hvd.allreduce(tensor, average=False, name=str(count))
        # Same seed on every rank -> the sum over ranks is tensor * size.
        multiplied = tensor * size
        max_difference = mx.nd.max(mx.nd.subtract(summed, multiplied))
        count += 1
        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in ['int32', 'int64']:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break  # too many ranks for a meaningful comparison
        if max_difference > threshold:
            print("allreduce", count, dtype, dim, max_difference, threshold)
            print("tensor", hvd.rank(), tensor)
            print("summed", hvd.rank(), summed)
            print("multiplied", hvd.rank(), multiplied)
        # Fix: the source's assert message was a backslash-continued string
        # cut off mid-literal ("'hvd.allreduce produces \"); restored as a
        # complete single-line message.
        assert max_difference <= threshold, \
            'hvd.allreduce produces incorrect results'
def init_comm(backend):
    """Init communication backend"""
    # backend specific implementation
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd
        except ImportError:
            logging.info('horovod must be installed.')
            exit()
        hvd.init()
        store = None
        num_workers, rank, local_rank = hvd.size(), hvd.rank(), hvd.local_rank()
        ctxs = [mx.gpu(local_rank)]
    elif backend == 'byteps':
        try:
            import byteps.mxnet as bps
        except ImportError:
            logging.info('BytePS must be installed.')
            exit()
        bps.init()
        store = None
        num_workers, rank, local_rank = bps.size(), bps.rank(), bps.local_rank()
        ctxs = [mx.gpu(local_rank)]
    else:
        # kvstore
        store = mx.kv.create(backend)
        num_workers, rank, local_rank = store.num_workers, store.rank, 0
        if args.gpus is None or args.gpus == '':
            ctxs = [mx.cpu()]
        else:
            ctxs = [mx.gpu(int(x)) for x in args.gpus.split(',')]
    # same computation for all backends, hoisted out of the branches
    is_master_node = rank == local_rank
    return store, num_workers, rank, local_rank, is_master_node, ctxs
def test_horovod_allgather_error(self):
    """Test that the allgather returns an error if any dimension besides
    the first is different among the tensors being gathered."""
    hvd.init()
    size = hvd.size()
    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")
    ctx = self._current_context()
    # Middle dimension depends on the rank, so shapes disagree across workers.
    shape = [17, 10 * (hvd.rank() + 1), 17]
    tensor = mx.ndarray.ones(shape=shape, ctx=ctx)
    try:
        hvd.allgather(tensor)
    except (MXNetError, RuntimeError):
        return
    assert False, 'hvd.allgather did not throw error'
def test_horovod_alltoall_equal_split_length_error(self):
    """Test that the alltoall with default splitting returns an error if the
    first dimension of tensor is not a multiple of the number of workers."""
    hvd.init()
    size = hvd.size()
    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")
    # This test does not apply if NCCL version < 2.7.0
    if hvd.nccl_built() and hvd.nccl_built() < 2700:
        self.skipTest("NCCL-based Alltoall requires NCCL version >= 2.7.0.")
    ctx = self._current_context()
    # size + 1 rows cannot be split evenly across `size` workers.
    tensor = mx.ndarray.empty([size + 1], ctx=ctx)
    try:
        hvd.alltoall(tensor)
    except (MXNetError, RuntimeError):
        return
    assert False, 'hvd.alltoall did not throw error'
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()
    ctx = self._current_context()
    for dtype, dim in itertools.product(['int32', 'int64', 'float32', 'float64'],
                                        [1, 2, 3]):
        # Each worker contributes a constant tensor filled with its own rank.
        tensor = mx.ndarray.ones(shape=[17] * dim, dtype=dtype, ctx=ctx) * rank
        gathered = hvd.allgather(tensor)
        # Gathering concatenates along the first axis: one 17-slab per worker.
        assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)
        for worker in range(size):
            piece = gathered[worker * 17:(worker + 1) * 17]
            assert list(piece.shape) == [17] * dim, \
                'hvd.allgather produces incorrect gathered shape'
            assert piece.min() == worker, 'hvd.allgather produces incorrect gathered tensor'
            assert piece.max() == worker, 'hvd.allgather produces incorrect gathered tensor'
def test_horovod_allreduce_average(self):
    """Test that the allreduce with average=True correctly averages 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    dtypes = ['int32', 'int64', 'float32', 'float64']
    dims = [1, 2, 3]
    ctx = self._current_context()
    count = 0  # unique op name per allreduce call
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    for dtype, dim in itertools.product(dtypes, dims):
        # Same seed on every rank so all workers hold identical tensors.
        mx.random.seed(1234, ctx=ctx)
        tensor = mx.nd.random.uniform(-100, 100, shape=shapes[dim], ctx=ctx)
        tensor = tensor.astype(dtype)
        averaged = hvd.allreduce(tensor, average=True, name=str(count))
        # Apply the same scale-up/scale-down to the reference so integer
        # truncation matches what the averaging allreduce produces.
        tensor *= size
        tensor /= size
        max_difference = mx.nd.max(mx.nd.subtract(averaged, tensor))
        count += 1
        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in ['int32', 'int64']:
            threshold = 1  # integer averaging can differ by one unit
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break  # too many ranks for a meaningful comparison
        if max_difference > threshold:
            print("average", count, dtype, dim, max_difference, threshold)
            print("tensor", hvd.rank(), tensor)
            print("averaged", hvd.rank(), averaged)
        # Fix: the source's assert message was a backslash-continued string
        # cut off mid-literal ("'hvd.allreduce produces \"); restored as a
        # complete single-line message.
        assert max_difference <= threshold, \
            'hvd.allreduce produces incorrect results for average'
def test_horovod_broadcast_grad(self):
    """Test the correctness of the broadcast gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()
    # This test does not apply if there is only one worker.
    if size == 1:
        return
    dtypes = ['int32', 'int64', 'float32', 'float64']
    dims = [1, 2, 3]
    ctx = self._current_context()
    count = 0  # number of (dtype, dim) combinations populated
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    root_rank = 1
    tensor_dict = {}
    # NOTE(review): broadcast_dict is never populated or read below —
    # it appears to be vestigial.
    broadcast_dict = {}
    root_dict = {}
    for dtype, dim, in itertools.product(dtypes, dims):
        # Per-rank tensor (filled with this worker's rank) and the expected
        # post-broadcast value (filled with root_rank).
        tensor_dict[count] = mx.nd.ones(shapes[dim], ctx=ctx) * rank
        root_dict[count] = mx.nd.ones(shapes[dim], ctx=ctx) * root_rank
        tensor_dict[count] = tensor_dict[count].astype(dtype)
        root_dict[count] = root_dict[count].astype(dtype)
        # Only do broadcasting using and on broadcast_tensor
        count += 1
    # Broadcast every entry from root_rank; afterwards each tensor must
    # equal its root counterpart on all ranks.
    hvd.broadcast_parameters(tensor_dict, root_rank=root_rank)
    for i in range(count):
        if not same(tensor_dict[i].asnumpy(), root_dict[i].asnumpy()):
            # dtype/dim here carry the values from the *last* loop iteration
            # above — printed only as debugging context.
            print("broadcast", count, dtype, dim)
            print("broadcast_tensor", hvd.rank(), tensor_dict[i])
            print("root_tensor", hvd.rank(), root_dict[i])
            print("comparison", hvd.rank(), tensor_dict[i] == root_dict[i])
        assert same(tensor_dict[i].asnumpy(), root_dict[i].asnumpy()), \
            'hvd.broadcast produces incorrect broadcasted tensor'
def test_horovod_alltoall_splits_error(self):
    """Test that the alltoall returns an error if the sum of the splits
    entries exceeds the first dimension of the input tensor."""
    hvd.init()
    size = hvd.size()
    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")
    # This test does not apply if NCCL version < 2.7.0
    if hvd.nccl_built() and hvd.nccl_built() < 2700:
        self.skipTest("NCCL-based Alltoall requires NCCL version >= 2.7.0.")
    ctx = self._current_context()
    # Splits sum to `size` but the tensor only has size - 1 rows.
    tensor = mx.ndarray.empty([size - 1], ctx=ctx)
    splits = mx.ndarray.ones([size], dtype='int32', ctx=ctx)
    try:
        hvd.alltoall(tensor, splits)
    except (MXNetError, RuntimeError):
        return
    assert False, 'hvd.alltoall did not throw error'
def get_dataloader(net, train_dataset, val_dataset, train_transform,
                   val_transform, batch_size, num_shards, args):
    """Get dataloader."""
    if hasattr(train_dataset, 'get_im_aspect_ratio'):
        im_aspect_ratio = train_dataset.get_im_aspect_ratio()
    else:
        im_aspect_ratio = [1.] * len(train_dataset)
    # Shard the bucket sampler across horovod workers when enabled.
    if args.horovod:
        num_parts, part_index = hvd.size(), hvd.rank()
    else:
        num_parts, part_index = 1, 0
    train_sampler = gcv.nn.sampler.SplitSortedBucketSampler(
        im_aspect_ratio, batch_size,
        num_parts=num_parts, part_index=part_index, shuffle=True)
    transformed_train = train_dataset.transform(
        train_transform(net.short, net.max_size, net, ashape=net.ashape,
                        multi_stage=args.use_fpn))
    train_loader = mx.gluon.data.DataLoader(
        transformed_train,
        batch_sampler=train_sampler,
        batchify_fn=FasterRCNNTrainBatchify(net, num_shards),
        num_workers=args.num_workers)
    if isinstance(net.short, (tuple, list)):
        short = net.short[-1]
    else:
        short = net.short
    # validation use 1 sample per device
    val_loader = mx.gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)),
        num_shards, False,
        batchify_fn=Tuple(*[Append() for _ in range(3)]),
        last_batch='keep', num_workers=args.num_workers)
    return train_loader, val_loader
def test_allreduce(use_horovod, dtype):
    """Exercise kvstore-based allreduce either through a distributed kvstore
    or through Horovod, then delegate to test_hvd_kv.

    Parameters
    ----------
    use_horovod : bool
        When True, use a local "device" kvstore plus a Horovod-wrapped
        optimizer; otherwise use a distributed kvstore.
    dtype : str
        Data type forwarded to test_hvd_kv.
    """
    if use_horovod is False:
        # Fix: the original read `kvstore_type` before any assignment when
        # DMLC_ROLE != "worker" ("... else kvstore_type"), which raises
        # UnboundLocalError. Fall back to the plain "device" kvstore instead.
        # NOTE(review): "device" is a best-guess fallback — confirm intent.
        if os.environ.get("DMLC_ROLE") == "worker":
            kvstore_type = "dist_sync_device"
        else:
            kvstore_type = "device"
        kv = mx.kvstore.create(kvstore_type)
        rank = kv.rank
        num_workers = kv.num_workers
    else:
        kvstore_type = "device"
        kv = mx.kvstore.create(kvstore_type)
        hvd.init()
        rank = hvd.rank()
        num_workers = hvd.size()
    print('use horovod: {}, rank {}/{}, kv type: {}, usetree: {}'.format(
        use_horovod, rank, num_workers, kvstore_type,
        os.environ.get("MXNET_KVSTORE_USETREE")))
    # Average gradients over 8 devices per worker across all workers.
    rescale_grad = 1.0 / (8 * num_workers)
    if use_horovod:
        # Horovod averages across workers itself, so undo the worker factor.
        rescale_grad = rescale_grad * num_workers
    optimizer_params = dict(
        momentum=0,  # pOpt.optimizer.momentum,
        wd=0,  # pOpt.optimizer.wd,
        learning_rate=0.1,
        rescale_grad=rescale_grad,
    )
    optimizer = mx.optimizer.create("sgd", **optimizer_params)
    if use_horovod:
        # Horovod: wrap optimizer with DistributedOptimizer
        optimizer = hvd.DistributedOptimizer(optimizer)
    print("opt rescale:{}".format(optimizer.rescale_grad))
    kv.set_optimizer(optimizer)
    test_hvd_kv(rank, num_workers, kv, dtype)
def test_horovod_allgather_type_error(self):
    """Test that the allgather returns an error if the types being gathered
    differ among the processes"""
    hvd.init()
    size = hvd.size()
    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")
    ctx = self._current_context()
    # Even ranks contribute int32, odd ranks float32 -> dtype mismatch.
    dtype = "int32" if hvd.rank() % 2 == 0 else "float32"
    tensor = mx.ndarray.ones(shape=[17] * 3, dtype=dtype, ctx=ctx)
    try:
        hvd.allgather(tensor)
    except (MXNetError, RuntimeError):
        return
    assert False, 'hvd.allgather did not throw error'
def test_horovod_allreduce(self):
    """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    dtypes = self.filter_supported_types(
        ['int32', 'int64', 'float32', 'float64'])
    dims = [1, 2, 3]
    ctx = self._current_context()
    count = 0  # unique op name per allreduce call
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    for dtype, dim in itertools.product(dtypes, dims):
        # MXNet uses gpu_id as part of the seed, so to get identical seeds
        # we must set a context.
        mx.random.seed(1234, ctx=ctx)
        tensor = mx.nd.random.uniform(-100, 100, shape=shapes[dim], ctx=ctx)
        tensor = tensor.astype(dtype)
        summed = hvd.allreduce(tensor, average=False, name=str(count))
        # Same seed on every rank -> the sum over ranks is tensor * size.
        multiplied = tensor * size
        count += 1
        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in ['int32', 'int64']:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            # Too many ranks for a meaningful float comparison: stop testing.
            break
        assert almost_equal(summed.asnumpy(), multiplied.asnumpy(), atol=threshold), \
            f'hvd.allreduce produces incorrect results: {hvd.rank()} {count} {dtype} {dim}'
def _resume_fit(self, train_data, val_data, time_limit=math.inf): tic = time.time() if max(self._cfg.train.start_epoch, self.epoch) >= self._cfg.train.epochs: return {'time', self._time_elapsed} if not self.classes or not self.num_class: raise ValueError('Unable to determine classes of dataset') # dataset devices = [int(i) for i in self._cfg.gpus] train_dataset = train_data.to_mxnet() val_dataset = val_data.to_mxnet() # dataloader if self._cfg.train.dali: if not dali_found: raise SystemExit( "DALI not found, please check if you installed it correctly." ) train_loader, val_loader = _get_dali_dataloader( self.async_net, train_dataset, val_dataset, self._cfg.ssd.data_shape, self._cfg.train.batch_size, self._cfg.num_workers, devices, self.ctx[0], self._cfg.horovod) else: self.batch_size = self._cfg.train.batch_size // hvd.size() \ if self._cfg.horovod else self._cfg.train.batch_size train_loader, val_loader, train_eval_loader = _get_dataloader( self.async_net, train_dataset, val_dataset, self._cfg.ssd.data_shape, self.batch_size, self._cfg.num_workers) self._time_elapsed += time.time() - tic return self._train_loop(train_loader, val_loader, train_eval_loader, time_limit=time_limit)
def __init__(self, symbol, fc7_model, memory_bank, memory_optimizer,
             logger=logging, ):
    """Set up per-rank training state for a memory-bank based model.

    Parameters
    ----------
    symbol : mx.symbol.Symbol
        Backbone network symbol, bound below as an mx.module.Module.
    fc7_model :
        Classifier-head model stored for later use.
    memory_bank :
        Memory bank object stored for later use.
    memory_optimizer :
        Optimizer for the memory bank, stored for later use.
    logger : logging module or Logger, default logging
        Logger passed through to the backbone module.
    """
    # Horovod topology for this process.
    self.size = hvd.size()
    self.rank = hvd.rank()
    self.local_rank = hvd.local_rank()
    self.gpu = mx.gpu(self.local_rank)
    self.cpu = mx.cpu()  # `device_id` is not needed for CPU.
    # Cache for NDArrays, keyed by the callers that populate it.
    self.nd_cache = {}
    # Sizes taken from the module-level `config` object.
    self.embedding_size = config.embedding_size
    self.batch_size = config.batch_size
    self.num_update = 0  # global update counter
    # Lightweight record type handed to batch-end callbacks.
    self.batch_end_param = namedtuple('batch_end_param',
                                      ['loss', 'num_epoch', 'num_update'])
    self.fc7_model = fc7_model
    self.symbol = symbol
    self.logger = logger
    # Backbone executor bound to this rank's GPU.
    self.backbone_module = mx.module.Module(self.symbol, ['data'],
                                            ['softmax_label'],
                                            logger=self.logger,
                                            context=self.gpu)
    self.memory_bank = memory_bank
    self.memory_optimizer = memory_optimizer
    # Lazily-initialized training state.
    self.memory_lr = None
    self.loss_cache = None
    self.grad_cache = None
def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different ranks try to
    perform reduction on CPU and GPU."""
    hvd.init()
    size = hvd.size()
    # This test does not apply if there is only one worker.
    if size == 1:
        return
    # Even ranks place the tensor on GPU, odd ranks on CPU -> device mismatch.
    if hvd.rank() % 2 == 0:
        ctx = mx.gpu(hvd.rank())
    else:
        ctx = mx.cpu(hvd.rank())
    tensor = mx.nd.ones(shape=(17, 17, 17), ctx=ctx)
    try:
        result = hvd.allreduce(tensor)
        result.wait_to_read()
    except (MXNetError, RuntimeError):
        return
    assert False, 'hvd.allreduce did not throw cpu-gpu error'
def train(ctx):
    """Distributed ImageNet training loop (Horovod, gradient-compression trainers).

    Relies on module-level state: ``opt`` (parsed args), ``net``, ``optimizer``,
    ``optimizer_params``, ``distillation``/``teacher``, ``train_data``/
    ``val_data``, ``batch_fn``, ``batch_size``, ``classes``, ``train_metric``,
    ``test``, ``logger``, ``save_frequency``/``save_dir``/``model_name`` and
    the *Trainer classes.  ``ctx`` is a single mx.Context or a list of them.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if opt.resume_params == '':
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)
    if opt.no_wd:
        # Disable weight decay on batchnorm beta/gamma and bias parameters.
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    # Make all workers start from rank 0's parameters.
    hvd.broadcast_parameters(net.collect_params(), root_rank=0)
    # trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    # trainer = hvd.DistributedTrainer(
    #     net.collect_params(),
    #     optimizer,
    #     optimizer_params)
    # Select the (possibly gradient-compressing) trainer implementation.
    if opt.trainer == 'sgd':
        trainer = SGDTrainer(
            net.collect_params(), optimizer, optimizer_params)
    elif opt.trainer == 'efsgd':
        trainer = EFSGDTrainerV1(
            net.collect_params(), 'EFSGDV1', optimizer_params,
            input_sparse_ratio=1./opt.input_sparse_1,
            output_sparse_ratio=1./opt.output_sparse_1,
            layer_sparse_ratio=1./opt.layer_sparse_1)
    elif opt.trainer == 'qsparselocalsgd':
        trainer = QSparseLocalSGDTrainerV1(
            net.collect_params(), optimizer, optimizer_params,
            input_sparse_ratio=1./opt.input_sparse_1,
            output_sparse_ratio=1./opt.output_sparse_1,
            layer_sparse_ratio=1./opt.layer_sparse_1,
            local_sgd_interval=opt.local_sgd_interval)
    elif opt.trainer == 'ersgd':
        trainer = ERSGDTrainerV2(
            net.collect_params(), optimizer, optimizer_params,
            input_sparse_ratio=1./opt.input_sparse_1,
            output_sparse_ratio=1./opt.output_sparse_1,
            layer_sparse_ratio=1./opt.layer_sparse_1)
    elif opt.trainer == 'partiallocalsgd':
        trainer = PartialLocalSGDTrainerV1(
            net.collect_params(), optimizer, optimizer_params,
            input_sparse_ratio=1./opt.input_sparse_1,
            output_sparse_ratio=1./opt.output_sparse_1,
            layer_sparse_ratio=1./opt.layer_sparse_1,
            local_sgd_interval=opt.local_sgd_interval)
    elif opt.trainer == 'ersgd2':
        trainer = ERSGD2TrainerV2(
            net.collect_params(), optimizer, optimizer_params,
            input_sparse_ratio_1=1./opt.input_sparse_1,
            output_sparse_ratio_1=1./opt.output_sparse_1,
            layer_sparse_ratio_1=1./opt.layer_sparse_1,
            input_sparse_ratio_2=1./opt.input_sparse_2,
            output_sparse_ratio_2=1./opt.output_sparse_2,
            layer_sparse_ratio_2=1./opt.layer_sparse_2,
            local_sgd_interval=opt.local_sgd_interval)
    else:
        # Fall back to plain distributed SGD.
        trainer = SGDTrainer(
            net.collect_params(), optimizer, optimizer_params)
    if opt.resume_states != '':
        trainer.load_states(opt.resume_states)

    # Mixup / label smoothing feed dense labels to the loss.
    if opt.label_smoothing or opt.mixup:
        sparse_label_loss = False
    else:
        sparse_label_loss = True
    if distillation:
        L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(
            temperature=opt.temperature,
            hard_weight=opt.hard_weight,
            sparse_label=sparse_label_loss)
    else:
        L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

    best_val_score = 1  # tracks the lowest top-1 error seen so far
    for epoch in range(opt.resume_epoch, opt.num_epochs):
        tic = time.time()
        if opt.use_rec:
            train_data.reset()
        # train_metric.reset()
        train_loss = 0
        btic = time.time()
        # test speed: n_repeats controls how often each batch is reused.
        # > 0 -> repeat each batch that many times; 0 -> normal training
        # (one pass); < 0 -> only time the data pipeline, skip compute.
        if opt.test_speed > 0:
            n_repeats = opt.test_speed
        elif opt.test_speed == 0:
            n_repeats = 1
        else:
            n_repeats = 0
        for i, batch in enumerate(train_data):
            # test speed
            if n_repeats == 0 and not (i+1)%opt.log_interval:
                print('[Epoch %d] # batch: %d'%(epoch, i))
                continue
            data, label = batch_fn(batch, ctx)
            for j in range(n_repeats):
                if opt.mixup:
                    # Convex-combine each device batch with its reverse.
                    lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                    if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                        lam = 1
                    data = [lam*X + (1-lam)*X[::-1] for X in data]
                    if opt.label_smoothing:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)
                elif opt.label_smoothing:
                    hard_label = label
                    label = smooth(label, classes)
                if distillation:
                    # Teacher soft targets at the configured temperature.
                    teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False))
                                               / opt.temperature)
                                    for X in data]
                with ag.record():
                    outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                    if distillation:
                        loss = [L(yhat.astype('float32', copy=False),
                                  y.astype('float32', copy=False),
                                  p.astype('float32', copy=False))
                                for yhat, y, p in zip(outputs, label, teacher_prob)]
                    else:
                        loss = [L(yhat, y.astype(opt.dtype, copy=False))
                                for yhat, y in zip(outputs, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                # if opt.mixup:
                #     output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False))
                #                       for out in outputs]
                #     train_metric.update(label, output_softmax)
                # else:
                #     if opt.label_smoothing:
                #         train_metric.update(hard_label, outputs)
                #     else:
                #         train_metric.update(label, outputs)
                step_loss = sum([l.sum().asscalar() for l in loss])
                train_loss += step_loss
                if opt.log_interval and not (i+j+1)%opt.log_interval:
                    # train_metric_name, train_metric_score = train_metric.get()
                    if hvd.rank() == 0:
                        # logger.info('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                        #     epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                        #     train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                        # print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                        #     epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                        #     train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                        print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            'loss', step_loss/batch_size, trainer.learning_rate,
                            trainer._comm_counter/1e6))
                    btic = time.time()

        mx.nd.waitall()
        toc = time.time()

        if n_repeats == 0:
            # Pipeline-timing mode: just synchronize the batch count and
            # move on to the next epoch.
            allreduce_array_nd = mx.nd.array([i])
            hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
            mx.nd.waitall()
            print('[Epoch %d] # total batch: %d'%(epoch, i))
            continue

        train_metric_name, train_metric_score = train_metric.get()
        throughput = int(batch_size * i /(toc - tic) * hvd.size())
        train_loss /= (batch_size * i)

        # Trainers with local/compressed state need explicit sync hooks
        # around evaluation.
        if opt.trainer == 'ersgd' or opt.trainer == 'qsparselocalsgd' or opt.trainer == 'ersgd2' or opt.trainer == 'partiallocalsgd':
            allreduce_for_val = True
        else:
            allreduce_for_val = False
        if allreduce_for_val:
            trainer.pre_test()
        # err_train_tic = time.time()
        # err_top1_train, err_top5_train = test(ctx, train_data, val=False)
        err_val_tic = time.time()
        err_top1_val, err_top5_val = test(ctx, val_data, val=True)
        err_val_toc = time.time()
        if allreduce_for_val:
            trainer.post_test()
        mx.nd.waitall()

        # allreduce the results
        allreduce_array_nd = mx.nd.array([train_loss, err_top1_val, err_top5_val])
        hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
        allreduce_array_np = allreduce_array_nd.asnumpy()
        train_loss = np.asscalar(allreduce_array_np[0])
        err_top1_val = np.asscalar(allreduce_array_np[1])
        err_top5_val = np.asscalar(allreduce_array_np[2])

        if hvd.rank() == 0:
            # logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] training: loss=%f'%(epoch, train_loss))
            logger.info('[Epoch %d] speed: %d samples/sec training-time: %f comm: %f'%(epoch, throughput, toc-tic, trainer._comm_counter/1e6))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f err-time=%f'%(epoch, err_top1_val, err_top5_val, err_val_toc - err_val_tic))
            trainer._comm_counter = 0

        if err_top1_val < best_val_score:
            best_val_score = err_top1_val
            # if hvd.local_rank() == 0:
            #     net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))
            #     trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states'%(save_dir, best_val_score, model_name, epoch))

        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            # Only one process per node writes checkpoints.
            if hvd.local_rank() == 0:
                net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, epoch))
def main():
    """Entry point: parse CLI options, initialize Horovod, build the data
    pipeline and model, then run the distributed training loop.

    One GPU per Horovod process (`mx.gpu(hvd.local_rank())`); the global batch
    is `opt.batch_size * hvd.size()`. Relies on module-level imports/helpers
    (parse_args, LRSequential, LRScheduler, get_model, the *Trainer classes,
    makedirs) defined elsewhere in this file.
    """
    opt = parse_args()

    hvd.init()

    # Log to both a per-trainer file and stderr; only rank 0 prints the config.
    logging_file = 'train_imagenet_%s.log' % (opt.trainer)
    filehandler = logging.FileHandler(logging_file)
    streamhandler = logging.StreamHandler()

    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)

    if hvd.rank() == 0:
        logger.info(opt)

    batch_size = opt.batch_size          # per-worker batch size
    classes = 1000                       # ImageNet-1k
    num_training_samples = 1281167       # ImageNet train-set size

    context = [mx.gpu(hvd.local_rank())]
    num_workers = opt.num_workers

    optimizer = opt.optimizer
    warmup_epochs = opt.warmup_epochs

    # Learning-rate schedule: either fixed-period decay or an explicit epoch list.
    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    # Shift decay epochs so they are relative to the post-warmup schedule.
    lr_decay_epoch = [e - warmup_epochs for e in lr_decay_epoch]
    # Batches per epoch per worker (dataset is sharded across hvd.size() workers).
    num_batches = num_training_samples // (batch_size * hvd.size())

    lr_scheduler = LRSequential([
        # Linear warmup from warmup_lr to lr, then the main decay schedule.
        LRScheduler('linear', base_lr=opt.warmup_lr, target_lr=opt.lr,
                    nepochs=warmup_epochs, iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode, base_lr=opt.lr, target_lr=0,
                    nepochs=opt.num_epochs - warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay, power=2)
    ])

    # Model construction kwargs; some flags only apply to specific families.
    model_name = opt.model
    kwargs = {'ctx': context, 'pretrained': opt.use_pretrained, 'classes': classes}
    if opt.use_gn:
        kwargs['norm_layer'] = gcv.nn.GroupNorm
    if model_name.startswith('vgg'):
        kwargs['batch_norm'] = opt.batch_norm
    elif model_name.startswith('resnext'):
        kwargs['use_se'] = opt.use_se
    if opt.last_gamma:
        kwargs['last_gamma'] = True

    optimizer_params = {'wd': opt.wd, 'momentum': opt.momentum, 'lr_scheduler': lr_scheduler}
    if opt.dtype != 'float32':
        # Keep a float32 master copy of weights when training in reduced precision.
        optimizer_params['multi_precision'] = True

    net = get_model(model_name, **kwargs)
    net.cast(opt.dtype)
    if opt.resume_params != '':
        net.load_parameters(opt.resume_params, ctx = context)

    # teacher model for distillation training
    # Distillation only makes sense when some weight is given to the soft labels.
    if opt.teacher is not None and opt.hard_weight < 1.0:
        teacher_name = opt.teacher
        teacher = get_model(teacher_name, pretrained=True, classes=classes, ctx=context)
        teacher.cast(opt.dtype)
        distillation = True
    else:
        distillation = False

    # Two functions for reading data from record file or raw images
    def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers):
        """Build sharded RecordIO iterators; returns (train_data, val_data, batch_fn)."""
        rec_train = os.path.expanduser(rec_train)
        rec_train_idx = os.path.expanduser(rec_train_idx)
        rec_val = os.path.expanduser(rec_val)
        rec_val_idx = os.path.expanduser(rec_val_idx)
        jitter_param = 0.4
        lighting_param = 0.1
        input_size = opt.input_size
        crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))
        # Channel-wise normalization constants in 0-255 pixel scale.
        mean_rgb = [123.68, 116.779, 103.939]
        std_rgb = [58.393, 57.12, 57.375]

        def batch_fn(batch, ctx):
            # Split one iterator batch across the device list.
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            return data, label

        train_data = mx.io.ImageRecordIter(
            path_imgrec         = rec_train,
            path_imgidx         = rec_train_idx,
            preprocess_threads  = num_workers,
            shuffle             = True,
            batch_size          = batch_size,
            round_batch         = False,
            data_shape          = (3, input_size, input_size),
            mean_r              = mean_rgb[0],
            mean_g              = mean_rgb[1],
            mean_b              = mean_rgb[2],
            std_r               = std_rgb[0],
            std_g               = std_rgb[1],
            std_b               = std_rgb[2],
            rand_mirror         = True,
            random_resized_crop = True,
            max_aspect_ratio    = 4. / 3.,
            min_aspect_ratio    = 3. / 4.,
            max_random_area     = 1,
            min_random_area     = 0.08,
            brightness          = jitter_param,
            saturation          = jitter_param,
            contrast            = jitter_param,
            pca_noise           = lighting_param,
            # Shard the training set: each Horovod worker reads its own part.
            num_parts           = hvd.size(),
            part_index          = hvd.rank(),
        )
        # Validation is NOT sharded here: every worker iterates the full val set.
        val_data = mx.io.ImageRecordIter(
            path_imgrec         = rec_val,
            path_imgidx         = rec_val_idx,
            preprocess_threads  = num_workers,
            shuffle             = False,
            batch_size          = batch_size,
            resize              = resize,
            data_shape          = (3, input_size, input_size),
            mean_r              = mean_rgb[0],
            mean_g              = mean_rgb[1],
            mean_b              = mean_rgb[2],
            std_r               = std_rgb[0],
            std_g               = std_rgb[1],
            std_b               = std_rgb[2],
        )
        return train_data, val_data, batch_fn

    def get_data_loader(data_dir, batch_size, num_workers):
        """Build DataLoaders over raw ImageNet folders; returns (train_data, val_data, batch_fn).

        NOTE(review): unlike get_data_rec, this path does not shard the training
        set across Horovod workers — confirm whether that is intentional.
        """
        normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        jitter_param = 0.4
        lighting_param = 0.1
        input_size = opt.input_size
        crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            return data, label

        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomFlipLeftRight(),
            transforms.RandomColorJitter(brightness=jitter_param, contrast=jitter_param,
                                         saturation=jitter_param),
            transforms.RandomLighting(lighting_param),
            transforms.ToTensor(),
            normalize
        ])
        transform_test = transforms.Compose([
            transforms.Resize(resize, keep_ratio=True),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize
        ])

        train_data = gluon.data.DataLoader(
            imagenet.classification.ImageNet(data_dir, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)
        val_data = gluon.data.DataLoader(
            imagenet.classification.ImageNet(data_dir, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        return train_data, val_data, batch_fn

    if opt.use_rec:
        train_data, val_data, batch_fn = get_data_rec(opt.rec_train, opt.rec_train_idx,
                                                      opt.rec_val, opt.rec_val_idx,
                                                      batch_size, num_workers)
    else:
        train_data, val_data, batch_fn = get_data_loader(opt.data_dir, batch_size, num_workers)

    # Mixup produces soft labels, so accuracy is not applicable; use RMSE instead.
    if opt.mixup:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)

    save_frequency = opt.save_frequency
    if opt.save_dir and save_frequency:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        # Disable checkpointing when no save dir / frequency is configured.
        save_dir = ''
        save_frequency = 0

    def mixup_transform(label, classes, lam=1, eta=0.0):
        """Mix one-hot labels with their batch-reversed counterpart (mixup),
        optionally label-smoothed by eta."""
        if isinstance(label, nd.NDArray):
            label = [label]
        res = []
        for l in label:
            y1 = l.one_hot(classes, on_value = 1 - eta + eta/classes, off_value = eta/classes)
            y2 = l[::-1].one_hot(classes, on_value = 1 - eta + eta/classes, off_value = eta/classes)
            res.append(lam*y1 + (1-lam)*y2)
        return res

    def smooth(label, classes, eta=0.1):
        """Convert hard labels to eta-smoothed one-hot labels."""
        if isinstance(label, nd.NDArray):
            label = [label]
        smoothed = []
        for l in label:
            res = l.one_hot(classes, on_value = 1 - eta + eta/classes, off_value = eta/classes)
            smoothed.append(res)
        return smoothed

    def test(ctx, val_data, val=True):
        """Evaluate `net` on `val_data`; returns (top1-error, top5-error).

        With record iterators, `val=False` resets the (closed-over) train
        iterator instead of the one passed in — used when evaluating on the
        training set.
        """
        if opt.use_rec:
            if val:
                val_data.reset()
            else:
                train_data.reset()
        acc_top1.reset()
        acc_top5.reset()
        for i, batch in enumerate(val_data):
            data, label = batch_fn(batch, ctx)
            outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
            acc_top1.update(label, outputs)
            acc_top5.update(label, outputs)
        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        return (1-top1, 1-top5)

    def train(ctx):
        """Run the full training loop on the given context list."""
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if opt.resume_params == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            # Common trick: exclude BN parameters and biases from weight decay.
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        # Make all workers start from rank 0's parameters.
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)

        # trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
        # trainer = hvd.DistributedTrainer(
        #     net.collect_params(),
        #     optimizer,
        #     optimizer_params)
        # Select the (compressed / local-SGD) trainer variant; these classes are
        # defined elsewhere in this file. Unknown values fall back to plain SGD.
        if opt.trainer == 'sgd':
            trainer = SGDTrainer(
                net.collect_params(), optimizer, optimizer_params)
        elif opt.trainer == 'efsgd':
            trainer = EFSGDTrainerV1(
                net.collect_params(), 'EFSGDV1', optimizer_params,
                input_sparse_ratio=1./opt.input_sparse_1,
                output_sparse_ratio=1./opt.output_sparse_1,
                layer_sparse_ratio=1./opt.layer_sparse_1)
        elif opt.trainer == 'qsparselocalsgd':
            trainer = QSparseLocalSGDTrainerV1(
                net.collect_params(), optimizer, optimizer_params,
                input_sparse_ratio=1./opt.input_sparse_1,
                output_sparse_ratio=1./opt.output_sparse_1,
                layer_sparse_ratio=1./opt.layer_sparse_1,
                local_sgd_interval=opt.local_sgd_interval)
        elif opt.trainer == 'ersgd':
            trainer = ERSGDTrainerV2(
                net.collect_params(), optimizer, optimizer_params,
                input_sparse_ratio=1./opt.input_sparse_1,
                output_sparse_ratio=1./opt.output_sparse_1,
                layer_sparse_ratio=1./opt.layer_sparse_1)
        elif opt.trainer == 'partiallocalsgd':
            trainer = PartialLocalSGDTrainerV1(
                net.collect_params(), optimizer, optimizer_params,
                input_sparse_ratio=1./opt.input_sparse_1,
                output_sparse_ratio=1./opt.output_sparse_1,
                layer_sparse_ratio=1./opt.layer_sparse_1,
                local_sgd_interval=opt.local_sgd_interval)
        elif opt.trainer == 'ersgd2':
            trainer = ERSGD2TrainerV2(
                net.collect_params(), optimizer, optimizer_params,
                input_sparse_ratio_1=1./opt.input_sparse_1,
                output_sparse_ratio_1=1./opt.output_sparse_1,
                layer_sparse_ratio_1=1./opt.layer_sparse_1,
                input_sparse_ratio_2=1./opt.input_sparse_2,
                output_sparse_ratio_2=1./opt.output_sparse_2,
                layer_sparse_ratio_2=1./opt.layer_sparse_2,
                local_sgd_interval=opt.local_sgd_interval)
        else:
            trainer = SGDTrainer(
                net.collect_params(), optimizer, optimizer_params)

        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        # Soft labels (smoothing / mixup) need dense-label cross entropy.
        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature,
                                                             hard_weight=opt.hard_weight,
                                                             sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

        best_val_score = 1

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            if opt.use_rec:
                train_data.reset()
            # train_metric.reset()
            train_loss = 0

            btic = time.time()

            # test speed
            # test_speed > 0: repeat each batch that many times (throughput test);
            # test_speed == 0: normal training (one pass per batch);
            # test_speed < 0: data-pipeline-only mode (skip compute entirely).
            if opt.test_speed > 0:
                n_repeats = opt.test_speed
            elif opt.test_speed == 0:
                n_repeats = 1
            else:
                n_repeats = 0

            for i, batch in enumerate(train_data):
                # test speed
                if n_repeats == 0 and not (i+1)%opt.log_interval:
                    print('[Epoch %d] # batch: %d'%(epoch, i))
                    continue
                data, label = batch_fn(batch, ctx)

                for j in range(n_repeats):

                    if opt.mixup:
                        lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                        # Disable mixup for the last mixup_off_epoch epochs.
                        if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                            lam = 1
                        data = [lam*X + (1-lam)*X[::-1] for X in data]

                        if opt.label_smoothing:
                            eta = 0.1
                        else:
                            eta = 0.0
                        label = mixup_transform(label, classes, lam, eta)

                    elif opt.label_smoothing:
                        hard_label = label
                        label = smooth(label, classes)

                    if distillation:
                        # Teacher soft targets at the distillation temperature.
                        teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \
                                        for X in data]

                    with ag.record():
                        outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                        if distillation:
                            loss = [L(yhat.astype('float32', copy=False),
                                      y.astype('float32', copy=False),
                                      p.astype('float32', copy=False))
                                    for yhat, y, p in zip(outputs, label, teacher_prob)]
                        else:
                            loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
                    for l in loss:
                        l.backward()

                    trainer.step(batch_size)

                    # if opt.mixup:
                    #     output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                    #                     for out in outputs]
                    #     train_metric.update(label, output_softmax)
                    # else:
                    #     if opt.label_smoothing:
                    #         train_metric.update(hard_label, outputs)
                    #     else:
                    #         train_metric.update(label, outputs)

                    # NOTE: .asscalar() forces a sync with the GPU every iteration.
                    step_loss = sum([l.sum().asscalar() for l in loss])

                    train_loss += step_loss

                if opt.log_interval and not (i+j+1)%opt.log_interval:
                    # train_metric_name, train_metric_score = train_metric.get()
                    if hvd.rank() == 0:
                        # logger.info('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                        #             epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                        #             train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                        # print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                        #             epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                        #             train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                        print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            'loss', step_loss/batch_size, trainer.learning_rate, trainer._comm_counter/1e6))
                    btic = time.time()

            mx.nd.waitall()
            toc = time.time()

            if n_repeats == 0:
                # Pipeline-only mode: just agree on the batch count and move on.
                allreduce_array_nd = mx.nd.array([i])
                hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
                mx.nd.waitall()
                print('[Epoch %d] # total batch: %d'%(epoch, i))
                continue

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i /(toc - tic) * hvd.size())

            train_loss /= (batch_size * i)

            # Local-SGD style trainers need an explicit sync around evaluation.
            if opt.trainer == 'ersgd' or opt.trainer == 'qsparselocalsgd' or opt.trainer == 'ersgd2' or opt.trainer == 'partiallocalsgd':
                allreduce_for_val = True
            else:
                allreduce_for_val = False

            if allreduce_for_val:
                trainer.pre_test()
            # err_train_tic = time.time()
            # err_top1_train, err_top5_train = test(ctx, train_data, val=False)
            err_val_tic = time.time()
            err_top1_val, err_top5_val = test(ctx, val_data, val=True)
            err_val_toc = time.time()
            if allreduce_for_val:
                trainer.post_test()

            mx.nd.waitall()

            # allreduce the results
            allreduce_array_nd = mx.nd.array([train_loss, err_top1_val, err_top5_val])
            hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
            allreduce_array_np = allreduce_array_nd.asnumpy()
            # NOTE(review): np.asscalar was removed in NumPy >= 1.23;
            # allreduce_array_np[k].item() is the drop-in replacement.
            train_loss = np.asscalar(allreduce_array_np[0])
            err_top1_val = np.asscalar(allreduce_array_np[1])
            err_top5_val = np.asscalar(allreduce_array_np[2])

            if hvd.rank() == 0:
                # logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score))
                logger.info('[Epoch %d] training: loss=%f'%(epoch, train_loss))
                logger.info('[Epoch %d] speed: %d samples/sec training-time: %f comm: %f'%(epoch, throughput, toc-tic, trainer._comm_counter/1e6))
                logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f err-time=%f'%(epoch, err_top1_val, err_top5_val, err_val_toc - err_val_tic))
                trainer._comm_counter = 0

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                # if hvd.local_rank() == 0:
                #     net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))
                #     trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states'%(save_dir, best_val_score, model_name, epoch))

            # Periodic checkpoint, written once per machine (local rank 0 only).
            if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
                if hvd.local_rank() == 0:
                    net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, epoch))
                    trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, epoch))

        # if save_frequency and save_dir:
        #     if hvd.local_rank() == 0:
        #         net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, opt.num_epochs-1))
        #         trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, opt.num_epochs-1))

    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
        if distillation:
            teacher.hybridize(static_alloc=True, static_shape=True)

    train(context)
def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers):
    """Build RecordIO train/val pipelines sharded across Horovod workers.

    Returns a tuple ``(train_data, val_data, batch_fn)`` where ``batch_fn``
    scatters one iterator batch across a list of devices.
    """
    # Expand '~' in every record/index path up front.
    rec_train, rec_train_idx, rec_val, rec_val_idx = (
        os.path.expanduser(p) for p in (rec_train, rec_train_idx, rec_val, rec_val_idx))

    color_jitter = 0.4
    pca_lighting = 0.1
    input_size = opt.input_size
    crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
    resize = int(math.ceil(input_size / crop_ratio))
    # Channel-wise normalization constants in 0-255 pixel scale.
    mean_rgb = [123.68, 116.779, 103.939]
    std_rgb = [58.393, 57.12, 57.375]

    def batch_fn(batch, ctx):
        # Split the batch along axis 0 across the device list.
        data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
        return data, label

    # Normalization arguments shared by both iterators.
    norm_kwargs = dict(
        mean_r=mean_rgb[0], mean_g=mean_rgb[1], mean_b=mean_rgb[2],
        std_r=std_rgb[0], std_g=std_rgb[1], std_b=std_rgb[2],
    )

    train_data = mx.io.ImageRecordIter(
        path_imgrec=rec_train,
        path_imgidx=rec_train_idx,
        preprocess_threads=num_workers,
        shuffle=True,
        batch_size=batch_size,
        round_batch=False,
        data_shape=(3, input_size, input_size),
        rand_mirror=True,
        random_resized_crop=True,
        max_aspect_ratio=4. / 3.,
        min_aspect_ratio=3. / 4.,
        max_random_area=1,
        min_random_area=0.08,
        brightness=color_jitter,
        saturation=color_jitter,
        contrast=color_jitter,
        pca_noise=pca_lighting,
        # Each Horovod worker reads its own shard of the training set.
        num_parts=hvd.size(),
        part_index=hvd.rank(),
        **norm_kwargs,
    )
    # Validation: no augmentation, no shuffling, resize-then-center semantics.
    val_data = mx.io.ImageRecordIter(
        path_imgrec=rec_val,
        path_imgidx=rec_val_idx,
        preprocess_threads=num_workers,
        shuffle=False,
        batch_size=batch_size,
        resize=resize,
        data_shape=(3, input_size, input_size),
        **norm_kwargs,
    )
    return train_data, val_data, batch_fn
# use sync bn if specified if args.syncbn and len(ctx) > 1: net = get_model(net_name, pretrained_base=True, norm_layer=gluon.contrib.nn.SyncBatchNorm, norm_kwargs={'num_devices': len(ctx)}) async_net = get_model(net_name, pretrained_base=False) # used by cpu worker else: net = get_model(net_name, pretrained_base=True) async_net = net if args.resume.strip(): net.load_parameters(args.resume.strip()) async_net.load_parameters(args.resume.strip()) else: with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") net.initialize() async_net.initialize() # training data batch_size = (args.batch_size // hvd.size()) if args.horovod else args.batch_size train_dataset, val_dataset, eval_metric = get_dataset(args.dataset, args) train_data, val_data = get_dataloader(async_net, train_dataset, val_dataset, args.data_shape, batch_size, args.num_workers, args) # training train(net, train_data, val_data, eval_metric, ctx, args)
args = parser.parse_args() # logging level = logging.DEBUG if args.verbose else logging.INFO logging.getLogger().setLevel(level) logging.info(args) os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' try: import horovod.mxnet as hvd except ImportError: logging.info('horovod must be installed.') exit() hvd.init() store = None num_workers = hvd.size() rank = hvd.rank() local_rank = hvd.local_rank() is_master_node = rank == local_rank if not args.use_avg_len and hvd.size() > 1: logging.info('Specifying --use-avg-len and setting --batch_size with the ' 'target number of tokens would help improve training throughput.') def train(data_train, data_eval, model, nsp_loss, mlm_loss, vocab_size, ctx): """Training function.""" hvd.broadcast_parameters(model.collect_params(), root_rank=0) mlm_metric = nlp.metric.MaskedAccuracy() nsp_metric = nlp.metric.MaskedAccuracy() mlm_metric.reset() nsp_metric.reset()
default=0, help='frequency of model saving (default: 0)') parser.add_argument( '--gradient-predivide-factor', type=float, default=1.0, help='apply gradient predivide factor in optimizer (default: 1.0)') args = parser.parse_args() logging.basicConfig(level=logging.INFO) logging.info(args) # Horovod: initialize Horovod hvd.init() num_workers = hvd.size() rank = hvd.rank() local_rank = hvd.local_rank() num_classes = 1000 num_training_samples = 1281167 batch_size = args.batch_size epoch_size = \ int(math.ceil(int(num_training_samples // num_workers) / batch_size)) if args.lr_mode == 'step': lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')] steps = [epoch_size * x for x in lr_decay_epoch] lr_sched = lr_scheduler.MultiFactorScheduler( step=steps, factor=args.lr_decay,
if args.horovod: ctx = [mx.gpu(hvd.local_rank())] else: ctx = [mx.gpu(int(i)) for i in args.gpus.split(',') if i.strip()] ctx = ctx if ctx else [mx.cpu()] # network kwargs = {} module_list = [] if args.use_fpn: module_list.append('fpn') if args.norm_layer is not None: module_list.append(args.norm_layer) if args.norm_layer == 'bn': kwargs['num_devices'] = len(ctx) num_gpus = hvd.size() if args.horovod else len(ctx) net_name = '_'.join(('mask_rcnn', *module_list, args.network, args.dataset)) if args.custom_model: args.use_fpn = True net_name = '_'.join(('mask_rcnn_fpn', args.network, args.dataset)) if args.norm_layer == 'bn': norm_layer = gluon.contrib.nn.SyncBatchNorm norm_kwargs = {'num_devices': len(ctx)} sym_norm_layer = mx.sym.contrib.SyncBatchNorm sym_norm_kwargs = {'ndev': len(ctx)} elif args.norm_layer == 'gn': norm_layer = gluon.nn.GroupNorm norm_kwargs = {'groups': 8} sym_norm_layer = mx.sym.GroupNorm sym_norm_kwargs = {'groups': 8} else:
# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import json
import os

import horovod.mxnet as hvd

# Initialize Horovod so the rank/size queries below are valid.
hvd.init()

local_rank = hvd.local_rank()
rank = hvd.rank()

# Drop a per-process marker file in the SageMaker model directory recording
# this worker's placement in the Horovod job.
marker_path = os.path.join(
    '/opt/ml/model/local-rank-%s-rank-%s' % (local_rank, rank))
with open(marker_path, 'w+') as f:
    basic_info = {
        'local-rank': local_rank,
        'rank': rank,
        'size': hvd.size()
    }
    print(basic_info)
    json.dump(basic_info, f)
stride=(2, 2)) # first fully connected layer flatten = mx.sym.flatten(data=pool2) fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=50) relu3 = mx.sym.Activation(data=fc1, act_type='relu') # second fully connected layer fc2 = mx.sym.FullyConnected(data=relu3, num_hidden=10) # softmax loss loss = mx.sym.SoftmaxOutput(data=fc2, name='softmax') return loss # Step 4: fit the model net = conv_net() model = mx.mod.Module(symbol=net, context=context) optimizer_params = {'learning_rate': args.lr * hvd.size(), 'rescale_grad': 1.0 / args.batch_size} opt = mx.optimizer.create('sgd', **optimizer_params) # Horovod: wrap optimizer with DistributedOptimizer opt = hvd.DistributedOptimizer(opt) initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) model.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) model.init_params(initializer) # Horovod: fetch and broadcast parameters (arg_params, aux_params) = model.get_params() if arg_params is not None:
def _init_network(self):
    """Construct the Faster R-CNN network from self._cfg.

    Resolves the training contexts, builds either a transfer-learning model
    (reusing matching class weights) or a custom FPN model with the configured
    normalization layers, then loads resumed parameters or initializes the
    remaining ones and moves everything to self.ctx. Requires self.num_class /
    self.classes to already be known.
    """
    if not self.num_class:
        raise ValueError(
            'Unable to create network when `num_class` is unknown. \
            It should be inferred from dataset or resumed from saved states.')
    assert len(self.classes) == self.num_class

    # training contexts
    if self._cfg.horovod:
        # One GPU per Horovod process.
        self.ctx = [mx.gpu(hvd.local_rank())]
    else:
        ctx = [mx.gpu(int(i)) for i in self._cfg.gpus]
        self.ctx = ctx if ctx else [mx.cpu()]

    # network
    kwargs = {}
    module_list = []
    if self._cfg.faster_rcnn.use_fpn:
        module_list.append('fpn')
    if self._cfg.faster_rcnn.norm_layer is not None:
        module_list.append(self._cfg.faster_rcnn.norm_layer)
        if self._cfg.faster_rcnn.norm_layer == 'syncbn':
            # SyncBatchNorm needs to know how many devices it spans.
            kwargs['num_devices'] = len(self.ctx)

    self.num_gpus = hvd.size() if self._cfg.horovod else len(self.ctx)

    if self._cfg.faster_rcnn.transfer is not None:
        # Transfer learning from a named pretrained model; most other network
        # config options are ignored in this branch.
        assert isinstance(self._cfg.faster_rcnn.transfer, str)
        self._logger.info(
            f'Using transfer learning from {self._cfg.faster_rcnn.transfer}, ' +
            'the other network parameters are ignored.')
        self._cfg.faster_rcnn.use_fpn = 'fpn' in self._cfg.faster_rcnn.transfer
        self.net = get_model(
            self._cfg.faster_rcnn.transfer, pretrained=True,
            per_device_batch_size=self._cfg.train.batch_size // self.num_gpus,
            **kwargs)
        # Reuse output weights for any class names shared with the source model.
        self.net.reset_class(self.classes, reuse_weights=[
            cname for cname in self.classes if cname in self.net.classes
        ])
    else:
        self._cfg.faster_rcnn.use_fpn = True
        # Pick matching imperative/symbolic normalization layers.
        if self._cfg.faster_rcnn.norm_layer == 'syncbn':
            norm_layer = gluon.contrib.nn.SyncBatchNorm
            norm_kwargs = {'num_devices': len(self.ctx)}
            sym_norm_layer = mx.sym.contrib.SyncBatchNorm
            sym_norm_kwargs = {'ndev': len(self.ctx)}
        elif self._cfg.faster_rcnn.norm_layer == 'gn':
            norm_layer = gluon.nn.GroupNorm
            norm_kwargs = {'groups': 8}
            sym_norm_layer = mx.sym.GroupNorm
            sym_norm_kwargs = {'groups': 8}
        else:
            norm_layer = gluon.nn.BatchNorm
            norm_kwargs = None
            sym_norm_layer = None
            sym_norm_kwargs = None
        self.net = get_model(
            'custom_faster_rcnn_fpn', classes=self.classes, transfer=None,
            dataset=self._cfg.dataset,
            pretrained_base=self._cfg.train.pretrained_base,
            base_network_name=self._cfg.faster_rcnn.backbone,
            norm_layer=norm_layer, norm_kwargs=norm_kwargs,
            sym_norm_layer=sym_norm_layer, sym_norm_kwargs=sym_norm_kwargs,
            num_fpn_filters=self._cfg.faster_rcnn.num_fpn_filters,
            num_box_head_conv=self._cfg.faster_rcnn.num_box_head_conv,
            num_box_head_conv_filters=self._cfg.faster_rcnn.
            num_box_head_conv_filters,
            num_box_head_dense_filters=self._cfg.faster_rcnn.
            num_box_head_dense_filters,
            short=self._cfg.faster_rcnn.image_short,
            max_size=self._cfg.faster_rcnn.image_max_size,
            min_stage=2, max_stage=6,
            nms_thresh=self._cfg.faster_rcnn.nms_thresh,
            nms_topk=self._cfg.faster_rcnn.nms_topk,
            roi_mode=self._cfg.faster_rcnn.roi_mode,
            roi_size=self._cfg.faster_rcnn.roi_size,
            strides=self._cfg.faster_rcnn.strides,
            clip=self._cfg.faster_rcnn.clip,
            rpn_channel=self._cfg.faster_rcnn.rpn_channel,
            base_size=self._cfg.faster_rcnn.anchor_base_size,
            scales=self._cfg.faster_rcnn.anchor_scales,
            ratios=self._cfg.faster_rcnn.anchor_aspect_ratio,
            alloc_size=self._cfg.faster_rcnn.anchor_alloc_size,
            rpn_nms_thresh=self._cfg.faster_rcnn.rpn_nms_thresh,
            rpn_train_pre_nms=self._cfg.train.rpn_train_pre_nms,
            rpn_train_post_nms=self._cfg.train.rpn_train_post_nms,
            rpn_test_pre_nms=self._cfg.valid.rpn_test_pre_nms,
            rpn_test_post_nms=self._cfg.valid.rpn_test_post_nms,
            rpn_min_size=self._cfg.train.rpn_min_size,
            per_device_batch_size=self._cfg.train.batch_size // self.num_gpus,
            num_sample=self._cfg.train.rcnn_num_samples,
            pos_iou_thresh=self._cfg.train.rcnn_pos_iou_thresh,
            pos_ratio=self._cfg.train.rcnn_pos_ratio,
            max_num_gt=self._cfg.faster_rcnn.max_num_gt)
    # Load resumed weights, or initialize any parameter not yet bound to data.
    if self._cfg.resume.strip():
        self.net.load_parameters(self._cfg.resume.strip())
    else:
        for param in self.net.collect_params().values():
            if param._data is not None:
                continue
            param.initialize()
    self.net.collect_params().reset_ctx(self.ctx)
    if self._cfg.faster_rcnn.amp:
        # Cast both weights and gradients to 'float16'
        self.net.cast('float16')
        # These layers don't support type 'float16'
        self.net.collect_params('.*batchnorm.*').setattr(
            'dtype', 'float32')
        self.net.collect_params(
            '.*normalizedperclassboxcenterencoder.*').setattr(
            'dtype', 'float32')
    # NOTE(review): the resume/initialize/reset_ctx sequence below duplicates
    # the one above the AMP block; the repeat looks redundant unless it is
    # deliberately re-run after the float16 cast — confirm before removing.
    if self._cfg.resume.strip():
        self.net.load_parameters(self._cfg.resume.strip())
    else:
        for param in self.net.collect_params().values():
            if param._data is not None:
                continue
            param.initialize()
    self.net.collect_params().reset_ctx(self.ctx)