def get_mnist_iterator(rank):
    """Download MNIST into a per-rank directory and build train/val iterators.

    Each Horovod worker extracts its own copy of the dataset and reads a
    disjoint shard (num_parts/part_index) of both splits.
    """
    workdir = "data-%d" % rank
    if not os.path.isdir(workdir):
        os.makedirs(workdir)
    archive = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                       dirname=workdir)
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(workdir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % workdir,
        label="%s/train-labels-idx1-ubyte" % workdir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % workdir,
        label="%s/t10k-labels-idx1-ubyte" % workdir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )
    return train_iter, val_iter
def test_horovod_broadcast_grad(self):
    """Test the correctness of the broadcast gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")

    dtypes = ['int32', 'int64', 'float32', 'float64']
    dims = [1, 2, 3]
    ctx = self._current_context()
    count = 0
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    root_rank = 1
    tensor_dict = {}
    root_dict = {}
    # Build one tensor per (dtype, dim) combination: local values are this
    # worker's rank, expected values are root_rank's.
    for dtype, dim, in itertools.product(dtypes, dims):
        tensor_dict[count] = mx.nd.ones(shapes[dim], ctx=ctx) * rank
        root_dict[count] = mx.nd.ones(shapes[dim], ctx=ctx) * root_rank
        tensor_dict[count] = tensor_dict[count].astype(dtype)
        root_dict[count] = root_dict[count].astype(dtype)

        # Only do broadcasting using and on broadcast_tensor
        count += 1

    # After broadcast_parameters every entry should equal the root's values.
    hvd.broadcast_parameters(tensor_dict, root_rank=root_rank)
    for i in range(count):
        if not same(tensor_dict[i].asnumpy(), root_dict[i].asnumpy()):
            # NOTE(review): dtype/dim here are the final values left over
            # from the build loop, not the pair matching index i.
            print("broadcast", count, dtype, dim)
            print("broadcast_tensor", hvd.rank(), tensor_dict[i])
            print("root_tensor", hvd.rank(), root_dict[i])
            print("comparison", hvd.rank(), tensor_dict[i] == root_dict[i])
        assert same(tensor_dict[i].asnumpy(), root_dict[i].asnumpy()), \
            'hvd.broadcast produces incorrect broadcasted tensor'
def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different
    ranks try to perform reduction on CPU and GPU."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # A single worker cannot produce a cross-device mismatch.
    if size == 1:
        return

    shape = (17, 17, 17)
    # Even ranks place the tensor on GPU, odd ranks on CPU.
    device = mx.gpu(hvd.rank()) if rank % 2 == 0 else mx.cpu(hvd.rank())
    payload = mx.nd.ones(shape=shape, ctx=device)
    try:
        reduced = hvd.allreduce(payload)
        reduced.wait_to_read()
        assert False, 'hvd.allreduce did not throw cpu-gpu error'
    except (MXNetError, RuntimeError):
        pass
def get_async_results(self, waitall=False):
    """Collect finished async validation results and broadcast the mAP.

    Only rank 0 polls the async executor; the resulting mAP value is then
    broadcast to all ranks. Returns (val_epoch, val_map); both are -1 when
    nothing has completed yet.
    """
    val_map = -1
    val_epoch = -1
    if hvd.rank() == 0:
        if waitall:
            # Block until every outstanding evaluation has finished.
            results = self.async_executor.result()
        else:
            # Non-blocking: take whatever has completed so far.
            results = self.async_executor.pop_done()
        if results and len(results) > 0:
            # get highest mAP (in case multiple results are returned)
            val_epoch = max(results, key=results.get)
            val_map = results[val_epoch]
    # All ranks must participate in the broadcast.
    # NOTE(review): only val_map is broadcast — non-root ranks always return
    # val_epoch == -1; confirm callers only read val_epoch on rank 0.
    val_map = comm.bcast(val_map, root=0)
    return val_epoch, val_map
def test_horovod_allreduce_inplace(self):
    """Test that the in-place allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    dtypes = self.filter_supported_types(['int32', 'int64', 'float32', 'float64'])
    dims = [1, 2, 3]
    ctx = self._current_context()
    count = 0
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    for dtype, dim in itertools.product(dtypes, dims):
        # Same seed on every rank so all workers start with identical tensors.
        mx.random.seed(1234, ctx=ctx)
        tensor = mx.nd.random.uniform(-100, 100, shape=shapes[dim], ctx=ctx)
        tensor = tensor.astype(dtype)
        multiplied = tensor * size
        hvd.allreduce_(tensor, average=False, name=str(count))
        max_difference = mx.nd.max(mx.nd.subtract(tensor, multiplied))
        count += 1

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in ['int32', 'int64']:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        if max_difference > threshold:
            print("self", count, dtype, dim, max_difference, threshold)
            print("tensor", hvd.rank(), tensor)
            print("multiplied", hvd.rank(), multiplied)
        # BUG FIX: the original source ended with a dangling backslash line
        # continuation, leaving the assertion message string unterminated.
        assert max_difference <= threshold, \
            'hvd.allreduce produces incorrect results for self'
def set_seed_distributed(local_seed):
    """Broadcast rank 0's seed, derive a per-rank seed, and seed all RNGs."""
    # Single-element tensor holding the local seed so it can be broadcast.
    seed_tensor = nd.full((1), local_seed, dtype=np.int32)
    if hvd.size() > 1:
        seed_tensor = hvd.broadcast_(tensor=seed_tensor, root_rank=0,
                                     name="broadcast_the_seed")
    nd.ndarray.waitall()
    # Offset the shared seed by the rank so each worker seeds differently.
    local_seed = (seed_tensor[0].asscalar() + hvd.rank()) % 2**31
    log_event(key=mlperf_constants.SEED, value=local_seed)
    random.seed(local_seed)
    np.random.seed(local_seed)
    mx.random.seed(local_seed)
    return local_seed
def get_dataloader(net, train_dataset, val_dataset, train_transform, val_transform,
                   batch_size, args):
    """Get dataloader."""
    # Each Append() keeps one field of a sample as its own ndarray.
    train_batchify = batchify.Tuple(*[batchify.Append() for _ in range(6)])
    if args.horovod:
        train_sampler = gcv.nn.sampler.SplitSampler(len(train_dataset),
                                                    hvd.size(), hvd.rank())
    else:
        train_sampler = None
    # Shuffle only when no explicit sampler is used.
    train_loader = mx.gluon.data.DataLoader(
        train_dataset.transform(
            train_transform(net.short, net.max_size, net, ashape=net.ashape,
                            multi_stage=args.use_fpn)),
        batch_size, train_sampler is None, sampler=train_sampler,
        batchify_fn=train_batchify, last_batch='rollover',
        num_workers=args.num_workers)

    val_batchify = batchify.Tuple(*[batchify.Append() for _ in range(2)])
    short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
    val_loader = mx.gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)),
        batch_size, False, batchify_fn=val_batchify, last_batch='keep',
        num_workers=args.num_workers)
    return train_loader, val_loader
def init_comm(backend, gpus):
    """Init communication backend

    Parameters
    ----------
    backend
    gpus

    Returns
    -------
    store
    num_workers
    rank
    local_rank
    is_master_node
    ctx_l
    """
    # backend specific implementation
    import mxnet as mx
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd  # pylint: disable=import-outside-toplevel
        except ImportError:
            logging.info('horovod must be installed.')
            sys.exit(1)
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctx_l = [mx.gpu(local_rank)]
        logging.info('GPU communication supported by horovod')
    else:
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        if gpus in ('-1', ''):
            ctx_l = [mx.cpu()]
            logging.info('Runing on CPU')
        else:
            ctx_l = [mx.gpu(int(g)) for g in gpus.split(',')]
            logging.info('GPU communication supported by KVStore')
    return store, num_workers, rank, local_rank, is_master_node, ctx_l
def save_checkpoint(net, epoch, top1, best_acc, model_prefix, save_frequency, kvstore):
    """Persist model parameters periodically and whenever accuracy improves."""
    # Skip when checkpointing is disabled or on non-root horovod workers.
    if model_prefix is None or save_frequency == 0 \
            or ('horovod' in kvstore and hvd.rank() != 0):
        return

    def _save(path):
        # Write the parameters and log the checkpoint with its accuracy.
        net.save_parameters(path)
        logging.info(
            '[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(
                epoch, path, top1))

    if save_frequency > 0 and (epoch + 1) % save_frequency == 0:
        _save('{}_{:04}.params'.format(model_prefix, epoch))
    if top1 > best_acc:
        _save('{}_best.params'.format(model_prefix))
def test_horovod_allreduce_ndarray_lifetime(self):
    """Test that the input NDArray remains valid during async allreduce."""
    hvd.init()
    size = hvd.size()
    dims = [1, 2, 3]
    ctx = self._current_context()
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    for i, dim in enumerate(dims):
        tensor = mx.nd.ones(shape=shapes[dim], ctx=ctx)
        # The tensor*(i+1) temporary is released right after this call;
        # allreduce must keep it alive until the async op completes.
        # See https://github.com/horovod/horovod/issues/1533
        # (renamed from `sum`, which shadowed the builtin; dropped the
        # unused `rank` and `count` locals)
        reduced = hvd.allreduce(tensor * (i + 1), average=False)
        expected = tensor * (i + 1) * size
        assert same(reduced.asnumpy(), expected.asnumpy())
def test_horovod_broadcast_inplace(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    dtypes = ['int32', 'int64', 'float32', 'float64']
    dims = [1, 2, 3]
    ctx = self._current_context()
    count = 0
    shapes = [(), (17), (17, 17), (17, 17, 17)]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        # Local tensor holds this worker's rank; root_tensor holds the
        # value expected after broadcasting from root_rank.
        tensor = mx.nd.ones(shapes[dim], ctx=ctx) * rank
        root_tensor = mx.nd.ones(shapes[dim], ctx=ctx) * root_rank
        tensor = tensor.astype(dtype)
        root_tensor = root_tensor.astype(dtype)

        # Only do broadcasting using and on broadcast_tensor
        broadcast_tensor = tensor.copy()
        hvd.broadcast_(broadcast_tensor, root_rank=root_rank, name=str(count))
        if rank != root_rank:
            # The source tensor must be left untouched on non-root ranks.
            if same(tensor.asnumpy(), root_tensor.asnumpy()):
                print("broadcast", count, dtype, dim,
                      mx.nd.max(tensor == root_tensor))
                print("tensor", hvd.rank(), tensor)
                print("root_tensor", hvd.rank(), root_tensor)
                print("comparison", hvd.rank(), tensor == root_tensor)
            assert not same(tensor.asnumpy(), root_tensor.asnumpy()), \
                'hvd.broadcast modifies source tensor'
        if not same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()):
            print("broadcast", count, dtype, dim)
            print("broadcast_tensor", hvd.rank(), broadcast_tensor)
            print("root_tensor", hvd.rank(), root_tensor)
            print("comparison", hvd.rank(), broadcast_tensor == root_tensor)
        broadcast_tensor.wait_to_read()
        tensor.wait_to_read()
        assert same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()), \
            'hvd.broadcast produces incorrect broadcasted tensor'
def get_dataloader(net, train_dataset, val_dataset, train_transform, val_transform,
                   batch_size, num_shards, args):
    """Get dataloader."""
    train_bfn = FasterRCNNTrainBatchify(net, num_shards)
    if hasattr(train_dataset, 'get_im_aspect_ratio'):
        im_aspect_ratio = train_dataset.get_im_aspect_ratio()
    else:
        im_aspect_ratio = [1.] * len(train_dataset)
    train_sampler = \
        gcv.nn.sampler.SplitSortedBucketSampler(im_aspect_ratio, batch_size,
                                                num_parts=hvd.size() if args.horovod else 1,
                                                part_index=hvd.rank() if args.horovod else 0,
                                                shuffle=True)
    # dataset: train_dataset.transform(train_transform(net.short, net.max_size,
    #          net, ashape=net.ashape, multi_stage=args.use_fpn))
    # ashape: the predefined anchor size
    # multi_stage + ashape: used to compute the anchors
    train_loader = mx.gluon.data.DataLoader(train_dataset.transform(
        train_transform(net.short, net.max_size, net, ashape=net.ashape,
                        multi_stage=args.use_fpn)),
        batch_sampler=train_sampler, batchify_fn=train_bfn,
        num_workers=args.num_workers)
    val_bfn = Tuple(*[Append() for _ in range(3)])
    short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
    # validation use 1 sample per device
    # dataset: val_dataset.transform(val_transform(short, net.max_size))
    # each item returns img, bbox.astype('float32'), mx.nd.array([im_scale])
    # bbox: x1, y1, x2, y2, class_id
    # the image's shorter side is <= short and its longer side <= net.max_size
    # Tuple here is the batchify Tuple, not Python's built-in tuple
    # Append(): each sample stays its own ndarray (sample sizes may differ);
    #           the returned batch is a list
    # val_bfn has 3 Append()s, one per attribute of a dataset item
    val_loader = mx.gluon.data.DataLoader(val_dataset.transform(
        val_transform(short, net.max_size)),
        num_shards, False, batchify_fn=val_bfn, last_batch='keep',
        num_workers=args.num_workers)
    return train_loader, val_loader
def get_dataloader(
    net,
    train_dataset,
    val_dataset,
    train_transform,
    val_transform,
    batch_size,
    num_shards_per_process,
    args,
):
    """Get dataloader."""
    train_batchify = batchify.MaskRCNNTrainBatchify(net, num_shards_per_process)
    # Bucket samples by aspect ratio; under horovod each worker reads its shard.
    if args.horovod:
        num_parts, part_index = hvd.size(), hvd.rank()
    else:
        num_parts, part_index = 1, 0
    train_sampler = gcv.nn.sampler.SplitSortedBucketSampler(
        train_dataset.get_im_aspect_ratio(),
        batch_size,
        num_parts=num_parts,
        part_index=part_index,
        shuffle=True,
    )
    train_loader = mx.gluon.data.DataLoader(
        train_dataset.transform(
            train_transform(net.short, net.max_size, net, ashape=net.ashape,
                            multi_stage=True)),
        batch_sampler=train_sampler,
        batchify_fn=train_batchify,
        num_workers=args.num_workers,
    )
    val_batchify = batchify.Tuple(*[batchify.Append() for _ in range(2)])
    short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
    # validation use 1 sample per device
    val_loader = mx.gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)),
        num_shards_per_process,
        False,
        batchify_fn=val_batchify,
        last_batch="keep",
        num_workers=args.num_workers,
    )
    return train_loader, val_loader
def __init__(self, optimizer):
    """Construct a new ScheduledOptimizer, which uses horovod optimizer under the hood for averaging gradients
    across all the Horovod ranks.

    Args:
        optimizer: Optimizer to use for computing and averaging gradients and applying updates.
    """
    self._optimizer = optimizer
    # Let rank 0 decide the communication order; every other rank
    # applies its updates immediately.
    self._rank = hvd.rank()
    self._immediate = self._rank != 0
    self._first_key = None
    self._step = 0
    core.start(rank=self._rank, arch="allreduce")
def get_dataloader(net, train_dataset, val_dataset, train_transform, val_transform,
                   batch_size, num_shards, args):
    """Get dataloader."""
    train_batchify = FasterRCNNTrainBatchify(net, num_shards)
    if hasattr(train_dataset, 'get_im_aspect_ratio'):
        aspect_ratios = train_dataset.get_im_aspect_ratio()
    else:
        aspect_ratios = [1.] * len(train_dataset)
    # Sharding: horovod and perseus each define their own partitioning;
    # otherwise a single part covers the whole dataset.
    if args.horovod:
        num_parts, part_index = hvd.size(), hvd.rank()
    elif "perseus" in args.kv_store:
        num_parts, part_index = kv.num_workers, kv.rank
    else:
        num_parts, part_index = 1, 0
    train_sampler = gcv.nn.sampler.SplitSortedBucketSampler(
        aspect_ratios, batch_size,
        num_parts=num_parts,
        part_index=part_index,
        shuffle=True)
    train_loader = mx.gluon.data.DataLoader(
        train_dataset.transform(
            train_transform(net.short, net.max_size, net, ashape=net.ashape,
                            multi_stage=args.use_fpn)),
        batch_sampler=train_sampler, batchify_fn=train_batchify,
        num_workers=args.num_workers)

    val_batchify = Tuple(*[Append() for _ in range(3)])
    short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
    # validation use 1 sample per device
    val_loader = mx.gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)),
        num_shards, False, batchify_fn=val_batchify, last_batch='keep',
        num_workers=args.num_workers)
    return train_loader, val_loader
def test_horovod_alltoall_splits_type_error(self):
    """Test that the alltoall returns an error if the splits tensor does not
       contain 32-bit integers."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if NCCL version < 2.7.0
    if hvd.nccl_built() and hvd.nccl_built() < 2700:
        self.skipTest("NCCL-based Alltoall requires NCCL version >= 2.7.0.")

    ctx = self._current_context()
    payload = mx.ndarray.empty([size], ctx=ctx)
    # Splits must be int32; a float32 splits tensor should be rejected.
    bad_splits = mx.ndarray.ones([size], dtype='float32', ctx=ctx)
    try:
        hvd.alltoall(payload, bad_splits)
        assert False, 'hvd.alltoall did not throw error'
    except (MXNetError, ValueError):
        pass
def test_horovod_alltoall_splits_error(self):
    """Test that the alltoall returns an error if the sum of the splits entries
       exceeds the first dimension of the input tensor."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if NCCL version < 2.7.0
    if hvd.nccl_built() and hvd.nccl_built() < 2700:
        self.skipTest(
            "NCCL-based Alltoall requires NCCL version >= 2.7.0.")

    ctx = self._current_context()
    # The splits sum to `size`, but the tensor only has `size - 1` rows.
    payload = mx.ndarray.empty([size - 1], ctx=ctx)
    oversized_splits = mx.ndarray.ones([size], dtype='int32', ctx=ctx)
    try:
        hvd.alltoall(payload, oversized_splits)
        assert False, 'hvd.alltoall did not throw error'
    except (MXNetError, RuntimeError):
        pass
def test_horovod_broadcast_rank_error(self):
    """Test that the broadcast returns an error if different ranks
    specify different root rank."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # A disagreement needs at least two workers.
    if size == 1:
        self.skipTest("Only one worker available")

    ctx = self._current_context()
    # Each worker names itself as root — an inconsistent root rank.
    payload = mx.nd.ones(shape=(17, 17, 17), ctx=ctx)
    try:
        result = hvd.broadcast(payload, root_rank=rank)
        result.wait_to_read()
        assert False, 'hvd.broadcast did not throw rank error'
    except (MXNetError, RuntimeError):
        pass
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # A shape mismatch needs at least two workers.
    if size == 1:
        self.skipTest("Only one worker available")

    ctx = self._current_context()
    # Second dimension varies with rank, so the shapes disagree.
    payload = mx.nd.ones(shape=(17, rank + 1), ctx=ctx)
    try:
        result = hvd.broadcast(payload, 0)
        result.wait_to_read()
        assert False, 'hvd.broadcast did not throw error'
    except (MXNetError, RuntimeError):
        pass
def test_horovod_alltoall(self):
    """Test that the alltoall correctly distributes 1D, 2D, and 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if using gloo controller
    if hvd.gloo_enabled():
        self.skipTest(
            "Alltoall currently does not support Gloo controller.")

    # This test does not apply if NCCL version < 2.7.0
    if hvd.nccl_built() and hvd.nccl_built() < 2700:
        self.skipTest(
            "NCCL-based Alltoall requires NCCL version >= 2.7.0.")

    dtypes = ['int32', 'int64', 'float32', 'float64']
    dims = [1, 2, 3]
    ctx = self._current_context()
    for dtype, dim in itertools.product(dtypes, dims):
        # Rank r sends (r + 1) copies of the value i to peer i, so peer i
        # receives only its own rank value from every sender.
        vals = []
        for i in range(size):
            vals += [i] * (rank + 1)
        tensor = mx.ndarray.array(vals, dtype=dtype, ctx=ctx)
        # Grow to `dim` dimensions by repeatedly duplicating along axis 1.
        for _ in range(dim - 1):
            tensor = mx.ndarray.expand_dims(tensor, axis=1)
            tensor = mx.ndarray.concat(tensor, tensor, dim=1)
        splits = mx.ndarray.array([rank + 1] * size, dtype='int32', ctx=ctx)
        collected = hvd.alltoall(tensor, splits)
        # Every received element equals this worker's rank.
        assert collected.min(
        ) == rank, 'hvd.alltoall produces incorrect collected tensor'
        assert collected.max(
        ) == rank, 'hvd.alltoall produces incorrect collected tensor'
        # Total count: sum over senders s of (s + 1), doubled (dim - 1) times.
        assert collected.size == size * (size + 1) // 2 * 2**(
            dim - 1), 'hvd.alltoall collected wrong number of values'
def test_horovod_allgather_error(self):
    """Test that the allgather returns an error if any dimension besides
    the first is different among the tensors being gathered."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # A shape mismatch needs at least two workers.
    if size == 1:
        self.skipTest("Only one worker available")

    ctx = self._current_context()
    # Make the second dimension rank-dependent so shapes disagree.
    shape = [17] * 3
    shape[1] = 10 * (rank + 1)
    payload = mx.ndarray.ones(shape=shape, ctx=ctx)
    try:
        hvd.allgather(payload)
        assert False, 'hvd.allgather did not throw error'
    except (MXNetError, RuntimeError):
        pass
def init_comm(backend):
    """Init communication backend.

    Returns (store, num_workers, rank, local_rank, is_master_node, ctxs)
    for the 'horovod', 'byteps', or kvstore backend named by `backend`.
    """
    # backend specific implementation
    import sys  # local import: used only on the failure paths below
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd
        except ImportError:
            logging.info('horovod must be installed.')
            # BUG FIX: the bare exit() exited with status 0 (and relies on
            # the `site` builtin); report failure with a non-zero status.
            sys.exit(1)
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    elif backend == 'byteps':
        try:
            import byteps.mxnet as bps
        except ImportError:
            logging.info('BytePS must be installed.')
            sys.exit(1)
        bps.init()
        store = None
        num_workers = bps.size()
        rank = bps.rank()
        local_rank = bps.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    else:  # kvstore
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        ctxs = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
            [mx.gpu(int(x)) for x in args.gpus.split(',')]
    return store, num_workers, rank, local_rank, is_master_node, ctxs
def test_horovod_alltoall_equal_split_length_error(self):
    """Test that the alltoall with default splitting returns an error if the
    first dimension of tensor is not a multiple of the number of workers."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # With one worker every length is trivially divisible.
    if size == 1:
        self.skipTest("Only one worker available")

    # This test does not apply if NCCL version < 2.7.0
    if hvd.nccl_built() and hvd.nccl_built() < 2700:
        self.skipTest("NCCL-based Alltoall requires NCCL version >= 2.7.0.")

    ctx = self._current_context()
    # size + 1 is never a multiple of size (for size > 1).
    payload = mx.ndarray.empty([size + 1], ctx=ctx)
    try:
        hvd.alltoall(payload)
        assert False, 'hvd.alltoall did not throw error'
    except (MXNetError, RuntimeError):
        pass
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = ['int32', 'int64', 'float32', 'float64']
    dims = [1, 2, 3]
    ctx = self._current_context()
    for dtype, dim in itertools.product(dtypes, dims):
        # Every worker contributes a [17]*dim tensor filled with its rank.
        local = mx.ndarray.ones(shape=[17] * dim, dtype=dtype, ctx=ctx) * rank
        gathered = hvd.allgather(local)

        # Contributions are stacked along the first axis in rank order.
        assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)

        for worker in range(size):
            chunk = gathered[worker * 17:(worker + 1) * 17]
            assert list(chunk.shape) == [17] * dim, \
                'hvd.allgather produces incorrect gathered shape'
            assert chunk.min() == worker, 'hvd.allgather produces incorrect gathered tensor'
            assert chunk.max() == worker, 'hvd.allgather produces incorrect gathered tensor'
def get_dali_dataloader(net, train_dataset, val_dataset, data_shape, global_batch_size,
                        num_workers, devices, ctx, horovod, seed):
    """Build DALI training pipelines (one per device, or one per horovod
    worker) and a plain Gluon DataLoader for validation.

    Returns (train_loader, val_loader); val_loader is None on non-root
    horovod workers, which skip validation.
    """
    width, height = data_shape, data_shape
    # Run a dummy forward pass just to obtain the anchors the net generates.
    with autograd.train_mode():
        _, _, anchors = net(mx.nd.zeros((1, 3, height, width), ctx=ctx))
        anchors = anchors.as_in_context(mx.cpu())

    if horovod:
        # One pipeline per process; the global batch is split across workers.
        batch_size = global_batch_size // hvd.size()
        pipelines = [SSDDALIPipeline(device_id=hvd.local_rank(),
                                     batch_size=batch_size,
                                     data_shape=data_shape,
                                     anchors=anchors,
                                     num_workers=num_workers,
                                     dataset_reader = train_dataset[0],
                                     seed=seed)]
    else:
        # One pipeline per local device; each gets its own dataset reader.
        num_devices = len(devices)
        batch_size = global_batch_size // num_devices
        pipelines = [SSDDALIPipeline(device_id=device_id,
                                     batch_size=batch_size,
                                     data_shape=data_shape,
                                     anchors=anchors,
                                     num_workers=num_workers,
                                     dataset_reader = train_dataset[i],
                                     seed=seed)
                     for i, device_id in enumerate(devices)]

    epoch_size = train_dataset[0].size()
    if horovod:
        # Each worker only sees its shard of an epoch.
        epoch_size //= hvd.size()
    train_loader = DALIGenericIterator(pipelines,
                                       [('data', DALIGenericIterator.DATA_TAG),
                                        ('bboxes', DALIGenericIterator.LABEL_TAG),
                                        ('label', DALIGenericIterator.LABEL_TAG)],
                                       epoch_size, auto_reset=True)

    # validation
    if (not horovod or hvd.rank() == 0):
        val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        val_loader = gluon.data.DataLoader(
            val_dataset.transform(SSDDefaultValTransform(width, height)),
            global_batch_size, False, batchify_fn=val_batchify_fn,
            last_batch='keep', num_workers=num_workers)
    else:
        val_loader = None
    return train_loader, val_loader
def test_horovod_allgather_type_error(self):
    """Test that the allgather returns an error if the types being gathered
    differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # A dtype mismatch needs at least two workers.
    if size == 1:
        self.skipTest("Only one worker available")

    ctx = self._current_context()
    shape = [17] * 3
    # Even ranks send int32 tensors, odd ranks float32 — a type mismatch.
    dtype = "int32" if rank % 2 == 0 else "float32"
    payload = mx.ndarray.ones(shape=shape, dtype=dtype, ctx=ctx)
    try:
        hvd.allgather(payload)
        assert False, 'hvd.allgather did not throw error'
    except (MXNetError, RuntimeError):
        pass
def __init__(self, symbol, fc7_model, memory_bank, memory_optimizer,
             logger=logging, ):
    """Set up per-rank state for training a backbone with a memory bank.

    Parameters
    ----------
    symbol : backbone network symbol, run as an mx.module.Module below.
    fc7_model : model producing the final fc7 layer.
    memory_bank : externally managed storage updated by memory_optimizer.
    memory_optimizer : optimizer applied to the memory bank.
    logger : logger instance (defaults to the logging module).
    """
    # Horovod topology: world size, global rank, and this process's GPU.
    self.size = hvd.size()
    self.rank = hvd.rank()
    self.local_rank = hvd.local_rank()
    self.gpu = mx.gpu(self.local_rank)
    self.cpu = mx.cpu()  # `device_id` is not needed for CPU.
    self.nd_cache = {}  # reusable scratch NDArrays
    self.embedding_size = config.embedding_size
    self.batch_size = config.batch_size
    self.num_update = 0  # global update counter
    # Record type handed to batch-end callbacks.
    self.batch_end_param = namedtuple('batch_end_param',
                                      ['loss', 'num_epoch', 'num_update'])
    self.fc7_model = fc7_model
    self.symbol = symbol
    self.logger = logger
    # Backbone runs as a plain Module bound to this process's GPU.
    self.backbone_module = mx.module.Module(self.symbol, ['data'],
                                            ['softmax_label'],
                                            logger=self.logger,
                                            context=self.gpu)
    self.memory_bank = memory_bank
    self.memory_optimizer = memory_optimizer
    self.memory_lr = None
    # Lazily populated caches for losses and gradients.
    self.loss_cache = None
    self.grad_cache = None
def test_allreduce(use_horovod, dtype):
    """Smoke-test allreduce through either a kvstore or a horovod optimizer."""
    if use_horovod is False:
        # BUG FIX: the original read `kvstore_type` in the else-branch of its
        # own conditional expression, raising UnboundLocalError whenever
        # DMLC_ROLE != "worker". Fall back to the single-machine "device"
        # kvstore in that case (matches the horovod branch below).
        kvstore_type = ("dist_sync_device"
                        if os.environ.get("DMLC_ROLE") == "worker"
                        else "device")
        kv = mx.kvstore.create(kvstore_type)
        rank = kv.rank
        num_workers = kv.num_workers
    else:
        kvstore_type = "device"
        kv = mx.kvstore.create(kvstore_type)
        hvd.init()
        rank = hvd.rank()
        num_workers = hvd.size()
    print('use horovod: {}, rank {}/{}, kv type: {}, usetree: {}'.format(
        use_horovod, rank, num_workers, kvstore_type,
        os.environ.get("MXNET_KVSTORE_USETREE")))
    # Gradients are rescaled by 1/(8 * world size); horovod already divides
    # by the world size internally, so undo that factor there.
    rescale_grad = 1.0 / (8 * num_workers)
    if use_horovod:
        rescale_grad = rescale_grad * num_workers
    optimizer_params = dict(
        momentum=0,  # pOpt.optimizer.momentum,
        wd=0,  # pOpt.optimizer.wd,
        learning_rate=0.1,
        rescale_grad=rescale_grad,
    )
    optimizer = mx.optimizer.create("sgd", **optimizer_params)
    if use_horovod:
        # Horovod: wrap optimizer with DistributedOptimizer
        optimizer = hvd.DistributedOptimizer(optimizer)
    print("opt rescale:{}".format(optimizer.rescale_grad))
    kv.set_optimizer(optimizer)
    test_hvd_kv(rank, num_workers, kv, dtype)
def train(net, train_data, val_data, eval_metric, batch_size, ctx, args):
    """Training pipeline"""
    kv = mx.kvstore.create(args.kv_store)
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    optimizer_params = {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    }
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_train_params(
            ),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params)
    else:
        trainer = gluon.Trainer(
            net.collect_train_params(
            ),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params,
            update_on_kvstore=(False if args.amp else None),
            kvstore=kv)

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
    ]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [
        rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric
    ]

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    executor = None  # created per-epoch below; stays None under horovod
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        rcnn_task = ForwardBackwardTask(net,
                                        trainer,
                                        rpn_cls_loss,
                                        rpn_box_loss,
                                        rcnn_cls_loss,
                                        rcnn_box_loss,
                                        mix_ratio=1.0)
        executor = Parallel(args.executor_threads,
                            rcnn_task) if not args.horovod else None
        if args.mixup:
            # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise
            train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= args.epochs - args.no_mixup_epochs:
                train_data._dataset._data.set_mixup(None)
                mix_ratio = 1.0
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        base_lr = trainer.learning_rate
        rcnn_task.mix_ratio = mix_ratio
        print(len(train_data))
        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup,
                                                  args.lr_warmup_factor)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.
                            format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            if executor is not None:
                for data in zip(*batch):
                    executor.put(data)
            for j in range(len(ctx)):
                if executor is not None:
                    result = executor.get()
                else:
                    result = rcnn_task.forward_backward(list(zip(*batch))[0])
                if (not args.horovod) or hvd.rank() == 0:
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])
                    for metric, record in zip(metrics, metric_losses):
                        metric.update(0, record)
                    for metric, records in zip(metrics2, add_losses):
                        for pred in records:
                            metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            # update metrics
            if (not args.horovod or hvd.rank() == 0) and args.log_interval \
                    and not (i + 1) % args.log_interval:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(
                        epoch, i,
                        args.log_interval * args.batch_size /
                        (time.time() - btic), msg))
                btic = time.time()

        if (not args.horovod) or hvd.rank() == 0:
            msg = ','.join(
                ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
                epoch, (time.time() - tic), msg))
            if not (epoch + 1) % args.val_interval:
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric,
                                             args)
                map_name_train, mean_ap_train = validate(
                    net, train_data, ctx, eval_metric, args)
                if isinstance(map_name, list):
                    val_msg = '\n'.join([
                        '{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)
                    ])
                    train_msg = '\n'.join([
                        '{}={}'.format(k, v)
                        for k, v in zip(map_name_train, mean_ap_train)
                    ])
                    current_map = float(mean_ap[-1])
                else:
                    val_msg = '{}={}'.format(map_name, mean_ap)
                    train_msg = '{}={}'.format(map_name_train, mean_ap_train)
                    current_map = mean_ap
                logger.info('[Epoch {}] Validation: {}'.format(epoch, val_msg))
                logger.info('[Epoch {}] Train: {}'.format(epoch, train_msg))
            else:
                current_map = 0.
            save_params(net, logger, best_map, current_map, epoch,
                        args.save_interval,
                        os.path.join(args.model_dir, 'fastrcnn'))
    # BUG FIX: under horovod `executor` is None and calling __del__ on it
    # raised AttributeError; tear down only when an executor exists.
    # NOTE(review): a new Parallel executor is created every epoch but only
    # the last one is torn down here — confirm Parallel cleans up on GC.
    if executor is not None:
        executor.__del__()
def train(ctx):
    """Run the full Horovod-distributed training loop over ``opt.num_epochs`` epochs.

    Relies on module-level state: ``net``, ``opt``, ``optimizer``/``optimizer_params``,
    ``train_data``/``val_data``/``batch_fn``, ``batch_size``, ``distillation``/``teacher``,
    ``logger``, ``save_dir``/``save_frequency``/``model_name`` and the custom trainer
    classes (SGDTrainer, ERSGD..., QSparseLocalSGD..., etc.).

    Parameters
    ----------
    ctx : mx.Context or list of mx.Context
        Device(s) to train on; a single context is wrapped into a list.

    Side effects: mutates ``net`` parameters, logs/prints progress on rank 0,
    and periodically saves parameter/trainer-state checkpoints on local rank 0.
    """
    # Normalize to a list of contexts so per-device list comprehensions work below.
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    # Fresh start only: when resuming from saved params, skip re-initialization.
    if opt.resume_params == '':
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)
    # Optionally exclude batch-norm scale/shift and biases from weight decay.
    if opt.no_wd:
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    # Make all workers start from rank 0's parameter values.
    hvd.broadcast_parameters(net.collect_params(), root_rank=0)
    # trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    # trainer = hvd.DistributedTrainer(
    #     net.collect_params(),
    #     optimizer,
    #     optimizer_params)
    # Select the (compressed-)communication trainer variant by name.
    # The sparse-ratio options are given as divisors (e.g. input_sparse_1 = 4
    # means a ratio of 1/4). NOTE(review): semantics of each variant live in
    # the trainer classes defined elsewhere — not verifiable from this file.
    if opt.trainer == 'sgd':
        trainer = SGDTrainer(
            net.collect_params(), optimizer, optimizer_params)
    elif opt.trainer == 'efsgd':
        trainer = EFSGDTrainerV1(
            net.collect_params(), 'EFSGDV1', optimizer_params,
            input_sparse_ratio=1./opt.input_sparse_1,
            output_sparse_ratio=1./opt.output_sparse_1,
            layer_sparse_ratio=1./opt.layer_sparse_1)
    elif opt.trainer == 'qsparselocalsgd':
        trainer = QSparseLocalSGDTrainerV1(
            net.collect_params(), optimizer, optimizer_params,
            input_sparse_ratio=1./opt.input_sparse_1,
            output_sparse_ratio=1./opt.output_sparse_1,
            layer_sparse_ratio=1./opt.layer_sparse_1,
            local_sgd_interval=opt.local_sgd_interval)
    elif opt.trainer == 'ersgd':
        trainer = ERSGDTrainerV2(
            net.collect_params(), optimizer, optimizer_params,
            input_sparse_ratio=1./opt.input_sparse_1,
            output_sparse_ratio=1./opt.output_sparse_1,
            layer_sparse_ratio=1./opt.layer_sparse_1)
    elif opt.trainer == 'partiallocalsgd':
        trainer = PartialLocalSGDTrainerV1(
            net.collect_params(), optimizer, optimizer_params,
            input_sparse_ratio=1./opt.input_sparse_1,
            output_sparse_ratio=1./opt.output_sparse_1,
            layer_sparse_ratio=1./opt.layer_sparse_1,
            local_sgd_interval=opt.local_sgd_interval)
    elif opt.trainer == 'ersgd2':
        trainer = ERSGD2TrainerV2(
            net.collect_params(), optimizer, optimizer_params,
            input_sparse_ratio_1=1./opt.input_sparse_1,
            output_sparse_ratio_1=1./opt.output_sparse_1,
            layer_sparse_ratio_1=1./opt.layer_sparse_1,
            input_sparse_ratio_2=1./opt.input_sparse_2,
            output_sparse_ratio_2=1./opt.output_sparse_2,
            layer_sparse_ratio_2=1./opt.layer_sparse_2,
            local_sgd_interval=opt.local_sgd_interval)
    else:
        # Fallback: plain SGD trainer for unrecognized names.
        trainer = SGDTrainer(
            net.collect_params(), optimizer, optimizer_params)
    if opt.resume_states != '':
        trainer.load_states(opt.resume_states)

    # Label smoothing and mixup both produce dense (soft) label vectors,
    # so the loss must not expect sparse integer labels in those cases.
    if opt.label_smoothing or opt.mixup:
        sparse_label_loss = False
    else:
        sparse_label_loss = True
    if distillation:
        L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature,
                                                         hard_weight=opt.hard_weight,
                                                         sparse_label=sparse_label_loss)
    else:
        L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

    # Error rates are in [0, 1], so 1 is a safe "worst possible" initial best.
    best_val_score = 1
    for epoch in range(opt.resume_epoch, opt.num_epochs):
        tic = time.time()
        if opt.use_rec:
            train_data.reset()
        # train_metric.reset()
        train_loss = 0
        btic = time.time()

        # Speed-test mode selection:
        #   test_speed > 0  -> repeat every batch `test_speed` times (timing runs)
        #   test_speed == 0 -> normal training (each batch once)
        #   test_speed < 0  -> data-pipeline-only mode (skip compute entirely)
        if opt.test_speed > 0:
            n_repeats = opt.test_speed
        elif opt.test_speed == 0:
            n_repeats = 1
        else:
            n_repeats = 0

        for i, batch in enumerate(train_data):
            # test speed
            # In pipeline-only mode just report progress every log_interval
            # batches; note non-interval batches still fall through to
            # batch_fn below (the inner loop body is empty since range(0)).
            if n_repeats == 0 and not (i+1)%opt.log_interval:
                print('[Epoch %d] # batch: %d'%(epoch, i))
                continue
            data, label = batch_fn(batch, ctx)
            # Repeat the same loaded batch n_repeats times (speed testing).
            for j in range(n_repeats):
                if opt.mixup:
                    # Convex combination of the batch with its reverse;
                    # disabled (lam = 1) for the last mixup_off_epoch epochs.
                    lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                    if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                        lam = 1
                    data = [lam*X + (1-lam)*X[::-1] for X in data]
                    if opt.label_smoothing:
                        eta = 0.1
                    else:
                        eta = 0.0
                    label = mixup_transform(label, classes, lam, eta)
                elif opt.label_smoothing:
                    hard_label = label
                    label = smooth(label, classes)
                if distillation:
                    # Teacher soft targets, sharpened/softened by temperature.
                    teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \
                                    for X in data]
                with ag.record():
                    outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                    if distillation:
                        loss = [L(yhat.astype('float32', copy=False),
                                  y.astype('float32', copy=False),
                                  p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)]
                    else:
                        loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)
                # if opt.mixup:
                #     output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                #                     for out in outputs]
                #     train_metric.update(label, output_softmax)
                # else:
                #     if opt.label_smoothing:
                #         train_metric.update(hard_label, outputs)
                #     else:
                #         train_metric.update(label, outputs)
                # .asscalar() forces synchronization with the device here.
                step_loss = sum([l.sum().asscalar() for l in loss])
                train_loss += step_loss
                # NOTE(review): (i+j+1) mixes the batch index with the repeat
                # index; with n_repeats > 1 the logging cadence is irregular —
                # confirm this is intended for speed-test runs.
                if opt.log_interval and not (i+j+1)%opt.log_interval:
                    # train_metric_name, train_metric_score = train_metric.get()
                    if hvd.rank() == 0:
                        # logger.info('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                        #     epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                        #     train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                        # print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                        #     epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                        #     train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                        print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            'loss', step_loss/batch_size, trainer.learning_rate, trainer._comm_counter/1e6))
                    btic = time.time()

        # Drain all pending async ops before taking the epoch-end timestamp.
        mx.nd.waitall()
        toc = time.time()
        if n_repeats == 0:
            # Pipeline-only mode: sync workers via a dummy allreduce and move on.
            allreduce_array_nd = mx.nd.array([i])
            hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
            mx.nd.waitall()
            print('[Epoch %d] # total batch: %d'%(epoch, i))
            continue
        # NOTE(review): every train_metric.update(...) call above is commented
        # out, so this .get() returns a never-updated (stale) metric — and the
        # values are unused below. Candidate for removal; verify train_metric
        # is defined at module level before touching this.
        train_metric_name, train_metric_score = train_metric.get()
        throughput = int(batch_size * i /(toc - tic) * hvd.size())
        train_loss /= (batch_size * i)
        # These trainer variants hold locally-diverged parameters and need an
        # explicit sync (pre_test/post_test) around evaluation.
        if opt.trainer == 'ersgd' or opt.trainer == 'qsparselocalsgd' or opt.trainer == 'ersgd2' or opt.trainer == 'partiallocalsgd':
            allreduce_for_val = True
        else:
            allreduce_for_val = False
        if allreduce_for_val:
            trainer.pre_test()
        # err_train_tic = time.time()
        # err_top1_train, err_top5_train = test(ctx, train_data, val=False)
        err_val_tic = time.time()
        err_top1_val, err_top5_val = test(ctx, val_data, val=True)
        err_val_toc = time.time()
        if allreduce_for_val:
            trainer.post_test()
        mx.nd.waitall()
        # allreduce the results
        # Average loss and validation error rates across all workers.
        allreduce_array_nd = mx.nd.array([train_loss, err_top1_val, err_top5_val])
        hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
        allreduce_array_np = allreduce_array_nd.asnumpy()
        # NOTE(review): np.asscalar is deprecated (removed in NumPy 1.23);
        # prefer .item() when this file is next modernized.
        train_loss = np.asscalar(allreduce_array_np[0])
        err_top1_val = np.asscalar(allreduce_array_np[1])
        err_top5_val = np.asscalar(allreduce_array_np[2])
        if hvd.rank() == 0:
            # logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] training: loss=%f'%(epoch, train_loss))
            logger.info('[Epoch %d] speed: %d samples/sec training-time: %f comm: %f'%(epoch, throughput, toc-tic, trainer._comm_counter/1e6))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f err-time=%f'%(epoch, err_top1_val, err_top5_val, err_val_toc - err_val_tic))
            # Reset the per-epoch communication-volume counter.
            trainer._comm_counter = 0
        if err_top1_val < best_val_score:
            best_val_score = err_top1_val
            # if hvd.local_rank() == 0:
            #     net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))
            #     trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states'%(save_dir, best_val_score, model_name, epoch))
        # Periodic checkpointing; only one process per node writes to disk.
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            if hvd.local_rank() == 0:
                net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, epoch))
def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers):
    """Build the RecordIO data pipelines for distributed ImageNet training.

    Parameters
    ----------
    rec_train, rec_train_idx : str
        Paths to the training ``.rec`` file and its ``.idx`` index.
    rec_val, rec_val_idx : str
        Paths to the validation ``.rec`` file and its ``.idx`` index.
    batch_size : int
        Per-worker batch size.
    num_workers : int
        Number of preprocessing threads for each iterator.

    Returns
    -------
    (train_data, val_data, batch_fn)
        Two ``mx.io.ImageRecordIter`` instances (the training one sharded
        across Horovod workers with augmentation; the validation one only
        resized + center behavior of the iterator) and a helper that splits
        a batch across a list of contexts.
    """
    rec_train, rec_train_idx, rec_val, rec_val_idx = (
        os.path.expanduser(p)
        for p in (rec_train, rec_train_idx, rec_val, rec_val_idx))

    color_jitter = 0.4
    pca_lighting = 0.1
    input_size = opt.input_size
    crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
    resize = int(math.ceil(input_size / crop_ratio))
    mean_rgb = [123.68, 116.779, 103.939]
    std_rgb = [58.393, 57.12, 57.375]

    # Per-channel normalization, shared by both iterators.
    normalize_kwargs = dict(
        mean_r=mean_rgb[0],
        mean_g=mean_rgb[1],
        mean_b=mean_rgb[2],
        std_r=std_rgb[0],
        std_g=std_rgb[1],
        std_b=std_rgb[2],
    )

    def batch_fn(batch, ctx):
        # Scatter one iterator batch across the devices in `ctx`.
        def scatter(arr):
            return gluon.utils.split_and_load(arr, ctx_list=ctx, batch_axis=0)
        return scatter(batch.data[0]), scatter(batch.label[0])

    # Training pipeline: random-resized crop, mirror, color jitter and PCA
    # lighting noise, sharded across Horovod workers (num_parts/part_index).
    train_data = mx.io.ImageRecordIter(
        path_imgrec=rec_train,
        path_imgidx=rec_train_idx,
        preprocess_threads=num_workers,
        shuffle=True,
        batch_size=batch_size,
        round_batch=False,
        data_shape=(3, input_size, input_size),
        rand_mirror=True,
        random_resized_crop=True,
        max_aspect_ratio=4. / 3.,
        min_aspect_ratio=3. / 4.,
        max_random_area=1,
        min_random_area=0.08,
        brightness=color_jitter,
        saturation=color_jitter,
        contrast=color_jitter,
        pca_noise=pca_lighting,
        num_parts=hvd.size(),
        part_index=hvd.rank(),
        **normalize_kwargs,
    )
    # Validation pipeline: deterministic resize, no augmentation, not sharded
    # (every worker sees the full validation set).
    val_data = mx.io.ImageRecordIter(
        path_imgrec=rec_val,
        path_imgidx=rec_val_idx,
        preprocess_threads=num_workers,
        shuffle=False,
        batch_size=batch_size,
        resize=resize,
        data_shape=(3, input_size, input_size),
        **normalize_kwargs,
    )
    return train_data, val_data, batch_fn
batch_size=batch_size, flat=False, num_parts=hvd.size(), part_index=hvd.rank() ) return train_iter, val_iter # Step 1: initialize Horovod hvd.init() # Horovod: pin context to process context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank()) # Step 2: load data train_iter, val_iter = get_mnist_iterator(hvd.rank()) # Step 3: define network def conv_net(): # placeholder for data data = mx.sym.var('data') # first conv layer conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=10) relu1 = mx.sym.Activation(data=conv1, act_type='relu') pool1 = mx.sym.Pooling(data=relu1, pool_type='max', kernel=(2, 2), stride=(2, 2)) # second conv layer conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=20) relu2 = mx.sym.Activation(data=conv2, act_type='relu') pool2 = mx.sym.Pooling(data=relu2, pool_type='max', kernel=(2, 2),