Example #1
def get_mnist_iterator(rank):
    data_dir = "data-%d" % rank
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                             dirname=data_dir)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(data_dir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    return train_iter, val_iter
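The num_parts/part_index arguments above are what shard the dataset across Horovod workers, so each rank reads a disjoint slice. A minimal sketch of one way such sharding can work, using a plain Python list (illustrative only, not part of the snippet above):

def shard(samples, num_parts, part_index):
    # Strided split: worker k keeps samples k, k + num_parts, k + 2 * num_parts, ...
    return samples[part_index::num_parts]

# With 4 workers, the worker at part_index=1 sees items 1, 5, 9 of a 12-item dataset.
print(shard(list(range(12)), num_parts=4, part_index=1))  # [1, 5, 9]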
Example #2
    def test_horovod_broadcast_grad(self):
        """Test the correctness of the broadcast gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        dtypes = ['int32', 'int64', 'float32', 'float64']
        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 0
        shapes = [(), (17), (17, 17), (17, 17, 17)]
        root_rank = 1
        tensor_dict = {}
        root_dict = {}
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_dict[count] = mx.nd.ones(shapes[dim], ctx=ctx) * rank
            root_dict[count] = mx.nd.ones(shapes[dim], ctx=ctx) * root_rank
            tensor_dict[count] = tensor_dict[count].astype(dtype)
            root_dict[count] = root_dict[count].astype(dtype)

            # hvd.broadcast_parameters below overwrites tensor_dict in place
            count += 1

        hvd.broadcast_parameters(tensor_dict, root_rank=root_rank)
        for i in range(count):
            if not same(tensor_dict[i].asnumpy(), root_dict[i].asnumpy()):
                print("broadcast", count, dtype, dim)
                print("broadcast_tensor", hvd.rank(), tensor_dict[i])
                print("root_tensor", hvd.rank(), root_dict[i])
                print("comparison", hvd.rank(), tensor_dict[i] == root_dict[i])
            assert same(tensor_dict[i].asnumpy(), root_dict[i].asnumpy()), \
                'hvd.broadcast produces incorrect broadcasted tensor'
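Outside of tests, hvd.broadcast_parameters is typically called once after initialization so every rank starts from rank 0's weights (the training examples further down do exactly this); a minimal sketch, assuming hvd.init() has run and net is an initialized Gluon block:

params = net.collect_params()
hvd.broadcast_parameters(params, root_rank=0)  # all ranks now hold rank 0's parameter values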
Example #3
    def test_horovod_allreduce_cpu_gpu_error(self):
        """Test that the allreduce raises an error if different ranks try to
           perform reduction on CPU and GPU."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        shape = (17, 17, 17)
        if rank % 2 == 0:
            ctx = mx.gpu(hvd.rank())
        else:
            ctx = mx.cpu(hvd.rank())
        tensor = mx.nd.ones(shape=shape, ctx=ctx)

        try:
            output = hvd.allreduce(tensor)
            output.wait_to_read()
            assert False, 'hvd.allreduce did not throw cpu-gpu error'
        except (MXNetError, RuntimeError):
            pass
Example #4
    def get_async_results(self, waitall=False):
        val_map = -1
        val_epoch = -1
        if hvd.rank() == 0:
            if waitall:
                results = self.async_executor.result()
            else:
                results = self.async_executor.pop_done()
            if results and len(results) > 0:
                # get highest mAP (in case multiple results are returned)
                val_epoch = max(results, key=results.get)
                val_map = results[val_epoch]

        val_map = comm.bcast(val_map, root=0)
        return val_epoch, val_map
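Example #4 follows a rank-0-computes, everyone-receives pattern: only rank 0 polls the asynchronous validation results, then the value is broadcast so all ranks agree. A standalone sketch of that pattern, assuming mpi4py provides the comm object (the method above receives comm from elsewhere):

from mpi4py import MPI

comm = MPI.COMM_WORLD
val_map = -1
if comm.Get_rank() == 0:
    val_map = 0.42  # hypothetical best mAP computed only on rank 0
val_map = comm.bcast(val_map, root=0)  # every rank now holds rank 0's value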
Example #5
    def test_horovod_allreduce_inplace(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        dtypes = self.filter_supported_types(['int32',   'int64',
                                              'float32', 'float64'])
        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 0
        shapes = [(), (17), (17, 17), (17, 17, 17)]
        for dtype, dim in itertools.product(dtypes, dims):
            mx.random.seed(1234, ctx=ctx)
            tensor = mx.nd.random.uniform(-100, 100, shape=shapes[dim],
                                          ctx=ctx)
            tensor = tensor.astype(dtype)
            multiplied = tensor * size
            hvd.allreduce_(tensor, average=False, name=str(count))
            max_difference = mx.nd.max(mx.nd.subtract(tensor, multiplied))
            count += 1

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in ['int32', 'int64']:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            if max_difference > threshold:
                print("self", count, dtype, dim, max_difference, threshold)
                print("tensor", hvd.rank(), tensor)
                print("multiplied", hvd.rank(), multiplied)
            assert max_difference <= threshold, \
                'hvd.allreduce produces incorrect results for self'
Example #6
def set_seed_distributed(local_seed):
    # single-element tensor with the local seed in it
    rank_0_seed = nd.full((1), local_seed, dtype=np.int32)
    if hvd.size() > 1:
        rank_0_seed = hvd.broadcast_(tensor=rank_0_seed,
                                     root_rank=0,
                                     name="broadcast_the_seed")

    nd.ndarray.waitall()
    local_seed = (rank_0_seed[0].asscalar() + hvd.rank()) % 2**31

    log_event(key=mlperf_constants.SEED, value=local_seed)
    random.seed(local_seed)
    np.random.seed(local_seed)
    mx.random.seed(local_seed)
    return local_seed
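A small usage sketch for set_seed_distributed, relying on the same imports as the snippet above; the base seed value is illustrative:

hvd.init()
base_seed = 2021  # would normally come from the command line
local_seed = set_seed_distributed(base_seed)
# Rank 0's base seed is broadcast, so rank r ends up seeded with (base_seed + r) % 2**31.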
Example #7
def get_dataloader(net, train_dataset, val_dataset, train_transform, val_transform, batch_size,
                   args):
    """Get dataloader."""
    train_bfn = batchify.Tuple(*[batchify.Append() for _ in range(6)])
    train_sampler = gcv.nn.sampler.SplitSampler(len(train_dataset), hvd.size(), hvd.rank()) if args.horovod else None
    train_loader = mx.gluon.data.DataLoader(
        train_dataset.transform(train_transform(net.short, net.max_size, net, ashape=net.ashape,
                                                multi_stage=args.use_fpn)),
        batch_size, train_sampler is None, sampler=train_sampler, batchify_fn=train_bfn,
        last_batch='rollover', num_workers=args.num_workers)
    val_bfn = batchify.Tuple(*[batchify.Append() for _ in range(2)])
    short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
    val_loader = mx.gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)),
        batch_size, False, batchify_fn=val_bfn, last_batch='keep', num_workers=args.num_workers)
    return train_loader, val_loader
Example #8
def init_comm(backend, gpus):
    """Init communication backend

    Parameters
    ----------
    backend
    gpus

    Returns
    -------
    store
    num_workers
    rank
    local_rank
    is_master_node
    ctx_l
    """
    # backend specific implementation
    import mxnet as mx
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd  # pylint: disable=import-outside-toplevel
        except ImportError:
            logging.info('horovod must be installed.')
            sys.exit(1)
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctx_l = [mx.gpu(local_rank)]
        logging.info('GPU communication supported by horovod')
    else:
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        if gpus == '-1' or gpus == '':
            ctx_l = [mx.cpu()]
            logging.info('Running on CPU')
        else:
            ctx_l = [mx.gpu(int(x)) for x in gpus.split(',')]
            logging.info('GPU communication supported by KVStore')

    return store, num_workers, rank, local_rank, is_master_node, ctx_l
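A hedged usage sketch for init_comm as defined above; the backend string and GPU list are illustrative:

store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
    backend='device', gpus='0,1')
if is_master_node:
    logging.info('rank %d/%d is the master node, contexts: %s',
                 rank, num_workers, ctx_l)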
Example #9
def save_checkpoint(net, epoch, top1, best_acc, model_prefix, save_frequency,
                    kvstore):
    if model_prefix is None or save_frequency == 0 or ('horovod' in kvstore
                                                       and hvd.rank() != 0):
        return
    if save_frequency > 0 and (epoch + 1) % save_frequency == 0:
        fname = '{}_{:04}.params'.format(model_prefix, epoch)
        net.save_parameters(fname)
        logging.info(
            '[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(
                epoch, fname, top1))
    if top1 > best_acc:
        fname = '{}_best.params'.format(model_prefix)
        net.save_parameters(fname)
        logging.info(
            '[Epoch {}] Saving checkpoint to {} with Accuracy: {:.4f}'.format(
                epoch, fname, top1))
Example #10
    def test_horovod_allreduce_ndarray_lifetime(self):
        """Test that the input NDArray remains valid during async allreduce"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 0
        shapes = [(), (17), (17, 17), (17, 17, 17)]
        for i, dim in enumerate(dims):
            tensor = mx.nd.ones(shape=shapes[dim], ctx=ctx)
            # tensor*(i+1) result will be destroyed immediately after this call
            # See https://github.com/horovod/horovod/issues/1533
            summed = hvd.allreduce(tensor * (i + 1), average=False)
            expected = tensor * (i + 1) * size
            assert same(summed.asnumpy(), expected.asnumpy())
Example #11
    def test_horovod_broadcast_inplace(self):
        """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        dtypes = ['int32', 'int64', 'float32', 'float64']
        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 0
        shapes = [(), (17), (17, 17), (17, 17, 17)]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims,
                                                       root_ranks):
            tensor = mx.nd.ones(shapes[dim], ctx=ctx) * rank
            root_tensor = mx.nd.ones(shapes[dim], ctx=ctx) * root_rank
            tensor = tensor.astype(dtype)
            root_tensor = root_tensor.astype(dtype)

            # Only broadcast the copy (broadcast_tensor); the original tensor must stay unchanged
            broadcast_tensor = tensor.copy()
            hvd.broadcast_(broadcast_tensor,
                           root_rank=root_rank,
                           name=str(count))
            if rank != root_rank:
                if same(tensor.asnumpy(), root_tensor.asnumpy()):
                    print("broadcast", count, dtype, dim,
                          mx.nd.max(tensor == root_tensor))
                    print("tensor", hvd.rank(), tensor)
                    print("root_tensor", hvd.rank(), root_tensor)
                    print("comparison", hvd.rank(), tensor == root_tensor)
                assert not same(tensor.asnumpy(), root_tensor.asnumpy()), \
                    'hvd.broadcast modifies source tensor'
            if not same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()):
                print("broadcast", count, dtype, dim)
                print("broadcast_tensor", hvd.rank(), broadcast_tensor)
                print("root_tensor", hvd.rank(), root_tensor)
                print("comparison", hvd.rank(),
                      broadcast_tensor == root_tensor)
            broadcast_tensor.wait_to_read()
            tensor.wait_to_read()
            assert same(broadcast_tensor.asnumpy(), root_tensor.asnumpy()), \
                'hvd.broadcast produces incorrect broadcasted tensor'
Example #12
def get_dataloader(net, train_dataset, val_dataset, train_transform,
                   val_transform, batch_size, num_shards, args):
    """Get dataloader."""
    train_bfn = FasterRCNNTrainBatchify(net, num_shards)
    if hasattr(train_dataset, 'get_im_aspect_ratio'):
        im_aspect_ratio = train_dataset.get_im_aspect_ratio()
    else:
        im_aspect_ratio = [1.] * len(train_dataset)
    train_sampler = \
        gcv.nn.sampler.SplitSortedBucketSampler(im_aspect_ratio, batch_size,
                                                num_parts=hvd.size() if args.horovod else 1,
                                                part_index=hvd.rank() if args.horovod else 0,
                                                shuffle=True)
    # dataset: train_dataset.transform(train_transform(net.short, net.max_size, net, ashape=net.ashape, multi_stage=args.use_fpn))
    # ashape: predefined anchor size
    # multi_stage + ashape: used to compute the anchors
    train_loader = mx.gluon.data.DataLoader(train_dataset.transform(
        train_transform(net.short,
                        net.max_size,
                        net,
                        ashape=net.ashape,
                        multi_stage=args.use_fpn)),
                                            batch_sampler=train_sampler,
                                            batchify_fn=train_bfn,
                                            num_workers=args.num_workers)
    val_bfn = Tuple(*[Append() for _ in range(3)])
    short = net.short[-1] if isinstance(net.short,
                                        (tuple, list)) else net.short
    # validation use 1 sample per device
    # dataset: val_dataset.transform(val_transform(short, net.max_size))
    # each item returns img, bbox.astype('float32'), mx.nd.array([im_scale])
    # bbox: x1, y1, x2, y2, class_id
    # the shortest side of img is <= short, the longest side is <= net.max_size
    # Tuple here is the batchify Tuple, not Python's built-in tuple
    # Append(): keeps each sample as its own ndarray; samples may differ in size, and the returned batch is a list
    # val_bfn has 3 Append() instances, one for each field of a dataset item
    val_loader = mx.gluon.data.DataLoader(val_dataset.transform(
        val_transform(short, net.max_size)),
                                          num_shards,
                                          False,
                                          batchify_fn=val_bfn,
                                          last_batch='keep',
                                          num_workers=args.num_workers)
    return train_loader, val_loader
Example #13
def get_dataloader(
    net,
    train_dataset,
    val_dataset,
    train_transform,
    val_transform,
    batch_size,
    num_shards_per_process,
    args,
):
    """Get dataloader."""
    train_bfn = batchify.MaskRCNNTrainBatchify(net, num_shards_per_process)
    train_sampler = gcv.nn.sampler.SplitSortedBucketSampler(
        train_dataset.get_im_aspect_ratio(),
        batch_size,
        num_parts=hvd.size() if args.horovod else 1,
        part_index=hvd.rank() if args.horovod else 0,
        shuffle=True,
    )
    train_loader = mx.gluon.data.DataLoader(
        train_dataset.transform(
            train_transform(net.short,
                            net.max_size,
                            net,
                            ashape=net.ashape,
                            multi_stage=True)),
        batch_sampler=train_sampler,
        batchify_fn=train_bfn,
        num_workers=args.num_workers,
    )
    val_bfn = batchify.Tuple(*[batchify.Append() for _ in range(2)])
    short = net.short[-1] if isinstance(net.short,
                                        (tuple, list)) else net.short
    # validation use 1 sample per device
    val_loader = mx.gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)),
        num_shards_per_process,
        False,
        batchify_fn=val_bfn,
        last_batch="keep",
        num_workers=args.num_workers,
    )
    return train_loader, val_loader
Example #14
    def __init__(self, optimizer):
        """Construct a new ScheduledOptimizer, which uses horovod optimizer under the hood for averaging gradients
        across all the Horovod ranks.

        Args:
            optimizer: Optimizer to use for computing and averaging gradients and applying updates.
        """
        self._optimizer = optimizer
        self._immediate = False

        # Let rank 0 decide the communication order
        self._rank = hvd.rank()
        if self._rank != 0:
            self._immediate = True

        self._first_key = None
        self._step = 0

        core.start(rank=self._rank, arch="allreduce")
Example #15
def get_dataloader(net, train_dataset, val_dataset, train_transform,
                   val_transform, batch_size, num_shards, args):
    """Get dataloader."""
    train_bfn = FasterRCNNTrainBatchify(net, num_shards)
    if hasattr(train_dataset, 'get_im_aspect_ratio'):
        im_aspect_ratio = train_dataset.get_im_aspect_ratio()
    else:
        im_aspect_ratio = [1.] * len(train_dataset)
    if args.horovod:
        num_parts = hvd.size()
        part_index = hvd.rank()
    elif "perseus" in args.kv_store:
        num_parts = kv.num_workers
        part_index = kv.rank
    else:
        num_parts = 1
        part_index = 0
    train_sampler = \
        gcv.nn.sampler.SplitSortedBucketSampler(im_aspect_ratio, batch_size,
                                                num_parts=num_parts,
                                                part_index=part_index,
                                                shuffle=True)
    train_loader = mx.gluon.data.DataLoader(train_dataset.transform(
        train_transform(net.short,
                        net.max_size,
                        net,
                        ashape=net.ashape,
                        multi_stage=args.use_fpn)),
                                            batch_sampler=train_sampler,
                                            batchify_fn=train_bfn,
                                            num_workers=args.num_workers)
    val_bfn = Tuple(*[Append() for _ in range(3)])
    short = net.short[-1] if isinstance(net.short,
                                        (tuple, list)) else net.short
    # validation use 1 sample per device
    val_loader = mx.gluon.data.DataLoader(val_dataset.transform(
        val_transform(short, net.max_size)),
                                          num_shards,
                                          False,
                                          batchify_fn=val_bfn,
                                          last_batch='keep',
                                          num_workers=args.num_workers)
    return train_loader, val_loader
Example #16
    def test_horovod_alltoall_splits_type_error(self):
        """Test that the alltoall returns an error if the splits tensor does not
           contain 32-bit integers."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if NCCL version < 2.7.0
        if hvd.nccl_built() and hvd.nccl_built() < 2700:
            self.skipTest("NCCL-based Alltoall requires NCCL version >= 2.7.0.")

        ctx = self._current_context()
        tensor = mx.ndarray.empty([size], ctx=ctx)
        splits = mx.ndarray.ones([size], dtype='float32', ctx=ctx)
        try:
            hvd.alltoall(tensor, splits)
            assert False, 'hvd.alltoall did not throw error'
        except (MXNetError, ValueError):
            pass
Example #17
    def test_horovod_alltoall_splits_error(self):
        """Test that the alltoall returns an error if the sum of the splits entries exceeds
        the first dimension of the input tensor."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if NCCL version < 2.7.0
        if hvd.nccl_built() and hvd.nccl_built() < 2700:
            self.skipTest(
                "NCCL-based Alltoall requires NCCL version >= 2.7.0.")

        ctx = self._current_context()
        tensor = mx.ndarray.empty([size - 1], ctx=ctx)
        splits = mx.ndarray.ones([size], dtype='int32', ctx=ctx)
        try:
            hvd.alltoall(tensor, splits)
            assert False, 'hvd.alltoall did not throw error'
        except (MXNetError, RuntimeError):
            pass
Example #18
    def test_horovod_broadcast_rank_error(self):
        """Test that the broadcast returns an error if different ranks
           specify different root rank."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        ctx = self._current_context()
        shape = (17, 17, 17)
        tensor = mx.nd.ones(shape=shape, ctx=ctx)
        try:
            output = hvd.broadcast(tensor, root_rank=rank)
            output.wait_to_read()
            assert False, 'hvd.broadcast did not throw rank error'
        except (MXNetError, RuntimeError):
            pass
Example #19
    def test_horovod_broadcast_error(self):
        """Test that the broadcast returns an error if any dimension besides
           the first is different among the tensors being broadcasted."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        ctx = self._current_context()
        shape = (17, rank + 1)
        tensor = mx.nd.ones(shape=shape, ctx=ctx)

        try:
            output = hvd.broadcast(tensor, 0)
            output.wait_to_read()
            assert False, 'hvd.broadcast did not throw error'
        except (MXNetError, RuntimeError):
            pass
Example #20
    def test_horovod_alltoall(self):
        """Test that the alltoall correctly distributes 1D, 2D, and 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if using gloo controller
        if hvd.gloo_enabled():
            self.skipTest(
                "Alltoall currently does not support Gloo controller.")

        # This test does not apply if NCCL version < 2.7.0
        if hvd.nccl_built() and hvd.nccl_built() < 2700:
            self.skipTest(
                "NCCL-based Alltoall requires NCCL version >= 2.7.0.")

        dtypes = ['int32', 'int64', 'float32', 'float64']
        dims = [1, 2, 3]
        ctx = self._current_context()
        for dtype, dim in itertools.product(dtypes, dims):
            vals = []
            for i in range(size):
                vals += [i] * (rank + 1)

            tensor = mx.ndarray.array(vals, dtype=dtype, ctx=ctx)
            for _ in range(dim - 1):
                tensor = mx.ndarray.expand_dims(tensor, axis=1)
                tensor = mx.ndarray.concat(tensor, tensor, dim=1)

            splits = mx.ndarray.array([rank + 1] * size,
                                      dtype='int32',
                                      ctx=ctx)
            collected = hvd.alltoall(tensor, splits)

            assert collected.min() == rank, \
                'hvd.alltoall produces incorrect collected tensor'
            assert collected.max() == rank, \
                'hvd.alltoall produces incorrect collected tensor'
            assert collected.size == size * (size + 1) // 2 * 2**(dim - 1), \
                'hvd.alltoall collected wrong number of values'
Example #21
    def test_horovod_allgather_error(self):
        """Test that the allgather returns an error if any dimension besides
        the first is different among the tensors being gathered."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        ctx = self._current_context()

        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = mx.ndarray.ones(shape=tensor_size, ctx=ctx)

        try:
            hvd.allgather(tensor)
            assert False, 'hvd.allgather did not throw error'
        except (MXNetError, RuntimeError):
            pass
Example #22
def init_comm(backend):
    """Init communication backend"""
    # backend specific implementation
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd
        except ImportError:
            logging.info('horovod must be installed.')
            exit()
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    elif backend == 'byteps':
        try:
            import byteps.mxnet as bps
        except ImportError:
            logging.info('BytePS must be installed.')
            exit()
        bps.init()
        store = None
        num_workers = bps.size()
        rank = bps.rank()
        local_rank = bps.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    else:
        # kvstore
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        ctxs = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
               [mx.gpu(int(x)) for x in args.gpus.split(',')]
    return store, num_workers, rank, local_rank, is_master_node, ctxs
Example #23
    def test_horovod_alltoall_equal_split_length_error(self):
        """Test that the alltoall with default splitting returns an error if the first dimension
        of tensor is not a multiple of the number of workers."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        # This test does not apply if NCCL version < 2.7.0
        if hvd.nccl_built() and hvd.nccl_built() < 2700:
            self.skipTest("NCCL-based Alltoall requires NCCL version >= 2.7.0.")

        ctx = self._current_context()
        tensor = mx.ndarray.empty([size + 1], ctx=ctx)
        try:
            hvd.alltoall(tensor)
            assert False, 'hvd.alltoall did not throw error'
        except (MXNetError, RuntimeError):
            pass
Example #24
    def test_horovod_allgather(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = ['int32',   'int64',
                  'float32', 'float64']
        dims = [1, 2, 3]
        ctx = self._current_context()
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = mx.ndarray.ones(shape=[17] * dim, dtype=dtype, ctx=ctx) * rank
            gathered = hvd.allgather(tensor)

            assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)

            for i in range(size):
                rank_tensor = gathered[i * 17:(i + 1) * 17]
                assert list(rank_tensor.shape) == [17] * dim, \
                    'hvd.allgather produces incorrect gathered shape'
                assert rank_tensor.min() == i, 'hvd.allgather produces incorrect gathered tensor'
                assert rank_tensor.max() == i, 'hvd.allgather produces incorrect gathered tensor'
Example #25
def get_dali_dataloader(net, train_dataset, val_dataset, data_shape, global_batch_size, num_workers, devices, ctx, horovod, seed):
    width, height = data_shape, data_shape
    with autograd.train_mode():
        _, _, anchors = net(mx.nd.zeros((1, 3, height, width), ctx=ctx))
    anchors = anchors.as_in_context(mx.cpu())

    if horovod:
        batch_size = global_batch_size // hvd.size()
        pipelines = [SSDDALIPipeline(device_id=hvd.local_rank(), batch_size=batch_size,
                                     data_shape=data_shape, anchors=anchors,
                                     num_workers=num_workers, dataset_reader = train_dataset[0],
                                     seed=seed)]
    else:
        num_devices = len(devices)
        batch_size = global_batch_size // num_devices
        pipelines = [SSDDALIPipeline(device_id=device_id, batch_size=batch_size,
                                     data_shape=data_shape, anchors=anchors,
                                     num_workers=num_workers,
                                     dataset_reader = train_dataset[i],
                                     seed=seed) for i, device_id in enumerate(devices)]

    epoch_size = train_dataset[0].size()
    if horovod:
        epoch_size //= hvd.size()
    train_loader = DALIGenericIterator(pipelines, [('data', DALIGenericIterator.DATA_TAG),
                                                    ('bboxes', DALIGenericIterator.LABEL_TAG),
                                                    ('label', DALIGenericIterator.LABEL_TAG)],
                                                    epoch_size, auto_reset=True)

    # validation
    if (not horovod or hvd.rank() == 0):
        val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        val_loader = gluon.data.DataLoader(
            val_dataset.transform(SSDDefaultValTransform(width, height)),
            global_batch_size, False, batchify_fn=val_batchify_fn, last_batch='keep', num_workers=num_workers)
    else:
        val_loader = None

    return train_loader, val_loader
Example #26
    def test_horovod_allgather_type_error(self):
        """Test that the allgather returns an error if the types being gathered
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        ctx = self._current_context()

        tensor_size = [17] * 3
        if rank % 2 == 0:
            tensor = mx.ndarray.ones(shape=tensor_size, dtype="int32", ctx=ctx)
        else:
            tensor = mx.ndarray.ones(shape=tensor_size, dtype="float32", ctx=ctx)

        try:
            hvd.allgather(tensor)
            assert False, 'hvd.allgather did not throw error'
        except (MXNetError, RuntimeError):
            pass
Example #27
    def __init__(self, symbol, fc7_model, memory_bank, memory_optimizer,
                 logger=logging, ):
        self.size = hvd.size()
        self.rank = hvd.rank()
        self.local_rank = hvd.local_rank()
        self.gpu = mx.gpu(self.local_rank)
        self.cpu = mx.cpu()                                     # `device_id` is not needed for CPU.
        self.nd_cache = {}
        self.embedding_size = config.embedding_size
        self.batch_size = config.batch_size
        self.num_update = 0
        self.batch_end_param = namedtuple('batch_end_param', ['loss', 'num_epoch', 'num_update'])

        self.fc7_model = fc7_model
        self.symbol = symbol
        self.logger = logger
        self.backbone_module = mx.module.Module(
            self.symbol, ['data'], ['softmax_label'], logger=self.logger, context=self.gpu)

        self.memory_bank = memory_bank
        self.memory_optimizer = memory_optimizer
        self.memory_lr = None
        self.loss_cache = None
        self.grad_cache = None
Example #28
def test_allreduce(use_horovod, dtype):
    if not use_horovod:
        # Assumed fallback: use a local kvstore when this process is not a distributed worker.
        kvstore_type = "dist_sync_device" if os.environ.get(
            "DMLC_ROLE") == "worker" else "local"
        kv = mx.kvstore.create(kvstore_type)
        rank = kv.rank
        num_workers = kv.num_workers
    else:
        kvstore_type = "device"
        kv = mx.kvstore.create(kvstore_type)
        hvd.init()
        rank = hvd.rank()
        num_workers = hvd.size()
    print('use horovod: {}, rank {}/{}, kv type: {}, usetree: {}'.format(
        use_horovod, rank, num_workers, kvstore_type,
        os.environ.get("MXNET_KVSTORE_USETREE")))

    rescale_grad = 1.0 / (8 * num_workers)
    if use_horovod:
        rescale_grad = rescale_grad * num_workers

    optimizer_params = dict(
        momentum=0,  # pOpt.optimizer.momentum,
        wd=0,  # pOpt.optimizer.wd,
        learning_rate=0.1,
        rescale_grad=rescale_grad,
    )
    optimizer = mx.optimizer.create("sgd", **optimizer_params)
    if use_horovod:
        # Horovod: wrap optimizer with DistributedOptimizer
        optimizer = hvd.DistributedOptimizer(optimizer)

    print("opt rescale:{}".format(optimizer.rescale_grad))
    kv.set_optimizer(optimizer)

    test_hvd_kv(rank, num_workers, kv, dtype)
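The rescale_grad handling above divides by the per-worker contribution count and, on the kvstore path, additionally by the number of workers; the Horovod path multiplies that factor back, leaving only the per-worker term. A quick check of the arithmetic with illustrative numbers:

num_workers = 4
kv_rescale = 1.0 / (8 * num_workers)     # kvstore path: divide by both factors
hvd_rescale = kv_rescale * num_workers   # Horovod path: undo the 1/num_workers factor
assert hvd_rescale == 1.0 / 8            # only the per-worker term remains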
Example #29
def train(net, train_data, val_data, eval_metric, batch_size, ctx, args):
    """Training pipeline"""
    kv = mx.kvstore.create(args.kv_store)
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    optimizer_params = {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    }
    if args.horovod:
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(
            net.collect_train_params(
            ),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params)
    else:
        trainer = gluon.Trainer(
            net.collect_train_params(
            ),  # fix batchnorm, fix first stage, etc...
            'sgd',
            optimizer_params,
            update_on_kvstore=(False if args.amp else None),
            kvstore=kv)

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
    ]

    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [
        rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric
    ]

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if not args.disable_hybridization:
            net.hybridize(static_alloc=args.static_alloc)
        rcnn_task = ForwardBackwardTask(net,
                                        trainer,
                                        rpn_cls_loss,
                                        rpn_box_loss,
                                        rcnn_cls_loss,
                                        rcnn_box_loss,
                                        mix_ratio=1.0)
        executor = Parallel(args.executor_threads,
                            rcnn_task) if not args.horovod else None
        if args.mixup:
            # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise
            train_data._dataset._data.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            if epoch >= args.epochs - args.no_mixup_epochs:
                train_data._dataset._data.set_mixup(None)
                mix_ratio = 1.0
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        base_lr = trainer.learning_rate
        rcnn_task.mix_ratio = mix_ratio
        print(len(train_data))
        for i, batch in enumerate(train_data):
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup,
                                                  args.lr_warmup_factor)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.
                            format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            if executor is not None:
                for data in zip(*batch):
                    executor.put(data)
            for j in range(len(ctx)):
                if executor is not None:
                    result = executor.get()
                else:
                    result = rcnn_task.forward_backward(list(zip(*batch))[0])
                if (not args.horovod) or hvd.rank() == 0:
                    for k in range(len(metric_losses)):
                        metric_losses[k].append(result[k])
                    for k in range(len(add_losses)):
                        add_losses[k].append(result[len(metric_losses) + k])
            for metric, record in zip(metrics, metric_losses):
                metric.update(0, record)
            for metric, records in zip(metrics2, add_losses):
                for pred in records:
                    metric.update(pred[0], pred[1])
            trainer.step(batch_size)

            # update metrics
            if (not args.horovod or hvd.rank() == 0) and args.log_interval \
                    and not (i + 1) % args.log_interval:
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(
                        epoch, i, args.log_interval * args.batch_size /
                        (time.time() - btic), msg))
                btic = time.time()

        if (not args.horovod) or hvd.rank() == 0:
            msg = ','.join(
                ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
            logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
                epoch, (time.time() - tic), msg))
            if not (epoch + 1) % args.val_interval:
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric,
                                             args)
                map_name_train, mean_ap_train = validate(
                    net, train_data, ctx, eval_metric, args)
                if isinstance(map_name, list):
                    val_msg = '\n'.join([
                        '{}={}'.format(k, v)
                        for k, v in zip(map_name, mean_ap)
                    ])
                    train_msg = '\n'.join([
                        '{}={}'.format(k, v)
                        for k, v in zip(map_name_train, mean_ap_train)
                    ])
                    current_map = float(mean_ap[-1])
                else:
                    val_msg = '{}={}'.format(map_name, mean_ap)
                    train_msg = '{}={}'.format(map_name_train, mean_ap_train)
                    current_map = mean_ap
                logger.info('[Epoch {}] Validation: {}'.format(epoch, val_msg))
                logger.info('[Epoch {}] Train: {}'.format(epoch, train_msg))
            else:
                current_map = 0.
            save_params(net, logger, best_map, current_map, epoch,
                        args.save_interval,
                        os.path.join(args.model_dir, 'fastrcnn'))
        if executor is not None:
            executor.__del__()
Example #30
    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if opt.resume_params == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        hvd.broadcast_parameters(net.collect_params(), root_rank=0)

        # trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

        # trainer = hvd.DistributedTrainer(
        #     net.collect_params(),  
        #     optimizer,
        #     optimizer_params)

        if opt.trainer == 'sgd':
            trainer = SGDTrainer(
                net.collect_params(),  
                optimizer, optimizer_params)
        elif opt.trainer == 'efsgd':
            trainer = EFSGDTrainerV1(
                net.collect_params(),  
                'EFSGDV1', optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1)
        elif opt.trainer == 'qsparselocalsgd':
            trainer = QSparseLocalSGDTrainerV1(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1,
                local_sgd_interval=opt.local_sgd_interval)
        elif opt.trainer == 'ersgd':
            trainer = ERSGDTrainerV2(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1)
        elif opt.trainer == 'partiallocalsgd':
            trainer = PartialLocalSGDTrainerV1(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1,
                local_sgd_interval=opt.local_sgd_interval)
        elif opt.trainer == 'ersgd2':
            trainer = ERSGD2TrainerV2(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio_1=1./opt.input_sparse_1, 
                output_sparse_ratio_1=1./opt.output_sparse_1, 
                layer_sparse_ratio_1=1./opt.layer_sparse_1,
                input_sparse_ratio_2=1./opt.input_sparse_2, 
                output_sparse_ratio_2=1./opt.output_sparse_2, 
                layer_sparse_ratio_2=1./opt.layer_sparse_2,
                local_sgd_interval=opt.local_sgd_interval)
        else:
            trainer = SGDTrainer(
                net.collect_params(),  
                optimizer, optimizer_params)

        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature,
                                                                 hard_weight=opt.hard_weight,
                                                                 sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

        best_val_score = 1

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            if opt.use_rec:
                train_data.reset()
            # train_metric.reset()
            train_loss = 0
            btic = time.time()

            # test speed
            if opt.test_speed > 0:
                n_repeats = opt.test_speed
            elif opt.test_speed == 0:
                n_repeats = 1
            else:
                n_repeats = 0

            for i, batch in enumerate(train_data):
                
                # test speed
                if n_repeats == 0 and not (i+1)%opt.log_interval:
                    print('[Epoch %d] # batch: %d'%(epoch, i))
                    continue

                data, label = batch_fn(batch, ctx)

                for j in range(n_repeats):

                    if opt.mixup:
                        lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                        if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                            lam = 1
                        data = [lam*X + (1-lam)*X[::-1] for X in data]

                        if opt.label_smoothing:
                            eta = 0.1
                        else:
                            eta = 0.0
                        label = mixup_transform(label, classes, lam, eta)

                    elif opt.label_smoothing:
                        hard_label = label
                        label = smooth(label, classes)

                    if distillation:
                        teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \
                                        for X in data]

                    with ag.record():
                        outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                        if distillation:
                            loss = [L(yhat.astype('float32', copy=False),
                                    y.astype('float32', copy=False),
                                    p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)]
                        else:
                            loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
                    for l in loss:
                        l.backward()
                    trainer.step(batch_size)

                    # if opt.mixup:
                    #     output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                    #                     for out in outputs]
                    #     train_metric.update(label, output_softmax)
                    # else:
                    #     if opt.label_smoothing:
                    #         train_metric.update(hard_label, outputs)
                    #     else:
                    #         train_metric.update(label, outputs)

                    step_loss = sum([l.sum().asscalar() for l in loss])

                    train_loss += step_loss

                    if opt.log_interval and not (i+j+1)%opt.log_interval:
                        # train_metric_name, train_metric_score = train_metric.get()
                        if hvd.rank() == 0:
                            # logger.info('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            #             epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            #             train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                            # print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            #             epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            #             train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                            print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                                        epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                                        'loss', step_loss/batch_size, trainer.learning_rate, trainer._comm_counter/1e6))
                        btic = time.time()

            mx.nd.waitall()
            toc = time.time()

            if n_repeats == 0:
                allreduce_array_nd = mx.nd.array([i])
                hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
                mx.nd.waitall()
                print('[Epoch %d] # total batch: %d'%(epoch, i))
                continue

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i /(toc - tic) * hvd.size())

            train_loss /= (batch_size * i)

            if opt.trainer == 'ersgd' or opt.trainer == 'qsparselocalsgd' or opt.trainer == 'ersgd2' or opt.trainer == 'partiallocalsgd':
                allreduce_for_val = True
            else:
                allreduce_for_val = False

            if allreduce_for_val:
                trainer.pre_test()
            # err_train_tic = time.time()
            # err_top1_train, err_top5_train = test(ctx, train_data, val=False)
            err_val_tic = time.time()
            err_top1_val, err_top5_val = test(ctx, val_data, val=True)
            err_val_toc = time.time()
            if allreduce_for_val:
                trainer.post_test()

            mx.nd.waitall()

            # allreduce the results
            allreduce_array_nd = mx.nd.array([train_loss, err_top1_val, err_top5_val])
            hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
            allreduce_array_np = allreduce_array_nd.asnumpy()
            train_loss = np.asscalar(allreduce_array_np[0])
            err_top1_val = np.asscalar(allreduce_array_np[1])
            err_top5_val = np.asscalar(allreduce_array_np[2])

            if hvd.rank() == 0:
                # logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score))
                logger.info('[Epoch %d] training: loss=%f'%(epoch, train_loss))
                logger.info('[Epoch %d] speed: %d samples/sec training-time: %f comm: %f'%(epoch, throughput, toc-tic, trainer._comm_counter/1e6))
                logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f err-time=%f'%(epoch, err_top1_val, err_top5_val, err_val_toc - err_val_tic))
                trainer._comm_counter = 0

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                # if hvd.local_rank() == 0:
                #     net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))
                #     trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states'%(save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
                if hvd.local_rank() == 0:
                    net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, epoch))
                    trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, epoch))
Example #31
    def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers):
        rec_train = os.path.expanduser(rec_train)
        rec_train_idx = os.path.expanduser(rec_train_idx)
        rec_val = os.path.expanduser(rec_val)
        rec_val_idx = os.path.expanduser(rec_val_idx)
        jitter_param = 0.4
        lighting_param = 0.1
        input_size = opt.input_size
        crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))
        mean_rgb = [123.68, 116.779, 103.939]
        std_rgb = [58.393, 57.12, 57.375]

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            return data, label

        train_data = mx.io.ImageRecordIter(
            path_imgrec         = rec_train,
            path_imgidx         = rec_train_idx,
            preprocess_threads  = num_workers,
            shuffle             = True,
            batch_size          = batch_size,
            round_batch         = False,
            data_shape          = (3, input_size, input_size),
            mean_r              = mean_rgb[0],
            mean_g              = mean_rgb[1],
            mean_b              = mean_rgb[2],
            std_r               = std_rgb[0],
            std_g               = std_rgb[1],
            std_b               = std_rgb[2],
            rand_mirror         = True,
            random_resized_crop = True,
            max_aspect_ratio    = 4. / 3.,
            min_aspect_ratio    = 3. / 4.,
            max_random_area     = 1,
            min_random_area     = 0.08,
            brightness          = jitter_param,
            saturation          = jitter_param,
            contrast            = jitter_param,
            pca_noise           = lighting_param,
            num_parts           = hvd.size(),
            part_index          = hvd.rank(),
        )
        val_data = mx.io.ImageRecordIter(
            path_imgrec         = rec_val,
            path_imgidx         = rec_val_idx,
            preprocess_threads  = num_workers,
            shuffle             = False,
            batch_size          = batch_size,

            resize              = resize,
            data_shape          = (3, input_size, input_size),
            mean_r              = mean_rgb[0],
            mean_g              = mean_rgb[1],
            mean_b              = mean_rgb[2],
            std_r               = std_rgb[0],
            std_g               = std_rgb[1],
            std_b               = std_rgb[2],
        )
        return train_data, val_data, batch_fn
Example #32
        batch_size=batch_size,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    return train_iter, val_iter

# Step 1: initialize Horovod
hvd.init()

# Horovod: pin context to process
context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())

# Step 2: load data
train_iter, val_iter = get_mnist_iterator(hvd.rank())


# Step 3: define network
def conv_net():
    # placeholder for data
    data = mx.sym.var('data')
    # first conv layer
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=10)
    relu1 = mx.sym.Activation(data=conv1, act_type='relu')
    pool1 = mx.sym.Pooling(data=relu1, pool_type='max', kernel=(2, 2),
                           stride=(2, 2))
    # second conv layer
    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=20)
    relu2 = mx.sym.Activation(data=conv2, act_type='relu')
    pool2 = mx.sym.Pooling(data=relu2, pool_type='max', kernel=(2, 2),