Code example #1
def get_mnist_iterator(rank):
    data_dir = "data-%d" % rank
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                             dirname=data_dir)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(data_dir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    return train_iter, val_iter
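Note: the num_parts/part_index arguments above make each Horovod worker read a disjoint shard of the dataset. As a minimal pure-Python sketch of the contiguous-split scheme these arguments imply (the shard helper is illustrative, not part of MXNet's API):

def shard(samples, num_parts, part_index):
    # Contiguous split: worker k owns samples[k*n : (k+1)*n],
    # so the num_parts workers cover the dataset without overlap.
    n = len(samples) // num_parts
    return samples[part_index * n:(part_index + 1) * n]

data = list(range(8))
shards = [shard(data, num_parts=4, part_index=k) for k in range(4)]
assert shards == [[0, 1], [2, 3], [4, 5], [6, 7]]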
Code example #2
def _get_dataloader(net, train_dataset, val_dataset, train_transform, val_transform, batch_size,
                    num_shards_per_process, args):
    """Get dataloader."""
    train_bfn = batchify.MaskRCNNTrainBatchify(net, num_shards_per_process)
    train_sampler = \
        SplitSortedBucketSampler(train_dataset.get_im_aspect_ratio(),
                                 batch_size,
                                 num_parts=hvd.size() if args.horovod else 1,
                                 part_index=hvd.rank() if args.horovod else 0,
                                 shuffle=True)
    train_loader = gluon.data.DataLoader(
        train_dataset.transform(train_transform(net.short, net.max_size,
                                                net, ashape=net.ashape, multi_stage=args.mask_rcnn.use_fpn)),
        batch_sampler=train_sampler, batchify_fn=train_bfn, num_workers=args.num_workers)
    val_bfn = batchify.Tuple(*[batchify.Append() for _ in range(2)])
    short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
    # validation use 1 sample per device
    val_loader = gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)), num_shards_per_process, False,
        batchify_fn=val_bfn, last_batch='keep', num_workers=args.num_workers)
    return train_loader, val_loader
Code example #3
File: yolo.py Project: kaixinbaba/gluon-cv
    def _resume_fit(self, train_data, val_data):
        if max(self._cfg.train.start_epoch, self.epoch) >= self._cfg.train.epochs:
            return {'time': self._time_elapsed}
        if not self.classes or not self.num_class:
            raise ValueError('Unable to determine classes of dataset')

        # training dataset
        train_dataset = train_data.to_mxnet()
        val_dataset = val_data.to_mxnet()
        # training dataloader
        self.batch_size = self._cfg.train.batch_size // hvd.size() if self._cfg.horovod else self._cfg.train.batch_size
        train_loader, val_loader, train_eval_loader = _get_dataloader(
            self.async_net, train_dataset, val_dataset, self._cfg.yolo3.data_shape,
            self.batch_size, self._cfg.num_workers, self._cfg)

        if self._cfg.train.no_wd:
            for _, v in self.net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0
        if self._cfg.train.label_smooth:
            self.net._target_generator._label_smooth = True
        return self._train_loop(train_loader, val_loader, train_eval_loader)
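Note: under Horovod, _cfg.train.batch_size is treated as the global batch size, so each worker processes batch_size // hvd.size() samples per step. A quick sketch of the arithmetic, with made-up numbers:

global_batch_size = 256
num_workers = 8                                      # hvd.size()
per_worker_batch = global_batch_size // num_workers
assert per_worker_batch == 32
# One optimizer step still consumes the full global batch:
assert per_worker_batch * num_workers == global_batch_size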
Code example #4
    def test_horovod_broadcast_error(self):
        """Test that the broadcast returns an error if any dimension besides
           the first is different among the tensors being broadcasted."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        ctx = self._current_context()
        shape = (17, rank + 1)
        tensor = mx.nd.ones(shape=shape, ctx=ctx)

        try:
            output = hvd.broadcast(tensor, 0)
            output.wait_to_read()
            assert False, 'hvd.broadcast did not throw error'
        except (MXNetError, RuntimeError):
            pass
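Note: this try/except-then-assert-False pattern recurs throughout these tests. Assuming the test class derives from unittest.TestCase (as Horovod's MXNet test suite does), the same check can be written more tightly with assertRaises; a hedged sketch:

import unittest

import mxnet as mx
import horovod.mxnet as hvd
from mxnet.base import MXNetError

class BroadcastErrorSketch(unittest.TestCase):
    def test_broadcast_shape_mismatch(self):
        hvd.init()
        if hvd.size() == 1:
            self.skipTest("Only one worker available")
        # The second dimension differs per rank, so broadcast must fail.
        tensor = mx.nd.ones(shape=(17, hvd.rank() + 1))
        with self.assertRaises((MXNetError, RuntimeError)):
            hvd.broadcast(tensor, root_rank=0).wait_to_read()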
Code example #5
def _get_dali_dataset(dataset_name, devices, args):
    if dataset_name.lower() == "coco":
        # training
        expanded_file_root = os.path.expanduser(args.dataset_root)
        coco_root = os.path.join(expanded_file_root, 'coco', 'train2017')
        coco_annotations = os.path.join(expanded_file_root, 'coco',
                                        'annotations',
                                        'instances_train2017.json')
        if args.horovod:
            train_dataset = [
                gdata.COCODetectionDALI(num_shards=hvd.size(),
                                        shard_id=hvd.rank(),
                                        file_root=coco_root,
                                        annotations_file=coco_annotations,
                                        device_id=hvd.local_rank())
            ]
        else:
            train_dataset = [gdata.COCODetectionDALI(num_shards=len(devices), shard_id=i, file_root=coco_root,
                                                     annotations_file=coco_annotations, device_id=i) \
                                                     for i, _ in enumerate(devices)]

        # validation
        if not args.horovod or hvd.rank() == 0:
            val_dataset = gdata.COCODetection(
                root=os.path.join(args.dataset_root, 'coco'),
                splits='instances_val2017',
                skip_empty=False)
            val_metric = COCODetectionMetric(
                val_dataset,
                os.path.join(args.logdir, args.save_prefix + '_eval'),
                cleanup=True,
                data_shape=(args.ssd.data_shape, args.ssd.data_shape))
        else:
            val_dataset = None
            val_metric = None
    else:
        raise NotImplementedError(
            'Dataset: {} not implemented with DALI.'.format(dataset_name))

    return train_dataset, val_dataset, val_metric
Code example #6
File: test_mxnet.py Project: zruiyuan/horovod
    def test_horovod_alltoall(self):
        """Test that the alltoall correctly distributes 1D, 2D, and 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if using gloo controller
        if hvd.gloo_enabled():
            self.skipTest(
                "Alltoall currently does not support Gloo controller.")

        # This test does not apply if NCCL version < 2.7.0
        if hvd.nccl_built() and hvd.nccl_built() < 2700:
            self.skipTest(
                "NCCL-based Alltoall requires NCCL version >= 2.7.0.")

        dtypes = ['int32', 'int64', 'float32', 'float64']
        dims = [1, 2, 3]
        ctx = self._current_context()
        for dtype, dim in itertools.product(dtypes, dims):
            vals = []
            for i in range(size):
                vals += [i] * (rank + 1)

            tensor = mx.ndarray.array(vals, dtype=dtype, ctx=ctx)
            for _ in range(dim - 1):
                tensor = mx.ndarray.expand_dims(tensor, axis=1)
                tensor = mx.ndarray.concat(tensor, tensor, dim=1)

            splits = mx.ndarray.array([rank + 1] * size,
                                      dtype='int32',
                                      ctx=ctx)
            collected = hvd.alltoall(tensor, splits)

            assert collected.min() == rank, \
                'hvd.alltoall produces incorrect collected tensor'
            assert collected.max() == rank, \
                'hvd.alltoall produces incorrect collected tensor'
            assert collected.size == size * (size + 1) // 2 * 2 ** (dim - 1), \
                'hvd.alltoall collected wrong number of values'
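Note: the final size check follows from the splits: sender s transmits s+1 copies of each destination's rank, so every rank collects 1 + 2 + ... + size = size*(size+1)/2 values in the 1D case, doubled once per extra dimension. A pure-Python simulation of alltoall-with-splits (names are illustrative; no Horovod required) confirms the 1D count:

def alltoall_sim(send_bufs, splits):
    # Worker s sends its next splits[s][d] buffered items to worker d.
    size = len(send_bufs)
    recv = [[] for _ in range(size)]
    for s in range(size):
        offset = 0
        for d in range(size):
            recv[d].extend(send_bufs[s][offset:offset + splits[s][d]])
            offset += splits[s][d]
    return recv

size = 4
# As in the test: rank r queues (r + 1) copies of every destination index.
send_bufs = [[d for d in range(size) for _ in range(r + 1)] for r in range(size)]
splits = [[r + 1] * size for r in range(size)]
recv = alltoall_sim(send_bufs, splits)
for r in range(size):
    assert set(recv[r]) == {r}                     # all collected values == rank
    assert len(recv[r]) == size * (size + 1) // 2  # 4 * 5 / 2 = 10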
Code example #7
    def test_horovod_grouped_allreduce_inplace(self):
        """Test that the in-place grouped allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        dtypes = self.filter_supported_types(
            ['int32', 'int64', 'float32', 'float64'])
        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 1
        shapes = [(), (17,), (17, 17), (17, 17, 17)]
        for dtype, dim in itertools.product(dtypes, dims):
            mx.random.seed(1234, ctx=ctx)

            tensors = [
                mx.nd.random.uniform(-100, 100, shape=shapes[dim], ctx=ctx)
                for _ in range(5)
            ]

            tensors = [tensor.astype(dtype) for tensor in tensors]

            multiplied = [tensor * size for tensor in tensors]

            hvd.grouped_allreduce_(tensors, average=False, name=str(count))

            count += 1

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in ['int32', 'int64']:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert all([almost_equal(t1.asnumpy(), t2.asnumpy(), atol=threshold)
                for t1, t2 in zip(tensors, multiplied)]), \
                f'hvd.grouped_allreduce_ produces incorrect results: {hvd.rank()} {count} {dtype} {dim}'
Code example #8
File: test_mxnet.py Project: fightseed/horovod-1
    def test_horovod_allreduce(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        dtypes = self.filter_supported_types(['int32',   'int64',
                                              'float32', 'float64'])
        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 0
        shapes = [(), (17,), (17, 17), (17, 17, 17)]
        for dtype, dim in itertools.product(dtypes, dims):
            # MXNet uses gpu_id as part of the seed, so to get identical seeds
            # we must set a context.
            mx.random.seed(1234, ctx=ctx)
            tensor = mx.nd.random.uniform(-100, 100, shape=shapes[dim],
                                          ctx=ctx)
            tensor = tensor.astype(dtype)
            summed = hvd.allreduce(tensor, average=False, name=str(count))
            multiplied = tensor * size
            max_difference = mx.nd.max(mx.nd.subtract(summed, multiplied))
            count += 1

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in ['int32', 'int64']:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            if max_difference > threshold:
                print("allreduce", count, dtype, dim, max_difference,
                      threshold)
                print("tensor", hvd.rank(), tensor)
                print("summed", hvd.rank(), summed)
                print("multiplied", hvd.rank(), multiplied)
            assert max_difference <= threshold, \
                'hvd.allreduce produces incorrect results'
Code example #9
File: run_pretraining.py Project: Rivendile/examples
def init_comm(backend):
    """Init communication backend"""
    # backend specific implementation
    if backend == 'horovod':
        try:
            import horovod.mxnet as hvd
        except ImportError:
            logging.info('Horovod must be installed.')
            exit()
        hvd.init()
        store = None
        num_workers = hvd.size()
        rank = hvd.rank()
        local_rank = hvd.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    elif backend == 'byteps':
        try:
            import byteps.mxnet as bps
        except ImportError:
            logging.info('BytePS must be installed.')
            exit()
        bps.init()
        store = None
        num_workers = bps.size()
        rank = bps.rank()
        local_rank = bps.local_rank()
        is_master_node = rank == local_rank
        ctxs = [mx.gpu(local_rank)]
    else:
        # kvstore
        store = mx.kv.create(backend)
        num_workers = store.num_workers
        rank = store.rank
        local_rank = 0
        is_master_node = rank == local_rank
        ctxs = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
               [mx.gpu(int(x)) for x in args.gpus.split(',')]
    return store, num_workers, rank, local_rank, is_master_node, ctxs
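A hedged usage sketch of init_comm (the backend string and printed fields mirror what the function returns; args.gpus is assumed to be defined by the enclosing script):

store, num_workers, rank, local_rank, is_master_node, ctxs = init_comm('horovod')
if is_master_node:
    print('workers=%d rank=%d local_rank=%d ctxs=%s'
          % (num_workers, rank, local_rank, ctxs))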
Code example #10
    def test_horovod_allgather_error(self):
        """Test that the allgather returns an error if any dimension besides
        the first is different among the tensors being gathered."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        ctx = self._current_context()

        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = mx.ndarray.ones(shape=tensor_size, ctx=ctx)

        try:
            hvd.allgather(tensor)
            assert False, 'hvd.allgather did not throw error'
        except (MXNetError, RuntimeError):
            pass
Code example #11
File: test_mxnet.py Project: tallamjr/horovod
    def test_horovod_alltoall_equal_split_length_error(self):
        """Test that the alltoall with default splitting returns an error if the first dimension
        of tensor is not a multiple of the number of workers."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        # This test does not apply if NCCL version < 2.7.0
        if hvd.nccl_built() and hvd.nccl_built() < 2700:
            self.skipTest("NCCL-based Alltoall requires NCCL version >= 2.7.0.")

        ctx = self._current_context()
        tensor = mx.ndarray.empty([size + 1], ctx=ctx)
        try:
            hvd.alltoall(tensor)
            assert False, 'hvd.alltoall did not throw error'
        except (MXNetError, RuntimeError):
            pass
Code example #12
File: test_mxnet.py Project: tallamjr/horovod
    def test_horovod_allgather(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = ['int32',   'int64',
                  'float32', 'float64']
        dims = [1, 2, 3]
        ctx = self._current_context()
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = mx.ndarray.ones(shape=[17] * dim, dtype=dtype, ctx=ctx) * rank
            gathered = hvd.allgather(tensor)

            assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)

            for i in range(size):
                rank_tensor = gathered[i * 17:(i + 1) * 17]
                assert list(rank_tensor.shape) == [17] * dim, \
                    'hvd.allgather produces incorrect gathered shape'
                assert rank_tensor.min() == i, 'hvd.allgather produces incorrect gathered tensor'
                assert rank_tensor.max() == i, 'hvd.allgather produces incorrect gathered tensor'
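Note: semantically, allgather concatenates every worker's tensor along axis 0. A NumPy sketch of the invariant the test checks (worker count and shapes are illustrative):

import numpy as np

size, dim = 4, 2
# Worker r contributes a (17, 17) tensor filled with its rank.
per_rank = [np.full([17] * dim, r) for r in range(size)]
gathered = np.concatenate(per_rank, axis=0)       # what allgather returns

assert gathered.shape == (17 * size, 17)
for r in range(size):
    block = gathered[r * 17:(r + 1) * 17]
    assert block.min() == block.max() == r        # each block is one rank's data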
Code example #13
    def test_horovod_allreduce_average(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        dtypes = ['int32', 'int64', 'float32', 'float64']
        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 0
        shapes = [(), (17,), (17, 17), (17, 17, 17)]
        for dtype, dim in itertools.product(dtypes, dims):
            mx.random.seed(1234, ctx=ctx)
            tensor = mx.nd.random.uniform(-100,
                                          100,
                                          shape=shapes[dim],
                                          ctx=ctx)
            tensor = tensor.astype(dtype)
            averaged = hvd.allreduce(tensor, average=True, name=str(count))
            tensor *= size
            tensor /= size
            max_difference = mx.nd.max(mx.nd.subtract(averaged, tensor))
            count += 1

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in ['int32', 'int64']:
                threshold = 1
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            if max_difference > threshold:
                print("average", count, dtype, dim, max_difference, threshold)
                print("tensor", hvd.rank(), tensor)
                print("averaged", hvd.rank(), averaged)
            assert max_difference <= threshold, \
                'hvd.allreduce produces incorrect results'
Code example #14
    def test_horovod_broadcast_grad(self):
        """Test the correctness of the broadcast gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        dtypes = ['int32',   'int64',
                  'float32', 'float64'] 
        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 0
        shapes = [(), (17,), (17, 17), (17, 17, 17)]
        root_rank = 1
        tensor_dict = {}
        broadcast_dict = {}
        root_dict = {}
        for dtype, dim in itertools.product(dtypes, dims):
            tensor_dict[count] = mx.nd.ones(shapes[dim], ctx=ctx) * rank
            root_dict[count] = mx.nd.ones(shapes[dim], ctx=ctx) * root_rank
            tensor_dict[count] = tensor_dict[count].astype(dtype)
            root_dict[count] = root_dict[count].astype(dtype)

            # Only do broadcasting using and on broadcast_tensor
            count += 1

        hvd.broadcast_parameters(tensor_dict, root_rank=root_rank)
        for i in range(count):
            if not same(tensor_dict[i].asnumpy(), root_dict[i].asnumpy()):
                print("broadcast", count, dtype, dim)
                print("broadcast_tensor", hvd.rank(), tensor_dict[i])
                print("root_tensor", hvd.rank(), root_dict[i])
                print("comparison", hvd.rank(), tensor_dict[i] == root_dict[i])
            assert same(tensor_dict[i].asnumpy(), root_dict[i].asnumpy()), \
                'hvd.broadcast produces incorrect broadcasted tensor'
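Note: outside of tests, broadcast_parameters is typically called once right after initialization so every worker starts from rank 0's weights. A minimal hedged sketch of the usual Horovod-MXNet startup sequence (the model and optimizer settings are placeholders):

import mxnet as mx
import horovod.mxnet as hvd
from mxnet import gluon

hvd.init()
ctx = mx.gpu(hvd.local_rank())

net = gluon.nn.Dense(10, in_units=20)   # placeholder model with known shapes
net.initialize(mx.init.Xavier(), ctx=ctx)

# Sync the initial weights from rank 0, then wrap the trainer so each
# step allreduces gradients across all workers.
hvd.broadcast_parameters(net.collect_params(), root_rank=0)
trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd',
                                 {'learning_rate': 0.01})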
Code example #15
File: test_mxnet.py Project: tallamjr/horovod
    def test_horovod_alltoall_splits_error(self):
        """Test that the alltoall returns an error if the sum of the splits entries exceeds
        the first dimension of the input tensor."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        # This test does not apply if NCCL version < 2.7.0
        if hvd.nccl_built() and hvd.nccl_built() < 2700:
            self.skipTest("NCCL-based Alltoall requires NCCL version >= 2.7.0.")

        ctx = self._current_context()
        tensor = mx.ndarray.empty([size-1], ctx=ctx)
        splits = mx.ndarray.ones([size], dtype='int32', ctx=ctx)
        try:
            hvd.alltoall(tensor, splits)
            assert False, 'hvd.alltoall did not throw error'
        except (MXNetError, RuntimeError):
            pass
Code example #16
def get_dataloader(net, train_dataset, val_dataset, train_transform, val_transform, batch_size,
                   num_shards, args):
    """Get dataloader."""
    train_bfn = FasterRCNNTrainBatchify(net, num_shards)
    if hasattr(train_dataset, 'get_im_aspect_ratio'):
        im_aspect_ratio = train_dataset.get_im_aspect_ratio()
    else:
        im_aspect_ratio = [1.] * len(train_dataset)
    train_sampler = \
        gcv.nn.sampler.SplitSortedBucketSampler(im_aspect_ratio, batch_size,
                                                num_parts=hvd.size() if args.horovod else 1,
                                                part_index=hvd.rank() if args.horovod else 0,
                                                shuffle=True)
    train_loader = mx.gluon.data.DataLoader(train_dataset.transform(
        train_transform(net.short, net.max_size, net, ashape=net.ashape, multi_stage=args.use_fpn)),
        batch_sampler=train_sampler, batchify_fn=train_bfn, num_workers=args.num_workers)
    val_bfn = Tuple(*[Append() for _ in range(3)])
    short = net.short[-1] if isinstance(net.short, (tuple, list)) else net.short
    # validation use 1 sample per device
    val_loader = mx.gluon.data.DataLoader(
        val_dataset.transform(val_transform(short, net.max_size)), num_shards, False,
        batchify_fn=val_bfn, last_batch='keep', num_workers=args.num_workers)
    return train_loader, val_loader
Code example #17
def test_allreduce(use_horovod, dtype):
    if not use_horovod:
        # Default to a device kvstore unless launched as a DMLC worker,
        # in which case the distributed synchronous kvstore is used.
        kvstore_type = "dist_sync_device" if os.environ.get(
            "DMLC_ROLE") == "worker" else "device"
        kv = mx.kvstore.create(kvstore_type)
        rank = kv.rank
        num_workers = kv.num_workers
    else:
        kvstore_type = "device"
        kv = mx.kvstore.create(kvstore_type)
        hvd.init()
        rank = hvd.rank()
        num_workers = hvd.size()
    print('use horovod: {}, rank {}/{}, kv type: {}, usetree: {}'.format(
        use_horovod, rank, num_workers, kvstore_type,
        os.environ.get("MXNET_KVSTORE_USETREE")))

    rescale_grad = 1.0 / (8 * num_workers)
    if use_horovod:
        rescale_grad = rescale_grad * num_workers

    optimizer_params = dict(
        momentum=0,  # pOpt.optimizer.momentum,
        wd=0,  # pOpt.optimizer.wd,
        learning_rate=0.1,
        rescale_grad=rescale_grad,
    )
    optimizer = mx.optimizer.create("sgd", **optimizer_params)
    if use_horovod:
        # Horovod: wrap optimizer with DistributedOptimizer
        optimizer = hvd.DistributedOptimizer(optimizer)

    print("opt rescale:{}".format(optimizer.rescale_grad))
    kv.set_optimizer(optimizer)

    test_hvd_kv(rank, num_workers, kv, dtype)
Code example #18
File: test_mxnet.py Project: tallamjr/horovod
    def test_horovod_allgather_type_error(self):
        """Test that the allgather returns an error if the types being gathered
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        ctx = self._current_context()

        tensor_size = [17] * 3
        if rank % 2 == 0:
            tensor = mx.ndarray.ones(shape=tensor_size, dtype="int32", ctx=ctx)
        else:
            tensor = mx.ndarray.ones(shape=tensor_size, dtype="float32", ctx=ctx)

        try:
            hvd.allgather(tensor)
            assert False, 'hvd.allgather did not throw error'
        except (MXNetError, RuntimeError):
            pass
Code example #19
    def test_horovod_allreduce(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        dtypes = self.filter_supported_types(
            ['int32', 'int64', 'float32', 'float64'])
        dims = [1, 2, 3]
        ctx = self._current_context()
        count = 0
        shapes = [(), (17,), (17, 17), (17, 17, 17)]
        for dtype, dim in itertools.product(dtypes, dims):
            # MXNet uses gpu_id as part of the seed, so to get identical seeds
            # we must set a context.
            mx.random.seed(1234, ctx=ctx)
            tensor = mx.nd.random.uniform(-100,
                                          100,
                                          shape=shapes[dim],
                                          ctx=ctx)
            tensor = tensor.astype(dtype)
            summed = hvd.allreduce(tensor, average=False, name=str(count))
            multiplied = tensor * size
            count += 1

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in ['int32', 'int64']:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert almost_equal(summed.asnumpy(), multiplied.asnumpy(), atol=threshold), \
                f'hvd.allreduce produces incorrect results: {hvd.rank()} {count} {dtype} {dim}'
Code example #20
    def _resume_fit(self, train_data, val_data, time_limit=math.inf):
        tic = time.time()
        if max(self._cfg.train.start_epoch,
               self.epoch) >= self._cfg.train.epochs:
            return {'time': self._time_elapsed}
        if not self.classes or not self.num_class:
            raise ValueError('Unable to determine classes of dataset')

        # dataset
        devices = [int(i) for i in self._cfg.gpus]
        train_dataset = train_data.to_mxnet()
        val_dataset = val_data.to_mxnet()

        # dataloader
        if self._cfg.train.dali:
            if not dali_found:
                raise SystemExit(
                    "DALI not found, please check if you installed it correctly."
                )
            train_loader, val_loader = _get_dali_dataloader(
                self.async_net, train_dataset, val_dataset,
                self._cfg.ssd.data_shape, self._cfg.train.batch_size,
                self._cfg.num_workers, devices, self.ctx[0], self._cfg.horovod)
            # The DALI path returns no separate train-eval loader; define it so
            # the _train_loop call below does not raise a NameError.
            train_eval_loader = None
        else:
            self.batch_size = self._cfg.train.batch_size // hvd.size() \
                if self._cfg.horovod else self._cfg.train.batch_size
            train_loader, val_loader, train_eval_loader = _get_dataloader(
                self.async_net, train_dataset, val_dataset,
                self._cfg.ssd.data_shape, self.batch_size,
                self._cfg.num_workers)

        self._time_elapsed += time.time() - tic
        return self._train_loop(train_loader,
                                val_loader,
                                train_eval_loader,
                                time_limit=time_limit)
Code example #21
    def __init__(self, symbol, fc7_model, memory_bank, memory_optimizer,
                 logger=logging, ):
        self.size = hvd.size()
        self.rank = hvd.rank()
        self.local_rank = hvd.local_rank()
        self.gpu = mx.gpu(self.local_rank)
        self.cpu = mx.cpu()                                     # `device_id` is not needed for CPU.
        self.nd_cache = {}
        self.embedding_size = config.embedding_size
        self.batch_size = config.batch_size
        self.num_update = 0
        self.batch_end_param = namedtuple('batch_end_param', ['loss', 'num_epoch', 'num_update'])

        self.fc7_model = fc7_model
        self.symbol = symbol
        self.logger = logger
        self.backbone_module = mx.module.Module(
            self.symbol, ['data'], ['softmax_label'], logger=self.logger, context=self.gpu)

        self.memory_bank = memory_bank
        self.memory_optimizer = memory_optimizer
        self.memory_lr = None
        self.loss_cache = None
        self.grad_cache = None
Code example #22
    def test_horovod_allreduce_cpu_gpu_error(self):
        """Test that the allreduce raises an error if different ranks try to
           perform reduction on CPU and GPU."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            self.skipTest("Only one worker available")

        shape = (17, 17, 17)
        if rank % 2 == 0:
            ctx = mx.gpu(hvd.rank())
        else:
            ctx = mx.cpu(hvd.rank())
        tensor = mx.nd.ones(shape=shape, ctx=ctx)

        try:
            output = hvd.allreduce(tensor)
            output.wait_to_read()
            assert False, 'hvd.allreduce did not throw cpu-gpu error'
        except (MXNetError, RuntimeError):
            pass
Code example #23
    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if opt.resume_params == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        hvd.broadcast_parameters(net.collect_params(), root_rank=0)

        # trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

        # trainer = hvd.DistributedTrainer(
        #     net.collect_params(),  
        #     optimizer,
        #     optimizer_params)

        if opt.trainer == 'sgd':
            trainer = SGDTrainer(
                net.collect_params(),  
                optimizer, optimizer_params)
        elif opt.trainer == 'efsgd':
            trainer = EFSGDTrainerV1(
                net.collect_params(),  
                'EFSGDV1', optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1)
        elif opt.trainer == 'qsparselocalsgd':
            trainer = QSparseLocalSGDTrainerV1(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1,
                local_sgd_interval=opt.local_sgd_interval)
        elif opt.trainer == 'ersgd':
            trainer = ERSGDTrainerV2(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1)
        elif opt.trainer == 'partiallocalsgd':
            trainer = PartialLocalSGDTrainerV1(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1,
                local_sgd_interval=opt.local_sgd_interval)
        elif opt.trainer == 'ersgd2':
            trainer = ERSGD2TrainerV2(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio_1=1./opt.input_sparse_1, 
                output_sparse_ratio_1=1./opt.output_sparse_1, 
                layer_sparse_ratio_1=1./opt.layer_sparse_1,
                input_sparse_ratio_2=1./opt.input_sparse_2, 
                output_sparse_ratio_2=1./opt.output_sparse_2, 
                layer_sparse_ratio_2=1./opt.layer_sparse_2,
                local_sgd_interval=opt.local_sgd_interval)
        else:
            trainer = SGDTrainer(
                net.collect_params(),  
                optimizer, optimizer_params)

        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature,
                                                                 hard_weight=opt.hard_weight,
                                                                 sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

        best_val_score = 1

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            if opt.use_rec:
                train_data.reset()
            # train_metric.reset()
            train_loss = 0
            btic = time.time()

            # test speed
            if opt.test_speed > 0:
                n_repeats = opt.test_speed
            elif opt.test_speed == 0:
                n_repeats = 1
            else:
                n_repeats = 0

            for i, batch in enumerate(train_data):
                
                # test speed
                if n_repeats == 0 and not (i+1)%opt.log_interval:
                    print('[Epoch %d] # batch: %d'%(epoch, i))
                    continue

                data, label = batch_fn(batch, ctx)

                for j in range(n_repeats):

                    if opt.mixup:
                        lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                        if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                            lam = 1
                        data = [lam*X + (1-lam)*X[::-1] for X in data]

                        if opt.label_smoothing:
                            eta = 0.1
                        else:
                            eta = 0.0
                        label = mixup_transform(label, classes, lam, eta)

                    elif opt.label_smoothing:
                        hard_label = label
                        label = smooth(label, classes)

                    if distillation:
                        teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \
                                        for X in data]

                    with ag.record():
                        outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                        if distillation:
                            loss = [L(yhat.astype('float32', copy=False),
                                    y.astype('float32', copy=False),
                                    p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)]
                        else:
                            loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
                    for l in loss:
                        l.backward()
                    trainer.step(batch_size)

                    # if opt.mixup:
                    #     output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                    #                     for out in outputs]
                    #     train_metric.update(label, output_softmax)
                    # else:
                    #     if opt.label_smoothing:
                    #         train_metric.update(hard_label, outputs)
                    #     else:
                    #         train_metric.update(label, outputs)

                    step_loss = sum([l.sum().asscalar() for l in loss])

                    train_loss += step_loss

                    if opt.log_interval and not (i+j+1)%opt.log_interval:
                        # train_metric_name, train_metric_score = train_metric.get()
                        if hvd.rank() == 0:
                            # logger.info('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            #             epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            #             train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                            # print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            #             epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            #             train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                            print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                                        epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                                        'loss', step_loss/batch_size, trainer.learning_rate, trainer._comm_counter/1e6))
                        btic = time.time()

            mx.nd.waitall()
            toc = time.time()

            if n_repeats == 0:
                allreduce_array_nd = mx.nd.array([i])
                hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
                mx.nd.waitall()
                print('[Epoch %d] # total batch: %d'%(epoch, i))
                continue

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i /(toc - tic) * hvd.size())

            train_loss /= (batch_size * i)

            if opt.trainer in ('ersgd', 'qsparselocalsgd', 'ersgd2', 'partiallocalsgd'):
                allreduce_for_val = True
            else:
                allreduce_for_val = False

            if allreduce_for_val:
                trainer.pre_test()
            # err_train_tic = time.time()
            # err_top1_train, err_top5_train = test(ctx, train_data, val=False)
            err_val_tic = time.time()
            err_top1_val, err_top5_val = test(ctx, val_data, val=True)
            err_val_toc = time.time()
            if allreduce_for_val:
                trainer.post_test()

            mx.nd.waitall()

            # allreduce the results
            allreduce_array_nd = mx.nd.array([train_loss, err_top1_val, err_top5_val])
            hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
            allreduce_array_np = allreduce_array_nd.asnumpy()
            train_loss = float(allreduce_array_np[0])
            err_top1_val = float(allreduce_array_np[1])
            err_top5_val = float(allreduce_array_np[2])

            if hvd.rank() == 0:
                # logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score))
                logger.info('[Epoch %d] training: loss=%f'%(epoch, train_loss))
                logger.info('[Epoch %d] speed: %d samples/sec training-time: %f comm: %f'%(epoch, throughput, toc-tic, trainer._comm_counter/1e6))
                logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f err-time=%f'%(epoch, err_top1_val, err_top5_val, err_val_toc - err_val_tic))
                trainer._comm_counter = 0

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                # if hvd.local_rank() == 0:
                #     net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))
                #     trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states'%(save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
                if hvd.local_rank() == 0:
                    net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, epoch))
                    trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, epoch))
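Note: the end-of-epoch bookkeeping above packs the per-worker scalars into one NDArray and averages them with a single in-place allreduce. A hedged standalone sketch of that idiom (the metric values are made up):

import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
# loss, top-1 error, top-5 error measured locally on this worker
local_metrics = mx.nd.array([0.42, 0.11, 0.05])
hvd.allreduce_(local_metrics, name='epoch_metrics', average=True)
train_loss, err_top1, err_top5 = local_metrics.asnumpy()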
Code example #24
def main():
    opt = parse_args()

    hvd.init()

    logging_file = 'train_imagenet_%s.log' % (opt.trainer)

    filehandler = logging.FileHandler(logging_file)
    streamhandler = logging.StreamHandler()

    logger = logging.getLogger('')
    logger.setLevel(logging.INFO)
    logger.addHandler(filehandler)
    logger.addHandler(streamhandler)

    if hvd.rank() == 0:
        logger.info(opt)

    batch_size = opt.batch_size
    classes = 1000
    num_training_samples = 1281167

    context = [mx.gpu(hvd.local_rank())]
    num_workers = opt.num_workers

    optimizer = opt.optimizer

    warmup_epochs = opt.warmup_epochs

    lr_decay = opt.lr_decay
    lr_decay_period = opt.lr_decay_period
    if opt.lr_decay_period > 0:
        lr_decay_epoch = list(range(lr_decay_period, opt.num_epochs, lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in opt.lr_decay_epoch.split(',')]
    lr_decay_epoch = [e - warmup_epochs for e in lr_decay_epoch]
    num_batches = num_training_samples // (batch_size * hvd.size())

    lr_scheduler = LRSequential([
        LRScheduler('linear', base_lr=opt.warmup_lr, target_lr=opt.lr,
                    nepochs=warmup_epochs, iters_per_epoch=num_batches),
        LRScheduler(opt.lr_mode, base_lr=opt.lr, target_lr=0,
                    nepochs=opt.num_epochs - warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=lr_decay, power=2)
    ])


    model_name = opt.model

    kwargs = {'ctx': context, 'pretrained': opt.use_pretrained, 'classes': classes}
    if opt.use_gn:
        kwargs['norm_layer'] = gcv.nn.GroupNorm
    if model_name.startswith('vgg'):
        kwargs['batch_norm'] = opt.batch_norm
    elif model_name.startswith('resnext'):
        kwargs['use_se'] = opt.use_se

    if opt.last_gamma:
        kwargs['last_gamma'] = True

    optimizer_params = {'wd': opt.wd, 'momentum': opt.momentum, 'lr_scheduler': lr_scheduler}
    if opt.dtype != 'float32':
        optimizer_params['multi_precision'] = True

    net = get_model(model_name, **kwargs)
    net.cast(opt.dtype)
    if opt.resume_params != '':
        net.load_parameters(opt.resume_params, ctx = context)

    # teacher model for distillation training
    if opt.teacher is not None and opt.hard_weight < 1.0:
        teacher_name = opt.teacher
        teacher = get_model(teacher_name, pretrained=True, classes=classes, ctx=context)
        teacher.cast(opt.dtype)
        distillation = True
    else:
        distillation = False

    # Two functions for reading data from record file or raw images
    def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers):
        rec_train = os.path.expanduser(rec_train)
        rec_train_idx = os.path.expanduser(rec_train_idx)
        rec_val = os.path.expanduser(rec_val)
        rec_val_idx = os.path.expanduser(rec_val_idx)
        jitter_param = 0.4
        lighting_param = 0.1
        input_size = opt.input_size
        crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))
        mean_rgb = [123.68, 116.779, 103.939]
        std_rgb = [58.393, 57.12, 57.375]

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            return data, label

        train_data = mx.io.ImageRecordIter(
            path_imgrec         = rec_train,
            path_imgidx         = rec_train_idx,
            preprocess_threads  = num_workers,
            shuffle             = True,
            batch_size          = batch_size,
            round_batch         = False,
            data_shape          = (3, input_size, input_size),
            mean_r              = mean_rgb[0],
            mean_g              = mean_rgb[1],
            mean_b              = mean_rgb[2],
            std_r               = std_rgb[0],
            std_g               = std_rgb[1],
            std_b               = std_rgb[2],
            rand_mirror         = True,
            random_resized_crop = True,
            max_aspect_ratio    = 4. / 3.,
            min_aspect_ratio    = 3. / 4.,
            max_random_area     = 1,
            min_random_area     = 0.08,
            brightness          = jitter_param,
            saturation          = jitter_param,
            contrast            = jitter_param,
            pca_noise           = lighting_param,
            num_parts           = hvd.size(),
            part_index          = hvd.rank(),
        )
        val_data = mx.io.ImageRecordIter(
            path_imgrec         = rec_val,
            path_imgidx         = rec_val_idx,
            preprocess_threads  = num_workers,
            shuffle             = False,
            batch_size          = batch_size,

            resize              = resize,
            data_shape          = (3, input_size, input_size),
            mean_r              = mean_rgb[0],
            mean_g              = mean_rgb[1],
            mean_b              = mean_rgb[2],
            std_r               = std_rgb[0],
            std_g               = std_rgb[1],
            std_b               = std_rgb[2],
        )
        return train_data, val_data, batch_fn

    def get_data_loader(data_dir, batch_size, num_workers):
        normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        jitter_param = 0.4
        lighting_param = 0.1
        input_size = opt.input_size
        crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            return data, label

        transform_train = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomFlipLeftRight(),
            transforms.RandomColorJitter(brightness=jitter_param, contrast=jitter_param,
                                        saturation=jitter_param),
            transforms.RandomLighting(lighting_param),
            transforms.ToTensor(),
            normalize
        ])
        transform_test = transforms.Compose([
            transforms.Resize(resize, keep_ratio=True),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            normalize
        ])

        train_data = gluon.data.DataLoader(
            imagenet.classification.ImageNet(data_dir, train=True).transform_first(transform_train),
            batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)
        val_data = gluon.data.DataLoader(
            imagenet.classification.ImageNet(data_dir, train=False).transform_first(transform_test),
            batch_size=batch_size, shuffle=False, num_workers=num_workers)

        return train_data, val_data, batch_fn

    if opt.use_rec:
        train_data, val_data, batch_fn = get_data_rec(opt.rec_train, opt.rec_train_idx,
                                                    opt.rec_val, opt.rec_val_idx,
                                                    batch_size, num_workers)
    else:
        train_data, val_data, batch_fn = get_data_loader(opt.data_dir, batch_size, num_workers)

    if opt.mixup:
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()
    acc_top1 = mx.metric.Accuracy()
    acc_top5 = mx.metric.TopKAccuracy(5)

    save_frequency = opt.save_frequency
    if opt.save_dir and save_frequency:
        save_dir = opt.save_dir
        makedirs(save_dir)
    else:
        save_dir = ''
        save_frequency = 0

    def mixup_transform(label, classes, lam=1, eta=0.0):
        if isinstance(label, nd.NDArray):
            label = [label]
        res = []
        for l in label:
            y1 = l.one_hot(classes, on_value = 1 - eta + eta/classes, off_value = eta/classes)
            y2 = l[::-1].one_hot(classes, on_value = 1 - eta + eta/classes, off_value = eta/classes)
            res.append(lam*y1 + (1-lam)*y2)
        return res

    def smooth(label, classes, eta=0.1):
        if isinstance(label, nd.NDArray):
            label = [label]
        smoothed = []
        for l in label:
            res = l.one_hot(classes, on_value = 1 - eta + eta/classes, off_value = eta/classes)
            smoothed.append(res)
        return smoothed

    def test(ctx, val_data, val=True):
        if opt.use_rec:
            if val:
                val_data.reset()
            else:
                train_data.reset()
        acc_top1.reset()
        acc_top5.reset()
        for i, batch in enumerate(val_data):
            data, label = batch_fn(batch, ctx)
            outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
            acc_top1.update(label, outputs)
            acc_top5.update(label, outputs)

        _, top1 = acc_top1.get()
        _, top5 = acc_top5.get()
        return (1-top1, 1-top5)

    def train(ctx):
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        if opt.resume_params == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        hvd.broadcast_parameters(net.collect_params(), root_rank=0)

        # trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

        # trainer = hvd.DistributedTrainer(
        #     net.collect_params(),  
        #     optimizer,
        #     optimizer_params)

        if opt.trainer == 'sgd':
            trainer = SGDTrainer(
                net.collect_params(),  
                optimizer, optimizer_params)
        elif opt.trainer == 'efsgd':
            trainer = EFSGDTrainerV1(
                net.collect_params(),  
                'EFSGDV1', optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1)
        elif opt.trainer == 'qsparselocalsgd':
            trainer = QSparseLocalSGDTrainerV1(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1,
                local_sgd_interval=opt.local_sgd_interval)
        elif opt.trainer == 'ersgd':
            trainer = ERSGDTrainerV2(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1)
        elif opt.trainer == 'partiallocalsgd':
            trainer = PartialLocalSGDTrainerV1(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio=1./opt.input_sparse_1, 
                output_sparse_ratio=1./opt.output_sparse_1, 
                layer_sparse_ratio=1./opt.layer_sparse_1,
                local_sgd_interval=opt.local_sgd_interval)
        elif opt.trainer == 'ersgd2':
            trainer = ERSGD2TrainerV2(
                net.collect_params(),  
                optimizer, optimizer_params, 
                input_sparse_ratio_1=1./opt.input_sparse_1, 
                output_sparse_ratio_1=1./opt.output_sparse_1, 
                layer_sparse_ratio_1=1./opt.layer_sparse_1,
                input_sparse_ratio_2=1./opt.input_sparse_2, 
                output_sparse_ratio_2=1./opt.output_sparse_2, 
                layer_sparse_ratio_2=1./opt.layer_sparse_2,
                local_sgd_interval=opt.local_sgd_interval)
        else:
            trainer = SGDTrainer(
                net.collect_params(),  
                optimizer, optimizer_params)

        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        if opt.label_smoothing or opt.mixup:
            sparse_label_loss = False
        else:
            sparse_label_loss = True
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature,
                                                                 hard_weight=opt.hard_weight,
                                                                 sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

        best_val_score = 1

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            if opt.use_rec:
                train_data.reset()
            # train_metric.reset()
            train_loss = 0
            btic = time.time()

            # test speed
            if opt.test_speed > 0:
                n_repeats = opt.test_speed
            elif opt.test_speed == 0:
                n_repeats = 1
            else:
                n_repeats = 0

            for i, batch in enumerate(train_data):
                
                # test speed
                if n_repeats == 0 and not (i+1)%opt.log_interval:
                    print('[Epoch %d] # batch: %d'%(epoch, i))
                    continue

                data, label = batch_fn(batch, ctx)

                for j in range(n_repeats):

                    if opt.mixup:
                        lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                        if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                            lam = 1
                        data = [lam*X + (1-lam)*X[::-1] for X in data]

                        if opt.label_smoothing:
                            eta = 0.1
                        else:
                            eta = 0.0
                        label = mixup_transform(label, classes, lam, eta)

                    elif opt.label_smoothing:
                        hard_label = label
                        label = smooth(label, classes)

                    if distillation:
                        teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature) \
                                        for X in data]

                    with ag.record():
                        outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                        if distillation:
                            loss = [L(yhat.astype('float32', copy=False),
                                    y.astype('float32', copy=False),
                                    p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)]
                        else:
                            loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
                    for l in loss:
                        l.backward()
                    trainer.step(batch_size)

                    # if opt.mixup:
                    #     output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                    #                     for out in outputs]
                    #     train_metric.update(label, output_softmax)
                    # else:
                    #     if opt.label_smoothing:
                    #         train_metric.update(hard_label, outputs)
                    #     else:
                    #         train_metric.update(label, outputs)

                    step_loss = sum([l.sum().asscalar() for l in loss])

                    train_loss += step_loss

                    if opt.log_interval and not (i+j+1)%opt.log_interval:
                        # train_metric_name, train_metric_score = train_metric.get()
                        if hvd.rank() == 0:
                            # logger.info('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            #             epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            #             train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                            # print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                            #             epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                            #             train_metric_name, train_metric_score, trainer.learning_rate, trainer._comm_counter/1e6))
                            print('Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f lr=%f comm=%f'%(
                                        epoch, i, batch_size*hvd.size()*opt.log_interval/(time.time()-btic),
                                        'loss', step_loss/batch_size, trainer.learning_rate, trainer._comm_counter/1e6))
                        btic = time.time()

            mx.nd.waitall()
            toc = time.time()

            if n_repeats == 0:
                allreduce_array_nd = mx.nd.array([i])
                hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
                mx.nd.waitall()
                print('[Epoch %d] # total batch: %d'%(epoch, i))
                continue

            train_metric_name, train_metric_score = train_metric.get()
            throughput = int(batch_size * i / (toc - tic) * hvd.size())

            train_loss /= (batch_size * i)

            allreduce_for_val = opt.trainer in (
                'ersgd', 'qsparselocalsgd', 'ersgd2', 'partiallocalsgd')

            if allreduce_for_val:
                trainer.pre_test()
            # err_train_tic = time.time()
            # err_top1_train, err_top5_train = test(ctx, train_data, val=False)
            err_val_tic = time.time()
            err_top1_val, err_top5_val = test(ctx, val_data, val=True)
            err_val_toc = time.time()
            if allreduce_for_val:
                trainer.post_test()

            mx.nd.waitall()

            # allreduce the results
            allreduce_array_nd = mx.nd.array([train_loss, err_top1_val, err_top5_val])
            hvd.allreduce_(allreduce_array_nd, name='allreduce_array', average=True)
            allreduce_array_np = allreduce_array_nd.asnumpy()
            # np.asscalar was removed from recent NumPy; float() is equivalent.
            train_loss = float(allreduce_array_np[0])
            err_top1_val = float(allreduce_array_np[1])
            err_top5_val = float(allreduce_array_np[2])

            if hvd.rank() == 0:
                # logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score))
                logger.info('[Epoch %d] training: loss=%f'%(epoch, train_loss))
                logger.info('[Epoch %d] speed: %d samples/sec training-time: %f comm: %f'%(epoch, throughput, toc-tic, trainer._comm_counter/1e6))
                logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f err-time=%f'%(epoch, err_top1_val, err_top5_val, err_val_toc - err_val_tic))
                trainer._comm_counter = 0

            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                # if hvd.local_rank() == 0:
                #     net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))
                #     trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states'%(save_dir, best_val_score, model_name, epoch))

            if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
                if hvd.local_rank() == 0:
                    net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, epoch))
                    trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, epoch))

        # if save_frequency and save_dir:
        #     if hvd.local_rank() == 0:
        #         net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, opt.num_epochs-1))
        #         trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, opt.num_epochs-1))


    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
        if distillation:
            teacher.hybridize(static_alloc=True, static_shape=True)
    train(context)
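
The loop above relies on a `smooth` helper (and a `mixup_transform` counterpart) that is not part of this excerpt. A minimal sketch of the label-smoothing helper, assuming, as in the GluonCV ImageNet scripts, that `label` is a list of integer class-index NDArrays and `classes` is the class count:

from mxnet import nd

def smooth(label, classes, eta=0.1):
    # One-hot encode the hard labels, then move `eta` of the probability
    # mass off the true class and spread it uniformly over all classes.
    if isinstance(label, nd.NDArray):
        label = [label]
    return [l.one_hot(classes, on_value=1 - eta + eta / classes,
                      off_value=eta / classes) for l in label]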
Code example #25
    def get_data_rec(rec_train, rec_train_idx, rec_val, rec_val_idx, batch_size, num_workers):
        rec_train = os.path.expanduser(rec_train)
        rec_train_idx = os.path.expanduser(rec_train_idx)
        rec_val = os.path.expanduser(rec_val)
        rec_val_idx = os.path.expanduser(rec_val_idx)
        jitter_param = 0.4
        lighting_param = 0.1
        input_size = opt.input_size
        crop_ratio = opt.crop_ratio if opt.crop_ratio > 0 else 0.875
        resize = int(math.ceil(input_size / crop_ratio))
        mean_rgb = [123.68, 116.779, 103.939]
        std_rgb = [58.393, 57.12, 57.375]

        def batch_fn(batch, ctx):
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            return data, label

        train_data = mx.io.ImageRecordIter(
            path_imgrec         = rec_train,
            path_imgidx         = rec_train_idx,
            preprocess_threads  = num_workers,
            shuffle             = True,
            batch_size          = batch_size,
            round_batch         = False,
            data_shape          = (3, input_size, input_size),
            mean_r              = mean_rgb[0],
            mean_g              = mean_rgb[1],
            mean_b              = mean_rgb[2],
            std_r               = std_rgb[0],
            std_g               = std_rgb[1],
            std_b               = std_rgb[2],
            rand_mirror         = True,
            random_resized_crop = True,
            max_aspect_ratio    = 4. / 3.,
            min_aspect_ratio    = 3. / 4.,
            max_random_area     = 1,
            min_random_area     = 0.08,
            brightness          = jitter_param,
            saturation          = jitter_param,
            contrast            = jitter_param,
            pca_noise           = lighting_param,
            num_parts           = hvd.size(),
            part_index          = hvd.rank(),
        )
        val_data = mx.io.ImageRecordIter(
            path_imgrec         = rec_val,
            path_imgidx         = rec_val_idx,
            preprocess_threads  = num_workers,
            shuffle             = False,
            batch_size          = batch_size,
            resize              = resize,
            data_shape          = (3, input_size, input_size),
            mean_r              = mean_rgb[0],
            mean_g              = mean_rgb[1],
            mean_b              = mean_rgb[2],
            std_r               = std_rgb[0],
            std_g               = std_rgb[1],
            std_b               = std_rgb[2],
        )
        return train_data, val_data, batch_fn
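
A hedged usage sketch for `get_data_rec` (the .rec/.idx paths and values below are placeholders): because `num_parts=hvd.size()` and `part_index=hvd.rank()`, each Horovod worker reads only its own shard of the training record file.

# Hypothetical paths and sizes, for illustration only.
train_data, val_data, batch_fn = get_data_rec(
    '~/data/train.rec', '~/data/train.idx',
    '~/data/val.rec', '~/data/val.idx',
    batch_size=128, num_workers=4)

for batch in train_data:
    data, label = batch_fn(batch, ctx=[mx.gpu(hvd.local_rank())])
    break  # one batch shown; training iterates over the full shard
train_data.reset()  # record iterators must be reset between epochs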
Code example #26
    # use sync bn if specified
    if args.syncbn and len(ctx) > 1:
        net = get_model(net_name,
                        pretrained_base=True,
                        norm_layer=gluon.contrib.nn.SyncBatchNorm,
                        norm_kwargs={'num_devices': len(ctx)})
        async_net = get_model(net_name,
                              pretrained_base=False)  # used by cpu worker
    else:
        net = get_model(net_name, pretrained_base=True)
        async_net = net
    if args.resume.strip():
        net.load_parameters(args.resume.strip())
        async_net.load_parameters(args.resume.strip())
    else:
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            net.initialize()
            async_net.initialize()

    # training data
    batch_size = (args.batch_size //
                  hvd.size()) if args.horovod else args.batch_size
    train_dataset, val_dataset, eval_metric = get_dataset(args.dataset, args)
    train_data, val_data = get_dataloader(async_net, train_dataset,
                                          val_dataset, args.data_shape,
                                          batch_size, args.num_workers, args)

    # training
    train(net, train_data, val_data, eval_metric, ctx, args)
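
Note the batch-size convention above: with `--horovod`, `args.batch_size` is treated as the global batch size and divided by `hvd.size()`. A small worked example with assumed values:

global_batch = 256                      # args.batch_size (assumed)
n_workers = 8                           # hvd.size() (assumed)
per_worker = global_batch // n_workers  # 32 samples per worker per step
# After the gradient allreduce, one optimizer step still reflects
# all 256 samples across the 8 workers.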
Code example #27
args = parser.parse_args()

# logging
level = logging.DEBUG if args.verbose else logging.INFO
logging.getLogger().setLevel(level)
logging.info(args)
os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'

try:
    import horovod.mxnet as hvd
except ImportError:
    logging.error('horovod must be installed.')
    exit()
hvd.init()
store = None
num_workers = hvd.size()
rank = hvd.rank()
local_rank = hvd.local_rank()
is_master_node = rank == local_rank
if not args.use_avg_len and hvd.size() > 1:
    logging.info('Specifying --use-avg-len and setting --batch_size with the '
                 'target number of tokens would help improve training throughput.')

def train(data_train, data_eval, model, nsp_loss, mlm_loss, vocab_size, ctx):
    """Training function."""
    hvd.broadcast_parameters(model.collect_params(), root_rank=0)

    mlm_metric = nlp.metric.MaskedAccuracy()
    nsp_metric = nlp.metric.MaskedAccuracy()
    mlm_metric.reset()
    nsp_metric.reset()
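
`MaskedAccuracy` scores only the positions where the mask is set, which is what masked-LM and NSP evaluation need. A hedged, self-contained illustration:

import mxnet as mx
import gluonnlp as nlp

metric = nlp.metric.MaskedAccuracy()
preds = mx.nd.array([[0.2, 0.8], [0.9, 0.1], [0.4, 0.6]])  # per-position scores
labels = mx.nd.array([1, 0, 0])
masks = mx.nd.array([1, 1, 0])  # the third position is excluded
metric.update([labels], [preds], [masks])
print(metric.get())  # accuracy over the two unmasked positions -> 1.0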
Code example #28
                    default=0,
                    help='frequency of model saving (default: 0)')
parser.add_argument(
    '--gradient-predivide-factor',
    type=float,
    default=1.0,
    help='apply gradient predivide factor in optimizer (default: 1.0)')

args = parser.parse_args()

logging.basicConfig(level=logging.INFO)
logging.info(args)

# Horovod: initialize Horovod
hvd.init()
num_workers = hvd.size()
rank = hvd.rank()
local_rank = hvd.local_rank()

num_classes = 1000
num_training_samples = 1281167
batch_size = args.batch_size
epoch_size = \
    int(math.ceil(int(num_training_samples // num_workers) / batch_size))

if args.lr_mode == 'step':
    lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    steps = [epoch_size * x for x in lr_decay_epoch]
    lr_sched = lr_scheduler.MultiFactorScheduler(
        step=steps,
        factor=args.lr_decay,
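
Completing the picture (hedged; the keyword values are assumptions): the scheduler is handed to the optimizer through its `lr_scheduler` argument, and the base learning rate is scaled linearly with the worker count, as is conventional for synchronous data parallelism.

# Assumes `import mxnet as mx` plus the `epoch_size`, `num_workers`, and
# `args` definitions above; the concrete numbers are placeholders.
lr_sched = lr_scheduler.MultiFactorScheduler(
    step=[30 * epoch_size, 60 * epoch_size, 80 * epoch_size],
    factor=0.1,
    base_lr=args.lr * num_workers,  # linear scaling rule
    warmup_steps=5 * epoch_size)    # optional warmup
opt = mx.optimizer.SGD(momentum=0.9, wd=1e-4, lr_scheduler=lr_sched)
opt = hvd.DistributedOptimizer(opt)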
Code example #29
File: train_mask_rcnn.py Project: ygest/gluon-cv
    if args.horovod:
        ctx = [mx.gpu(hvd.local_rank())]
    else:
        ctx = [mx.gpu(int(i)) for i in args.gpus.split(',') if i.strip()]
        ctx = ctx if ctx else [mx.cpu()]

    # network
    kwargs = {}
    module_list = []
    if args.use_fpn:
        module_list.append('fpn')
    if args.norm_layer is not None:
        module_list.append(args.norm_layer)
        if args.norm_layer == 'bn':
            kwargs['num_devices'] = len(ctx)
    num_gpus = hvd.size() if args.horovod else len(ctx)
    net_name = '_'.join(('mask_rcnn', *module_list, args.network, args.dataset))
    if args.custom_model:
        args.use_fpn = True
        net_name = '_'.join(('mask_rcnn_fpn', args.network, args.dataset))
        if args.norm_layer == 'bn':
            norm_layer = gluon.contrib.nn.SyncBatchNorm
            norm_kwargs = {'num_devices': len(ctx)}
            sym_norm_layer = mx.sym.contrib.SyncBatchNorm
            sym_norm_kwargs = {'ndev': len(ctx)}
        elif args.norm_layer == 'gn':
            norm_layer = gluon.nn.GroupNorm
            norm_kwargs = {'groups': 8}
            sym_norm_layer = mx.sym.GroupNorm
            sym_norm_kwargs = {'groups': 8}
        else:
            # Default branch (not shown in this excerpt; it mirrors the
            # equivalent Faster R-CNN code): plain BatchNorm, no symbolic
            # norm layer.
            norm_layer = gluon.nn.BatchNorm
            norm_kwargs = None
            sym_norm_layer = None
            sym_norm_kwargs = None
Code example #30
# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import json
import os
import horovod.mxnet as hvd

hvd.init()

with open(
        os.path.join('/opt/ml/model/local-rank-%s-rank-%s' %
                     (hvd.local_rank(), hvd.rank())), 'w+') as f:
    basic_info = {
        'local-rank': hvd.local_rank(),
        'rank': hvd.rank(),
        'size': hvd.size()
    }

    print(basic_info)
    json.dump(basic_info, f)
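
With N processes (for example launched via `horovodrun -np N`), N such files appear, one per rank. A hedged sketch that reads them back:

import glob
import json

for path in sorted(glob.glob('/opt/ml/model/local-rank-*-rank-*')):
    with open(path) as f:
        print(path, json.load(f))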
Code example #31
                           stride=(2, 2))
    # first fully connected layer
    flatten = mx.sym.flatten(data=pool2)
    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=50)
    relu3 = mx.sym.Activation(data=fc1, act_type='relu')
    # second fully connected layer
    fc2 = mx.sym.FullyConnected(data=relu3, num_hidden=10)
    # softmax loss
    loss = mx.sym.SoftmaxOutput(data=fc2, name='softmax')
    return loss


# Step 4: fit the model
net = conv_net()
model = mx.mod.Module(symbol=net, context=context)
optimizer_params = {'learning_rate': args.lr * hvd.size(),
                    'rescale_grad': 1.0 / args.batch_size}
opt = mx.optimizer.create('sgd', **optimizer_params)

# Horovod: wrap optimizer with DistributedOptimizer
opt = hvd.DistributedOptimizer(opt)

initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
                             magnitude=2)
model.bind(data_shapes=train_iter.provide_data,
           label_shapes=train_iter.provide_label)
model.init_params(initializer)

# Horovod: fetch and broadcast parameters
(arg_params, aux_params) = model.get_params()
# Continuation assumed from the standard Horovod MXNet example:
if arg_params is not None:
    hvd.broadcast_parameters(arg_params, root_rank=0)
if aux_params is not None:
    hvd.broadcast_parameters(aux_params, root_rank=0)
model.set_params(arg_params=arg_params, aux_params=aux_params)
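
A hedged sketch of the training call that typically follows, mirroring the standard Horovod MXNet MNIST example (`args.epochs` is an assumption):

model.fit(train_iter,
          kvstore=None,          # Horovod handles the gradient exchange
          optimizer=opt,
          eval_metric='acc',
          num_epoch=args.epochs)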
Code example #32
    def _init_network(self):
        if not self.num_class:
            raise ValueError(
                'Unable to create network when `num_class` is unknown. \
                It should be inferred from dataset or resumed from saved states.'
            )
        assert len(self.classes) == self.num_class

        # training contexts
        if self._cfg.horovod:
            self.ctx = [mx.gpu(hvd.local_rank())]
        else:
            ctx = [mx.gpu(int(i)) for i in self._cfg.gpus]
            self.ctx = ctx if ctx else [mx.cpu()]
        # network
        kwargs = {}
        module_list = []
        if self._cfg.faster_rcnn.use_fpn:
            module_list.append('fpn')
        if self._cfg.faster_rcnn.norm_layer is not None:
            module_list.append(self._cfg.faster_rcnn.norm_layer)
            if self._cfg.faster_rcnn.norm_layer == 'syncbn':
                kwargs['num_devices'] = len(self.ctx)

        self.num_gpus = hvd.size() if self._cfg.horovod else len(self.ctx)

        if self._cfg.faster_rcnn.transfer is not None:
            assert isinstance(self._cfg.faster_rcnn.transfer, str)
            self._logger.info(
                f'Using transfer learning from {self._cfg.faster_rcnn.transfer}, '
                + 'the other network parameters are ignored.')
            self._cfg.faster_rcnn.use_fpn = 'fpn' in self._cfg.faster_rcnn.transfer
            self.net = get_model(
                self._cfg.faster_rcnn.transfer,
                pretrained=True,
                per_device_batch_size=self._cfg.train.batch_size //
                self.num_gpus,
                **kwargs)
            self.net.reset_class(self.classes,
                                 reuse_weights=[
                                     cname for cname in self.classes
                                     if cname in self.net.classes
                                 ])
        else:
            self._cfg.faster_rcnn.use_fpn = True
            if self._cfg.faster_rcnn.norm_layer == 'syncbn':
                norm_layer = gluon.contrib.nn.SyncBatchNorm
                norm_kwargs = {'num_devices': len(self.ctx)}
                sym_norm_layer = mx.sym.contrib.SyncBatchNorm
                sym_norm_kwargs = {'ndev': len(self.ctx)}
            elif self._cfg.faster_rcnn.norm_layer == 'gn':
                norm_layer = gluon.nn.GroupNorm
                norm_kwargs = {'groups': 8}
                sym_norm_layer = mx.sym.GroupNorm
                sym_norm_kwargs = {'groups': 8}
            else:
                norm_layer = gluon.nn.BatchNorm
                norm_kwargs = None
                sym_norm_layer = None
                sym_norm_kwargs = None
            self.net = get_model(
                'custom_faster_rcnn_fpn',
                classes=self.classes,
                transfer=None,
                dataset=self._cfg.dataset,
                pretrained_base=self._cfg.train.pretrained_base,
                base_network_name=self._cfg.faster_rcnn.backbone,
                norm_layer=norm_layer,
                norm_kwargs=norm_kwargs,
                sym_norm_layer=sym_norm_layer,
                sym_norm_kwargs=sym_norm_kwargs,
                num_fpn_filters=self._cfg.faster_rcnn.num_fpn_filters,
                num_box_head_conv=self._cfg.faster_rcnn.num_box_head_conv,
                num_box_head_conv_filters=self._cfg.faster_rcnn.
                num_box_head_conv_filters,
                num_box_head_dense_filters=self._cfg.faster_rcnn.
                num_box_head_dense_filters,
                short=self._cfg.faster_rcnn.image_short,
                max_size=self._cfg.faster_rcnn.image_max_size,
                min_stage=2,
                max_stage=6,
                nms_thresh=self._cfg.faster_rcnn.nms_thresh,
                nms_topk=self._cfg.faster_rcnn.nms_topk,
                roi_mode=self._cfg.faster_rcnn.roi_mode,
                roi_size=self._cfg.faster_rcnn.roi_size,
                strides=self._cfg.faster_rcnn.strides,
                clip=self._cfg.faster_rcnn.clip,
                rpn_channel=self._cfg.faster_rcnn.rpn_channel,
                base_size=self._cfg.faster_rcnn.anchor_base_size,
                scales=self._cfg.faster_rcnn.anchor_scales,
                ratios=self._cfg.faster_rcnn.anchor_aspect_ratio,
                alloc_size=self._cfg.faster_rcnn.anchor_alloc_size,
                rpn_nms_thresh=self._cfg.faster_rcnn.rpn_nms_thresh,
                rpn_train_pre_nms=self._cfg.train.rpn_train_pre_nms,
                rpn_train_post_nms=self._cfg.train.rpn_train_post_nms,
                rpn_test_pre_nms=self._cfg.valid.rpn_test_pre_nms,
                rpn_test_post_nms=self._cfg.valid.rpn_test_post_nms,
                rpn_min_size=self._cfg.train.rpn_min_size,
                per_device_batch_size=self._cfg.train.batch_size //
                self.num_gpus,
                num_sample=self._cfg.train.rcnn_num_samples,
                pos_iou_thresh=self._cfg.train.rcnn_pos_iou_thresh,
                pos_ratio=self._cfg.train.rcnn_pos_ratio,
                max_num_gt=self._cfg.faster_rcnn.max_num_gt)

        if self._cfg.resume.strip():
            self.net.load_parameters(self._cfg.resume.strip())
        else:
            for param in self.net.collect_params().values():
                if param._data is not None:
                    continue
                param.initialize()
        self.net.collect_params().reset_ctx(self.ctx)
        if self._cfg.faster_rcnn.amp:
            # Cast both weights and gradients to 'float16'
            self.net.cast('float16')
            # These layers don't support type 'float16'
            self.net.collect_params('.*batchnorm.*').setattr(
                'dtype', 'float32')
            self.net.collect_params(
                '.*normalizedperclassboxcenterencoder.*').setattr(
                    'dtype', 'float32')
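
The float16/float32 split above is the usual manual mixed-precision pattern: cast the whole network down, then pin numerically sensitive layers (here the batch-norm parameters) back to float32. A hedged toy illustration:

from mxnet import gluon

toy = gluon.nn.HybridSequential()
toy.add(gluon.nn.Dense(16), gluon.nn.BatchNorm(), gluon.nn.Dense(4))
toy.initialize()
toy.cast('float16')  # weights now stored in half precision
toy.collect_params('.*batchnorm.*').setattr('dtype', 'float32')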