Esempio n. 1
0
def all_reduce(tensor, op='SUM', group=None):
    """Apply an all-reduce collective to the tensor(s) within a group.

    Parameters
    ----------
    tensor : Sequence[dragon.vm.torch.Tensor]
        The tensor(s) to reduce.
    op : {'SUM', 'MEAN'}, optional
        The reduce operation.
    group : ProcessGroup, optional
        The group for communication. When omitted, the value of
        ``distributed.get_group()`` is used instead.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    Raises
    ------
    ValueError
        If no group can be resolved, or ``op`` is not a supported operation.

    """
    # Fall back to the current default group when none is passed.
    group = distributed.get_group() if group is None else group
    if group is None:
        raise ValueError('<group> is required.')
    supported_ops = ('MEAN', 'SUM')
    if op not in supported_ops:
        raise ValueError('Unsupported reduce op: ' + op)
    flat_tensors = nest.flatten(tensor)
    # The device of the first tensor selects where the function is created.
    collective = _functions.Collective.instantiate(
        flat_tensors[0].device,
        operation=op,
        communication='ALLREDUCE',
        group=group,
    )
    return collective.apply(flat_tensors)
Esempio n. 2
0
def broadcast(inputs, root=0, group=None, **kwargs):
    """Broadcast the input from the root node to all nodes in a group.

    Parameters
    ----------
    inputs : dragon.Tensor
        The tensor to broadcast.
    root : int, optional, default=0
        The node index in the group.
    group : ProcessGroup, optional
        The communication group. When omitted, the value of
        ``distributed.get_group()`` is used instead.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    Raises
    ------
    ValueError
        If no group can be resolved.

    """
    # Must stay the first statement: it snapshots the call arguments.
    args = OpSchema.parse_args(locals())
    if group is None:
        group = distributed.get_group()
        if group is None:
            raise ValueError('<group> is required.')
    # Copy so that the group's own argument dict is never mutated.
    collective_args = group.arguments.copy()
    collective_args.update(root=root, operation='BROADCAST')
    if not context.executing_eagerly():
        # Graph mode: collective settings override user-provided kwargs.
        kwargs.update(collective_args)
        return OpLib.add('Collective', inputs, **kwargs)
    return OpLib.execute('Collective', inputs, **collective_args)
Esempio n. 3
0
def broadcast(tensor, src=0, group=None):
    """Broadcast the tensor(s) from the source node to a group.

    Parameters
    ----------
    tensor : Sequence[dragon.vm.torch.Tensor]
        The tensor(s) to broadcast.
    src : int
        The rank of the source node.
    group : ProcessGroup, optional
        The group for communication. When omitted, the value of
        ``distributed.get_group()`` is used instead.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    Raises
    ------
    ValueError
        If no group can be resolved.

    """
    # Fall back to the current default group when none is passed.
    group = distributed.get_group() if group is None else group
    if group is None:
        raise ValueError('<group> is required.')
    flat_tensors = nest.flatten(tensor)
    # The device of the first tensor selects where the function is created.
    collective = _functions.Collective.instantiate(
        flat_tensors[0].device,
        root=src,
        communication='BROADCAST',
        group=group,
    )
    return collective.apply(flat_tensors)
Esempio n. 4
0
def broadcast(tensor, src=0, group=None):
    """Broadcast the tensor from the source node to a group.

    The tensor is passed both as the input and as the output of the
    ``Collective`` function. If no group is available the tensor is
    returned untouched.

    Parameters
    ----------
    tensor : dragon.vm.torch.Tensor
        The tensor to be sent.
    src : int
        The rank of the source node.
    group : ProcessGroup, optional
        The group for communication. A falsy value falls back to
        ``distributed.get_group()``.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    """
    if not group:
        group = distributed.get_group()
    if group is None:
        # Nothing to communicate with: behave as the identity.
        return tensor
    return Function.apply(
        'Collective',
        tensor.device,
        [tensor],
        outputs=[tensor],
        operation='BROADCAST',
        root=src,
        **group.arguments
    )
Esempio n. 5
0
def all_reduce(inputs, reduction='mean', group=None, **kwargs):
    """Apply an all-reduce collective to the input within a group.

    Parameters
    ----------
    inputs : dragon.Tensor
        The input tensor.
    reduction : str, optional
        The reduction method. Compared case-insensitively against
        ``'MEAN'`` and ``'SUM'``.
    group : ProcessGroup, optional
        The group for communication. When omitted, the value of
        ``distributed.get_group()`` is used instead.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    Raises
    ------
    ValueError
        If no group can be resolved, or the reduction is unsupported.

    """
    reduction = reduction.upper()
    group = distributed.get_group() if group is None else group
    if group is None:
        raise ValueError('<group> is required.')
    supported = ('MEAN', 'SUM')
    if reduction not in supported:
        raise ValueError('Unsupported reduction: ' + reduction)
    # Copy so that the group's own argument dict is never mutated.
    collective_args = group.arguments.copy()
    collective_args.update(operation='ALLREDUCE', reduction=reduction)
    if not context.executing_eagerly():
        # Graph mode: collective settings override user-provided kwargs.
        kwargs.update(collective_args)
        return OpLib.add('Collective', inputs, **kwargs)
    return OpLib.execute('Collective', inputs, **collective_args)
Esempio n. 6
0
def all_reduce(tensor, op='sum', group=None):
    """Apply an all-reduce collective to the tensor within a group.

    The tensor is passed both as the input and as the output of the
    ``Collective`` function. If no group is available the tensor is
    returned untouched, before ``op`` is validated.

    Parameters
    ----------
    tensor : dragon.vm.torch.Tensor
        The tensor to reduce.
    op : str, optional
        The reduction op. Compared case-insensitively against
        ``'MEAN'`` and ``'SUM'``.
    group : ProcessGroup, optional
        The group for communication. A falsy value falls back to
        ``distributed.get_group()``.

    Returns
    -------
    dragon.vm.torch.Tensor
        The output tensor.

    Raises
    ------
    ValueError
        If the reduction op is unsupported.

    """
    if not group:
        group = distributed.get_group()
    if group is None:
        # Nothing to communicate with: behave as the identity.
        return tensor
    op = op.upper()
    supported = ('MEAN', 'SUM')
    if op not in supported:
        raise ValueError('Unsupported reduction: ' + op)
    return Function.apply(
        'Collective',
        tensor.device,
        [tensor],
        outputs=[tensor],
        operation='ALLREDUCE',
        reduction=op,
        **group.arguments
    )
Esempio n. 7
0
def all_gather(tensor_list, tensor, group=None):
    """Gather the tensor from all nodes in a group.

    Parameters
    ----------
    tensor_list : Sequence[dragon.vm.torch.Tensor]
        The output tensor list. Only its length is used here: it sets the
        number of outputs requested from the ``Split`` function.
    tensor : dragon.vm.torch.Tensor
        The tensor to be sent.
    group : ProcessGroup, optional
        The group for communication. A falsy value falls back to
        ``distributed.get_group()``.

    Returns
    -------
    dragon.vm.torch.Tensor
        ``tensor`` itself if no group is available, the gathered tensor
        if ``tensor_list`` is empty, otherwise the result of splitting
        the gathered tensor along axis 0.

    """
    if not group:
        group = distributed.get_group()
    if group is None:
        # Nothing to communicate with: behave as the identity.
        return tensor
    gathered = Function.apply(
        'Collective',
        tensor.device,
        [tensor],
        operation='ALLGATHER',
        **group.arguments
    )
    num_outputs = len(tensor_list)
    if not num_outputs:
        return gathered
    # One output placeholder is requested per entry of ``tensor_list``.
    return Function.apply(
        'Split',
        gathered.device,
        [gathered],
        outputs=[None] * num_outputs,
        axis=0,
        size_split=None,
        copy=True,
    )
Esempio n. 8
0
 def _add_updates(graph_def, grads_and_vars, optimizer):
     """Append the operator definitions that update variables to a graph.

     Gradient/variable ids are bucketed by the variable's optional
     ``_weight_decay`` attribute. If a process group is active, a single
     in-place ``Collective`` all-reduce (mean) over every gradient is
     emitted first, followed by one update operator per bucket.

     Parameters
     ----------
     graph_def : GraphDef
         The graph definition whose ``op`` field is extended.
     grads_and_vars : Sequence[Sequence[dragon.Tensor]]
         The ``(grad, var)`` pairs; both must expose an ``id`` attribute.
     optimizer : Optimizer
         The optimizer providing ``_name`` and ``_op_type``.

     """
     var_ids_by_decay = collections.defaultdict(list)
     grad_ids_by_decay = collections.defaultdict(list)
     for grad, var in grads_and_vars:
         decay = getattr(var, '_weight_decay', None)
         # ``None`` marks "no explicit decay" and is kept as a bucket key.
         decay = None if decay is None else float(decay)
         var_ids_by_decay[decay].append(var.id)
         grad_ids_by_decay[decay].append(grad.id)

     update_defs = []
     process_group = distributed.get_group()
     if process_group:
         all_grad_ids = list(
             itertools.chain.from_iterable(grad_ids_by_decay.values()))
         # Reduce in place: the gradients are both inputs and outputs.
         reduce_def = proto_util.make_operator_def(
             op_type='Collective',
             inputs=all_grad_ids,
             outputs=all_grad_ids,
             name=optimizer._name,
             operation='ALLREDUCE',
             reduction='MEAN',
             **process_group.arguments)
         update_defs.append(reduce_def)

     for decay, var_ids in var_ids_by_decay.items():
         grad_ids = grad_ids_by_decay[decay]
         update_def = proto_util.make_operator_def(
             op_type=optimizer._op_type,
             inputs=grad_ids,
             outputs=var_ids,
             name=optimizer._name,
             weight_decay=decay)
         update_defs.append(update_def)
     graph_def.op.extend(update_defs)
Esempio n. 9
0
    def __init__(self, params, defaults):
        """Create a ``Optimizer``.

        Parameters
        ----------
        params : Sequence[dragon.vm.torch.nn.Parameter]
            The parameters to optimize, given either as a flat sequence
            or as a sequence of parameter-group dicts.
        defaults : dict
            The pre-defined default hyper-parameters.

        Raises
        ------
        TypeError
            If ``params`` is a single tensor.
        ValueError
            If ``params`` is empty.

        """
        self.defaults = defaults
        if isinstance(params, Tensor):
            raise TypeError('<params> should be a sequence of tensors.')
        self.state = defaultdict(dict)
        self.param_groups = []
        groups = list(params)
        if not groups:
            raise ValueError('Got an empty parameter list')
        # A flat parameter list is wrapped into one implicit group.
        first_is_group = isinstance(groups[0], dict)
        if not first_is_group:
            groups = [{'params': groups}]
        for group in groups:
            self.add_param_group(group)
        self._op_type = self.__class__.__name__ + 'Update'
        self._process_group = distributed.get_group()
        self._shared_args = {}
Esempio n. 10
0
def broadcast(inputs, root=0, group=None, **kwargs):
    """Broadcast the input from the root node to all nodes in a group.

    Parameters
    ----------
    inputs : dragon.Tensor
        The tensor to broadcast.
    root : int, optional, default=0
        The node index in the group.
    group : ProcessGroup, optional
        The group for communication. When omitted, the value of
        ``distributed.get_group()`` is used instead.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    Raises
    ------
    ValueError
        If no group can be resolved.

    """
    # Must stay the first statement: it snapshots the call arguments.
    args = ArgHelper.parse(locals())
    if group is None:
        group = distributed.get_group()
        if group is None:
            raise ValueError('<group> is required.')
    # Replace the raw ``group`` entry by the group's own arguments.
    args.update(group.arguments)
    args.pop('group')
    collective = distributed_ops_lib.Collective
    if not context.executing_eagerly():
        return collective.blend(communication='BROADCAST', **args)
    eager_op = collective.instantiate(
        root=root,
        communication='BROADCAST',
        group=group,
    )
    return eager_op.apply(inputs)
Esempio n. 11
0
def sync_batch_norm(inputs,
                    axis=-1,
                    momentum=0.9,
                    epsilon=1e-5,
                    use_stats=-1,
                    process_group=None,
                    **kwargs):
    r"""Apply the batch normalization with synced statistics.
    `[Ioffe & Szegedy, 2015] <https://arxiv.org/abs/1502.03167>`_.

    The normalization is defined as:

    .. math:: y = \frac{x - \mathrm{E}[x]}
                       {\sqrt{\mathrm{Var}[x] + \epsilon}}
                  * \gamma + \beta

    The running average of statistics are calculated as:

    .. math:: x_{\text{running}} = \text{momentum} * x_{\text{running}}
                                   + (1 - \text{momentum}) * x_{\text{batch}}

    Parameters
    ----------
    inputs : Sequence[dragon.Tensor]
        The tensor ``x``, ``gamma``, ``beta``, ``mean`` and ``var``.
    axis : int, optional, default=-1
        The channel axis.
    momentum : Union[float, dragon.Tensor], optional, default=0.9
        The value to :math:`\text{momentum}`.
    epsilon : float, optional, default=1e-5
        The value to :math:`\epsilon`.
    use_stats : int, optional, default=-1
        Whether to use estimated statistics or not.
    process_group : ProcessGroup, optional
        The group for communication. When omitted, the value of
        ``distributed.get_group()`` is used instead.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    Raises
    ------
    ValueError
        If no process group can be resolved.

    """
    # Must stay the first statement: it snapshots the call arguments.
    args = OpSchema.parse_args(locals())
    args['epsilon'] = float(epsilon)
    if process_group is None:
        process_group = distributed.get_group()
        if process_group is None:
            raise ValueError('<process_group> is required.')
    if not context.executing_eagerly():
        # Graph mode: swap the raw group entry for the group's arguments.
        args.pop('process_group')
        args.update(process_group.arguments)
        return OpLib.add('SyncBatchNorm', **args)
    return OpLib.execute(
        'SyncBatchNorm',
        inputs,
        axis=axis,
        epsilon=args['epsilon'],
        use_stats=use_stats,
        momentum=args['momentum'],
        **process_group.arguments
    )
Esempio n. 12
0
 def __init__(self, **kwargs):
     """Create a ``Optimizer``.

     Recognized keyword arguments are ``grad_scale`` (default 1),
     ``weight_decay``, ``clip_norm`` and ``clip_value`` (default 0 each).
     Any other keyword raises ``ValueError``.

     """
     self._name = workspace.get_workspace().create_handle('Optimizer')
     self._op_type = self.__class__.__name__
     self._process_group = distributed.get_group()
     self._hyper = {}
     # Registration order matters only in that it mirrors the defaults below.
     hyper_defaults = (
         ('grad_scale', 1),
         ('weight_decay', 0),
         ('clip_norm', 0),
         ('clip_value', 0),
     )
     for hyper_name, default_value in hyper_defaults:
         self._set_hyper(hyper_name, kwargs.pop(hyper_name, default_value))
     # Whatever is left was not consumed above and is therefore unknown.
     if kwargs:
         raise ValueError('Unexpected arguments: ' + ','.join(kwargs))
Esempio n. 13
0
def get_distributed_info(allowed=True):
    """Return the rank and size of the current process group.

    Parameters
    ----------
    allowed : bool, optional, default=True
        Whether the distributed utilities are allowed. If false, the
        group is never queried.

    Returns
    -------
    Tuple[int]
        The node rank and group size; ``(0, 1)`` if distributed utilities
        are not allowed or no group is active.

    """
    if not allowed:
        return 0, 1
    group = distributed.get_group()
    if group is None:
        return 0, 1
    return distributed.get_rank(group), group.size
Esempio n. 14
0
    def apply_gradients(self, grads_and_vars):
        """Apply the gradients on variables.

        In graph mode the work is delegated to ``GraphLib.from_updates``
        and its result is returned. In eager mode, pairs whose gradient is
        ``None`` are ignored, the remaining gradients are mean-reduced in
        place across the active process group (if any), and one update
        operator is executed per distinct weight decay value.

        Parameters
        ----------
        grads_and_vars : Sequence[Sequence[dragon.Tensor]]
            The sequence of update pair.

        """
        # Graph mode builds an execution context instead of running ops.
        if not context.executing_eagerly():
            return GraphLib.from_updates(grads_and_vars, self)

        # Bucket variables and gradients by their explicit weight decay.
        vars_by_decay = collections.defaultdict(list)
        grads_by_decay = collections.defaultdict(list)
        for grad, var in grads_and_vars:
            if grad is None:
                continue
            decay = getattr(var, '_weight_decay', None)
            # ``None`` marks "no explicit decay" and is kept as a key.
            decay = None if decay is None else float(decay)
            vars_by_decay[decay].append(var)
            grads_by_decay[decay].append(grad)

        # Average all gradients across the process group, in place.
        process_group = distributed.get_group()
        if process_group is not None:
            all_grads = list(
                itertools.chain.from_iterable(grads_by_decay.values()))
            OpLib.execute(
                'Collective',
                all_grads,
                outputs=all_grads,
                operation='ALLREDUCE',
                reduction='MEAN',
                **process_group.arguments
            )

        # Run one update per weight decay bucket.
        for decay, bucket_vars in vars_by_decay.items():
            bucket_grads = grads_by_decay[decay]
            if not bucket_grads:
                # Nothing to apply for this bucket.
                continue
            OpLib.execute(
                self._op_type,
                bucket_grads,
                outputs=bucket_vars,
                name=self._name,
                weight_decay=decay,
            )
Esempio n. 15
0
    def __init__(
        self,
        num_features,
        eps=1e-5,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
        process_group=None,
    ):
        r"""Create a ``SyncBatchNorm`` module.

        Parameters
        ----------
        num_features : int
            The number of channels.
        eps : float, optional, default=1e-5
            The value to :math:`\epsilon`.
        momentum : float, optional, default=0.1
            The value to :math:`\text{momentum}`.
        affine : bool, optional, default=True
            ``True`` to apply an affine transformation.
        track_running_stats : bool, optional, default=True
            ``True`` to use stats when switching to ``eval``.
        process_group : ProcessGroup, optional
            The group for communication. When omitted, the value of
            ``distributed.get_group()`` at construction time is stored.

        """
        super(SyncBatchNorm, self).__init__(
            num_features, eps, momentum, affine, track_running_stats)
        # Resolve the default group once, after the base class is set up.
        self.process_group = (
            distributed.get_group() if process_group is None
            else process_group
        )
Esempio n. 16
0
def all_reduce(inputs, operation='MEAN', group=None, **kwargs):
    """Apply an all-reduce collective to the input within a group.

    Parameters
    ----------
    inputs : dragon.Tensor
        The input tensor.
    operation : {'MEAN', 'SUM'}, optional
        The reduce operation.
    group : ProcessGroup, optional
        The group for communication. When omitted, the value of
        ``distributed.get_group()`` is used instead.

    Returns
    -------
    dragon.Tensor
        The output tensor.

    Raises
    ------
    ValueError
        If no group can be resolved, or the operation is unsupported.

    """
    # Must stay the first statement: it snapshots the call arguments.
    args = ArgHelper.parse(locals())
    if group is None:
        group = distributed.get_group()
        if group is None:
            raise ValueError('<group> is required.')
    if operation not in ('MEAN', 'SUM'):
        raise ValueError('Unsupported reduce op: ' + operation)
    # Replace the raw ``group`` entry by the group's own arguments.
    args.update(group.arguments)
    args.pop('group')
    collective = distributed_ops_lib.Collective
    if not context.executing_eagerly():
        return collective.blend(communication='ALLREDUCE', **args)
    eager_op = collective.instantiate(
        operation=operation,
        communication='ALLREDUCE',
        group=group,
    )
    return eager_op.apply(inputs)
Esempio n. 17
0
    def _update_group(self, group):
        """Update parameters for the group.

        Parameters whose gradient is missing are skipped; if none has a
        gradient the method returns without touching anything. Otherwise
        the group's hyper-parameter values are written into their tensors,
        the gradients are mean-reduced in place across the active process
        group (if any), and the update function is applied.

        Parameters
        ----------
        group : dict
            The parameter group. Reads ``'params'``, ``'name'`` and one
            entry per registered hyper-parameter name.

        """
        execute_ws = workspace.get_workspace()

        # Pair up each parameter with its gradient, dropping missing ones.
        params_with_grad, grads = [], []
        for param in group['params']:
            grad = self._get_grad(execute_ws, param, self._sums_grad)
            if grad is None:
                continue
            params_with_grad.append(param)
            grads.append(grad)

        # Nothing to update if every gradient is missing.
        if not params_with_grad:
            return

        # Push the group's hyper values into per-group tensors,
        # creating each tensor on first use.
        for name, hyper in self._hyper.items():
            group_name = group['name']
            impl_name, group_dict = hyper
            if group_name not in group_dict:
                impl_name = group_name + '/' + impl_name
                group_dict[group_name] = execute_ws.create_tensor(impl_name)
            hyper_impl = group_dict[group_name]
            hyper_impl.FromNumpy(numpy.array(group[name], 'float32'), False)

        # Average the gradients across the process group, in place.
        process_group = distributed.get_group()
        if process_group is not None:
            Function.apply(
                'Collective',
                grads[0].device,
                grads,
                outputs=grads,
                operation='ALLREDUCE',
                reduction='MEAN',
                **process_group.arguments
            )

        # Apply the update; outputs are the parameters themselves.
        Function.apply(
            self._op_type,
            params_with_grad[0].device,
            grads,
            outputs=params_with_grad,
            name=group['name'],
            weight_decay=None,
        )
Esempio n. 18
0
    def __init__(self, **kwargs):
        """Create a ``DataIterator``.

        Construction resolves the distributed rank and group size, creates
        the queues, starts ``num_readers`` readers and ``num_workers``
        workers, registers an ``atexit`` cleanup hook and finally calls
        ``self.start()``. All keyword arguments are also forwarded
        unchanged to every ``DataReader`` and ``DataWorker``.

        Parameters
        ----------
        dataset : class
            The dataset class to load examples.
        source : str
            The path of data source.
        shuffle : bool, optional, default=False
            Whether to shuffle the data.
        initial_fill : int, optional, default=1024
            The length of sampling sequence for shuffle.
        resize : int, optional, default=0
            The size for the shortest edge.
        padding : int, optional, default=0
            The size for the zero padding on two sides.
        fill_value : Union[int, Sequence], optional, default=127
            The value(s) to fill for padding or cutout.
        crop_size : int, optional, default=0
            The size for random-or-center cropping.
        random_crop_size: int, optional, default=0
            The size for sampling-based random cropping.
        cutout_size : int, optional, default=0
            The square size for the cutout algorithm.
        mirror : bool, optional, default=False
            Whether to apply the mirror (flip horizontally).
        random_scales : Sequence[float], optional, default=(0.08, 1.)
            The range of scales to sample a crop randomly.
        random_aspect_ratios : Sequence[float], optional, default=(0.75, 1.33)
            The range of aspect ratios to sample a crop randomly.
        distort_color : bool, optional, default=False
            Whether to apply color distortion.
        inverse_color : bool, optional, default=False
            Whether to inverse channels for color images.
        training : bool, optional, default=True
            Whether to enable the training randoms. The data is only
            partitioned across the process group when this is true.
        batch_size : int, optional, default=128
            The size of a mini-batch.
        prefetch_depth : int, optional, default=4
            The number of prefetching queues.
        num_readers : int, optional, default=1
            The number of readers to start.
        num_workers : int, optional, default=-1
            The number of transformers to process image. ``-1`` selects
            1 worker, plus 1 if ``random_crop_size > 0``, plus 1 if
            ``distort_color`` is set.
        seed : int, optional
            The random seed to use instead.

        """
        # NOTE(review): ``daemon=True`` suggests the base class is a
        # thread-like object - confirm against the class definition.
        super(DataIterator, self).__init__(daemon=True)
        # Distributed settings: default to a single, non-distributed node.
        rank, group_size = 0, 1
        process_group = distributed.get_group()
        if process_group is not None and kwargs.get('training', True):
            group_size = process_group.size
            rank = distributed.get_rank(process_group)

        # Configuration.
        # NOTE(review): this docstring previously listed ``num_transformers``,
        # but the key actually read here is ``num_workers``.
        self._prefetch_depth = kwargs.get('prefetch_depth', 4)
        self._num_readers = kwargs.get('num_readers', 1)
        self._num_workers = kwargs.get('num_workers', -1)
        self._batch_size = kwargs.get('batch_size', 128)

        # Io-Aware Policy: choose the worker count when left at -1.
        if self._num_workers == -1:
            self._num_workers = 1
            # Add a transformer for cropping.
            if kwargs.get('random_crop_size', 0) > 0:
                self._num_workers += 1
            # Add a transformer for distortion.
            if kwargs.get('distort_color', False):
                self._num_workers += 1

        # Initialize queues. The reader and worker queues are sized in
        # examples (batches * batch size), the batch queue in batches.
        num_batches = self._prefetch_depth * self._num_readers
        self._reader_queue = mp.Queue(num_batches * self._batch_size)
        self._worker_queue = mp.Queue(num_batches * self._batch_size)
        self._batch_queue = queue.Queue(num_batches)

        # Initialize readers.
        self._readers = []
        for i in range(self._num_readers):
            # The data is cut into ``num_readers * group_size`` parts; this
            # rank owns the contiguous slice starting at
            # ``rank * num_readers``.
            part_idx, num_parts = i, self._num_readers
            num_parts *= group_size
            part_idx += rank * self._num_readers
            self._readers.append(
                reader.DataReader(part_idx=part_idx,
                                  num_parts=num_parts,
                                  **kwargs))
            # Offset the seed by the global part index so that each reader
            # gets a distinct seed.
            self._readers[i]._seed += part_idx
            self._readers[i]._reader_queue = self._reader_queue
            self._readers[i].start()
            # NOTE(review): presumably staggers start-up - confirm intent.
            time.sleep(0.1)

        # Initialize transformers.
        self._workers = []
        for i in range(self._num_workers):
            p = data_worker.DataWorker(**kwargs)
            # Offset the seed by the global worker index.
            p._seed += (i + rank * self._num_workers)
            # Workers are wired to both the reader and the worker queue.
            p._reader_queue = self._reader_queue
            p._worker_queue = self._worker_queue
            p.start()
            self._workers.append(p)
            # NOTE(review): presumably staggers start-up - confirm intent.
            time.sleep(0.1)

        # Register cleanup callbacks.
        def cleanup():
            # Terminate and join workers first, then readers; only rank 0
            # logs the progress.
            def terminate(processes):
                for p in processes:
                    p.terminate()
                    p.join()

            terminate(self._workers)
            if rank == 0:
                logging.info('Terminate DataWorker.')
            terminate(self._readers)
            if rank == 0:
                logging.info('Terminate DataReader.')

        import atexit
        atexit.register(cleanup)

        # Start batch prefetching.
        self.start()