コード例 #1
0
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    A group of corresponding tensors is reduced as sparse only when every
    replica in the group is sparse. If a group mixes sparse and dense
    replicas, the sparse ones are converted with ``to_dense()`` and the group
    is reduced through the coalesced dense path.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = []  # shape (num_tensors, num_gpus)
    output = []
    # One reference tensor per group, in input order. It is built here rather
    # than taken from ``inputs[0]`` because densified tensors differ from the
    # originals, and so that ``inputs`` does not have to be indexable.
    ref_order = []
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            # Sparse groups are reduced one by one and never coalesced.
            output.append(reduce_add(tensor_at_gpus, destination))
            ref_order.append(tensor_at_gpus[0])
        else:
            # Checking only the first replica would misroute a mixed group,
            # so densify any sparse stragglers before coalescing.
            densified = tuple(
                t.to_dense() if t.is_sparse else t for t in tensor_at_gpus)
            dense_tensors.append(densified)
            ref_order.append(densified[0])
    # Transpose to (num_gpus, num_tensors) and chunk each GPU's tensors.
    itrs = [_take_tensors(tensors, buffer_size)
            for tensors in zip(*dense_tensors)]
    for chunks in zip(*itrs):
        # ``chunks`` holds the matching chunk from every GPU.
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
コード例 #2
0
ファイル: comm.py プロジェクト: zfenice/pytorch
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums corresponding tensors that live on several GPUs.

    Dense tensors are packed into flat buffers before being reduced, so that
    fewer synchronizations are needed. Groups in which every replica is
    sparse are reduced individually.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    # TODO: When `len(inputs) == 1` and all inputs are on `destination`, just
    #       return `inputs`.
    per_gpu_dense: List[List] = [[] for _ in inputs]  # (num_gpus, num_tensors)
    reduced = []
    order_ref = []
    # Sparse groups go first: their sizes may differ from GPU to GPU, so they
    # cannot take part in the coalesced reduction below.
    for replicas in zip(*inputs):
        if not all(t.is_sparse for t in replicas):
            for bucket, t in zip(per_gpu_dense, replicas):
                bucket.append(t.to_dense() if t.is_sparse else t)
            order_ref.append(per_gpu_dense[0][-1])
            continue
        # The reduction of an all-sparse group is itself sparse.
        reduced.append(reduce_add(replicas, destination))
        order_ref.append(replicas[0])

    chunk_iters = [
        _take_tensors(gpu_tensors, buffer_size)
        for gpu_tensors in per_gpu_dense
    ]
    # Dense tensors have consistent sizes, so matching chunks line up.
    for chunk_group in zip(*chunk_iters):
        flattened = [_flatten_dense_tensors(c) for c in chunk_group]  # (num_gpus,)
        summed = reduce_add(flattened, destination)
        # The unflattened tensors do not share storage, and the flat base
        # tensor is never exposed, so each piece gets its own version counter.
        # See NOTE [ Version Counter in comm.*_coalesced ]
        reduced.extend(
            piece.data
            for piece in _unflatten_dense_tensors(summed, chunk_group[0]))
    return tuple(_reorder_tensors_as(reduced, order_ref))
コード例 #3
0
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.

    Tensors are grouped into chunks and each chunk is flattened into a
    buffer, so that one broadcast serves several small tensors.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
          Note that it should be like (src, dst1, dst2, ...), the first element
          of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of the ``tensor``, placed on devices
        corresponding to indices from ``devices``.

    Raises:
        RuntimeError: if any tensor is not on ``devices[0]``.
    """
    if any(t.get_device() != devices[0] for t in tensors):
        raise RuntimeError('all tensors must be on devices[0]')

    per_device = [[] for _ in devices]
    # The source device simply keeps the tensors that were passed in.
    per_device[0].extend(tensors)

    for group in _take_tensors(tensors, buffer_size):
        if not group[0].is_sparse:
            replicas = broadcast(_flatten_dense_tensors(group), devices)
            unpacked = [
                _unflatten_dense_tensors(replica, group)
                for replica in replicas
            ]
        else:
            # Sparse chunks travel as two flat buffers: indices and values.
            indices, values = _flatten_sparse_tensors(group)
            indices_copies = broadcast(indices, devices)
            values_copies = broadcast(values, devices)
            unpacked = [
                _unflatten_sparse_tensors(pair, group)
                for pair in zip(indices_copies, values_copies)
            ]
        # Skip index 0: only the remaining devices take the broadcast copies.
        for bucket, pieces in zip(per_device[1:], unpacked[1:]):
            bucket.extend(pieces)

    # Chunking may have regrouped the tensors, so restore the caller's order.
    return tuple(_reorder_tensors_as(bucket, tensors) for bucket in per_device)
コード例 #4
0
ファイル: comm.py プロジェクト: Northrend/pytorch
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.

    Small tensors are coalesced into a flat buffer before broadcasting to
    reduce the number of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
          Note that it should be like (src, dst1, dst2, ...), the first element
          of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of the ``tensor``, placed on devices
        corresponding to indices from ``devices``.

    Raises:
        RuntimeError: if any tensor is not on ``devices[0]``.
    """

    def _replicate(chunk):
        # Broadcast one chunk and return its unflattened tensors per device.
        if chunk[0].is_sparse:
            flat_indices, flat_values = _flatten_sparse_tensors(chunk)
            pairs = zip(broadcast(flat_indices, devices),
                        broadcast(flat_values, devices))
            return [_unflatten_sparse_tensors(pair, chunk) for pair in pairs]
        flat = _flatten_dense_tensors(chunk)
        return [_unflatten_dense_tensors(replica, chunk)
                for replica in broadcast(flat, devices)]

    for candidate in tensors:
        if candidate.get_device() == devices[0]:
            continue
        raise RuntimeError('all tensors must be on devices[0]')

    collected = [[] for _ in devices]
    # Slot 0 belongs to the source device and reuses the original tensors.
    collected[0].extend(tensors)

    for chunk in _take_tensors(tensors, buffer_size):
        per_device_pieces = _replicate(chunk)
        # Only destinations after the source receive the broadcast copies.
        for target, pieces in zip(collected[1:], per_device_pieces[1:]):
            target.extend(pieces)

    # Put every device's tensors back into the order of ``tensors``.
    reordered = [_reorder_tensors_as(group, tensors) for group in collected]
    return tuple(reordered)
コード例 #5
0
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Dense tensors are coalesced into flat buffers before being reduced, to
    cut down on the number of synchronizations. A group whose replicas are
    all sparse is reduced individually.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    gpu_columns = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    results, ordering = [], []
    # Sparse groups are reduced up front because their sizes may differ
    # between GPUs.
    for group in zip(*inputs):
        if all(t.is_sparse for t in group):
            results.append(reduce_add(group, destination))
            ordering.append(group[0])
        else:
            # A mixed group is handled as dense: convert the sparse members.
            for column, t in zip(gpu_columns, group):
                if t.is_sparse:
                    t = t.to_dense()
                column.append(t)
            ordering.append(gpu_columns[0][-1])

    chunkers = [_take_tensors(column, buffer_size) for column in gpu_columns]
    # Dense tensors have consistent sizes, so chunks line up across GPUs.
    for matching_chunks in zip(*chunkers):
        flat_total = reduce_add(
            [_flatten_dense_tensors(chunk) for chunk in matching_chunks],
            destination)
        results.extend(
            _unflatten_dense_tensors(flat_total, matching_chunks[0]))
    return tuple(_reorder_tensors_as(results, ordering))
コード例 #6
0
ファイル: comm.py プロジェクト: Jsmilemsj/pytorch
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    To reduce the number of synchronizations, dense tensors are first packed
    into flat buffers and reduced a buffer at a time. Groups made up entirely
    of sparse tensors skip the coalescing step.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """

    def _as_dense(tensor):
        # Leave dense tensors untouched; convert sparse ones.
        return tensor.to_dense() if tensor.is_sparse else tensor

    dense_by_gpu = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    summed = []
    reference = []
    # First pass: reduce all-sparse groups directly (their sizes may differ
    # between GPUs) and collect the rest for the coalesced path.
    for same_tensor in zip(*inputs):
        if not all(t.is_sparse for t in same_tensor):
            for gpu_list, t in zip(dense_by_gpu, same_tensor):
                gpu_list.append(_as_dense(t))
            reference.append(dense_by_gpu[0][-1])
        else:
            sparse_sum = reduce_add(same_tensor, destination)
            summed.append(sparse_sum)
            reference.append(same_tensor[0])

    # Second pass: dense tensors have consistent sizes, so each GPU yields
    # matching chunks that can be flattened and reduced together.
    chunk_streams = [
        _take_tensors(gpu_list, buffer_size) for gpu_list in dense_by_gpu
    ]
    for chunk_set in zip(*chunk_streams):
        buffers = [_flatten_dense_tensors(chunk) for chunk in chunk_set]
        reduced_buffer = reduce_add(buffers, destination)
        for piece in _unflatten_dense_tensors(reduced_buffer, chunk_set[0]):
            summed.append(piece)
    return tuple(_reorder_tensors_as(summed, reference))