Example #1: Horovod's PyTorch `_allreduce_async` helper with process-set support and prescale/postscale factors.
def _allreduce_async(tensor, output, name, op, prescale_factor,
                     postscale_factor, process_set: ProcessSet):
    # Set the divisor for reduced gradients to average when necessary
    if op == Average:
        if rocm_built():
            # For ROCm, perform averaging at framework level
            divisor = size()
            op = Sum
        else:
            divisor = 1

    elif op == Adasum:
        if process_set != global_process_set:
            raise NotImplementedError(
                "Adasum does not support non-global process sets yet.")
        if tensor.device.type != 'cpu' and gpu_available('torch'):
            if nccl_built():
                if not is_homogeneous():
                    raise NotImplementedError(
                        'Running GPU Adasum on heterogeneous cluster is not supported yet.'
                    )
                elif not num_rank_is_power_2(int(size() / local_size())):
                    raise NotImplementedError(
                        'Running GPU Adasum with non-power of 2 nodes is not supported yet.'
                    )
                if rocm_built():
                    # For ROCm, perform averaging at framework level
                    divisor = local_size()
                else:
                    divisor = 1
            else:
                warnings.warn(
                    'Adasum reduction does not currently support GPU reduction using MPI. Tensors are '
                    'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod '
                    'with HOROVOD_GPU_OPERATIONS=NCCL.')
                divisor = 1
        else:
            if not num_rank_is_power_2(size()):
                raise NotImplementedError(
                    'Running Adasum with non-power of 2 ranks is not supported yet.'
                )
            divisor = 1
    else:
        divisor = 1

    function = _check_function(_allreduce_function_factory, tensor)
    try:
        handle = getattr(mpi_lib, function)(
            tensor, output, divisor,
            name.encode() if name is not None else _NULL, op, prescale_factor,
            postscale_factor, process_set.process_set_id)
    except RuntimeError as e:
        raise HorovodInternalError(e)
    _handle_map[handle] = (tensor, output)
    return handle
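For context, a minimal usage sketch follows, assuming the internal helper above is reached through the public horovod.torch API (hvd.init, hvd.allreduce_async, hvd.synchronize, hvd.Average); everything in this sketch other than the ops already named in the excerpt is an assumption about the surrounding library, not part of the example itself.

# Minimal usage sketch (assumption: the public horovod.torch API dispatches to an
# internal helper like _allreduce_async above; this is not part of the excerpt).
import torch
import horovod.torch as hvd

hvd.init()                                  # start Horovod (MPI/Gloo under the hood)
t = torch.ones(4) * hvd.rank()              # each rank contributes a different tensor

# Asynchronous allreduce: returns a handle immediately, averaging across ranks.
handle = hvd.allreduce_async(t, name="example", op=hvd.Average)

# ... overlap other work here while the reduction runs ...

result = hvd.synchronize(handle)            # block until the reduction completes
print(result)                               # expected: tensor filled with the mean of the rank ids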
Example #2: An earlier variant of `_allreduce_async` without process-set or scale-factor support, with an explicit float16 version check; averaging is always translated to Sum plus a framework-side divisor.
def _allreduce_async(tensor, output, name, op):
    if tensor.dtype == torch.float16 and not _fp16_supported:
        raise NotImplementedError(
            'float16 allreduce is not supported for PyTorch version {} < 1.0.0'
            .format(torch.__version__))

    # Set the divisor for reduced gradients to average when necessary
    if op == Average:
        divisor = size()
    elif op == Adasum:
        if tensor.device.type != 'cpu' and gpu_available('torch'):
            if nccl_built():
                if not is_homogeneous():
                    raise NotImplementedError(
                        'Running GPU Adasum on heterogeneous cluster is not supported yet.'
                    )
                elif not num_rank_is_power_2(int(size() / local_size())):
                    raise NotImplementedError(
                        'Running GPU Adasum with non-power of 2 nodes is not supported yet.'
                    )
                divisor = local_size()
            else:
                warnings.warn(
                    'Adasum reduction does not currently support GPU reduction using MPI. Tensors are '
                    'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod '
                    'with HOROVOD_GPU_OPERATIONS=NCCL.')
                divisor = 1
        else:
            if not num_rank_is_power_2(size()):
                raise NotImplementedError(
                    'Running Adasum with non-power of 2 ranks is not supported yet.'
                )
            divisor = 1
    else:
        divisor = 1
    # Averaging happens in framework code, so translate that to Sum for the actual call
    true_op = Sum if op == Average else op

    function = _check_function(_allreduce_function_factory, tensor)
    try:
        handle = getattr(mpi_lib, function)(
            tensor, output, divisor,
            name.encode() if name is not None else _NULL, true_op)
    except RuntimeError as e:
        raise HorovodInternalError(e)
    _handle_map[handle] = (tensor, output)
    return handle
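The key difference from Example #1 is that averaging here is always expressed as a Sum collective plus a framework-side divisor. A small self-contained sketch of that translation is shown below; Average, Sum, Adasum, world_size, and pick_divisor_and_op are stand-in names for illustration, not the real Horovod symbols.

# Self-contained sketch of the divisor / true-op translation used in Example #2.
# Average, Sum, Adasum and world_size are stand-ins, not the real Horovod symbols.
Average, Sum, Adasum = "average", "sum", "adasum"

def pick_divisor_and_op(op, world_size):
    """Return (divisor, true_op): Average becomes Sum plus a framework-side divide."""
    if op == Average:
        return world_size, Sum          # reduce with Sum, divide by size() afterwards
    return 1, op                        # Sum / Adasum pass through unchanged

divisor, true_op = pick_divisor_and_op(Average, world_size=4)
assert (divisor, true_op) == (4, Sum)

# After the collective call, the framework applies the divisor:
# output = summed_tensor / divisor  ->  element-wise mean across the 4 ranks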