from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from future.utils import viewvalues


# (_Broadcast is a sibling helper defined alongside this function.)
def _AllReduce(devices, model, net, param, use_nccl=False, control_input=None):
    blobs_group = list(viewvalues(model._device_grouped_blobs[param]))
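    # Fast path: on CUDA with NCCL enabled, a single fused NCCLAllreduce op
    # performs the whole allreduce in place across the device blobs.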
    if model._device_type == caffe2_pb2.CUDA and use_nccl:
        model.NCCLAllreduce(blobs_group,
                            blobs_group,
                            control_input=control_input)
        return

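    # p2p_access_pattern[i, j] is truthy when GPU i can directly access the
    # memory of GPU j; sumN below uses it to decide when an explicit copy to
    # the reducing device is required.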
    if model._device_type == caffe2_pb2.CUDA:
        p2p_access_pattern = workspace.GetCudaPeerAccessPattern()
    else:
        p2p_access_pattern = None

    def sumN(*dev_indices):
        """Create a Sum op for 2 or more blobs on different devices.
        Saves the result on the first device.

        Arguments:
        dev_indices -- a list of device indices, which can be translated into
                       CUDA identifiers with model._devices
        """
        devices = [model._devices[idx] for idx in dev_indices]
        blobs = [blobs_group[idx] for idx in dev_indices]
        for i, peer in enumerate(devices):
            if i == 0:
                continue  # Skip the first device
            if p2p_access_pattern is not None and not p2p_access_pattern[
                    devices[0], peer]:
                # Copy from peer to d0
                blobs[i] = model.Copy(
                    blobs[i],
                    'gpu_{}/{}_gpu{}_copy'.format(devices[0], param, peer))
        device_opt = core.DeviceOption(model._device_type, devices[0])
        with core.DeviceScope(device_opt):
            net.Sum(blobs, [blobs[0]], name='dpm')

    if len(devices) == 8:
        # Special tree reduction for 8 gpus, TODO generalize like in muji.py
        for j in range(4):
            sumN(j * 2, j * 2 + 1)
        for j in range(2):
            sumN(j * 4, j * 4 + 2)
        sumN(0, 4)
    elif len(devices) == 4:
        sumN(0, 1)
        sumN(2, 3)
        sumN(0, 2)
    else:
        sumN(*range(len(devices)))
    _Broadcast(devices, model, net, param)
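To make the 8-GPU branch concrete, here is a minimal plain-Python sketch of the same tree reduction (no Caffe2 involved; vals stands in for the per-GPU partial gradients):

def tree_reduce8(vals):
    # Mirrors the sumN call order above: pair sums, then quad sums, then
    # one final sum, leaving the total on "device" 0.
    assert len(vals) == 8
    for j in range(4):
        vals[j * 2] += vals[j * 2 + 1]   # (0,1) (2,3) (4,5) (6,7)
    for j in range(2):
        vals[j * 4] += vals[j * 4 + 2]   # (0,2) (4,6)
    vals[0] += vals[4]                   # (0,4)
    return vals[0]

print(tree_reduce8(list(range(8))))      # 0+1+...+7 == 28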
Example #2
import numpy as np

from caffe2.python import workspace


# Allreduce2/4/4Group2/8 and AllreduceFallback are sibling helpers defined
# alongside this function.
def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
    """The general Allreduce interface that reroutes the call to a
    specialized implementation based on the CUDA peer access pattern.
    """
    if gpu_indices is None:
        gpu_indices = list(range(len(blobs)))
    if len(gpu_indices) != len(blobs):
        raise RuntimeError(
            "gpu_indices length and blobs length mismatch: %d vs %d" %
            (len(gpu_indices), len(blobs)))
    pattern = workspace.GetCudaPeerAccessPattern()
    if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]):
        return Allreduce2(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]):
        return Allreduce4(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(
            pattern[:2, :2]) and np.all(pattern[2:4, 2:4]):
        return Allreduce4Group2(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 8 and pattern.shape[0] >= 8 and np.all(pattern[:8, :8]):
        return Allreduce8(net, blobs, reduced_affix, gpu_indices)
    else:
        return AllreduceFallback(net, blobs, reduced_affix, gpu_indices)
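A hedged usage sketch, assuming the blobs to be reduced were already created under their matching GPU device scopes (the net name and blob names are hypothetical):

from caffe2.python import core, workspace

net = core.Net("allreduce_example")               # hypothetical net name
blobs = ["grad_gpu%d" % i for i in range(4)]      # hypothetical blob names
reduced = Allreduce(net, blobs, gpu_indices=[0, 1, 2, 3])
workspace.RunNetOnce(net)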
Example #3
def testGetCudaPeerAccessPattern(self):
    pattern = workspace.GetCudaPeerAccessPattern()
    self.assertEqual(type(pattern), np.ndarray)
    self.assertEqual(pattern.ndim, 2)
    self.assertEqual(pattern.shape[0], pattern.shape[1])
    self.assertEqual(pattern.shape[0], workspace.NumCudaDevices())
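For context, a small sketch of how the returned matrix is typically read (my reading of the assertions above, not part of the test):

pattern = workspace.GetCudaPeerAccessPattern()
# pattern is a square ndarray with one row/column per CUDA device;
# pattern[i, j] says whether device i can peer-access device j's memory.
for i in range(pattern.shape[0]):
    for j in range(pattern.shape[1]):
        if i != j and pattern[i, j]:
            print("GPU %d can peer-access GPU %d" % (i, j))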
Example #4
def testAllreduceWithFourGPUs(self):
    pattern = workspace.GetCudaPeerAccessPattern()
    if pattern.shape[0] >= 4 and np.all(pattern[:4, :4]):
        self.RunningAllreduceWithGPUs([0, 1, 2, 3], muji.Allreduce4)
    else:
        print('Skipping allreduce with 4 gpus. Not peer access ready.')
Example #5
def testAllreduceWithTwoGPUs(self):
    pattern = workspace.GetCudaPeerAccessPattern()
    if pattern.shape[0] >= 2 and np.all(pattern[:2, :2]):
        self.RunningAllreduceWithGPUs([0, 1], muji.Allreduce2)
    else:
        print('Skipping allreduce with 2 gpus. Not peer access ready.')
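Since these test methods evidently live in a unittest.TestCase subclass (the self.assertEqual calls imply as much), the same guard could also be written with unittest's skip machinery; a sketch of that alternative, not the project's actual style:

def testAllreduceWithTwoGPUs(self):
    pattern = workspace.GetCudaPeerAccessPattern()
    if not (pattern.shape[0] >= 2 and np.all(pattern[:2, :2])):
        self.skipTest('Not peer access ready for 2 gpus.')
    self.RunningAllreduceWithGPUs([0, 1], muji.Allreduce2)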