Example #1
def broadcast_list(l, device_ids):
    """ Broadcasting list """
    l_copies = Broadcast.apply(device_ids, *l)
    l_copies = [
        l_copies[i:i + len(l)] for i in range(0, len(l_copies), len(l))
    ]
    return l_copies
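
A minimal usage sketch for the helper above, assuming at least two CUDA devices; the import mirrors the one shown in Example #15, and the shapes and device indices are illustrative only. Broadcast.apply returns a flat, device-major tuple, which is why the helper re-slices it into one list per device.

import torch
from torch.nn.parallel._functions import Broadcast  # as in Example #15

if torch.cuda.device_count() >= 2:
    tensors = [torch.randn(3, device="cuda:0"), torch.randn(3, device="cuda:0")]
    copies = broadcast_list(tensors, [0, 1])
    # copies[d] is the list of tensor replicas living on device d.
    assert copies[1][0].get_device() == 1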
Example #2
    def _data_parallel_master(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast it."""

        # Always using same "device order" makes the ReduceAdd operation faster.
        # Thanks to:: Tete Xiao (http://tetexiao.com/)
        intermediates = sorted(intermediates,
                               key=lambda i: i[1].sum.get_device())

        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]  # flatten
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)
        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append(
                (rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

        return outputs
Example #3
    def _data_parallel_master(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast it."""

        # Always using same "device order" makes the ReduceAdd operation faster.
        # Thanks to:: Tete Xiao (http://tetexiao.com/)

        intermediates = sorted(intermediates,
                               key=lambda i: i[1].sum.get_device())

        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]  # flatten
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append(
                (rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

        return outputs
Example #4
def data_parallel(f,
                  input,
                  params,
                  stats,
                  mode,
                  device_ids,
                  output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, stats, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{
        k: params_all[i + j * len(params)]
        for i, k in enumerate(params.keys())
    } for j in range(len(device_ids))]
    stats_replicas = [
        dict(zip(stats.keys(), p))
        for p in comm.broadcast_coalesced(list(stats.values()), device_ids)
    ]

    replicas = [
        partial(f, params=p, stats=s, mode=mode)
        for p, s in zip(params_replicas, stats_replicas)
    ]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
Example #5
    def forward(self, *inputs, **kwargs):
        batch_size = inputs[0].size()[0]
        self.batch_size = batch_size
        # assert batch_size % len(self.device_ids) == 0

        # Data parallel
        inputs, kwargs = scatter_kwargs(inputs, kwargs, self.device_ids)

        if len(self.device_ids) == 1:
            return self.module(*inputs[0], **kwargs[0])
        else:
            replicas = replicate(self.module, self.device_ids[:len(inputs)])
            theta_list = [[] for _ in self.device_ids]
            for t in self.theta:
                t_ = Broadcast.apply(self.device_ids, t)
                for dev in range(len(self.device_ids)):
                    theta_list[dev].append(t_[dev])
            for i, k in enumerate(kwargs):
                k['theta_list'] = theta_list[i]

            outputs = parallel_apply(replicas, inputs, kwargs,
                                     self.device_ids[:len(replicas)])
            outputs = gather(outputs, self.device_ids[0])

            return [o.mean() for o in outputs]
Example #6
def data_parallel(f,
                  input,
                  params,
                  stats,
                  mode,
                  device_ids,
                  output_device=None):
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, stats, mode)

    def replicate(param_dict, g):
        replicas = [{} for d in device_ids]
        for k, v in param_dict.items():
            for i, u in enumerate(g(v)):
                replicas[i][k] = u
        return replicas

    params_replicas = replicate(params,
                                lambda x: Broadcast.apply(device_ids, x))
    stats_replicas = replicate(stats, lambda x: comm.broadcast(x, device_ids))

    replicas = [
        partial(f, params=p, stats=s, mode=mode)
        for p, s in zip(params_replicas, stats_replicas)
    ]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
Example #7
    def _coalesce_and_compute(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast them."""

        # Ensure that the master is the first one.
        intermediates = sorted(intermediates, key=lambda i: i[0])

        # Get the sum & square-sum from every device.
        to_reduce = [i[1][:2] for i in intermediates]
        # Flatten
        to_reduce = [j for i in to_reduce for j in i]
        # Total data size across all devices.
        sum_size = sum([i[1].sum_size for i in intermediates])
        # Device of each copy.
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        # Add the sums & square-sums from all copies and put the result on the
        # master device.
        # The 2 means there are 2 kinds of input data (sum and square-sum).
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)
        # Broadcast the computed statistics back to every device.
        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append(
                (rec[0], _MessageToBroadcast(*broadcasted[i * 2:i * 2 + 2])))

        # print("outputs: ", outputs)
        return outputs
Example #8
def allreduce(*inputs):
    """Cross GPU all reduce autograd operation for calculate mean and
    variance in SyncBN.
    """
    target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
    result = ReduceAddCoalesced.apply(target_gpus[0], 1, *inputs)
    outputs = Broadcast.apply(target_gpus, *result)
    assert len(outputs) == len(inputs)
    return outputs
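
A hedged usage sketch for allreduce: each GPU contributes one tensor of partial statistics and every GPU gets the element-wise sum back. It assumes two CUDA devices; the shapes and values are made up for illustration.

import torch

if torch.cuda.device_count() >= 2:
    partial = [torch.ones(4, device=f"cuda:{i}") for i in range(2)]
    reduced = allreduce(*partial)
    # One output per input device, each holding the summed statistic (2.0 here).
    assert len(reduced) == 2 and float(reduced[0][0]) == 2.0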
Example #9
def data_parallel(f, input, params, mode, device_ids, output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j*len(params)] for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]

    replicas = [partial(f, params=p, mode=mode)
                for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
Example #10
def data_parallel(f, input, params, mode, device_ids, output_device=None):
    device_ids = list(device_ids)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{
        k: params_all[i + j * len(params)]
        for i, k in enumerate(params.keys())
    } for j in range(len(device_ids))]

    replicas = [partial(f, params=p, mode=mode) for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
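
A hedged sketch of calling this functional data_parallel, assuming two CUDA devices plus the module-level imports the example relies on (functools.partial and torch.nn.parallel's Broadcast, scatter, parallel_apply, gather). linear_fn, the parameter names, and all shapes are hypothetical.

import torch
import torch.nn.functional as F

def linear_fn(input, params, mode):
    # 'mode' would normally toggle train/eval behaviour; this toy function ignores it.
    return F.linear(input, params["weight"], params["bias"])

if torch.cuda.device_count() >= 2:
    params = {
        "weight": torch.randn(8, 16, device="cuda:0", requires_grad=True),
        "bias": torch.zeros(8, device="cuda:0", requires_grad=True),
    }
    x = torch.randn(32, 16, device="cuda:0")
    y = data_parallel(linear_fn, x, params, mode=True, device_ids=[0, 1])
    # x is scattered over both GPUs, each replica runs linear_fn with its own
    # parameter copies, and the outputs are gathered back on cuda:0 as (32, 8).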
Example #11
    def _data_parallel_master(self, intermediates):
        intermediates = sorted(intermediates,
                               key=lambda i: i[1].sum.get_device())
        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append(
                (rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))
        return outputs
Example #12
    def _data_parallel_master(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast it."""
        intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]  # flatten
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)

        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append((rec[0], _MasterMessage(*broadcasted[i*2:i*2+2])))

        return outputs
Example #13
    def backward(ctx, gradOutput):
        return Broadcast.apply(ctx.target_gpus, gradOutput)
Example #14
def broadcast_list(li, device_ids):
    l_copies = Broadcast.apply(device_ids, *li) # default broadcast not right?
    l_copies = [l_copies[i:i+len(li)]
                for i in range(0, len(l_copies), len(li))]
    return l_copies
Example #15
def replicate(network, devices, detach=False):
    from torch.nn.parallel._functions import Broadcast

    devices = tuple(devices)
    num_replicas = len(devices)

    params = list(network.parameters())
    param_indices = {param: idx for idx, param in enumerate(params)}
    param_copies = Broadcast.apply(devices, *params)
    if len(params) > 0:
        param_copies = [param_copies[i:i + len(params)]
                        for i in range(0, len(param_copies), len(params))]

    buffers = list(network._all_buffers())
    buffer_indices = {buf: idx for idx, buf in enumerate(buffers)}
    buffer_copies = comm.broadcast_coalesced(buffers, devices)

    modules = list(network.modules())
    module_copies = [[] for device in devices]
    module_indices = {}

    for i, module in enumerate(modules):
        module_indices[module] = i
        for j in range(num_replicas):
            replica = module.__new__(type(module))
            replica.__dict__ = module.__dict__.copy()
            replica._parameters = replica._parameters.copy()
            replica._buffers = replica._buffers.copy()
            replica._modules = replica._modules.copy()
            module_copies[j].append(replica)

    for i, module in enumerate(modules):
        for key, child in module._modules.items():
            if child is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._modules[key] = None
            else:
                module_idx = module_indices[child]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._modules[key] = module_copies[j][module_idx]
        for key, param in module._parameters.items():
            if param is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._parameters[key] = None
            else:
                param_idx = param_indices[param]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._parameters[key] = param_copies[j][param_idx].detach() \
                        if detach else param_copies[j][param_idx]
        for key, buf in module._buffers.items():
            if buf is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._buffers[key] = None
            else:
                buffer_idx = buffer_indices[buf]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._buffers[key] = buffer_copies[j][buffer_idx]

    return [module_copies[j][0] for j in range(num_replicas)]
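
A hedged usage sketch for this hand-rolled replicate. It assumes two CUDA devices, the torch.cuda.comm import the snippet relies on, and an older PyTorch release in which network._all_buffers() still exists (newer versions expose network.buffers() instead); the module and shapes are illustrative.

import torch
import torch.nn as nn

if torch.cuda.device_count() >= 2:
    net = nn.Linear(16, 4).cuda(0)
    replicas = replicate(net, devices=[0, 1], detach=False)
    # Each replica lives on its own device; with detach=False the broadcast
    # parameter copies keep autograd history back to the original parameters.
    out = replicas[1](torch.randn(2, 16, device="cuda:1"))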