from functools import partial

from torch.nn.parallel import scatter, parallel_apply, gather
from torch.nn.parallel._functions import Broadcast


def data_parallel(f, input, params, mode, device_ids, output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    # Copy every parameter tensor to every device, then regroup the flat result
    # into one parameter dict per device.
    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j * len(params)]
                        for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]

    # One callable per device, each closed over its own parameter copies.
    replicas = [partial(f, params=p, mode=mode) for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
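# Hedged usage sketch for data_parallel above: run a purely functional model whose
# parameters live in a dict. The model, the parameter shapes, and the assumption
# that CUDA devices 0 and 1 are available are illustrative, not from the original code.
import torch
import torch.nn.functional as F

def linear_model(input, params, mode):
    # `mode` would typically switch train/eval behaviour (e.g. dropout); unused here.
    return F.linear(input, params['weight'], params['bias'])

params = {
    'weight': torch.randn(10, 4, device='cuda:0', requires_grad=True),
    'bias': torch.zeros(10, device='cuda:0', requires_grad=True),
}
x = torch.randn(32, 4, device='cuda:0')
y = data_parallel(linear_model, x, params, mode=True, device_ids=[0, 1])  # (32, 10) on cuda:0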
def _data_parallel_master(self, intermediates):
    """Reduce the sum and square-sum, compute the statistics, and broadcast it."""
    # `ReduceAddCoalesced` and `Broadcast` come from torch.nn.parallel._functions;
    # `_MasterMessage` is defined in the surrounding module.
    # Sort by device id so every call reduces in the same device order.
    intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

    # Each intermediate is an (identifier, message) pair whose message carries the
    # per-GPU partial statistics; take the first two fields (sum, square-sum).
    to_reduce = [i[1][:2] for i in intermediates]
    to_reduce = [j for i in to_reduce for j in i]  # flatten
    target_gpus = [i[1].sum.get_device() for i in intermediates]

    sum_size = sum([i[1].sum_size for i in intermediates])
    sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
    mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

    # Send the global statistics back to every participating GPU.
    broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

    outputs = []
    for i, rec in enumerate(intermediates):
        outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

    return outputs
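# A minimal sketch of what `self._compute_mean_std` above is assumed to compute:
# turn the reduce-added sum / square-sum into a global mean and inverse standard
# deviation. The helper name comes from the snippet; the exact formula and the eps
# value are assumptions made for illustration.
import torch

def compute_mean_std_sketch(sum_, ssum, size, eps=1e-5):
    mean = sum_ / size                  # E[x]
    var = ssum / size - mean * mean     # E[x^2] - E[x]^2
    inv_std = (var + eps).rsqrt()       # 1 / sqrt(var + eps)
    return mean, inv_std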
def backward(ctx, gradOutput):
    # Replicate the incoming gradient onto every GPU recorded during the forward pass.
    return Broadcast.apply(ctx.target_gpus, gradOutput)
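# Hedged sketch of the kind of autograd Function this backward is assumed to belong
# to: the forward reduce-adds one replica per GPU onto a single device, so the
# backward (as above) simply broadcasts the incoming gradient back to every source
# GPU. The class name and the forward body are illustrative, not taken from the source.
import torch
from torch.cuda import comm
from torch.nn.parallel._functions import Broadcast

class ReduceAdd(torch.autograd.Function):
    @staticmethod
    def forward(ctx, *inputs):
        ctx.target_gpus = [inp.get_device() for inp in inputs]
        # Sum all replicas onto the device of the first input.
        return comm.reduce_add(inputs, destination=ctx.target_gpus[0])

    @staticmethod
    def backward(ctx, gradOutput):
        # One gradient copy per original input, placed back on that input's GPU.
        return Broadcast.apply(ctx.target_gpus, gradOutput)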
def broadcast_list(li, device_ids):
    l_copies = Broadcast.apply(device_ids, *li)  # default broadcast not right?
    l_copies = [l_copies[i:i + len(li)]
                for i in range(0, len(l_copies), len(li))]
    return l_copies
def broadcast_list(l, device_ids):
    """Broadcast a list of tensors, returning one list of copies per device."""
    l_copies = Broadcast.apply(device_ids, *l)
    l_copies = [l_copies[i:i + len(l)]
                for i in range(0, len(l_copies), len(l))]
    return l_copies
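# Hedged usage sketch for broadcast_list above: copy a list of CUDA tensors onto two
# GPUs and get back one list of copies per device. The availability of cuda:0 and
# cuda:1 and the tensor shapes are assumptions for illustration.
import torch

tensors = [torch.randn(3, device='cuda:0'), torch.ones(5, device='cuda:0')]
copies = broadcast_list(tensors, device_ids=[0, 1])
# copies[0] lives on cuda:0 and copies[1] on cuda:1; each has the same length as `tensors`.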
def replicate(network, devices, copy_parameters=False, copy_buffers=False):
    devices = tuple(devices)
    num_replicas = len(devices)

    # Parameters: either broadcast them (differentiable, so gradients flow back to
    # the originals) or copy them with the `_copy_parameters` helper (not shown here).
    params = list(network.parameters())
    param_indices = {param: idx for idx, param in enumerate(params)}
    if not copy_parameters:
        from torch.nn.parallel._functions import Broadcast
        param_copies = Broadcast(devices)(*params)
        if len(params) > 0:
            param_copies = [param_copies[i:i + len(params)]
                            for i in range(0, len(param_copies), len(params))]
    else:
        param_copies = _copy_parameters(params, devices)

    # Buffers are broadcast (or copied) the same way.
    buffers = list(network._all_buffers())
    buffer_indices = {buf: idx for idx, buf in enumerate(buffers)}
    if not copy_buffers:
        buffer_copies = comm.broadcast_coalesced(buffers, devices)
    else:
        buffer_copies = _copy_parameters(buffers, devices)

    # Shallow-copy every module once per device.
    modules = list(network.modules())
    module_copies = [[] for device in devices]
    module_indices = {}

    for i, module in enumerate(modules):
        module_indices[module] = i
        for j in range(num_replicas):
            replica = module.__new__(type(module))
            replica.__dict__ = module.__dict__.copy()
            replica._parameters = replica._parameters.copy()
            replica._buffers = replica._buffers.copy()
            replica._modules = replica._modules.copy()
            module_copies[j].append(replica)

    # Re-wire children, parameters, and buffers of every replica to point at the
    # per-device copies instead of the originals.
    for i, module in enumerate(modules):
        for key, child in module._modules.items():
            if child is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._modules[key] = None
            else:
                module_idx = module_indices[child]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._modules[key] = module_copies[j][module_idx]
        for key, param in module._parameters.items():
            if param is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._parameters[key] = None
            else:
                param_idx = param_indices[param]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._parameters[key] = param_copies[j][param_idx]
        for key, buf in module._buffers.items():
            if buf is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._buffers[key] = None
            else:
                buffer_idx = buffer_indices[buf]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._buffers[key] = buffer_copies[j][buffer_idx]

    return [module_copies[j][0] for j in range(num_replicas)]
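# Hedged usage sketch for replicate above: one replica per GPU of a small module.
# It assumes an older PyTorch where Module._all_buffers() and the legacy
# Broadcast(devices)(*params) call still exist, that `comm` is torch.cuda.comm and
# `_copy_parameters` is defined in the same module, and that GPUs 0 and 1 are available.
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2)).cuda(0)
replicas = replicate(net, devices=[0, 1])  # parameters/buffers broadcast to both GPUs
outputs = [replicas[j](torch.randn(16, 4, device=f'cuda:{j}')) for j in range(2)]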