def broadcast_list(l, device_ids):
    """Broadcast a list of tensors to every device and return one list of copies per device."""
    l_copies = Broadcast.apply(device_ids, *l)
    l_copies = [l_copies[i:i + len(l)]
                for i in range(0, len(l_copies), len(l))]
    return l_copies
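# Usage sketch (illustrative, not from the original source): assuming
# `Broadcast` is torch.nn.parallel._functions.Broadcast and at least two CUDA
# devices are visible, broadcast_list returns one list of tensor copies per
# entry in device_ids, in the same order as the input list.
def _example_broadcast_list():
    import torch

    tensors = [torch.randn(4, device='cuda:0'), torch.ones(3, device='cuda:0')]
    copies = broadcast_list(tensors, [0, 1])
    # copies[0] holds the copies on cuda:0, copies[1] those on cuda:1;
    # each inner list has len(tensors) entries.
    assert len(copies) == 2 and len(copies[0]) == len(tensors)
    return copies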
def _data_parallel_master(self, intermediates):
    """Reduce the sum and square-sum, compute the statistics, and broadcast it."""
    # Always using the same device order makes the ReduceAdd operation faster.
    # Thanks to: Tete Xiao (http://tetexiao.com/)
    intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

    to_reduce = [i[1][:2] for i in intermediates]
    to_reduce = [j for i in to_reduce for j in i]  # flatten
    target_gpus = [i[1].sum.get_device() for i in intermediates]

    sum_size = sum([i[1].sum_size for i in intermediates])
    sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
    mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)
    broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

    outputs = []
    for i, rec in enumerate(intermediates):
        outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

    return outputs
def _data_parallel_master(self, intermediates):
    """Reduce the sum and square-sum, compute the statistics, and broadcast it."""
    # Always using the same device order makes the ReduceAdd operation faster.
    # Thanks to: Tete Xiao (http://tetexiao.com/)
    intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

    to_reduce = [i[1][:2] for i in intermediates]
    to_reduce = [j for i in to_reduce for j in i]  # flatten
    target_gpus = [i[1].sum.get_device() for i in intermediates]

    sum_size = sum([i[1].sum_size for i in intermediates])
    sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
    mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)
    broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

    outputs = []
    for i, rec in enumerate(intermediates):
        outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

    return outputs
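# For reference, a minimal sketch of what _compute_mean_std is expected to
# return from the reduced sum and square-sum (assumptions: the usual biased
# variance and an `eps` attribute; the real method typically also updates
# running_mean / running_var, which is omitted here).
def _compute_mean_std_sketch(self, sum_, ssum, size):
    mean = sum_ / size                       # E[x]
    sumvar = ssum - sum_ * mean              # sum((x - E[x])^2)
    bias_var = sumvar / size                 # biased Var[x]
    inv_std = (bias_var + self.eps) ** -0.5  # 1 / sqrt(Var[x] + eps)
    return mean, inv_std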
def data_parallel(f, input, params, stats, mode, device_ids, output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, stats, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j * len(params)]
                        for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]

    stats_replicas = [dict(zip(stats.keys(), p))
                      for p in comm.broadcast_coalesced(list(stats.values()), device_ids)]

    replicas = [partial(f, params=p, stats=s, mode=mode)
                for p, s in zip(params_replicas, stats_replicas)]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
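# Usage sketch (illustrative, names are hypothetical): `f` is a functional
# model taking (input, params, stats, mode); `params` and `stats` are dicts of
# tensors that already live on device_ids[0].
def _example_data_parallel(f, params, stats):
    import torch

    x = torch.randn(8, 3, 32, 32, device='cuda:0')
    # Replicates params/stats onto GPUs 0 and 1, scatters x along dim 0,
    # runs f on each shard, and gathers the outputs back onto GPU 0.
    return data_parallel(f, x, params, stats, mode=True, device_ids=[0, 1])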
def forward(self, *inputs, **kwargs):
    batch_size = inputs[0].size()[0]
    self.batch_size = batch_size
    # assert batch_size % len(self.device_ids) == 0

    # Data parallel
    inputs, kwargs = scatter_kwargs(inputs, kwargs, self.device_ids)
    if len(self.device_ids) == 1:
        return self.module(*inputs[0], **kwargs[0])
    else:
        replicas = replicate(self.module, self.device_ids[:len(inputs)])

        # Broadcast each theta tensor to every device and regroup the copies per device.
        theta_list = [[] for _ in self.device_ids]
        for t in self.theta:
            t_ = Broadcast.apply(self.device_ids, t)
            for dev in range(len(self.device_ids)):
                theta_list[dev].append(t_[dev])
        for i, k in enumerate(kwargs):
            k['theta_list'] = theta_list[i]

        outputs = parallel_apply(replicas, inputs, kwargs,
                                 self.device_ids[:len(replicas)])
        outputs = gather(outputs, self.device_ids[0])
        return [o.mean() for o in outputs]
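# Usage sketch (illustrative): the wrapper above is assumed to hold a module,
# a list of device ids, and a list of `theta` tensors; each forward call
# scatters the inputs, broadcasts every theta tensor, and hands each replica
# its per-device copies through kwargs['theta_list'].
def _example_theta_forward(wrapper):
    import torch

    x = torch.randn(16, 3, 32, 32, device='cuda:0')
    losses = wrapper(x)   # list of scalar means of the gathered outputs
    loss = sum(losses)
    loss.backward()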
def data_parallel(f, input, params, stats, mode, device_ids, output_device=None):
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, stats, mode)

    def replicate(param_dict, g):
        replicas = [{} for d in device_ids]
        for k, v in param_dict.items():
            for i, u in enumerate(g(v)):
                replicas[i][k] = u
        return replicas

    params_replicas = replicate(params, lambda x: Broadcast.apply(device_ids, x))
    stats_replicas = replicate(stats, lambda x: comm.broadcast(x, device_ids))

    replicas = [partial(f, params=p, stats=s, mode=mode)
                for p, s in zip(params_replicas, stats_replicas)]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
def _coalesce_and_compute(self, intermediates):
    """Reduce the sum and square-sum, compute the statistics, and broadcast it."""
    # Ensure that the master is the first one.
    intermediates = sorted(intermediates, key=lambda i: i[0])

    # Get the sum & square-sum from every device and flatten them.
    to_reduce = [i[1][:2] for i in intermediates]
    to_reduce = [j for i in to_reduce for j in i]

    # Total size of the data across all devices.
    sum_size = sum([i[1].sum_size for i in intermediates])

    # Device of every copy.
    target_gpus = [i[1].sum.get_device() for i in intermediates]

    # Add the sums & square-sums from every copy and put the result on the
    # master device; 2 means there are two tensors per device (sum and ssum).
    sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
    mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

    # Copies of the results to be broadcast to every device.
    broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

    outputs = []
    for i, rec in enumerate(intermediates):
        outputs.append((rec[0], _MessageToBroadcast(*broadcasted[i * 2:i * 2 + 2])))

    return outputs
def allreduce(*inputs):
    """Cross-GPU all-reduce autograd operation used to compute the mean and
    variance in SyncBN.
    """
    target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
    result = ReduceAddCoalesced.apply(target_gpus[0], 1, *inputs)
    outputs = Broadcast.apply(target_gpus, *result)
    assert len(outputs) == len(inputs)
    return outputs
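# Usage sketch (illustrative): each GPU holds one partial tensor; allreduce
# sums them on the first GPU and broadcasts the total back, so every device
# ends up with the same fully reduced tensor.
def _example_allreduce():
    import torch

    partial0 = torch.tensor([1.0, 2.0], device='cuda:0')
    partial1 = torch.tensor([3.0, 4.0], device='cuda:1')
    total0, total1 = allreduce(partial0, partial1)
    # total0 (on cuda:0) and total1 (on cuda:1) both equal tensor([4., 6.])
    return total0, total1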
def data_parallel(f, input, params, mode, device_ids, output_device=None):
    assert isinstance(device_ids, list)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j * len(params)]
                        for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]

    replicas = [partial(f, params=p, mode=mode) for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
def data_parallel(f, input, params, mode, device_ids, output_device=None):
    device_ids = list(device_ids)
    if output_device is None:
        output_device = device_ids[0]

    if len(device_ids) == 1:
        return f(input, params, mode)

    params_all = Broadcast.apply(device_ids, *params.values())
    params_replicas = [{k: params_all[i + j * len(params)]
                        for i, k in enumerate(params.keys())}
                       for j in range(len(device_ids))]

    replicas = [partial(f, params=p, mode=mode) for p in params_replicas]
    inputs = scatter([input], device_ids)
    outputs = parallel_apply(replicas, inputs)
    return gather(outputs, output_device)
def _data_parallel_master(self, intermediates):
    intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

    to_reduce = [i[1][:2] for i in intermediates]
    to_reduce = [j for i in to_reduce for j in i]
    target_gpus = [i[1].sum.get_device() for i in intermediates]

    sum_size = sum([i[1].sum_size for i in intermediates])
    sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
    mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)
    broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

    outputs = []
    for i, rec in enumerate(intermediates):
        outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

    return outputs
def _data_parallel_master(self, intermediates):
    """Reduce the sum and square-sum, compute the statistics, and broadcast it."""
    intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

    to_reduce = [i[1][:2] for i in intermediates]
    to_reduce = [j for i in to_reduce for j in i]  # flatten
    target_gpus = [i[1].sum.get_device() for i in intermediates]

    sum_size = sum([i[1].sum_size for i in intermediates])
    sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
    mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)
    broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

    outputs = []
    for i, rec in enumerate(intermediates):
        outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

    return outputs
def backward(ctx, gradOutput):
    return Broadcast.apply(ctx.target_gpus, gradOutput)
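# Context sketch (illustrative): a backward like the one above typically
# belongs to an autograd Function whose forward reduce-adds tensors from
# several GPUs onto one device, so the incoming gradient has to be broadcast
# back to the contributing GPUs. A minimal pairing, assuming
# torch.nn.parallel._functions.{Broadcast, ReduceAddCoalesced}:
from torch.autograd import Function
from torch.nn.parallel._functions import Broadcast, ReduceAddCoalesced


class _ReduceAddSketch(Function):

    @staticmethod
    def forward(ctx, destination, *inputs):
        # Remember where each input came from, then sum them onto `destination`.
        ctx.target_gpus = [inp.get_device() for inp in inputs]
        return ReduceAddCoalesced.apply(destination, 1, *inputs)[0]

    @staticmethod
    def backward(ctx, gradOutput):
        # No gradient for `destination`; every contributing GPU receives a
        # copy of the output's gradient.
        return (None,) + Broadcast.apply(ctx.target_gpus, gradOutput)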
def broadcast_list(li, device_ids):
    l_copies = Broadcast.apply(device_ids, *li)  # default broadcast not right?
    l_copies = [l_copies[i:i + len(li)]
                for i in range(0, len(l_copies), len(li))]
    return l_copies
def replicate(network, devices, detach=False):
    from torch.nn.parallel._functions import Broadcast

    devices = tuple(devices)
    num_replicas = len(devices)

    params = list(network.parameters())
    param_indices = {param: idx for idx, param in enumerate(params)}
    param_copies = Broadcast.apply(devices, *params)
    if len(params) > 0:
        param_copies = [param_copies[i:i + len(params)]
                        for i in range(0, len(param_copies), len(params))]

    buffers = list(network._all_buffers())
    buffer_indices = {buf: idx for idx, buf in enumerate(buffers)}
    buffer_copies = comm.broadcast_coalesced(buffers, devices)

    modules = list(network.modules())
    module_copies = [[] for device in devices]
    module_indices = {}

    for i, module in enumerate(modules):
        module_indices[module] = i
        for j in range(num_replicas):
            replica = module.__new__(type(module))
            replica.__dict__ = module.__dict__.copy()
            replica._parameters = replica._parameters.copy()
            replica._buffers = replica._buffers.copy()
            replica._modules = replica._modules.copy()
            module_copies[j].append(replica)

    for i, module in enumerate(modules):
        for key, child in module._modules.items():
            if child is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._modules[key] = None
            else:
                module_idx = module_indices[child]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._modules[key] = module_copies[j][module_idx]
        for key, param in module._parameters.items():
            if param is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._parameters[key] = None
            else:
                param_idx = param_indices[param]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._parameters[key] = param_copies[j][param_idx].detach() \
                        if detach else param_copies[j][param_idx]
        for key, buf in module._buffers.items():
            if buf is None:
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._buffers[key] = None
            else:
                buffer_idx = buffer_indices[buf]
                for j in range(num_replicas):
                    replica = module_copies[j][i]
                    replica._buffers[key] = buffer_copies[j][buffer_idx]

    return [module_copies[j][0] for j in range(num_replicas)]
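# Usage sketch (illustrative): replicate copies a network onto several devices
# (optionally detaching the parameter copies) so the replicas can be driven by
# parallel_apply on scattered inputs. Note that network._all_buffers() above
# only exists in older PyTorch releases; newer ones expose Module.buffers().
def _example_replicate():
    import torch
    import torch.nn as nn
    from torch.nn.parallel import parallel_apply, scatter, gather

    net = nn.Linear(10, 5).cuda(0)
    devices = [0, 1]
    replicas = replicate(net, devices, detach=False)

    inputs = scatter(torch.randn(8, 10, device='cuda:0'), devices)
    outputs = parallel_apply(replicas, list(inputs))
    return gather(outputs, 0)  # concatenated back on cuda:0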