def Center_loss_surgery(model):
    # broadcast parameters
    blobs = [
        'gpu_' + str(gpu_id) + '/center_feature'
        for gpu_id in range(cfg.NUM_GPUS)
    ]
    data = workspace.FetchBlob(blobs[0])
    for i, p in enumerate(blobs[1:]):
        with c2_utils.CudaScope(i + 1):
            workspace.FeedBlob(p, data)
    # sync parameters
    with c2_utils.CudaScope(0):
        gradients = [
            'gpu_' + str(gpu_id) + '/center_feature_g'
            for gpu_id in range(cfg.NUM_GPUS)
        ]
        if cfg.USE_NCCL:
            model.net.NCCLAllreduce(gradients, gradients)
        else:
            muji.Allreduce(model.net, gradients, reduced_affix='')
    with c2_utils.CudaScope(0):
        gradients = [
            'gpu_' + str(gpu_id) + '/center_feature_n_u'
            for gpu_id in range(cfg.NUM_GPUS)
        ]
        if cfg.USE_NCCL:
            model.net.NCCLAllreduce(gradients, gradients)
        else:
            muji.Allreduce(model.net, gradients, reduced_affix='')
def build_data_parallel_model(model, single_gpu_build_func):
    if model.train:
        all_loss_gradients = {}  # Will include loss gradients from all GPUs
        # Build the model on each GPU with correct name and device scoping
        for gpu_id in range(cfg.NUM_GPUS):
            with core.NameScope('gpu_{}'.format(gpu_id)):
                with core.DeviceScope(muji.OnGPU(gpu_id)):
                    all_loss_gradients.update(
                        single_gpu_build_func(model))
        # Add backward pass on all GPUs
        model.AddGradientOperators(all_loss_gradients)
        if cfg.NUM_GPUS > 1:
            # Need to all-reduce the per-GPU gradients if training with more
            # than 1 GPU
            all_params = model.TrainableParams()
            assert len(all_params) % cfg.NUM_GPUS == 0, \
                'This should not happen.'
            # The model parameters are replicated on each GPU; get the number
            # of distinct parameter blobs (i.e., the number of parameter
            # blobs on each GPU)
            params_per_gpu = int(len(all_params) / cfg.NUM_GPUS)
            with core.DeviceScope(muji.OnGPU(cfg.ROOT_GPU_ID)):
                # Iterate over distinct parameter blobs
                for i in range(params_per_gpu):
                    # Gradients from all GPUs for this parameter blob
                    gradients = [
                        model.param_to_grad[p]
                        for p in all_params[i::params_per_gpu]
                    ]
                    if len(gradients) > 0:
                        if cfg.USE_NCCL:
                            model.net.NCCLAllreduce(gradients, gradients)
                        else:
                            muji.Allreduce(
                                model.net, gradients, reduced_affix='')
        for gpu_id in range(cfg.NUM_GPUS):
            # After all-reduce, all GPUs perform SGD updates on their
            # identical params and gradients in parallel
            add_parameter_update_ops(model, gpu_id)
    else:
        # Testing only supports running on a single GPU
        with core.NameScope('gpu_{}'.format(cfg.ROOT_GPU_ID)):
            with core.DeviceScope(muji.OnGPU(cfg.ROOT_GPU_ID)):
                single_gpu_build_func(model)
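# The function above relies on the property that, after gradients are
# all-reduced (summed) across GPUs, every GPU holds identical gradients and
# can apply the same SGD update to its own parameter copy, keeping replicas
# in sync without an explicit parameter broadcast each step. A minimal NumPy
# sketch of that idea (not Detectron/Caffe2 code; `params`, `per_gpu_grads`,
# and the learning rate are illustrative only):
import numpy as np

num_gpus = 4
params = [np.ones(3, dtype=np.float32) for _ in range(num_gpus)]  # replicated weights
per_gpu_grads = [np.random.randn(3).astype(np.float32) for _ in range(num_gpus)]

# Allreduce: every replica ends up holding the sum of all per-GPU gradients
reduced = np.sum(per_gpu_grads, axis=0)
per_gpu_grads = [reduced.copy() for _ in range(num_gpus)]

# Each "GPU" now performs the same update, so the replicas remain identical
lr = 0.01
params = [p - lr * g for p, g in zip(params, per_gpu_grads)]
assert all(np.allclose(params[0], p) for p in params[1:])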
def _add_allreduce_graph(model):
    """Construct the graph that performs Allreduce on the gradients."""
    # Need to all-reduce the per-GPU gradients if training with more than
    # 1 GPU
    all_params = model.TrainableParams()
    assert len(all_params) % cfg.NUM_GPUS == 0
    # The model parameters are replicated on each GPU; get the number of
    # distinct parameter blobs (i.e., the number of parameter blobs on
    # each GPU)
    params_per_gpu = int(len(all_params) / cfg.NUM_GPUS)
    with c2_utils.CudaScope(0):
        # Iterate over distinct parameter blobs
        for i in range(params_per_gpu):
            # Gradients from all GPUs for this parameter blob
            gradients = [
                model.param_to_grad[p]
                for p in all_params[i::params_per_gpu]
            ]
            if len(gradients) > 0:
                if cfg.USE_NCCL:
                    model.net.NCCLAllreduce(gradients, gradients)
                else:
                    muji.Allreduce(model.net, gradients, reduced_affix='')
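# The `all_params[i::params_per_gpu]` slicing used above works because
# TrainableParams() returns the replicated parameters grouped per GPU, with
# the same blob order on every GPU. A small plain-Python sketch of that
# grouping (the blob names are illustrative only):
all_params = [
    'gpu_0/conv1_w', 'gpu_0/conv1_b',
    'gpu_1/conv1_w', 'gpu_1/conv1_b',
]
num_gpus = 2
params_per_gpu = len(all_params) // num_gpus  # 2 distinct parameter blobs
for i in range(params_per_gpu):
    # One group per distinct parameter: the same blob on every GPU
    print(all_params[i::params_per_gpu])
# ['gpu_0/conv1_w', 'gpu_1/conv1_w']
# ['gpu_0/conv1_b', 'gpu_1/conv1_b']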
def test_timings(self):
    for n in range(2, workspace.NumCudaDevices()):
        for in_place in [False, True]:
            xs = [
                np.random.randn(int(1e7)).astype(np.float32)
                for i in range(n)
            ]
            inputs = [str("x_{}".format(i)) for i in range(n)]
            prefix = "" if in_place else "o"
            outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
            net = core.Net("test")
            net.NCCLAllreduce(inputs, outputs)
            net.RunAllOnGPU()
            for i in range(n):
                self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
            self.ws.run(net)
            net_time = benchmark(self.ws, net)
            vanilla = core.Net("vanilla")
            muji.Allreduce(vanilla, inputs)
            vanilla_time = benchmark(self.ws, vanilla)
            print("Speedup for NCCL: {:.2f}".format(
                vanilla_time / net_time))
def test_timings(self):
    for n in range(2, workspace.NumCudaDevices()):
        for in_place in [False, True]:
            xs = [
                np.random.randn(int(1e7)).astype(np.float32)
                for i in range(n)
            ]
            inputs = [str("x_{}".format(i)) for i in range(n)]
            prefix = "" if in_place else "o"
            outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
            net = core.Net("test")
            net.NCCLAllreduce(inputs, outputs)
            net.RunAllOnGPU()
            for i in range(n):
                workspace.FeedBlob(
                    inputs[i], xs[i], gpu_device(i).SerializeToString())
            workspace.RunNetOnce(net.Proto().SerializeToString())
            net_time = benchmark(net)
            vanilla = core.Net("vanilla")
            muji.Allreduce(vanilla, inputs)
            vanilla_time = benchmark(vanilla)
            print("Speedup for NCCL: {:.2f}".format(vanilla_time / net_time))
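# Both test variants call a `benchmark` helper that is defined elsewhere in
# their test modules and is not shown here. A minimal sketch of what such a
# timing helper could look like, assuming the Workspace-object API from the
# first variant (`ws.run(net)` executes the net once); the name
# `benchmark_sketch` and the warmup/iteration counts are illustrative, not
# the values used in the actual tests:
import time

def benchmark_sketch(ws, net, warmups=5, iters=100):
    # Warm-up runs so one-time allocations do not skew the measurement
    for _ in range(warmups):
        ws.run(net)
    start = time.time()
    for _ in range(iters):
        ws.run(net)
    # Average wall-clock seconds per run of the net
    return (time.time() - start) / iters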