def add_parameter_update_ops(model, gpu_id):
    with core.DeviceScope(muji.OnGPU(gpu_id)):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            # Learning rate of 0 is a dummy value to be set properly at the
            # start of training
            lr = model.param_init_net.ConstantFill(
                [], 'lr', shape=[1], value=0.0)
            one = model.param_init_net.ConstantFill(
                [], 'one', shape=[1], value=1.0)
            wd = model.param_init_net.ConstantFill(
                [], 'wd', shape=[1], value=cfg.SOLVER.WEIGHT_DECAY)

            for param in model.TrainableParams(gpu_id=gpu_id):
                logger.info('param ' + str(param) + ' will be updated')
                param_grad = model.param_to_grad[param]
                # Initialize momentum vector
                param_momentum = model.param_init_net.ConstantFill(
                    [param], param + '_momentum', value=0.0)
                if param in model.biases:
                    # Special treatment for biases (mainly to match
                    # historical impl. details):
                    # (1) Do not apply weight decay
                    # (2) Use a 2x higher learning rate
                    model.Scale(param_grad, param_grad, scale=2.0)
                elif cfg.SOLVER.WEIGHT_DECAY > 0:
                    # Apply weight decay to non-bias weights
                    model.WeightedSum(
                        [param_grad, one, param, wd], param_grad)
                # Update param_grad and param_momentum in place
                model.net.MomentumSGDUpdate(
                    [param_grad, param_momentum, lr, param],
                    [param_grad, param_momentum, param],
                    momentum=cfg.SOLVER.MOMENTUM)
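
# A rough NumPy sketch of the per-parameter update assembled above, assuming
# the standard (non-Nesterov) MomentumSGDUpdate semantics and WeightedSum
# computing grad <- 1 * grad + wd * param. This is illustrative only; names
# and the exact op semantics should be checked against the Caffe2 operators.
def sgd_update_sketch(param, grad, momentum_buf, lr, wd, momentum, is_bias):
    if is_bias:
        grad = 2.0 * grad            # biases: 2x learning rate, no weight decay
    elif wd > 0:
        grad = grad + wd * param     # L2 weight decay folded into the gradient
    adjusted = lr * grad + momentum * momentum_buf
    # Returns the updated parameter and the new momentum buffer
    return param - adjusted, adjusted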
def add_inputs(model, roidb=None):
    """Add network input ops. To be called *after* model_builder.create()."""
    # Implementation notes:
    #  - Typically, one would create the input ops and then the rest of the
    #    net. However, creating the input ops depends on loading the dataset,
    #    which can take a few minutes for COCO.
    #  - We prefer to avoid waiting so that debugging can fail fast.
    #  - Thus, we create the net *without input ops* prior to loading the
    #    dataset, and then add the input ops after loading the dataset.
    #  - Since we defer input op creation, we need to do a little bit of
    #    surgery to place the input ops at the start of the network op list
    #    (see the sketch after this function).
    if roidb is not None:
        # Make debugging easier when NUM_GPUS is 1 by only using one worker
        # thread for loading mini-batches
        num_workers = 1 if cfg.NUM_GPUS == 1 else cfg.NUM_WORKERS
        model.roi_data_loader = RoIDataLoader(
            roidb,
            num_workers=num_workers,
            num_enqueuers=1,
            minibatch_queue_size=cfg.TRAIN.MINIBATCH_QUEUE_SIZE)
    orig_num_op = len(model.net._net.op)
    for gpu_id in range(cfg.NUM_GPUS):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(muji.OnGPU(gpu_id)):
                if model.train:
                    add_train_inputs(model)
                else:
                    add_test_inputs(model)
    # A little op surgery to move input ops to the start of the net
    diff = len(model.net._net.op) - orig_num_op
    new_op = model.net._net.op[-diff:] + model.net._net.op[:-diff]
    del model.net._net.op[:]
    model.net._net.op.extend(new_op)
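
# A minimal, self-contained sketch of the "op surgery" above: the last `diff`
# ops (the input ops that were just added) are rotated to the front of the op
# list while preserving relative order. Plain Python lists stand in for the
# protobuf repeated field; the op names here are illustrative only.
ops = ['conv1', 'relu1', 'loss', 'enqueue_gpu_0', 'dequeue_gpu_0']
diff = 2  # number of input ops appended at the end
rotated = ops[-diff:] + ops[:-diff]
assert rotated == ['enqueue_gpu_0', 'dequeue_gpu_0', 'conv1', 'relu1', 'loss']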
def main(opts):
    logger = logging.getLogger(__name__)
    roidb = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    logger.info('{:d} roidb entries'.format(len(roidb)))
    roi_data_loader = RoIDataLoader(
        roidb,
        num_loaders=cfg.DATA_LOADER.NUM_THREADS,
        minibatch_queue_size=cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE,
        blobs_queue_capacity=cfg.DATA_LOADER.BLOBS_QUEUE_CAPACITY)
    blob_names = roi_data_loader.get_output_names()

    net = core.Net('dequeue_net')
    net.type = 'dag'
    all_blobs = []
    for gpu_id in range(cfg.NUM_GPUS):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(muji.OnGPU(gpu_id)):
                for blob_name in blob_names:
                    blob = core.ScopedName(blob_name)
                    all_blobs.append(blob)
                    workspace.CreateBlob(blob)
                    logger.info('Creating blob: {}'.format(blob))
                net.DequeueBlobs(
                    roi_data_loader._blobs_queue_name, blob_names)
    logger.info("Protobuf:\n" + str(net.Proto()))

    if opts.profiler:
        import cProfile
        cProfile.runctx(
            'loader_loop(roi_data_loader)', globals(), locals(),
            sort='cumulative')
    else:
        loader_loop(roi_data_loader)

    roi_data_loader.register_sigint_handler()
    roi_data_loader.start(prefill=True)
    total_time = 0
    for i in range(opts.num_batches):
        start_t = time.time()
        for _ in range(opts.x_factor):
            workspace.RunNetOnce(net)
        total_time += (time.time() - start_t) / opts.x_factor
        logger.info(
            '{:d}/{:d}: Average dequeue time: {:.3f}s [{:d}/{:d}]'.format(
                i + 1, opts.num_batches, total_time / (i + 1),
                roi_data_loader._minibatch_queue.qsize(),
                cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE))
        # Sleep to simulate the time taken by running a little network
        time.sleep(opts.sleep_time)
    # To inspect:
    # blobs = workspace.FetchBlobs(all_blobs)
    # from IPython import embed; embed()
    logger.info('Shutting down data loader...')
    roi_data_loader.shutdown()
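
# A minimal sketch of the timing pattern used in the benchmark loop above:
# each measurement runs the net `x_factor` times, the per-run cost is
# accumulated, and a running average is reported. `dummy_workload` stands in
# for workspace.RunNetOnce(net); all names here are illustrative.
import time

def dummy_workload():
    time.sleep(0.001)

num_batches, x_factor, total_time = 5, 4, 0.0
for i in range(num_batches):
    start_t = time.time()
    for _ in range(x_factor):
        dummy_workload()
    total_time += (time.time() - start_t) / x_factor
    print('{:d}/{:d}: average time per run: {:.4f}s'.format(
        i + 1, num_batches, total_time / (i + 1)))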
def build_data_parallel_model(model, single_gpu_build_func):
    if model.train:
        all_loss_gradients = {}  # Will include loss gradients from all GPUs
        # Build the model on each GPU with correct name and device scoping
        for gpu_id in range(cfg.NUM_GPUS):
            with core.NameScope('gpu_{}'.format(gpu_id)):
                with core.DeviceScope(muji.OnGPU(gpu_id)):
                    all_loss_gradients.update(single_gpu_build_func(model))
        # Add backward pass on all GPUs
        model.AddGradientOperators(all_loss_gradients)
        if cfg.NUM_GPUS > 1:
            # Need to all-reduce the per-GPU gradients if training with more
            # than 1 GPU
            all_params = model.TrainableParams()
            assert len(all_params) % cfg.NUM_GPUS == 0, \
                'This should not happen.'
            # The model parameters are replicated on each GPU; get the number
            # of distinct parameter blobs (i.e., the number of parameter
            # blobs on each GPU)
            params_per_gpu = int(len(all_params) / cfg.NUM_GPUS)
            with core.DeviceScope(muji.OnGPU(cfg.ROOT_GPU_ID)):
                # Iterate over distinct parameter blobs
                for i in range(params_per_gpu):
                    # Gradients from all GPUs for this parameter blob
                    gradients = [
                        model.param_to_grad[p]
                        for p in all_params[i::params_per_gpu]
                    ]
                    if len(gradients) > 0:
                        if cfg.USE_NCCL:
                            model.net.NCCLAllreduce(gradients, gradients)
                        else:
                            muji.Allreduce(
                                model.net, gradients, reduced_affix='')
        for gpu_id in range(cfg.NUM_GPUS):
            # After all-reduce, all GPUs perform SGD updates on their
            # identical params and gradients in parallel
            add_parameter_update_ops(model, gpu_id)
    else:
        # Testing only supports running on a single GPU
        with core.NameScope('gpu_{}'.format(cfg.ROOT_GPU_ID)):
            with core.DeviceScope(muji.OnGPU(cfg.ROOT_GPU_ID)):
                single_gpu_build_func(model)
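
# A small self-contained sketch of the all_params[i::params_per_gpu] grouping
# used above: TrainableParams() returns the replicated parameters ordered GPU
# by GPU, so slicing with a stride of params_per_gpu picks out the same
# logical parameter across all GPUs. Names below are illustrative only.
all_params = [
    'gpu_0/conv_w', 'gpu_0/conv_b',   # replicas on GPU 0
    'gpu_1/conv_w', 'gpu_1/conv_b',   # replicas on GPU 1
]
num_gpus = 2
params_per_gpu = len(all_params) // num_gpus
for i in range(params_per_gpu):
    group = all_params[i::params_per_gpu]
    print(group)  # ['gpu_0/conv_w', 'gpu_1/conv_w'], then the conv_b group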
def get_net(data_loader, name):
    logger = logging.getLogger(__name__)
    blob_names = data_loader.get_output_names()
    net = core.Net(name)
    net.type = 'dag'
    for gpu_id in range(cfg.NUM_GPUS):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(muji.OnGPU(gpu_id)):
                for blob_name in blob_names:
                    blob = core.ScopedName(blob_name)
                    workspace.CreateBlob(blob)
                net.DequeueBlobs(data_loader._blobs_queue_name, blob_names)
    logger.info("Protobuf:\n" + str(net.Proto()))
    return net
def RunningAllreduceWithGPUs(self, gpu_ids, allreduce_function):
    """A base function to test different scenarios."""
    workspace.ResetWorkspace()
    net = core.Net("mujitest")
    for id in gpu_ids:
        net.ConstantFill(
            [], "testblob_gpu_" + str(id), shape=[1, 2, 3, 4],
            value=float(id + 1), device_option=muji.OnGPU(id))
    allreduce_function(
        net, ["testblob_gpu_" + str(i) for i in gpu_ids],
        "_reduced", gpu_ids)
    workspace.RunNetOnce(net)
    # Each blob is filled with id + 1, so the reduced value is
    # sum(gpu_ids) + len(gpu_ids)
    target_value = sum(gpu_ids) + len(gpu_ids)
    all_blobs = workspace.Blobs()
    all_blobs.sort()
    for blob in all_blobs:
        print('{} {}'.format(blob, workspace.FetchBlob(blob)))
    for id in gpu_ids:
        blob = workspace.FetchBlob("testblob_gpu_" + str(id) + "_reduced")
        np.testing.assert_array_equal(
            blob, target_value,
            err_msg="gpu id %d of %s" % (id, str(gpu_ids)))
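
# A quick worked check of target_value above, done in plain NumPy without any
# GPUs: every GPU's blob is filled with id + 1, so an allreduce (sum across
# GPUs) yields sum(id + 1 for id in gpu_ids) = sum(gpu_ids) + len(gpu_ids).
import numpy as np

gpu_ids = [0, 1, 2, 3]
blobs = [np.full([1, 2, 3, 4], float(i + 1)) for i in gpu_ids]
reduced = sum(blobs)
target_value = sum(gpu_ids) + len(gpu_ids)  # 6 + 4 = 10
np.testing.assert_array_equal(reduced, target_value)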