Code Example #1
def add_parameter_update_ops(model, gpu_id):
    with core.DeviceScope(muji.OnGPU(gpu_id)):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            # Learning rate of 0 is a dummy value to be set properly at the
            # start of training
            lr = model.param_init_net.ConstantFill(
                [], 'lr', shape=[1], value=0.0)
            one = model.param_init_net.ConstantFill(
                [], 'one', shape=[1], value=1.0)
            wd = model.param_init_net.ConstantFill(
                [], 'wd', shape=[1], value=cfg.SOLVER.WEIGHT_DECAY)

        for param in model.TrainableParams(gpu_id=gpu_id):
            logger.info('param ' + str(param) + ' will be updated')
            param_grad = model.param_to_grad[param]
            # Initialize momentum vector
            param_momentum = model.param_init_net.ConstantFill(
                [param], param + '_momentum', value=0.0)
            if param in model.biases:
                # Special treatment for biases (mainly to match historical impl.
                # details):
                # (1) Do not apply weight decay
                # (2) Use a 2x higher learning rate
                model.Scale(param_grad, param_grad, scale=2.0)
            elif cfg.SOLVER.WEIGHT_DECAY > 0:
                # Apply weight decay to non-bias weights
                model.WeightedSum([param_grad, one, param, wd], param_grad)
            # Update param_grad and param_momentum in place
            model.net.MomentumSGDUpdate(
                [param_grad, param_momentum, lr, param],
                [param_grad, param_momentum, param],
                momentum=cfg.SOLVER.MOMENTUM)
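
For reference, the arithmetic these operators perform can be written out in a
few lines of NumPy. This is a minimal sketch, not Detectron code; it assumes
Caffe2's plain (non-Nesterov) MomentumSGDUpdate semantics, i.e.
adjusted = lr * grad + momentum * m, then param -= adjusted.

import numpy as np

def sgd_step(param, grad, m, lr, momentum=0.9, wd=0.0005, is_bias=False):
    if is_bias:
        grad = 2.0 * grad             # biases: 2x learning rate, no decay
    elif wd > 0:
        grad = grad + wd * param      # WeightedSum: grad = 1*grad + wd*param
    adjusted = lr * grad + momentum * m   # assumed MomentumSGDUpdate rule
    return param - adjusted, adjusted     # (new param, new momentum)

w, m = sgd_step(np.ones(4), grad=np.full(4, 0.5), m=np.zeros(4), lr=0.01)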
Code Example #2
def add_inputs(model, roidb=None):
    """Add network input ops. To be called *after* model_bulder.create()."""
    # Implementation notes:
    #   Typically, one would create the input ops and then the rest of the net.
    #   However, creating the input ops depends on loading the dataset, which
    #   can take a few minutes for COCO.
    #   We prefer to avoid this wait so that net construction bugs fail fast.
    #   Thus, we create the net *without input ops* prior to loading the
    #   dataset, and then add the input ops after loading the dataset.
    #   Since we defer input op creation, we need to do a little bit of surgery
    #   to place the input ops at the start of the network op list.
    if roidb is not None:
        # Make debugging easier when NUM_GPUS is 1 by only using one worker
        # thread for loading mini-batches
        num_workers = 1 if cfg.NUM_GPUS == 1 else cfg.NUM_WORKERS
        model.roi_data_loader = RoIDataLoader(
            roidb, num_workers=num_workers, num_enqueuers=1,
            minibatch_queue_size=cfg.TRAIN.MINIBATCH_QUEUE_SIZE)
    orig_num_op = len(model.net._net.op)
    for gpu_id in range(cfg.NUM_GPUS):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(muji.OnGPU(gpu_id)):
                if model.train:
                    add_train_inputs(model)
                else:
                    add_test_inputs(model)
    # A little op surgery to move input ops to the start of the net
    diff = len(model.net._net.op) - orig_num_op
    new_op = model.net._net.op[-diff:] + model.net._net.op[:-diff]
    del model.net._net.op[:]
    model.net._net.op.extend(new_op)
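
The final three lines are easiest to see on a plain Python list: the ops
appended last (the input ops) get rotated to the front. A toy illustration
with made-up op names:

ops = ['conv1', 'relu1', 'fc']       # net built first, without input ops
orig_num_op = len(ops)
ops += ['enqueue', 'dequeue']        # input ops added after dataset loading
diff = len(ops) - orig_num_op
ops = ops[-diff:] + ops[:-diff]      # rotate the new ops to the front
assert ops == ['enqueue', 'dequeue', 'conv1', 'relu1', 'fc']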
Code Example #3
def main(opts):
    logger = logging.getLogger(__name__)
    roidb = combined_roidb_for_training(
        cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    logger.info('{:d} roidb entries'.format(len(roidb)))
    roi_data_loader = RoIDataLoader(
        roidb,
        num_loaders=cfg.DATA_LOADER.NUM_THREADS,
        minibatch_queue_size=cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE,
        blobs_queue_capacity=cfg.DATA_LOADER.BLOBS_QUEUE_CAPACITY
    )
    blob_names = roi_data_loader.get_output_names()

    net = core.Net('dequeue_net')
    net.type = 'dag'
    all_blobs = []
    for gpu_id in range(cfg.NUM_GPUS):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(muji.OnGPU(gpu_id)):
                for blob_name in blob_names:
                    blob = core.ScopedName(blob_name)
                    all_blobs.append(blob)
                    workspace.CreateBlob(blob)
                    logger.info('Creating blob: {}'.format(blob))
                net.DequeueBlobs(
                    roi_data_loader._blobs_queue_name, blob_names)
    logger.info("Protobuf:\n" + str(net.Proto()))

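    # NOTE: loader_loop is a helper defined elsewhere in this benchmark
    # script; it is not shown in this excerpt.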
    if opts.profiler:
        import cProfile
        cProfile.runctx(
            'loader_loop(roi_data_loader)', globals(), locals(),
            sort='cumulative')
    else:
        loader_loop(roi_data_loader)

    roi_data_loader.register_sigint_handler()
    roi_data_loader.start(prefill=True)
    total_time = 0
    for i in range(opts.num_batches):
        start_t = time.time()
        for _ in range(opts.x_factor):
            workspace.RunNetOnce(net)
        total_time += (time.time() - start_t) / opts.x_factor
        logger.info(
            '{:d}/{:d}: Average dequeue time: {:.3f}s  [{:d}/{:d}]'.format(
                i + 1, opts.num_batches, total_time / (i + 1),
                roi_data_loader._minibatch_queue.qsize(),
                cfg.DATA_LOADER.MINIBATCH_QUEUE_SIZE
            )
        )
        # Sleep to simulate the time taken by running a little network
        time.sleep(opts.sleep_time)
        # To inspect:
        # blobs = workspace.FetchBlobs(all_blobs)
        # from IPython import embed; embed()
    logger.info('Shutting down data loader...')
    roi_data_loader.shutdown()
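
The timing loop above amortizes timer overhead by running the net x_factor
times per measurement and logging a running average. The same pattern,
stripped of Caffe2 specifics (benchmark and run_once are hypothetical names):

import time

def benchmark(run_once, num_batches=10, x_factor=4, sleep_time=0.0):
    total_time = 0.0
    for i in range(num_batches):
        start_t = time.time()
        for _ in range(x_factor):
            run_once()
        # Amortize timer overhead over x_factor runs
        total_time += (time.time() - start_t) / x_factor
        print('{:d}/{:d}: average time: {:.3f}s'.format(
            i + 1, num_batches, total_time / (i + 1)))
        time.sleep(sleep_time)  # simulate other work between dequeues

benchmark(lambda: sum(x * x for x in range(10000)))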
Code Example #4
def build_data_parallel_model(model, single_gpu_build_func):
    if model.train:
        all_loss_gradients = {}  # Will include loss gradients from all GPUs
        # Build the model on each GPU with correct name and device scoping
        for gpu_id in range(cfg.NUM_GPUS):
            with core.NameScope('gpu_{}'.format(gpu_id)):
                with core.DeviceScope(muji.OnGPU(gpu_id)):
                    all_loss_gradients.update(
                        single_gpu_build_func(model))
        # Add backward pass on all GPUs
        model.AddGradientOperators(all_loss_gradients)
        if cfg.NUM_GPUS > 1:
            # Need to all-reduce the per-GPU gradients if training with more
            # than 1 GPU
            all_params = model.TrainableParams()
            assert len(all_params) % cfg.NUM_GPUS == 0, \
                'This should not happen.'
            # The model parameters are replicated on each GPU; get the number
            # of distinct parameter blobs (i.e., the number of parameter blobs
            # on each GPU)
            params_per_gpu = int(len(all_params) / cfg.NUM_GPUS)
            with core.DeviceScope(muji.OnGPU(cfg.ROOT_GPU_ID)):
                # Iterate over distinct parameter blobs
                for i in range(params_per_gpu):
                    # Gradients from all GPUs for this parameter blob
                    gradients = [
                        model.param_to_grad[p]
                        for p in all_params[i::params_per_gpu]
                    ]
                    if len(gradients) > 0:
                        if cfg.USE_NCCL:
                            model.net.NCCLAllreduce(gradients, gradients)
                        else:
                            muji.Allreduce(
                                model.net, gradients, reduced_affix='')
        for gpu_id in range(cfg.NUM_GPUS):
            # After all-reduce, all GPUs perform SGD updates on their identical
            # params and gradients in parallel
            add_parameter_update_ops(model, gpu_id)
    else:
        # Testing only supports running on a single GPU
        with core.NameScope('gpu_{}'.format(cfg.ROOT_GPU_ID)):
            with core.DeviceScope(muji.OnGPU(cfg.ROOT_GPU_ID)):
                single_gpu_build_func(model)
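
The strided slice all_params[i::params_per_gpu] is what groups replicas of
one logical parameter across GPUs: with parameters listed GPU-major, the
i-th parameter of every GPU sits params_per_gpu entries apart. A toy
illustration with made-up blob names:

all_params = ['gpu_0/w', 'gpu_0/b', 'gpu_1/w', 'gpu_1/b']
num_gpus = 2
params_per_gpu = len(all_params) // num_gpus
for i in range(params_per_gpu):
    print(all_params[i::params_per_gpu])
# GPU-major ordering yields:
#   ['gpu_0/w', 'gpu_1/w']
#   ['gpu_0/b', 'gpu_1/b']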
Code Example #5
def get_net(data_loader, name):
    logger = logging.getLogger(__name__)
    blob_names = data_loader.get_output_names()
    net = core.Net(name)
    net.type = 'dag'
    for gpu_id in range(cfg.NUM_GPUS):
        with core.NameScope('gpu_{}'.format(gpu_id)):
            with core.DeviceScope(muji.OnGPU(gpu_id)):
                for blob_name in blob_names:
                    blob = core.ScopedName(blob_name)
                    workspace.CreateBlob(blob)
                net.DequeueBlobs(data_loader._blobs_queue_name, blob_names)
    logger.info("Protobuf:\n" + str(net.Proto()))

    return net
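
Because core.ScopedName is called inside each gpu_{id} NameScope, every
logical blob name expands to a per-GPU name ('/' is Caffe2's scope
separator). A sketch of the names the loop creates, assuming the loader
outputs hypothetical blobs 'data' and 'labels':

blob_names = ['data', 'labels']   # hypothetical loader outputs
num_gpus = 2
scoped = ['gpu_{}/{}'.format(g, b)
          for g in range(num_gpus) for b in blob_names]
print(scoped)  # ['gpu_0/data', 'gpu_0/labels', 'gpu_1/data', 'gpu_1/labels']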
Code Example #6
File: muji_test.py  Project: winning1120xx/caffe2
  def RunningAllreduceWithGPUs(self, gpu_ids, allreduce_function):
    """A base function to test different scenarios."""
    workspace.ResetWorkspace()
    net = core.Net("mujitest")
    for id in gpu_ids:
      net.ConstantFill([], "testblob_gpu_" + str(id), shape=[1, 2, 3, 4],
                            value=float(id+1),
                            device_option=muji.OnGPU(id))
    allreduce_function(
      net, ["testblob_gpu_" + str(i) for i in gpu_ids],
      "_reduced", gpu_ids)
    workspace.RunNetOnce(net)
    target_value = sum(gpu_ids) + len(gpu_ids)
    all_blobs = workspace.Blobs()
    all_blobs.sort()
    for blob in all_blobs:
      print('{} {}'.format(blob, workspace.FetchBlob(blob)))

    for id in gpu_ids:
      blob = workspace.FetchBlob("testblob_gpu_" + str(i) + "_reduced")
      np.testing.assert_array_equal(
          blob, target_value, err_msg="gpu id %d of %s" % (id, str(gpu_ids)))
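
The expected value checked here follows directly from the fills: the blob on
GPU id holds the constant id + 1, so a summing all-reduce leaves
sum(gpu_ids) + len(gpu_ids) in every element. A NumPy sketch of that check:

import numpy as np

gpu_ids = [0, 1, 2, 3]
blobs = [np.full((1, 2, 3, 4), float(i + 1)) for i in gpu_ids]
reduced = sum(blobs)  # what the all-reduce should leave on every GPU
assert np.all(reduced == sum(gpu_ids) + len(gpu_ids))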