def _do_broadcast(all_blobs):
    assert len(all_blobs) % cfg.NUM_GPUS == 0, \
        ('Unexpected value for NUM_GPUS. Make sure you are not '
         'running single-GPU inference with NUM_GPUS > 1.')
    blobs_per_gpu = int(len(all_blobs) / cfg.NUM_GPUS)
    for i in range(blobs_per_gpu):
        # Blob i from every GPU; the copy on gpu_0 is the broadcast source
        blobs = all_blobs[i::blobs_per_gpu]
        data = workspace.FetchBlob(blobs[0])
        logger.debug('Broadcasting {} to'.format(str(blobs[0])))
        # Use a distinct loop variable so the outer `i` is not shadowed
        for gpu_i, p in enumerate(blobs[1:]):
            logger.debug(' |-> {}'.format(str(p)))
            with c2_utils.CudaScope(gpu_i + 1):
                workspace.FeedBlob(p, data)
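For orientation, here is a minimal, self-contained sketch of the blob layout _do_broadcast assumes (the blob names and two-GPU setup are hypothetical): the per-GPU blob lists are contiguous, so the stride slice all_blobs[i::blobs_per_gpu] collects replica i from every device.

# Hypothetical layout for two GPUs; gpu_0 holds the source copy of each blob.
all_blobs = ['gpu_0/conv1_w', 'gpu_0/conv1_b',
             'gpu_1/conv1_w', 'gpu_1/conv1_b']
blobs_per_gpu = len(all_blobs) // 2
for i in range(blobs_per_gpu):
    print(all_blobs[i::blobs_per_gpu])
# ['gpu_0/conv1_w', 'gpu_1/conv1_w']
# ['gpu_0/conv1_b', 'gpu_1/conv1_b']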
Example #2
    def InitializeLossWeight(self):
        weight_cls1 = np.array([0.5]).astype(np.float32)
        weight_cls2 = np.array([0.5]).astype(np.float32)
        weight_bbox1 = np.array([0.5]).astype(np.float32)
        weight_bbox2 = np.array([0.5]).astype(np.float32)
        for i in range(cfg.NUM_GPUS):
            with c2_utils.CudaScope(i):
                workspace.FeedBlob('gpu_{}/weight_cls1'.format(i), weight_cls1)
                workspace.FeedBlob('gpu_{}/weight_cls2'.format(i), weight_cls2)
                workspace.FeedBlob('gpu_{}/weight_bbox1'.format(i),
                                   weight_bbox1)
                workspace.FeedBlob('gpu_{}/weight_bbox2'.format(i),
                                   weight_bbox2)
Example #3
def _add_allreduce_graph(model):
    """Construct the graph that performs Allreduce on the gradients."""
    # Need to all-reduce the per-GPU gradients if training with more than 1 GPU
    all_params = model.TrainableParams()
    assert len(all_params) % cfg.NUM_GPUS == 0
    # The model parameters are replicated on each GPU. Get the number of
    # distinct parameter blobs (i.e., the number of parameter blobs on
    # each GPU).
    params_per_gpu = int(len(all_params) / cfg.NUM_GPUS)
    with c2_utils.CudaScope(0):
        # Iterate over distinct parameter blobs
        for i in range(params_per_gpu):
            # Gradients from all GPUs for this parameter blob
            gradients = [
                model.param_to_grad[p] for p in all_params[i::params_per_gpu]
            ]
            if len(gradients) > 0:
                if cfg.USE_NCCL:
                    model.net.NCCLAllreduce(gradients, gradients)
                else:
                    muji.Allreduce(model.net, gradients, reduced_affix='')
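As a sanity check on what the reduction computes, here is the same operation in plain NumPy (illustrative values only): after an allreduce, every replica holds the sum of all replicas' gradients for that parameter.

import numpy as np

per_gpu_grads = [np.array([1., 2.]), np.array([3., 4.])]  # one entry per GPU
reduced = np.sum(per_gpu_grads, axis=0)                   # elementwise sum
per_gpu_grads = [reduced.copy() for _ in per_gpu_grads]   # replicate result
print(per_gpu_grads[0])  # [4. 6.] on every 'GPU'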
Example #4
    def _CorrectMomentum(self, correction):
        """The MomentumSGDUpdate op implements the update V as

            V := mu * V + lr * grad,

        where mu is the momentum factor, lr is the learning rate, and grad is
        the stochastic gradient. Since V is not defined independently of the
        learning rate (as it should ideally be), when the learning rate is
        changed we should scale the update history V in order to make it
        compatible in scale with lr * grad.
        """
        logger.info(
            'Scaling update history by {:.6f} (new lr / old lr)'.format(
                correction))
        for i in range(cfg.NUM_GPUS):
            with c2_utils.CudaScope(i):
                for param in self.TrainableParams(gpu_id=i):
                    op = core.CreateOperator('Scale', [param + '_momentum'],
                                             [param + '_momentum'],
                                             scale=correction)
                    workspace.RunOperatorOnce(op)
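A quick numeric check of the docstring's argument, in plain NumPy (values are made up): if V was accumulated under the old learning rate, scaling it by new_lr / old_lr puts it back on the same scale as lr * grad under the new rate.

import numpy as np

old_lr, new_lr = 0.01, 0.001
grad = np.float32(2.0)
V = old_lr * grad                    # history accumulated under the old lr
V_corrected = V * (new_lr / old_lr)  # same scale as new_lr * grad
assert np.isclose(V_corrected, new_lr * grad)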
Example #5
    def UpdateLossWeight(self):
        scale = 10
        # Bias term to keep the weights positive
        bias = 0.5
        lr = workspace.FetchBlob('gpu_0/lr').astype(np.float32)
        weight_cls1 = workspace.FetchBlob('gpu_0/weight_cls1').astype(
            np.float32)
        weight_cls2 = workspace.FetchBlob('gpu_0/weight_cls2').astype(
            np.float32)
        weight_cls1 -= lr * scale * workspace.FetchBlob(
            'gpu_0/weight_cls1_grad') + bias
        weight_cls2 -= lr * scale * workspace.FetchBlob(
            'gpu_0/weight_cls2_grad') + bias
        # Normalize both weights by the same sum; dividing in place would
        # feed the already-normalized weight_cls1 into the second division
        cls_sum = weight_cls1 + weight_cls2
        weight_cls1 = weight_cls1 / cls_sum
        weight_cls2 = weight_cls2 / cls_sum

        weight_bbox1 = workspace.FetchBlob('gpu_0/weight_bbox1').astype(
            np.float32)
        weight_bbox2 = workspace.FetchBlob('gpu_0/weight_bbox2').astype(
            np.float32)
        weight_bbox1 -= lr * scale * workspace.FetchBlob(
            'gpu_0/weight_bbox1_grad') + bias
        weight_bbox2 -= lr * scale * workspace.FetchBlob(
            'gpu_0/weight_bbox2_grad') + bias
        bbox_sum = weight_bbox1 + weight_bbox2
        weight_bbox1 = weight_bbox1 / bbox_sum
        weight_bbox2 = weight_bbox2 / bbox_sum

        for i in range(cfg.NUM_GPUS):
            with c2_utils.CudaScope(i):
                workspace.FeedBlob('gpu_{}/weight_cls1'.format(i), weight_cls1)
                workspace.FeedBlob('gpu_{}/weight_cls2'.format(i), weight_cls2)
                workspace.FeedBlob('gpu_{}/weight_bbox1'.format(i),
                                   weight_bbox1)
                workspace.FeedBlob('gpu_{}/weight_bbox2'.format(i),
                                   weight_bbox2)
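A small NumPy illustration of why the shared denominator above matters (numbers are made up): normalizing in place feeds the already-normalized first weight into the second division.

import numpy as np

w1, w2 = np.float32(1.2), np.float32(0.4)

# Shared denominator: the pair sums to exactly 1.
s = w1 + w2
print(w1 / s, w2 / s)            # 0.75 0.25

# In-place ordering: the second division sees the rescaled w1.
w1_bug = w1 / (w1 + w2)          # 0.75
w2_bug = w2 / (w1_bug + w2)      # ~0.348, and 0.75 + 0.348 != 1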
Example #6
    def _SetNewLr(self, cur_lr, new_lr):
        """Do the actual work of updating the model and workspace blobs.
        """
        for i in range(cfg.NUM_GPUS):
            with c2_utils.CudaScope(i):
                workspace.FeedBlob('gpu_{}/lr'.format(i),
                                   np.array([new_lr], dtype=np.float32))

                lr_scale_new_param = cfg.SOLVER.LR_SCALE_NEW_PARAM
                workspace.FeedBlob(
                    'gpu_{}/lr_new_param'.format(i),
                    np.array([new_lr * lr_scale_new_param], dtype=np.float32))

                lr_scale_new_fc = cfg.SOLVER.LR_SCALE_NEW_FC
                workspace.FeedBlob(
                    'gpu_{}/lr_new_fc'.format(i),
                    np.array([new_lr * lr_scale_new_fc], dtype=np.float32))

        ratio = _get_lr_change_ratio(cur_lr, new_lr)
        if cfg.SOLVER.SCALE_MOMENTUM and cur_lr > 1e-7 and \
                ratio > cfg.SOLVER.SCALE_MOMENTUM_THRESHOLD:
            self._CorrectMomentum(new_lr / cur_lr)
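The helper _get_lr_change_ratio is not shown in this listing. A plausible sketch (an assumption, not the verified implementation): it returns a symmetric ratio, so both large increases and large decreases in lr trip the SCALE_MOMENTUM_THRESHOLD check.

import numpy as np

def _get_lr_change_ratio(cur_lr, new_lr):
    # Hypothetical reconstruction: symmetric in the two rates and guarded
    # against division by zero.
    eps = 1e-10
    return np.max((new_lr / np.maximum(cur_lr, eps),
                   cur_lr / np.maximum(new_lr, eps)))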
Example #7
    def GenerateProposals(self, blobs_in, blobs_out, anchors, spatial_scale):
        """Op for generating RPN porposals.

        blobs_in:
          - 'rpn_cls_probs': 4D tensor of shape (N, A, H, W), where N is the
            number of minibatch images, A is the number of anchors per
            location, and (H, W) is the spatial size of the prediction grid.
            Each value represents a "probability of object" rating in [0, 1].
          - 'rpn_bbox_pred': 4D tensor of shape (N, 4 * A, H, W) of predicted
            deltas for transforming anchor boxes into RPN proposals.
          - 'im_info': 2D tensor of shape (N, 3) where the three columns encode
            the input image's [height, width, scale]. Height and width are
            for the input to the network, not the original image; scale is the
            scale factor used to scale the original image to the network input
            size.

        blobs_out:
          - 'rpn_rois': 2D tensor of shape (R, 5), for R RPN proposals where the
            five columns encode [batch ind, x1, y1, x2, y2]. The boxes are
            w.r.t. the network input, which is a *scaled* version of the
            original image; these proposals must be scaled by 1 / scale (where
            scale comes from im_info; see above) to transform them back to the
            original input image coordinate system.
          - 'rpn_roi_probs': 1D tensor of objectness probability scores
            (extracted from rpn_cls_probs; see above).
        """
        cfg_key = 'TRAIN' if self.train else 'TEST'

        if cfg[cfg_key].GENERATE_PROPOSALS_ON_GPU:
            rpn_pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
            rpn_post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
            rpn_nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
            rpn_min_size = float(cfg[cfg_key].RPN_MIN_SIZE)

            input_name = str(blobs_in[0])
            lvl = int(input_name[-1]) if input_name[-1].isdigit() else None
            # Compare against None explicitly so a level of 0 is not dropped
            anchors_name = ('anchors{}'.format(lvl)
                            if lvl is not None else 'anchors')

            for i in range(cfg.NUM_GPUS):
                with c2_utils.CudaScope(i):
                    workspace.FeedBlob(
                        'gpu_{}/{}'.format(i, anchors_name),
                        anchors.astype(np.float32))

            self.net.GenerateProposals(
                blobs_in + [anchors_name],
                blobs_out,
                spatial_scale=spatial_scale,
                pre_nms_topN=rpn_pre_nms_topN,
                post_nms_topN=rpn_post_nms_topN,
                nms_thresh=rpn_nms_thresh,
                min_size=rpn_min_size,
            )
        else:
            name = 'GenerateProposalsOp:' + ','.join([str(b) for b in blobs_in])
            # spatial_scale passed to the Python op is only used in
            # convert_pkl_to_pb
            self.net.Python(
                GenerateProposalsOp(anchors, spatial_scale, self.train).forward
            )(blobs_in, blobs_out, name=name, spatial_scale=spatial_scale)

        return blobs_out
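A minimal NumPy sketch of the coordinate convention described in the docstring (values are illustrative): rpn_rois live in network-input coordinates, so dividing the box columns by the im_info scale maps them back to the original image.

import numpy as np

# One proposal: [batch ind, x1, y1, x2, y2] in scaled-image coordinates.
rpn_rois = np.array([[0., 100., 60., 300., 220.]], dtype=np.float32)
im_scale = 2.0  # third column of im_info for this image

rois_orig = rpn_rois.copy()
rois_orig[:, 1:5] /= im_scale  # back to original-image coordinates
print(rois_orig)               # [[  0.  50.  30. 150. 110.]]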
Example #8
def softmax_surgery(model):
    print('softmax surgery')

    gpu_prefixs = ['gpu_' + str(i) for i in range(cfg.NUM_GPUS)]

    old_ops = model.net._net.op[:]
    num_op = len(model.net._net.op)
    is_end = False

    del model.net._net.op[:]

    gpu_point = {gpu_prefix: -1 for gpu_prefix in gpu_prefixs}
    while True:
        for gpu_prefix in gpu_prefixs:
            for i, op in enumerate(old_ops):
                if i <= gpu_point[gpu_prefix]:
                    continue

                # Skip ops that belong to a different GPU's subnet
                gpu = op.input[0].split('/')[0]
                if gpu != gpu_prefix:
                    continue

                if op.type == 'Softmax' and 'fc8d_t' in op.input[0]:
                    gpu_point[gpu_prefix] = i
                    # print(op)
                    print('found softmax: ', op.input[0], '\t-->\t',
                          op.output[0])
                    break
                model.net._net.op.extend([op])

                if i == num_op - 1:
                    is_end = True

        if is_end:
            break

        # Stop if any GPU has not located its Softmax op yet (the original
        # hardcoded four GPUs; this form works for any cfg.NUM_GPUS)
        if any(gpu_point[gpu_prefix] == -1 for gpu_prefix in gpu_prefixs):
            break

        # All GPUs must have stopped at the same replicated blob
        first_blob_name = old_ops[gpu_point[gpu_prefixs[0]]].input[0].split(
            '/')[1]
        for gpu_prefix in gpu_prefixs[1:]:
            assert old_ops[gpu_point[gpu_prefix]].input[0].split(
                '/')[1] == first_blob_name

        in_blobs = []
        out_blobs = []
        for gpu_prefix in gpu_prefixs:
            in_blob = old_ops[gpu_point[gpu_prefix]].input[0]
            in_blobs.append(in_blob)

            out_blob = old_ops[gpu_point[gpu_prefix]].output[0]
            out_blobs.append(out_blob)
        in_blob_name = in_blobs[0].split('/')[1]
        out_blob_name = out_blobs[0].split('/')[1]

        for gpu_prefix in gpu_prefixs:
            gpu_id = int(gpu_prefix.split('_')[1])
            with c2_utils.CudaScope(gpu_id):
                for i in range(cfg.NUM_GPUS):
                    if gpu_id == i:
                        continue
                    model.net.Copy(
                        in_blobs[i],
                        gpu_prefix + '/' + in_blob_name + '_gpu_' + str(i))
                    model.net.StopGradient(
                        gpu_prefix + '/' + in_blob_name + '_gpu_' + str(i),
                        gpu_prefix + '/' + in_blob_name + '_gpu_' + str(i))
                concat_in_blobs = [
                    gpu_prefix + '/' + in_blob_name + '_gpu_' + str(i)
                    for i in range(cfg.NUM_GPUS)
                ]
                concat_in_blobs[gpu_id] = in_blobs[gpu_id]
                model.net.Concat(
                    concat_in_blobs,
                    [
                        gpu_prefix + '/' + in_blob_name + '_cross',
                        gpu_prefix + '/' + in_blob_name + '_cross_split_info'
                    ],
                    axis=1)

                op = old_ops[gpu_point[gpu_prefix]]
                op.input[0] = gpu_prefix + '/' + in_blob_name + '_cross'
                op.output[0] = gpu_prefix + '/' + out_blob_name + '_cross'
                model.net._net.op.extend([op])

                split_out_blobs = [
                    gpu_prefix + '/' + str(i) + '_useless'
                    for i in range(len(out_blobs))
                ]

                split_out_blobs[gpu_id] = out_blobs[gpu_id]
                model.net.Split(
                    [
                        gpu_prefix + '/' + out_blob_name + '_cross',
                        gpu_prefix + '/' + in_blob_name + '_cross_split_info'
                    ],
                    split_out_blobs,
                    axis=1)
    return

    # NOTE: unreachable debugging block, kept from the original; move it
    # above the `return` to dump the rewritten net and exit.
    num_op = len(model.net._net.op)
    for i, op in enumerate(model.net._net.op):
        print(op)

    exit(0)
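To summarize what the surgery achieves, here is the effect in plain NumPy (two 'GPUs' and tiny logit rows, all hypothetical): each replica's Softmax is rewritten to run over the concatenation of every replica's fc8d_t logits, and the result is split so each replica keeps only its own slice.

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

logits_gpu0 = np.array([[1.0, 2.0]])  # this GPU's slice of the logits
logits_gpu1 = np.array([[0.5, 0.5]])  # the other GPU's slice

# After surgery: Concat -> Softmax over the union -> Split back per GPU.
joint = softmax(np.concatenate([logits_gpu0, logits_gpu1], axis=1))
probs_gpu0, probs_gpu1 = np.split(joint, [logits_gpu0.shape[1]], axis=1)
print(probs_gpu0.sum() + probs_gpu1.sum())  # 1.0: one distribution overall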