Example #1
    def test(self):
        # mask
        with tf.Session() as sess:
            gt_masks = np.zeros((self.N, 100, 100), dtype=np.int32)

            rois = self.gt_boxes[:, :4]
            rois = rois + np.random.randint(-5, 5, (self.N, 4))
            rois[rois < 0] = 0
            bgs = np.random.randint(0, 60, (self.N + 2, 2))
            bgs = np.hstack(
                (bgs, bgs + np.random.randint(20, 30, (self.N + 2, 2))))
            bgs = bgs.astype(np.float32)
            rois = np.vstack((rois, bgs))
            print(rois)

            for i in range(self.N):
                x1, y1 = int(self.gt_boxes[i, 0] +
                             2), int(self.gt_boxes[i, 1] + 2)
                x2, y2 = int(self.gt_boxes[i, 2] -
                             1), int(self.gt_boxes[i, 3] - 1)
                gt_masks[i, y1:y2, x1:x2] = 1
            self.gt_masks = gt_masks

            labels, mask_targets, mask_inside_weights = \
                    mask_encoder(self.gt_masks, self.gt_boxes, rois, self.num_classes, 15, 15)
            self.labels = labels.eval()
            self.mask_targets = mask_targets.eval()
            self.mask_inside_weights = mask_inside_weights.eval()

            # print (self.mask_targets)
            print(self.labels)
            for i in range(rois.shape[0]):
                print(i, 'label:', self.labels[i])
                print(self.mask_targets[i, :, :, int(self.labels[i])])
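Note: the test reads self.N, self.gt_boxes, and self.num_classes, which this snippet does not define. A minimal setUp that would make it runnable might look like the sketch below; the fixture values are assumptions for illustration, not part of the original.

    import numpy as np
    import tensorflow as tf

    class MaskEncoderTest(tf.test.TestCase):
        def setUp(self):
            # Hypothetical fixture: N ground-truth boxes inside a 100x100
            # image, each row [x1, y1, x2, y2, class].
            self.N = 4
            self.num_classes = 81
            tl = np.random.randint(10, 50, (self.N, 2))
            br = tl + np.random.randint(30, 40, (self.N, 2))
            classes = np.random.randint(1, self.num_classes, (self.N, 1))
            self.gt_boxes = np.hstack((tl, br, classes)).astype(np.float32)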
Example #2
def build_losses(pyramid,
                 outputs,
                 gt_boxes,
                 gt_masks,
                 num_classes,
                 base_anchors,
                 rpn_box_lw=1.0,
                 rpn_cls_lw=1.0,
                 refined_box_lw=1.0,
                 refined_cls_lw=1.0,
                 mask_lw=1.0):
    """Building 3-way output losses, totally 5 losses
  Params:
  ------
  outputs: output of build_heads
  gt_boxes: A tensor of shape (G, 5), [x1, y1, x2, y2, class]
  gt_masks: A tensor of shape (G, ih, iw),  {0, 1}
  *_lw: loss weight of rpn, refined and mask losses
  
  Returns:
  -------
  l: a loss tensor
  """
    for i in range(5, 1, -1):
        p = 'P%d' % i
        stride = 2**i
        shape = tf.shape(pyramid[p])
        height, width = shape[1], shape[2]

        ### rpn losses
        # 1. encode ground truth
        # 2. compute distances
        all_anchors = gen_all_anchors(height, width, stride)
        labels, bbox_targets, bbox_inside_weights = \
          anchor_encoder(gt_boxes, all_anchors, height, width, stride, scope='AnchorEncoder')
        boxes = outputs[p]['rpn']['box']
        classes = tf.reshape(outputs[p]['rpn']['cls'],
                             (1, height, width, base_anchors, 2))

        labels, classes, boxes, bbox_targets, bbox_inside_weights = \
                _filter_negative_samples(tf.reshape(labels, [-1]), [
                    tf.reshape(labels, [-1]),
                    tf.reshape(classes, [-1, 2]),
                    tf.reshape(boxes, [-1, 4]),
                    tf.reshape(bbox_targets, [-1, 4]),
                    tf.reshape(bbox_inside_weights, [-1, 4])
                    ])
        rpn_box_loss = bbox_inside_weights * _smooth_l1_dist(
            boxes, bbox_targets)
        rpn_box_loss = tf.reshape(rpn_box_loss, [-1, 4])
        rpn_box_loss = tf.reduce_sum(rpn_box_loss, axis=1)
        rpn_box_loss = rpn_box_lw * tf.reduce_mean(rpn_box_loss)
        tf.add_to_collection(tf.GraphKeys.LOSSES, rpn_box_loss)

        # NOTE: examples with negative labels are ignored when computing the
        # one_hot_encoding and the entropy loss, BUT they still count in the
        # denominator of softmax_cross_entropy's mean, so the loss shrinks by
        # a factor of (non_negative_labels / all_labels).
        # So the best practice is still to gather all non-negative examples first.
        labels = slim.one_hot_encoding(
            labels, 2, on_value=1.0,
            off_value=0.0)  # this will set -1 label to all zeros
        rpn_cls_loss = rpn_cls_lw * tf.losses.softmax_cross_entropy(
            labels, classes)

        ### refined loss
        # 1. encode ground truth
        # 2. compute distances
        rois = outputs[p]['roi']['box']

        boxes = outputs[p]['refined']['box']
        classes = outputs[p]['refined']['cls']
        labels, bbox_targets, bbox_inside_weights = \
          roi_encoder(gt_boxes, rois, num_classes, scope='ROIEncoder')

        labels, classes, boxes, bbox_targets, bbox_inside_weights = \
                _filter_negative_samples(tf.reshape(labels, [-1]),[
                    tf.reshape(labels, [-1]),
                    tf.reshape(classes, [-1, num_classes]),
                    tf.reshape(boxes, [-1, num_classes * 4]),
                    tf.reshape(bbox_targets, [-1, num_classes * 4]),
                    tf.reshape(bbox_inside_weights, [-1, num_classes * 4])
                    ] )
        refined_box_loss = bbox_inside_weights * _smooth_l1_dist(
            boxes, bbox_targets)
        refined_box_loss = tf.reshape(refined_box_loss, [-1, 4])
        refined_box_loss = tf.reduce_sum(refined_box_loss, axis=1)
        refined_box_loss = refined_box_lw * tf.reduce_mean(refined_box_loss)
        tf.add_to_collection(tf.GraphKeys.LOSSES, refined_box_loss)

        labels = slim.one_hot_encoding(labels,
                                       num_classes,
                                       on_value=1.0,
                                       off_value=0.0)
        refined_cls_loss = refined_cls_lw * tf.losses.softmax_cross_entropy(
            labels, classes)

        ### mask loss
        # mask of shape (N, h, w, num_classes*2)
        masks = outputs[p]['mask']['mask']
        mask_shape = tf.shape(masks)
        masks = tf.reshape(masks, (mask_shape[0], mask_shape[1], mask_shape[2],
                                   tf.cast(mask_shape[3] / 2, tf.int32), 2))
        labels, mask_targets, mask_inside_weights = \
          mask_encoder(gt_masks, gt_boxes, rois, num_classes, 28, 28, scope='MaskEncoder')
        labels, masks, mask_targets, mask_inside_weights = \
                _filter_negative_samples(tf.reshape(labels, [-1]), [
                    tf.reshape(labels, [-1]),
                    masks,
                    mask_targets,
                    mask_inside_weights,
                    ])
        mask_targets = slim.one_hot_encoding(mask_targets,
                                             2,
                                             on_value=1.0,
                                             off_value=0.0)
        mask_binary_loss = mask_lw * tf.losses.softmax_cross_entropy(
            mask_targets, masks)

    return rpn_box_loss + rpn_cls_loss + refined_box_loss + refined_cls_loss + mask_binary_loss
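The NOTE in the snippet above can be made concrete: a sample with label -1 becomes an all-zero one-hot row, contributes nothing to the cross-entropy sum, but still counts in the denominator of the mean, so the loss shrinks by (non_negative / all). A small NumPy sketch with assumed values:

    import numpy as np

    def mean_softmax_xent(onehot, logits):
        # mean over rows of -sum(onehot * log_softmax(logits))
        z = logits - logits.max(axis=1, keepdims=True)
        log_p = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
        return (-(onehot * log_p).sum(axis=1)).mean()

    logits = np.array([[2.0, -1.0], [0.5, 0.5], [1.0, 3.0]])
    onehot = np.array([[1.0, 0.0], [0.0, 0.0], [0.0, 1.0]])  # middle row had label -1

    valid = onehot.sum(axis=1) > 0
    print(mean_softmax_xent(onehot, logits))                # diluted by the ignored row
    print(mean_softmax_xent(onehot[valid], logits[valid]))  # what filtering achieves

The first mean is exactly 2/3 of the second here, which is why every variant gathers the non-negative samples via _filter_negative_samples before computing the loss.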
Example #3
def build_losses(pyramid,
                 outputs,
                 gt_boxes,
                 gt_masks,
                 num_classes,
                 rpn_box_lw=1.0,
                 rpn_cls_lw=1.0,
                 refined_box_lw=1.0,
                 refined_cls_lw=1.0,
                 mask_lw=1.0):
    """Building 3-way output losses, totally 5 losses
  Params:
  ------
  outputs: output of build_heads
  gt_boxes: A tensor of shape (G, 5), [x1, y1, x2, y2, class]
  gt_masks: A tensor of shape (G, ih, iw),  {0, 1}
  *_lw: loss weight of rpn, refined and mask losses
  
  Returns:
  -------
  l: a loss tensor
  """
    for i in range(5, 1, -1):
        p = 'P%d' % i
        stride = 2**i
        shape = tf.shape(pyramid[p])
        height, width = shape[1], shape[2]

        ### rpn losses
        # 1. encode ground truth
        # 2. compute distances
        all_anchors = gen_all_anchors(height, width, stride)
        labels, bbox_targets, bbox_inside_weights = \
          anchor_encoder(gt_boxes, all_anchors, height, width, stride, scope='AnchorEncoder')
        boxes = outputs[p]['rpn']['box']
        classes = outputs[p]['rpn']['cls']
        rpn_box_loss = bbox_inside_weights * _smooth_l1_dist(
            boxes, bbox_targets)
        rpn_box_loss = tf.reshape(rpn_box_loss, [-1, 4])
        rpn_box_loss = tf.reduce_sum(rpn_box_loss, axis=1)
        rpn_box_loss = rpn_box_lw * tf.reduce_mean(rpn_box_loss)

        labels = slim.one_hot_encoding(labels, 2, on_value=1.0, off_value=0.0)
        rpn_cls_loss = rpn_cls_lw * tf.losses.softmax_cross_entropy(
            labels, classes)

        ### refined loss
        # 1. encode ground truth
        # 2. compute distances
        rois = outputs[p]['roi']['box']

        boxes = outputs[p]['refined']['box']
        classes = outputs[p]['refined']['cls']
        labels, bbox_targets, bbox_inside_weights = \
          roi_encoder(gt_boxes, rois, num_classes, scope='ROIEncoder')
        refined_box_loss = bbox_inside_weights * _smooth_l1_dist(
            boxes, bbox_targets)
        refined_box_loss = tf.reshape(refined_box_loss, [-1, 4])
        refined_box_loss = tf.reduce_sum(refined_box_loss, axis=1)
        refined_box_loss = refined_box_lw * tf.reduce_mean(refined_box_loss)

        labels = slim.one_hot_encoding(labels,
                                       num_classes,
                                       on_value=1.0,
                                       off_value=0.0)
        refined_cls_loss = refined_cls_lw * tf.losses.softmax_cross_entropy(
            labels, classes)

        ### mask loss
        # {'mask': m, 'classes': classes, 'scores': scores}
        masks = outputs[p]['mask']['mask']
        labels, mask_targets, mask_inside_weights = \
          mask_encoder(gt_masks, gt_boxes, rois, num_classes, 28, 28, scope='MaskEncoder')

    return
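_smooth_l1_dist is used by every variant but defined elsewhere in the repo. A common TF1 formulation of the Fast R-CNN smooth-L1 distance is sketched below; the sigma2 default is an assumption, not taken from this repo:

    import tensorflow as tf

    def _smooth_l1_dist(deltas, targets, sigma2=9.0):
        # Elementwise: 0.5 * sigma2 * d^2 where |d| < 1/sigma2,
        # and |d| - 0.5/sigma2 otherwise.
        diffs = deltas - targets
        inside = tf.cast(tf.less(tf.abs(diffs), 1.0 / sigma2), tf.float32)
        quad = 0.5 * sigma2 * tf.square(diffs)
        lin = tf.abs(diffs) - 0.5 / sigma2
        return quad * inside + lin * (1.0 - inside)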
Example #4
def build_losses(pyramid,
                 outputs,
                 gt_boxes,
                 gt_masks,
                 num_classes,
                 base_anchors,
                 rpn_box_lw=1.0,
                 rpn_cls_lw=1.0,
                 refined_box_lw=1.0,
                 refined_cls_lw=1.0,
                 mask_lw=1.0):
    """Building 3-way output losses, totally 5 losses
  Params:
  ------
  outputs: output of build_heads
  gt_boxes: A tensor of shape (G, 5), [x1, y1, x2, y2, class]
  gt_masks: A tensor of shape (G, ih, iw),  {0, 1}
  *_lw: loss weight of rpn, refined and mask losses
  
  Returns:
  -------
  l: a loss tensor
  """

    # losses for pyramid
    losses = []
    rpn_box_losses, rpn_cls_losses = [], []
    refined_box_losses, refined_cls_losses = [], []
    mask_losses = []

    # watch some info during training
    rpn_batch = []
    refine_batch = []
    mask_batch = []
    rpn_batch_pos = []
    refine_batch_pos = []
    mask_batch_pos = []

    arg_scope = _extra_conv_arg_scope(activation_fn=None)
    with slim.arg_scope(arg_scope):
        with tf.variable_scope('pyramid'):

            ## assigning gt_boxes
            assigned_gt_boxes = assign_boxes(gt_boxes, [2, 3, 4, 5])
            assigned_layer_inds = assigned_gt_boxes[-1]

            ## build losses for FPN
            for i in range(5, 1, -1):
                p = 'P%d' % i
                stride = 2**i
                shape = tf.shape(pyramid[p])
                height, width = shape[1], shape[2]

                gt_boxes = assigned_gt_boxes[i - 2]

                ### rpn losses
                # 1. encode ground truth
                # 2. compute distances
                anchor_scales = [2**(i - 2), 2**(i - 1), 2**(i)]
                all_anchors = gen_all_anchors(height, width, stride,
                                              anchor_scales)
                labels, bbox_targets, bbox_inside_weights = \
                  anchor_encoder(gt_boxes, all_anchors, height, width, stride, scope='AnchorEncoder')
                boxes = outputs[p]['rpn']['box']
                classes = tf.reshape(outputs[p]['rpn']['cls'],
                                     (1, height, width, base_anchors, 2))

                labels, classes, boxes, bbox_targets, bbox_inside_weights = \
                        _filter_negative_samples(tf.reshape(labels, [-1]), [
                            tf.reshape(labels, [-1]),
                            tf.reshape(classes, [-1, 2]),
                            tf.reshape(boxes, [-1, 4]),
                            tf.reshape(bbox_targets, [-1, 4]),
                            tf.reshape(bbox_inside_weights, [-1, 4])
                            ])
                _, frac_ = _get_valid_sample_fraction(labels)
                rpn_batch.append(
                    tf.reduce_sum(
                        tf.cast(tf.greater_equal(labels, 0), tf.float32)))
                rpn_batch_pos.append(
                    tf.reduce_sum(
                        tf.cast(tf.greater_equal(labels, 1), tf.float32)))
                rpn_box_loss = bbox_inside_weights * _smooth_l1_dist(
                    boxes, bbox_targets)
                rpn_box_loss = tf.reshape(rpn_box_loss, [-1, 4])
                rpn_box_loss = tf.reduce_sum(rpn_box_loss, axis=1)
                rpn_box_loss = rpn_box_lw * tf.reduce_mean(rpn_box_loss)
                tf.add_to_collection(tf.GraphKeys.LOSSES, rpn_box_loss)
                rpn_box_losses.append(rpn_box_loss)

                # NOTE: examples with negative labels are ignored when
                # computing the one_hot_encoding and the entropy loss, BUT
                # they still count in the denominator of
                # softmax_cross_entropy's mean, so the loss shrinks by a
                # factor of (non_negative_labels / all_labels). The best
                # practice is still to gather all non-negative examples first.
                labels = slim.one_hot_encoding(
                    labels, 2, on_value=1.0,
                    off_value=0.0)  # this will set -1 label to all zeros
                rpn_cls_loss = rpn_cls_lw * tf.nn.softmax_cross_entropy_with_logits(
                    labels=labels, logits=classes)
                rpn_cls_loss = tf.reduce_mean(rpn_cls_loss)
                tf.add_to_collection(tf.GraphKeys.LOSSES, rpn_cls_loss)
                rpn_cls_losses.append(rpn_cls_loss)

                ### refined loss
                # 1. encode ground truth
                # 2. compute distances
                rois = outputs[p]['roi']['box']

                boxes = outputs[p]['refined']['box']
                classes = outputs[p]['refined']['cls']
                labels, bbox_targets, bbox_inside_weights = \
                  roi_encoder(gt_boxes, rois, num_classes, scope='ROIEncoder')

                labels, classes, boxes, bbox_targets, bbox_inside_weights = \
                        _filter_negative_samples(tf.reshape(labels, [-1]),[
                            tf.reshape(labels, [-1]),
                            tf.reshape(classes, [-1, num_classes]),
                            tf.reshape(boxes, [-1, num_classes * 4]),
                            tf.reshape(bbox_targets, [-1, num_classes * 4]),
                            tf.reshape(bbox_inside_weights, [-1, num_classes * 4])
                            ] )
                frac, frac_ = _get_valid_sample_fraction(labels)
                refine_batch.append(
                    tf.reduce_sum(
                        tf.cast(tf.greater_equal(labels, 0), tf.float32)))
                refine_batch_pos.append(
                    tf.reduce_sum(
                        tf.cast(tf.greater_equal(labels, 1), tf.float32)))

                refined_box_loss = bbox_inside_weights * _smooth_l1_dist(
                    boxes, bbox_targets)
                refined_box_loss = tf.reshape(refined_box_loss, [-1, 4])
                refined_box_loss = tf.reduce_sum(refined_box_loss, axis=1)
                refined_box_loss = refined_box_lw * tf.reduce_mean(
                    refined_box_loss) * frac_
                tf.add_to_collection(tf.GraphKeys.LOSSES, refined_box_loss)
                refined_box_losses.append(refined_box_loss)

                labels = slim.one_hot_encoding(labels,
                                               num_classes,
                                               on_value=1.0,
                                               off_value=0.0)
                refined_cls_loss = refined_cls_lw * tf.nn.softmax_cross_entropy_with_logits(
                    labels=labels, logits=classes)
                refined_cls_loss = tf.reduce_mean(refined_cls_loss) * frac_
                tf.add_to_collection(tf.GraphKeys.LOSSES, refined_cls_loss)
                refined_cls_losses.append(refined_cls_loss)

                ### mask loss
                # mask of shape (N, h, w, num_classes*2)
                masks = outputs[p]['mask']['mask']
                # mask_shape = tf.shape(masks)
                # masks = tf.reshape(masks, (mask_shape[0], mask_shape[1],
                #                            mask_shape[2], tf.cast(mask_shape[3]/2, tf.int32), 2))
                labels, mask_targets, mask_inside_weights = \
                  mask_encoder(gt_masks, gt_boxes, rois, num_classes, 28, 28, scope='MaskEncoder')
                labels, masks, mask_targets, mask_inside_weights = \
                        _filter_negative_samples(tf.reshape(labels, [-1]), [
                            tf.reshape(labels, [-1]),
                            masks,
                            mask_targets,
                            mask_inside_weights,
                            ])
                _, frac_ = _get_valid_sample_fraction(labels)
                mask_batch.append(
                    tf.reduce_sum(
                        tf.cast(tf.greater_equal(labels, 0), tf.float32)))
                mask_batch_pos.append(
                    tf.reduce_sum(
                        tf.cast(tf.greater_equal(labels, 1), tf.float32)))
                # mask_targets = slim.one_hot_encoding(mask_targets, 2, on_value=1.0, off_value=0.0)
                # mask_binary_loss = mask_lw * tf.losses.softmax_cross_entropy(mask_targets, masks)
                # NOTE: w/o competition between classes.
                mask_targets = tf.cast(mask_targets, tf.float32)
                mask_loss = mask_lw * tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=mask_targets, logits=masks)
                mask_loss = tf.reduce_mean(mask_loss)
                mask_loss = tf.cond(tf.greater(tf.size(labels), 0),
                                    lambda: mask_loss,
                                    lambda: tf.constant(0.0))
                tf.add_to_collection(tf.GraphKeys.LOSSES, mask_loss)
                mask_losses.append(mask_loss)

    rpn_box_losses = tf.add_n(rpn_box_losses)
    rpn_cls_losses = tf.add_n(rpn_cls_losses)
    refined_box_losses = tf.add_n(refined_box_losses)
    refined_cls_losses = tf.add_n(refined_cls_losses)
    mask_losses = tf.add_n(mask_losses)
    losses = [
        rpn_box_losses, rpn_cls_losses, refined_box_losses, refined_cls_losses,
        mask_losses
    ]
    total_loss = tf.add_n(losses)

    rpn_batch = tf.cast(tf.add_n(rpn_batch), tf.float32)
    refine_batch = tf.cast(tf.add_n(refine_batch), tf.float32)
    mask_batch = tf.cast(tf.add_n(mask_batch), tf.float32)
    rpn_batch_pos = tf.cast(tf.add_n(rpn_batch_pos), tf.float32)
    refine_batch_pos = tf.cast(tf.add_n(refine_batch_pos), tf.float32)
    mask_batch_pos = tf.cast(tf.add_n(mask_batch_pos), tf.float32)

    return total_loss, losses, [rpn_batch_pos, rpn_batch, \
                                refine_batch_pos, refine_batch, \
                                mask_batch_pos, mask_batch]
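_filter_negative_samples is also external. Its call sites pass a flat label vector plus a list of tensors whose rows line up with those labels, and label -1 marks ignored samples, so a minimal sketch under that assumption is:

    import tensorflow as tf

    def _filter_negative_samples(labels, tensors):
        # Keep only the rows whose label is >= 0 (-1 means "ignore").
        keep = tf.reshape(tf.where(tf.greater_equal(labels, 0)), [-1])
        return [tf.gather(t, keep) for t in tensors]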
Example #5
def build_losses(pyramid, outputs, gt_boxes, gt_masks,
                 num_classes, base_anchors,
                 rpn_box_lw=1.0, rpn_cls_lw=1.0,
                 refined_box_lw=1.0, refined_cls_lw=1.0,
                 mask_lw=1.0):
  """Building 3-way output losses, totally 5 losses
  Params:
  ------
  outputs: output of build_heads
  gt_boxes: A tensor of shape (G, 5), [x1, y1, x2, y2, class]
  gt_masks: A tensor of shape (G, ih, iw),  {0, 1}
  *_lw: loss weight of rpn, refined and mask losses
  
  Returns:
  -------
  l: a loss tensor
  """

  # losses for pyramid
  losses = []
  rpn_box_losses, rpn_cls_losses = [], []
  refined_box_losses, refined_cls_losses = [], []
  mask_losses = []
  
  # watch some info during training
  rpn_batch = []
  refine_batch = []
  mask_batch = []
  rpn_batch_pos = []
  refine_batch_pos = []
  mask_batch_pos = []

  arg_scope = _extra_conv_arg_scope(activation_fn=None)
  with slim.arg_scope(arg_scope):
      with tf.variable_scope('pyramid'):

        ## assigning gt_boxes
        [assigned_gt_boxes, assigned_layer_inds] = assign_boxes(gt_boxes, [gt_boxes], [2, 3, 4, 5])

        ## build losses for FPN

        for i in range(5, 1, -1):
            p = 'P%d' % i
            stride = 2 ** i
            shape = tf.shape(pyramid[p])
            height, width = shape[1], shape[2]

            splitted_gt_boxes = assigned_gt_boxes[i-2]
            
            ### rpn losses
            # 1. encode ground truth
            # 2. compute distances
            # anchor_scales = [2 **(i-2), 2 ** (i-1), 2 **(i)]
            # all_anchors = gen_all_anchors(height, width, stride, anchor_scales)
            all_anchors = outputs['rpn'][p]['anchor']
            labels, bbox_targets, bbox_inside_weights = \
              anchor_encoder(splitted_gt_boxes, all_anchors, height, width, stride, scope='AnchorEncoder')
            boxes = outputs['rpn'][p]['box']
            classes = tf.reshape(outputs['rpn'][p]['cls'], (1, height, width, base_anchors, 2))

            labels, classes, boxes, bbox_targets, bbox_inside_weights = \
                    _filter_negative_samples(tf.reshape(labels, [-1]), [
                        tf.reshape(labels, [-1]),
                        tf.reshape(classes, [-1, 2]),
                        tf.reshape(boxes, [-1, 4]),
                        tf.reshape(bbox_targets, [-1, 4]),
                        tf.reshape(bbox_inside_weights, [-1, 4])
                        ])
            # _, frac_ = _get_valid_sample_fraction(labels)
            rpn_batch.append(
                    tf.reduce_sum(tf.cast(
                        tf.greater_equal(labels, 0), tf.float32
                        )))
            rpn_batch_pos.append(
                    tf.reduce_sum(tf.cast(
                        tf.greater_equal(labels, 1), tf.float32
                        )))
            rpn_box_loss = bbox_inside_weights * _smooth_l1_dist(boxes, bbox_targets)
            rpn_box_loss = tf.reshape(rpn_box_loss, [-1, 4])
            rpn_box_loss = tf.reduce_sum(rpn_box_loss, axis=1)
            rpn_box_loss = rpn_box_lw * tf.reduce_mean(rpn_box_loss) 
            tf.add_to_collection(tf.GraphKeys.LOSSES, rpn_box_loss)
            rpn_box_losses.append(rpn_box_loss)

            # NOTE: examples with negative labels are ignored when computing the
            # one_hot_encoding and the entropy loss, BUT they still count in the
            # denominator of softmax_cross_entropy's mean, so the loss shrinks by
            # a factor of (non_negative_labels / all_labels).
            # The best practice is still to gather all non-negative examples first.
            labels = slim.one_hot_encoding(labels, 2, on_value=1.0, off_value=0.0) # this will set -1 label to all zeros
            rpn_cls_loss = rpn_cls_lw * tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=classes) 
            rpn_cls_loss = tf.reduce_mean(rpn_cls_loss) 
            tf.add_to_collection(tf.GraphKeys.LOSSES, rpn_cls_loss)
            rpn_cls_losses.append(rpn_cls_loss)
            

        ### refined loss
        # 1. encode ground truth
        # 2. compute distances
        rois = outputs['roi']['box']
        
        boxes = outputs['refined']['box']
        classes = outputs['refined']['cls']
        labels, bbox_targets, bbox_inside_weights = \
          roi_encoder(gt_boxes, rois, num_classes, scope='ROIEncoder')

        labels, classes, boxes, bbox_targets, bbox_inside_weights = \
                _filter_negative_samples(tf.reshape(labels, [-1]),[
                    tf.reshape(labels, [-1]),
                    tf.reshape(classes, [-1, num_classes]),
                    tf.reshape(boxes, [-1, num_classes * 4]),
                    tf.reshape(bbox_targets, [-1, num_classes * 4]),
                    tf.reshape(bbox_inside_weights, [-1, num_classes * 4])
                    ] )
        # frac, frac_ = _get_valid_sample_fraction(labels, 1)
        refine_batch.append(
                tf.reduce_sum(tf.cast(
                    tf.greater_equal(labels, 0), tf.float32
                    )))
        refine_batch_pos.append(
                tf.reduce_sum(tf.cast(
                    tf.greater_equal(labels, 1), tf.float32
                    )))

        refined_box_loss = bbox_inside_weights * _smooth_l1_dist(boxes, bbox_targets)
        refined_box_loss = tf.reshape(refined_box_loss, [-1, 4])
        refined_box_loss = tf.reduce_sum(refined_box_loss, axis=1)
        refined_box_loss = refined_box_lw * tf.reduce_mean(refined_box_loss) # * frac_
        tf.add_to_collection(tf.GraphKeys.LOSSES, refined_box_loss)
        refined_box_losses.append(refined_box_loss)

        labels = slim.one_hot_encoding(labels, num_classes, on_value=1.0, off_value=0.0)
        refined_cls_loss = refined_cls_lw * tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=classes) 
        refined_cls_loss = tf.reduce_mean(refined_cls_loss) # * frac_
        tf.add_to_collection(tf.GraphKeys.LOSSES, refined_cls_loss)
        refined_cls_losses.append(refined_cls_loss)

        ### mask loss
        # mask of shape (N, h, w, num_classes)
        masks = outputs['mask']['mask']
        # mask_shape = tf.shape(masks)
        # masks = tf.reshape(masks, (mask_shape[0], mask_shape[1],
        #                            mask_shape[2], tf.cast(mask_shape[3]/2, tf.int32), 2))
        labels, mask_targets, mask_inside_weights = \
          mask_encoder(gt_masks, gt_boxes, rois, num_classes, 28, 28, scope='MaskEncoder')
        labels, masks, mask_targets, mask_inside_weights = \
                _filter_negative_samples(tf.reshape(labels, [-1]), [
                    tf.reshape(labels, [-1]),
                    masks,
                    mask_targets, 
                    mask_inside_weights, 
                    ])
        # _, frac_ = _get_valid_sample_fraction(labels)
        mask_batch.append(
                tf.reduce_sum(tf.cast(
                    tf.greater_equal(labels, 0), tf.float32
                    )))
        mask_batch_pos.append(
                tf.reduce_sum(tf.cast(
                    tf.greater_equal(labels, 1), tf.float32
                    )))
        # mask_targets = slim.one_hot_encoding(mask_targets, 2, on_value=1.0, off_value=0.0)
        # mask_binary_loss = mask_lw * tf.losses.softmax_cross_entropy(mask_targets, masks)
        # NOTE: w/o competition between classes. 
        mask_targets = tf.cast(mask_targets, tf.float32)
        mask_loss = mask_lw * tf.nn.sigmoid_cross_entropy_with_logits(labels=mask_targets, logits=masks) 
        mask_loss = tf.reduce_mean(mask_loss) 
        mask_loss = tf.cond(tf.greater(tf.size(labels), 0), lambda: mask_loss, lambda: tf.constant(0.0))
        tf.add_to_collection(tf.GraphKeys.LOSSES, mask_loss)
        mask_losses.append(mask_loss)

  rpn_box_losses = tf.add_n(rpn_box_losses)
  rpn_cls_losses = tf.add_n(rpn_cls_losses)
  refined_box_losses = tf.add_n(refined_box_losses)
  refined_cls_losses = tf.add_n(refined_cls_losses)
  mask_losses = tf.add_n(mask_losses)
  losses = [rpn_box_losses, rpn_cls_losses, refined_box_losses, refined_cls_losses, mask_losses]
  total_loss = tf.add_n(losses)

  rpn_batch = tf.cast(tf.add_n(rpn_batch), tf.float32)
  refine_batch = tf.cast(tf.add_n(refine_batch), tf.float32)
  mask_batch = tf.cast(tf.add_n(mask_batch), tf.float32)
  rpn_batch_pos = tf.cast(tf.add_n(rpn_batch_pos), tf.float32)
  refine_batch_pos = tf.cast(tf.add_n(refine_batch_pos), tf.float32)
  mask_batch_pos = tf.cast(tf.add_n(mask_batch_pos), tf.float32)
    
  return total_loss, losses, [rpn_batch_pos, rpn_batch, \
                              refine_batch_pos, refine_batch, \
                              mask_batch_pos, mask_batch]
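Because each per-level loss is also registered via tf.add_to_collection(tf.GraphKeys.LOSSES, ...), a caller can recover the losses without threading the return value through. A typical TF1 pattern:

    import tensorflow as tf

    # Sum the registered model losses plus any weight-decay terms.
    model_losses = tf.get_collection(tf.GraphKeys.LOSSES)
    reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
    total_loss = tf.add_n(model_losses + reg_losses, name='total_loss')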
Example #6
def build_losses(pyramid,
                 py_scope,
                 slim_scope,
                 image_height,
                 image_width,
                 outputs,
                 gt_boxes,
                 gt_masks,
                 num_classes,
                 base_anchors,
                 rpn_box_lw=0.1,
                 rpn_cls_lw=0.1,
                 rcnn_box_lw=1.0,
                 rcnn_cls_lw=0.1,
                 mask_lw=1.0):
    """Building 3-way output losses, totally 5 losses
  Params:
  ------
  outputs: output of build_heads
  gt_boxes: A tensor of shape (G, 5), [x1, y1, x2, y2, class]
  gt_masks: A tensor of shape (G, ih, iw),  {0, 1}
  *_lw: loss weight of rpn, rcnn and mask losses
  
  Returns:
  -------
  l: a loss tensor
  """

    # losses for pyramid
    losses = []
    rpn_box_losses, rpn_cls_losses = [], []
    rcnn_box_losses, rcnn_cls_losses = [], []
    mask_losses = []

    # watch some info during training
    rpn_batch = []
    rcnn_batch = []
    mask_batch = []
    rpn_batch_pos = []
    rcnn_batch_pos = []
    mask_batch_pos = []

    # if _BN is True:
    #     arg_scope = _extra_conv_arg_scope_with_bn()
    #   # arg_scope = _extra_conv_arg_scope_with_bn(is_training=True)
    # else:
    #   arg_scope = _extra_conv_arg_scope(activation_fn=tf.nn.relu)
    with tf.name_scope(py_scope) as py_scope:
        with slim.arg_scope(slim_scope) as slim_scope:
            ## assigning gt_boxes
            [assigned_gt_boxes,
             assigned_layer_inds] = assign_boxes(gt_boxes, [gt_boxes],
                                                 [2, 3, 4, 5])

            ## build losses for FPN
            for i in range(5, 1, -1):
                p = 'P%d' % i
                stride = 2**i
                shape = tf.shape(pyramid[p])
                height, width = shape[1], shape[2]

                splitted_gt_boxes = assigned_gt_boxes[i - 2]

                ### rpn losses
                # 1. encode ground truth
                # 2. compute distances
                all_anchors = outputs['rpn'][p]['anchor']
                rpn_boxes = outputs['rpn'][p]['box']
                rpn_clses = tf.reshape(outputs['rpn'][p]['cls'],
                                       (1, height, width, base_anchors, 2))

                rpn_clses_target, rpn_boxes_target, rpn_boxes_inside_weight = \
                        anchor_encoder(splitted_gt_boxes, all_anchors, height, width, stride, image_height, image_width, scope='AnchorEncoder')

                rpn_clses_target, rpn_clses, rpn_boxes, rpn_boxes_target, rpn_boxes_inside_weight = \
                        _filter_negative_samples(tf.reshape(rpn_clses_target, [-1]), [
                            tf.reshape(rpn_clses_target, [-1]),
                            tf.reshape(rpn_clses, [-1, 2]),
                            tf.reshape(rpn_boxes, [-1, 4]),
                            tf.reshape(rpn_boxes_target, [-1, 4]),
                            tf.reshape(rpn_boxes_inside_weight, [-1, 4])
                            ])

                rpn_batch.append(
                    tf.reduce_sum(
                        tf.cast(tf.greater_equal(rpn_clses_target, 0),
                                tf.float32)))
                rpn_batch_pos.append(
                    tf.reduce_sum(
                        tf.cast(tf.greater_equal(rpn_clses_target, 1),
                                tf.float32)))

                rpn_box_loss = rpn_boxes_inside_weight * _smooth_l1_dist(
                    rpn_boxes, rpn_boxes_target)
                rpn_box_loss = tf.reshape(rpn_box_loss, [-1, 4])
                rpn_box_loss = tf.reduce_sum(rpn_box_loss, axis=1)
                rpn_box_loss = rpn_box_lw * tf.reduce_mean(rpn_box_loss)
                tf.add_to_collection(tf.GraphKeys.LOSSES, rpn_box_loss)
                rpn_box_losses.append(rpn_box_loss)

                ### NOTE: examples with negative labels are ignored when
                # computing the one_hot_encoding and the entropy loss, BUT
                # they still count in the denominator of
                # softmax_cross_entropy's mean, so the loss shrinks by a
                # factor of (non_negative_labels / all_labels). The best
                # practice is still to gather all non-negative examples first.
                rpn_clses_target = slim.one_hot_encoding(
                    rpn_clses_target, 2, on_value=1.0,
                    off_value=0.0)  # this will set -1 label to all zeros
                rpn_cls_loss = rpn_cls_lw * tf.nn.softmax_cross_entropy_with_logits(
                    labels=rpn_clses_target, logits=rpn_clses)
                rpn_cls_loss = tf.reduce_mean(rpn_cls_loss)
                tf.add_to_collection(tf.GraphKeys.LOSSES, rpn_cls_loss)
                rpn_cls_losses.append(rpn_cls_loss)

            ### rcnn losses
            # 1. encode ground truth
            # 2. compute distances
            rcnn_ordered_rois = outputs['rcnn_ordered_rois']
            rcnn_boxes = outputs['rcnn_boxes']
            rcnn_clses = outputs['rcnn_clses']
            rcnn_scores = outputs['rcnn_scores']

            rcnn_clses_target, rcnn_boxes_target, rcnn_boxes_inside_weight = \
                    roi_encoder(gt_boxes, rcnn_ordered_rois, num_classes, scope='ROIEncoder')

            rcnn_clses_target, rcnn_ordered_rois, rcnn_clses, rcnn_scores, rcnn_boxes, rcnn_boxes_target, rcnn_boxes_inside_weight = \
                    _filter_negative_samples(tf.reshape(rcnn_clses_target, [-1]),[
                        tf.reshape(rcnn_clses_target, [-1]),
                        tf.reshape(rcnn_ordered_rois, [-1, 4]),
                        tf.reshape(rcnn_clses, [-1, num_classes]),
                        tf.reshape(rcnn_scores, [-1, num_classes]),
                        tf.reshape(rcnn_boxes, [-1, num_classes * 4]),
                        tf.reshape(rcnn_boxes_target, [-1, num_classes * 4]),
                        tf.reshape(rcnn_boxes_inside_weight, [-1, num_classes * 4])
                        ] )

            rcnn_batch.append(
                tf.reduce_sum(
                    tf.cast(tf.greater_equal(rcnn_clses_target, 0),
                            tf.float32)))
            rcnn_batch_pos.append(
                tf.reduce_sum(
                    tf.cast(tf.greater_equal(rcnn_clses_target, 1),
                            tf.float32)))

            rcnn_box_loss = rcnn_boxes_inside_weight * _smooth_l1_dist(
                rcnn_boxes, rcnn_boxes_target)
            rcnn_box_loss = tf.reshape(rcnn_box_loss, [-1, 4])
            rcnn_box_loss = tf.reduce_sum(rcnn_box_loss, axis=1)
            rcnn_box_loss = rcnn_box_lw * tf.reduce_mean(
                rcnn_box_loss)  # * frac_
            tf.add_to_collection(tf.GraphKeys.LOSSES, rcnn_box_loss)
            rcnn_box_losses.append(rcnn_box_loss)

            rcnn_clses_target = slim.one_hot_encoding(rcnn_clses_target,
                                                      num_classes,
                                                      on_value=1.0,
                                                      off_value=0.0)
            rcnn_cls_loss = rcnn_cls_lw * tf.nn.softmax_cross_entropy_with_logits(
                labels=rcnn_clses_target, logits=rcnn_clses)
            rcnn_cls_loss = tf.reduce_mean(rcnn_cls_loss)  # * frac_
            tf.add_to_collection(tf.GraphKeys.LOSSES, rcnn_cls_loss)
            rcnn_cls_losses.append(rcnn_cls_loss)

            outputs['training_rcnn_rois'] = rcnn_ordered_rois
            outputs['training_rcnn_clses_target'] = rcnn_clses_target
            outputs['training_rcnn_clses'] = rcnn_clses
            outputs['training_rcnn_scores'] = rcnn_scores

            ### mask loss
            # mask of shape (N, h, w, num_classes)
            mask_ordered_rois = outputs['mask_ordered_rois']
            masks = outputs['mask_mask']

            mask_clses_target, mask_targets, mask_inside_weights, mask_rois = \
                    mask_encoder(gt_masks, gt_boxes, mask_ordered_rois, num_classes, 28, 28,scope='MaskEncoder')

            mask_clses_target, mask_targets, mask_inside_weights, mask_rois, masks = \
                    _filter_negative_samples(tf.reshape(mask_clses_target, [-1]), [
                        tf.reshape(mask_clses_target, [-1]),
                        tf.reshape(mask_targets, [-1, 28, 28, num_classes]),
                        tf.reshape(mask_inside_weights, [-1, 28, 28, num_classes]),
                        tf.reshape(mask_rois, [-1, 4]),
                        tf.reshape(masks, [-1, 28, 28, num_classes]),
                        ])

            mask_batch.append(
                tf.reduce_sum(
                    tf.cast(tf.greater_equal(mask_clses_target, 0),
                            tf.float32)))
            mask_batch_pos.append(
                tf.reduce_sum(
                    tf.cast(tf.greater_equal(mask_clses_target, 1),
                            tf.float32)))
            ### NOTE: w/o competition between classes.
            mask_loss = mask_inside_weights * tf.nn.sigmoid_cross_entropy_with_logits(
                labels=mask_targets, logits=masks)
            mask_loss = mask_lw * mask_loss
            mask_loss = tf.reduce_mean(mask_loss)
            mask_loss = tf.cond(tf.greater(tf.size(mask_clses_target), 0),
                                lambda: mask_loss, lambda: tf.constant(0.0))
            tf.add_to_collection(tf.GraphKeys.LOSSES, mask_loss)
            mask_losses.append(mask_loss)

            outputs['training_mask_rois'] = mask_rois
            outputs['training_mask_clses_target'] = mask_clses_target
            outputs['training_mask_final_mask'] = tf.nn.sigmoid(masks)
            outputs['training_mask_final_mask_target'] = mask_targets

            rpn_box_losses = tf.add_n(rpn_box_losses)
            rpn_cls_losses = tf.add_n(rpn_cls_losses)
            rcnn_box_losses = tf.add_n(rcnn_box_losses)
            rcnn_cls_losses = tf.add_n(rcnn_cls_losses)
            mask_losses = tf.add_n(mask_losses)
            losses = [
                rpn_box_losses, rpn_cls_losses, rcnn_box_losses,
                rcnn_cls_losses, mask_losses
            ]
            total_loss = tf.add_n(losses)

            rpn_batch = tf.cast(tf.add_n(rpn_batch), tf.float32)
            rcnn_batch = tf.cast(tf.add_n(rcnn_batch), tf.float32)
            mask_batch = tf.cast(tf.add_n(mask_batch), tf.float32)
            rpn_batch_pos = tf.cast(tf.add_n(rpn_batch_pos), tf.float32)
            rcnn_batch_pos = tf.cast(tf.add_n(rcnn_batch_pos), tf.float32)
            mask_batch_pos = tf.cast(tf.add_n(mask_batch_pos), tf.float32)

            return total_loss, losses, [rpn_batch_pos, rpn_batch, \
                                        rcnn_batch_pos, rcnn_batch, \
                                        mask_batch_pos, mask_batch]
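A hedged sketch of how this last variant could be wired into a training step; the scopes, tensor names, and optimizer settings are assumptions, with pyramid and outputs coming from the corresponding network builders:

    # Assumed wiring, not from the original repo.
    total_loss, losses, batch_info = build_losses(
        pyramid, py_scope='pyramid', slim_scope=slim_scope,
        image_height=ih, image_width=iw,
        outputs=outputs, gt_boxes=gt_boxes, gt_masks=gt_masks,
        num_classes=81, base_anchors=9)
    train_op = tf.train.MomentumOptimizer(
        learning_rate=0.001, momentum=0.9).minimize(total_loss)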