Example #1
    def _get_ground_truth(self):
        # Encode the ground-truth keypoints into OpenPose confidence maps and
        # part-affinity fields (PAFs) at the resolution of the prediction maps.
        map_shape = btf.combined_static_and_dynamic_shape(self.pred_maps[0][0])
        output = tfop.open_pose_encode(
            keypoints=self.gt_keypoints,
            output_size=map_shape[1:3],
            glength=self.gt_length,
            keypoints_pair=self.cfg.POINTS_PAIRS,
            l_delta=self.cfg.OPENPOSE_L_DELTA,
            gaussian_delta=self.cfg.OPENPOSE_GAUSSIAN_DELTA)
        gt_conf_maps = output[0]
        gt_paf_maps = output[1]
        wsummary.feature_map_summary(gt_conf_maps,
                                     "gt_conf_maps",
                                     max_outputs=5)
        wsummary.feature_map_summary(gt_paf_maps, "gt_paf_maps", max_outputs=5)
        if self.cfg.USE_LOSS_MASK:
            # Restrict the loss to pixels that fall inside ground-truth boxes.
            B, H, W, _ = btf.combined_static_and_dynamic_shape(gt_paf_maps)
            image = tf.zeros([B, H, W, 1])
            mask = odtl.batch_fill_bboxes(image,
                                          self.gt_boxes,
                                          v=1.0,
                                          length=self.gt_length,
                                          H=H,
                                          W=W,
                                          relative_coord=True)
            conf_mask = mask
            paf_mask = mask
            tf.summary.image("bboxes_mask", mask, max_outputs=5)
        else:
            conf_mask = None
            paf_mask = None
        return gt_paf_maps, gt_conf_maps, paf_mask, conf_mask
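The encoding itself happens inside the custom op tfop.open_pose_encode and the loss mask inside odtl.batch_fill_bboxes, neither of which is shown here. A minimal, self-contained NumPy sketch of the box-fill idea, assuming relative [ymin, xmin, ymax, xmax] boxes (the helper name and signature below are hypothetical):

    import numpy as np

    def fill_bboxes_mask(boxes, lengths, H, W):
        # Hypothetical stand-in for odtl.batch_fill_bboxes: pixels inside any
        # ground-truth box get 1.0, everything else stays 0.0.
        B = boxes.shape[0]
        mask = np.zeros((B, H, W, 1), dtype=np.float32)
        for b in range(B):
            for n in range(lengths[b]):
                ymin, xmin, ymax, xmax = boxes[b, n]
                y0, y1 = int(ymin * H), int(np.ceil(ymax * H))
                x0, x1 = int(xmin * W), int(np.ceil(xmax * W))
                mask[b, y0:y1, x0:x1, 0] = 1.0
        return mask

    # One image, one box covering the lower-right quadrant of a 4x4 map.
    boxes = np.array([[[0.5, 0.5, 1.0, 1.0]]], dtype=np.float32)
    print(fill_bboxes_mask(boxes, lengths=[1], H=4, W=4)[0, ..., 0])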
Example #2
    def _get_ground_truth(self):
        """
        Returns:
            gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the
                total number of anchors in image i (i.e., len(anchors[i])). Label values are
                in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class.
            gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), 4).
        """

        img_size = tf.shape(self.batched_inputs[IMAGE])[1:3]

        res_list = []

        for i, logits, regression, center_ness in zip(count(),
                                                      self.pred_logits,
                                                      self.pred_regression,
                                                      self.pred_center_ness):
            res = self.box2box_transform.get_deltas(
                gboxes=self.gt_boxes,
                glabels=self.gt_labels,
                glength=self.gt_length,
                min_size=self.size_threshold[i],
                max_size=self.size_threshold[i + 1],
                fm_shape=tf.shape(logits)[1:3],
                img_size=img_size)
            if global_cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
                for k, v in res.items():
                    if len(v.get_shape()) == 3:
                        v = tf.expand_dims(v, axis=-1)
                    wsummary.feature_map_summary(v, k)
            res_list.append(res)

        return res_list
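This variant assigns ground-truth boxes to feature levels by size, FCOS-style: level i only receives targets whose size falls between size_threshold[i] and size_threshold[i + 1]. A small sketch of that assignment rule, with an assumed threshold list (the real values come from the config and are not part of this snippet):

    # Hypothetical FCOS-style thresholds: level i is responsible for boxes whose
    # longer side (in pixels) falls in [size_threshold[i], size_threshold[i + 1]).
    size_threshold = [0, 64, 128, 256, 512, float("inf")]

    def level_for_box(box_hw):
        # Return the pyramid level index a box of (height, width) pixels maps to.
        longer_side = max(box_hw)
        for i in range(len(size_threshold) - 1):
            if size_threshold[i] <= longer_side < size_threshold[i + 1]:
                return i
        return len(size_threshold) - 2

    print(level_for_box((30, 40)))    # -> 0
    print(level_for_box((100, 200)))  # -> 2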
Example #3
    def _get_ground_truth(self, net):
        # Encode the ground-truth keypoints into HRNet-style Gaussian confidence
        # maps at the spatial resolution of the given feature map.
        map_shape = btf.combined_static_and_dynamic_shape(net)
        output = tfop.hr_net_encode(
            keypoints=self.gt_keypoints,
            output_size=map_shape[1:3],
            glength=self.gt_length,
            gaussian_delta=self.cfg.OPENPOSE_GAUSSIAN_DELTA)
        gt_conf_maps = output[0]
        gt_indexs = output[1]
        wsummary.feature_map_summary(gt_conf_maps,
                                     "gt_conf_maps",
                                     max_outputs=5)
        if self.cfg.USE_LOSS_MASK:
            # Restrict the loss to pixels that fall inside ground-truth boxes.
            B, H, W, _ = btf.combined_static_and_dynamic_shape(gt_conf_maps)
            image = tf.zeros([B, H, W, 1])
            mask = odtl.batch_fill_bboxes(image,
                                          self.gt_boxes,
                                          v=1.0,
                                          length=self.gt_length,
                                          H=H,
                                          W=W,
                                          relative_coord=True)
            conf_mask = mask
            tf.summary.image("loss_mask", mask, max_outputs=5)
        else:
            conf_mask = None
        return gt_conf_maps, gt_indexs, conf_mask
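Here tfop.hr_net_encode turns each ground-truth keypoint into a Gaussian confidence blob at feature-map resolution, controlled by OPENPOSE_GAUSSIAN_DELTA. A self-contained NumPy sketch of one such per-keypoint heatmap (an illustration of the idea, not the op's exact output):

    import numpy as np

    def gaussian_heatmap(H, W, cx, cy, delta):
        # Unnormalized 2D Gaussian centered at (cx, cy): peak value 1.0 at the
        # keypoint location, falling off with the given delta.
        ys, xs = np.mgrid[0:H, 0:W]
        return np.exp(-((xs - cx) ** 2 + (ys - cy) ** 2) / (2.0 * delta ** 2))

    hm = gaussian_heatmap(H=64, W=48, cx=24, cy=32, delta=2.0)
    print(hm.shape, hm.max())  # -> (64, 48) 1.0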
Example #4
    def semantic_loss(self):
        pred_semantic = self.head_outputs[SEMANTIC]
        shape = wmlt.combined_static_and_dynamic_shape(pred_semantic)
        target_mask = smt.batch_sparse_mask_to_dense(
            mask=self.batched_inputs[GT_MASKS],
            labels=self.batched_inputs[GT_LABELS],
            lens=self.batched_inputs[GT_LENGTH],
            num_classes=self.num_classes)
        target_mask = tf.cast(target_mask, tf.float32)
        target_mask = tf.transpose(target_mask, [0, 2, 3, 1])
        target_mask = tf.image.resize_bilinear(target_mask, shape[1:3])
        if global_cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
            wsummary.feature_map_summary(target_mask, name="target_mask")

        return wnn.sigmoid_cross_entropy_with_logits_FL(
            labels=target_mask, logits=self.head_outputs[SEMANTIC])
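smt.batch_sparse_mask_to_dense merges the per-instance ground-truth masks into one channel per class before the resize and the sigmoid focal loss. A rough single-image NumPy stand-in for that merge, assuming 1-based labels as in the other examples (the function below is hypothetical):

    import numpy as np

    def sparse_mask_to_dense(instance_masks, labels, num_classes):
        # Merge per-instance binary masks [N, H, W] into per-class masks
        # [num_classes, H, W] by taking the element-wise maximum per class.
        dense = np.zeros((num_classes,) + instance_masks.shape[1:], dtype=np.float32)
        for mask, label in zip(instance_masks, labels):
            dense[label - 1] = np.maximum(dense[label - 1], mask)
        return dense

    masks = np.array([[[1, 0], [0, 0]], [[0, 0], [0, 1]]], dtype=np.float32)
    print(sparse_mask_to_dense(masks, labels=[1, 1], num_classes=2)[0])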
Example #5
    def inference(self, inputs, pred_maps):
        """
        Arguments:
            inputs: same as forward's batched_inputs
            pred_maps: output of hrnet head [B,H,W,NUM_KEYPOINTS*2],[B,H,W,NUM_KEYPOINTS]
        Returns:
            results:
            RD_BOXES: [B,N,4]
            RD_PROBABILITY:[ B,N]
            RD_KEYPOINTS:[B,N,NUM_KEYPOINTS,2]
            RD_LENGTH:[B]
        """
        with tf.name_scope("aggregate_results"):
            pred0, det1 = pred_maps
            det0, tags = tf.split(pred0, num_or_size_splits=2, axis=-1)
            target_size = wmlt.combined_static_and_dynamic_shape(det1)[1:3]
            tags = tf.image.resize_bilinear(tags, target_size)
            det0 = tf.image.resize_bilinear(det0, target_size)
            H, W = target_size
            wsummary.feature_map_summary(tags, "tags", max_outputs=5)

            tags = tf.expand_dims(tags,
                                  axis=-1)  #shape [B,H,W,NUM_KEYPOINTS,1]
            det = (det0 + det1) / 2  #shape [B,H,W,NUM_KEYPOINTS]
            wsummary.feature_map_summary(det0, "det0", max_outputs=5)
            wsummary.feature_map_summary(det1, "det1", max_outputs=5)

        tag_k, loc_k, val_k = self.top_k(det, tags)
        ans = self.match(tag_k, loc_k, val_k)
        ans = self.adjust(ans, det=det)
        ans = tfop.hr_net_refine(ans, det=det, tag=tags)

        scores = ans[..., 2]
        scores = tf.reduce_mean(scores, axis=-1, keepdims=False)
        x, y = tf.unstack(ans[..., :2], axis=-1)
        mask = tf.greater(scores, self.cfg.DET_SCORE_THRESHOLD_TEST)
        size = wmlt.combined_static_and_dynamic_shape(x)[1]
        x, output_lens = wmlt.batch_boolean_mask(x,
                                                 mask,
                                                 size=size,
                                                 return_length=True)
        y = wmlt.batch_boolean_mask(y, mask, size=size)
        scores = wmlt.batch_boolean_mask(scores, mask, size=size)
        keypoints = tf.stack([x, y], axis=-1)

        output_keypoints = kp.keypoints_absolute2relative(keypoints,
                                                          width=W,
                                                          height=H)
        bboxes = kp.batch_get_bboxes(output_keypoints, output_lens)

        outdata = {
            RD_BOXES: bboxes,
            RD_LENGTH: output_lens,
            RD_KEYPOINT: output_keypoints,
            RD_PROBABILITY: scores,
            RD_LABELS: tf.ones_like(scores, dtype=tf.int32)
        }

        if global_cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
            wsummary.keypoints_image_summary(
                images=inputs[IMAGE],
                keypoints=output_keypoints,
                lengths=outdata[RD_LENGTH],
                keypoints_pair=self.cfg.POINTS_PAIRS,
                name="keypoints_results")
        return outdata
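After grouping, kp.batch_get_bboxes derives a detection box from each person's keypoints. A simplified single-image NumPy version of that step (keypoints stored as (x, y) in relative coordinates, matching the tf.stack([x, y], axis=-1) above; the helper name is illustrative):

    import numpy as np

    def bboxes_from_keypoints(keypoints):
        # keypoints: [N, NUM_KEYPOINTS, 2] -> tight [ymin, xmin, ymax, xmax] per person.
        xs, ys = keypoints[..., 0], keypoints[..., 1]
        return np.stack([ys.min(-1), xs.min(-1), ys.max(-1), xs.max(-1)], axis=-1)

    kps = np.array([[[0.2, 0.3], [0.4, 0.6], [0.25, 0.5]]])
    print(bboxes_from_keypoints(kps))  # -> [[0.3 0.2 0.6 0.4]]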
Example #6
    def losses(self):
        """
        Args:
            For `gt_classes` and `gt_anchors_deltas` parameters, see
                :meth:`RetinaNet.get_ground_truth`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi x A)
            For `pred_class_logits` and `pred_anchor_deltas`, see
                :meth:`RetinaNetHead.forward`.

        Returns:
            dict[str: Tensor]:
                mapping from a named loss to a scalar tensor
                storing the loss. Used during training only. The dict keys are:
                "loss_cls" and "loss_box_reg"
        """
        assert len(self.pred_logits[0].get_shape()) == 4, "error logits dim"
        assert len(
            self.pred_anchor_deltas[0].get_shape()) == 4, "error anchors dim"
        gt_classes, gt_anchors_deltas, to_gt_indices = self._get_ground_truth()
        pred_class_logits, pred_anchor_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
            self.pred_logits, self.pred_anchor_deltas,
            self.num_classes)  # Shapes: (N, R, K) and (N, R, 4), respectively.
        pred_coeff = general_to_N_HWA_K_and_concat(
            self.head_outputs[COEFFICIENT], K=self.coefficient_nr)

        valid_idxs = gt_classes >= 0
        foreground_idxs = (gt_classes > 0)
        num_foreground = tf.reduce_sum(tf.cast(foreground_idxs, tf.int32))

        gt_classes_target = tf.boolean_mask(gt_classes, valid_idxs)
        gt_classes_target = tf.one_hot(gt_classes_target,
                                       depth=self.num_classes + 1)
        # RetinaNet has no background class: background has index 0, so the
        # background channel is dropped after the one-hot encoding.
        gt_classes_target = gt_classes_target[:, 1:]
        pred_class_logits = tf.boolean_mask(pred_class_logits, valid_idxs)

        # logits loss
        loss_cls = tf.reduce_sum(
            wnn.sigmoid_cross_entropy_with_logits_FL(
                labels=gt_classes_target,
                logits=pred_class_logits,
                alpha=self.focal_loss_alpha,
                gamma=self.focal_loss_gamma,
            )) / tf.cast(tf.maximum(1, num_foreground), tf.float32)

        # regression loss
        pred_anchor_deltas = tf.boolean_mask(pred_anchor_deltas,
                                             foreground_idxs)
        gt_anchors_deltas = tf.boolean_mask(gt_anchors_deltas, foreground_idxs)
        loss_box_reg = tf.losses.huber_loss(
            pred_anchor_deltas,
            gt_anchors_deltas,
            loss_collection=None,
            reduction=tf.losses.Reduction.SUM,
        ) / tf.cast(tf.maximum(1, num_foreground), tf.float32)

        # mask loss
        with tf.device(":/cpu:0"):
            target_mask = wmlt.batch_boolean_maskv3(
                self.batched_inputs[GT_MASKS], to_gt_indices, foreground_idxs)
            target_bboxes = wmlt.batch_boolean_maskv3(
                self.batched_inputs[GT_BOXES], to_gt_indices, foreground_idxs)
            target_mask = tf.expand_dims(target_mask, axis=-1)
            target_mask = wmlt.tf_crop_and_resize(target_mask,
                                                  target_bboxes,
                                                  size=[31, 31])

        pred_mask = self.get_pred_mask(pred_coeff, self.head_outputs["protos"],
                                       foreground_idxs)
        pred_mask = tf.expand_dims(pred_mask, axis=-1)
        pred_mask = wmlt.tf_crop_and_resize(pred_mask,
                                            target_bboxes,
                                            size=[31, 31])

        if global_cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
            wsummary.feature_map_summary(self.head_outputs["protos"],
                                         name="protos")
            wsummary.row_image_summaries(
                [target_mask, tf.nn.sigmoid(pred_mask)], name="gt_vs_pred")

        target_mask = tf.squeeze(target_mask, axis=-1)
        pred_mask = tf.squeeze(pred_mask, axis=-1)
        mask_loss = tf.reduce_mean(
            wnn.sigmoid_cross_entropy_with_logits_FL(labels=target_mask,
                                                     logits=pred_mask))

        # auxiliary semantic segmentation loss
        sem_loss = tf.reduce_sum(self.semantic_loss()) / tf.cast(
            tf.maximum(1, num_foreground), tf.float32)

        return {
            "loss_cls": loss_cls,
            "loss_box_reg": loss_box_reg,
            "semantic_loss": sem_loss,
            "mask_loss": mask_loss
        }
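Both the classification and the mask terms above go through wnn.sigmoid_cross_entropy_with_logits_FL with alpha/gamma arguments, i.e. a sigmoid focal loss. For reference, a NumPy sketch of the standard focal-loss formula (Lin et al., 2017); the library function may differ in details such as clipping or reduction:

    import numpy as np

    def sigmoid_focal_loss(labels, logits, alpha=0.25, gamma=2.0):
        # Per-element focal loss: down-weights easy examples by (1 - p_t)**gamma
        # and balances positives/negatives with alpha.
        p = 1.0 / (1.0 + np.exp(-logits))
        pt = np.where(labels > 0, p, 1.0 - p)              # probability of the true class
        alpha_t = np.where(labels > 0, alpha, 1.0 - alpha)
        return -alpha_t * (1.0 - pt) ** gamma * np.log(np.clip(pt, 1e-8, 1.0))

    labels = np.array([1.0, 0.0, 1.0])
    logits = np.array([2.0, -3.0, -1.0])
    print(sigmoid_focal_loss(labels, logits))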
Example #7
    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:

                * image: Tensor, image in (H, W, C) format.
                * instances: Instances

                Other information that's included in the original dicts, such as:

                * "height", "width" (int): the output resolution of the model, used in inference.
                  See :meth:`postprocess` for details.
        Returns:
            results: inference outputs (empty during training unless the summary
                level is DEBUG).
            dict[str, Tensor]:
                mapping from a named loss to a tensor storing the loss; empty at
                inference time.
        """
        batched_inputs = self.preprocess_image(batched_inputs)

        features = self.backbone(batched_inputs)
        if len(self.in_features) == 0:
            print(
                f"Error: no input features configured for deeplab, using all features {features.keys()}"
            )
            features = list(features.values())
        else:
            features = [features[f] for f in self.in_features]
        pred_logits = self.head(features)
        gt_labels = batched_inputs.get(GT_SEMANTIC_LABELS, None)

        outputs = build_outputs(
            name=self.cfg.MODEL.DEEPLAB.OUTPUTS,
            cfg=self.cfg.MODEL.DEEPLAB,
            parent=self,
            pred_logits=pred_logits,
            labels=gt_labels,
        )
        outputs.batched_inputs = batched_inputs
        max_outputs = 3
        wsummary.batch_semantic_summary(batched_inputs[IMAGE],
                                        masks=gt_labels[..., 1:],
                                        max_outputs=max_outputs,
                                        name="gt")

        if self.is_training:
            if self.cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
                results = outputs.inference(inputs=batched_inputs,
                                            logits=pred_logits)
                wsummary.batch_semantic_summary(batched_inputs[IMAGE],
                                                masks=results[RD_SEMANTIC][...,
                                                                           1:],
                                                max_outputs=max_outputs,
                                                name="pred")
                wsummary.feature_map_summary(gt_labels,
                                             name="gt_semantic",
                                             max_outputs=10)
                wsummary.feature_map_summary(results[RD_SEMANTIC],
                                             name="pred_semantic",
                                             max_outputs=10)
            else:
                results = {}

            return results, outputs.losses()
        else:
            results = outputs.inference(inputs=batched_inputs,
                                        logits=pred_logits)
            wsummary.batch_semantic_summary(batched_inputs[IMAGE],
                                            masks=results[RD_SEMANTIC][...,
                                                                       1:],
                                            max_outputs=max_outputs,
                                            name="pred")
            wsummary.feature_map_summary(gt_labels,
                                         name="gt_semantic",
                                         max_outputs=10)
            wsummary.feature_map_summary(results[RD_SEMANTIC],
                                         name="pred_semantic",
                                         max_outputs=10)
            return results, {}
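The summaries above treat results[RD_SEMANTIC] as a per-class map whose channel 0 is background (hence the [..., 1:] slice). A minimal NumPy sketch of turning per-pixel logits into such a one-hot semantic map (the actual conversion is done by outputs.inference, which is not shown here):

    import numpy as np

    def logits_to_semantic(logits):
        # logits: [B, H, W, C] -> one-hot semantic map of the same shape,
        # taking the argmax over the class dimension.
        labels = np.argmax(logits, axis=-1)                  # [B, H, W]
        return np.eye(logits.shape[-1], dtype=np.float32)[labels]

    logits = np.random.randn(1, 4, 4, 3).astype(np.float32)
    print(logits_to_semantic(logits).shape)  # -> (1, 4, 4, 3)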