def _get_ground_truth(self):
    """
    Encode the ground-truth keypoints into OpenPose-style targets:
    confidence maps (one channel per keypoint) and part-affinity fields
    (PAFs, one pair of channels per limb in POINTS_PAIRS), plus optional
    loss masks that restrict the loss to the annotated boxes.
    """
    map_shape = btf.combined_static_and_dynamic_shape(self.pred_maps[0][0])
    output = tfop.open_pose_encode(keypoints=self.gt_keypoints,
                                   output_size=map_shape[1:3],
                                   glength=self.gt_length,
                                   keypoints_pair=self.cfg.POINTS_PAIRS,
                                   l_delta=self.cfg.OPENPOSE_L_DELTA,
                                   gaussian_delta=self.cfg.OPENPOSE_GAUSSIAN_DELTA)
    gt_conf_maps = output[0]
    gt_paf_maps = output[1]
    wsummary.feature_map_summary(gt_conf_maps, "gt_conf_maps", max_outputs=5)
    wsummary.feature_map_summary(gt_paf_maps, "gt_paf_maps", max_outputs=5)
    if self.cfg.USE_LOSS_MASK:
        B, H, W, _ = btf.combined_static_and_dynamic_shape(gt_paf_maps)
        image = tf.zeros([B, H, W, 1])
        # Pixels inside any ground-truth box get 1.0; everything else stays 0.
        mask = odtl.batch_fill_bboxes(image, self.gt_boxes,
                                      v=1.0, length=self.gt_length,
                                      H=H, W=W,
                                      relative_coord=True)
        conf_mask = mask
        paf_mask = mask
        tf.summary.image("bboxes_mask", mask, max_outputs=5)
    else:
        conf_mask = None
        paf_mask = None

    return gt_paf_maps, gt_conf_maps, paf_mask, conf_mask
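# --- Illustration only, not part of the model code. A minimal sketch of how
# the targets above are typically consumed: OpenPose-style training uses a
# per-pixel L2 loss on both the confidence maps and the PAFs, multiplied by
# the box mask so that pixels outside any annotated person contribute
# nothing. All names below (pred_maps, gt_maps, mask) are hypothetical.
import tensorflow as tf

def openpose_map_loss(pred_maps, gt_maps, mask=None):
    # pred_maps/gt_maps: [B,H,W,C]; mask: [B,H,W,1] of {0,1} or None.
    diff = tf.squared_difference(pred_maps, gt_maps)
    if mask is not None:
        diff = diff * mask  # zero out pixels outside the labeled boxes
    return tf.reduce_mean(diff)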
def _get_ground_truth(self): """ Returns: gt_objectness_logits: list of N tensors. Tensor i is a vector whose length is the total number of anchors in image i (i.e., len(anchors[i])). Label values are in {-1, 0, 1}, with meanings: -1 = ignore; 0 = negative class; 1 = positive class. gt_anchor_deltas: list of N tensors. Tensor i has shape (len(anchors[i]), 4). """ img_size = tf.shape(self.batched_inputs[IMAGE])[1:3] res_list = [] for i, logits, regression, center_ness in zip(count(), self.pred_logits, self.pred_regression, self.pred_center_ness): res = self.box2box_transform.get_deltas( gboxes=self.gt_boxes, glabels=self.gt_labels, glength=self.gt_length, min_size=self.size_threshold[i], max_size=self.size_threshold[i + 1], fm_shape=tf.shape(logits)[1:3], img_size=img_size) if global_cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG: for k, v in res.items(): if len(v.get_shape()) == 3: v = tf.expand_dims(v, axis=-1) wsummary.feature_map_summary(v, k) res_list.append(res) return res_list
def _get_ground_truth(self, net):
    map_shape = btf.combined_static_and_dynamic_shape(net)
    output = tfop.hr_net_encode(keypoints=self.gt_keypoints,
                                output_size=map_shape[1:3],
                                glength=self.gt_length,
                                gaussian_delta=self.cfg.OPENPOSE_GAUSSIAN_DELTA)
    gt_conf_maps = output[0]
    gt_indexs = output[1]
    wsummary.feature_map_summary(gt_conf_maps, "gt_conf_maps", max_outputs=5)
    if self.cfg.USE_LOSS_MASK:
        B, H, W, _ = btf.combined_static_and_dynamic_shape(gt_conf_maps)
        image = tf.zeros([B, H, W, 1])
        mask = odtl.batch_fill_bboxes(image, self.gt_boxes,
                                      v=1.0, length=self.gt_length,
                                      H=H, W=W,
                                      relative_coord=True)
        conf_mask = mask
        tf.summary.image("loss_mask", mask, max_outputs=5)
    else:
        conf_mask = None

    return gt_conf_maps, gt_indexs, conf_mask
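# --- Illustration only. tfop.hr_net_encode is a custom op; a plain NumPy
# sketch of the Gaussian heatmap it is assumed to produce (one channel per
# keypoint type, a peak of 1.0 at each annotated location, spread controlled
# by gaussian_delta):
import numpy as np

def gaussian_heatmap(points, h, w, delta=2.0):
    # points: iterable of (x, y) pixel coordinates for one keypoint channel.
    ys, xs = np.mgrid[0:h, 0:w]
    hm = np.zeros((h, w), dtype=np.float32)
    for x, y in points:
        g = np.exp(-((xs - x) ** 2 + (ys - y) ** 2) / (2.0 * delta ** 2))
        hm = np.maximum(hm, g)  # overlapping peaks keep the stronger response
    return hm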
def semantic_loss(self):
    pred_semantic = self.head_outputs[SEMANTIC]
    shape = wmlt.combined_static_and_dynamic_shape(pred_semantic)
    target_mask = smt.batch_sparse_mask_to_dense(mask=self.batched_inputs[GT_MASKS],
                                                 labels=self.batched_inputs[GT_LABELS],
                                                 lens=self.batched_inputs[GT_LENGTH],
                                                 num_classes=self.num_classes)
    target_mask = tf.cast(target_mask, tf.float32)
    target_mask = tf.transpose(target_mask, [0, 2, 3, 1])  # NCHW -> NHWC
    target_mask = tf.image.resize_bilinear(target_mask, shape[1:3])
    if global_cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
        wsummary.feature_map_summary(target_mask, name="target_mask")
    return wnn.sigmoid_cross_entropy_with_logits_FL(labels=target_mask,
                                                    logits=self.head_outputs[SEMANTIC])
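# --- Illustration only. smt.batch_sparse_mask_to_dense is project code; its
# assumed semantics, sketched in NumPy for a single image: merge N binary
# instance masks into one [num_classes, H, W] map keyed by each instance's
# class label, which then serves as the dense semantic target above.
import numpy as np

def sparse_mask_to_dense(masks, labels, num_classes):
    # masks: [N,H,W] binary instance masks; labels: [N] class ids in [0, C).
    n, h, w = masks.shape
    dense = np.zeros((num_classes, h, w), dtype=masks.dtype)
    for i in range(n):
        dense[labels[i]] = np.maximum(dense[labels[i]], masks[i])
    return dense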
def inference(self, inputs, pred_maps):
    """
    Arguments:
        inputs: same as forward's batched_inputs
        pred_maps: output of the hrnet head,
            ([B,H,W,NUM_KEYPOINTS*2], [B,H,W,NUM_KEYPOINTS])
    Returns:
        results:
            RD_BOXES: [B,N,4]
            RD_PROBABILITY: [B,N]
            RD_KEYPOINT: [B,N,NUM_KEYPOINTS,2]
            RD_LENGTH: [B]
    """
    with tf.name_scope("aggregate_results"):
        pred0, det1 = pred_maps
        # pred0 carries detection heatmaps and associative-embedding tags,
        # NUM_KEYPOINTS channels each.
        det0, tags = tf.split(pred0, num_or_size_splits=2, axis=-1)
        target_size = wmlt.combined_static_and_dynamic_shape(det1)[1:3]
        tags = tf.image.resize_bilinear(tags, target_size)
        det0 = tf.image.resize_bilinear(det0, target_size)
        H, W = target_size
        wsummary.feature_map_summary(tags, "tags", max_outputs=5)
        tags = tf.expand_dims(tags, axis=-1)  # shape [B,H,W,NUM_KEYPOINTS,1]
        det = (det0 + det1) / 2  # shape [B,H,W,NUM_KEYPOINTS]
        wsummary.feature_map_summary(det0, "det0", max_outputs=5)
        wsummary.feature_map_summary(det1, "det1", max_outputs=5)
        tag_k, loc_k, val_k = self.top_k(det, tags)
        ans = self.match(tag_k, loc_k, val_k)
        ans = self.adjust(ans, det=det)
        ans = tfop.hr_net_refine(ans, det=det, tag=tags)
        scores = ans[..., 2]
        scores = tf.reduce_mean(scores, axis=-1, keepdims=False)
        x, y = tf.unstack(ans[..., :2], axis=-1)
        mask = tf.greater(scores, self.cfg.DET_SCORE_THRESHOLD_TEST)
        size = wmlt.combined_static_and_dynamic_shape(x)[1]
        x, output_lens = wmlt.batch_boolean_mask(x, mask, size=size,
                                                 return_length=True)
        y = wmlt.batch_boolean_mask(y, mask, size=size)
        scores = wmlt.batch_boolean_mask(scores, mask, size=size)
        keypoints = tf.stack([x, y], axis=-1)
        output_keypoints = kp.keypoints_absolute2relative(keypoints,
                                                          width=W, height=H)
        bboxes = kp.batch_get_bboxes(output_keypoints, output_lens)

    outdata = {RD_BOXES: bboxes,
               RD_LENGTH: output_lens,
               RD_KEYPOINT: output_keypoints,
               RD_PROBABILITY: scores,
               RD_LABELS: tf.ones_like(scores, dtype=tf.int32)}
    if global_cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
        wsummary.keypoints_image_summary(images=inputs[IMAGE],
                                         keypoints=output_keypoints,
                                         lengths=outdata[RD_LENGTH],
                                         keypoints_pair=self.cfg.POINTS_PAIRS,
                                         name="keypoints_results")
    return outdata
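# --- Illustration only. self.match groups keypoint detections by their
# associative-embedding tags: two detections are assumed to belong to the
# same person when their tag values are close. A toy 1-D NumPy sketch of
# that grouping rule (the threshold is hypothetical):
import numpy as np

def group_by_tags(tags, tag_threshold=1.0):
    # tags: iterable of scalar tag values, one per detected keypoint.
    groups = []  # each group collects the tag values of one person
    for t in tags:
        for g in groups:
            if abs(np.mean(g) - t) < tag_threshold:
                g.append(t)
                break
        else:
            groups.append([t])  # no close group found: start a new person
    return groups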
def losses(self):
    """
    Args:
        For `gt_classes` and `gt_anchors_deltas` parameters, see
            :meth:`RetinaNet.get_ground_truth`.
            Their shapes are (N, R) and (N, R, 4), respectively, where R is
            the total number of anchors across levels, i.e. sum(Hi x Wi x A).
        For `pred_class_logits` and `pred_anchor_deltas`, see
            :meth:`RetinaNetHead.forward`.
    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a scalar tensor storing the loss.
            Used during training only. The dict keys are "loss_cls",
            "loss_box_reg", "semantic_loss" and "mask_loss".
    """
    assert len(self.pred_logits[0].get_shape()) == 4, "error logits dim"
    assert len(self.pred_anchor_deltas[0].get_shape()) == 4, "error anchors dim"
    gt_classes, gt_anchors_deltas, to_gt_indices = self._get_ground_truth()
    pred_class_logits, pred_anchor_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat(
        self.pred_logits, self.pred_anchor_deltas, self.num_classes)
    # Shapes: (N, R, K) and (N, R, 4), respectively.
    pred_coeff = general_to_N_HWA_K_and_concat(self.head_outputs[COEFFICIENT],
                                               K=self.coefficient_nr)
    valid_idxs = gt_classes >= 0
    foreground_idxs = (gt_classes > 0)
    num_foreground = tf.reduce_sum(tf.cast(foreground_idxs, tf.int32))

    gt_classes_target = tf.boolean_mask(gt_classes, valid_idxs)
    gt_classes_target = tf.one_hot(gt_classes_target, depth=self.num_classes + 1)
    # RetinaNet has no background class; background is index 0, so drop it
    # after one-hot encoding.
    gt_classes_target = gt_classes_target[:, 1:]
    pred_class_logits = tf.boolean_mask(pred_class_logits, valid_idxs)

    # logits loss
    loss_cls = tf.reduce_sum(wnn.sigmoid_cross_entropy_with_logits_FL(
        labels=gt_classes_target,
        logits=pred_class_logits,
        alpha=self.focal_loss_alpha,
        gamma=self.focal_loss_gamma,
    )) / tf.cast(tf.maximum(1, num_foreground), tf.float32)

    # regression loss
    pred_anchor_deltas = tf.boolean_mask(pred_anchor_deltas, foreground_idxs)
    gt_anchors_deltas = tf.boolean_mask(gt_anchors_deltas, foreground_idxs)
    loss_box_reg = tf.losses.huber_loss(
        pred_anchor_deltas,
        gt_anchors_deltas,
        loss_collection=None,
        reduction=tf.losses.Reduction.SUM,
    ) / tf.cast(tf.maximum(1, num_foreground), tf.float32)

    # mask loss
    with tf.device("/cpu:0"):
        target_mask = wmlt.batch_boolean_maskv3(self.batched_inputs[GT_MASKS],
                                                to_gt_indices, foreground_idxs)
        target_bboxes = wmlt.batch_boolean_maskv3(self.batched_inputs[GT_BOXES],
                                                  to_gt_indices, foreground_idxs)
    target_mask = tf.expand_dims(target_mask, axis=-1)
    target_mask = wmlt.tf_crop_and_resize(target_mask, target_bboxes, size=[31, 31])
    pred_mask = self.get_pred_mask(pred_coeff, self.head_outputs["protos"],
                                   foreground_idxs)
    pred_mask = tf.expand_dims(pred_mask, axis=-1)
    pred_mask = wmlt.tf_crop_and_resize(pred_mask, target_bboxes, size=[31, 31])
    if global_cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
        wsummary.feature_map_summary(self.head_outputs["protos"], name="protos")
        wsummary.row_image_summaries([target_mask, tf.nn.sigmoid(pred_mask)],
                                     name="gt_vs_pred")
    target_mask = tf.squeeze(target_mask, axis=-1)
    pred_mask = tf.squeeze(pred_mask, axis=-1)
    mask_loss = tf.reduce_mean(
        wnn.sigmoid_cross_entropy_with_logits_FL(labels=target_mask,
                                                 logits=pred_mask))

    # aux sem loss
    sem_loss = tf.reduce_sum(self.semantic_loss()) / tf.cast(
        tf.maximum(1, num_foreground), tf.float32)

    return {"loss_cls": loss_cls,
            "loss_box_reg": loss_box_reg,
            "semantic_loss": sem_loss,
            "mask_loss": mask_loss}
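# --- Illustration only. self.get_pred_mask is assumed to assemble instance
# masks YOLACT-style: a linear combination of the shared prototype maps,
# weighted by each instance's predicted coefficients. A minimal TensorFlow
# sketch for a single image (names hypothetical):
import tensorflow as tf

def assemble_masks(protos, coeffs):
    # protos: [H,W,P] prototype maps; coeffs: [N,P] per-instance weights.
    # Returns [N,H,W] mask logits, one map per instance.
    return tf.einsum('hwp,np->nhw', protos, coeffs)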
def forward(self, batched_inputs):
    """
    Args:
        batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
            Each item in the list contains the inputs for one image.
            For now, each item in the list is a dict that contains:

            * image: Tensor, image in (H, W, C) format.
            * instances: Instances

            Other information that's included in the original dicts, such as:

            * "height", "width" (int): the output resolution of the model,
              used in inference. See :meth:`postprocess` for details.
    Returns:
        dict[str: Tensor]:
            mapping from a named loss to a tensor storing the loss.
            Used during training only.
    """
    batched_inputs = self.preprocess_image(batched_inputs)
    features = self.backbone(batched_inputs)
    if len(self.in_features) == 0:
        print(f"Error: no input features specified for deeplab; "
              f"using all backbone features {list(features.keys())}.")
        features = list(features.values())
    else:
        features = [features[f] for f in self.in_features]
    pred_logits = self.head(features)
    gt_labels = batched_inputs.get(GT_SEMANTIC_LABELS, None)
    outputs = build_outputs(name=self.cfg.MODEL.DEEPLAB.OUTPUTS,
                            cfg=self.cfg.MODEL.DEEPLAB,
                            parent=self,
                            pred_logits=pred_logits,
                            labels=gt_labels)
    outputs.batched_inputs = batched_inputs
    max_outputs = 3
    if gt_labels is not None:  # labels may be absent at inference time
        wsummary.batch_semantic_summary(batched_inputs[IMAGE],
                                        masks=gt_labels[..., 1:],
                                        max_outputs=max_outputs,
                                        name="gt")
    if self.is_training:
        if self.cfg.GLOBAL.SUMMARY_LEVEL <= SummaryLevel.DEBUG:
            results = outputs.inference(inputs=batched_inputs,
                                        logits=pred_logits)
            wsummary.batch_semantic_summary(batched_inputs[IMAGE],
                                            masks=results[RD_SEMANTIC][..., 1:],
                                            max_outputs=max_outputs,
                                            name="pred")
            wsummary.feature_map_summary(gt_labels, name="gt_semantic",
                                         max_outputs=10)
            wsummary.feature_map_summary(results[RD_SEMANTIC],
                                         name="pred_semantic",
                                         max_outputs=10)
        else:
            results = {}
        return results, outputs.losses()
    else:
        results = outputs.inference(inputs=batched_inputs, logits=pred_logits)
        wsummary.batch_semantic_summary(batched_inputs[IMAGE],
                                        masks=results[RD_SEMANTIC][..., 1:],
                                        max_outputs=max_outputs,
                                        name="pred")
        if gt_labels is not None:
            wsummary.feature_map_summary(gt_labels, name="gt_semantic",
                                         max_outputs=10)
        wsummary.feature_map_summary(results[RD_SEMANTIC],
                                     name="pred_semantic",
                                     max_outputs=10)
        return results, {}
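# --- Illustration only. A hedged sketch of how forward() is assumed to be
# wired into a training loop: during training it returns (results, losses)
# where the named losses feed the optimizer; at inference the loss dict is
# empty and results carries RD_SEMANTIC. `model` is a placeholder for an
# instance of the class above.
import tensorflow as tf

def example_step(model, batched_inputs):
    results, losses = model.forward(batched_inputs)
    if losses:  # training: combine the named losses into one scalar
        return results, tf.add_n(list(losses.values()))
    return results, None  # inference: results[RD_SEMANTIC] holds predictions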