# Imports assumed by the functions below (module paths are best-effort guesses
# for these snippets' source repos). box_cxcywh_to_xyxy,
# covariance_output_to_cholesky and Tracklet are repo-local helpers assumed
# to be in scope.
from typing import List, Tuple

import cv2
import numpy as np
import torch
import torch.nn.functional as F

from detectron2.structures import Boxes, Instances, PolygonMasks
# FastRCNNOutputs exists in older detectron2 releases:
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputs


def detr_probabilistic_inference(self, input_im):
    outputs = self.model(input_im,
                         return_raw_results=True,
                         is_mc_dropout=self.mc_dropout_enabled)

    image_width = input_im[0]['image'].shape[2]
    image_height = input_im[0]['image'].shape[1]

    # Handle logits and classes
    predicted_logits = outputs['pred_logits'][0]
    if 'pred_logits_var' in outputs.keys():
        # Sample logits from the predicted Gaussian and average the softmax
        # over the samples to marginalize the class probabilities.
        predicted_logits_var = outputs['pred_logits_var'][0]
        box_cls_dists = torch.distributions.normal.Normal(
            predicted_logits,
            scale=torch.sqrt(torch.exp(predicted_logits_var)))
        predicted_logits = box_cls_dists.rsample(
            (self.model.cls_var_num_samples,))
        predicted_prob_vectors = F.softmax(predicted_logits, dim=-1)
        predicted_prob_vectors = predicted_prob_vectors.mean(0)
    else:
        predicted_prob_vectors = F.softmax(predicted_logits, dim=-1)

    # Drop the last (no-object) class before taking the maximum.
    predicted_prob, classes_idxs = predicted_prob_vectors[:, :-1].max(-1)

    # Handle boxes and covariance matrices
    predicted_boxes = outputs['pred_boxes'][0]

    # Rescale boxes to inference image size (not COCO original size)
    pred_boxes = Boxes(box_cxcywh_to_xyxy(predicted_boxes))
    pred_boxes.scale(scale_x=image_width, scale_y=image_height)
    predicted_boxes = pred_boxes.tensor

    if 'pred_boxes_cov' in outputs.keys():
        # Recover the full covariance from its predicted Cholesky factor.
        predicted_boxes_covariance = covariance_output_to_cholesky(
            outputs['pred_boxes_cov'][0])
        predicted_boxes_covariance = torch.matmul(
            predicted_boxes_covariance,
            predicted_boxes_covariance.transpose(1, 2))

        # Propagate the covariance from (cx, cy, w, h) to (x1, y1, x2, y2):
        # for the linear map J below, Sigma_xyxy = J Sigma_cxcywh J^T.
        transform_mat = torch.tensor([[[1.0, 0.0, -0.5, 0.0],
                                       [0.0, 1.0, 0.0, -0.5],
                                       [1.0, 0.0, 0.5, 0.0],
                                       [0.0, 1.0, 0.0, 0.5]]]).to(self.model.device)
        predicted_boxes_covariance = torch.matmul(
            torch.matmul(transform_mat, predicted_boxes_covariance),
            transform_mat.transpose(1, 2))

        # Rescale the covariance to the inference image size.
        scale_mat = torch.diag_embed(
            torch.as_tensor(
                (image_width, image_height, image_width, image_height),
                dtype=torch.float32)).to(self.model.device).unsqueeze(0)
        predicted_boxes_covariance = torch.matmul(
            torch.matmul(scale_mat, predicted_boxes_covariance),
            torch.transpose(scale_mat, 2, 1))
    else:
        predicted_boxes_covariance = []

    return (predicted_boxes, predicted_boxes_covariance, predicted_prob,
            classes_idxs, predicted_prob_vectors)
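
# A minimal standalone sketch (made-up covariance values) of the propagation
# step above: for a linear map y = J x, Cov(y) = J Cov(x) J^T, which is
# exactly what sandwiching the covariance between transform_mat and its
# transpose computes.
def _covariance_propagation_sketch():
    J = torch.tensor([[1.0, 0.0, -0.5, 0.0],   # x1 = cx - w/2
                      [0.0, 1.0, 0.0, -0.5],   # y1 = cy - h/2
                      [1.0, 0.0, 0.5, 0.0],    # x2 = cx + w/2
                      [0.0, 1.0, 0.0, 0.5]])   # y2 = cy + h/2
    sigma_cxcywh = torch.diag(torch.tensor([1.0, 1.0, 4.0, 4.0]))
    sigma_xyxy = J @ sigma_cxcywh @ J.T

    # Monte Carlo check: convert samples and compare empirical covariance.
    samples = torch.distributions.MultivariateNormal(
        torch.zeros(4), sigma_cxcywh).sample((100000,))
    empirical = torch.cov((samples @ J.T).T)
    assert torch.allclose(empirical, sigma_xyxy, atol=0.1)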
def predict_boxes(self, images, boxes):
    assert images.shape[0] == 1
    img = self.to_numpy(images)[:, :, ::-1]  # RGB -> BGR
    original_size = img.shape[:2]
    img, proposals = self._transform_image_and_boxes(img, boxes)

    device = self.model.device
    img = img.to(device)
    proposals = [proposals.to(device)]
    inputs = [{
        'image': img,
        'proposals': proposals,
        'height': original_size[0],
        'width': original_size[1]
    }]

    model = self.model
    roi_heads = self.model.roi_heads

    # Run the backbone and RoI heads manually so the box head regresses and
    # classifies the provided boxes instead of RPN proposals.
    images = model.preprocess_image(inputs)
    features = model.backbone(images.tensor)
    features = [features[f] for f in roi_heads.in_features]
    box_features = roi_heads.box_pooler(features,
                                        [x.proposal_boxes for x in proposals])
    box_features = roi_heads.box_head(box_features)
    pred_class_logits, pred_proposal_deltas = roi_heads.box_predictor(
        box_features)
    outputs = FastRCNNOutputs(
        roi_heads.box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        roi_heads.smooth_l1_beta,
    )
    pred_boxes = outputs.predict_boxes()[0]

    c = self.person_class
    if self.softmax_only_person:
        # Renormalize over {person, background} only.
        scores = pred_class_logits[:, [c, -1]].detach()
        scores = F.softmax(scores, -1)[:, 0]
    else:
        scores = F.softmax(pred_class_logits, -1)
        scores = scores[:, c].detach()
    boxes = pred_boxes[:, c * 4:(c + 1) * 4].detach()

    # Scale boxes from the transformed input back to the original image size.
    scale_y = original_size[0] / img.shape[1]
    scale_x = original_size[1] / img.shape[2]
    boxes = Boxes(boxes)
    boxes.scale(scale_x, scale_y)
    boxes = boxes.tensor
    return boxes, scores
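
# A small sketch (toy tensor) of the class-specific box layout assumed above:
# Fast R-CNN regression emits num_classes * 4 values per proposal, so
# pred_boxes[:, c*4:(c+1)*4] picks out the 4 coordinates for class c.
def _per_class_box_slice_sketch():
    num_proposals, num_classes = 3, 80
    pred_boxes = torch.arange(
        num_proposals * num_classes * 4,
        dtype=torch.float32).view(num_proposals, num_classes * 4)
    c = 0  # e.g. a "person" class index, assumed here
    person_boxes = pred_boxes[:, c * 4:(c + 1) * 4]
    assert person_boxes.shape == (3, 4)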
def regress_and_classify(self, image: np.ndarray,
                         tracklets: List[Tracklet]) -> Tuple[np.ndarray, np.ndarray]:
    # Convert tracklet boxes to proposals
    height, width = image.shape[:2]
    image = self.transform_gen.get_transform(image).apply_image(image)
    image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))

    # Size of the resized model input; proposals are scaled into this
    # resolution and the predictions are scaled back out of it
    feat_height, feat_width = image.shape[1:3]
    scale_x = feat_width / width
    scale_y = feat_height / height

    proposal_boxes = Boxes(torch.tensor(
        [tracklet.last_detection.box for tracklet in tracklets]))
    # Scale proposals to the resized input resolution
    proposal_boxes.scale(scale_x, scale_y)
    proposals = Instances((feat_height, feat_width),
                          proposal_boxes=proposal_boxes)
    inputs = {"image": image, "height": height, "width": width,
              "proposals": proposals}

    images = self.model.preprocess_image([inputs])
    features = self.model.backbone(images.tensor)
    proposals = [inputs["proposals"].to(self.model.device)]

    # Extract features, perform RoI pooling and run regression/classification
    # for each RoI
    features_list = [features[f] for f in self.model.roi_heads.in_features]
    box_features = self.model.roi_heads.box_pooler(
        features_list, [x.proposal_boxes for x in proposals])
    box_features = self.model.roi_heads.box_head(box_features)
    pred_class_logits, pred_proposal_deltas = \
        self.model.roi_heads.box_predictor(box_features)
    del box_features

    raw_outputs = FastRCNNOutputs(
        self.model.roi_heads.box_predictor.box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        self.model.roi_heads.box_predictor.smooth_l1_beta,
    )

    # Convert raw outputs to predicted boxes and scores
    boxes = raw_outputs.predict_boxes()[0]
    scores = raw_outputs.predict_probs()[0]
    num_bbox_reg_classes = boxes.shape[1] // 4
    boxes = Boxes(boxes.reshape(-1, 4))
    # Clip to the resized input, then scale back to the original image
    boxes.clip((feat_height, feat_width))
    boxes.scale(1 / scale_x, 1 / scale_y)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)

    # Keep only the boxes and scores of class 0
    boxes = boxes[:, 0, :]
    scores = scores[:, 0]
    pred_boxes = boxes.detach().cpu().numpy()
    scores = scores.detach().cpu().numpy()
    return pred_boxes, scores
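
# A minimal sketch (hypothetical sizes) of the round-trip scaling above:
# proposals are scaled into the resized input space and regressed boxes are
# scaled back with the reciprocal factors, so an untouched box returns to its
# original image coordinates.
def _roundtrip_scaling_sketch():
    orig_h, orig_w = 480, 640
    feat_h, feat_w = 800, 1067  # assumed resized model input
    scale_x, scale_y = feat_w / orig_w, feat_h / orig_h

    boxes = Boxes(torch.tensor([[100., 80., 220., 360.]]))
    boxes.scale(scale_x, scale_y)          # into model-input space
    boxes.scale(1 / scale_x, 1 / scale_y)  # back to image space
    assert torch.allclose(boxes.tensor,
                          torch.tensor([[100., 80., 220., 360.]]))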
def add_pseudo_label(self, targets, image_path, flip):
    new_targets = []
    if self.pseudo_gt is None:
        return targets
    if len(targets) > 0 and \
            targets[0].gt_boxes.tensor.device != self.pseudo_gt.device:
        self.pseudo_gt = self.pseudo_gt.to(targets[0].gt_boxes.tensor.device)

    for i, (targets_per_image, path) in enumerate(zip(targets, image_path)):
        H, W = targets_per_image._image_size
        gt_boxes = targets_per_image.gt_boxes
        gt_classes = targets_per_image.gt_classes

        # Look up pseudo labels by image id, parsed from the file name.
        p = int(path.split('/')[-1].split('.')[0])
        data = self.pseudo_gt[self.pseudo_gt[:, 0] == p]
        ld = len(data)
        if ld == 0:
            new_targets.append(targets_per_image)
            continue

        label = data[:, 1].long()
        boxes = data[:, 2:].clone()
        if flip[i] == 1:
            # Horizontal flip of normalized coordinates: mirror x1 and x2,
            # then swap them so that x1 <= x2 still holds.
            boxes[:, 0] = 1 - boxes[:, 0]
            boxes[:, 2] = 1 - boxes[:, 2]
            boxes = torch.index_select(
                boxes, -1,
                torch.as_tensor([2, 1, 0, 3], device=boxes.device))
        boxes = Boxes(boxes)
        boxes.scale(scale_x=W, scale_y=H)

        new_gt_boxes = gt_boxes.cat([gt_boxes, boxes])
        # Pseudo boxes carry empty polygon masks.
        new_gt_masks = PolygonMasks([[]])
        if hasattr(targets_per_image, 'gt_masks'):
            gt_masks = targets_per_image.gt_masks
            new_gt_masks = new_gt_masks.cat([gt_masks] + [new_gt_masks] * ld)
        else:
            new_gt_masks = new_gt_masks.cat([new_gt_masks] * ld)
        new_gt_classes = torch.cat((gt_classes, label))

        new_target = Instances((H, W))
        new_target.gt_classes = new_gt_classes
        new_target.gt_masks = new_gt_masks
        new_target.gt_boxes = new_gt_boxes
        new_targets.append(new_target)
    return new_targets
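
# A quick sketch (made-up normalized box) of the flip handling above: with
# x in [0, 1], a horizontal flip maps x1 -> 1 - x2 and x2 -> 1 - x1, which is
# what mirroring both x-columns and then swapping them achieves.
def _pseudo_label_flip_sketch():
    boxes = torch.tensor([[0.10, 0.20, 0.40, 0.80]])  # x1, y1, x2, y2
    boxes[:, 0] = 1 - boxes[:, 0]
    boxes[:, 2] = 1 - boxes[:, 2]
    boxes = torch.index_select(boxes, -1, torch.as_tensor([2, 1, 0, 3]))
    assert torch.allclose(boxes, torch.tensor([[0.60, 0.20, 0.90, 0.80]]))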
def seg_det_postprocess_bk(segmap, contour, emb, img_size, output_height,
                           output_width):
    """
    Translate segmentation predictions into detection results.
    The input images are often resized when entering the semantic segmentor.
    Moreover, in some cases they are also padded inside the segmentor to be
    divisible by the maximum network stride. As a result, we often need the
    predictions of the segmentor at a different resolution from its inputs.

    Args:
        segmap (Tensor): semantic segmentation prediction logits. A tensor of
            shape (C, H, W), where C is the number of classes, and H, W are
            the height and width of the prediction.
        contour (Tensor): contour prediction logits. A tensor of shape
            (C, H, W).
        emb (Tensor): embedding prediction. A tensor of shape (C, H, W).
        img_size (tuple): image size that the segmentor takes as input.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: detection results at the output resolution, with
        pred_boxes, scores and pred_classes fields.
    """
    segmap = segmap[:, :img_size[0], :img_size[1]].cpu().numpy()
    contour = contour[:, :img_size[0], :img_size[1]].cpu().numpy()
    emb = emb[:, :img_size[0], :img_size[1]].cpu().numpy()
    ncls = segmap.shape[0] - 1  # remove the background
    assert contour.shape[0] == ncls
    assert emb.shape[0] == ncls
    H = segmap.shape[1]
    W = segmap.shape[2]

    pred_boxes = []
    pred_scores = []
    pred_classes = []
    pred_masks = []

    # Step 1: segment the foreground (according to segmap) into superpixels
    # (separated by contours)
    for c in range(ncls):
        cont_c = contour[c]
        seg_c = segmap[c]
        emb_c = emb[c]
        # TODO: we may need to turn the contour map and segmap into binary
        # images. For now we combine the contour map and the segmentation
        # before connecting superpixels, for simplicity.
        bw = (1 - cont_c) * seg_c > 0.2
        retval, labels, stats, centroids = cv2.connectedComponentsWithStats(
            bw.astype(np.uint8))
        # Note: the background is labeled 0 and should be ignored.
        nseg = retval

        avg_embed = np.zeros(nseg)
        avg_scores = np.zeros(nseg)
        bboxes = np.zeros((nseg, 4))
        for s in range(nseg):
            seg_size = stats[s, cv2.CC_STAT_AREA]
            if seg_size < H * W * 0.0001:
                continue
            # calculate the average embedding of each superpixel
            superpixel = (labels == s).astype(float)
            npixel = np.sum(superpixel)
            avg_scores[s] = np.sum(seg_c * superpixel) / npixel
            avg_embed[s] = np.sum(emb_c * superpixel) / npixel
            # get the bounding boxes of superpixels in X1Y1X2Y2 format
            bboxes[s, 0] = stats[s, cv2.CC_STAT_LEFT]
            bboxes[s, 1] = stats[s, cv2.CC_STAT_TOP]
            bboxes[s, 2] = stats[s, cv2.CC_STAT_WIDTH] + bboxes[s, 0]
            bboxes[s, 3] = stats[s, cv2.CC_STAT_HEIGHT] + bboxes[s, 1]

        # remove small segments and low-confidence segments
        idx = [s for s in range(nseg) if avg_scores[s] >= 0.2]
        avg_embed = avg_embed[idx]
        avg_scores = avg_scores[idx]
        bboxes = bboxes[idx, :]
        nseg = len(avg_scores)

        # Step 2: group the superpixels of the same object according to
        # their embeddings
        bmerged = np.zeros(nseg, dtype=bool)
        merged_bboxes = bboxes
        areas = np.zeros(nseg, dtype=float)
        for s in range(nseg):
            if bmerged[s]:
                continue
            areas[s] = (merged_bboxes[s, 3] - merged_bboxes[s, 1]) * \
                       (merged_bboxes[s, 2] - merged_bboxes[s, 0])
            for t in range(s + 1, nseg):
                # TODO: we may take spatial distance as an auxiliary criterion
                if abs(avg_embed[s] - avg_embed[t]) < 0.2:
                    # merge the bounding boxes
                    merged_bboxes[s, 0] = min(merged_bboxes[s, 0],
                                              merged_bboxes[t, 0])
                    merged_bboxes[s, 1] = min(merged_bboxes[s, 1],
                                              merged_bboxes[t, 1])
                    merged_bboxes[s, 2] = max(merged_bboxes[s, 2],
                                              merged_bboxes[t, 2])
                    merged_bboxes[s, 3] = max(merged_bboxes[s, 3],
                                              merged_bboxes[t, 3])
                    areas[s] = (merged_bboxes[s, 3] - merged_bboxes[s, 1]) * \
                               (merged_bboxes[s, 2] - merged_bboxes[s, 0])
                    # merge the scores
                    avg_scores[s] = max(avg_scores[s], avg_scores[t])
                    bmerged[t] = True

        ileft = [
            s for s in range(nseg)
            if not bmerged[s] and areas[s] > H * W * 0.002
        ]
        avg_scores = avg_scores[ileft]
        bboxes = merged_bboxes[ileft, :].astype(np.int32)
        nseg = len(avg_scores)

        masks = []
        for i in range(nseg):
            mask = np.zeros(img_size, dtype=float)
            # numpy indexing is [y, x]; boxes are (x1, y1, x2, y2)
            mask[bboxes[i, 1]:bboxes[i, 3], bboxes[i, 0]:bboxes[i, 2]] = \
                seg_c[bboxes[i, 1]:bboxes[i, 3], bboxes[i, 0]:bboxes[i, 2]]
            masks.append(mask)

        pred_boxes.append(bboxes)
        pred_scores.append(avg_scores)
        pred_classes += [c] * len(avg_scores)
        pred_masks += masks

    # rescale the bounding boxes to match the output resolution
    scale_x, scale_y = (output_width / img_size[1],
                        output_height / img_size[0])
    result = Instances((output_height, output_width))
    output_boxes = Boxes(torch.tensor(np.concatenate(pred_boxes).astype(int)))
    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(result.image_size)
    result.pred_boxes = output_boxes
    result.scores = torch.tensor(np.concatenate(pred_scores))
    result.pred_classes = torch.tensor(pred_classes)
    # result.pred_masks = torch.tensor(np.concatenate(pred_masks))
    # TODO: the masks still have to be rescaled to the output size
    return result
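
# A toy sketch of the connected-components step used in Step 1 above:
# cv2.connectedComponentsWithStats labels the background 0 and reports each
# component's bounding box as LEFT/TOP/WIDTH/HEIGHT plus its pixel AREA.
def _connected_components_sketch():
    bw = np.zeros((8, 8), dtype=np.uint8)
    bw[1:3, 1:4] = 1  # first foreground blob
    bw[5:7, 5:7] = 1  # second foreground blob
    retval, labels, stats, centroids = cv2.connectedComponentsWithStats(bw)
    assert retval == 3  # background + two components
    for s in range(1, retval):  # skip the background label 0
        x1 = stats[s, cv2.CC_STAT_LEFT]
        y1 = stats[s, cv2.CC_STAT_TOP]
        x2 = x1 + stats[s, cv2.CC_STAT_WIDTH]
        y2 = y1 + stats[s, cv2.CC_STAT_HEIGHT]
        print(s, (x1, y1, x2, y2), stats[s, cv2.CC_STAT_AREA])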
def seg_det_postprocess(segmap, contour, emb, img_size, output_height,
                        output_width):
    """
    Translate segmentation predictions into detection results.
    The input images are often resized when entering the semantic segmentor.
    Moreover, in some cases they are also padded inside the segmentor to be
    divisible by the maximum network stride. As a result, we often need the
    predictions of the segmentor at a different resolution from its inputs.

    Args:
        segmap (Tensor): semantic segmentation prediction logits. A tensor of
            shape (C, H, W), where C is the number of classes, and H, W are
            the height and width of the prediction.
        contour (Tensor): contour prediction logits. A tensor of shape
            (C, H, W).
        emb (Tensor): embedding prediction. A tensor of shape (C, H, W).
        img_size (tuple): image size that the segmentor takes as input.
        output_height, output_width: the desired output resolution.

    Returns:
        Instances: detection results at the output resolution, with
        pred_boxes, scores and pred_classes fields.
    """
    segmap = segmap[:, :img_size[0], :img_size[1]].cpu().numpy()
    contour = contour[:, :img_size[0], :img_size[1]].cpu().numpy()
    emb = emb[:, :img_size[0], :img_size[1]].cpu().numpy()
    ncls = segmap.shape[0] - 1  # remove the background
    assert contour.shape[0] == ncls
    assert emb.shape[0] == ncls
    H = segmap.shape[1]
    W = segmap.shape[2]

    pred_boxes = []
    pred_scores = []
    pred_classes = []
    pred_masks = []

    # Step 1: segment the foreground (according to segmap) into superpixels
    # (separated by contours)
    for c in range(ncls):
        cont_c = contour[c]
        seg_c = segmap[c]
        emb_c = emb[c]
        # TODO: we may need to turn the contour map and segmap into binary
        # images. For now we combine the contour map and the segmentation
        # before connecting superpixels, for simplicity.
        bw = (1 - 1.5 * cont_c) * seg_c > 0.2
        retval, labels, stats, centroids = cv2.connectedComponentsWithStats(
            bw.astype(np.uint8))
        # Note: the background is labeled 0 and should be ignored.
        nseg = np.max(labels)
        assert retval == nseg + 1

        avg_embed = np.zeros(nseg)
        avg_scores = np.zeros(nseg)
        bboxes = np.zeros((nseg, 4))
        bboxes_size = np.zeros(nseg)
        sizes = np.zeros(nseg)
        for s in range(nseg):
            sizes[s] = stats[s + 1, cv2.CC_STAT_AREA]
            if sizes[s] < H * W * 0.0001:
                continue
            # calculate the average score of each superpixel
            superpixel = (labels == s + 1).astype(float)
            npixel = np.sum(superpixel)
            avg_scores[s] = np.sum(seg_c * superpixel) / npixel
            # use the median embedding of the segment, which is more robust
            # to outliers than the mean
            ipixels = np.nonzero(labels == s + 1)
            avg_embed[s] = np.median(emb_c[ipixels])
            # get the bounding boxes of superpixels in X1Y1X2Y2 format
            bboxes[s, 0] = stats[s + 1, cv2.CC_STAT_LEFT]
            bboxes[s, 1] = stats[s + 1, cv2.CC_STAT_TOP]
            bboxes[s, 2] = stats[s + 1, cv2.CC_STAT_WIDTH] + bboxes[s, 0]
            bboxes[s, 3] = stats[s + 1, cv2.CC_STAT_HEIGHT] + bboxes[s, 1]
            bboxes_size[s] = (bboxes[s, 3] - bboxes[s, 1]) * \
                             (bboxes[s, 2] - bboxes[s, 0])

        # Step 2: remove low-confidence segments
        idx = [s for s in range(nseg) if avg_scores[s] >= 0.2]
        avg_embed = avg_embed[idx]
        avg_scores = avg_scores[idx]
        bboxes = bboxes[idx, :]
        bboxes_size = bboxes_size[idx]
        sizes = sizes[idx]
        nseg = len(avg_scores)

        # Step 3: sort the segments by size, largest first
        sorted_idx = np.flip(np.argsort(sizes))
        avg_embed = avg_embed[sorted_idx]
        avg_scores = avg_scores[sorted_idx]
        bboxes = bboxes[sorted_idx, :]
        bboxes_size = bboxes_size[sorted_idx]
        sizes = sizes[sorted_idx]

        # Step 4: calculate the similarity between each pair of segments
        sim = np.zeros((nseg, nseg))
        SIM_EMB_FACTOR = 0.8
        for s in range(nseg):
            for t in range(s + 1, nseg):
                # embedding similarity, decaying with the embedding gap
                sim_emb = np.exp(-np.abs(avg_embed[s] - avg_embed[t]) /
                                 SIM_EMB_FACTOR)
                # spatial similarity based on GIoU: the union of the two box
                # areas over the area of their enclosing box
                merged_bbox = np.zeros(4)
                merged_bbox[0] = min(bboxes[s, 0], bboxes[t, 0])
                merged_bbox[1] = min(bboxes[s, 1], bboxes[t, 1])
                merged_bbox[2] = max(bboxes[s, 2], bboxes[t, 2])
                merged_bbox[3] = max(bboxes[s, 3], bboxes[t, 3])
                merged_area = (merged_bbox[3] - merged_bbox[1]) * \
                              (merged_bbox[2] - merged_bbox[0])
                overlap_bbox = np.zeros(4)
                overlap_bbox[0] = max(bboxes[s, 0], bboxes[t, 0])
                overlap_bbox[1] = max(bboxes[s, 1], bboxes[t, 1])
                overlap_bbox[2] = min(bboxes[s, 2], bboxes[t, 2])
                overlap_bbox[3] = min(bboxes[s, 3], bboxes[t, 3])
                overlap_area = max(0, overlap_bbox[2] - overlap_bbox[0]) * \
                               max(0, overlap_bbox[3] - overlap_bbox[1])
                sim_spatial = (bboxes_size[s] + bboxes_size[t] -
                               overlap_area) / merged_area
                # TODO: calculate a contour-based distance
                sim[s, t] = sim_spatial * sim_emb
                # keep the matrix symmetric so Step 5 can look up the
                # similarity to segments processed earlier
                sim[t, s] = sim[s, t]
        # TODO: calculate a keypoint-based similarity

        # Step 5: greedily group the segments of the same object according
        # to the similarity matrix
        group_IDs = np.ones(nseg, dtype=int) * -1
        ngroups = 0
        THR_SIM = 0.5
        while any(group_IDs < 0):
            for s in range(nseg):
                if group_IDs[s] < 0:
                    # find the most similar already-assigned segment
                    assigned = np.nonzero(group_IDs >= 0)
                    if assigned[0].size == 0:
                        group_IDs[s] = ngroups
                        ngroups += 1
                    else:
                        sim_group = sim[s, assigned[0]]
                        t = np.argmax(sim_group)
                        if sim_group[t] > THR_SIM:
                            group_IDs[s] = group_IDs[assigned[0][t]]
                        else:
                            group_IDs[s] = ngroups
                            ngroups += 1

        # merge the groups
        group_bboxes = np.zeros((ngroups, 4))
        group_scores = np.zeros(ngroups)
        group_areas = np.zeros(ngroups)
        for g in range(ngroups):
            assigned = np.nonzero(group_IDs == g)[0]
            group_bboxes[g, :] = bboxes[assigned[0], :]
            # use the score of the largest segment (segments are sorted by
            # size, so assigned[0] is the largest in the group)
            group_scores[g] = avg_scores[assigned[0]]
            group_areas[g] = sizes[assigned[0]]
            for s in range(1, len(assigned)):
                # merge the bounding boxes
                group_bboxes[g, 0] = min(bboxes[assigned[s], 0],
                                         group_bboxes[g, 0])
                group_bboxes[g, 1] = min(bboxes[assigned[s], 1],
                                         group_bboxes[g, 1])
                group_bboxes[g, 2] = max(bboxes[assigned[s], 2],
                                         group_bboxes[g, 2])
                group_bboxes[g, 3] = max(bboxes[assigned[s], 3],
                                         group_bboxes[g, 3])
                group_areas[g] += sizes[assigned[s]]

        # remove small groups
        if nseg:
            THR_AREA = max(sizes[0] * 0.1, H * W * 0.001)
        else:
            THR_AREA = H * W * 0.001
        ileft = np.nonzero(group_areas > THR_AREA)[0]
        avg_scores = group_scores[ileft]
        bboxes = group_bboxes[ileft, :].astype(np.int32)
        nseg = len(avg_scores)

        masks = []
        for i in range(nseg):
            mask = np.zeros((H, W), dtype=float)
            # numpy indexing is [y, x]; boxes are (x1, y1, x2, y2)
            mask[bboxes[i, 1]:bboxes[i, 3], bboxes[i, 0]:bboxes[i, 2]] = \
                seg_c[bboxes[i, 1]:bboxes[i, 3], bboxes[i, 0]:bboxes[i, 2]]
            masks.append(mask)

        pred_boxes.append(bboxes)
        pred_scores.append(avg_scores)
        pred_classes += [c] * len(avg_scores)
        pred_masks += masks

    # rescale the bounding boxes to match the output resolution
    scale_x, scale_y = (output_width / img_size[1],
                        output_height / img_size[0])
    result = Instances((output_height, output_width))
    output_boxes = Boxes(torch.tensor(np.concatenate(pred_boxes).astype(int)))
    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(result.image_size)
    result.pred_boxes = output_boxes
    result.scores = torch.tensor(np.concatenate(pred_scores))
    result.pred_classes = torch.tensor(pred_classes)
    # result.pred_masks = torch.tensor(np.concatenate(pred_masks))
    # TODO: the masks still have to be rescaled to the output size
    return result
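
# A standalone sketch (toy numbers, hypothetical helper name) of the pairwise
# similarity used in Step 4 above: the embedding term decays exponentially
# with the embedding gap, and the spatial term is the union of the two box
# areas over the area of their enclosing box.
def _pair_similarity_sketch(box_s, box_t, emb_s, emb_t, emb_factor=0.8):
    sim_emb = np.exp(-abs(emb_s - emb_t) / emb_factor)
    merged_w = max(box_s[2], box_t[2]) - min(box_s[0], box_t[0])
    merged_h = max(box_s[3], box_t[3]) - min(box_s[1], box_t[1])
    overlap_w = max(0, min(box_s[2], box_t[2]) - max(box_s[0], box_t[0]))
    overlap_h = max(0, min(box_s[3], box_t[3]) - max(box_s[1], box_t[1]))
    area_s = (box_s[2] - box_s[0]) * (box_s[3] - box_s[1])
    area_t = (box_t[2] - box_t[0]) * (box_t[3] - box_t[1])
    sim_spatial = (area_s + area_t - overlap_w * overlap_h) / \
                  (merged_w * merged_h)
    return sim_spatial * sim_emb

# Adjacent segments with close embeddings score high (~0.88, above THR_SIM):
#   _pair_similarity_sketch([0, 0, 10, 10], [8, 0, 18, 10], 1.0, 1.1)
# while distant segments with different embeddings score near zero (~0.007):
#   _pair_similarity_sketch([0, 0, 10, 10], [40, 40, 50, 50], 1.0, 3.0)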