def forward(self, x, rel_matrix=None):
    """Compute keypoint scores and part scores from the input features.

    The first half of the stacked convs feeds ``self.inter_part_score``. The
    sigmoid of those logits is turned into relation embeddings, which then
    pass through every stacked conv. The second half of the stacked convs
    continues on ``x``; both results are concatenated along dim 1 and scored
    by ``self.kpt_score``.

    Args:
        x: input tensor for the first stacked conv.
        rel_matrix: not read by this method; ``self.rel_matrix`` is what is
            handed to ``self._forward_relation_embedding``.

    Returns:
        ``(kpt_scores, part_scores)``. Both are upsampled by ``self.up_scale``
        with bilinear interpolation; ``part_scores`` is additionally passed
        through a sigmoid after upsampling.
    """

    def conv_relu(index, tensor):
        # Stacked convs are stored as attributes and looked up by name.
        conv = getattr(self, self._get_layer_name(index))
        return F.relu(conv(tensor))

    total_convs = self.n_stacked_convs
    split = total_convs // 2

    # part module
    for index in range(split):
        x = conv_relu(index, x)

    part_scores_logits = self.inter_part_score(x)
    low_res_part_probs = F.sigmoid(part_scores_logits)
    rel_embs = self._forward_relation_embedding(
        low_res_part_probs, self.rel_matrix, self.word_emb
    )

    # The returned part scores are the sigmoid of the *upsampled* logits.
    upsampled_part_logits = interpolate(
        part_scores_logits,
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
    part_scores = F.sigmoid(upsampled_part_logits)

    # The relation embeddings run through the complete conv stack.
    for index in range(total_convs):
        rel_embs = conv_relu(index, rel_embs)

    # kpt module
    for index in range(split, total_convs):
        x = conv_relu(index, x)

    fused = torch.cat([x, rel_embs], 1)
    kpt_scores = interpolate(
        self.kpt_score(fused),
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
    return kpt_scores, part_scores
def forward(self, features, rel_matrix=None):
    """Compute keypoint scores and part scores from ``features``.

    Args:
        features: input tensor for the first stacked conv.
        rel_matrix: not read by this method; ``self.kpt_rel_matrix`` is used.

    Returns:
        ``(kpt_scores, part_scores)``. ``kpt_scores`` is the upsampled output
        of ``self.final_kpt_score``. ``part_scores`` holds the upsampled
        logits of ``self.kpt_score`` with index 0 along dim 1 removed (no
        softmax is applied to the returned tensor), made contiguous.
    """
    split = self.n_stacked_convs // 2

    # part module
    x = features
    for index in range(split):
        conv = getattr(self, self._get_layer_name(index))
        x = F.relu(conv(x))

    part_scores_logits = self.kpt_score(x)

    # Index 0 along dim 1 is dropped from the probabilities and from the
    # scoring layer's weight before both are handed to the embedding helper.
    part_probs = F.softmax(part_scores_logits, dim=1)[:, 1:, :, :]
    rel_embs = self._forward_relation_embedding(
        x,
        part_probs,
        self.kpt_rel_matrix,
        self.kpt_score.weight[:, 1:, :, :],
    )

    upsampled_logits = interpolate(
        part_scores_logits,
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
    part_scores = upsampled_logits[:, 1:, :, :]

    # The second half of the conv stack refines the relation embeddings.
    for index in range(split, self.n_stacked_convs):
        conv = getattr(self, self._get_layer_name(index))
        rel_embs = F.relu(conv(rel_embs))

    kpt_scores = interpolate(
        self.final_kpt_score(rel_embs),
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
    return kpt_scores, part_scores.contiguous()
def forward_for_mask(self, boxlists):
    """Decode per-instance masks from ``self.masks`` and store them on each boxlist.

    ``self.masks`` (unpacked as N, dim, h, w) is extended with two coordinate
    channels whose values run from -1 to 1 along x and y. For every image,
    each row of ``boxlist.controllers`` is sliced into the weights and biases
    of three 1x1 convolutions (10 -> 8 -> 8 -> 1 channels per instance). The
    last two are evaluated as grouped convolutions, one group per instance.
    The sigmoid output is resized to ``(h, w) * self.strides[0]``, cropped to
    ``boxlist.image_size`` and written to ``boxlist.pred_masks``.

    The coordinate grids are created on ``self.masks.device`` rather than via
    a hard-coded ``.cuda()``, so the method works on CPU and on whichever GPU
    holds the features.

    Args:
        boxlists: indexable by image index. Each item provides
            ``image_size`` (unpacked as height, width) and ``controllers``
            with at least 169 columns.

    Returns:
        The same ``boxlists`` object, with ``pred_masks`` set on the first
        N entries.
    """
    N, _, h, w = self.masks.shape
    device = self.masks.device

    # Coordinates in [-1, 1]; same arithmetic as before, on the right device.
    xs = torch.arange(w, device=device).float() / (w - 1) * 2 - 1
    ys = torch.arange(h, device=device).float() / (h - 1) * 2 - 1
    x_map = xs.view(1, 1, 1, w).expand(N, 1, h, w)
    y_map = ys.view(1, 1, h, 1).expand(N, 1, h, w)
    masks_feat = torch.cat((self.masks, x_map, y_map), dim=1)

    o_h = int(h * self.strides[0])
    o_w = int(w * self.strides[0])

    for im in range(N):
        boxlist = boxlists[im]
        input_h, input_w = boxlist.image_size
        mask = masks_feat[None, im]

        controllers = boxlist.controllers
        ins_num = controllers.shape[0]

        # Column layout of one controller row:
        #   [0:80) w1 (8x10) | [80:88) b1 | [88:152) w2 (8x8) | [152:160) b2
        #   [160:168) w3 (1x8) | [168:169) b3
        weights1 = controllers[:, :80].reshape(-1, 10, 1, 1)
        bias1 = controllers[:, 80:88].flatten()
        weights2 = controllers[:, 88:152].reshape(-1, 8, 1, 1)
        bias2 = controllers[:, 152:160].flatten()
        weights3 = controllers[:, 160:168].unsqueeze(-1).unsqueeze(-1)
        bias3 = controllers[:, 168:169].flatten()

        # The first layer sees the shared features; later layers are grouped
        # so every instance only consumes its own 8 channels.
        conv1 = F.conv2d(mask, weights1, bias1).relu()
        conv2 = F.conv2d(conv1, weights2, bias2, groups=ins_num).relu()
        masks_per_image = F.conv2d(conv2, weights3, bias3, groups=ins_num).sigmoid()

        masks = interpolate(
            masks_per_image, size=(o_h, o_w), mode="bilinear", align_corners=False
        )
        # Crop to the image extent and move instances to the leading dim.
        boxlist.pred_masks = masks[:, :, :input_h, :input_w].permute(1, 0, 2, 3)

    return boxlists
def forward(self, x):
    """Upsample ``x`` by 2 with nearest-neighbour interpolation, then score it.

    ``align_corners`` is deliberately not passed: it only has meaning for the
    linear, bilinear, bicubic and trilinear modes, and
    ``torch.nn.functional.interpolate`` raises ``ValueError`` when it is set
    together with ``mode="nearest"``.
    NOTE(review): this assumes ``layers.interpolate`` forwards its arguments
    to ``torch.nn.functional.interpolate`` -- confirm against the wrapper.

    Args:
        x: input tensor with two spatial dims (a 2-tuple scale factor is used).

    Returns:
        The output of ``self.kps_score_lowres`` applied to the upsampled input.
    """
    upsampled = layers.interpolate(x, scale_factor=(2, 2), mode="nearest")
    return self.kps_score_lowres(upsampled)
def interp2d(input):
    """Return ``input`` upsampled bilinearly by ``self.scale_factor``.

    ``self`` is not a parameter of this function; it is resolved from the
    scope in which the function is defined.
    """
    resize_options = {
        "scale_factor": self.scale_factor,
        "mode": "bilinear",
        "align_corners": False,
    }
    return interpolate(input, **resize_options)
def forward(self, features):
    """Upsample ``features`` by 2 (nearest) and run the four low-res predictors.

    Returns:
        A pair of 4-tuples. The first holds the outputs of
        ``ann_index_lowres``, ``index_uv_lowres``, ``u_lowres`` and
        ``v_lowres`` (in that order) on the upsampled features; the second is
        four ``None`` placeholders.
    """
    upsampled = interpolate(features, scale_factor=2, mode="nearest")
    predictors = (
        self.ann_index_lowres,
        self.index_uv_lowres,
        self.u_lowres,
        self.v_lowres,
    )
    outputs = tuple(predict(upsampled) for predict in predictors)
    return outputs, (None, None, None, None)
def forward(self, x):
    """Apply ``self.kps_score_lowres`` and upsample the result bilinearly.

    Returns:
        The scored tensor enlarged by ``self.up_scale``
        (``align_corners=False``).
    """
    lowres_scores = self.kps_score_lowres(x)
    return layers.interpolate(
        lowres_scores,
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
def forward(self, x):
    """Feed ``x`` through every item yielded by iterating ``self``, then upsample.

    Returns:
        The final output enlarged by ``self.up_scale`` with bilinear
        interpolation (``align_corners=False``).
    """
    out = x
    for module in self:
        out = module(out)
    return interpolate(
        out,
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
def _subdivision_inference(self, features, mask_representations, instances):
    """Predict masks at inference time by repeated upsampling and point refinement.

    The first pass evaluates the point head on a regular grid of side
    ``self.mask_point_subdivision_init_resolution`` to create the initial
    mask logits. Every later pass doubles the resolution bilinearly, selects
    the most uncertain grid points and overwrites them with fresh point
    predictions. The result is handed to ``mask_rcnn_inference``.

    Args:
        features: passed through to ``self._point_pooler``.
        mask_representations: passed through to ``self._get_point_logits``.
        instances: iterable whose items expose ``pred_boxes`` and
            ``pred_classes``.

    Returns:
        ``instances`` after ``mask_rcnn_inference`` has been called on them.
    """
    assert not self.training

    pred_boxes = [inst.pred_boxes for inst in instances]
    pred_classes = cat([inst.pred_classes for inst in instances])

    mask_logits = None
    # +1 accounts for the initial pass that builds the coarsest prediction on
    # a regular grid. A coarse mask could seed the process instead, but it is
    # typically so low-res that subdivision would overwrite it entirely.
    for _ in range(self.mask_point_subdivision_steps + 1):
        first_pass = mask_logits is None

        if first_pass:
            point_coords = generate_regular_grid_point_coords(
                pred_classes.size(0),
                self.mask_point_subdivision_init_resolution,
                pred_boxes[0].device,
            )
        else:
            mask_logits = interpolate(
                mask_logits, scale_factor=2, mode="bilinear", align_corners=False
            )
            uncertainty_map = calculate_uncertainty(mask_logits, pred_classes)
            point_indices, point_coords = get_uncertain_point_coords_on_grid(
                uncertainty_map, self.mask_point_subdivision_num_points
            )

        # Run the point head for every point in point_coords.
        fine_grained_features = self._point_pooler(features, pred_boxes, point_coords)
        point_logits = self._get_point_logits(
            fine_grained_features, point_coords, mask_representations
        )

        if not first_pass:
            # Scatter the point predictions into the upsampled grid.
            R, C, H, W = mask_logits.shape
            point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
            flat_logits = mask_logits.reshape(R, C, H * W)
            mask_logits = flat_logits.scatter_(2, point_indices, point_logits).view(
                R, C, H, W
            )
            continue

        # Initial mask logits come straight from the regular-grid predictions.
        R, C, _ = point_logits.shape
        mask_logits = point_logits.reshape(
            R,
            C,
            self.mask_point_subdivision_init_resolution,
            self.mask_point_subdivision_init_resolution,
        )
        # The subdivision code fails on an empty list of boxes, so stop here.
        if len(pred_classes) == 0:
            break

    mask_rcnn_inference(mask_logits, instances)
    return instances
def layers(self, x):
    """Run ``x`` through ``self.blocks`` (ReLU after each), score and upsample it.

    Returns:
        ``self.score_lowres`` output enlarged by ``self.up_scale`` with
        bilinear interpolation (``align_corners=False``).
    """
    hidden = x
    for block in self.blocks:
        hidden = block(hidden)
        hidden = F.relu(hidden)
    lowres_scores = self.score_lowres(hidden)
    return interpolate(
        lowres_scores,
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
def interp2d(input):
    """Upsample ``input`` bilinearly by ``self.scale_factor``.

    When the scale factor equals 1 the very same ``input`` object is returned
    and no interpolation is performed. ``self`` is not a parameter; it is
    resolved from the scope in which the function is defined.
    """
    if self.scale_factor != 1:
        return interpolate(
            input,
            scale_factor=self.scale_factor,
            mode="bilinear",
            align_corners=False,
        )
    return input
def forward(self, x):
    """Apply each block in ``self.blocks`` followed by ReLU, then score and upsample.

    Returns:
        ``self.score_lowres`` output enlarged by ``self.up_scale`` with
        bilinear interpolation (``align_corners=False``).
    """
    activations = x
    for block in self.blocks:
        activations = F.relu(block(activations))

    scores = self.score_lowres(activations)
    scores = interpolate(
        scores, scale_factor=self.up_scale, mode="bilinear", align_corners=False
    )
    return scores
def forward(self, x):
    """Run ``x`` through ``self.conv_fcns`` with ReLU, then score and upsample.

    Returns:
        ``self.score_lowres`` output enlarged by ``self.up_scale`` with
        bilinear interpolation (``align_corners=False``).
    """
    feats = x
    for conv_fcn in self.conv_fcns:
        feats = F.relu(conv_fcn(feats))
    return interpolate(
        self.score_lowres(feats),
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
def interp2d(self, tensor_nchw: torch.Tensor):
    """
    Upscale a tensor with bilinear interpolation.

    Args:
        tensor_nchw (tensor): tensor of shape (N, C, H, W)
    Return:
        tensor of shape (N, C, Hout, Wout), where Hout and Wout result from
        applying ``self.scale_factor`` to H and W
    """
    upscale_options = dict(
        scale_factor=self.scale_factor, mode="bilinear", align_corners=False
    )
    return interpolate(tensor_nchw, **upscale_options)
def forward_for_mask(self, boxlists):
    """Combine prototypes with per-instance coefficients into masks.

    ``self.protos`` is resized bilinearly to its spatial size times
    ``self.strides[0]``. For every image, each row of ``boxlist.coeffs``
    weights the image's prototypes; the weighted sum over dim 1 goes through
    a sigmoid, is cropped to ``boxlist.image_size`` (unpacked as height,
    width) and stored on ``boxlist.pred_masks``.

    Args:
        boxlists: indexable by image index; items expose ``image_size`` and
            ``coeffs``.

    Returns:
        The same ``boxlists`` object.
    """
    _, _, m_h, m_w = self.protos.shape
    stride = self.strides[0]
    out_size = (int(m_h * stride), int(m_w * stride))
    protos = interpolate(
        self.protos, size=out_size, mode="bilinear", align_corners=False
    )

    for im, proto in enumerate(protos):
        boxlist = boxlists[im]
        input_h, input_w = boxlist.image_size
        # Two trailing singleton dims let the coefficients broadcast over
        # the prototype's spatial dims.
        coeffs = boxlist.coeffs[..., None, None]
        combined = (coeffs * proto).sum(dim=1, keepdim=True).sigmoid()
        boxlist.pred_masks = combined[:, :, :input_h, :input_w]

    return boxlists
def forward(self, x, rel_matrix=None):
    """Run the stacked convs, a transposed conv with generated weights, and upsample.

    Args:
        x: input tensor; when ``len(x) == 0`` an empty ``(0, 0, 0, 0)``
            tensor on ``x.device`` is returned immediately.
        rel_matrix: not read by this method; ``self.rel_matrix`` is used.

    Returns:
        The transposed-conv output (stride 2, padding 1, bias
        ``self.kpt_bias``) enlarged by ``self.up_scale`` with bilinear
        interpolation.
    """
    if len(x) == 0:
        return torch.zeros(size=(0, 0, 0, 0), device=x.device)

    feats = x
    for index in range(self.n_stacked_convs):
        conv = getattr(self, self._get_layer_name(index))
        feats = F.relu(conv(feats))

    # The deconvolution kernel is derived from the stored weight through the
    # relation-embedding helper.
    kpt_weight = self._forward_relation_embedding(self.kpt_weight, self.rel_matrix)
    lowres_scores = nn.functional.conv_transpose2d(
        feats, weight=kpt_weight, bias=self.kpt_bias, padding=1, stride=2
    )
    return interpolate(
        lowres_scores,
        scale_factor=self.up_scale,
        mode="bilinear",
        align_corners=False,
    )
def prepare_masks(self, m_h, m_w, r_h, r_w, targets_masks):
    """Pad each image's masks to ``(r_h, r_w)`` and resize them to ``(m_h, m_w)``.

    Args:
        m_h, m_w: output mask height and width.
        r_h, r_w: size of the zero canvas the masks are pasted into (top-left
            aligned) before resizing.
        targets_masks: sequence with one ``(n, h, w)`` tensor per image; an
            entry of length 0 yields an empty tensor.

    Returns:
        A list with one entry per image: a boolean ``(n, m_h, m_w)`` tensor
        that is True wherever the bilinearly resized mask is greater than 0.
    """

    def resize_one(mask_t):
        if len(mask_t) == 0:
            return mask_t.new_tensor([])
        n, h, w = mask_t.shape
        canvas = mask_t.new_zeros((n, r_h, r_w))
        canvas[:, :h, :w] = mask_t
        # Add a batch dim for interpolate, then strip it again.
        resized = interpolate(
            input=canvas.float().unsqueeze(0),
            size=(m_h, m_w),
            mode="bilinear",
            align_corners=False,
        )
        return resized[0].gt(0)

    return [resize_one(mask_t) for mask_t in targets_masks]
def process_heatmaps(maps, rois, img_shapes):
    """
    Render per-ROI keypoint confidence maps onto image-sized canvases.

    Args:
        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted
            heatmap of logits for each ROI and each keypoint.
        rois (Tensor): (#ROIs, 4). The box of each ROI; columns are read as
            (x0, y0, x1, y1).
        img_shapes: indexable by ROI index; ``img_shapes[i][0]`` and
            ``img_shapes[i][1]`` give the canvas height and width for ROI i.

    Returns:
        list[Tensor]: one float tensor per ROI of shape
        (#keypoints, img_shapes[i][0], img_shapes[i][1]). It is zero outside
        the ROI; inside, it holds the normalized score multiplied by 255 and
        passed through a uint8 cast.
    """
    offset_i = rois[:, 1].int()
    offset_j = rois[:, 0].int()

    widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
    heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_rois, num_keypoints = maps.shape[:2]
    roi_map_scores = [
        torch.zeros((num_keypoints, img_shapes[i][0], img_shapes[i][1]))
        for i in range(num_rois)
    ]

    for i in range(num_rois):
        outsize = (int(heights_ceil[i]), int(widths_ceil[i]))
        # #keypoints x H x W
        roi_map = interpolate(
            maps[[i]], size=outsize, mode="bicubic", align_corners=False
        ).squeeze(0)

        # softmax over the spatial region
        max_score, _ = roi_map.view(num_keypoints, -1).max(1)
        max_score = max_score.view(num_keypoints, 1, 1)
        tmp_full_resolution = (roi_map - max_score).exp_()
        tmp_pool_resolution = (maps[i] - max_score).exp_()

        # Produce scores over the region H x W, but normalize with
        # POOL_H x POOL_W, so that the scores of objects of different
        # absolute sizes will be more comparable.
        norm_score = (
            tmp_full_resolution / tmp_pool_resolution.sum((1, 2), keepdim=True) * 255.0
        ).to(torch.uint8)

        # Paste all keypoint channels at once instead of looping per keypoint.
        top, left = int(offset_i[i]), int(offset_j[i])
        roi_map_scores[i][
            :, top:top + outsize[0], left:left + outsize[1]
        ] = norm_score.float()

    return roi_map_scores
def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
    """
    Extract predicted keypoint locations from heatmaps.

    Args:
        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W)
        rois (Tensor): (#ROIs, 4)

    Returns:
        Tensor of shape (#rois, #keypoints, 4) with the last dimension
        corresponding to (x, y, logit, prob) for each keypoint.

    Converts a discrete image coordinate in an NxN image to a continuous
    keypoint coordinate. Consistency with keypoints_to_heatmap is kept by
    using the conversion from Heckbert 1990: c = d + 0.5, where d is a
    discrete coordinate and c is a continuous coordinate.
    """
    roi_x0 = rois[:, 0]
    roi_y0 = rois[:, 1]

    roi_widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
    roi_heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
    roi_widths_ceil = roi_widths.ceil()
    roi_heights_ceil = roi_heights.ceil()

    num_rois, num_keypoints = maps.shape[:2]
    xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4)

    # Heatmaps are resized to integer sizes; these factors map the integer
    # grid back onto the real-valued box extent.
    x_corrections = roi_widths / roi_widths_ceil
    y_corrections = roi_heights / roi_heights_ceil

    keypoints_idx = torch.arange(num_keypoints, device=maps.device)

    for roi in range(num_rois):
        out_h = int(roi_heights_ceil[roi])
        out_w = int(roi_widths_ceil[roi])
        # #keypoints x H x W
        roi_map = interpolate(
            maps[[roi]], size=(out_h, out_w), mode="bicubic", align_corners=False
        ).squeeze(0)
        flat_map = roi_map.view(num_keypoints, -1)

        # softmax over the spatial region
        peak = flat_map.max(1)[0].view(num_keypoints, 1, 1)
        exp_full_resolution = (roi_map - peak).exp_()
        exp_pool_resolution = (maps[roi] - peak).exp_()
        # Produce scores over the region H x W, but normalize with
        # POOL_H x POOL_W so that the scores of objects of different absolute
        # sizes will be more comparable.
        roi_map_probs = exp_full_resolution / exp_pool_resolution.sum(
            (1, 2), keepdim=True
        )

        map_width = roi_map.shape[2]
        pos = flat_map.argmax(1)
        x_int = pos % map_width
        y_int = (pos - x_int) // map_width

        peak_probs = roi_map_probs[keypoints_idx, y_int, x_int]
        assert (peak_probs == roi_map_probs.view(num_keypoints, -1).max(1)[0]).all()

        xy_preds[roi, :, 0] = (x_int.float() + 0.5) * x_corrections[roi] + roi_x0[roi]
        xy_preds[roi, :, 1] = (y_int.float() + 0.5) * y_corrections[roi] + roi_y0[roi]
        xy_preds[roi, :, 2] = roi_map[keypoints_idx, y_int, x_int]
        xy_preds[roi, :, 3] = peak_probs

    return xy_preds
def _forward_mask_point(self, features, mask_coarse_logits, instances):
    """
    Forward logic of the mask point head.

    When ``self.mask_point_on`` is false, returns ``{}`` in training and
    ``mask_coarse_logits`` otherwise. In training it returns a dict holding
    ``"loss_mask_point"``. At inference it returns mask logits obtained by
    repeatedly doubling the coarse logits and overwriting the most uncertain
    points with point-head predictions.
    """
    if not self.mask_point_on:
        return {} if self.training else mask_coarse_logits

    mask_features_list = [features[name] for name in self.mask_point_in_features]
    features_scales = [
        self._feature_scales[name] for name in self.mask_point_in_features
    ]

    if self.training:
        proposal_boxes = [inst.proposal_boxes for inst in instances]
        gt_classes = cat([inst.gt_classes for inst in instances])

        # Point selection is done without tracking gradients.
        with torch.no_grad():
            point_coords = get_uncertain_point_coords_with_randomness(
                mask_coarse_logits,
                lambda logits: calculate_uncertainty(logits, gt_classes),
                self.mask_point_train_num_points,
                self.mask_point_oversample_ratio,
                self.mask_point_importance_sample_ratio,
            )

        fine_grained_features, point_coords_wrt_image = (
            point_sample_fine_grained_features(
                mask_features_list, features_scales, proposal_boxes, point_coords
            )
        )
        coarse_features = point_sample(
            mask_coarse_logits, point_coords, align_corners=False
        )
        point_logits = self.point_head(fine_grained_features, coarse_features)
        loss = roi_mask_point_loss(point_logits, instances, point_coords_wrt_image)
        return {"loss_mask_point": loss}

    pred_boxes = [inst.pred_boxes for inst in instances]
    pred_classes = cat([inst.pred_classes for inst in instances])
    # The subdivision code will fail with the empty list of boxes
    if len(pred_classes) == 0:
        return mask_coarse_logits

    num_steps = self.mask_point_subdivision_steps
    mask_logits = mask_coarse_logits.clone()
    for step in range(num_steps):
        mask_logits = interpolate(
            mask_logits, scale_factor=2, mode="bilinear", align_corners=False
        )
        # If `mask_point_subdivision_num_points` is larger or equal to the
        # resolution of the next step, then we can skip this step
        H, W = mask_logits.shape[-2:]
        needs_refinement = (
            self.mask_point_subdivision_num_points < 4 * H * W
            or step >= num_steps - 1
        )
        if not needs_refinement:
            continue

        uncertainty_map = calculate_uncertainty(mask_logits, pred_classes)
        point_indices, point_coords = get_uncertain_point_coords_on_grid(
            uncertainty_map, self.mask_point_subdivision_num_points
        )
        fine_grained_features, _ = point_sample_fine_grained_features(
            mask_features_list, features_scales, pred_boxes, point_coords
        )
        coarse_features = point_sample(
            mask_coarse_logits, point_coords, align_corners=False
        )
        point_logits = self.point_head(fine_grained_features, coarse_features)

        # put mask point predictions to the right places on the upsampled grid.
        R, C, H, W = mask_logits.shape
        point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
        flat_logits = mask_logits.reshape(R, C, H * W)
        mask_logits = flat_logits.scatter_(2, point_indices, point_logits).view(
            R, C, H, W
        )

    return mask_logits
def _forward_mask_point(self, features, mask_coarse_logits, instances):
    """
    Forward logic of the mask point head.

    When ``self.mask_point_on`` is false, returns ``{}`` in training and
    ``mask_coarse_logits`` otherwise. In training it returns a dict holding
    ``"loss_mask_point"``. At inference it builds mask logits from a regular
    grid of point predictions and then refines them over
    ``self.mask_point_subdivision_steps`` upsampling passes.
    """
    if not self.mask_point_on:
        return {} if self.training else mask_coarse_logits

    mask_features_list = [features[name] for name in self.mask_point_in_features]
    features_scales = [
        self._feature_scales[name] for name in self.mask_point_in_features
    ]

    if self.training:
        proposal_boxes = [inst.proposal_boxes for inst in instances]
        gt_classes = cat([inst.gt_classes for inst in instances])

        # Point selection is done without tracking gradients.
        with torch.no_grad():
            point_coords = get_uncertain_point_coords_with_randomness(
                mask_coarse_logits,
                lambda logits: calculate_uncertainty(logits, gt_classes),
                self.mask_point_train_num_points,
                self.mask_point_oversample_ratio,
                self.mask_point_importance_sample_ratio,
            )

        fine_grained_features, point_coords_wrt_image = (
            point_sample_fine_grained_features(
                mask_features_list, features_scales, proposal_boxes, point_coords
            )
        )
        coarse_features = point_sample(
            mask_coarse_logits, point_coords, align_corners=False
        )
        point_logits = self.point_head(fine_grained_features, coarse_features)
        loss = roi_mask_point_loss(point_logits, instances, point_coords_wrt_image)
        return {"loss_mask_point": loss}

    pred_boxes = [inst.pred_boxes for inst in instances]
    pred_classes = cat([inst.pred_classes for inst in instances])
    # The subdivision code will fail with the empty list of boxes
    if len(pred_classes) == 0:
        return mask_coarse_logits

    mask_logits = None
    # +1 here to include an initial step to generate the coarsest mask
    # prediction with init_resolution, when mask_logits is None.
    # We compute initial mask by sampling on a regular grid. coarse_mask
    # can be used as initial mask as well, but it's typically very low-res
    # so it will be completely overwritten during subdivision anyway.
    for _ in range(self.mask_point_subdivision_steps + 1):
        first_pass = mask_logits is None

        if first_pass:
            point_coords = generate_regular_grid_point_coords(
                pred_classes.size(0),
                self.mask_point_subdivision_init_resolution,
                pred_boxes[0].device,
            )
        else:
            mask_logits = interpolate(
                mask_logits, scale_factor=2, mode="bilinear", align_corners=False
            )
            uncertainty_map = calculate_uncertainty(mask_logits, pred_classes)
            point_indices, point_coords = get_uncertain_point_coords_on_grid(
                uncertainty_map, self.mask_point_subdivision_num_points
            )

        # Run the point head for every point in point_coords
        fine_grained_features, _ = point_sample_fine_grained_features(
            mask_features_list, features_scales, pred_boxes, point_coords
        )
        coarse_features = point_sample(
            mask_coarse_logits, point_coords, align_corners=False
        )
        point_logits = self.point_head(fine_grained_features, coarse_features)

        if first_pass:
            # Create initial mask_logits using point_logits on this regular grid
            R, C, _ = point_logits.shape
            mask_logits = point_logits.reshape(
                R,
                C,
                self.mask_point_subdivision_init_resolution,
                self.mask_point_subdivision_init_resolution,
            )
        else:
            # Put point predictions to the right places on the upsampled grid.
            R, C, H, W = mask_logits.shape
            point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
            flat_logits = mask_logits.reshape(R, C, H * W)
            mask_logits = flat_logits.scatter_(2, point_indices, point_logits).view(
                R, C, H, W
            )

    return mask_logits
def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor:
    """
    Extract predicted keypoint locations from heatmaps.

    Args:
        maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for
            each ROI and each keypoint.
        rois (Tensor): (#ROIs, 4). The box of each ROI.

    Returns:
        Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to
        (x, y, logit, score) for each keypoint.

    When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate,
    we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from
    Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate.
    """
    # The decorator use of torch.no_grad() was not supported by torchscript.
    # https://github.com/pytorch/pytorch/pull/41371
    maps = maps.detach()
    rois = rois.detach()

    roi_x0 = rois[:, 0]
    roi_y0 = rois[:, 1]

    roi_widths = (rois[:, 2] - rois[:, 0]).clamp(min=1)
    roi_heights = (rois[:, 3] - rois[:, 1]).clamp(min=1)
    roi_widths_ceil = roi_widths.ceil()
    roi_heights_ceil = roi_heights.ceil()

    num_rois, num_keypoints = maps.shape[:2]
    xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4)

    # Heatmaps are resized to integer sizes; these factors map the integer
    # grid back onto the real-valued box extent.
    x_corrections = roi_widths / roi_widths_ceil
    y_corrections = roi_heights / roi_heights_ceil

    keypoints_idx = torch.arange(num_keypoints, device=maps.device)

    for roi in range(num_rois):
        out_h = int(roi_heights_ceil[roi])
        out_w = int(roi_widths_ceil[roi])
        # #keypoints x H x W
        roi_map = interpolate(
            maps[[roi]], size=(out_h, out_w), mode="bicubic", align_corners=False
        ).squeeze(0)
        flat_map = roi_map.view(num_keypoints, -1)

        # softmax over the spatial region
        peak = flat_map.max(1)[0].view(num_keypoints, 1, 1)
        exp_full_resolution = (roi_map - peak).exp_()
        exp_pool_resolution = (maps[roi] - peak).exp_()
        # Produce scores over the region H x W, but normalize with POOL_H x POOL_W,
        # so that the scores of objects of different absolute sizes will be more comparable
        roi_map_scores = exp_full_resolution / exp_pool_resolution.sum(
            (1, 2), keepdim=True
        )

        map_width = roi_map.shape[2]
        pos = flat_map.argmax(1)
        x_int = pos % map_width
        y_int = (pos - x_int) // map_width

        peak_scores = roi_map_scores[keypoints_idx, y_int, x_int]
        assert (peak_scores == roi_map_scores.view(num_keypoints, -1).max(1)[0]).all()

        xy_preds[roi, :, 0] = (x_int.float() + 0.5) * x_corrections[roi] + roi_x0[roi]
        xy_preds[roi, :, 1] = (y_int.float() + 0.5) * y_corrections[roi] + roi_y0[roi]
        xy_preds[roi, :, 2] = roi_map[keypoints_idx, y_int, x_int]
        xy_preds[roi, :, 3] = peak_scores

    return xy_preds