from typing import Any, Dict, List

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

# `squared_euclidean_distance_matrix`, `normalize_embeddings`, `Instances`,
# `DensePoseDataRelative`, `MeshCatalog`, `PackedCseAnnotations`,
# `BilinearInterpolationHelper` and `IntTupleBox` are provided by the
# surrounding detectron2 / densepose package.


def densepose_cse_predictions_to_dict(instances, embedder, class_to_mesh_name):
    results = []
    pred_classes = instances.pred_classes.tolist()
    for k in range(len(instances)):
        cse = instances.pred_densepose[k]
        box_xyxy = instances.pred_boxes[k].tensor.int().tolist()[0]
        w = max(box_xyxy[2] - box_xyxy[0], 1)
        h = max(box_xyxy[3] - box_xyxy[1], 1)
        coarse_segm_resized = F.interpolate(
            cse.coarse_segm, (h, w), mode="bilinear", align_corners=False
        )
        embedding_resized = F.interpolate(
            cse.embedding, (h, w), mode="bilinear", align_corners=False
        )
        mesh_name = class_to_mesh_name[pred_classes[k]]
        mesh_vertex_embeddings = embedder(mesh_name).to(embedding_resized.device)
        # compute the closest mesh vertex for each pixel of the instance,
        # one row at a time to keep the distance matrix small
        pixel_vertex_indices = np.zeros((h, w))
        for i in range(h):
            local_embeddings = embedding_resized[0, :, i, :].t()
            edm = squared_euclidean_distance_matrix(local_embeddings, mesh_vertex_embeddings)
            pixel_vertex_indices[i] = edm.argmin(dim=1).int().cpu().numpy()
        cse_mask = coarse_segm_resized[0].argmax(0).cpu().numpy().astype(np.int8)
        results.append({"cse_mask": cse_mask, "cse_indices": pixel_vertex_indices})
    return results
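

# A minimal sketch of the distance helper used throughout this file, under the
# assumption that squared_euclidean_distance_matrix(pts1, pts2) maps tensors of
# shape [M, D] and [N, D] to an [M, N] matrix of squared L2 distances. This
# body is an illustration, not the canonical implementation; it uses the
# expansion ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2 to avoid materializing an
# [M, N, D] intermediate.


def _squared_euclidean_distance_matrix_sketch(
    pts1: torch.Tensor, pts2: torch.Tensor
) -> torch.Tensor:
    # pts1: [M, D], pts2: [N, D] -> edm: [M, N]
    edm = torch.mm(-2 * pts1, pts2.t())
    edm += (pts1 * pts1).sum(1, keepdim=True)
    edm += (pts2 * pts2).sum(1, keepdim=True).t()
    return edm.contiguous()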


def _create_pixel_dist_matrix(grid_size: int) -> torch.Tensor:
    rows = torch.arange(grid_size)
    cols = torch.arange(grid_size)
    # at index `i` contains [row, col], where
    # row = i // grid_size
    # col = i % grid_size
    pix_coords = (
        torch.stack(torch.meshgrid(rows, cols), -1)
        .reshape((grid_size * grid_size, 2))
        .float()
    )
    return squared_euclidean_distance_matrix(pix_coords, pix_coords)
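

# Hedged usage sketch (illustration, not part of the original file): for
# grid_size = 2 the flattened pixel order is (0,0), (0,1), (1,0), (1,1), so
# the squared distance between pixel 0 and pixel 3 is 1**2 + 1**2 = 2.


def _check_pixel_dist_matrix_sketch() -> None:
    m = _create_pixel_dist_matrix(2)
    assert m.shape == (4, 4)
    assert m[0, 3].item() == 2.0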


def _sample(self, instance: Instances, bbox_xywh: IntTupleBox) -> Dict[str, List[Any]]:
    """
    Sample DensePoseDataRelative from estimation results
    """
    if self.use_gt_categories:
        instance_class = instance.dataset_classes.tolist()[0]
    else:
        instance_class = instance.pred_classes.tolist()[0]
    mesh_name = self.class_to_mesh_name[instance_class]

    annotation = {
        DensePoseDataRelative.X_KEY: [],
        DensePoseDataRelative.Y_KEY: [],
        DensePoseDataRelative.VERTEX_IDS_KEY: [],
        DensePoseDataRelative.MESH_NAME_KEY: mesh_name,
    }

    mask, embeddings, other_values = self._produce_mask_and_results(instance, bbox_xywh)
    indices = torch.nonzero(mask, as_tuple=True)
    selected_embeddings = embeddings.permute(1, 2, 0)[indices]
    values = other_values[:, indices[0], indices[1]]
    k = values.shape[1]

    count = min(self.count_per_class, k)
    if count <= 0:
        return annotation

    index_sample = self._produce_index_sample(values, count)
    closest_vertices = squared_euclidean_distance_matrix(
        selected_embeddings[index_sample], self.embedder(mesh_name)
    )
    closest_vertices = torch.argmin(closest_vertices, dim=1)

    sampled_y = indices[0][index_sample] + 0.5
    sampled_x = indices[1][index_sample] + 0.5
    # normalize coordinates to the 256 x 256 box-relative frame
    # used by DensePoseDataRelative
    _, _, w, h = bbox_xywh
    x = (sampled_x / w * 256.0).cpu().tolist()
    y = (sampled_y / h * 256.0).cpu().tolist()
    # extend annotations
    annotation[DensePoseDataRelative.X_KEY].extend(x)
    annotation[DensePoseDataRelative.Y_KEY].extend(y)
    annotation[DensePoseDataRelative.VERTEX_IDS_KEY].extend(closest_vertices.cpu().tolist())

    return annotation
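

# Hedged sketch of the coordinate normalization above: DensePose-style
# annotations store point coordinates in a 256 x 256 box-relative frame, so a
# pixel index inside a bbox of extent `box_extent` is shifted to the pixel
# center (+0.5) and rescaled. The helper name is hypothetical.


def _normalize_to_256_sketch(pixel_index: float, box_extent: float) -> float:
    return (pixel_index + 0.5) / box_extent * 256.0


# e.g. pixel column 49 of a 100-px-wide box maps to (49.5 / 100) * 256 = 126.72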


def __call__(
    self,
    proposals_with_gt: List[Instances],
    densepose_predictor_outputs: Any,
    packed_annotations: PackedCseAnnotations,
    interpolator: BilinearInterpolationHelper,
    embedder: nn.Module,
) -> Dict[str, torch.Tensor]:
    """
    Produces losses for estimated embeddings given annotated vertices.
    Embeddings for all the vertices of a mesh are computed by the embedder.
    Embeddings for observed pixels are estimated by a predictor.
    Losses are computed as cross-entropy for squared distances between
    observed vertex embeddings and all mesh vertex embeddings given
    ground truth vertex IDs.

    Args:
        proposals_with_gt (list of Instances): detections with associated
            ground truth data; each item corresponds to instances detected
            on 1 image; the number of items corresponds to the number of
            images in a batch
        densepose_predictor_outputs: an object of a dataclass that contains
            predictor outputs with estimated values; assumed to have the
            following attributes:
            * embedding - embedding estimates, tensor of shape [N, D, S, S],
              where
              N = number of instances (= sum N_i, where N_i is the number
                  of instances on image i)
              D = embedding space dimensionality
                  (MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBED_SIZE)
              S = output size (width and height)
        packed_annotations (PackedCseAnnotations): contains various data
            useful for loss computation, each data is packed into a single
            tensor
        interpolator (BilinearInterpolationHelper): bilinear interpolation
            helper
        embedder (nn.Module): module that computes vertex embeddings for
            different meshes
    Return:
        dict(str -> tensor): losses for different meshes, keyed by mesh name
    """
    losses = {}
    for mesh_id_tensor in packed_annotations.vertex_mesh_ids_gt.unique():  # pyre-ignore[16]
        mesh_id = mesh_id_tensor.item()
        mesh_name = MeshCatalog.get_mesh_name(mesh_id)
        # valid points are those that fall into estimated bbox
        # and correspond to the current mesh
        j_valid = interpolator.j_valid * (  # pyre-ignore[16]
            packed_annotations.vertex_mesh_ids_gt == mesh_id
        )
        # extract estimated embeddings for valid points
        # -> tensor [J, D]
        vertex_embeddings_i = normalize_embeddings(
            interpolator.extract_at_points(
                densepose_predictor_outputs.embedding,
                slice_fine_segm=slice(None),
                w_ylo_xlo=interpolator.w_ylo_xlo[:, None],  # pyre-ignore[16]
                w_ylo_xhi=interpolator.w_ylo_xhi[:, None],  # pyre-ignore[16]
                w_yhi_xlo=interpolator.w_yhi_xlo[:, None],  # pyre-ignore[16]
                w_yhi_xhi=interpolator.w_yhi_xhi[:, None],  # pyre-ignore[16]
            )[j_valid, :]
        )
        # extract vertex ids for valid points
        # -> tensor [J]
        vertex_indices_i = packed_annotations.vertex_ids_gt[j_valid]
        # embeddings for all mesh vertices
        # -> tensor [K, D]
        mesh_vertex_embeddings = embedder(mesh_name)
        # unnormalized scores for valid points
        # -> tensor [J, K]
        scores = squared_euclidean_distance_matrix(
            vertex_embeddings_i, mesh_vertex_embeddings
        ) / (-self.embdist_gauss_sigma)
        losses[mesh_name] = F.cross_entropy(scores, vertex_indices_i, ignore_index=-1)
    # meshes that received no annotated points still get a placeholder loss
    # entry, so that every mesh name is present in the returned dict
    for mesh_name in embedder.mesh_names:  # pyre-ignore[16]
        if mesh_name not in losses:
            losses[mesh_name] = self.fake_value(
                densepose_predictor_outputs, embedder, mesh_name
            )
    return losses
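

# Hedged toy example of the scoring scheme above: the loss treats
# -||e_pixel - e_vertex||^2 / sigma as logits over all K mesh vertices and
# applies cross-entropy against the annotated vertex id, so closer vertices
# receive higher scores. All shapes and the sigma value below are assumptions
# for illustration only.


def _cse_loss_sketch() -> torch.Tensor:
    J, K, D, sigma = 5, 7, 16, 0.1
    pixel_emb = F.normalize(torch.randn(J, D), dim=1)  # predicted, normalized
    vertex_emb = torch.randn(K, D)                     # embedder output
    edm = torch.cdist(pixel_emb, vertex_emb) ** 2      # [J, K] squared distances
    scores = edm / (-sigma)                            # closer vertex -> higher logit
    gt_vertex_ids = torch.randint(0, K, (J,))          # annotated vertex ids
    return F.cross_entropy(scores, gt_vertex_ids)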