def cat_boxlist(bboxes):
    """
    Merge several BoxList objects (all for the same image size) into one.

    Every input must agree on image size, box mode and the set of extra
    field names. An empty input produces an empty 'xyxy' BoxList whose
    image size is (0, 0).

    Arguments:
        bboxes (list[BoxList])
    """
    assert isinstance(bboxes, (list, tuple))
    assert all(isinstance(item, BoxList) for item in bboxes)

    if len(bboxes) == 0:
        return BoxList(jt.empty((0, 4)), (0, 0), mode='xyxy')

    # The first element is the reference every other element is checked against.
    size = bboxes[0].size
    assert all(item.size == size for item in bboxes)

    mode = bboxes[0].mode
    assert all(item.mode == mode for item in bboxes)

    fields = set(bboxes[0].fields())
    assert all(set(item.fields()) == fields for item in bboxes)

    stacked_boxes = _cat([item.bbox for item in bboxes], dim=0)
    merged = BoxList(stacked_boxes, size, mode)

    for name in fields:
        per_item_values = [item.get_field(name) for item in bboxes]
        merged.add_field(name, _cat(per_item_values, dim=0))

    return merged
def mask_prob_cuda(embed_pixel, embed_center, sigma_center, boxes, box_areas, area_sum, mask_width):
    """Run the custom CUDA op described by CUDA_HEADER / CUDA_SRC.

    The inputs are validated first: ``embed_pixel`` and ``embed_center``
    must be 2-D with the same second dimension, ``sigma_center`` must be 1-D,
    and ``embed_center``, ``sigma_center`` and ``boxes`` must share their
    first dimension.

    Returns a Var of shape (embed_pixel.shape[0], embed_center.shape[0]) with
    the dtype of ``embed_pixel``, or an empty Var of that dtype when either
    of those two sizes is zero.
    """
    assert embed_pixel.ndim == 2, "embed_pixel should be MxDim"
    assert embed_center.ndim == 2, "embed_center should be NxDim"
    assert sigma_center.ndim == 1, "sigma_center should be N"
    assert embed_pixel.shape[1] == embed_center.shape[1], "Dim should the same"
    assert embed_center.shape[0] == sigma_center.shape[0], \
        "center number should be the same"
    assert embed_center.shape[0] == boxes.shape[0], \
        "center number and box number should be the same"

    num_pixels = embed_pixel.shape[0]
    num_centers = embed_center.shape[0]
    if num_pixels * num_centers == 0:
        # Nothing to compute; skip the kernel launch entirely.
        return jt.array([], embed_pixel.dtype)

    # A zero-length Var is used only as a carrier for the integer
    # compile options handed to the op.
    option = jt.empty((0, ))
    option.compile_options = {
        "area_sum": int(area_sum),
        "mask_width": int(mask_width),
    }

    return jt.code(
        (num_pixels, num_centers),
        embed_pixel.dtype,
        [embed_pixel, embed_center, sigma_center, boxes, box_areas, option],
        cuda_header=CUDA_HEADER,
        cuda_src=CUDA_SRC,
    )
def concat(arr, dim):
    '''Concat Operator can concat a list of jt Var at a specific dimension.

    * [in] arr: input var list for concat

    * [in] dim: concat which dim (a negative value counts from the end)

    * [out] out: concat result

    Raises ValueError when ``arr`` is empty.

    Example::

        jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=0)
        # return [[1],[2],[2],[2]]
    '''
    # An empty list has no element to take the output shape/dtype from;
    # fail with a clear error instead of an unbound-variable error below.
    if len(arr) == 0:
        raise ValueError("need at least one array to concat")

    # TODO: low performance when concat lots of vars
    if dim < 0:
        dim += len(arr[0].shape)

    total_dim = sum(a.shape[dim] for a in arr)

    # The output takes its dtype and its non-concatenated dimensions from
    # the last element of the list.
    last = arr[-1]
    shape = list(last.shape)
    shape[dim] = total_dim
    s = jt.empty(shape, last.dtype)

    slices = [slice(None)] * len(last.shape)
    cdim = 0
    for a in arr:
        extent = a.shape[dim]
        if extent == 0:
            continue
        # Write this piece into its own window along `dim`.
        slices[dim] = slice(cdim, cdim + extent)
        s = s.setitem(tuple(slices), a)
        cdim += extent
    return s
def convert_to_binarymask(self):
    """Return a BinaryMaskList built from every polygon instance held here.

    With no instances, an empty boolean mask of shape
    [0, size[1], size[0]] is used instead.
    """
    if len(self) > 0:
        per_instance = [poly.convert_to_binarymask() for poly in self.polygons]
        masks = jt.stack(per_instance)
    else:
        size = self.size
        empty_shape = [0, size[1], size[0]]
        masks = jt.empty(empty_shape).bool()
    return BinaryMaskList(masks, size=self.size)
def forward_face_index_map(self, faces, face_index_map, weight_map, depth_map, face_inv_map):
    """Delegate to ``rasterize_cuda.forward_face_index_map``.

    Allocates a ``faces_inv`` buffer with the same shape as ``faces`` and
    forwards it together with this object's image size, near/far values and
    the three return flags converted to int.
    """
    faces_inv = jt.empty(faces.shape)

    image_size = self.image_size
    near = self.near
    far = self.far
    rgb_flag = int(self.return_rgb)
    alpha_flag = int(self.return_alpha)
    depth_flag = int(self.return_depth)

    return rasterize_cuda.forward_face_index_map(
        faces, face_index_map, weight_map, depth_map, face_inv_map,
        faces_inv, image_size, near, far,
        rgb_flag, alpha_flag, depth_flag)
def execute(self, x_q, x_r):
    """Run the CUDA op stored on this object and return its int32 output.

    Both inputs must be 3-D. The output has shape
    [x_r.shape[0], self.k, x_q.shape[2]].
    """
    # Each input is unpacked as (batch, channels, points).
    _, _, q_points = x_q.shape
    batch_size, _, r_points = x_r.shape

    idx_shape = [batch_size, self.k, q_points]
    # Scratch Var handed to the op as a third input.
    scratch = jt.empty((batch_size, r_points, q_points), "float32")

    outputs = jt.code(
        [idx_shape],
        ['int32'],
        [x_r, x_q, scratch],  # in0 r point in1 q point
        cuda_src=self.cuda_src,
        cuda_header=self.cuda_inc,
    )
    (idxs,) = outputs
    return idxs
def concat(arr, dim=0):
    '''Join a sequence of jt Vars along one dimension.

    * [in] arr: sequence (tuple or list) of Vars to join

    * [in] dim: dimension to join along; a negative value counts from the end

    * return: the joined Var, whose dtype is the merge of all input dtypes

    Raises TypeError if ``arr`` is not a sequence and ValueError if it is empty.

    Example::

        jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=0)
        # return [[1],[2],[2],[2]]
    '''
    if not isinstance(arr, Sequence):
        raise TypeError("concat arr needs to be a tuple or list")
    if len(arr) == 0:
        raise ValueError("need at least one array to concat")

    if dim < 0:
        dim += len(arr[0].shape)

    joined_extent = sum(var.shape[dim] for var in arr)
    dtype_names = [str(var.dtype) for var in arr]

    # Non-joined dimensions are taken from the last element.
    reference = arr[-1]
    out_shape = list(reference.shape)
    out_shape[dim] = joined_extent
    s = jt.empty(out_shape, dtype=_merge_dtypes(dtype_names))

    window = [slice(None)] * len(reference.shape)
    offset = 0
    for var in arr:
        extent = var.shape[dim]
        if extent == 0:
            continue
        window[dim] = slice(offset, offset + extent)
        s = s.setitem(tuple(window), var)
        offset += extent
    return s
def execute(self, x, new_shape):
    """Remember the shape of ``x`` and return an uninitialised Var.

    The input shape is stored in ``self.save_vars``; the result has shape
    ``new_shape`` and the dtype of ``x``.
    """
    source_shape = x.shape
    self.save_vars = source_shape
    dtype_name = str(x.dtype)
    return jt.empty(new_shape, dtype_name)
def execute(self, faces, textures):
    """Rasterize ``faces`` and return ``(rgb, alpha, depth)``.

    Each element of the result is the corresponding map when its
    ``self.return_*`` flag is set, and an empty ``jt.array([])`` otherwise.
    Every intermediate map is stored in ``self.save_vars``.
    """
    self.batch_size, self.num_faces = faces.shape[:2]

    if self.return_rgb:
        self.texture_size = textures.shape[2]
    else:
        # No colour output requested: substitute a dummy texture Var.
        textures = jt.array([0]).float32()
        self.texture_size = None

    # Shape shared by all per-pixel buffers.
    pixel_grid = (self.batch_size, self.image_size, self.image_size)

    face_index_map = jt.empty(pixel_grid).int()
    weight_map = jt.empty(pixel_grid + (3,))
    depth_map = jt.empty(pixel_grid) * self.far

    if self.return_rgb:
        rgb_map = jt.empty(pixel_grid + (3,)).float()
        sampling_index_map = jt.empty(pixel_grid + (8,)).int()
        sampling_weight_map = jt.empty(pixel_grid + (8,))
    else:
        rgb_map = jt.zeros(1)
        sampling_index_map = jt.zeros(1).int()
        sampling_weight_map = jt.zeros(1)

    alpha_map = jt.empty(pixel_grid) if self.return_alpha else jt.zeros(1)
    face_inv_map = jt.empty(pixel_grid + (3, 3)) if self.return_depth else jt.zeros(1)

    # faces -> face_index_map, weight_map, depth_map, face_inv_map
    index_stage = self.forward_face_index_map(
        faces, face_index_map, weight_map, depth_map, face_inv_map)
    face_index_map, weight_map, depth_map, face_inv_map = index_stage

    # faces, textures, face_index_map, weight_map, depth_map
    #   -> rgb_map, sampling_index_map, sampling_weight_map
    sampling_stage = self.forward_texture_sampling(
        faces, textures, face_index_map, weight_map, depth_map,
        rgb_map, sampling_index_map, sampling_weight_map)
    rgb_map, sampling_index_map, sampling_weight_map = sampling_stage

    rgb_map = self.forward_background(face_index_map, rgb_map)
    alpha_map = self.forward_alpha_map(alpha_map, face_index_map)

    self.save_vars = (
        faces, textures, face_index_map, weight_map, depth_map,
        rgb_map, alpha_map, face_inv_map,
        sampling_index_map, sampling_weight_map,
    )

    # Start from three empty placeholders and fill in the requested outputs.
    results = [jt.array([]), jt.array([]), jt.array([])]
    if self.return_rgb:
        results[0] = rgb_map
    if self.return_alpha:
        results[1] = alpha_map
    if self.return_depth:
        results[2] = depth_map
    return tuple(results)
def filter_results(self, boxlist, num_classes):
    """Threshold detections on score, run NMS per class and merge the result.

    Class 0 is treated as background and skipped. When more than
    ``self.detections_per_img`` detections remain (and that limit is
    positive), only those whose score reaches the corresponding k-th value
    over all classes are kept.
    """
    # Work on the raw Vars rather than on the BoxList to avoid overhead.
    boxes = boxlist.bbox.reshape(-1, num_classes * 4)
    scores = boxlist.get_field("scores").reshape(-1, num_classes)

    above_thresh = scores > self.score_thresh

    # Compute the surviving row indices of every foreground class up front
    # and synchronise once, instead of once per class.
    foreground = range(1, num_classes)
    per_class_inds = [above_thresh[:, cls].nonzero() for cls in foreground]
    jt.sync(per_class_inds)

    result = []
    for cls, inds in zip(foreground, per_class_inds):
        if inds.shape[0] == 0:
            continue
        inds = inds.squeeze(1)

        class_scores = scores[inds, cls]
        class_boxes = boxes[inds, cls * 4:(cls + 1) * 4]

        detections = BoxList(class_boxes, boxlist.size, mode="xyxy")
        detections.add_field("scores", class_scores)
        detections = boxlist_nms(detections, self.nms)

        kept = len(detections)
        detections.add_field("labels", jt.full((kept,), cls).int32())
        result.append(detections)

    result = cat_boxlist(result)
    # An empty merge carries no fields; add the ones callers expect.
    if not result.has_field('labels'):
        result.add_field('labels', jt.empty((0,)))
    if not result.has_field('scores'):
        result.add_field('scores', jt.empty((0,)))

    number_of_detections = len(result)

    # Limit to max_per_image detections **over all classes**
    if number_of_detections > self.detections_per_img > 0:
        cls_scores = result.get_field("scores")
        kth = number_of_detections - self.detections_per_img + 1
        image_thresh, _ = jt.kthvalue(cls_scores, kth)
        keep = jt.nonzero(cls_scores >= image_thresh).squeeze(1)
        result = result[keep]

    return result
def forward_face_index_map(faces, face_index_map, weight_map, depth_map,
                           face_inv_map, faces_inv, image_size, near, far,
                           return_rgb, return_alpha, return_depth):
    """Launch the CUDA kernel that rasterizes faces into per-pixel maps.

    The four ``*_map`` arguments are not handed to the kernel as inputs;
    only their ``shape`` and ``dtype`` are used to declare the four outputs.
    Before the kernel runs, the outputs are initialised on the device:
    face indices to -1, weights to 0, depth to ``far`` and the inverse-face
    map to 0.

    Args:
        faces: first kernel input; the kernel reads 9 values per face and
            takes the batch size and face count from its first two dims.
        face_index_map, weight_map, depth_map, face_inv_map: templates for
            the shape and dtype of outputs 0..3.
        faces_inv: second kernel input; the kernel writes 9 values per
            face into it.
        image_size, return_rgb, return_alpha, return_depth: substituted
            into the generated source as kernel template arguments.
        near, far: substituted into the generated source as float32 values.

    Returns:
        Whatever ``jt.code`` returns for the four declared outputs, in the
        order face_index_map, weight_map, depth_map, face_inv_map.
    """
    # NOTE(review): line breaks inside the two CUDA strings below were
    # restored from a whitespace-collapsed source. The first
    # `} while (assumed != old);` is assumed to belong to the `// Note:`
    # comment line -- confirm against the upstream file.

    # Per-pixel lock buffer (third kernel input); zeroed in cuda_src and
    # used by the kernel as a spin lock around the depth test.
    lock = jt.empty(depth_map.shape, 'int')
    return jt.code([
        face_index_map.shape, weight_map.shape, depth_map.shape,
        face_inv_map.shape
    ], [
        face_index_map.dtype, weight_map.dtype, depth_map.dtype,
        face_inv_map.dtype
    ], [faces, faces_inv, lock],
        cuda_header='''
#include <cuda.h>
#include <cuda_runtime.h>
#include <cassert>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>

// for the older gpus atomicAdd with double arguments does not exist
#if __CUDA_ARCH__ < 600 and defined(__CUDA_ARCH__)
static __inline__ __device__ double atomicAdd(double* address, double val) {
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old);
    } while (assumed != old);
    return __longlong_as_double(old);
}
#endif

namespace{
template <typename scalar_t, int image_size, int return_rgb, int return_alpha, int return_depth>
__global__ void forward_face_index_map_cuda_kernel(
        const scalar_t* faces,
        scalar_t* faces_inv,
        int32_t* face_index_map,
        scalar_t* weight_map,
        scalar_t* depth_map,
        scalar_t* face_inv_map,
        int batch_size,
        int num_faces,
        scalar_t near,
        scalar_t far,
        int threads_n_bits,
        int32_t* dev_tilemutex) {
    /* batch number, face, number, image size, face[v012][RGB] */
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= batch_size * (1<<threads_n_bits)) {
        return;
    }
    const int fn = i & ((1<<threads_n_bits) - 1);
    if (fn >= num_faces)
        return;
    const int bn = i>>threads_n_bits;
    i = bn * num_faces + fn;
    const int is = image_size;
    const scalar_t* face = &faces[i * 9];
    scalar_t* face_inv_g = &faces_inv[i * 9];

    /* return if backside */
    if ((face[7] - face[1]) * (face[3] - face[0]) < (face[4] - face[1]) * (face[6] - face[0]))
        return;

    /* p[num][xy]: x, y is normalized from [-1, 1] to [0, is - 1]. */
    scalar_t p[3][2];
    for (int num = 0; num < 3; num++) {
        for (int dim = 0; dim < 2; dim++) {
            p[num][dim] = 0.5 * (face[3 * num + dim] * is + is - 1);
        }
    }

    /* compute face_inv */
    scalar_t face_inv[9] = {
        p[1][1] - p[2][1], p[2][0] - p[1][0], p[1][0] * p[2][1] - p[2][0] * p[1][1],
        p[2][1] - p[0][1], p[0][0] - p[2][0], p[2][0] * p[0][1] - p[0][0] * p[2][1],
        p[0][1] - p[1][1], p[1][0] - p[0][0], p[0][0] * p[1][1] - p[1][0] * p[0][1]};
    scalar_t face_inv_denominator = (
        p[2][0] * (p[0][1] - p[1][1]) +
        p[0][0] * (p[1][1] - p[2][1]) +
        p[1][0] * (p[2][1] - p[0][1]));
    /* set to global memory */
    for (int k = 0; k < 9; k++) {
        face_inv[k] /= face_inv_denominator;
        face_inv_g[k] = face_inv[k];
    }

    /* compute the bounding box of triangle facet */
    scalar_t x_min=is, y_min=is, x_max=0, y_max=0;
    for (int num = 0; num < 3; num++) {
        if (p[num][0] < x_min) x_min = p[num][0];
        if (p[num][0] > x_max) x_max = p[num][0];
        if (p[num][1] < y_min) y_min = p[num][1];
        if (p[num][1] > y_max) y_max = p[num][1];
    }
    int ix_min = max(0, (int)x_min);
    int ix_max = min(is-1, (int)x_max);
    int iy_min = max(0, (int)y_min);
    int iy_max = min(is-1, (int)y_max);

    /* traverse each pixel in the bounding box */
    for (int xi=ix_min;xi<=ix_max;xi++) {
        for (int yi=iy_min;yi<=iy_max;yi++) {
            const scalar_t yp = (2. * yi + 1 - is) / is;
            const scalar_t xp = (2. * xi + 1 - is) / is;
            /* check [py, px] is inside the face */
            if (((yp - face[1]) * (face[3] - face[0]) < (xp - face[0]) * (face[4] - face[1])) ||
                ((yp - face[4]) * (face[6] - face[3]) < (xp - face[3]) * (face[7] - face[4])) ||
                ((yp - face[7]) * (face[0] - face[6]) < (xp - face[6]) * (face[1] - face[7])))
                continue;

            int i1 = bn * is * is + yi * is + xi;
            /* compute w = face_inv * p */
            scalar_t w[3];
            w[0] = face_inv[3 * 0 + 0] * xi + face_inv[3 * 0 + 1] * yi + face_inv[3 * 0 + 2];
            w[1] = face_inv[3 * 1 + 0] * xi + face_inv[3 * 1 + 1] * yi + face_inv[3 * 1 + 2];
            w[2] = face_inv[3 * 2 + 0] * xi + face_inv[3 * 2 + 1] * yi + face_inv[3 * 2 + 2];

            /* sum(w) -> 1, 0 < w < 1 */
            scalar_t w_sum = 0;
            for (int k = 0; k < 3; k++) {
                w[k] = min(max(w[k], 0.), 1.);
                w_sum += w[k];
            }
            for (int k = 0; k < 3; k++) {
                w[k] /= w_sum;
            }
            /* compute 1 / zp = sum(w / z) */
            const scalar_t zp = 1. / (w[0] / face[2] + w[1] / face[5] + w[2] / face[8]);
            if (zp <= near || far <= zp) {
                continue;
            }

            /* check z-buffer */
            bool isSet;
            do {
                isSet = (atomicCAS(&dev_tilemutex[i1], 0, 1) == 0);
                if (isSet) {
                    if (zp < depth_map[i1]) {
                        depth_map[i1] = zp;
                        face_index_map[i1] = fn;
                        for (int k = 0; k < 3; k++) {
                            weight_map[3 * i1 + k] = w[k];
                        }
                        if (return_depth) {
                            for (int k = 0; k < 9; k++) {
                                face_inv_map[9 * i1 + k] = face_inv[k];
                            }
                        }
                    }
                    __threadfence();
                    dev_tilemutex[i1] = 0;
                }
            } while (!isSet);
        }
    }
}
}
''',
        cuda_src=f'''
    @alias(faces, in0)
    @alias(faces_inv, in1)
    @alias(face_index_map, out0)
    @alias(weight_map, out1)
    @alias(depth_map, out2)
    @alias(face_inv_map, out3)

    thrust::device_ptr<out0_type> dev_ptr0(out0_p);
    thrust::fill(dev_ptr0, dev_ptr0 + out0->num, -1);
    cudaMemsetAsync(out1_p, 0, out1->size);
    thrust::device_ptr<out2_type> dev_ptr2(out2_p);
    thrust::fill(dev_ptr2, dev_ptr2 + out2->num, {far});
    cudaMemsetAsync(out3_p, 0, out3->size);

    const auto batch_size = faces_shape0;
    const auto num_faces = faces_shape1;
    const int threads = 256;
    const int threads_n_bits = NanoVector::get_nbits(num_faces) - 1;
    const dim3 blocks_1 ((batch_size * (1<<threads_n_bits) - 1) / threads +1);

    cudaMemsetAsync(in2_p, 0, in2->size);

    forward_face_index_map_cuda_kernel<
        float32,
        (int) {image_size},
        {return_rgb},
        {return_alpha},
        {return_depth}
    ><<<blocks_1, threads>>>(
        faces_p,
        faces_inv_p,
        face_index_map_p,
        weight_map_p,
        depth_map_p,
        face_inv_map_p,
        (int) batch_size,
        (int) num_faces,
        (float32) {near},
        (float32) {far},
        threads_n_bits,
        in2_p);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("Error in forward_face_index_map: %s\\n", cudaGetErrorString(err));
    ''')
def knn(unknown, known, k):
    """For each point in ``unknown`` find its ``k`` nearest points in ``known``.

    Both inputs are unpacked as 3-D shapes; the kernel addresses each point
    as 3 consecutive floats, so the last dimension is assumed to be 3
    (TODO confirm with callers).

    Args:
        unknown: query points, shape (b, n, c).
        known: reference points, shape (_, m, _).
        k: neighbours per query; substituted into the CUDA header as the
            ``K`` macro.

    Returns:
        The result of ``jt.code`` for the two outputs ``dists2`` (squared
        distances, nearest first) and ``idx`` (indices into ``known``),
        each of shape (b, n, k).

    NOTE(review): when ``m < k`` the unused slots keep distance 1e30 and
    the kernel's ``tmp_idx`` entries for them are never initialised.
    """
    # NOTE(review): line breaks inside the CUDA strings below were restored
    # from a whitespace-collapsed source.
    b, n, c = unknown.shape
    _, m, _ = known.shape
    # dists2 = jt.ones((b, n, k), dtype="float") * 1e40
    # idx = jt.zeros((b, n, k), dtype="int")
    dists2 = jt.empty((b, n, k), dtype="float")
    idx = jt.empty((b, n, k), dtype="int")
    # One CUDA block per batch element; threads stride over the n queries.
    return jt.code([unknown, known], [dists2, idx],
        cuda_header='''
#define TOTAL_THREADS 512
#define K %s
namespace {
inline int opt_n_threads(int work_size) {
    const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
    return max(min(1 << pow_2, TOTAL_THREADS), 1);
}
__global__ void three_nn_kernel(int b, int n, int m,
                        const float *__restrict__ unknown,
                        const float *__restrict__ known,
                        float *__restrict__ dist2,
                        int *__restrict__ idx) {
    int batch_index = blockIdx.x;
    unknown += batch_index * n * 3;
    known += batch_index * m * 3;
    dist2 += batch_index * n * K;
    idx += batch_index * n * K;

    int index = threadIdx.x;
    int stride = blockDim.x;
    for (int j = index; j < n; j += stride) {
        float ux = unknown[j * 3 + 0];
        float uy = unknown[j * 3 + 1];
        float uz = unknown[j * 3 + 2];

        float tmp_dist[K];
        int tmp_idx[K];
        #pragma unroll
        for (int i=0; i<K; i++) tmp_dist[i] = 1e30;
        for (int k = 0; k < m; ++k) {
            float x = known[k * 3 + 0];
            float y = known[k * 3 + 1];
            float z = known[k * 3 + 2];
            float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);

            int first = -1;
            #pragma unroll
            for (int i=0; i<K; i++)
                if (first == -1 && d<tmp_dist[i])
                    first = i;
            if (first == -1) continue;
            #pragma unroll
            for (int i=0; i<K; i++)
                if (K-1-i > first) {
                    tmp_dist[K-1-i] = tmp_dist[K-2-i];
                    tmp_idx[K-1-i] = tmp_idx[K-2-i];
                }
            tmp_dist[first] = d;
            tmp_idx[first] = k;
            /*
            for (int l = 0; l < K; ++l) {
                if (d < dist2[j * K + l]) {
                    for (int m = K-1; m > l; --m) {
                        dist2[j * K + m] = dist2[j * K + m - 1];
                        idx[j * K + m] = idx[j * K + m - 1];
                    }
                    dist2[j * K + l] = d;
                    idx[j * K + l] = k;
                    break;
                }
            }
            */
        }
        #pragma unroll
        for (int i=0; i<K; i++) {
            dist2[j * K + i] = tmp_dist[i];
            idx[j * K + i] = tmp_idx[i];
        }
    }
}
}
''' % k,
        cuda_src=f'''
    @alias(unknown, in0)
    @alias(known, in1)
    @alias(dists2, out0)
    @alias(idx, out1)
    three_nn_kernel<<<{b}, opt_n_threads({n}), 0, 0>>>({b}, {n}, {m}, unknown_p, known_p, dists2_p, idx_p);
    ''')
def three_nn(unknown, known):
    """For each point in ``unknown`` find its three nearest points in ``known``.

    Both inputs are unpacked as 3-D shapes; the kernel addresses each point
    as 3 consecutive floats, so the last dimension is assumed to be 3
    (TODO confirm with callers).

    Args:
        unknown: query points, shape (b, n, c).
        known: reference points, shape (_, m, _).

    Returns:
        The result of ``jt.code`` for the two outputs ``dists2`` (squared
        distances, nearest first) and ``idx`` (indices into ``known``),
        each of shape (b, n, 3).

    NOTE(review): with fewer than three reference points the unused slots
    keep the kernel's initial values (distance 1e40 stored into a float,
    index 0).
    """
    # NOTE(review): line breaks inside the CUDA strings below were restored
    # from a whitespace-collapsed source.
    b, n, c = unknown.shape
    _, m, _ = known.shape
    dists2 = jt.empty((b, n, 3), dtype="float")
    idx = jt.empty((b, n, 3), dtype="int")
    # One CUDA block per batch element; threads stride over the n queries.
    return jt.code([unknown, known], [dists2, idx],
        cuda_header='''
#define TOTAL_THREADS 512
namespace {
inline int opt_n_threads(int work_size) {
    const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
    return max(min(1 << pow_2, TOTAL_THREADS), 1);
}
__global__ void three_nn_kernel(int b, int n, int m,
                        const float *__restrict__ unknown,
                        const float *__restrict__ known,
                        float *__restrict__ dist2,
                        int *__restrict__ idx) {
    int batch_index = blockIdx.x;
    unknown += batch_index * n * 3;
    known += batch_index * m * 3;
    dist2 += batch_index * n * 3;
    idx += batch_index * n * 3;

    int index = threadIdx.x;
    int stride = blockDim.x;
    for (int j = index; j < n; j += stride) {
        float ux = unknown[j * 3 + 0];
        float uy = unknown[j * 3 + 1];
        float uz = unknown[j * 3 + 2];

        double best1 = 1e40, best2 = 1e40, best3 = 1e40;
        int besti1 = 0, besti2 = 0, besti3 = 0;
        for (int k = 0; k < m; ++k) {
            float x = known[k * 3 + 0];
            float y = known[k * 3 + 1];
            float z = known[k * 3 + 2];
            float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
            if (d < best1) {
                best3 = best2;
                besti3 = besti2;
                best2 = best1;
                besti2 = besti1;
                best1 = d;
                besti1 = k;
            } else if (d < best2) {
                best3 = best2;
                besti3 = besti2;
                best2 = d;
                besti2 = k;
            } else if (d < best3) {
                best3 = d;
                besti3 = k;
            }
        }
        dist2[j * 3 + 0] = best1;
        dist2[j * 3 + 1] = best2;
        dist2[j * 3 + 2] = best3;

        idx[j * 3 + 0] = besti1;
        idx[j * 3 + 1] = besti2;
        idx[j * 3 + 2] = besti3;
    }
}
}
''',
        cuda_src=f'''
    @alias(unknown, in0)
    @alias(known, in1)
    @alias(dists2, out0)
    @alias(idx, out1)
    three_nn_kernel<<<{b}, opt_n_threads({n}), 0, 0>>>({b}, {n}, {m}, unknown_p, known_p, dists2_p, idx_p);
    ''')
def execute(self, net, predictions, targets, masks, num_crowds):
    """Multibox Loss

    Args:
        net: passed through to ``self.mask_iou_loss`` when the mask IoU
            loss is enabled.
        predictions (dict): network outputs, read by key:
            'loc' shape: (batch_size, num_priors, 4)
            'conf' shape: (batch_size, num_priors, num_classes)
            'mask' shape: (batch_size, num_priors, mask_dim)
            'priors' shape: (num_priors, 4)
            'proto'* shape: (batch_size, mask_h, mask_w, mask_dim)
            plus 'score', 'inst', 'classes' and 'segm' when the matching
            cfg options are enabled.

        targets (list<tensor>): Ground truth boxes and labels for a batch,
            shape: [batch_size][num_objs,5] (last idx is the label).

        masks (list<tensor>): Ground truth masks for each object in each image,
            shape: [batch_size][num_objs,im_height,im_width]
            Crowd entries are stripped from this list in place.

        num_crowds (list<int>): Number of crowd annotations per batch. The crowd
            annotations should be the last num_crowds elements of targets and masks.

        * Only if mask_type == lincomb

    Returns:
        dict mapping a loss key to its value. 'P', 'E' and 'S' are divided
        by the batch size; every other entry is divided by the total
        number of positive priors.
    """
    loc_data = predictions['loc']
    conf_data = predictions['conf']
    mask_data = predictions['mask']
    priors = predictions['priors']

    if cfg.mask_type == mask_type.lincomb:
        proto_data = predictions['proto']

    score_data = predictions['score'] if cfg.use_mask_scoring else None
    inst_data = predictions['inst'] if cfg.use_instance_coeff else None

    # Only the lincomb mask branch produces mask IoU targets. Default to
    # None so the mask IoU check below is safe when that branch is skipped.
    maskiou_targets = None

    labels = [None] * len(targets)  # Used in sem segm loss

    batch_size = loc_data.shape[0]
    num_priors = priors.shape[0]
    num_classes = self.num_classes

    # Match priors (default boxes) and ground truth boxes
    # These buffers are filled per image by match() below.
    loc_t = jt.empty((batch_size, num_priors, 4), dtype=loc_data.dtype)
    gt_box_t = jt.empty((batch_size, num_priors, 4), dtype=loc_data.dtype)
    conf_t = jt.empty((batch_size, num_priors)).int32()
    idx_t = jt.empty((batch_size, num_priors)).int32()

    if cfg.use_class_existence_loss:
        class_existence_t = jt.empty((batch_size, num_classes - 1), dtype=loc_data.dtype)

    for idx in range(batch_size):
        truths = targets[idx][:, :-1]
        labels[idx] = targets[idx][:, -1].int32()

        if cfg.use_class_existence_loss:
            # Construct a one-hot vector for each object and collapse it into an existence vector with max
            # Also it's fine to include the crowd annotations here
            class_existence_t[idx, :] = jt.eye(num_classes - 1)[labels[idx]].max(dim=0)[0]

        # Split the crowd annotations because they come bundled in
        cur_crowds = num_crowds[idx]
        if cur_crowds > 0:
            split = lambda x: (x[-cur_crowds:], x[:-cur_crowds])
            crowd_boxes, truths = split(truths)

            # We don't use the crowd labels or masks
            _, labels[idx] = split(labels[idx])
            _, masks[idx] = split(masks[idx])
        else:
            crowd_boxes = None

        match(self.pos_threshold, self.neg_threshold,
              truths, priors, labels[idx], crowd_boxes,
              loc_t, conf_t, idx_t, idx, loc_data[idx])

        gt_box_t[idx, :, :] = truths[idx_t[idx]]

    # wrap targets
    loc_t.stop_grad()
    conf_t.stop_grad()
    idx_t.stop_grad()

    pos = conf_t > 0
    num_pos = pos.sum(dim=1, keepdims=True)

    # Shape: [batch,num_priors,4]
    pos_idx = pos.unsqueeze(pos.ndim).expand_as(loc_data)

    losses = {}

    # Localization Loss (Smooth L1)
    if cfg.train_boxes:
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        losses['B'] = nn.smooth_l1_loss(loc_p, loc_t, reduction='sum') * cfg.bbox_alpha

    if cfg.train_masks:
        if cfg.mask_type == mask_type.direct:
            if cfg.use_gt_bboxes:
                pos_masks = []
                for idx in range(batch_size):
                    pos_masks.append(masks[idx][idx_t[idx, pos[idx]]])
                masks_t = jt.contrib.concat(pos_masks, 0)
                masks_p = mask_data[pos, :].view(-1, cfg.mask_dim)
                losses['M'] = nn.bce_loss(jt.clamp(masks_p, 0, 1), masks_t, size_average=False) * cfg.mask_alpha
            else:
                losses['M'] = self.direct_mask_loss(pos_idx, idx_t, loc_data, mask_data, priors, masks)
        elif cfg.mask_type == mask_type.lincomb:
            ret = self.lincomb_mask_loss(pos, idx_t, loc_data, mask_data, priors, proto_data, masks, gt_box_t, score_data, inst_data, labels)
            if cfg.use_maskiou:
                loss, maskiou_targets = ret
            else:
                loss = ret
            losses.update(loss)

            if cfg.mask_proto_loss is not None:
                if cfg.mask_proto_loss == 'l1':
                    losses['P'] = jt.mean(jt.abs(proto_data)) / self.l1_expected_area * self.l1_alpha
                elif cfg.mask_proto_loss == 'disj':
                    losses['P'] = -jt.mean(jt.max(nn.log_softmax(proto_data, dim=-1), dim=-1)[0])

    # Confidence loss
    if cfg.use_focal_loss:
        if cfg.use_sigmoid_focal_loss:
            losses['C'] = self.focal_conf_sigmoid_loss(conf_data, conf_t)
        elif cfg.use_objectness_score:
            losses['C'] = self.focal_conf_objectness_loss(conf_data, conf_t)
        else:
            losses['C'] = self.focal_conf_loss(conf_data, conf_t)
    else:
        if cfg.use_objectness_score:
            # NOTE(review): loc_p is only bound when cfg.train_boxes is set.
            losses['C'] = self.conf_objectness_loss(conf_data, conf_t, batch_size, loc_p, loc_t, priors)
        else:
            losses['C'] = self.ohem_conf_loss(conf_data, conf_t, pos, batch_size)

    # Mask IoU Loss
    if cfg.use_maskiou and maskiou_targets is not None:
        losses['I'] = self.mask_iou_loss(net, maskiou_targets)

    # These losses also don't depend on anchors
    if cfg.use_class_existence_loss:
        losses['E'] = self.class_existence_loss(predictions['classes'], class_existence_t)
    if cfg.use_semantic_segmentation_loss:
        losses['S'] = self.semantic_segmentation_loss(predictions['segm'], masks, labels)

    # Divide all losses by the number of positives.
    # Don't do it for loss[P] because that doesn't depend on the anchors.
    total_num_pos = num_pos.sum().float()
    for k in losses:
        if k not in ('P', 'E', 'S'):
            losses[k] /= total_num_pos
        else:
            losses[k] /= batch_size

    # Loss Key:
    #  - B: Box Localization Loss
    #  - C: Class Confidence Loss
    #  - M: Mask Loss
    #  - P: Prototype Loss
    #  - D: Coefficient Diversity Loss
    #  - E: Class Existence Loss
    #  - S: Semantic Segmentation Loss
    return losses
def __init__(self, masks, size):
    """
    Arguments:
        masks: Either jt.array of [num_instances, H, W] or list of jt.arrays
            of [H, W] with num_instances elems, or RLE (Run Length Encoding) -
            interpreted as list of dicts, or BinaryMaskList.
        size: absolute image size, width first

    After initialization, a hard copy will be made, to leave the
    initializing source data intact.

    Raises:
        RuntimeError: if ``masks`` (or its first element, for a list or
            tuple) is of a type that cannot be interpreted.
    """
    assert isinstance(size, (list, tuple))
    assert len(size) == 2

    if isinstance(masks, jt.Var):
        # The raw data representation is passed as argument
        masks = masks.clone()
    elif isinstance(masks, (list, tuple)):
        if len(masks) == 0:
            masks = jt.empty([0, size[1], size[0]])  # num_instances = 0!
        elif isinstance(masks[0], jt.Var):
            masks = jt.stack(masks, dim=0).clone()
        elif isinstance(masks[0], dict) and "counts" in masks[0]:
            if isinstance(masks[0]["counts"], (list, tuple)):
                masks = mask_utils.frPyObjects(masks, size[1], size[0])
            # RLE interpretation
            rle_sizes = [tuple(inst["size"]) for inst in masks]

            masks = mask_utils.decode(masks)  # [h, w, n]
            masks = jt.array(masks).transpose(2, 0, 1)  # [n, h, w]

            assert rle_sizes.count(rle_sizes[0]) == len(rle_sizes), (
                "All the sizes must be the same size: %s" % rle_sizes)

            # in RLE, height come first in "size"
            rle_height, rle_width = rle_sizes[0]
            assert masks.shape[1] == rle_height
            assert masks.shape[2] == rle_width

            width, height = size
            if width != rle_width or height != rle_height:
                # Resample the decoded masks to the requested image size.
                masks = interpolate(
                    input=masks.unsqueeze(0).float(),
                    size=(height, width),
                    mode="bilinear",
                    align_corners=False,
                )[0].type_as(masks)
        else:
            # Raise here; otherwise the failure surfaces later as an
            # unrelated AttributeError on `masks.shape`.
            raise RuntimeError(
                "Type of `masks[0]` could not be interpreted: %s" % type(masks[0]))
    elif isinstance(masks, BinaryMaskList):
        # just hard copy the BinaryMaskList instance's underlying data
        masks = masks.masks.clone()
    else:
        raise RuntimeError(
            "Type of `masks` argument could not be interpreted:%s" % type(masks))

    if len(masks.shape) == 2:
        # if only a single instance mask is passed
        masks = masks.unsqueeze(0)

    assert len(masks.shape) == 3
    assert masks.shape[1] == size[1], "%s != %s" % (masks.shape[1], size[1])
    assert masks.shape[2] == size[0], "%s != %s" % (masks.shape[2], size[0])

    self.masks = masks
    self.size = tuple(size)