def prep_benchmark(dets_out, h, w):
    """
    Run the post-processing and device-to-host copies for one benchmark frame.

    Nothing is returned: the function exists so that the 'Postprocess',
    'Copy' and 'Sync' timer sections cover the per-frame work.

    Args:
        dets_out: Network output, handed straight to postprocess().
        h: Image height passed to postprocess().
        w: Image width passed to postprocess().
    """
    with timer.env('Postprocess'):
        t = postprocess(dets_out, w, h, crop_masks=args.crop,
                        score_threshold=args.score_threshold)

    with timer.env('Copy'):
        classes, scores, boxes, masks = t
        top_k = args.top_k

        if isinstance(scores, list):
            # postprocess() can return [box_scores, mask_scores]. Each tensor
            # must be truncated on its own: slicing the list itself would
            # drop the mask scores when top_k == 1 (IndexError on scores[1])
            # and would never shorten the tensors themselves.
            box_scores = scores[0][:top_k].cpu().numpy()
            mask_scores = scores[1][:top_k].cpu().numpy()
        else:
            scores = scores[:top_k].cpu().numpy()
        classes = classes[:top_k].cpu().numpy()
        boxes = boxes[:top_k].cpu().numpy()
        masks = masks[:top_k].cpu().numpy()

    with timer.env('Sync'):
        # Just in case. Only meaningful (and only callable) with CUDA present.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
def make_priors(self, conv_h, conv_w, device):
    """
    Build (or fetch from cache) the prior boxes for a conv_h x conv_w feature map.

    Note that priors are [x,y,width,height] where (x,y) is the center of the box.

    The priors are regenerated whenever the image size stored in
    cfg._tmp_img_w / cfg._tmp_img_h differs from the one seen on the previous
    call. Otherwise, if they live on a different device than requested, a
    per-device copy is taken from (or added to) the global prior_cache.

    Args:
        conv_h: Height of the feature map the priors are laid over.
        conv_w: Width of the feature map the priors are laid over.
        device: Device the returned priors should live on.

    Returns:
        self.priors, a tensor viewed as [-1, 4].
    """
    global prior_cache
    size = (conv_h, conv_w)

    with timer.env('makepriors'):
        if self.last_img_size != (cfg._tmp_img_w, cfg._tmp_img_h):
            prior_data = []

            # Iteration order is important (it has to sync up with the convout)
            for j, i in product(range(conv_h), range(conv_w)):
                # +0.5 because priors are in center-size notation
                x = (i + 0.5) / conv_w
                y = (j + 0.5) / conv_h

                for ars in self.aspect_ratios:
                    for scale in self.scales:
                        for ar in ars:
                            if not cfg.backbone.preapply_sqrt:
                                ar = sqrt(ar)

                            if cfg.backbone.use_pixel_scales:
                                w = scale * ar / cfg.max_size
                                h = scale / ar / cfg.max_size
                            else:
                                w = scale * ar / conv_w
                                h = scale / ar / conv_h

                            # This is for backward compatibility with a bug where
                            # everything was made square by accident
                            if cfg.backbone.use_square_anchors:
                                h = w

                            prior_data += [x, y, w, h]

            # NOTE(review): the legacy torch.Tensor constructor is kept as-is;
            # confirm it accepts `device` for the configured default tensor type.
            self.priors = torch.Tensor(prior_data, device=device).view(-1, 4).detach()
            self.priors.requires_grad = False
            self.last_img_size = (cfg._tmp_img_w, cfg._tmp_img_h)
            self.last_conv_size = (conv_w, conv_h)
            # Fresh priors invalidate any per-device copies cached for this size.
            prior_cache[size] = None
        elif self.priors.device != device:
            # This whole weird situation is so that DataParallel doesn't copy the priors each iteration
            if prior_cache[size] is None:
                prior_cache[size] = {}

            if device not in prior_cache[size]:
                # Copy once per device; the previous chained assignment read the
                # cache entry it was supposed to create and raised KeyError.
                prior_cache[size][device] = self.priors.to(device)

            self.priors = prior_cache[size][device]

    return self.priors
def __call__(self, predictions, net):
    """
    Decode the box regressions and run detection for every image in the batch.

    Args:
        predictions: dict of network outputs. The entries read here are
            'loc':    (tensor) Loc preds from loc layers
                      Shape: [batch, num_priors, 4]
            'conf':   (tensor) Conf preds from conf layers
                      Shape: [batch, num_priors, num_classes]
            'mask':   (tensor) Mask preds from mask layers
                      Shape: [batch, num_priors, mask_dim]
            'priors': (tensor) Prior boxes from priorbox layers
                      Shape: [num_priors, 4]
            'proto':  (tensor, optional) Prototype masks when using
                      mask_type.lincomb
                      Shape: [batch, mask_h, mask_w, mask_dim]
            'inst':   (optional) forwarded unchanged to self.detect
        net: Object stored under the 'net' key of every output entry.

    Returns:
        A list with one dict per batch element:
        {'detection': <result of self.detect, possibly None>, 'net': net}.
        When both a detection and prototypes exist, the image's prototypes
        are attached to the detection under 'proto'.
    """
    loc = predictions['loc']
    conf = predictions['conf']
    mask = predictions['mask']
    priors = predictions['priors']

    # Optional entries default to None when absent.
    proto = predictions.get('proto')
    inst = predictions.get('inst')

    results = []

    with timer.env('Detect'):
        n_images = loc.size(0)
        n_priors = priors.size(0)

        # Put the class axis before the prior axis for self.detect.
        conf_by_class = conf.view(n_images, n_priors, self.num_classes)
        conf_by_class = conf_by_class.transpose(2, 1).contiguous()

        for image_idx in range(n_images):
            boxes = decode(loc[image_idx], priors)
            detection = self.detect(image_idx, conf_by_class, boxes, mask, inst)

            if not (detection is None or proto is None):
                detection['proto'] = proto[image_idx]

            results.append({'detection': detection, 'net': net})

    return results
def prep_metrics(ap_data, dets, img, gt, gt_masks, h, w, num_crowd, image_id,
                 detections: Detections = None):
    """
    Score one image's detections against its ground truth.

    Depending on args.output_coco_json the detections are either appended to
    `detections` (boxes and masks, for later dumping) or matched against the
    ground truth and pushed into the AP accumulators in `ap_data`
    (indexed as ap_data[iou_type][iou_idx][class]).

    The function returns None in every path; it returns early when
    postprocess() yields no detections.
    """
    if not args.output_coco_json:
        with timer.env('Prepare gt'):
            gt_boxes = torch.Tensor(gt[:, :4])
            # Scale the x columns by the width and the y columns by the height.
            gt_boxes[:, [0, 2]] *= w
            gt_boxes[:, [1, 3]] *= h
            gt_classes = list(gt[:, 4].astype(int))
            gt_masks = torch.Tensor(gt_masks).view(-1, h * w)

            if num_crowd > 0:
                # The last num_crowd entries are the crowd annotations.
                def _split_crowd(seq):
                    return seq[-num_crowd:], seq[:-num_crowd]

                crowd_boxes, gt_boxes = _split_crowd(gt_boxes)
                crowd_masks, gt_masks = _split_crowd(gt_masks)
                crowd_classes, gt_classes = _split_crowd(gt_classes)

    with timer.env('Postprocess'):
        classes, scores, boxes, masks = postprocess(
            dets, w, h, crop_masks=args.crop,
            score_threshold=args.score_threshold)

        if classes.size(0) == 0:
            return

        classes = list(classes.cpu().numpy().astype(int))
        if isinstance(scores, list):
            # Separate box and mask scores were produced.
            box_scores = list(scores[0].cpu().numpy().astype(float))
            mask_scores = list(scores[1].cpu().numpy().astype(float))
        else:
            scores = list(scores.cpu().numpy().astype(float))
            box_scores = mask_scores = scores
        masks = masks.view(-1, h * w).cuda()
        boxes = boxes.cuda()

    if args.output_coco_json:
        with timer.env('JSON Output'):
            boxes = boxes.cpu().numpy()
            masks = masks.view(-1, h, w).cpu().numpy()
            for det in range(masks.shape[0]):
                box_h = boxes[det, 3] - boxes[det, 1]
                box_w = boxes[det, 2] - boxes[det, 0]
                # Make sure that the bounding box actually makes sense and a mask was produced
                if box_h * box_w > 0:
                    detections.add_bbox(image_id, classes[det], boxes[det, :], box_scores[det])
                    detections.add_mask(image_id, classes[det], masks[det, :, :], mask_scores[det])
            return

    with timer.env('Eval Setup'):
        num_pred = len(classes)
        num_gt = len(gt_classes)

        mask_iou_cache = _mask_iou(masks, gt_masks)
        bbox_iou_cache = _bbox_iou(boxes.float(), gt_boxes.float())

        crowd_mask_iou_cache = None
        crowd_bbox_iou_cache = None
        if num_crowd > 0:
            crowd_mask_iou_cache = _mask_iou(masks, crowd_masks, iscrowd=True)
            crowd_bbox_iou_cache = _bbox_iou(boxes.float(), crowd_boxes.float(), iscrowd=True)

        # Highest score first; the mask order is a stable re-sort of the box order.
        box_order = sorted(range(num_pred), key=lambda p: -box_scores[p])
        mask_order = sorted(box_order, key=lambda p: -mask_scores[p])

        # (type name, gt IoU lookup, crowd IoU lookup, score lookup, visiting order)
        evaluators = [
            ('box',
             lambda p, g: bbox_iou_cache[p, g].item(),
             lambda p, c: crowd_bbox_iou_cache[p, c].item(),
             lambda p: box_scores[p],
             box_order),
            ('mask',
             lambda p, g: mask_iou_cache[p, g].item(),
             lambda p, c: crowd_mask_iou_cache[p, c].item(),
             lambda p: mask_scores[p],
             mask_order),
        ]

    timer.start('Main loop')
    for _class in set(classes + gt_classes):
        class_gt_count = sum(1 for c in gt_classes if c == _class)

        for iou_idx, iou_threshold in enumerate(iou_thresholds):
            for kind, iou_of, crowd_iou_of, score_of, order in evaluators:
                gt_taken = [False] * len(gt_classes)

                ap_obj = ap_data[kind][iou_idx][_class]
                ap_obj.add_gt_positives(class_gt_count)

                for pred in order:
                    if classes[pred] != _class:
                        continue

                    # Find the unused same-class gt with the largest IoU above the threshold.
                    best_iou = iou_threshold
                    best_gt = -1
                    for g in range(num_gt):
                        if gt_taken[g] or gt_classes[g] != _class:
                            continue

                        overlap = iou_of(pred, g)
                        if overlap > best_iou:
                            best_iou, best_gt = overlap, g

                    if best_gt >= 0:
                        gt_taken[best_gt] = True
                        ap_obj.push(score_of(pred), True)
                        continue

                    # If the detection matches a crowd, we can just ignore it.
                    # All this crowd code so that we can make sure that our eval code gives the
                    # same result as COCOEval. There aren't even that many crowd annotations to
                    # begin with, but accuracy is of the utmost importance.
                    hit_crowd = num_crowd > 0 and any(
                        crowd_classes[c] == _class and crowd_iou_of(pred, c) > iou_threshold
                        for c in range(len(crowd_classes)))

                    if not hit_crowd:
                        ap_obj.push(score_of(pred), False)
    timer.stop('Main loop')
def _bbox_iou(bbox1, bbox2, iscrowd=False):
    """Return jaccard(bbox1, bbox2, iscrowd) on the CPU, timed under 'BBox IoU'."""
    with timer.env('BBox IoU'):
        overlaps = jaccard(bbox1, bbox2, iscrowd)

    overlaps_cpu = overlaps.cpu()
    return overlaps_cpu
def _mask_iou(mask1, mask2, iscrowd=False):
    """Return mask_iou(mask1, mask2, iscrowd) on the CPU, timed under 'Mask IoU'."""
    with timer.env('Mask IoU'):
        overlaps = mask_iou(mask1, mask2, iscrowd)

    overlaps_cpu = overlaps.cpu()
    return overlaps_cpu
def prep_display(dets_out, img, h, w, undo_transform=True, class_color=False,
                 mask_alpha=0.45, fps_str=''):
    """
    Render the detections for one image onto the image and return it.

    Args:
        dets_out: Network output, handed straight to postprocess().
        img: The input image. With undo_transform=True it is passed through
            undo_image_transformation(); otherwise it is divided by 255 and
            used directly.
        h, w: Image height and width. When undo_transform=False they are
            overwritten from img.shape, so the caller may pass None.
        undo_transform: Whether img still has the network's input
            transformation applied.
        class_color: Pick colors per class instead of per detection index.
        mask_alpha: Opacity used when blending the masks.
        fps_str: Text drawn in the corner when args.display_fps is set.

    Returns:
        The rendered image as a uint8 numpy array.
    """
    if undo_transform:
        img_numpy = undo_image_transformation(img, w, h)
        img_gpu = torch.Tensor(img_numpy).cuda()
    else:
        img_gpu = img / 255.0
        h, w, _ = img.shape

    with timer.env('Postprocess'):
        # Force bbox rescoring for display only; always put the old value back,
        # even if postprocess() raises.
        save = cfg.rescore_bbox
        cfg.rescore_bbox = True
        try:
            t = postprocess(dets_out, w, h, visualize_lincomb=args.display_lincomb,
                            crop_masks=args.crop,
                            score_threshold=args.score_threshold)
        finally:
            cfg.rescore_bbox = save

    with timer.env('Copy'):
        idx = t[1].argsort(0, descending=True)[:args.top_k]

        if cfg.eval_mask_branch:
            # Masks are drawn on the GPU, so don't copy
            masks = t[3][idx]
        classes, scores, boxes = [x[idx].cpu().numpy() for x in t[:3]]

    # Detections are sorted by score, so stop at the first one below the threshold.
    num_dets_to_consider = min(args.top_k, classes.shape[0])
    for j in range(num_dets_to_consider):
        if scores[j] < args.score_threshold:
            num_dets_to_consider = j
            break

    # Quick and dirty helper for selecting the color for a particular index
    # Also keeps track of a per-gpu color cache for maximum speed
    def get_color(j, on_gpu=None):
        global color_cache
        color_idx = (classes[j] * 5 if class_color else j * 5) % len(COLORS)

        if on_gpu is not None and color_idx in color_cache[on_gpu]:
            return color_cache[on_gpu][color_idx]
        else:
            color = COLORS[color_idx]
            if not undo_transform:
                # The image might come in as RGB or BGR, depending
                color = (color[2], color[1], color[0])
            if on_gpu is not None:
                color = torch.Tensor(color).to(on_gpu).float() / 255.
                color_cache[on_gpu][color_idx] = color
            return color

    # First, draw the masks on the GPU where we can do it really fast
    # Beware: very fast but possibly unintelligible mask-drawing code ahead
    if args.display_masks and cfg.eval_mask_branch and num_dets_to_consider > 0:
        # Add a trailing channel axis to the selected masks
        masks = masks[:num_dets_to_consider, :, :, None]

        # One color per detection, stacked along dim 0 as [num_dets, 1, 1, 3]
        colors = torch.cat(
            [get_color(j, on_gpu=img_gpu.device.index).view(1, 1, 1, 3)
             for j in range(num_dets_to_consider)],
            dim=0)
        masks_color = masks.repeat(1, 1, 1, 3) * colors * mask_alpha

        # This is 1 everywhere except for 1-mask_alpha where the mask is
        inv_alph_masks = masks * (-mask_alpha) + 1

        # This whole block should be equivalent to:
        #    for j in range(num_dets_to_consider):
        #        img_gpu = img_gpu * inv_alph_masks[j] + masks_color[j]
        masks_color_summand = masks_color[0]
        if num_dets_to_consider > 1:
            inv_alph_cumul = inv_alph_masks[:(num_dets_to_consider - 1)].cumprod(dim=0)
            masks_color_cumul = masks_color[1:] * inv_alph_cumul
            masks_color_summand += masks_color_cumul.sum(dim=0)

        # The image is attenuated by every mask's inverse alpha (their product)
        # before the accumulated color term is added.
        img_gpu = img_gpu * inv_alph_masks.prod(dim=0) + masks_color_summand

    if args.display_fps:
        # Draw the box for the fps on the GPU
        font_face = cv2.FONT_HERSHEY_DUPLEX
        font_scale = 0.6
        font_thickness = 1

        text_w, text_h = cv2.getTextSize(fps_str, font_face, font_scale, font_thickness)[0]

        img_gpu[0:text_h + 8, 0:text_w + 8] *= 0.6  # 1 - Box alpha

    # Then draw the stuff that needs to be done on the cpu
    # Note, make sure this is a uint8 tensor or opencv will not anti alias text for whatever reason
    img_numpy = (img_gpu * 255).byte().cpu().numpy()

    if args.display_fps:
        # Draw the text on the CPU
        text_pt = (4, text_h + 2)
        text_color = [255, 255, 255]

        cv2.putText(img_numpy, fps_str, text_pt, font_face, font_scale,
                    text_color, font_thickness, cv2.LINE_AA)

    if num_dets_to_consider == 0:
        return img_numpy

    if args.display_text or args.display_bboxes:
        # Draw in reverse so the first (highest-scoring) detections end up on top.
        for j in reversed(range(num_dets_to_consider)):
            x1, y1, x2, y2 = boxes[j, :]
            color = get_color(j)
            score = scores[j]

            if args.display_bboxes:
                cv2.rectangle(img_numpy, (x1, y1), (x2, y2), color, 1)

            if args.display_text:
                _class = cfg.dataset.class_names[classes[j]]
                text_str = '%s: %.2f' % (_class, score) if args.display_scores else _class

                font_face = cv2.FONT_HERSHEY_DUPLEX
                font_scale = 0.6
                font_thickness = 1

                text_w, text_h = cv2.getTextSize(text_str, font_face, font_scale, font_thickness)[0]

                text_pt = (x1, y1 - 3)
                text_color = [255, 255, 255]

                # Filled label background sitting on top of the box's upper-left corner
                cv2.rectangle(img_numpy, (x1, y1), (x1 + text_w, y1 - text_h - 4), color, -1)
                cv2.putText(img_numpy, text_str, text_pt, font_face, font_scale,
                            text_color, font_thickness, cv2.LINE_AA)

    return img_numpy
def evaluate(net: Yolact, dataset, train_mode=False):
    """
    Run the network in the mode selected by the command-line args.

    If args.image, args.images or args.video is set, the matching
    evalimage / evalimages / evalvideo helper is called and the function
    returns. Otherwise the dataset is iterated and each image is displayed
    (args.display), only timed (args.benchmark), or scored with
    prep_metrics().

    Args:
        net: The network to evaluate.
        dataset: Dataset providing pull_item(), ids and len().
        train_mode: When True, the AP data is not pickled to args.ap_data_file.

    Returns:
        calc_map(ap_data) in metrics mode without COCO JSON output;
        None in every other case (including a KeyboardInterrupt).
    """
    net.detect.use_fast_nms = args.fast_nms
    net.detect.use_cross_class_nms = args.cross_class_nms
    cfg.mask_proto_debug = args.mask_proto_debug

    # TODO Currently we do not support Fast Mask Re-scoring in evalimage, evalimages, and evalvideo
    if args.image is not None:
        if ':' in args.image:
            inp, out = args.image.split(':')
            evalimage(net, inp, out)
        else:
            evalimage(net, args.image)
        return
    elif args.images is not None:
        inp, out = args.images.split(':')
        evalimages(net, inp, out)
        return
    elif args.video is not None:
        # Same 'input[:output]' convention as args.image.
        if ':' in args.video:
            inp, out = args.video.split(':')
            evalvideo(net, inp, out)
        else:
            evalvideo(net, args.video)
        return

    frame_times = MovingAverage()
    dataset_size = len(dataset) if args.max_images < 0 else min(
        args.max_images, len(dataset))
    progress_bar = ProgressBar(30, dataset_size)

    print()

    if not args.display and not args.benchmark:
        # For each class and iou, stores tuples (score, isPositive)
        # Index ap_data[type][iouIdx][classIdx]
        ap_data = {
            'box': [[APDataObject() for _ in cfg.dataset.class_names]
                    for _ in iou_thresholds],
            'mask': [[APDataObject() for _ in cfg.dataset.class_names]
                     for _ in iou_thresholds],
        }
        detections = Detections()
    else:
        timer.disable('Load Data')

    dataset_indices = list(range(len(dataset)))

    if args.shuffle:
        random.shuffle(dataset_indices)
    elif not args.no_sort:
        # Do a deterministic shuffle based on the image ids
        #
        # I do this because on python 3.5 dictionary key order is *random*, while in 3.6 it's
        # the order of insertion. That means on python 3.6, the images come in the order they are in
        # in the annotations file. For some reason, the first images in the annotations file are
        # the hardest. To combat this, I use a hard-coded hash function based on the image ids
        # to shuffle the indices we use. That way, no matter what python version or how pycocotools
        # handles the data, we get the same result every time.
        hashed = [badhash(x) for x in dataset.ids]
        dataset_indices.sort(key=lambda x: hashed[x])

    dataset_indices = dataset_indices[:dataset_size]

    try:
        # Main eval loop
        for it, image_idx in enumerate(dataset_indices):
            timer.reset()

            with timer.env('Load Data'):
                img, gt, gt_masks, h, w, num_crowd = dataset.pull_item(image_idx)

                # Test flag, do not upvote
                if cfg.mask_proto_debug:
                    # Local import so this debug path brings its own dependency.
                    import numpy as np

                    with open('scripts/info.txt', 'w') as f:
                        f.write(str(dataset.ids[image_idx]))
                    np.save('scripts/gt.npy', gt_masks)

                batch = Variable(img.unsqueeze(0))
                if args.cuda:
                    batch = batch.cuda()

            with timer.env('Network Extra'):
                preds = net(batch)

            # Perform the meat of the operation here depending on our mode.
            if args.display:
                img_numpy = prep_display(preds, img, h, w)
            elif args.benchmark:
                prep_benchmark(preds, h, w)
            else:
                prep_metrics(ap_data, preds, img, gt, gt_masks, h, w, num_crowd,
                             dataset.ids[image_idx], detections)

            # First couple of images take longer because we're constructing the graph.
            # Since that's technically initialization, don't include those in the FPS calculations.
            if it > 1:
                frame_times.add(timer.total_time())

            if args.display:
                if it > 1:
                    print('Avg FPS: %.4f' % (1 / frame_times.get_avg()))
                plt.imshow(img_numpy)
                plt.title(str(dataset.ids[image_idx]))
                # Without this the figure prepared above is never shown.
                plt.show()
            elif not args.no_bar:
                if it > 1:
                    fps = 1 / frame_times.get_avg()
                else:
                    fps = 0
                progress = (it + 1) / dataset_size * 100
                progress_bar.set_val(it + 1)
                print(
                    '\rProcessing Images %s %6d / %6d (%5.2f%%) %5.2f fps '
                    % (repr(progress_bar), it + 1, dataset_size, progress, fps),
                    end='')

        if not args.display and not args.benchmark:
            print()
            if args.output_coco_json:
                print('Dumping detections...')
                if args.output_web_json:
                    detections.dump_web()
                else:
                    detections.dump()
            else:
                if not train_mode:
                    print('Saving data...')
                    with open(args.ap_data_file, 'wb') as f:
                        pickle.dump(ap_data, f)

                return calc_map(ap_data)
        elif args.benchmark:
            print()
            print()
            print('Stats for the last frame:')
            timer.print_stats()
            avg_seconds = frame_times.get_avg()
            print('Average: %5.2f fps, %5.2f ms'
                  % (1 / frame_times.get_avg(), 1000 * avg_seconds))

    except KeyboardInterrupt:
        print('Stopping...')
def forward(self, x):
    """
    Run the backbone, optional FPN, prototype branch and prediction heads.

    The input should be of size [batch_size, 3, img_h, img_w].

    Returns:
        In training mode, the dict of concatenated head outputs ('loc',
        'conf', 'mask', 'priors', plus 'score' / 'inst' / 'proto' /
        'classes' / 'segm' when the matching config options are on).
        Otherwise, the result of self.detect(pred_outs, self) after the
        confidence activations have been applied.
    """
    _, _, img_h, img_w = x.size()
    # Stashed on cfg so other code can see the current image size.
    cfg._tmp_img_h = img_h
    cfg._tmp_img_w = img_w

    with timer.env('backbone'):
        outs = self.backbone(x)

    if cfg.fpn is not None:
        with timer.env('fpn'):
            # Use backbone.selected_layers because we overwrote self.selected_layers
            outs = [outs[i] for i in cfg.backbone.selected_layers]
            outs = self.fpn(outs)

    proto_out = None
    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:
        with timer.env('proto'):
            proto_x = x if self.proto_src is None else outs[self.proto_src]

            if self.num_grids > 0:
                # Append the grid channels to the prototype input.
                grids = self.grid.repeat(proto_x.size(0), 1, 1, 1)
                proto_x = torch.cat([proto_x, grids], dim=1)

            proto_out = self.proto_net(proto_x)
            proto_out = cfg.mask_proto_prototype_activation(proto_out)

            if cfg.mask_proto_prototypes_as_features:
                # Clone here because we don't want to permute this, though idk if contiguous makes this unnecessary
                proto_downsampled = proto_out.clone()

                if cfg.mask_proto_prototypes_as_features_no_grad:
                    proto_downsampled = proto_out.detach()

            # Move the features last so the multiplication is easy
            proto_out = proto_out.permute(0, 2, 3, 1).contiguous()

            if cfg.mask_proto_bias:
                # Append a constant-one channel along the last axis.
                bias_shape = list(proto_out.size())
                bias_shape[-1] = 1
                proto_out = torch.cat([proto_out, torch.ones(*bias_shape)], -1)

    with timer.env('pred_heads'):
        pred_outs = {'loc': [], 'conf': [], 'mask': [], 'priors': []}

        if cfg.use_mask_scoring:
            pred_outs['score'] = []

        if cfg.use_instance_coeff:
            pred_outs['inst'] = []

        for idx, pred_layer in zip(self.selected_layers, self.prediction_layers):
            pred_x = outs[idx]

            if cfg.mask_type == mask_type.lincomb and cfg.mask_proto_prototypes_as_features:
                # Scale the prototypes down to the current prediction layer's size and add it as inputs
                proto_downsampled = F.interpolate(
                    proto_downsampled, size=outs[idx].size()[2:],
                    mode='bilinear', align_corners=False)
                pred_x = torch.cat([pred_x, proto_downsampled], dim=1)

            # A hack for the way dataparallel works
            if cfg.share_prediction_module and pred_layer is not self.prediction_layers[0]:
                pred_layer.parent = [self.prediction_layers[0]]

            p = pred_layer(pred_x)

            for k, v in p.items():
                pred_outs[k].append(v)

    # Merge the per-layer outputs along the second-to-last axis.
    for k, v in pred_outs.items():
        pred_outs[k] = torch.cat(v, -2)

    if proto_out is not None:
        pred_outs['proto'] = proto_out

    # NOTE(review): the branch condition was lost in the source; nn.Module's
    # `training` flag is the restored test - confirm against the upstream file.
    if self.training:
        # For the extra loss functions
        if cfg.use_class_existence_loss:
            pred_outs['classes'] = self.class_existence_fc(
                outs[-1].mean(dim=(2, 3)))

        if cfg.use_semantic_segmentation_loss:
            pred_outs['segm'] = self.semantic_seg_conv(outs[0])

        return pred_outs
    else:
        if cfg.use_mask_scoring:
            pred_outs['score'] = torch.sigmoid(pred_outs['score'])

        if cfg.use_focal_loss:
            if cfg.use_sigmoid_focal_loss:
                # Note: even though conf[0] exists, this mode doesn't train it so don't use it
                pred_outs['conf'] = torch.sigmoid(pred_outs['conf'])
                if cfg.use_mask_scoring:
                    pred_outs['conf'] *= pred_outs['score']
            elif cfg.use_objectness_score:
                # See focal_loss_sigmoid for details
                objectness = torch.sigmoid(pred_outs['conf'][:, :, 0])
                pred_outs['conf'][:, :, 1:] = objectness[:, :, None] * F.softmax(
                    pred_outs['conf'][:, :, 1:], -1)
                pred_outs['conf'][:, :, 0] = 1 - objectness
            else:
                pred_outs['conf'] = F.softmax(pred_outs['conf'], -1)
        else:
            if cfg.use_objectness_score:
                objectness = torch.sigmoid(pred_outs['conf'][:, :, 0])

                # Zero out the class scores wherever objectness is at most 0.10.
                pred_outs['conf'][:, :, 1:] = (objectness > 0.10)[..., None] \
                    * F.softmax(pred_outs['conf'][:, :, 1:], dim=-1)
            else:
                pred_outs['conf'] = F.softmax(pred_outs['conf'], -1)

        return self.detect(pred_outs, self)
# GPU net = net.cuda() torch.set_default_tensor_type('torch.cuda.FloatTensor') x = torch.zeros((1, 3, cfg.max_size, cfg.max_size)) y = net(x) for p in net.prediction_layers: print(p.last_conv_size) print() for k, a in y.items(): print(k + ': ', a.size(), torch.sum(a)) exit() net(x) # timer.disable('pass2') avg = MovingAverage() try: while True: timer.reset() with timer.env('everything else'): net(x) avg.add(timer.total_time()) print('\033[2J') # Moves console cursor to 0,0 timer.print_stats() print('Avg fps: %.2f\tAvg ms: %.2f ' % (1 / avg.get_avg(), avg.get_avg() * 1000)) except KeyboardInterrupt: pass
def postprocess(det_output, w, h, batch_idx=0, interpolation_mode='bilinear',
                visualize_lincomb=False, crop_masks=True, score_threshold=0):
    """
    Postprocesses the output of Yolact on testing mode into a format that makes sense,
    accounting for all the possible configuration settings.

    Args:
        - det_output: The list of dicts that Detect outputs.
        - w: The real width of the image.
        - h: The real height of the image.
        - batch_idx: If you have multiple images for this batch, the image's index in the batch.
        - interpolation_mode: Can be 'nearest' | 'area' | 'bilinear' (see torch.nn.functional.interpolate)
        - visualize_lincomb: If set, display_lincomb() is called on the prototypes and coefficients.
        - crop_masks: If set, the assembled masks are cropped to their boxes before upsampling.
        - score_threshold: Detections scoring at or below this are dropped (only when > 0).

    Returns 4 torch Tensors (in the following order):
        - classes [num_det]: The class idx for each detection.
        - scores  [num_det]: The confidence score for each detection.
        - boxes   [num_det, 4]: The bounding box for each detection in absolute point form.
        - masks   [num_det, h, w]: Full image masks for each detection.

    Notes:
        - With cfg.use_maskiou and cfg.rescore_mask on and cfg.rescore_bbox off,
          `scores` is a two-element list [scores, scores * maskiou] instead.
        - When there is nothing to return, a list holding the same empty
          tensor four times is returned.
        - Score filtering rewrites the entries of the detection dict in place.
    """
    dets = det_output[batch_idx]
    net = dets['net']
    dets = dets['detection']

    if dets is None:
        return [torch.Tensor()] * 4  # Warning, this is 4 copies of the same thing

    if score_threshold > 0:
        keep = dets['score'] > score_threshold

        # The prototypes are shared by all detections, so they are not filtered.
        for k in dets:
            if k != 'proto':
                dets[k] = dets[k][keep]

        if dets['score'].size(0) == 0:
            return [torch.Tensor()] * 4

    # Actually extract everything from dets now
    classes = dets['class']
    boxes = dets['box']
    scores = dets['score']
    masks = dets['mask']

    if cfg.mask_type == mask_type.lincomb and cfg.eval_mask_branch:
        # At this point masks is only the coefficients
        proto_data = dets['proto']

        # Test flag, do not upvote
        if cfg.mask_proto_debug:
            # Local import so this debug path brings its own dependency.
            import numpy as np

            np.save('scripts/proto.npy', proto_data.cpu().numpy())

        if visualize_lincomb:
            display_lincomb(proto_data, masks)

        # Linear combination of the prototypes with each detection's coefficients
        masks = proto_data @ masks.t()
        masks = cfg.mask_proto_mask_activation(masks)

        # Crop masks before upsampling because you know why
        if crop_masks:
            masks = crop(masks, boxes)

        # Permute into the correct output shape [num_dets, proto_h, proto_w]
        masks = masks.permute(2, 0, 1).contiguous()

        if cfg.use_maskiou:
            with timer.env('maskiou_net'):
                with torch.no_grad():
                    maskiou_p = net.maskiou_net(masks.unsqueeze(1))
                    # Keep only the prediction for each detection's own class.
                    maskiou_p = torch.gather(
                        maskiou_p, dim=1, index=classes.unsqueeze(1)).squeeze(1)
                    if cfg.rescore_mask:
                        if cfg.rescore_bbox:
                            scores = scores * maskiou_p
                        else:
                            scores = [scores, scores * maskiou_p]

        # Scale masks up to the full image
        masks = F.interpolate(masks.unsqueeze(0), (h, w), mode=interpolation_mode,
                              align_corners=False).squeeze(0)

        # Binarize the masks
        masks.gt_(0.5)

    boxes[:, 0], boxes[:, 2] = sanitize_coordinates(boxes[:, 0], boxes[:, 2], w, cast=False)
    boxes[:, 1], boxes[:, 3] = sanitize_coordinates(boxes[:, 1], boxes[:, 3], h, cast=False)
    boxes = boxes.long()

    # NOTE(review): the enum member was lost in the source; `direct` is the
    # restored name for the non-lincomb mask type - confirm against mask_type.
    if cfg.mask_type == mask_type.direct and cfg.eval_mask_branch:
        # Upscale masks
        full_masks = torch.zeros(masks.size(0), h, w)

        for jdx in range(masks.size(0)):
            x1, y1, x2, y2 = boxes[jdx, :]

            mask_w = x2 - x1
            mask_h = y2 - y1

            # Just in case
            if mask_w * mask_h <= 0 or mask_w < 0:
                continue

            mask = masks[jdx, :].view(1, 1, cfg.mask_size, cfg.mask_size)
            mask = F.interpolate(mask, (mask_h, mask_w), mode=interpolation_mode,
                                 align_corners=False)
            # Binarize, matching what the lincomb path does to its masks,
            # then paste the result into the box region of the full-size mask.
            mask = mask.gt(0.5).float()
            full_masks[jdx, y1:y2, x1:x2] = mask

        masks = full_masks

    return classes, scores, boxes, masks