def _sample_pairs(roidb, im_scale, batch_idx):
    """Build the object/relationship RoI blobs and label blobs for one entry.

    No sampling takes place here: every row of ``roidb['obj_boxes']`` is kept.

    Args:
        roidb: mapping providing ``'obj_boxes'``, ``'prd_gt_cls'`` and
            ``'obj_gt_cls'``.
        im_scale: factor the box coordinates are multiplied by.
        batch_idx: value written into the first column of every RoI row.

    Returns:
        dict with ``'obj_rois'``, ``'rel_rois'`` (the very same array object
        as ``'obj_rois'``), ``'prd_gt_cls'`` and ``'obj_gt_cls'`` (each a
        length-1 int32 array).
    """
    def _single_label(value):
        # Store one class id in a length-1 int32 array.
        holder = np.zeros(1, dtype=np.int32)
        holder[0] = value
        return holder

    boxes = roidb['obj_boxes']
    scaled_boxes = boxes * im_scale
    batch_column = batch_idx * blob_utils.ones((boxes.shape[0], 1))
    # Rows are laid out as (batch_idx, <scaled box coordinates>).
    obj_rois = np.hstack((batch_column, scaled_boxes))

    return {
        'obj_rois': obj_rois,
        'rel_rois': obj_rois,
        'prd_gt_cls': _single_label(roidb['prd_gt_cls']),
        'obj_gt_cls': _single_label(roidb['obj_gt_cls']),
    }
def add_prn_blobs(blobs_out, blobs_in):
    """Add PRN specific blobs to the output blob dictionary.

    Args:
        blobs_in: mapping providing ``'labels_int32'`` and, when at least one
            label is > 0, ``'mask_ious'``.
        blobs_out: mapping that receives the three blobs listed below.

    Blobs written to ``blobs_out``:
        prn_labels_int32: 1 where the mask IoU is below
            ``cfg.PRN.IOU_THRESHOLD`` and 0 otherwise; expanded to one column
            per class when ``cfg.PRN.CLS_SPECIFIC_LABEL`` is set.
        roi_needs_refine_int32: the same flags, but forced to contain at
            least one 1.
        refine_ratio: number of flagged rois divided by the number of
            foreground rois (float32).
    """
    iou_thres = cfg.PRN.IOU_THRESHOLD
    fg_inds = np.where(blobs_in['labels_int32'] > 0)[0]

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        fg_labels = blobs_in['labels_int32'][fg_inds]
        # If below threshold, the label is 1, otherwise 0
        prn_labels = (blobs_in['mask_ious'] < iou_thres).astype(np.int32)
        # roi_needs_refine starts out identical to prn_labels; it is a
        # separate array because its first entry may be forced to 1 below.
        roi_needs_refine = prn_labels.copy()
        # The ratio is taken before the forced entry so that it reflects the
        # real number of rois below the threshold.
        refine_ratio = np.sum(
            roi_needs_refine, keepdims=True).astype(np.float32)
        refine_ratio /= fg_inds.shape[0]
        # prn_labels may be all zero, but roi_needs_refine must keep at
        # least one roi selected, so select the first one.
        if np.sum(roi_needs_refine) == 0:
            roi_needs_refine[0] = 1
    else:
        # No foreground rois (it does happen). The network cannot handle
        # empty blobs, so provide a single placeholder entry.
        # -1 is the ignore label
        prn_labels = -blob_utils.ones((1, ), int32=True)
        # Class 0 (background). The label is requested as int32 because it
        # is used as an index array when the targets are expanded per class.
        fg_labels = blob_utils.zeros((1, ), int32=True)
        # Keep exactly one roi selected for refinement
        roi_needs_refine = blob_utils.ones((1, ), int32=True)
        refine_ratio = blob_utils.zeros((1, ))

    if cfg.PRN.CLS_SPECIFIC_LABEL:
        prn_labels = _expand_to_class_specific_prn_targets(
            prn_labels, fg_labels)

    blobs_out['prn_labels_int32'] = prn_labels
    blobs_out['roi_needs_refine_int32'] = roi_needs_refine
    blobs_out['refine_ratio'] = refine_ratio
def add_refine_keypoints_blobs_gaussian(blobs, roidb, fg_rois_per_image,
                                        fg_inds, im_scale, batch_idx, data):
    """Add the refined keypoint rois, Gaussian heatmaps and weights to blobs.

    Args:
        blobs: blob dictionary; ``blobs['keypoint_fg_inds']`` is read and the
            three ``refined_keypoint_*`` blobs are written.
        roidb: mapping providing ``'gt_classes'``, ``'gt_keypoints'``,
            ``'boxes'`` and ``'box_to_gt_ind_map'``.
        fg_rois_per_image: not used by this function.
        fg_inds: not used by this function.
        im_scale: factor the roi coordinates are multiplied by.
        batch_idx: value written into the first column of every roi row.
        data: array whose dimensions 2 and 3 are read as input height/width.
    """
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    gt_keypoints = roidb['gt_keypoints']
    # Reuse the kp_fg_inds already stored in blobs so that keypoint_rois and
    # refined_keypoint_rois cannot get out of sync.
    kp_fg_inds = blobs['keypoint_fg_inds']

    if kp_fg_inds.shape[0] > 0:
        sampled_fg_rois = roidb['boxes'][kp_fg_inds]
        box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

        # Expand the rois, then clip them to the input size expressed in
        # unscaled image coordinates.
        up_scale = cfg.REFINENET.UP_SCALE
        inp_h, inp_w = data.shape[2], data.shape[3]
        pad_img_h, pad_img_w = inp_h / im_scale, inp_w / im_scale
        pad_fg_rois = box_utils.expand_boxes(sampled_fg_rois, up_scale)
        pad_fg_rois = box_utils.clip_boxes_to_image(
            pad_fg_rois, pad_img_h, pad_img_w)

        num_keypoints = gt_keypoints.shape[2]
        sampled_keypoints = -np.ones(
            (len(pad_fg_rois), gt_keypoints.shape[1], num_keypoints),
            dtype=gt_keypoints.dtype)
        for ii in range(len(pad_fg_rois)):
            ind = box_to_gt_ind_map[ii]
            if ind >= 0:
                sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
                assert np.sum(sampled_keypoints[ii, 2, :]) > 0

        heats, weights = keypoint_utils.keypoints_to_gaussian_heatmap_labels(
            sampled_keypoints, pad_fg_rois,
            M=cfg.REFINENET.KRCNN.HEATMAP_SIZE)
    else:
        # If there are no fg keypoint rois (it does happen)
        # The network cannot handle empty blobs, so we must provide a heatmap
        # We simply take the first bg roi, give it an all zero heatmap, and
        # set its weights to zero (ignore label).
        roi_inds = np.where(roidb['gt_classes'] == 0)[0]
        # Integer indexing followed by reshape yields a view of
        # roidb['boxes']; copy it so that the in-place scaling below cannot
        # rescale the roidb entry itself.
        pad_fg_rois = roidb['boxes'][roi_inds[0]].reshape((1, -1)).copy()
        # We give it a 0's blob
        M = cfg.REFINENET.KRCNN.HEATMAP_SIZE
        heats = blob_utils.zeros((1, cfg.KRCNN.NUM_KEYPOINTS, M, M))
        # We set weights to 0 (ignore label)
        weights = blob_utils.zeros((1, cfg.KRCNN.NUM_KEYPOINTS, 1))

    # Scale the rois and prepend the batch index column
    pad_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (pad_fg_rois.shape[0], 1))
    pad_fg_rois = np.hstack((repeated_batch_idx, pad_fg_rois))

    blobs['refined_keypoint_rois'] = pad_fg_rois
    blobs['refined_keypoint_heatmaps'] = heats
    blobs['refined_keypoint_weights'] = weights
def add_keypoint_rcnn_blobs(
    blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
    """Add the keypoint rois, keypoint location labels and weights to blobs.

    Args:
        blobs: blob dictionary receiving ``'keypoint_rois'``,
            ``'keypoint_locations_int32'`` and ``'keypoint_weights'``.
        roidb: mapping providing ``'gt_classes'``, ``'max_overlaps'``,
            ``'gt_keypoints'``, ``'boxes'`` and ``'box_to_gt_ind_map'``.
        fg_rois_per_image: upper bound on the number of rois kept.
        fg_inds: not used by this function.
        im_scale: factor the roi coordinates are multiplied by.
        batch_idx: value written into the first column of every roi row.
    """
    # Note: the gt row indices must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_rows = np.where(roidb['gt_classes'] > 0)[0]
    overlaps = roidb['max_overlaps']
    all_gt_kps = roidb['gt_keypoints']

    # A roi qualifies when its overlap reaches FG_THRESH and its mapped gt
    # has at least one keypoint with a positive third-row value that
    # _within_box also reports for the roi.
    kp_rows = gt_rows[roidb['box_to_gt_ind_map']]
    inside = _within_box(all_gt_kps[kp_rows, :, :], roidb['boxes'])
    flagged = all_gt_kps[kp_rows, 2, :] > 0
    has_usable_kp = np.sum(np.logical_and(flagged, inside), axis=1) > 0
    candidates = np.where(
        np.logical_and(overlaps >= cfg.TRAIN.FG_THRESH, has_usable_kp)
    )[0]

    # Subsample without replacement when there are more rois than allowed
    quota = np.minimum(fg_rois_per_image, candidates.size)
    if candidates.size > quota:
        candidates = np.random.choice(candidates, size=quota, replace=False)

    rois = roidb['boxes'][candidates]
    roi_to_gt = roidb['box_to_gt_ind_map'][candidates]

    # Keypoints default to -1 for rois without a mapped gt
    kps = -np.ones(
        (len(rois), all_gt_kps.shape[1], all_gt_kps.shape[2]),
        dtype=all_gt_kps.dtype
    )
    for row in range(len(rois)):
        gt_slot = roi_to_gt[row]
        if gt_slot >= 0:
            kps[row, :, :] = all_gt_kps[gt_rows[gt_slot], :, :]
            assert np.sum(kps[row, 2, :]) > 0

    heats, weights = keypoint_utils.keypoints_to_heatmap_labels(kps, rois)

    # One row per (roi, keypoint) pair
    flat_shape = (rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS, 1)
    heats = heats.reshape(flat_shape)
    weights = weights.reshape(flat_shape)

    # Scale the rois and prepend the batch index column
    rois *= im_scale
    batch_column = batch_idx * blob_utils.ones((rois.shape[0], 1))
    rois = np.hstack((batch_column, rois))

    blobs['keypoint_rois'] = rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
def _expand_to_class_specific_prn_targets(prn_labels, class_labels):
    """Expand labels from shape (#rois, ) to (#rois, #classes).

    Row ``i`` of the result is -1 everywhere except in column
    ``class_labels[i]``, which holds ``prn_labels[i]``.

    Args:
        prn_labels: per-roi labels; its first dimension is the roi count.
        class_labels: per-roi class ids with the same first dimension. They
            are cast to integers before being used as column indices, so a
            floating point label array is accepted as well.

    Returns:
        int32 array of shape (#rois, cfg.MODEL.NUM_CLASSES).
    """
    assert prn_labels.shape[0] == class_labels.shape[0]
    num_rois = prn_labels.shape[0]

    # Target values of -1 are "don't care" / ignore labels
    prn_targets = -blob_utils.ones(
        (num_rois, cfg.MODEL.NUM_CLASSES), int32=True)

    # NumPy only accepts integer (or boolean) arrays as indices; the caller's
    # no-foreground fallback builds its label with blob_utils.zeros() without
    # requesting int32.
    class_cols = np.asarray(class_labels).astype(np.int64, copy=False)
    prn_targets[np.arange(num_rois), class_cols] = prn_labels
    return prn_targets
def add_semantic_segms_blobs(blobs, roidb, im_scale, batch_idx, data):
    """Add the semantic segmentation net specific blobs to the blob dictionary.

    Every non-crowd ground-truth polygon set is drawn into the plane of its
    class.

    Args:
        blobs: blob dictionary receiving ``'semantic_segms_int32'`` and
            ``'img_rois'``.
        roidb: mapping providing ``'gt_classes'``, ``'is_crowd'``,
            ``'segms'``, ``'height'`` and ``'width'``.
        im_scale: scale applied to the image.
        batch_idx: value stored first in the ``'img_rois'`` row.
        data: array whose dimensions 2 and 3 are read as input height/width.
    """
    num_cls = cfg.MODEL.NUM_CLASSES
    rescale_factor = cfg.SEMANTIC_NET.RESCALE_FACTOR

    keep = (roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0)
    polys_gt_inds = np.where(keep)[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]

    # Input size and the size of the rescaled label map
    inp_h, inp_w = data.shape[2], data.shape[3]
    out_h = int(inp_h * rescale_factor)
    out_w = int(inp_w * rescale_factor)

    if polys_gt_inds.shape[0] == 0:
        # The network cannot handle empty blobs, so provide an all -1
        # (ignore label) blob of the regular flattened size.
        label_map = -blob_utils.ones(
            (1, num_cls * out_h * out_w), int32=True)
    else:
        gt_class_labels = roidb['gt_classes'][polys_gt_inds]
        label_map = blob_utils.zeros((num_cls, out_h, out_w), int32=True)

        # Combined scale, and the image extent inside the label map
        scale = im_scale * rescale_factor
        im_h, im_w = roidb['height'], roidb['width']
        im_label_h, im_label_w = int(im_h * scale), int(im_w * scale)

        for cls_label, poly_gt in zip(gt_class_labels, polys_gt):
            # Rasterize the polygons at the combined scale
            mask = segm_utils.polys_to_mask_scaled(poly_gt, im_h, im_w, scale)
            mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
            # Instances of the same class are merged by element-wise maximum
            region = label_map[cls_label, 0:im_label_h, 0:im_label_w]
            label_map[cls_label, 0:im_label_h, 0:im_label_w] = np.maximum(
                region, mask, dtype=np.int32)

        label_map = np.reshape(label_map, (1, num_cls * out_h * out_w))

    blobs['semantic_segms_int32'] = label_map
    # A single roi spanning the whole input
    whole_image = [batch_idx, 0, 0, inp_w - 1, inp_h - 1]
    blobs['img_rois'] = np.array(whole_image, dtype=np.float32)[np.newaxis, :]
def add_classification_blobs(blobs, im_scales, roidb):
    """Add the blobs needed for training classification models.

    Args:
        blobs: mapping whose ``'rois'`` and ``'labels_int32'`` entries are
            lists; every non-empty list value is replaced by its
            concatenation.
        im_scales: not used by this function.
        roidb: iterable of entries, each providing ``'gt_classes'``.

    Returns:
        Always ``True``.
    """
    # One batch-index column and one label vector per image
    for im_i, entry in enumerate(roidb):
        gt_classes = entry['gt_classes']
        batch_column = im_i * blob_utils.ones((gt_classes.shape[0], 1))
        blobs['rois'].append(batch_column)
        blobs['labels_int32'].append(gt_classes.astype(np.int32))

    # Turn every non-empty blob list into a single tensor
    for name in list(blobs):
        value = blobs[name]
        if isinstance(value, list) and value:
            blobs[name] = np.concatenate(value)

    return True
def _expand_to_class_specific_mask_targets(masks, mask_class_labels):
    """Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2).

    Each mask is written into the slot of its own class; everything else
    stays at -1.

    Args:
        masks: array with one flattened mask per row.
        mask_class_labels: one class id per mask.

    Returns:
        int32 array of shape (#masks, cfg.MODEL.NUM_CLASSES * M ** 2).
    """
    assert masks.shape[0] == mask_class_labels.shape[0]
    side = cfg.MRCNN.RESOLUTION
    area = side**2
    num_masks = masks.shape[0]

    # Target values of -1 are "don't care" / ignore labels
    targets = -blob_utils.ones(
        (num_masks, cfg.MODEL.NUM_CLASSES * area), int32=True)

    for row in range(num_masks):
        cls = int(mask_class_labels[row])
        # Ignore background instance
        # (only happens when there is no fg samples in an image)
        if cls <= 0:
            continue
        offset = area * cls
        targets[row, offset:offset + area] = masks[row, :]

    return targets
def _expand_to_class_specific_mask_targets(masks, mask_class_labels):
    """Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2).

    Each mask is written into the slot of its own class; everything else
    stays at -1.

    Args:
        masks: array with one flattened mask per row.
        mask_class_labels: one class id per mask.

    Returns:
        int32 array of shape (#masks, cfg.MODEL.NUM_CLASSES * M ** 2).
    """
    assert masks.shape[0] == mask_class_labels.shape[0]
    resolution = cfg.MRCNN.RESOLUTION
    pixels = resolution**2
    count = masks.shape[0]

    # Target values of -1 are "don't care" / ignore labels
    expanded = -blob_utils.ones(
        (count, cfg.MODEL.NUM_CLASSES * pixels), int32=True)

    index = 0
    while index < count:
        class_id = int(mask_class_labels[index])
        first = pixels * class_id
        last = first + pixels
        # Background instances are skipped
        # (only happens when there is no fg samples in an image)
        if class_id > 0:
            expanded[index, first:last] = masks[index, :]
        index += 1

    return expanded
def _sample_rois(roidb, im_scale, batch_idx):
    """Draw a random subset of the boxes and format it as a rois blob.

    At most ``cfg.TRAIN.BATCH_SIZE_PER_IM`` boxes are drawn, without
    replacement.

    Args:
        roidb: mapping providing ``'boxes'``.
        im_scale: factor the box coordinates are multiplied by.
        batch_idx: value written into the first column of every roi row.

    Returns:
        dict with the single key ``'rois'``.
    """
    boxes = roidb['boxes']
    budget = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    num_keep = np.minimum(budget, len(boxes))

    chosen = npr.choice(np.arange(len(boxes)), size=num_keep, replace=False)
    chosen = chosen.astype(np.int32)

    # Scale the chosen boxes and prepend the batch index column
    scaled = boxes[chosen] * im_scale
    batch_column = batch_idx * blob_utils.ones((scaled.shape[0], 1))
    return {'rois': np.hstack((batch_column, scaled))}
def _expand_to_class_specific_boundary_targets(boundarys,
                                               boundary_class_labels):
    """Expand boundaries from (#boundarys, M ** 2) to
    (#boundarys, #classes * M ** 2).

    Each boundary map is written into the slot of its own class; everything
    else stays at -1.

    Args:
        boundarys: array with one flattened boundary map per row.
        boundary_class_labels: one class id per boundary map.

    Returns:
        int32 array of shape (#boundarys, cfg.MODEL.NUM_CLASSES * M ** 2).
    """
    assert boundarys.shape[0] == boundary_class_labels.shape[0]
    side = cfg.BOUNDARY.RESOLUTION
    area = side**2
    num_rows = boundarys.shape[0]

    # Target values of -1 are "don't care" / ignore labels
    targets = -blob_utils.ones(
        (num_rows, cfg.MODEL.NUM_CLASSES * area), int32=True)

    for row in range(num_rows):
        cls = int(boundary_class_labels[row])
        # Ignore background instance
        # (only happens when there is no fg samples in an image)
        if cls <= 0:
            continue
        offset = area * cls
        targets[row, offset:offset + area] = boundarys[row, :]

    return targets
def _get_gt_rois(roidb, im_scale, batch_idx):
    """Collect the ground-truth rois and their labels.

    Args:
        roidb: mapping providing ``'gt_classes'``, ``'boxes'``, ``'height'``
            and ``'width'``.
        im_scale: factor the box coordinates are multiplied by.
        batch_idx: value written into the first column of every roi row.

    Returns:
        dict with ``'labels_int32'``, ``'rois'`` and ``'im_info'`` (a 1 x 3
        float32 array holding rounded scaled height, rounded scaled width and
        ``im_scale``).
    """
    # Every entry with a positive class is kept
    gt_rows = np.where(roidb['gt_classes'] > 0)[0]
    labels = roidb['gt_classes'][gt_rows]

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    scaled = roidb['boxes'][gt_rows, :] * im_scale
    batch_column = batch_idx * blob_utils.ones((scaled.shape[0], 1))
    rois = np.hstack((batch_column, scaled))

    scaled_h = np.round(roidb['height'] * im_scale)
    scaled_w = np.round(roidb['width'] * im_scale)
    im_info = np.array([[scaled_h, scaled_w, im_scale]], dtype=np.float32)

    return {
        'labels_int32': labels.astype(np.int32, copy=False),
        'rois': rois,
        'im_info': im_info,
    }
def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx):
    """Add Mask R-CNN specific blobs to the input blob dictionary.

    Args:
        blobs: blob dictionary; ``blobs['labels_int32']`` is read and
            ``'mask_rois'``, ``'roi_has_mask_int32'`` and ``'masks_int32'``
            are written.
        sampled_boxes: boxes aligned with ``blobs['labels_int32']``. This
            array is not modified.
        roidb: mapping providing ``'gt_classes'``, ``'is_crowd'`` and
            ``'segms'``.
        im_scale: factor the roi coordinates are multiplied by.
        batch_idx: value written into the first column of every roi row.
    """
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    M = cfg.MRCNN.RESOLUTION
    polys_gt_inds = np.where((roidb['gt_classes'] > 0) &
                             (roidb['is_crowd'] == 0))[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)

    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    # 1 for every roi with a positive label, 0 otherwise
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False))
        # Map from each fg rois to the index of the mask with highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # add fg targets
        for i in range(rois_fg.shape[0]):
            fg_polys_ind = fg_polys_inds[i]
            poly_gt = polys_gt[fg_polys_ind]
            roi_fg = rois_fg[i]
            # Rasterize the portion of the polygon mask within the given fg
            # roi to an M x M binary image
            mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M)
            mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
            masks[i, :] = np.reshape(mask, M**2)
    else:
        # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide a mask
        # We simply take the first bg roi, give it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # rois_fg is actually one background roi. Integer indexing followed
        # by reshape yields a view of sampled_boxes, so take a copy: the
        # in-place scaling below must not rescale the caller's array.
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)).copy()
        # We give it an -1's blob (ignore label)
        masks = -blob_utils.ones((1, M**2), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks,
                                                       mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update blobs dict with Mask R-CNN blobs
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_int32'] = masks
def _charmask_fallback_targets(blobs, sampled_boxes, M_HEIGHT, M_WIDTH):
    """Build one-roi placeholder targets for an image without fg rois.

    Returns:
        Tuple ``(rois_fg, roi_has_mask, masks, mask_weights, char_boxes,
        char_boxes_inside_weight)``; every array has a leading dimension of 1.
    """
    # The network cannot handle empty blobs, so the first bg roi is used.
    bg_inds = np.where(blobs['labels_int32'] == 0)[0]
    # Integer indexing followed by reshape yields a view of sampled_boxes;
    # copy it because the caller scales rois_fg in place.
    rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)).copy()
    # -1 is the ignore label
    masks = -blob_utils.ones((1, 2, M_HEIGHT * M_WIDTH), int32=True)
    # NOTE(review): the foreground path produces float32 weights of shape
    # (#rois, M_HEIGHT * M_WIDTH); this placeholder keeps the original
    # (1, 2, M_HEIGHT * M_WIDTH) int32 layout. Confirm what the consumer of
    # 'masks_char_weight' expects.
    mask_weights = -blob_utils.ones((1, 2, M_HEIGHT * M_WIDTH), int32=True)
    # Shapes are passed as tuples and the dtype matches the foreground path.
    char_boxes = -np.ones((1, M_HEIGHT * M_WIDTH, 4), dtype=np.float32)
    char_boxes_inside_weight = np.zeros(
        (1, M_HEIGHT * M_WIDTH, 4), dtype=np.float32)
    # Exactly one placeholder roi, marked as having a mask
    roi_has_mask = np.ones((1, ), dtype=np.int32)
    return (rois_fg, roi_has_mask, masks, mask_weights, char_boxes,
            char_boxes_inside_weight)


def add_charmask_rcnn_blobs(blobs, sampled_boxes, gt_boxes, gt_inds, roidb,
                            im_scale, batch_idx):
    """Add the character-mask specific blobs to the input blob dictionary.

    Args:
        blobs: blob dictionary; ``blobs['labels_int32']`` is read and the
            mask / character box blobs are written.
        sampled_boxes: boxes aligned with ``blobs['labels_int32']``.
        gt_boxes: rois used when ``cfg.MRCNN.IS_E2E`` is false. This array is
            not modified.
        gt_inds: per-roi polygon indices used when ``cfg.MRCNN.IS_E2E`` is
            false.
        roidb: mapping providing ``'gt_classes'``, ``'is_crowd'``,
            ``'segms'``, ``'charboxes'`` and (with DEBUG) ``'image'``.
        im_scale: factor the roi coordinates are multiplied by.
        batch_idx: value written into the first column of every roi row.
    """
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    is_e2e = cfg.MRCNN.IS_E2E
    M_HEIGHT = cfg.MRCNN.RESOLUTION_H
    M_WIDTH = cfg.MRCNN.RESOLUTION_W
    mask_rois_per_this_image = cfg.MRCNN.MASK_BATCH_SIZE_PER_IM
    polys_gt_inds = np.where(
        (roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0)
    )[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    chars_gts = roidb['charboxes']
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    if DEBUG:
        img_path = roidb['image']
        img = Image.open(img_path)

    if is_e2e:
        fg_inds = np.where(blobs['labels_int32'] > 0)[0]
        if fg_inds.size > mask_rois_per_this_image:
            fg_inds = npr.choice(
                fg_inds, size=mask_rois_per_this_image, replace=False
            )
    else:
        fg_inds = gt_inds
    num_fg = fg_inds.shape[0]

    if num_fg > 0:
        roi_has_mask = np.ones((num_fg, ), dtype=np.int32)
        masks = blob_utils.zeros((num_fg, 2, M_HEIGHT * M_WIDTH), int32=True)
        mask_weights = np.zeros((num_fg, M_HEIGHT * M_WIDTH),
                                dtype=np.float32)
        char_boxes = np.zeros((num_fg, M_HEIGHT * M_WIDTH, 4),
                              dtype=np.float32)
        char_boxes_inside_weight = np.zeros((num_fg, M_HEIGHT * M_WIDTH, 4),
                                            dtype=np.float32)

        if is_e2e:
            # Find overlap between all foreground rois and the bounding boxes
            # enclosing each segmentation
            rois_fg = sampled_boxes[fg_inds]
            overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
                rois_fg.astype(np.float32, copy=False),
                boxes_from_polys.astype(np.float32, copy=False)
            )
            # Map from each fg rois to the index of the mask with highest
            # overlap (measured by bbox overlap)
            fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)
        else:
            # Copy so that the in-place scaling below cannot rescale the
            # caller's gt_boxes.
            rois_fg = gt_boxes.copy()
            fg_polys_inds = fg_inds

        # add fg targets
        for i in range(rois_fg.shape[0]):
            fg_polys_ind = fg_polys_inds[i]
            poly_gt = polys_gt[fg_polys_ind]
            # Column 9 of charboxes is matched against the polygon index; the
            # first nine columns are handed to the rasterizer.
            indexes_rec_rois_gt_chars = np.where(
                chars_gts[:, 9] == fg_polys_ind)
            chars_gt = chars_gts[indexes_rec_rois_gt_chars, :9]
            roi_fg = rois_fg[i]
            # Rasterize the portion of the polygon mask within the given fg
            # roi to an M_HEIGHT x M_WIDTH binary image
            if is_e2e:
                (mask, mask_weight, char_box,
                 char_box_inside_weight) = segm_utils.polys_to_mask_wrt_box_rec(
                    chars_gt.copy(), poly_gt, roi_fg.copy(), M_HEIGHT,
                    M_WIDTH, weight_wh=cfg.MRCNN.WEIGHT_WH)
                if DEBUG:
                    draw = ImageDraw.Draw(img)
                    draw.rectangle([(roi_fg[0], roi_fg[1]),
                                    (roi_fg[2], roi_fg[3])])
                    img.save('./tests/image.jpg')
                    _visu_global_map(mask[0, :, :].copy(),
                                     './tests/proposals_visu_global.jpg')
                    _visu_char_map(mask[1, :, :].copy(),
                                   './tests/proposals_visu_char.jpg')
                    _visu_char_box(char_box, char_box_inside_weight,
                                   './tests/char_box.jpg', M_HEIGHT, M_WIDTH)
            else:
                # Unpack the same four values as the end-to-end branch. The
                # former three-value unpack left mask_weights undefined, so
                # this path could never reach the blob update below.
                # NOTE(review): confirm polys_to_mask_wrt_box_rec returns
                # four values for this call as well.
                (mask, mask_weight, char_box,
                 char_box_inside_weight) = segm_utils.polys_to_mask_wrt_box_rec(
                    chars_gt, poly_gt, roi_fg, M_HEIGHT, M_WIDTH,
                    weight_wh=cfg.MRCNN.WEIGHT_WH)
                if DEBUG:
                    _visu_char_box(char_box, char_box_inside_weight,
                                   './tests/char_box.jpg', M_HEIGHT, M_WIDTH)
                mask = np.array(mask, dtype=np.int32)

            masks[i, 0, :] = np.reshape(mask[0, :, :], M_HEIGHT * M_WIDTH)
            masks[i, 1, :] = np.reshape(mask[1, :, :], M_HEIGHT * M_WIDTH)
            mask_weights[i, :] = np.reshape(mask_weight, M_HEIGHT * M_WIDTH)
            char_boxes[i, :, :] = np.reshape(char_box,
                                             (M_HEIGHT * M_WIDTH, 4))
            char_boxes_inside_weight[i, :, :] = np.reshape(
                char_box_inside_weight, (M_HEIGHT * M_WIDTH, 4))
    else:
        # If there are no fg masks (it does happen), fall back to a single
        # placeholder roi with ignore targets.
        (rois_fg, roi_has_mask, masks, mask_weights, char_boxes,
         char_boxes_inside_weight) = _charmask_fallback_targets(
            blobs, sampled_boxes, M_HEIGHT, M_WIDTH)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Outside weights are 1 wherever the inside weight is positive
    char_boxes_outside_weight = np.array(
        char_boxes_inside_weight > 0, dtype=char_boxes_inside_weight.dtype
    )

    # Update blobs dict with Mask R-CNN blobs
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_global_int32'] = masks[:, 0, :]
    blobs['masks_char_int32'] = masks[:, 1, :].reshape(
        (-1, M_HEIGHT, M_WIDTH))
    blobs['masks_char_weight'] = mask_weights
    blobs['char_bbox_targets'] = char_boxes.reshape((-1, 4))
    blobs['char_bbox_inside_weights'] = char_boxes_inside_weight.reshape(
        (-1, 4))
    blobs['char_bbox_outside_weights'] = char_boxes_outside_weight.reshape(
        (-1, 4))
def _sample_pairs(roidb, im_scale, batch_idx):
    """Sample foreground (and optionally background) subject-object pairs
    and build the relationship blobs for one image.

    Args:
        roidb: mapping providing ``'max_pair_overlaps'``,
            ``'max_prd_classes'``, ``'sbj_boxes'``, ``'obj_boxes'``,
            ``'width'`` and ``'height'``; ``'max_sbj_classes'``,
            ``'max_obj_classes'``, ``'max_sbj_overlaps'`` and
            ``'max_obj_overlaps'`` are read when the matching cfg.MODEL
            switches are on.
        im_scale: factor the box coordinates are multiplied by.
        batch_idx: value written into the first column of every roi row.

    Returns:
        dict of blobs. Always present: ``fg_prd_labels_int32``,
        ``all_prd_labels_int32``, ``fg_size``, ``sbj_rois``, ``obj_rois`` and
        ``rel_rois``. The remaining keys depend on ``cfg.MODEL.USE_SPATIAL_FEAT``,
        ``cfg.MODEL.USE_FREQ_BIAS`` and the node contrastive loss switches.
    """
    fg_pairs_per_image = cfg.TRAIN.FG_REL_SIZE_PER_IM
    pairs_per_image = int(
        cfg.TRAIN.FG_REL_SIZE_PER_IM /
        cfg.TRAIN.FG_REL_FRACTION)  # need much more pairs since it's quadratic
    max_pair_overlaps = roidb['max_pair_overlaps']

    # Pairs whose overlap is (numerically) 1 are treated as ground truth and
    # are always kept; only the remaining foreground pairs are subsampled.
    gt_pair_inds = np.where(max_pair_overlaps > 1.0 - 1e-4)[0]
    fg_pair_inds = np.where((max_pair_overlaps >= cfg.TRAIN.FG_THRESH) &
                            (max_pair_overlaps <= 1.0 - 1e-4))[0]

    fg_pairs_per_this_image = np.minimum(fg_pairs_per_image,
                                         gt_pair_inds.size + fg_pair_inds.size)
    # Sample foreground regions without replacement
    # NOTE(review): the requested size becomes negative when
    # gt_pair_inds.size exceeds fg_pairs_per_image while fg_pair_inds is
    # non-empty - confirm that this cannot happen for the configured budget.
    if fg_pair_inds.size > 0:
        fg_pair_inds = npr.choice(fg_pair_inds,
                                  size=(fg_pairs_per_this_image -
                                        gt_pair_inds.size),
                                  replace=False)
    fg_pair_inds = np.append(fg_pair_inds, gt_pair_inds)

    # Label is the class each RoI has max overlap with
    fg_prd_labels = roidb['max_prd_classes'][fg_pair_inds]
    blob_dict = dict(
        fg_prd_labels_int32=fg_prd_labels.astype(np.int32, copy=False))
    if cfg.MODEL.USE_BG:
        bg_pair_inds = np.where(
            (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]

        # Compute number of background RoIs to take from this image (guarding
        # against there being fewer than desired)
        bg_pairs_per_this_image = pairs_per_image - fg_pairs_per_this_image
        bg_pairs_per_this_image = np.minimum(bg_pairs_per_this_image,
                                             bg_pair_inds.size)
        # Sample background regions without replacement
        if bg_pair_inds.size > 0:
            bg_pair_inds = npr.choice(bg_pair_inds,
                                      size=bg_pairs_per_this_image,
                                      replace=False)
        # Foreground pairs come first; background pairs keep label 0.
        keep_pair_inds = np.append(fg_pair_inds, bg_pair_inds)
        all_prd_labels = np.zeros(keep_pair_inds.size, dtype=np.int32)
        all_prd_labels[:fg_pair_inds.
                       size] = fg_prd_labels + 1  # class should start from 1
    else:
        keep_pair_inds = fg_pair_inds
        all_prd_labels = fg_prd_labels
    blob_dict['all_prd_labels_int32'] = all_prd_labels.astype(np.int32,
                                                              copy=False)
    blob_dict['fg_size'] = np.array(
        [fg_pair_inds.size], dtype=np.int32
    )  # this is used to check if there is at least one fg to learn

    sampled_sbj_boxes = roidb['sbj_boxes'][keep_pair_inds]
    sampled_obj_boxes = roidb['obj_boxes'][keep_pair_inds]
    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_sbj_rois = sampled_sbj_boxes * im_scale
    sampled_obj_rois = sampled_obj_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (keep_pair_inds.shape[0], 1))
    sampled_sbj_rois = np.hstack((repeated_batch_idx, sampled_sbj_rois))
    sampled_obj_rois = np.hstack((repeated_batch_idx, sampled_obj_rois))
    blob_dict['sbj_rois'] = sampled_sbj_rois
    blob_dict['obj_rois'] = sampled_obj_rois
    sampled_rel_rois = box_utils_rel.rois_union(sampled_sbj_rois,
                                                sampled_obj_rois)
    blob_dict['rel_rois'] = sampled_rel_rois
    if cfg.MODEL.USE_SPATIAL_FEAT:
        # Spatial features are computed from the unscaled boxes.
        sampled_spt_feat = box_utils_rel.get_spt_features(
            sampled_sbj_boxes, sampled_obj_boxes, roidb['width'],
            roidb['height'])
        blob_dict['spt_feat'] = sampled_spt_feat
    if cfg.MODEL.USE_FREQ_BIAS:
        sbj_labels = roidb['max_sbj_classes'][keep_pair_inds]
        obj_labels = roidb['max_obj_classes'][keep_pair_inds]
        blob_dict['all_sbj_labels_int32'] = sbj_labels.astype(np.int32,
                                                              copy=False)
        blob_dict['all_obj_labels_int32'] = obj_labels.astype(np.int32,
                                                              copy=False)
    if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS:
        nodes_per_image = cfg.MODEL.NODE_SAMPLE_SIZE
        max_sbj_overlaps = roidb['max_sbj_overlaps']
        max_obj_overlaps = roidb['max_obj_overlaps']

        # sbj
        # Here a naturally existing assumption is, each positive sbj should
        # have at least one positive obj
        sbj_pos_pair_pos_inds = np.where(
            (max_pair_overlaps >= cfg.TRAIN.FG_THRESH))[0]
        sbj_pos_obj_pos_pair_neg_inds = np.where(
            (max_sbj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_obj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
        sbj_pos_obj_neg_pair_neg_inds = np.where(
            (max_sbj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_obj_overlaps < cfg.TRAIN.FG_THRESH)
            & (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
        # Each group is capped at nodes_per_image entries.
        if sbj_pos_pair_pos_inds.size > 0:
            sbj_pos_pair_pos_inds = npr.choice(
                sbj_pos_pair_pos_inds,
                size=int(min(nodes_per_image, sbj_pos_pair_pos_inds.size)),
                replace=False)
        if sbj_pos_obj_pos_pair_neg_inds.size > 0:
            sbj_pos_obj_pos_pair_neg_inds = npr.choice(
                sbj_pos_obj_pos_pair_neg_inds,
                size=int(
                    min(nodes_per_image, sbj_pos_obj_pos_pair_neg_inds.size)),
                replace=False)
        sbj_pos_pair_neg_inds = sbj_pos_obj_pos_pair_neg_inds
        # Negative-object pairs only fill what is left of the negative budget.
        if nodes_per_image - sbj_pos_obj_pos_pair_neg_inds.size > 0 and sbj_pos_obj_neg_pair_neg_inds.size > 0:
            sbj_pos_obj_neg_pair_neg_inds = npr.choice(
                sbj_pos_obj_neg_pair_neg_inds,
                size=int(
                    min(nodes_per_image - sbj_pos_obj_pos_pair_neg_inds.size,
                        sbj_pos_obj_neg_pair_neg_inds.size)),
                replace=False)
            sbj_pos_pair_neg_inds = np.append(sbj_pos_pair_neg_inds,
                                              sbj_pos_obj_neg_pair_neg_inds)
        # Positive pairs first, negative pairs after them.
        sbj_pos_inds = np.append(sbj_pos_pair_pos_inds, sbj_pos_pair_neg_inds)
        binary_labels_sbj_pos = np.zeros(sbj_pos_inds.size, dtype=np.int32)
        binary_labels_sbj_pos[:sbj_pos_pair_pos_inds.size] = 1
        blob_dict[
            'binary_labels_sbj_pos_int32'] = binary_labels_sbj_pos.astype(
                np.int32, copy=False)
        prd_pos_labels_sbj_pos = roidb['max_prd_classes'][
            sbj_pos_pair_pos_inds]
        prd_labels_sbj_pos = np.zeros(sbj_pos_inds.size, dtype=np.int32)
        prd_labels_sbj_pos[:sbj_pos_pair_pos_inds.
                           size] = prd_pos_labels_sbj_pos + 1
        blob_dict['prd_labels_sbj_pos_int32'] = prd_labels_sbj_pos.astype(
            np.int32, copy=False)
        sbj_labels_sbj_pos = roidb['max_sbj_classes'][sbj_pos_inds] + 1
        # 1. set all obj labels > 0
        obj_labels_sbj_pos = roidb['max_obj_classes'][sbj_pos_inds] + 1
        # 2. find those negative obj
        max_obj_overlaps_sbj_pos = roidb['max_obj_overlaps'][sbj_pos_inds]
        obj_neg_inds_sbj_pos = np.where(
            max_obj_overlaps_sbj_pos < cfg.TRAIN.FG_THRESH)[0]
        obj_labels_sbj_pos[obj_neg_inds_sbj_pos] = 0
        blob_dict['sbj_labels_sbj_pos_int32'] = sbj_labels_sbj_pos.astype(
            np.int32, copy=False)
        blob_dict['obj_labels_sbj_pos_int32'] = obj_labels_sbj_pos.astype(
            np.int32, copy=False)
        # this is for freq bias in RelDN
        blob_dict['sbj_labels_sbj_pos_fg_int32'] = roidb['max_sbj_classes'][
            sbj_pos_inds].astype(np.int32, copy=False)
        blob_dict['obj_labels_sbj_pos_fg_int32'] = roidb['max_obj_classes'][
            sbj_pos_inds].astype(np.int32, copy=False)

        sampled_sbj_boxes_sbj_pos = roidb['sbj_boxes'][sbj_pos_inds]
        sampled_obj_boxes_sbj_pos = roidb['obj_boxes'][sbj_pos_inds]
        # Scale rois and format as (batch_idx, x1, y1, x2, y2)
        sampled_sbj_rois_sbj_pos = sampled_sbj_boxes_sbj_pos * im_scale
        sampled_obj_rois_sbj_pos = sampled_obj_boxes_sbj_pos * im_scale
        repeated_batch_idx = batch_idx * blob_utils.ones(
            (sbj_pos_inds.shape[0], 1))
        sampled_sbj_rois_sbj_pos = np.hstack(
            (repeated_batch_idx, sampled_sbj_rois_sbj_pos))
        sampled_obj_rois_sbj_pos = np.hstack(
            (repeated_batch_idx, sampled_obj_rois_sbj_pos))
        blob_dict['sbj_rois_sbj_pos'] = sampled_sbj_rois_sbj_pos
        blob_dict['obj_rois_sbj_pos'] = sampled_obj_rois_sbj_pos
        sampled_rel_rois_sbj_pos = box_utils_rel.rois_union(
            sampled_sbj_rois_sbj_pos, sampled_obj_rois_sbj_pos)
        blob_dict['rel_rois_sbj_pos'] = sampled_rel_rois_sbj_pos
        # Index of the first occurrence of every distinct subject roi row,
        # plus the mapping from each row back to its distinct row.
        _, inds_unique_sbj_pos, inds_reverse_sbj_pos = np.unique(
            sampled_sbj_rois_sbj_pos,
            return_index=True,
            return_inverse=True,
            axis=0)
        assert inds_reverse_sbj_pos.shape[0] == sampled_sbj_rois_sbj_pos.shape[
            0]
        blob_dict['inds_unique_sbj_pos'] = inds_unique_sbj_pos
        blob_dict['inds_reverse_sbj_pos'] = inds_reverse_sbj_pos
        if cfg.MODEL.USE_SPATIAL_FEAT:
            sampled_spt_feat_sbj_pos = box_utils_rel.get_spt_features(
                sampled_sbj_boxes_sbj_pos, sampled_obj_boxes_sbj_pos,
                roidb['width'], roidb['height'])
            blob_dict['spt_feat_sbj_pos'] = sampled_spt_feat_sbj_pos

        # obj
        # Here a naturally existing assumption is, each positive obj should
        # have at least one positive sbj
        obj_pos_pair_pos_inds = np.where(
            (max_pair_overlaps >= cfg.TRAIN.FG_THRESH))[0]
        obj_pos_sbj_pos_pair_neg_inds = np.where(
            (max_obj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_sbj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
        obj_pos_sbj_neg_pair_neg_inds = np.where(
            (max_obj_overlaps >= cfg.TRAIN.FG_THRESH)
            & (max_sbj_overlaps < cfg.TRAIN.FG_THRESH)
            & (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
        # Each group is capped at nodes_per_image entries.
        if obj_pos_pair_pos_inds.size > 0:
            obj_pos_pair_pos_inds = npr.choice(
                obj_pos_pair_pos_inds,
                size=int(min(nodes_per_image, obj_pos_pair_pos_inds.size)),
                replace=False)
        if obj_pos_sbj_pos_pair_neg_inds.size > 0:
            obj_pos_sbj_pos_pair_neg_inds = npr.choice(
                obj_pos_sbj_pos_pair_neg_inds,
                size=int(
                    min(nodes_per_image, obj_pos_sbj_pos_pair_neg_inds.size)),
                replace=False)
        obj_pos_pair_neg_inds = obj_pos_sbj_pos_pair_neg_inds
        # Negative-subject pairs only fill what is left of the negative budget.
        if nodes_per_image - obj_pos_sbj_pos_pair_neg_inds.size > 0 and obj_pos_sbj_neg_pair_neg_inds.size:
            obj_pos_sbj_neg_pair_neg_inds = npr.choice(
                obj_pos_sbj_neg_pair_neg_inds,
                size=int(
                    min(nodes_per_image - obj_pos_sbj_pos_pair_neg_inds.size,
                        obj_pos_sbj_neg_pair_neg_inds.size)),
                replace=False)
            obj_pos_pair_neg_inds = np.append(obj_pos_pair_neg_inds,
                                              obj_pos_sbj_neg_pair_neg_inds)
        # Positive pairs first, negative pairs after them.
        obj_pos_inds = np.append(obj_pos_pair_pos_inds, obj_pos_pair_neg_inds)
        binary_labels_obj_pos = np.zeros(obj_pos_inds.size, dtype=np.int32)
        binary_labels_obj_pos[:obj_pos_pair_pos_inds.size] = 1
        blob_dict[
            'binary_labels_obj_pos_int32'] = binary_labels_obj_pos.astype(
                np.int32, copy=False)
        prd_pos_labels_obj_pos = roidb['max_prd_classes'][
            obj_pos_pair_pos_inds]
        prd_labels_obj_pos = np.zeros(obj_pos_inds.size, dtype=np.int32)
        prd_labels_obj_pos[:obj_pos_pair_pos_inds.
                           size] = prd_pos_labels_obj_pos + 1
        blob_dict['prd_labels_obj_pos_int32'] = prd_labels_obj_pos.astype(
            np.int32, copy=False)
        obj_labels_obj_pos = roidb['max_obj_classes'][obj_pos_inds] + 1
        # 1. set all sbj labels > 0
        sbj_labels_obj_pos = roidb['max_sbj_classes'][obj_pos_inds] + 1
        # 2. find those negative sbj
        max_sbj_overlaps_obj_pos = roidb['max_sbj_overlaps'][obj_pos_inds]
        sbj_neg_inds_obj_pos = np.where(
            max_sbj_overlaps_obj_pos < cfg.TRAIN.FG_THRESH)[0]
        sbj_labels_obj_pos[sbj_neg_inds_obj_pos] = 0
        blob_dict['sbj_labels_obj_pos_int32'] = sbj_labels_obj_pos.astype(
            np.int32, copy=False)
        blob_dict['obj_labels_obj_pos_int32'] = obj_labels_obj_pos.astype(
            np.int32, copy=False)
        # this is for freq bias in RelDN
        blob_dict['sbj_labels_obj_pos_fg_int32'] = roidb['max_sbj_classes'][
            obj_pos_inds].astype(np.int32, copy=False)
        blob_dict['obj_labels_obj_pos_fg_int32'] = roidb['max_obj_classes'][
            obj_pos_inds].astype(np.int32, copy=False)

        sampled_sbj_boxes_obj_pos = roidb['sbj_boxes'][obj_pos_inds]
        sampled_obj_boxes_obj_pos = roidb['obj_boxes'][obj_pos_inds]
        # Scale rois and format as (batch_idx, x1, y1, x2, y2)
        sampled_sbj_rois_obj_pos = sampled_sbj_boxes_obj_pos * im_scale
        sampled_obj_rois_obj_pos = sampled_obj_boxes_obj_pos * im_scale
        repeated_batch_idx = batch_idx * blob_utils.ones(
            (obj_pos_inds.shape[0], 1))
        sampled_sbj_rois_obj_pos = np.hstack(
            (repeated_batch_idx, sampled_sbj_rois_obj_pos))
        sampled_obj_rois_obj_pos = np.hstack(
            (repeated_batch_idx, sampled_obj_rois_obj_pos))
        blob_dict['sbj_rois_obj_pos'] = sampled_sbj_rois_obj_pos
        blob_dict['obj_rois_obj_pos'] = sampled_obj_rois_obj_pos
        sampled_rel_rois_obj_pos = box_utils_rel.rois_union(
            sampled_sbj_rois_obj_pos, sampled_obj_rois_obj_pos)
        blob_dict['rel_rois_obj_pos'] = sampled_rel_rois_obj_pos
        # Index of the first occurrence of every distinct object roi row,
        # plus the mapping from each row back to its distinct row.
        _, inds_unique_obj_pos, inds_reverse_obj_pos = np.unique(
            sampled_obj_rois_obj_pos,
            return_index=True,
            return_inverse=True,
            axis=0)
        assert inds_reverse_obj_pos.shape[0] == sampled_obj_rois_obj_pos.shape[
            0]
        blob_dict['inds_unique_obj_pos'] = inds_unique_obj_pos
        blob_dict['inds_reverse_obj_pos'] = inds_reverse_obj_pos
        if cfg.MODEL.USE_SPATIAL_FEAT:
            sampled_spt_feat_obj_pos = box_utils_rel.get_spt_features(
                sampled_sbj_boxes_obj_pos, sampled_obj_boxes_obj_pos,
                roidb['width'], roidb['height'])
            blob_dict['spt_feat_obj_pos'] = sampled_spt_feat_obj_pos

    return blob_dict
def _forward(self,
             data,
             im_info,
             dataset_name=None,
             roidb=None,
             use_gt_labels=False,
             include_feat=False,
             **rpn_kwargs):
    """Run the detection branch and the predicate branch on one batch.

    In training mode the returned dict holds ``'losses'`` and ``'metrics'``.
    Otherwise it holds the subject/object RoIs, labels and scores together
    with the predicate scores produced by ``self.RelDN``.

    Args:
        data: Input image batch, fed to ``self.Conv_Body`` and
            ``self.Prd_RCNN.Conv_Body`` (presumably a torch tensor, since
            ``get_device()`` is called on it -- TODO confirm).
        im_info: Per-image info; columns 0, 1 and 2 of
            ``im_info.data.numpy()`` are read here as height, width and
            scale of the first image.
        dataset_name: Serialized dataset name (decoded with
            ``blob_utils.deserialize``). When None, the first entry of
            ``cfg.TRAIN.DATASETS`` (training) or ``cfg.TEST.DATASETS``
            (testing) is used.
        roidb: Training: iterable of serialized roidb entries. Testing:
            None to use detections, or a mapping with ``'sbj_gt_boxes'`` and
            ``'obj_gt_boxes'`` (plus ``'sbj_gt_classes'`` /
            ``'obj_gt_classes'`` when ``use_gt_labels`` is set).
        use_gt_labels: Testing with ``roidb`` only. Take subject/object
            labels from ``roidb`` (scores set to 1) instead of the argmax of
            the box classifier.
        include_feat: Testing only. Also return ``'sbj_feat'``,
            ``'obj_feat'`` and ``'prd_feat'``.
        **rpn_kwargs: Updated in place with the ``rpn_cls_logits*`` /
            ``rpn_bbox_pred*`` entries of the RPN output and forwarded to
            ``rpn_heads.generic_rpn_losses``.

    Returns:
        dict: the collected losses/metrics (training) or predictions
        (testing), as described above.
    """
    im_data = data
    if self.training:
        # Each element is deserialized and its first item kept.
        roidb = list(map(lambda x: blob_utils.deserialize(x)[0], roidb))
    if dataset_name is not None:
        dataset_name = blob_utils.deserialize(dataset_name)
    else:
        dataset_name = cfg.TRAIN.DATASETS[0] if self.training else cfg.TEST.DATASETS[0]  # assuming only one dataset per run

    # NOTE(review): device_id is not read again in this method.
    device_id = im_data.get_device()

    return_dict = {}  # A dict to collect return variables

    blob_conv = self.Conv_Body(im_data)
    blob_conv_prd = self.Prd_RCNN.Conv_Body(im_data)

    if cfg.FPN.FPN_ON:
        # Retain only the blobs that will be used for RoI heads. `blob_conv` may include
        # extra blobs that are used for RPN proposals, but not for RoI heads.
        blob_conv = blob_conv[-self.num_roi_levels:]
        blob_conv_prd = blob_conv_prd[-self.num_roi_levels:]

    # NOTE(review): rpn_ret / cls_score / bbox_pred only exist when
    # cfg.TRAIN.USE_GT_BOXES is off, yet the test path with roidb=None
    # below reads them unconditionally -- confirm that combination is
    # never configured.
    if not cfg.TRAIN.USE_GT_BOXES:
        rpn_ret = self.RPN(blob_conv, im_info, roidb)

        if cfg.MODEL.SHARE_RES5 and self.training:
            # res5_feat is not used further in this method.
            box_feat, res5_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
        else:
            box_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
        cls_score, bbox_pred = self.Box_Outs(box_feat)

    # now go through the predicate branch
    use_relu = False if cfg.MODEL.NO_FC7_RELU else True
    if self.training:
        if cfg.TRAIN.USE_GT_BOXES:
            # we always feed one image per batch during training
            assert len(roidb) == 1
            im_scale = im_info.data.numpy()[:, 2][0]
            im_w = im_info.data.numpy()[:, 1][0]
            im_h = im_info.data.numpy()[:, 0][0]
            sbj_boxes = roidb[0]['sbj_gt_boxes']
            obj_boxes = roidb[0]['obj_gt_boxes']
            sbj_all_boxes = _augment_gt_boxes_by_perturbation(
                sbj_boxes, im_w, im_h)
            obj_all_boxes = _augment_gt_boxes_by_perturbation(
                obj_boxes, im_w, im_h)
            # Pool subject and object boxes and drop duplicate rows.
            det_all_boxes = np.vstack((sbj_all_boxes, obj_all_boxes))
            det_all_boxes = np.unique(det_all_boxes, axis=0)
            det_all_rois = det_all_boxes * im_scale
            # Prepend a batch-index column of zeros (single image).
            repeated_batch_idx = 0 * blob_utils.ones(
                (det_all_rois.shape[0], 1))
            det_all_rois = np.hstack((repeated_batch_idx, det_all_rois))
            rel_ret = self.RelPN(det_all_rois, None, None, im_info,
                                 dataset_name, roidb)
        else:
            # Only RoIs with a positive label are passed to RelPN.
            fg_inds = np.where(rpn_ret['labels_int32'] > 0)[0]
            det_rois = rpn_ret['rois'][fg_inds]
            det_labels = rpn_ret['labels_int32'][fg_inds]
            det_scores = F.softmax(cls_score[fg_inds], dim=1)
            rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info,
                                 dataset_name, roidb)
        sbj_feat = self.Box_Head(blob_conv,
                                 rel_ret,
                                 rois_name='sbj_rois',
                                 use_relu=use_relu)
        obj_feat = self.Box_Head(blob_conv,
                                 rel_ret,
                                 rois_name='obj_rois',
                                 use_relu=use_relu)
    else:
        if roidb is not None:
            # Testing on the boxes supplied by roidb.
            im_scale = im_info.data.numpy()[:, 2][0]
            im_w = im_info.data.numpy()[:, 1][0]
            im_h = im_info.data.numpy()[:, 0][0]
            sbj_boxes = roidb['sbj_gt_boxes']
            obj_boxes = roidb['obj_gt_boxes']
            sbj_rois = sbj_boxes * im_scale
            obj_rois = obj_boxes * im_scale
            # Prepend a batch-index column of zeros (single image).
            repeated_batch_idx = 0 * blob_utils.ones(
                (sbj_rois.shape[0], 1))
            sbj_rois = np.hstack((repeated_batch_idx, sbj_rois))
            obj_rois = np.hstack((repeated_batch_idx, obj_rois))
            rel_rois = box_utils.rois_union(sbj_rois, obj_rois)
            rel_ret = {}
            rel_ret['sbj_rois'] = sbj_rois
            rel_ret['obj_rois'] = obj_rois
            rel_ret['rel_rois'] = rel_rois
            if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
                lvl_min = cfg.FPN.ROI_MIN_LEVEL
                lvl_max = cfg.FPN.ROI_MAX_LEVEL
                rois_blob_names = ['sbj_rois', 'obj_rois', 'rel_rois']
                for rois_blob_name in rois_blob_names:
                    # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl>
                    target_lvls = fpn_utils.map_rois_to_fpn_levels(
                        rel_ret[rois_blob_name][:, 1:5], lvl_min, lvl_max)
                    fpn_utils.add_multilevel_roi_blobs(
                        rel_ret, rois_blob_name, rel_ret[rois_blob_name],
                        target_lvls, lvl_min, lvl_max)
            if use_gt_labels:
                sbj_labels = roidb['sbj_gt_classes']  # start from 0
                obj_labels = roidb['obj_gt_classes']  # start from 0
                sbj_scores = np.ones_like(sbj_labels, dtype=np.float32)
                obj_scores = np.ones_like(obj_labels, dtype=np.float32)
            else:
                sbj_det_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='sbj_rois',
                                             use_relu=True)
                sbj_cls_scores, _ = self.Box_Outs(sbj_det_feat)
                sbj_cls_scores = sbj_cls_scores.data.cpu().numpy()
                obj_det_feat = self.Box_Head(blob_conv,
                                             rel_ret,
                                             rois_name='obj_rois',
                                             use_relu=True)
                obj_cls_scores, _ = self.Box_Outs(obj_det_feat)
                obj_cls_scores = obj_cls_scores.data.cpu().numpy()
                # Column 0 is skipped, so the argmax is 0-based over the
                # remaining columns.
                sbj_labels = np.argmax(sbj_cls_scores[:, 1:], axis=1)
                obj_labels = np.argmax(obj_cls_scores[:, 1:], axis=1)
                sbj_scores = np.amax(sbj_cls_scores[:, 1:], axis=1)
                obj_scores = np.amax(obj_cls_scores[:, 1:], axis=1)
            rel_ret['sbj_scores'] = sbj_scores.astype(np.float32,
                                                      copy=False)
            rel_ret['obj_scores'] = obj_scores.astype(np.float32,
                                                      copy=False)
            rel_ret['sbj_labels'] = sbj_labels.astype(
                np.int32, copy=False) + 1  # need to start from 1
            rel_ret['obj_labels'] = obj_labels.astype(
                np.int32, copy=False) + 1  # need to start from 1
            # The 'all_*' entries keep the 0-based labels.
            rel_ret['all_sbj_labels_int32'] = sbj_labels.astype(np.int32,
                                                                copy=False)
            rel_ret['all_obj_labels_int32'] = obj_labels.astype(np.int32,
                                                                copy=False)
            sbj_feat = self.Box_Head(blob_conv,
                                     rel_ret,
                                     rois_name='sbj_rois',
                                     use_relu=use_relu)
            obj_feat = self.Box_Head(blob_conv,
                                     rel_ret,
                                     rois_name='obj_rois',
                                     use_relu=use_relu)
        else:
            # Testing on detections: lower the score threshold in steps of
            # 0.01 until RelPN returns at least one relationship RoI or the
            # threshold drops below zero.
            score_thresh = cfg.TEST.SCORE_THRESH
            while score_thresh >= -1e-06:  # a negative value very close to 0.0
                det_rois, det_labels, det_scores = \
                    self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                rel_ret = self.RelPN(det_rois, det_labels, det_scores,
                                     im_info, dataset_name, roidb)
                valid_len = len(rel_ret['rel_rois'])
                if valid_len > 0:
                    break
                logger.info(
                    'Got {} rel_rois when score_thresh={}, changing to {}'.
                    format(valid_len, score_thresh, score_thresh - 0.01))
                score_thresh -= 0.01
            # Features are computed once per detection and then gathered
            # for the subject / object of every pair.
            det_feat = self.Box_Head(blob_conv,
                                     rel_ret,
                                     rois_name='det_rois',
                                     use_relu=use_relu)
            sbj_feat = det_feat[rel_ret['sbj_inds']]
            obj_feat = det_feat[rel_ret['obj_inds']]

    rel_feat = self.Prd_RCNN.Box_Head(blob_conv_prd,
                                      rel_ret,
                                      rois_name='rel_rois',
                                      use_relu=use_relu)

    concat_feat = torch.cat((sbj_feat, rel_feat, obj_feat), dim=1)

    if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE or cfg.MODEL.USE_SEM_CONCAT:
        sbj_labels = rel_ret['all_sbj_labels_int32']
        obj_labels = rel_ret['all_obj_labels_int32']
    else:
        sbj_labels = None
        obj_labels = None

    # when MODEL.USE_SEM_CONCAT, memory runs out if the whole batch is fed
    # at once, so a big batch is fed to RelDN in chunks of at most gn_size
    # rows and the per-chunk outputs are concatenated.
    gn_size = 1000
    if cfg.MODEL.USE_SEM_CONCAT and concat_feat.shape[0] > gn_size:
        group = int(math.ceil(concat_feat.shape[0] / gn_size))
        prd_cls_scores = None
        sbj_cls_scores = None
        obj_cls_scores = None
        for i in range(group):
            end = int(min((i + 1) * gn_size, concat_feat.shape[0]))
            concat_feat_i = concat_feat[i * gn_size:end]
            sbj_labels_i = sbj_labels[
                i * gn_size:end] if sbj_labels is not None else None
            obj_labels_i = obj_labels[
                i * gn_size:end] if obj_labels is not None else None
            sbj_feat_i = sbj_feat[i * gn_size:end]
            obj_feat_i = obj_feat[i * gn_size:end]
            prd_cls_scores_i, sbj_cls_scores_i, obj_cls_scores_i = \
                self.RelDN(concat_feat_i, sbj_labels_i, obj_labels_i, sbj_feat_i, obj_feat_i)
            if prd_cls_scores is None:
                # First chunk: start the accumulators.
                prd_cls_scores = prd_cls_scores_i
                sbj_cls_scores = sbj_cls_scores_i
                obj_cls_scores = obj_cls_scores_i
            else:
                prd_cls_scores = torch.cat(
                    (prd_cls_scores, prd_cls_scores_i))
                # RelDN may return None for the sbj/obj scores; in that
                # case the accumulator is left untouched.
                sbj_cls_scores = torch.cat(
                    (sbj_cls_scores, sbj_cls_scores_i
                     )) if sbj_cls_scores_i is not None else sbj_cls_scores
                obj_cls_scores = torch.cat(
                    (obj_cls_scores, obj_cls_scores_i
                     )) if obj_cls_scores_i is not None else obj_cls_scores
    else:
        prd_cls_scores, sbj_cls_scores, obj_cls_scores = \
            self.RelDN(concat_feat, sbj_labels, obj_labels, sbj_feat, obj_feat)

    if self.training:
        return_dict['losses'] = {}
        return_dict['metrics'] = {}
        if not cfg.TRAIN.USE_GT_BOXES:
            # rpn loss
            rpn_kwargs.update(
                dict((k, rpn_ret[k]) for k in rpn_ret.keys()
                     if (k.startswith('rpn_cls_logits')
                         or k.startswith('rpn_bbox_pred'))))
            loss_rpn_cls, loss_rpn_bbox = rpn_heads.generic_rpn_losses(
                **rpn_kwargs)
            if cfg.FPN.FPN_ON:
                # One pair of RPN losses per FPN level.
                for i, lvl in enumerate(
                        range(cfg.FPN.RPN_MIN_LEVEL,
                              cfg.FPN.RPN_MAX_LEVEL + 1)):
                    return_dict['losses']['loss_rpn_cls_fpn%d' %
                                          lvl] = loss_rpn_cls[i]
                    return_dict['losses']['loss_rpn_bbox_fpn%d' %
                                          lvl] = loss_rpn_bbox[i]
            else:
                return_dict['losses']['loss_rpn_cls'] = loss_rpn_cls
                return_dict['losses']['loss_rpn_bbox'] = loss_rpn_bbox
            # bbox loss
            loss_cls, loss_bbox, accuracy_cls = fast_rcnn_heads.fast_rcnn_losses(
                cls_score, bbox_pred, rpn_ret['labels_int32'],
                rpn_ret['bbox_targets'], rpn_ret['bbox_inside_weights'],
                rpn_ret['bbox_outside_weights'])
            return_dict['losses']['loss_cls'] = loss_cls
            return_dict['losses']['loss_bbox'] = loss_bbox
            return_dict['metrics']['accuracy_cls'] = accuracy_cls
        # predicate classification loss
        loss_cls_prd, accuracy_cls_prd = reldn_heads.reldn_losses(
            prd_cls_scores,
            rel_ret['all_prd_labels_int32'],
            weight=self.prd_weights)
        return_dict['losses']['loss_cls_prd'] = loss_cls_prd
        return_dict['metrics']['accuracy_cls_prd'] = accuracy_cls_prd
        if cfg.MODEL.USE_SEPARATE_SO_SCORES:
            # Separate subject / object classification losses; both use
            # self.obj_weights.
            loss_cls_sbj, accuracy_cls_sbj = reldn_heads.reldn_losses(
                sbj_cls_scores,
                rel_ret['all_sbj_labels_int32'],
                weight=self.obj_weights)
            return_dict['losses']['loss_cls_sbj'] = loss_cls_sbj
            return_dict['metrics']['accuracy_cls_sbj'] = accuracy_cls_sbj
            loss_cls_obj, accuracy_cls_obj = reldn_heads.reldn_losses(
                obj_cls_scores,
                rel_ret['all_obj_labels_int32'],
                weight=self.obj_weights)
            return_dict['losses']['loss_cls_obj'] = loss_cls_obj
            return_dict['metrics']['accuracy_cls_obj'] = accuracy_cls_obj
        if cfg.TRAIN.HUBNESS:
            loss_hubness_prd = reldn_heads.add_hubness_loss(prd_cls_scores)
            loss_hubness_sbj = reldn_heads.add_hubness_loss(sbj_cls_scores)
            loss_hubness_obj = reldn_heads.add_hubness_loss(obj_cls_scores)
            return_dict['losses']['loss_hubness_prd'] = loss_hubness_prd
            return_dict['losses']['loss_hubness_sbj'] = loss_hubness_sbj
            return_dict['losses']['loss_hubness_obj'] = loss_hubness_obj
        # pytorch0.4 bug on gathering scalar(0-dim) tensors
        for k, v in return_dict['losses'].items():
            return_dict['losses'][k] = v.unsqueeze(0)
        for k, v in return_dict['metrics'].items():
            return_dict['metrics'][k] = v.unsqueeze(0)
    else:
        # Testing
        return_dict['sbj_rois'] = rel_ret['sbj_rois']
        return_dict['obj_rois'] = rel_ret['obj_rois']
        return_dict['sbj_labels'] = rel_ret['sbj_labels']
        return_dict['obj_labels'] = rel_ret['obj_labels']
        return_dict['sbj_scores'] = rel_ret['sbj_scores']
        return_dict['sbj_scores_out'] = sbj_cls_scores
        return_dict['obj_scores'] = rel_ret['obj_scores']
        return_dict['obj_scores_out'] = obj_cls_scores
        return_dict['prd_scores'] = prd_cls_scores
        if include_feat:
            return_dict['sbj_feat'] = sbj_feat
            return_dict['obj_feat'] = obj_feat
            return_dict['prd_feat'] = concat_feat

    return return_dict
def _forward(self,
             data,
             im_info,
             do_vis=False,
             dataset_name=None,
             roidb=None,
             use_gt_labels=False,
             **rpn_kwargs):
    """Run detection, relationship proposal and predicate scoring on a batch.

    In training mode the returned dict holds ``'losses'`` and ``'metrics'``
    (RPN, box, predicate and, when enabled, node-contrastive losses).
    Otherwise it holds subject/object RoIs, labels and scores plus the
    predicate score outputs of ``self.RelDN``.

    Args:
        data: Input image batch, fed to the conv bodies (presumably a torch
            tensor, since ``get_device()`` is called on it -- TODO confirm).
        im_info: Per-image info; columns 0, 1 and 2 of
            ``im_info.data.numpy()`` are read here as height, width and
            scale of the first image.
        do_vis: Testing only. Also return ``'blob_conv'`` and
            ``'blob_conv_prd'``.
        dataset_name: Serialized dataset name (decoded with
            ``blob_utils.deserialize``). When None, the first entry of
            ``cfg.TRAIN.DATASETS`` (training) or ``cfg.TEST.DATASETS``
            (testing) is used.
        roidb: Training: iterable of serialized roidb entries. Testing:
            None to use detections, or a mapping with ``'sbj_gt_boxes'`` and
            ``'obj_gt_boxes'`` (plus ``'sbj_gt_classes'`` /
            ``'obj_gt_classes'`` when ``use_gt_labels`` is set).
        use_gt_labels: Testing with ``roidb`` only. Take subject/object
            labels from ``roidb`` (scores set to 1) instead of the argmax of
            the box classifier.
        **rpn_kwargs: Updated in place with the ``rpn_cls_logits*`` /
            ``rpn_bbox_pred*`` entries of the RPN output and forwarded to
            ``rpn_heads.generic_rpn_losses``.

    Returns:
        dict: the collected losses/metrics (training) or predictions
        (testing), as described above.
    """
    im_data = data
    if self.training:
        # Each element is deserialized and its first item kept.
        roidb = list(map(lambda x: blob_utils.deserialize(x)[0], roidb))
    if dataset_name is not None:
        dataset_name = blob_utils.deserialize(dataset_name)
    else:
        dataset_name = cfg.TRAIN.DATASETS[0] if self.training else cfg.TEST.DATASETS[0]  # assuming only one dataset per run

    # NOTE(review): device_id is not read again in this method.
    device_id = im_data.get_device()

    return_dict = {}  # A dict to collect return variables

    blob_conv = self.Conv_Body(im_data)
    if not cfg.MODEL.USE_REL_PYRAMID:
        blob_conv_prd = self.Prd_RCNN.Conv_Body(im_data)

    rpn_ret = self.RPN(blob_conv, im_info, roidb)

    # NOTE(review): with USE_REL_PYRAMID on and FPN off, blob_conv_prd is
    # never assigned but is read further down -- confirm that combination
    # is never configured.
    if cfg.FPN.FPN_ON:
        # Retain only the blobs that will be used for RoI heads. `blob_conv` may include
        # extra blobs that are used for RPN proposals, but not for RoI heads.
        blob_conv = blob_conv[-self.num_roi_levels:]
        if not cfg.MODEL.USE_REL_PYRAMID:
            blob_conv_prd = blob_conv_prd[-self.num_roi_levels:]
        else:
            # The predicate feature maps are derived from the (trimmed)
            # detection feature maps.
            blob_conv_prd = self.RelPyramid(blob_conv)

    if cfg.MODEL.SHARE_RES5 and self.training:
        # res5_feat is not used further in this method.
        box_feat, res5_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
    else:
        box_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
    cls_score, bbox_pred = self.Box_Outs(box_feat)

    # now go through the predicate branch
    use_relu = False if cfg.MODEL.NO_FC7_RELU else True
    if self.training:
        # Only RoIs with a positive label are passed to RelPN.
        fg_inds = np.where(rpn_ret['labels_int32'] > 0)[0]
        det_rois = rpn_ret['rois'][fg_inds]
        det_labels = rpn_ret['labels_int32'][fg_inds]
        det_scores = F.softmax(cls_score[fg_inds], dim=1)
        rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info,
                             dataset_name, roidb)
        # With ADD_SO_SCORES, dedicated subject / object heads are used
        # instead of the shared box head.
        if cfg.MODEL.ADD_SO_SCORES:
            sbj_feat = self.S_Head(blob_conv,
                                   rel_ret,
                                   rois_name='sbj_rois',
                                   use_relu=use_relu)
            obj_feat = self.O_Head(blob_conv,
                                   rel_ret,
                                   rois_name='obj_rois',
                                   use_relu=use_relu)
        else:
            sbj_feat = self.Box_Head(blob_conv,
                                     rel_ret,
                                     rois_name='sbj_rois',
                                     use_relu=use_relu)
            obj_feat = self.Box_Head(blob_conv,
                                     rel_ret,
                                     rois_name='obj_rois',
                                     use_relu=use_relu)
        # Extra features for the '*_sbj_pos' and '*_obj_pos' RoI sets, read
        # by the contrastive losses in the loss section below.
        if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS:
            if cfg.MODEL.ADD_SO_SCORES:
                # sbj
                sbj_feat_sbj_pos = self.S_Head(
                    blob_conv,
                    rel_ret,
                    rois_name='sbj_rois_sbj_pos',
                    use_relu=use_relu)
                obj_feat_sbj_pos = self.O_Head(
                    blob_conv,
                    rel_ret,
                    rois_name='obj_rois_sbj_pos',
                    use_relu=use_relu)
                # obj
                sbj_feat_obj_pos = self.S_Head(
                    blob_conv,
                    rel_ret,
                    rois_name='sbj_rois_obj_pos',
                    use_relu=use_relu)
                obj_feat_obj_pos = self.O_Head(
                    blob_conv,
                    rel_ret,
                    rois_name='obj_rois_obj_pos',
                    use_relu=use_relu)
            else:
                # sbj
                sbj_feat_sbj_pos = self.Box_Head(
                    blob_conv,
                    rel_ret,
                    rois_name='sbj_rois_sbj_pos',
                    use_relu=use_relu)
                obj_feat_sbj_pos = self.Box_Head(
                    blob_conv,
                    rel_ret,
                    rois_name='obj_rois_sbj_pos',
                    use_relu=use_relu)
                # obj
                sbj_feat_obj_pos = self.Box_Head(
                    blob_conv,
                    rel_ret,
                    rois_name='sbj_rois_obj_pos',
                    use_relu=use_relu)
                obj_feat_obj_pos = self.Box_Head(
                    blob_conv,
                    rel_ret,
                    rois_name='obj_rois_obj_pos',
                    use_relu=use_relu)
    else:
        if roidb is not None:
            # Testing on the boxes supplied by roidb.
            im_scale = im_info.data.numpy()[:, 2][0]
            im_w = im_info.data.numpy()[:, 1][0]
            im_h = im_info.data.numpy()[:, 0][0]
            sbj_boxes = roidb['sbj_gt_boxes']
            obj_boxes = roidb['obj_gt_boxes']
            sbj_rois = sbj_boxes * im_scale
            obj_rois = obj_boxes * im_scale
            # Prepend a batch-index column of zeros (single image).
            repeated_batch_idx = 0 * blob_utils.ones(
                (sbj_rois.shape[0], 1))
            sbj_rois = np.hstack((repeated_batch_idx, sbj_rois))
            obj_rois = np.hstack((repeated_batch_idx, obj_rois))
            rel_rois = box_utils_rel.rois_union(sbj_rois, obj_rois)
            rel_ret = {}
            rel_ret['sbj_rois'] = sbj_rois
            rel_ret['obj_rois'] = obj_rois
            rel_ret['rel_rois'] = rel_rois
            if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
                lvl_min = cfg.FPN.ROI_MIN_LEVEL
                lvl_max = cfg.FPN.ROI_MAX_LEVEL
                rois_blob_names = ['sbj_rois', 'obj_rois', 'rel_rois']
                for rois_blob_name in rois_blob_names:
                    # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl>
                    target_lvls = fpn_utils.map_rois_to_fpn_levels(
                        rel_ret[rois_blob_name][:, 1:5], lvl_min, lvl_max)
                    fpn_utils.add_multilevel_roi_blobs(
                        rel_ret, rois_blob_name, rel_ret[rois_blob_name],
                        target_lvls, lvl_min, lvl_max)
            # The box classifier is run on the given boxes even when
            # use_gt_labels is set; its scores are only consumed in the
            # else-branch below.
            sbj_det_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='sbj_rois',
                                         use_relu=True)
            sbj_cls_scores, _ = self.Box_Outs(sbj_det_feat)
            sbj_cls_scores = sbj_cls_scores.data.cpu().numpy()
            obj_det_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='obj_rois',
                                         use_relu=True)
            obj_cls_scores, _ = self.Box_Outs(obj_det_feat)
            obj_cls_scores = obj_cls_scores.data.cpu().numpy()
            if use_gt_labels:
                sbj_labels = roidb['sbj_gt_classes']  # start from 0
                obj_labels = roidb['obj_gt_classes']  # start from 0
                sbj_scores = np.ones_like(sbj_labels, dtype=np.float32)
                obj_scores = np.ones_like(obj_labels, dtype=np.float32)
            else:
                # Column 0 is skipped, so the argmax is 0-based over the
                # remaining columns.
                sbj_labels = np.argmax(sbj_cls_scores[:, 1:], axis=1)
                obj_labels = np.argmax(obj_cls_scores[:, 1:], axis=1)
                sbj_scores = np.amax(sbj_cls_scores[:, 1:], axis=1)
                obj_scores = np.amax(obj_cls_scores[:, 1:], axis=1)
            rel_ret['sbj_scores'] = sbj_scores.astype(np.float32,
                                                      copy=False)
            rel_ret['obj_scores'] = obj_scores.astype(np.float32,
                                                      copy=False)
            rel_ret['sbj_labels'] = sbj_labels.astype(
                np.int32, copy=False) + 1  # need to start from 1
            rel_ret['obj_labels'] = obj_labels.astype(
                np.int32, copy=False) + 1  # need to start from 1
            # The 'all_*' entries keep the 0-based labels.
            rel_ret['all_sbj_labels_int32'] = sbj_labels.astype(np.int32,
                                                                copy=False)
            rel_ret['all_obj_labels_int32'] = obj_labels.astype(np.int32,
                                                                copy=False)
            if cfg.MODEL.USE_SPATIAL_FEAT:
                # Spatial features are computed from the unscaled boxes.
                spt_feat = box_utils_rel.get_spt_features(
                    sbj_boxes, obj_boxes, im_w, im_h)
                rel_ret['spt_feat'] = spt_feat
            if cfg.MODEL.ADD_SO_SCORES:
                sbj_feat = self.S_Head(blob_conv,
                                       rel_ret,
                                       rois_name='sbj_rois',
                                       use_relu=use_relu)
                obj_feat = self.O_Head(blob_conv,
                                       rel_ret,
                                       rois_name='obj_rois',
                                       use_relu=use_relu)
            else:
                sbj_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='sbj_rois',
                                         use_relu=use_relu)
                obj_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='obj_rois',
                                         use_relu=use_relu)
        else:
            # Testing on detections: lower the score threshold in steps of
            # 0.01 until RelPN returns at least one relationship RoI or the
            # threshold drops below zero.
            score_thresh = cfg.TEST.SCORE_THRESH
            while score_thresh >= -1e-06:  # a negative value very close to 0.0
                det_rois, det_labels, det_scores = \
                    self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                rel_ret = self.RelPN(det_rois, det_labels, det_scores,
                                     im_info, dataset_name, roidb)
                valid_len = len(rel_ret['rel_rois'])
                if valid_len > 0:
                    break
                logger.info(
                    'Got {} rel_rois when score_thresh={}, changing to {}'.
                    format(valid_len, score_thresh, score_thresh - 0.01))
                score_thresh -= 0.01
            # Features are computed once per detection and then gathered
            # for the subject / object of every pair.
            if cfg.MODEL.ADD_SO_SCORES:
                det_s_feat = self.S_Head(blob_conv,
                                         rel_ret,
                                         rois_name='det_rois',
                                         use_relu=use_relu)
                det_o_feat = self.O_Head(blob_conv,
                                         rel_ret,
                                         rois_name='det_rois',
                                         use_relu=use_relu)
                sbj_feat = det_s_feat[rel_ret['sbj_inds']]
                obj_feat = det_o_feat[rel_ret['obj_inds']]
            else:
                det_feat = self.Box_Head(blob_conv,
                                         rel_ret,
                                         rois_name='det_rois',
                                         use_relu=use_relu)
                sbj_feat = det_feat[rel_ret['sbj_inds']]
                obj_feat = det_feat[rel_ret['obj_inds']]

    rel_feat = self.Prd_RCNN.Box_Head(blob_conv_prd,
                                      rel_ret,
                                      rois_name='rel_rois',
                                      use_relu=use_relu)

    spo_feat = torch.cat((sbj_feat, rel_feat, obj_feat), dim=1)
    if cfg.MODEL.USE_SPATIAL_FEAT:
        spt_feat = rel_ret['spt_feat']
    else:
        spt_feat = None
    if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE:
        sbj_labels = rel_ret['all_sbj_labels_int32']
        obj_labels = rel_ret['all_obj_labels_int32']
    else:
        sbj_labels = None
        obj_labels = None

    # prd_scores is the visual scores. See reldn_heads.py
    prd_scores, prd_bias_scores, prd_spt_scores, ttl_cls_scores, sbj_cls_scores, obj_cls_scores = \
        self.RelDN(spo_feat, spt_feat, sbj_labels, obj_labels, sbj_feat, obj_feat)

    if self.training:
        return_dict['losses'] = {}
        return_dict['metrics'] = {}
        # rpn loss
        rpn_kwargs.update(
            dict((k, rpn_ret[k]) for k in rpn_ret.keys()
                 if (k.startswith('rpn_cls_logits')
                     or k.startswith('rpn_bbox_pred'))))
        loss_rpn_cls, loss_rpn_bbox = rpn_heads.generic_rpn_losses(
            **rpn_kwargs)
        if cfg.FPN.FPN_ON:
            # One pair of RPN losses per FPN level.
            for i, lvl in enumerate(
                    range(cfg.FPN.RPN_MIN_LEVEL,
                          cfg.FPN.RPN_MAX_LEVEL + 1)):
                return_dict['losses']['loss_rpn_cls_fpn%d' %
                                      lvl] = loss_rpn_cls[i]
                return_dict['losses']['loss_rpn_bbox_fpn%d' %
                                      lvl] = loss_rpn_bbox[i]
        else:
            return_dict['losses']['loss_rpn_cls'] = loss_rpn_cls
            return_dict['losses']['loss_rpn_bbox'] = loss_rpn_bbox
        # bbox loss
        loss_cls, loss_bbox, accuracy_cls = fast_rcnn_heads.fast_rcnn_losses(
            cls_score, bbox_pred, rpn_ret['labels_int32'],
            rpn_ret['bbox_targets'], rpn_ret['bbox_inside_weights'],
            rpn_ret['bbox_outside_weights'])
        return_dict['losses']['loss_cls'] = loss_cls
        return_dict['losses']['loss_bbox'] = loss_bbox
        return_dict['metrics']['accuracy_cls'] = accuracy_cls

        # The bias and spatial score streams get their own loss only when
        # the streams are not summed (ADD_SCORES_ALL off).
        if cfg.MODEL.USE_FREQ_BIAS and not cfg.MODEL.ADD_SCORES_ALL:
            loss_cls_bias, accuracy_cls_bias = reldn_heads.reldn_losses(
                prd_bias_scores, rel_ret['all_prd_labels_int32'])
            return_dict['losses']['loss_cls_bias'] = loss_cls_bias
            return_dict['metrics']['accuracy_cls_bias'] = accuracy_cls_bias
        if cfg.MODEL.USE_SPATIAL_FEAT and not cfg.MODEL.ADD_SCORES_ALL:
            loss_cls_spt, accuracy_cls_spt = reldn_heads.reldn_losses(
                prd_spt_scores, rel_ret['all_prd_labels_int32'])
            return_dict['losses']['loss_cls_spt'] = loss_cls_spt
            return_dict['metrics']['accuracy_cls_spt'] = accuracy_cls_spt
        if cfg.MODEL.ADD_SCORES_ALL:
            loss_cls_ttl, accuracy_cls_ttl = reldn_heads.reldn_losses(
                ttl_cls_scores, rel_ret['all_prd_labels_int32'])
            return_dict['losses']['loss_cls_ttl'] = loss_cls_ttl
            return_dict['metrics']['accuracy_cls_ttl'] = accuracy_cls_ttl
        else:
            loss_cls_prd, accuracy_cls_prd = reldn_heads.reldn_losses(
                prd_scores, rel_ret['all_prd_labels_int32'])
            return_dict['losses']['loss_cls_prd'] = loss_cls_prd
            return_dict['metrics']['accuracy_cls_prd'] = accuracy_cls_prd
        if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS:
            # sbj
            rel_feat_sbj_pos = self.Prd_RCNN.Box_Head(
                blob_conv_prd,
                rel_ret,
                rois_name='rel_rois_sbj_pos',
                use_relu=use_relu)
            spo_feat_sbj_pos = torch.cat(
                (sbj_feat_sbj_pos, rel_feat_sbj_pos, obj_feat_sbj_pos),
                dim=1)
            if cfg.MODEL.USE_SPATIAL_FEAT:
                spt_feat_sbj_pos = rel_ret['spt_feat_sbj_pos']
            else:
                spt_feat_sbj_pos = None
            if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE:
                sbj_labels_sbj_pos_fg = rel_ret[
                    'sbj_labels_sbj_pos_fg_int32']
                obj_labels_sbj_pos_fg = rel_ret[
                    'obj_labels_sbj_pos_fg_int32']
            else:
                sbj_labels_sbj_pos_fg = None
                obj_labels_sbj_pos_fg = None
            # Only the bias scores and the total scores are kept.
            _, prd_bias_scores_sbj_pos, _, ttl_cls_scores_sbj_pos, _, _ = \
                self.RelDN(spo_feat_sbj_pos, spt_feat_sbj_pos, sbj_labels_sbj_pos_fg, obj_labels_sbj_pos_fg, sbj_feat_sbj_pos, obj_feat_sbj_pos)
            # obj
            rel_feat_obj_pos = self.Prd_RCNN.Box_Head(
                blob_conv_prd,
                rel_ret,
                rois_name='rel_rois_obj_pos',
                use_relu=use_relu)
            spo_feat_obj_pos = torch.cat(
                (sbj_feat_obj_pos, rel_feat_obj_pos, obj_feat_obj_pos),
                dim=1)
            if cfg.MODEL.USE_SPATIAL_FEAT:
                spt_feat_obj_pos = rel_ret['spt_feat_obj_pos']
            else:
                spt_feat_obj_pos = None
            if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE:
                sbj_labels_obj_pos_fg = rel_ret[
                    'sbj_labels_obj_pos_fg_int32']
                obj_labels_obj_pos_fg = rel_ret[
                    'obj_labels_obj_pos_fg_int32']
            else:
                sbj_labels_obj_pos_fg = None
                obj_labels_obj_pos_fg = None
            # Only the bias scores and the total scores are kept.
            _, prd_bias_scores_obj_pos, _, ttl_cls_scores_obj_pos, _, _ = \
                self.RelDN(spo_feat_obj_pos, spt_feat_obj_pos, sbj_labels_obj_pos_fg, obj_labels_obj_pos_fg, sbj_feat_obj_pos, obj_feat_obj_pos)
            # Each enabled contrastive loss is scaled by its own cfg weight.
            if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS:
                loss_contrastive_sbj, loss_contrastive_obj = reldn_heads.reldn_contrastive_losses(
                    ttl_cls_scores_sbj_pos, ttl_cls_scores_obj_pos, rel_ret)
                return_dict['losses'][
                    'loss_contrastive_sbj'] = loss_contrastive_sbj * cfg.MODEL.NODE_CONTRASTIVE_WEIGHT
                return_dict['losses'][
                    'loss_contrastive_obj'] = loss_contrastive_obj * cfg.MODEL.NODE_CONTRASTIVE_WEIGHT
            if cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS:
                loss_so_contrastive_sbj, loss_so_contrastive_obj = reldn_heads.reldn_so_contrastive_losses(
                    ttl_cls_scores_sbj_pos, ttl_cls_scores_obj_pos, rel_ret)
                return_dict['losses'][
                    'loss_so_contrastive_sbj'] = loss_so_contrastive_sbj * cfg.MODEL.NODE_CONTRASTIVE_SO_AWARE_WEIGHT
                return_dict['losses'][
                    'loss_so_contrastive_obj'] = loss_so_contrastive_obj * cfg.MODEL.NODE_CONTRASTIVE_SO_AWARE_WEIGHT
            if cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS:
                loss_p_contrastive_sbj, loss_p_contrastive_obj = reldn_heads.reldn_p_contrastive_losses(
                    ttl_cls_scores_sbj_pos, ttl_cls_scores_obj_pos,
                    prd_bias_scores_sbj_pos, prd_bias_scores_obj_pos,
                    rel_ret)
                return_dict['losses'][
                    'loss_p_contrastive_sbj'] = loss_p_contrastive_sbj * cfg.MODEL.NODE_CONTRASTIVE_P_AWARE_WEIGHT
                return_dict['losses'][
                    'loss_p_contrastive_obj'] = loss_p_contrastive_obj * cfg.MODEL.NODE_CONTRASTIVE_P_AWARE_WEIGHT

        # pytorch0.4 bug on gathering scalar(0-dim) tensors
        for k, v in return_dict['losses'].items():
            return_dict['losses'][k] = v.unsqueeze(0)
        for k, v in return_dict['metrics'].items():
            return_dict['metrics'][k] = v.unsqueeze(0)
    else:
        # Testing
        return_dict['sbj_rois'] = rel_ret['sbj_rois']
        return_dict['obj_rois'] = rel_ret['obj_rois']
        return_dict['sbj_labels'] = rel_ret['sbj_labels']
        return_dict['obj_labels'] = rel_ret['obj_labels']
        return_dict['sbj_scores'] = rel_ret['sbj_scores']
        return_dict['obj_scores'] = rel_ret['obj_scores']
        return_dict['prd_scores'] = prd_scores
        if cfg.MODEL.USE_FREQ_BIAS:
            return_dict['prd_scores_bias'] = prd_bias_scores
        if cfg.MODEL.USE_SPATIAL_FEAT:
            return_dict['prd_scores_spt'] = prd_spt_scores
        if cfg.MODEL.ADD_SCORES_ALL:
            return_dict['prd_ttl_scores'] = ttl_cls_scores
        if do_vis:
            return_dict['blob_conv'] = blob_conv
            return_dict['blob_conv_prd'] = blob_conv_prd

    return return_dict
def _forward(self,
             data,
             im_info,
             do_vis=False,
             dataset_name=None,
             roidb=None,
             use_gt_labels=False,
             **rpn_kwargs):
    """Run the detector only and return its detections for one batch.

    The returned dict holds ``'det_rois'``, ``'det_dists'``,
    ``'det_scores'``, ``'blob_conv'``, ``'det_boxes_all'`` and
    ``'det_labels'``; ``'det_labels_gt'`` is added when training or when
    ``use_gt_labels`` is set. No losses are computed here.

    Args:
        data: Input image batch, fed to ``self.Conv_Body`` (presumably a
            torch tensor, since ``get_device()`` is called on it -- TODO
            confirm).
        im_info: Per-image info; ``im_info[i, 2]`` is used as the scale of
            image ``i`` when building ground-truth RoIs.
        do_vis: Not read in this method.
        dataset_name: Serialized dataset name; decoded here, but the result
            is not used afterwards.
        roidb: Training: iterable of serialized roidb entries, each
            providing ``'boxes'`` and ``'gt_classes'``. Testing: None to
            use detections.
        use_gt_labels: Adds ``'det_labels_gt'`` to the result outside of
            training (see the review note on the test-with-roidb branch).
        **rpn_kwargs: Not read in this method.

    Returns:
        dict: the detection outputs described above.
    """
    im_data = data
    if self.training:
        # if not isinstance(roidb[0], np.array):
        #     roidb = roidb[0]
        roidb = list(map(lambda x: blob_utils.deserialize(x)[0], roidb))  # only support one gpu
    if dataset_name is not None:
        dataset_name = blob_utils.deserialize(dataset_name)
    else:
        dataset_name = cfg.TRAIN.DATASETS[0] if self.training else cfg.TEST.DATASETS[0]  # assuming only one dataset per run

    # Used below to move det_dists to the device the input lives on.
    device_id = im_data.get_device()

    return_dict = {}  # A dict to collect return variables

    blob_conv = self.Conv_Body(im_data)
    # if not cfg.MODEL.USE_REL_PYRAMID:
    #     blob_conv_prd = self.Prd_RCNN.Conv_Body(im_data)

    if self.training:
        # Stack the scaled boxes of every image into one array of rows
        # (batch_idx, box...) and collect the matching class labels.
        gt_rois = np.empty((0, 5), dtype=np.float32)
        gt_classes = np.empty((0), dtype=np.int64)
        for i, r in enumerate(roidb):
            rois_i = r['boxes'] * im_info[i, 2]
            rois_i = np.hstack((i * blob_utils.ones((rois_i.shape[0], 1)), rois_i))
            gt_rois = np.append(gt_rois, rois_i, axis=0)
            gt_classes = np.append(gt_classes, r['gt_classes'], axis=0)

    # The RPN / box head only run when training or when testing without a
    # roidb.
    if self.training or roidb is None:
        rpn_ret = self.RPN(blob_conv, im_info, roidb)

    if cfg.FPN.FPN_ON:
        # Retain only the blobs that will be used for RoI heads. `blob_conv` may include
        # extra blobs that are used for RPN proposals, but not for RoI heads.
        blob_conv = blob_conv[-self.num_roi_levels:]
        # if not cfg.MODEL.USE_REL_PYRAMID:
        #     blob_conv_prd = blob_conv_prd[-self.num_roi_levels:]
        # else:
        #     blob_conv_prd = self.RelPyramid(blob_conv)

    if self.training or roidb is None:
        if cfg.MODEL.SHARE_RES5 and self.training:
            # res5_feat is not used further in this method.
            box_feat, res5_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
        else:
            box_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
        cls_score, bbox_pred = self.Box_Outs(box_feat)

    # now go through the predicate branch
    # NOTE(review): use_relu is not read again in this method.
    use_relu = False if cfg.MODEL.NO_FC7_RELU else True
    if self.training:
        score_thresh = cfg.TEST.SCORE_THRESH
        # In training the class scores are soft-maxed here before being
        # handed to prepare_det_rois.
        cls_score = F.softmax(cls_score, -1)
        # Lower the score threshold in steps of 0.01 until at least one
        # detection with non-zero area survives or the threshold drops
        # below zero.
        while score_thresh >= -1e-06:  # a negative value very close to 0.0
            det_rois, det_labels, det_scores, det_dists, det_boxes_all = \
                self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
            # Column 0 of det_rois is the batch index; drop boxes whose
            # width * height is not positive.
            real_area = (det_rois[:, 3] - det_rois[:, 1]) * (det_rois[:, 4] - det_rois[:, 2])
            non_zero_area_inds = np.where(real_area > 0)[0]
            det_rois = det_rois[non_zero_area_inds]
            det_labels = det_labels[non_zero_area_inds]
            det_scores = det_scores[non_zero_area_inds]
            det_dists = det_dists[non_zero_area_inds]
            det_boxes_all = det_boxes_all[non_zero_area_inds]
            # rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info, dataset_name, roidb)
            valid_len = len(det_rois)
            if valid_len > 0:
                break
            logger.info('Got {} det_rois when score_thresh={}, changing to {}'.format(
                valid_len, score_thresh, score_thresh - 0.01))
            score_thresh -= 0.01
        # NOTE(review): this empty list is overwritten two statements later.
        det_labels_gt = []
        # IoU between detections and gt boxes, zeroed for pairs that come
        # from different images (batch-index mismatch).
        ious = box_utils.bbox_overlaps(det_rois[:, 1:], gt_rois[:, 1:]) * \
            (det_rois[:, 0][:,None] == gt_rois[:, 0][None, :])
        # Each detection takes the class of its best-overlapping gt box,
        # or 0 when that overlap is below cfg.TRAIN.FG_THRESH.
        det_labels_gt = gt_classes[ious.argmax(-1)]
        det_labels_gt[ious.max(-1) < cfg.TRAIN.FG_THRESH] = 0
    else:
        if roidb is not None:
            # NOTE(review): this branch looks unfinished. gt_rois and
            # gt_classes are only bound when self.training is true,
            # 'det_rois' is not a key of fpn_ret, and det_rois / det_scores
            # (and det_labels unless use_gt_labels) are never assigned on
            # this path although they are read after the branch -- confirm
            # whether this mode is supported at all.
            # raise FError('not support this mode!')
            # assert len(roidb) == 1
            im_scale = im_info.data.numpy()[:, 2][0]
            im_w = im_info.data.numpy()[:, 1][0]
            im_h = im_info.data.numpy()[:, 0][0]

            fpn_ret = {'gt_rois': gt_rois}
            if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS:
                lvl_min = cfg.FPN.ROI_MIN_LEVEL
                lvl_max = cfg.FPN.ROI_MAX_LEVEL
                rois_blob_names = ['gt_rois']
                for rois_blob_name in rois_blob_names:
                    # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl>
                    target_lvls = fpn_utils.map_rois_to_fpn_levels(
                        fpn_ret[rois_blob_name][:, 1:5], lvl_min, lvl_max)
                    fpn_utils.add_multilevel_roi_blobs(
                        fpn_ret, rois_blob_name, fpn_ret[rois_blob_name], target_lvls,
                        lvl_min, lvl_max)
            det_feats = self.Box_Head(blob_conv, fpn_ret, rois_name='det_rois', use_relu=True)
            det_dists, _ = self.Box_Outs(det_feats)
            # NOTE(review): det_boxes_all is None here, so the
            # `det_boxes_all.shape` assertion after the branch cannot pass.
            det_boxes_all = None
            if use_gt_labels:
                det_labels_gt = gt_classes
                det_labels = gt_classes
        else:
            # Testing on detections: same threshold-lowering loop as in
            # training, but cls_score is passed without the extra softmax.
            score_thresh = cfg.TEST.SCORE_THRESH
            while score_thresh >= -1e-06:  # a negative value very close to 0.0
                det_rois, det_labels, det_scores, det_dists, det_boxes_all = \
                    self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                real_area = (det_rois[:, 3] - det_rois[:, 1]) * (det_rois[:, 4] - det_rois[:, 2])
                non_zero_area_inds = np.where(real_area > 0)[0]
                det_rois = det_rois[non_zero_area_inds]
                det_labels = det_labels[non_zero_area_inds]
                det_scores = det_scores[non_zero_area_inds]
                det_dists = det_dists[non_zero_area_inds]
                det_boxes_all = det_boxes_all[non_zero_area_inds]
                # rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info, dataset_name, roidb)
                valid_len = len(det_rois)
                if valid_len > 0:
                    break
                logger.info('Got {} det_rois when score_thresh={}, changing to {}'.format(
                    valid_len, score_thresh, score_thresh - 0.01))
                score_thresh -= 0.01

    return_dict['det_rois'] = det_rois
    num_rois = det_rois.shape[0]
    # det_dists may arrive as a numpy array; convert it to a float tensor
    # on the input's device.
    if not isinstance(det_dists, torch.Tensor):
        assert det_dists.shape[0] == num_rois
        det_dists = torch.from_numpy(det_dists).float().cuda(device_id)

    return_dict['det_dists'] = det_dists
    return_dict['det_scores'] = det_scores
    return_dict['blob_conv'] = blob_conv
    return_dict['det_boxes_all'] = det_boxes_all
    assert det_boxes_all.shape[0] == num_rois
    return_dict['det_labels'] = det_labels
    # return_dict['blob_conv_prd'] = blob_conv_prd
    if self.training or use_gt_labels:
        return_dict['det_labels_gt'] = det_labels_gt

    return return_dict
def _sample_rois(roidb, im_scale, batch_idx, stage=0):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Args:
        roidb: One roidb entry (a mapping). Reads ``'max_overlaps'``,
            ``'max_classes'``, ``'boxes'`` and, depending on the
            configuration, ``'gt_classes'``, ``'box_to_gt_ind_map'`` and
            ``'bbox_targets'``. The entry is never modified by this
            function.
        im_scale: Factor applied to the sampled boxes to obtain RoIs.
        batch_idx: Value written into the first column of every RoI row.
        stage: Each stage raises the foreground threshold and the upper
            background threshold by 0.1; it is also forwarded to
            ``_compute_targets`` and ``_expand_bbox_targets``.

    Returns:
        dict: ``labels_int32``, ``rois``, ``bbox_targets``,
        ``bbox_inside_weights`` and ``bbox_outside_weights``, plus whatever
        the mask / keypoint helpers add when those are enabled.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # SNIP
    if cfg.FAST_RCNN.SNIP:
        # The adjustments below depend on im_scale, so they must not be
        # written into the shared roidb entry: otherwise RoIs marked as
        # "ignore" at one scale would stay marked on every later call.
        # Work on a private copy and expose it through a shallow copy of
        # the entry so the helpers called further down see the same values.
        max_overlaps = max_overlaps.copy()
        roidb = dict(roidb)
        roidb['max_overlaps'] = max_overlaps

        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        box_to_gt_ind_map = roidb['box_to_gt_ind_map']
        for gt_ind in gt_inds:
            gt_box = roidb['boxes'][gt_ind]
            width = gt_box[2] - gt_box[0]
            height = gt_box[3] - gt_box[1]
            # Size of the gt box after rescaling (sqrt of its area).
            res = np.sqrt(width * height) * im_scale
            if not (cfg.FAST_RCNN.RES_LO < res <= cfg.FAST_RCNN.RES_HI):
                # The gt box is outside the valid size range: every RoI
                # assigned to it that overlaps more than SNIP_NEG_THRESH is
                # moved to SNIP_TARGET_THRESH.
                # create an exception (neither fg/bg)
                ids = np.where(box_to_gt_ind_map == gt_ind)[0]
                ignored = ids[max_overlaps[ids] > cfg.FAST_RCNN.SNIP_NEG_THRESH]
                max_overlaps[ignored] = cfg.FAST_RCNN.SNIP_TARGET_THRESH
        # The gt boxes themselves always keep full overlap.
        max_overlaps[gt_inds] = 1.0

    # Select foreground RoIs as those with overlap strictly above the
    # stage-adjusted FG_THRESH. (Presumably strict so that RoIs set to
    # SNIP_TARGET_THRESH can end up in neither set -- TODO confirm.)
    fg_inds = np.where(max_overlaps > cfg.TRAIN.FG_THRESH + stage * 0.1)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within
    # [BG_THRESH_LO, BG_THRESH_HI + 0.1 * stage)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI + stage * 0.1) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = min(rois_per_image - fg_rois_per_this_image,
                                 bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        # No precomputed targets: regress each sampled box towards the gt
        # box it is assigned to.
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels, stage)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets, stage)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :], stage)

    # Outside weights are 1 wherever the inside weights are positive.
    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(
        labels_int32=sampled_labels.astype(np.int32, copy=False),
        rois=sampled_rois,
        bbox_targets=bbox_targets,
        bbox_inside_weights=bbox_inside_weights,
        bbox_outside_weights=bbox_outside_weights)

    # Optionally add Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(blob_dict, sampled_boxes, roidb,
                                               im_scale, batch_idx)

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict
def _sample_rois(roidb, im_scale, batch_idx):
    """Draw a random fg/bg subset of an image's RoIs and build its blobs.

    When tagging is enabled (cfg.TEST.TAGGING or cfg.MODEL.TAGGING) the random
    selection is discarded and every box is kept, in roidb order, as
    foreground.

    Returns a dict holding 'labels_int32', 'rois', 'bbox_targets',
    'bbox_inside_weights' and 'bbox_outside_weights', extended by the
    Mask / Keypoint R-CNN helpers when those heads are switched on.
    """
    batch_size = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * batch_size))
    overlaps = roidb['max_overlaps']

    # Foreground candidates: overlap >= FG_THRESH, capped at the fg quota and
    # drawn without replacement.
    fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False)

    # Background candidates: overlap in [BG_THRESH_LO, BG_THRESH_HI); they
    # fill whatever is left of the batch, again without replacement.
    is_background = ((overlaps < cfg.TRAIN.BG_THRESH_HI) &
                     (overlaps >= cfg.TRAIN.BG_THRESH_LO))
    bg_inds = np.where(is_background)[0]
    bg_rois_per_this_image = np.minimum(
        batch_size - fg_rois_per_this_image, bg_inds.size)
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # Foreground picks first, background picks after them.
    keep_inds = np.append(fg_inds, bg_inds)

    if cfg.TEST.TAGGING or (cfg.MODEL.TAGGING):
        # Keep every box in its original order so that rois and labels_int32
        # are not shuffled.
        num_boxes = len(roidb['boxes'])
        keep_inds = np.arange(num_boxes)
        fg_rois_per_this_image = num_boxes
        bg_rois_per_this_image = 0
    if cfg.MODEL.TAGGING:
        assert bg_rois_per_this_image == 0

    # Each kept RoI takes the class it overlaps most; the background tail is
    # reset to class 0.
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' in roidb:
        raw_targets = roidb['bbox_targets'][keep_inds, :]
    else:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        raw_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels)
    bbox_targets, bbox_inside_weights = _expand_bbox_targets(raw_targets)

    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # rois are laid out as (batch_idx, x1, y1, x2, y2) in scaled coordinates.
    scaled_boxes = sampled_boxes * im_scale
    batch_col = batch_idx * blob_utils.ones((scaled_boxes.shape[0], 1))
    sampled_rois = np.hstack((batch_col, scaled_boxes))

    # Base Fast R-CNN blobs
    blob_dict = {
        'labels_int32': sampled_labels.astype(np.int32, copy=False),
        'rois': sampled_rois,
        'bbox_targets': bbox_targets,
        'bbox_inside_weights': bbox_inside_weights,
        'bbox_outside_weights': bbox_outside_weights,
    }

    # Optional Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(
            blob_dict, sampled_boxes, roidb, im_scale, batch_idx)

    # Optional Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict
def add_keypoint_rcnn_blobs(
        blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx):
    """Add keypoint blobs ('keypoint_rois', 'keypoint_locations_int32',
    'keypoint_weights') to `blobs`.

    Boxes may span several frames: the last box axis is read as consecutive
    groups of 4 coordinates and the last keypoint axis is split evenly across
    those groups; heatmap labels are computed per group and concatenated.
    """
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    gt_keypoints = roidb['gt_keypoints']
    all_boxes = roidb['boxes']
    roi_to_gt = roidb['box_to_gt_ind_map']

    # An RoI qualifies when at least one keypoint of its assigned ground
    # truth is both visible and inside the RoI.
    kp_of_roi = gt_inds[roi_to_gt]
    inside = _within_box(gt_keypoints[kp_of_roi, :, :], all_boxes)
    visible = gt_keypoints[kp_of_roi, 2, :] > 0
    is_visible = np.sum(np.logical_and(visible, inside), axis=1) > 0

    is_fg = roidb['max_overlaps'] >= cfg.TRAIN.FG_THRESH
    kp_fg_inds = np.where(np.logical_and(is_fg, is_visible))[0]

    # Subsample without replacement only when there are too many candidates.
    quota = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > quota:
        kp_fg_inds = np.random.choice(kp_fg_inds, size=quota, replace=False)
    # Fall back to the ground-truth boxes when nothing qualified.
    if kp_fg_inds.shape[0] == 0:
        kp_fg_inds = gt_inds

    sampled_fg_rois = all_boxes[kp_fg_inds]
    sampled_gt_pos = roi_to_gt[kp_fg_inds]

    num_keypoints = gt_keypoints.shape[-1]
    num_rois = len(sampled_fg_rois)
    # Rows without an assigned ground truth stay at -1.
    sampled_keypoints = -np.ones(
        (num_rois, gt_keypoints.shape[1], num_keypoints),
        dtype=gt_keypoints.dtype)
    for row, gt_pos in enumerate(sampled_gt_pos):
        if gt_pos >= 0:
            sampled_keypoints[row] = gt_keypoints[gt_inds[gt_pos]]

    time_dim = sampled_fg_rois.shape[-1] // 4
    per_frame_nkps = num_keypoints // time_dim
    per_frame = [
        keypoint_utils.keypoints_to_heatmap_labels(
            sampled_keypoints[
                ..., t * per_frame_nkps: (t + 1) * per_frame_nkps],
            sampled_fg_rois[..., t * 4: (t + 1) * 4])
        for t in range(time_dim)
    ]
    heats = np.concatenate([frame[0] for frame in per_frame], axis=-1)
    weights = np.concatenate([frame[1] for frame in per_frame], axis=-1)

    # Flatten to a single column: one row per (roi, keypoint, frame).
    flat_shape = (
        sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS * time_dim, 1)
    heats = heats.reshape(flat_shape)
    weights = weights.reshape(flat_shape)

    # Scale the rois and prepend the batch index column.
    sampled_fg_rois *= im_scale
    batch_col = batch_idx * blob_utils.ones((sampled_fg_rois.shape[0], 1))
    sampled_fg_rois = np.hstack((batch_col, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
def add_refine_global_mask_blobs(blobs, sampled_boxes, roidb, im_scale,
                                 batch_idx, data):
    """Add RefineNet Mask specific blobs to the input blob dictionary.

    Every foreground roi is paired with the ground-truth polygon whose
    bounding box overlaps it most; that polygon is rasterised over the whole
    scaled image and stored in the top-left corner of an out_h x out_w
    canvas.

    Args:
        blobs: blob dict; 'labels_int32' is read, and 'refined_mask_rois',
            'roi_has_refined_mask_int32' and 'refined_masks_int32' are
            written.
        sampled_boxes: boxes of the sampled rois, in unscaled image
            coordinates. Not modified.
        roidb: roidb entry of the image; reads 'gt_classes', 'is_crowd',
            'segms', 'height' and 'width'.
        im_scale: scale factor applied to the image.
        batch_idx: value written into the first column of the output rois.
        data: input blob; only data.shape[2] and data.shape[3] are read, as
            the input height and width.
    """
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    dst_scale = cfg.REFINENET.SPATIAL_SCALE
    polys_gt_inds = np.where(
        (roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0))[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    # Define size variables
    inp_h, inp_w = data.shape[2], data.shape[3]
    out_h, out_w = int(inp_h * dst_scale), int(inp_w * dst_scale)

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], out_h, out_w), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False))
        # Map from each fg rois to the index of the mask with highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # narrow scale and size
        scale = im_scale * dst_scale
        im_h, im_w = roidb['height'], roidb['width']
        im_label_h, im_label_w = int(im_h * scale), int(im_w * scale)

        # The rasterised mask depends only on the polygon, not on the roi, so
        # compute it once per ground-truth instance and reuse it.
        mask_cache = {}
        # add fg targets
        for i, fg_polys_ind in enumerate(fg_polys_inds):
            mask = mask_cache.get(fg_polys_ind)
            if mask is None:
                # Rasterize the polygon over the whole scaled image
                mask = segm_utils.polys_to_mask_scaled(
                    polys_gt[fg_polys_ind], im_h, im_w, scale)
                mask = np.array(mask > 0, dtype=np.int32)  # Ensure binary
                mask_cache[fg_polys_ind] = mask
            masks[i, 0:im_label_h, 0:im_label_w] = mask

        masks = np.reshape(masks, (-1, out_h * out_w))
    else:
        # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide a mask
        # We simply take the first bg roi, given it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # rois_fg is actually one background roi. Copy it: indexing with a
        # single integer returns a view, and the in-place scaling below would
        # otherwise rescale the caller's sampled_boxes.
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)).copy()
        # We give it an -1's blob (ignore label)
        masks = -blob_utils.ones((1, out_h * out_w), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(
            masks, mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update blobs dict with Refine-Net blobs
    blobs['refined_mask_rois'] = rois_fg
    blobs['roi_has_refined_mask_int32'] = roi_has_mask
    blobs['refined_masks_int32'] = masks
def _forward(self, data, im_info, do_vis=False, dataset_name=None, roidb=None, use_gt_labels=False, **rpn_kwargs):
    """Run detection, entity refinement and predicate classification.

    Args:
        data: image tensor fed to self.Conv_Body; its device id is used for
            every tensor created here.
        im_info: per-image info tensor. Column 2 is used as the image scale;
            columns 0 and 1 are read into im_h / im_w at test time.
        do_vis: at test time, also return the convolutional feature blobs.
        dataset_name: serialized dataset name; when None the first entry of
            cfg.TRAIN.DATASETS / cfg.TEST.DATASETS is used.
        roidb: in training, a sequence of serialized roidb entries (only
            entry 0 is used for ground truth); at test time either None or a
            single entry indexed directly by key.
        use_gt_labels: at test time, overwrite the refined labels of the
            selected entities with roidb['gt_classes'] and their scores
            with 1.
        **rpn_kwargs: forwarded to rpn_heads.generic_rpn_losses together
            with the RPN logits / bbox predictions.

    Returns:
        dict. In training it holds 'losses' and 'metrics' (every value
        unsqueezed to 1-dim). At test time it holds 'sbj_rois', 'obj_rois',
        'sbj_labels', 'obj_labels', 'sbj_scores', 'obj_scores',
        'prd_scores' and, with do_vis, 'blob_conv'.
    """
    im_data = data
    if self.training:
        roidb = list(map(lambda x: blob_utils.deserialize(x)[0], roidb))
    if dataset_name is not None:
        dataset_name = blob_utils.deserialize(dataset_name)
    else:
        dataset_name = cfg.TRAIN.DATASETS[
            0] if self.training else cfg.TEST.DATASETS[
                0]  # assuming only one dataset per run

    device_id = im_data.get_device()

    return_dict = {}  # A dict to collect return variables

    blob_conv = self.Conv_Body(im_data)
    if self.training:
        # Ground truth is taken from the first roidb entry only.
        # NOTE(review): presumably one image per forward call -- confirm.
        gt_rois = roidb[0]['boxes'] * im_info[0, 2].data.cpu().numpy()
        gt_classes = roidb[0]['gt_classes']
        sbj_gt_boxes = roidb[0]['sbj_gt_boxes']
        obj_gt_boxes = roidb[0]['obj_gt_boxes']

    rpn_ret = self.RPN(blob_conv, im_info, roidb)

    if cfg.FPN.FPN_ON:
        # Retain only the blobs that will be used for RoI heads. `blob_conv` may include
        # extra blobs that are used for RPN proposals, but not for RoI heads.
        blob_conv = blob_conv[-self.num_roi_levels:]

    if cfg.MODEL.SHARE_RES5 and self.training:
        # res5_feat is not used afterwards in this method.
        box_feat, res5_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
    else:
        box_feat = self.Box_Head(blob_conv, rpn_ret, use_relu=True)
    cls_score, bbox_pred = self.Box_Outs(box_feat)

    # now go through the predicate branch
    use_relu = False if cfg.MODEL.NO_FC7_RELU else True
    if self.training:
        # Relationship proposals are built from the foreground RPN rois.
        fg_inds = np.where(rpn_ret['labels_int32'] > 0)[0]
        det_rois = rpn_ret['rois'][fg_inds]
        det_labels = rpn_ret['labels_int32'][fg_inds]
        det_scores = F.softmax(cls_score[fg_inds], dim=1)
        rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info, dataset_name, roidb)
        # select_inds stays empty in training; the entity branch below is
        # entered through the `or self.training` part of its condition.
        select_inds = np.array([])
        # The entities being refined are the ground-truth boxes (batch
        # index 0 prepended).
        repeated_batch_idx = 0 * blob_utils.ones((gt_rois.shape[0], 1))
        select_rois = np.hstack((repeated_batch_idx, gt_rois))
        select_feat = self.detector_feature_map(blob_conv, select_rois, use_relu=True)
        select_dists, _ = self.Box_Outs(select_feat)
        select_dists = F.softmax(select_dists, -1)
        # Best non-background class; +1 undoes the offset of the [:, 1:] slice.
        select_labels = select_dists[:, 1:].max(-1)[1].data.cpu().numpy() + 1
        select_gt_labels = gt_classes

        sbj_feat = self.Box_Head_sg(blob_conv, rel_ret, rois_name='sbj_rois', use_relu=True)
        obj_feat = self.Box_Head_sg(blob_conv, rel_ret, rois_name='obj_rois', use_relu=True)

    else:
        if roidb is not None:
            # Test time with ground-truth boxes available.
            # NOTE(review): .data.numpy() is called without .cpu() here, so
            # im_info presumably lives on the CPU -- confirm.
            im_scale = im_info.data.numpy()[:, 2][0]
            im_w = im_info.data.numpy()[:, 1][0]
            im_h = im_info.data.numpy()[:, 0][0]
            gt_rois = roidb['boxes'] * im_scale

            sbj_boxes = roidb['sbj_gt_boxes']
            obj_boxes = roidb['obj_gt_boxes']
            sbj_rois = sbj_boxes * im_scale
            obj_rois = obj_boxes * im_scale
            repeated_batch_idx = 0 * blob_utils.ones(
                (sbj_rois.shape[0], 1))
            sbj_rois = np.hstack((repeated_batch_idx, sbj_rois))
            obj_rois = np.hstack((repeated_batch_idx, obj_rois))

            if gt_rois.size > 0:
                # Classify the ground-truth boxes and enumerate candidate
                # pairs over them with EdgePN.
                repeated_batch_idx = 0 * blob_utils.ones(
                    (gt_rois.shape[0], 1))
                select_rois = np.hstack((repeated_batch_idx, gt_rois))

                select_feat = self.detector_feature_map(blob_conv, select_rois, use_relu=True)
                select_dists, _ = self.Box_Outs(select_feat)
                select_labels = self.get_nms_preds(select_dists, select_rois, softmax=False)
                select_inds = np.arange(0, select_labels.shape[0]).astype(
                    np.int64)

                rel_ret = self.EdgePN(select_rois, select_labels, select_dists, im_info, dataset_name, None)
                det_feat_sg = self.Box_Head_sg(blob_conv, rel_ret, rois_name='det_rois', use_relu=True)

                det_labels = select_labels.copy()
                det_scores = select_dists[:, 1:].max(
                    -1)[0].data.cpu().numpy()
                # Keep a candidate pair only when some annotated relationship
                # overlaps both its subject and its object box with
                # IoU >= 0.5.
                min_ious = np.minimum(
                    box_utils.bbox_overlaps(
                        select_rois[:, 1:][rel_ret['sbj_inds']], sbj_rois[:, 1:]),
                    box_utils.bbox_overlaps(
                        select_rois[:, 1:][rel_ret['obj_inds']], obj_rois[:, 1:]))
                match_indices = np.where(min_ious.max(-1) >= 0.5)[0]
                rel_ret['sbj_inds'], rel_ret['obj_inds'], rel_ret['sbj_rois'], rel_ret['obj_rois'],\
                rel_ret['rel_rois'], rel_ret['sbj_labels'], rel_ret['obj_labels'], rel_ret['sbj_scores'], \
                rel_ret['obj_scores'] = rel_ret['sbj_inds'][match_indices], \
                    rel_ret['obj_inds'][match_indices], rel_ret['sbj_rois'][match_indices], \
                    rel_ret['obj_rois'][match_indices], rel_ret['rel_rois'][match_indices], \
                    rel_ret['sbj_labels'][match_indices], rel_ret['obj_labels'][match_indices], \
                    rel_ret['sbj_scores'][match_indices], rel_ret['obj_scores'][match_indices]

                sbj_feat = det_feat_sg[rel_ret['sbj_inds']]
                obj_feat = det_feat_sg[rel_ret['obj_inds']]

            else:
                # No ground-truth boxes: fall back to detections, lowering
                # the score threshold in 0.01 steps until RelPN yields at
                # least one relationship roi.
                # NOTE(review): this branch does not assign sbj_feat /
                # obj_feat, which are read further down -- confirm it is
                # reachable / intended.
                score_thresh = cfg.TEST.SCORE_THRESH
                while score_thresh >= -1e-06:  # a negative value very close to 0.0
                    det_rois, det_labels, det_scores = \
                        self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)
                    rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info, dataset_name, None)
                    valid_len = len(rel_ret['rel_rois'])
                    if valid_len > 0:
                        break
                    logger.info(
                        'Got {} rel_rois when score_thresh={}, changing to {}'
                        .format(valid_len, score_thresh, score_thresh - 0.01))
                    score_thresh -= 0.01
                det_feat = None
                # Detections taking part in any proposal, ordered by
                # descending score; only the 10 best are refined.
                vaild_inds = np.unique(
                    np.concatenate(
                        (rel_ret['sbj_inds'], rel_ret['obj_inds']), 0))
                vaild_sort_inds = vaild_inds[np.argsort(
                    -det_scores[vaild_inds])]

                select_inds = vaild_sort_inds[:10]
                select_rois = det_rois[select_inds]

                det_feat = self.Box_Head(blob_conv, rel_ret, rois_name='det_rois', use_relu=True)
                det_dists, _ = self.Box_Outs(det_feat)
                select_dists = det_dists[select_inds]
                select_labels = det_labels[select_inds].copy()
        else:
            # Test time without roidb: same threshold-lowering search.
            score_thresh = cfg.TEST.SCORE_THRESH
            while score_thresh >= -1e-06:  # a negative value very close to 0.0
                det_rois, det_labels, det_scores = \
                    self.prepare_det_rois(rpn_ret['rois'], cls_score, bbox_pred, im_info, score_thresh)

                rel_ret = self.RelPN(det_rois, det_labels, det_scores, im_info, dataset_name, roidb)
                valid_len = len(rel_ret['rel_rois'])
                if valid_len > 0:
                    break
                logger.info(
                    'Got {} rel_rois when score_thresh={}, changing to {}'.
                    format(valid_len, score_thresh, score_thresh - 0.01))
                score_thresh -= 0.01
            det_feat = None
            # All detections taking part in a proposal are refined here,
            # ordered by descending score.
            vaild_inds = np.unique(
                np.concatenate((rel_ret['sbj_inds'], rel_ret['obj_inds']), 0))
            vaild_sort_inds = vaild_inds[np.argsort(
                -det_scores[vaild_inds])]

            select_inds = vaild_sort_inds
            select_rois = det_rois[select_inds]

            det_feat_sg = self.Box_Head_sg(blob_conv, rel_ret, rois_name='det_rois', use_relu=True)
            sbj_feat = det_feat_sg[rel_ret['sbj_inds']]
            obj_feat = det_feat_sg[rel_ret['obj_inds']]

            # det_feat was set to None just above, so this always runs.
            if det_feat is None:
                det_feat = self.Box_Head(blob_conv, rel_ret, rois_name='det_rois', use_relu=True)
            det_dists, _ = self.Box_Outs(det_feat)
            select_dists = det_dists[select_inds]
            select_labels = det_labels[select_inds].copy()

    # Entity refinement: always in training, at test time only with more
    # than two selected entities.
    if select_inds.size > 2 or self.training:
        # Entity features use detached inputs, so this call does not
        # back-propagate into blob_conv / select_dists.
        entity_fmap = self.obj_feature_map(blob_conv.detach(), select_rois, use_relu=True)
        entity_feat0 = self.merge_obj_feats(entity_fmap, select_rois, select_dists.detach(), im_info)
        edge_ret = self.EdgePN(select_rois, select_labels, select_dists, im_info, dataset_name, None)
        edge_feat = self.get_phr_feats(
            self.visual_rep(blob_conv, edge_ret, device_id, use_relu=use_relu))
        # One row per edge: (image index, subject index, object index).
        edge_inds = np.stack((edge_ret['sbj_rois'][:, 0].astype(edge_ret['sbj_inds'].dtype), \
                              edge_ret['sbj_inds'], edge_ret['obj_inds']), -1)

        im_inds = select_rois[:, 0].astype(edge_inds.dtype)
        # Two passes over the entity features with the edge features.
        entity_feat = self.obj_mps1(entity_feat0, edge_feat, im_inds, edge_inds)
        entity_feat = self.obj_mps2(entity_feat, edge_feat, im_inds, edge_inds)
        entity_cls_score = self.ObjClassifier(entity_feat)

        if not self.training:
            # Write the refined labels / scores back for the selected
            # entities.
            select_labels_pred = self.get_nms_preds(
                entity_cls_score, select_rois)
            det_labels[select_inds] = select_labels_pred
            if use_gt_labels:
                # NOTE(review): needs roidb to be non-None -- confirm callers.
                det_labels[select_inds] = roidb['gt_classes']
            # Flat index of (row, predicted class) into the flattened softmax.
            select_twod_inds = np.arange(0, select_labels_pred.shape[
                0]) * cfg.MODEL.NUM_CLASSES + select_labels_pred
            select_scores = F.softmax(
                entity_cls_score, -1).view(-1)[select_twod_inds].data.cpu().numpy()
            det_scores[select_inds] = select_scores
            if use_gt_labels:
                det_scores[select_inds] = np.ones_like(select_scores)

    rel_feat = self.visual_rep(blob_conv, rel_ret, device_id, use_relu=use_relu)

    if not self.training:
        sbj_labels = det_labels[rel_ret['sbj_inds']]
        obj_labels = det_labels[rel_ret['obj_inds']]
        rel_ret['sbj_labels'] = det_labels[rel_ret['sbj_inds']]
        rel_ret['obj_labels'] = det_labels[rel_ret['obj_inds']]
        rel_ret['sbj_scores'] = det_scores[rel_ret['sbj_inds']]
        rel_ret['obj_scores'] = det_scores[rel_ret['obj_inds']]
    else:
        # The training labels are shifted by +1 here; the frequency bias
        # below subtracts 1 again.
        sbj_labels = rel_ret['all_sbj_labels_int32'] + 1
        obj_labels = rel_ret['all_obj_labels_int32'] + 1

    # Label embeddings and positional features of subject and object.
    sbj_embed = self.ori_embed[sbj_labels].clone().cuda(device_id)
    obj_embed = self.ori_embed[obj_labels].clone().cuda(device_id)
    sbj_pos = torch.from_numpy(
        self.get_obj_pos(rel_ret['sbj_rois'], im_info)).float().cuda(device_id)
    obj_pos = torch.from_numpy(
        self.get_obj_pos(rel_ret['obj_rois'], im_info)).float().cuda(device_id)

    # Element-wise product of the mapped subject and object representations
    # gates the relationship feature before it is compressed to scores.
    prod = self.sbj_map(torch.cat(
        (sbj_feat, sbj_embed, sbj_pos), -1)) * self.obj_map(
            torch.cat((obj_feat, obj_embed, obj_pos), -1))

    prd_scores = self.rel_compress(rel_feat * prod)

    if cfg.MODEL.USE_FREQ_BIAS:
        sbj_labels = torch.from_numpy(sbj_labels).long().cuda(device_id)
        obj_labels = torch.from_numpy(obj_labels).long().cuda(device_id)

        prd_bias_scores = self.freq_bias.rel_index_with_labels(
            torch.stack((sbj_labels - 1, obj_labels - 1), 1))

        prd_scores += prd_bias_scores

    if not self.training:
        prd_scores = F.softmax(prd_scores, -1)

    if self.training:
        return_dict['losses'] = {}
        return_dict['metrics'] = {}
        imp_gamma = get_importance_factor(select_rois, sbj_gt_boxes, obj_gt_boxes, im_info)
        # rpn loss
        rpn_kwargs.update(
            dict((k, rpn_ret[k]) for k in rpn_ret.keys()
                 if (k.startswith('rpn_cls_logits')
                     or k.startswith('rpn_bbox_pred'))))
        loss_rpn_cls, loss_rpn_bbox = rpn_heads.generic_rpn_losses(
            **rpn_kwargs)
        if cfg.FPN.FPN_ON:
            # One loss entry per FPN level.
            for i, lvl in enumerate(
                    range(cfg.FPN.RPN_MIN_LEVEL,
                          cfg.FPN.RPN_MAX_LEVEL + 1)):
                return_dict['losses']['loss_rpn_cls_fpn%d' %
                                      lvl] = loss_rpn_cls[i]
                return_dict['losses']['loss_rpn_bbox_fpn%d' %
                                      lvl] = loss_rpn_bbox[i]
        else:
            return_dict['losses']['loss_rpn_cls'] = loss_rpn_cls
            return_dict['losses']['loss_rpn_bbox'] = loss_rpn_bbox
        # bbox loss
        loss_cls, loss_bbox, accuracy_cls = fast_rcnn_heads.fast_rcnn_losses(
            cls_score, bbox_pred, rpn_ret['labels_int32'],
            rpn_ret['bbox_targets'], rpn_ret['bbox_inside_weights'],
            rpn_ret['bbox_outside_weights'])
        return_dict['losses']['loss_cls'] = loss_cls
        return_dict['losses']['loss_bbox'] = loss_bbox
        return_dict['metrics']['accuracy_cls'] = accuracy_cls

        # predicate classification loss
        loss_cls_prd, accuracy_cls_prd = reldn_heads.reldn_losses(
            prd_scores, rel_ret['all_prd_labels_int32'])
        return_dict['losses']['loss_cls_prd'] = loss_cls_prd
        return_dict['metrics']['accuracy_cls_prd'] = accuracy_cls_prd

        # entity classification loss, computed with imp_gamma
        loss_cls_entity, accuracy_cls_entity = refine_obj_feats.entity_losses_imp(
            entity_cls_score, select_gt_labels, imp_gamma)
        return_dict['losses']['loss_cls_entity'] = loss_cls_entity
        return_dict['metrics']['accuracy_cls_entity'] = accuracy_cls_entity

        # pytorch0.4 bug on gathering scalar(0-dim) tensors
        for k, v in return_dict['losses'].items():
            return_dict['losses'][k] = v.unsqueeze(0)
        for k, v in return_dict['metrics'].items():
            return_dict['metrics'][k] = v.unsqueeze(0)
    else:
        # Testing
        return_dict['sbj_rois'] = rel_ret['sbj_rois']
        return_dict['obj_rois'] = rel_ret['obj_rois']
        return_dict['sbj_labels'] = rel_ret['sbj_labels']
        return_dict['obj_labels'] = rel_ret['obj_labels']
        return_dict['sbj_scores'] = rel_ret['sbj_scores']
        return_dict['obj_scores'] = rel_ret['obj_scores']
        return_dict['prd_scores'] = prd_scores
        if do_vis:
            return_dict['blob_conv'] = blob_conv

    return return_dict
def _sample_rois(roidb, im_scale, batch_idx):
    """Draw a random fg/bg subset of an image's RoIs and build its blobs.

    With cfg.TRAIN.JOINT_SELECTIVE_FG the batch composition depends on the
    dataset the image comes from: images of dataset 0 contribute foreground
    RoIs only, images of any other dataset get room for twice the usual
    number of background RoIs. Otherwise the default fg:bg split is used.

    Returns a dict holding the base Fast R-CNN blobs, plus 'dataset_id',
    'gt_scores' and 'gt_source' when the matching cfg.TRAIN options are on,
    and the Mask / Keypoint R-CNN blobs when those heads are enabled.
    """

    def _draw_foreground(candidates, quota):
        # Cap the quota at what is available and draw without replacement.
        n_take = np.minimum(quota, candidates.size)
        if candidates.size > 0:
            candidates = npr.choice(candidates, size=n_take, replace=False)
        return candidates, n_take

    max_overlaps = roidb['max_overlaps']
    # Foreground candidates: overlap >= FG_THRESH
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]

    if not cfg.TRAIN.JOINT_SELECTIVE_FG:
        # Default fg:bg rois sampling
        rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
        fg_rois_per_image = int(
            np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
        fg_inds, fg_rois_per_this_image = _draw_foreground(
            fg_inds, fg_rois_per_image)
    elif roidb['dataset_id'][0] == 0:
        # "dataset-0": the minibatch holds foreground rois only.
        print('Selective foreground sampling')
        fg_rois_per_image = int(
            np.round(cfg.TRAIN.FG_FRACTION * cfg.TRAIN.BATCH_SIZE_PER_IM))
        fg_inds, fg_rois_per_this_image = _draw_foreground(
            fg_inds, fg_rois_per_image)
        # A batch exactly as large as the fg draw leaves no room for bg.
        rois_per_image = fg_rois_per_this_image
    else:
        # "dataset-1": keep the usual fg quota but enlarge the batch so it
        # can take twice the usual number of bg rois.
        fg_rois_per_image = int(
            np.round(cfg.TRAIN.FG_FRACTION * cfg.TRAIN.BATCH_SIZE_PER_IM))
        bg_rois_per_image = int(
            cfg.TRAIN.BATCH_SIZE_PER_IM - fg_rois_per_image)
        rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM) + bg_rois_per_image
        fg_inds, fg_rois_per_this_image = _draw_foreground(
            fg_inds, fg_rois_per_image)

    # Background candidates: overlap in [BG_THRESH_LO, BG_THRESH_HI); they
    # fill the rest of the batch, without replacement.
    is_background = ((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                     (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))
    bg_inds = np.where(is_background)[0]
    bg_rois_per_this_image = np.minimum(
        rois_per_image - fg_rois_per_this_image, bg_inds.size)
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # Foreground picks first, background picks after them.
    keep_inds = np.append(fg_inds, bg_inds)

    # Each kept RoI takes the class it overlaps most; the background tail is
    # reset to class 0.
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0
    sampled_boxes = roidb['boxes'][keep_inds]

    use_gt_scores = cfg.TRAIN.GT_SCORES
    if use_gt_scores:
        # Soft labels: scores and sources are zeroed for the background tail.
        sampled_scores = roidb['max_scores'][keep_inds]
        sampled_gt_source = roidb['max_gt_source'][keep_inds]
        sampled_scores[fg_rois_per_this_image:] = 0
        sampled_gt_source[fg_rois_per_this_image:] = 0
        if roidb['dataset_id'][0] == 0:
            # sanity-check for the unlabeled dataset case (assumed
            # "dataset-0")
            assert all(
                (sampled_scores >= 0) == (sampled_labels >= 0))
            # TODO: Check >= instead of >
            assert (len(sampled_gt_source) == len(sampled_scores))

    if 'bbox_targets' in roidb:
        raw_targets = roidb['bbox_targets'][keep_inds, :]
    else:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        raw_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels)
    bbox_targets, bbox_inside_weights = _expand_bbox_targets(raw_targets)

    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # rois are laid out as (batch_idx, x1, y1, x2, y2) in scaled coordinates.
    scaled_boxes = sampled_boxes * im_scale
    batch_col = batch_idx * blob_utils.ones((scaled_boxes.shape[0], 1))
    sampled_rois = np.hstack((batch_col, scaled_boxes))

    # Base Fast R-CNN blobs
    blob_dict = {
        'labels_int32': sampled_labels.astype(np.int32, copy=False),
        'rois': sampled_rois,
        'bbox_targets': bbox_targets,
        'bbox_inside_weights': bbox_inside_weights,
        'bbox_outside_weights': bbox_outside_weights,
    }

    # Joint training: tag every roi with the dataset it came from.
    if cfg.TRAIN.JOINT_TRAINING:
        blob_dict['dataset_id'] = np.full_like(
            sampled_labels, roidb['dataset_id'][0], dtype=np.int32)

    # Soft labels
    if use_gt_scores:
        blob_dict['gt_scores'] = sampled_scores.astype(np.float32, copy=False)
        blob_dict['gt_source'] = sampled_gt_source.astype(
            np.int32, copy=False)
        blob_dict['dataset_id'] = np.full_like(
            sampled_scores, roidb['dataset_id'][0], dtype=np.int32)

    # Optional Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(
            blob_dict, sampled_boxes, roidb, im_scale, batch_idx)

    # Optional Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict
def sample_rois(roidb, im_scale, batch_idx, pos_iou):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Args:
        roidb: roidb entry for a single image. Reads 'max_overlaps',
            'max_classes', 'boxes', 'gt_classes', 'box_to_gt_ind_map' and,
            when present, 'bbox_targets'.
        im_scale: factor the sampled boxes are multiplied by to obtain rois.
        batch_idx: value written into the first column of every roi.
        pos_iou: overlap threshold; RoIs with max overlap >= pos_iou are
            foreground, RoIs in [0, pos_iou) are background.

    Returns:
        dict with 'labels_int32', 'rois', 'bbox_targets',
        'bbox_inside_weights' and 'bbox_outside_weights'.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    # 0.25 x 512 by default
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with >= pos_iou overlap
    fg_inds = np.where(max_overlaps >= pos_iou)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [0, pos_iou)
    bg_inds = np.where((max_overlaps < pos_iou) & (max_overlaps >= 0))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        if len(gt_inds) > 0:
            gt_boxes = roidb['boxes'][gt_inds, :]
            gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
            target_boxes = gt_boxes[gt_assignments, :]
        else:
            # All-negative image: there is no ground truth to regress to, so
            # use the sampled boxes themselves as dummy targets.
            target_boxes = sampled_boxes.copy()
        bbox_targets = compute_targets(
            sampled_boxes, target_boxes, sampled_labels)
        bbox_targets, bbox_inside_weights = expand_bbox_targets(bbox_targets)
    else:
        # Precomputed targets. The original author noted that this branch is
        # not expected to be taken; a leftover debugger breakpoint that would
        # have halted training here has been removed.
        bbox_targets, bbox_inside_weights = expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(
        labels_int32=sampled_labels.astype(np.int32, copy=False),
        rois=sampled_rois,
        bbox_targets=bbox_targets,
        bbox_inside_weights=bbox_inside_weights,
        bbox_outside_weights=bbox_outside_weights)

    return blob_dict
def _sample_rois(roidb, im_scale, batch_idx):
    """Pick a random mix of foreground and background RoIs for one image and
    assemble the corresponding Fast R-CNN training blobs.

    The result maps 'labels_int32', 'rois', 'bbox_targets',
    'bbox_inside_weights' and 'bbox_outside_weights' to arrays; the Mask and
    Keypoint R-CNN helpers append their own blobs when enabled in cfg.
    """
    train_cfg = cfg.TRAIN
    rois_per_image = int(train_cfg.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(train_cfg.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # --- foreground: overlap >= FG_THRESH, at most fg_rois_per_image of
    # them, drawn without replacement
    fg_inds = np.where(max_overlaps >= train_cfg.FG_THRESH)[0]
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size)
    if fg_inds.size > 0:
        fg_inds = npr.choice(
            fg_inds, size=fg_rois_per_this_image, replace=False)

    # --- background: overlap in [BG_THRESH_LO, BG_THRESH_HI), filling the
    # remainder of the batch, drawn without replacement
    below_hi = max_overlaps < train_cfg.BG_THRESH_HI
    at_least_lo = max_overlaps >= train_cfg.BG_THRESH_LO
    bg_inds = np.where(below_hi & at_least_lo)[0]
    bg_rois_per_this_image = np.minimum(
        rois_per_image - fg_rois_per_this_image, bg_inds.size)
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # Selected indices, foreground block followed by background block
    keep_inds = np.append(fg_inds, bg_inds)

    # Labels come from the max-overlap class; everything past the foreground
    # block is background (class 0)
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' in roidb:
        unexpanded_targets = roidb['bbox_targets'][keep_inds, :]
    else:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        unexpanded_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels)
    bbox_targets, bbox_inside_weights = _expand_bbox_targets(
        unexpanded_targets)

    # Outside weights are 1 wherever the inside weights are positive
    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # Scaled rois in (batch_idx, x1, y1, x2, y2) layout
    sampled_rois = sampled_boxes * im_scale
    sampled_rois = np.hstack((
        batch_idx * blob_utils.ones((sampled_rois.shape[0], 1)),
        sampled_rois,
    ))

    # Base Fast R-CNN blobs
    blob_dict = {
        'labels_int32': sampled_labels.astype(np.int32, copy=False),
        'rois': sampled_rois,
        'bbox_targets': bbox_targets,
        'bbox_inside_weights': bbox_inside_weights,
        'bbox_outside_weights': bbox_outside_weights,
    }

    # Mask R-CNN blobs, if that head is enabled
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(
            blob_dict, sampled_boxes, roidb, im_scale, batch_idx)

    # Keypoint R-CNN blobs, if that head is enabled
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict
def add_refine_local_mask_blobs(blobs, sampled_boxes, roidb, im_scale,
                                batch_idx, data):
    """Add RefineNet Mask specific blobs to the input blob dictionary.

    Every foreground roi (``labels_int32 > 0``) is paired with the gt polygon
    whose enclosing box overlaps it most. The roi is expanded by
    ``cfg.REFINENET.UP_SCALE``, clipped to the padded image, and the polygon
    is rasterized inside it to an ``M x M`` binary target.

    Args:
        blobs: blob dict; ``labels_int32`` is read, the RefineNet blobs are
            written in place.
        sampled_boxes: boxes of the sampled rois (unscaled coordinates).
        roidb: roidb entry; ``gt_classes``, ``is_crowd`` and ``segms`` are read.
        im_scale: scale between original image and network input.
        batch_idx: value written into the first column of every roi.
        data: input blob; ``shape[2]`` and ``shape[3]`` are used as the padded
            input height and width (assumes NCHW layout -- TODO confirm).

    Writes ``refined_mask_rois``, ``roi_has_refined_mask_int32`` and
    ``refined_masks_int32``, plus ``loss_weights`` when
    ``cfg.REFINENET.ASSIGN_LARGER_WEIGHT_FOR_CROWDED_SAMPLES`` is set.
    """
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    M = cfg.REFINENET.RESOLUTION
    up_scale = cfg.REFINENET.UP_SCALE
    polys_gt_inds = np.where((roidb['gt_classes'] > 0) &
                             (roidb['is_crowd'] == 0))[0]
    gt_classes = roidb['gt_classes'][polys_gt_inds]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)
    fg_inds = np.where(blobs['labels_int32'] > 0)[0]
    roi_has_mask = blobs['labels_int32'].copy()
    roi_has_mask[roi_has_mask > 0] = 1

    # Define size variables
    inp_h, inp_w = data.shape[2], data.shape[3]
    pad_img_h, pad_img_w = inp_h / im_scale, inp_w / im_scale

    def _rasterize(poly, box):
        # Rasterize the portion of the polygon mask within the given box to a
        # flattened, strictly binary M x M target.
        mask = segm_utils.polys_to_mask_wrt_box(poly, box, M)
        mask = np.array(mask > 0, dtype=np.int32)
        return np.reshape(mask, M**2)

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False))
        # Map from each fg rois to the index of the mask with highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # Expand the foreground rois by a factor of up_scale and
        # clip by the padded image boundary
        pad_rois_fg = box_utils.expand_boxes(rois_fg, up_scale)
        pad_rois_fg = box_utils.clip_boxes_to_image(pad_rois_fg, pad_img_h,
                                                    pad_img_w)

        # Default weight of 1 per roi. Defined on every path so the final
        # blobs['loss_weights'] assignment can never hit an unbound name.
        loss_weights = blob_utils.ones((rois_fg.shape[0], ))

        if cfg.REFINENET.ONLY_USE_CROWDED_SAMPLES:
            # Only use crowded samples to train the RefineNet
            THRES = cfg.REFINENET.OVERLAP_THRESHOLD
            for i in range(rois_fg.shape[0]):
                overlap = overlaps_bbfg_bbpolys[i]
                if np.sum(overlap > THRES) > 1:
                    # Multiple instances overlap this roi: use it for training
                    masks[i, :] = _rasterize(polys_gt[fg_polys_inds[i]],
                                             pad_rois_fg[i])
                else:
                    # Only one instance, then set label to be -1 (ignored)
                    masks[i, :] = -1
                    mask_class_labels[i] = 0
        elif cfg.REFINENET.ASSIGN_LARGER_WEIGHT_FOR_CROWDED_SAMPLES:
            for i in range(rois_fg.shape[0]):
                fg_polys_ind = fg_polys_inds[i]
                pad_roi_fg = pad_rois_fg[i]
                class_label = mask_class_labels[i]
                masks[i, :] = _rasterize(polys_gt[fg_polys_ind], pad_roi_fg)

                # Determine the weight of this roi: if any *other* instance of
                # the same class has pixels inside the padded roi, treat it as
                # a hard (crowded) sample and give it the larger weight.
                for j in range(len(polys_gt)):
                    if j == fg_polys_ind:
                        continue
                    if gt_classes[j] == class_label:
                        other_mask = segm_utils.polys_to_mask_wrt_box(
                            polys_gt[j], pad_roi_fg, M)
                        if np.sum(other_mask) > 0:
                            loss_weights[i] = cfg.REFINENET.WEIGHT_LOSS_CROWDED
                            break  # one crowded neighbour is enough
        else:
            # add fg targets
            for i in range(rois_fg.shape[0]):
                masks[i, :] = _rasterize(polys_gt[fg_polys_inds[i]],
                                         pad_rois_fg[i])
    else:
        # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide a mask
        # We simply take the first bg roi, given it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        # pad_rois_fg is actually one background roi here
        pad_rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1))
        # We give it an -1's blob (ignore label)
        masks = -blob_utils.ones((1, M**2), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1
        # One placeholder roi -> one placeholder weight
        loss_weights = blob_utils.ones((1, ))

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks,
                                                       mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    pad_rois_fg = (pad_rois_fg.astype(np.float32)) * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (pad_rois_fg.shape[0], 1))
    pad_rois_fg = np.hstack(
        (repeated_batch_idx, pad_rois_fg)).astype(np.int32)

    # Update blobs dict with Refine-Net blobs
    blobs['refined_mask_rois'] = pad_rois_fg
    blobs['roi_has_refined_mask_int32'] = roi_has_mask
    blobs['refined_masks_int32'] = masks

    if cfg.REFINENET.ASSIGN_LARGER_WEIGHT_FOR_CROWDED_SAMPLES:
        blobs['loss_weights'] = loss_weights
def _sample_rois_gan(roidb, im_scale, batch_idx, flags):
    """Sample fg/bg RoIs for GAN training and build the Fast R-CNN blobs.

    When ``cfg.GAN.AREA_THRESHOLD > 0`` the foreground rois are additionally
    restricted to those whose assigned gt box passes the area filter selected
    by ``flags`` (``fake_mode``, ``real_mode`` or ``real_fake_mode``). The
    batch size and fg fraction are chosen by ``flags.train_generator``,
    ``flags.train_discriminator`` or ``flags.train_pre``.

    Args:
        roidb: roidb entry of a single image (dict of arrays).
        im_scale: factor applied to the boxes to obtain network-input rois.
        batch_idx: value written into the first column of every roi.
        flags: ``ModeFlags`` instance selecting sample type and training phase.

    Returns:
        dict with ``labels_int32``, ``rois``, ``bbox_targets``,
        ``bbox_inside_weights`` and ``bbox_outside_weights``. When
        ``cfg.RPN.RPN_ON`` is false each array gets a leading axis of size 1.

    Raises:
        ValueError: if none of the training-phase flags is set.
    """
    assert isinstance(flags, ModeFlags)

    # gt_boxes and sample such that they fulfill threshold criterion
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    gt_boxes = roidb['boxes'][gt_inds, :]

    if cfg.DEBUG:
        logger.info("sample from {} gt boxes".format(len(gt_boxes)))
        areas_gt, _ = box_utils.boxes_area(gt_boxes)
        areas_gt = np.sqrt(areas_gt)
        print("gt-boxes: area_thres: {} vs areas: {}".format(
            cfg.GAN.AREA_THRESHOLD, areas_gt))

    gt_keep_inds = []
    if cfg.GAN.AREA_THRESHOLD > 0:
        # no scaling, as rois are scaled later
        area_thres = 1.0 * cfg.GAN.AREA_THRESHOLD * cfg.GAN.AREA_THRESHOLD
        if flags.fake_mode:
            # for fake samples: keep only samples with area < area-threshold
            gt_keep_inds = gt_inds[box_utils.filter_large_boxes_area(
                gt_boxes, max_area=area_thres)]
        elif flags.real_mode:
            # for real samples: keep only samples with area >= area-threshold
            gt_keep_inds = gt_inds[box_utils.filter_small_boxes_area(
                gt_boxes, min_area=area_thres)]
        elif flags.real_fake_mode:
            gt_keep_inds = gt_inds

    if flags.train_generator:
        rois_per_image = int(cfg.GAN.TRAIN.BATCH_SIZE_PER_IM_G)
        fg_rois_per_image = int(
            np.round(cfg.GAN.TRAIN.FG_FRACTION_G * rois_per_image))
    elif flags.train_discriminator:
        rois_per_image = int(cfg.GAN.TRAIN.BATCH_SIZE_PER_IM_D)
        fg_rois_per_image = int(
            np.round(cfg.GAN.TRAIN.FG_FRACTION_D * rois_per_image))
    elif flags.train_pre:
        rois_per_image = int(cfg.GAN.TRAIN.BATCH_SIZE_PER_IM_PRE)
        fg_rois_per_image = int(
            np.round(cfg.GAN.TRAIN.FG_FRACTION_PRE * rois_per_image))
    else:
        # Without this the sampling below would die on an unbound local.
        raise ValueError(
            "flags must set one of train_generator, train_discriminator "
            "or train_pre")

    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]

    # With an area threshold, keep only the rois whose assigned gt box is one
    # of gt_keep_inds. Vectorized membership test instead of a Python loop.
    if cfg.GAN.AREA_THRESHOLD > 0:
        fg_gt_inds = gt_inds[roidb['box_to_gt_ind_map'][fg_inds]]
        fg_inds = fg_inds[np.isin(fg_gt_inds, gt_keep_inds)]

        if cfg.DEBUG:
            fg_boxes = gt_boxes[
                gt_inds[roidb['box_to_gt_ind_map'][fg_inds]], :]
            areas_fg, _ = box_utils.boxes_area(fg_boxes)
            areas_fg = np.sqrt(areas_fg)
            print("fg-after: area_thres: {} vs areas: {}".format(
                cfg.GAN.AREA_THRESHOLD, areas_fg))

    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = np.minimum(fg_rois_per_image, int(fg_inds.size))
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(fg_inds,
                             size=fg_rois_per_this_image,
                             replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(bg_inds,
                             size=bg_rois_per_this_image,
                             replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(sampled_boxes,
                                        gt_boxes[gt_assignments, :],
                                        sampled_labels)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(bbox_inside_weights > 0,
                                    dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    if not cfg.RPN.RPN_ON:
        # FAST-RCNN training: the loader / minibatch code needs a leading
        # axis on every blob.
        sampled_rois = np.expand_dims(sampled_rois, axis=0)
        sampled_labels = np.expand_dims(sampled_labels, axis=0)
        bbox_targets = np.expand_dims(bbox_targets, axis=0)
        bbox_outside_weights = np.expand_dims(bbox_outside_weights, axis=0)
        bbox_inside_weights = np.expand_dims(bbox_inside_weights, axis=0)

    # Base Fast R-CNN blobs
    blob_dict = dict(labels_int32=sampled_labels.astype(np.int32, copy=False),
                     rois=sampled_rois,
                     bbox_targets=bbox_targets,
                     bbox_inside_weights=bbox_inside_weights,
                     bbox_outside_weights=bbox_outside_weights)

    return blob_dict
def _sample_rois_balance_sample(roidb, im_scale, batch_idx):
    """Sample fg/bg RoIs with a per-class balanced foreground selection.

    The foreground budget is split across the positive gt classes of the
    image. Classes are visited in ascending order of their gt-box count; each
    gets at most an equal share of the remaining budget, capped by its own
    gt-box count, so budget a rare class cannot use is passed on to the
    following classes. Shuffled fg candidates are then accepted until their
    class quota is full.

    Args:
        roidb: roidb entry of a single image (dict of arrays).
        im_scale: factor applied to the boxes to obtain network-input rois.
        batch_idx: value written into the first column of every roi.

    Returns:
        dict with ``labels_int32``, ``rois``, ``bbox_targets``,
        ``bbox_inside_weights`` and ``bbox_outside_weights``; the mask and
        keypoint helpers add their own blobs when enabled in ``cfg.MODEL``.
    """
    rois_per_image = int(cfg.TRAIN.BATCH_SIZE_PER_IM)
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))
    max_overlaps = roidb['max_overlaps']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs
    fg_rois_per_this_image = int(np.minimum(fg_rois_per_image, fg_inds.size))

    # ---- Balance Sample Start ----
    gt_label = roidb['gt_classes']
    gt_assignment = roidb['box_to_gt_ind_map']
    # Positive classes present in the image and how many gt boxes each has.
    # np.unique keeps the native integer width, so labels/counts above 255
    # no longer wrap around.
    gt_set, gt_counts = np.unique(gt_label[gt_label > 0], return_counts=True)

    # Hand out quotas starting with the rarest class so that unused budget
    # flows to the more frequent ones. An image without positive classes
    # simply yields no quotas (and therefore no division by zero).
    roi_max_num = {}
    fg_remain = fg_rois_per_this_image
    order = np.argsort(gt_counts)
    for rank, pos in enumerate(order):
        classes_left = len(order) - rank
        share = int(math.ceil(fg_remain / float(classes_left)))
        quota = min(int(gt_counts[pos]), share)
        roi_max_num[int(gt_set[pos])] = quota
        fg_remain -= quota

    sample_count = dict.fromkeys(roi_max_num, 0)
    picked = []
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        np.random.shuffle(fg_inds)
        for ind in fg_inds:
            # NOTE(review): indexes gt_classes directly with the gt
            # assignment, which presumes gt boxes lead the roidb -- confirm.
            label = int(gt_label[gt_assignment[ind]])
            # Rois whose class received no quota are skipped, not an error
            if sample_count.get(label, 0) < roi_max_num.get(label, 0):
                picked.append(ind)
                sample_count[label] += 1
    # Keep an integer dtype even when nothing was picked; a float index array
    # would break the fancy indexing below.
    fg_inds = np.asarray(picked, dtype=fg_inds.dtype)
    # The balanced selection may return fewer rois than budgeted. Use the
    # real count so that the bg budget and the label zeroing line up with
    # keep_inds.
    fg_rois_per_this_image = int(fg_inds.size)
    # ---- Balance Sample End ----

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size)
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
            bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Label is the class each RoI has max overlap with
    sampled_labels = roidb['max_classes'][keep_inds]
    sampled_labels[fg_rois_per_this_image:] = 0  # Label bg RoIs with class 0
    sampled_boxes = roidb['boxes'][keep_inds]

    if 'bbox_targets' not in roidb:
        gt_inds = np.where(roidb['gt_classes'] > 0)[0]
        gt_boxes = roidb['boxes'][gt_inds, :]
        gt_assignments = gt_inds[roidb['box_to_gt_ind_map'][keep_inds]]
        bbox_targets = _compute_targets(
            sampled_boxes, gt_boxes[gt_assignments, :], sampled_labels)
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_targets)
    else:
        bbox_targets, bbox_inside_weights = _expand_bbox_targets(
            roidb['bbox_targets'][keep_inds, :])

    bbox_outside_weights = np.array(
        bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_rois = sampled_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_rois.shape[0], 1))
    sampled_rois = np.hstack((repeated_batch_idx, sampled_rois))

    # Base Fast R-CNN blobs
    blob_dict = dict(
        labels_int32=sampled_labels.astype(np.int32, copy=False),
        rois=sampled_rois,
        bbox_targets=bbox_targets,
        bbox_inside_weights=bbox_inside_weights,
        bbox_outside_weights=bbox_outside_weights)

    # Optionally add Mask R-CNN blobs
    if cfg.MODEL.MASK_ON:
        roi_data.mask_rcnn.add_mask_rcnn_blobs(blob_dict, sampled_boxes,
                                               roidb, im_scale, batch_idx)

    # Optionally add Keypoint R-CNN blobs
    if cfg.MODEL.KEYPOINTS_ON:
        roi_data.keypoint_rcnn.add_keypoint_rcnn_blobs(
            blob_dict, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx)

    return blob_dict
def add_mask_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx):
    """Add Mask R-CNN specific blobs to the input blob dictionary.

    Each foreground roi is paired with the non-crowd gt polygon whose
    enclosing box overlaps it most, and that polygon is rasterized inside the
    roi to an ``M x M`` binary target (``M = cfg.MRCNN.RESOLUTION``). With
    ``cfg.TRAIN.MRCNN_FILTER_LABELS`` only rois whose label is listed in
    ``cfg.TRAIN.MRCNN_LABELS_TO_KEEP`` count as foreground for mask training.

    Args:
        blobs: blob dict; ``labels_int32`` is read and ``mask_rois``,
            ``roi_has_mask_int32`` and ``masks_int32`` are written in place.
        sampled_boxes: boxes of the sampled rois (unscaled); left unmodified.
        roidb: roidb entry; ``gt_classes``, ``is_crowd`` and ``segms`` are read.
        im_scale: factor applied to the boxes to obtain network-input rois.
        batch_idx: value written into the first column of every roi.
    """
    # Prepare the mask targets by associating one gt mask to each training roi
    # that has a fg (non-bg) class label.
    M = cfg.MRCNN.RESOLUTION
    polys_gt_inds = np.where(
        (roidb['gt_classes'] > 0) & (roidb['is_crowd'] == 0)
    )[0]
    polys_gt = [roidb['segms'][i] for i in polys_gt_inds]
    boxes_from_polys = segm_utils.polys_to_boxes(polys_gt)

    # Keep only a subset of classes (set A in the paper) for mask training
    if cfg.TRAIN.MRCNN_FILTER_LABELS:
        keep_label_set = set(cfg.TRAIN.MRCNN_LABELS_TO_KEEP)
        labels_int32 = blobs['labels_int32']
        labels_int32_keep = np.array(
            [(label if label in keep_label_set else 0)
             for label in labels_int32],
            dtype=labels_int32.dtype)
    else:
        labels_int32_keep = blobs['labels_int32']

    fg_inds = np.where(labels_int32_keep > 0)[0]
    roi_has_mask = labels_int32_keep.copy()
    roi_has_mask[roi_has_mask > 0] = 1

    if fg_inds.shape[0] > 0:
        # Class labels for the foreground rois
        mask_class_labels = blobs['labels_int32'][fg_inds]
        masks = blob_utils.zeros((fg_inds.shape[0], M**2), int32=True)

        # Find overlap between all foreground rois and the bounding boxes
        # enclosing each segmentation
        rois_fg = sampled_boxes[fg_inds]
        overlaps_bbfg_bbpolys = box_utils.bbox_overlaps(
            rois_fg.astype(np.float32, copy=False),
            boxes_from_polys.astype(np.float32, copy=False)
        )
        # Map from each fg rois to the index of the mask with highest overlap
        # (measured by bbox overlap)
        fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1)

        # add fg targets
        for i in range(rois_fg.shape[0]):
            fg_polys_ind = fg_polys_inds[i]
            poly_gt = polys_gt[fg_polys_ind]
            roi_fg = rois_fg[i]
            # Rasterize the portion of the polygon mask within the given fg
            # roi to an M x M binary image
            mask = segm_utils.polys_to_mask_wrt_box(poly_gt, roi_fg, M)
            mask = np.array(mask > 0, dtype=np.int32)  # Ensure it's binary
            masks[i, :] = np.reshape(mask, M**2)
    else:
        # If there are no fg masks (it does happen)
        # The network cannot handle empty blobs, so we must provide a mask
        # We simply take the first bg roi, given it an all -1's mask (ignore
        # label), and label it with class zero (bg).
        bg_inds = np.where(blobs['labels_int32'] == 0)[0]
        if bg_inds.size == 0:
            # With label filtering every roi may be a fg roi of a dropped
            # class; fall back to those instead of failing on bg_inds[0].
            bg_inds = np.where(labels_int32_keep == 0)[0]
        # rois_fg is actually one background roi here. Copy it: integer
        # indexing + reshape yields a view, and the in-place scaling below
        # would otherwise rewrite the caller's sampled_boxes.
        rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)).copy()
        # We give it an -1's blob (ignore label)
        masks = -blob_utils.ones((1, M**2), int32=True)
        # We label it with class = 0 (background)
        mask_class_labels = blob_utils.zeros((1, ))
        # Mark that the first roi has a mask
        roi_has_mask[0] = 1

    if cfg.MRCNN.CLS_SPECIFIC_MASK:
        masks = _expand_to_class_specific_mask_targets(masks,
                                                       mask_class_labels)

    # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2)
    rois_fg *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1))
    rois_fg = np.hstack((repeated_batch_idx, rois_fg))

    # Update blobs dict with Mask R-CNN blobs
    blobs['mask_rois'] = rois_fg
    blobs['roi_has_mask_int32'] = roi_has_mask
    blobs['masks_int32'] = masks
def _sample_pairs(roidb, im_scale, batch_idx):
    """Sample foreground and background subject/object pairs for one image.

    Pairs with ``max_pair_overlaps`` above ``1 - 1e-4`` are treated as gt
    pairs and are always kept. Further foreground pairs (overlap at least
    ``cfg.TRAIN.FG_THRESH``) are drawn without replacement to fill what is
    left of the foreground budget, and background pairs (overlap below
    ``cfg.TRAIN.BG_THRESH_HI``) fill the remainder of the batch.

    Args:
        roidb: roidb entry of a single image (dict of arrays).
        im_scale: factor applied to the boxes to obtain network-input rois.
        batch_idx: value written into the first column of every roi.

    Returns:
        dict with ``fg_prd_labels_int32``, ``all_prd_labels_int32``,
        ``fg_size``, ``sbj_rois``, ``obj_rois`` and ``rel_rois``; plus
        ``all_sbj_labels_int32`` / ``all_obj_labels_int32`` when
        ``cfg.MODEL.USE_FREQ_BIAS`` or ``cfg.MODEL.USE_SEPARATE_SO_SCORES``
        is set.
    """
    fg_pairs_per_image = cfg.TRAIN.FG_REL_SIZE_PER_IM
    # need much more pairs since it's quadratic
    pairs_per_image = int(
        cfg.TRAIN.FG_REL_SIZE_PER_IM / cfg.TRAIN.FG_REL_FRACTION)
    max_pair_overlaps = roidb['max_pair_overlaps']

    gt_pair_inds = np.where(max_pair_overlaps > 1.0 - 1e-4)[0]
    fg_pair_inds = np.where((max_pair_overlaps >= cfg.TRAIN.FG_THRESH) &
                            (max_pair_overlaps <= 1.0 - 1e-4))[0]

    fg_pairs_per_this_image = np.minimum(
        fg_pairs_per_image, gt_pair_inds.size + fg_pair_inds.size)
    # Non-gt foreground pairs only fill what the gt pairs leave of the
    # budget. Clamp at zero: when the gt pairs alone exceed the budget the
    # difference is negative and npr.choice would raise.
    extra_fg_pairs = max(
        int(fg_pairs_per_this_image) - gt_pair_inds.size, 0)
    # Sample foreground pairs without replacement
    if fg_pair_inds.size > 0:
        fg_pair_inds = npr.choice(fg_pair_inds,
                                  size=extra_fg_pairs,
                                  replace=False)
    # Every gt pair is kept, after the sampled foreground pairs
    fg_pair_inds = np.append(fg_pair_inds, gt_pair_inds)

    # Label is the predicate class each pair has max overlap with
    fg_prd_labels = roidb['max_prd_classes'][fg_pair_inds]
    blob_dict = dict(
        fg_prd_labels_int32=fg_prd_labels.astype(np.int32, copy=False))

    bg_pair_inds = np.where((max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0]
    # Compute number of background pairs to take from this image (guarding
    # against there being fewer than desired)
    bg_pairs_per_this_image = pairs_per_image - fg_pairs_per_this_image
    bg_pairs_per_this_image = np.minimum(bg_pairs_per_this_image,
                                         bg_pair_inds.size)
    # Sample background pairs without replacement
    if bg_pair_inds.size > 0:
        bg_pair_inds = npr.choice(bg_pair_inds,
                                  size=bg_pairs_per_this_image,
                                  replace=False)

    keep_pair_inds = np.append(fg_pair_inds, bg_pair_inds)
    all_prd_labels = np.zeros(keep_pair_inds.size, dtype=np.int32)
    # class should start from 1; 0 is left for the background pairs
    all_prd_labels[:fg_pair_inds.size] = fg_prd_labels + 1

    blob_dict['all_prd_labels_int32'] = all_prd_labels.astype(np.int32,
                                                              copy=False)
    # this is used to check if there is at least one fg to learn
    blob_dict['fg_size'] = np.array([fg_pair_inds.size], dtype=np.int32)

    sampled_sbj_boxes = roidb['sbj_boxes'][keep_pair_inds]
    sampled_obj_boxes = roidb['obj_boxes'][keep_pair_inds]
    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_sbj_rois = sampled_sbj_boxes * im_scale
    sampled_obj_rois = sampled_obj_boxes * im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (keep_pair_inds.shape[0], 1))
    sampled_sbj_rois = np.hstack((repeated_batch_idx, sampled_sbj_rois))
    sampled_obj_rois = np.hstack((repeated_batch_idx, sampled_obj_rois))
    blob_dict['sbj_rois'] = sampled_sbj_rois
    blob_dict['obj_rois'] = sampled_obj_rois
    sampled_rel_rois = box_utils.rois_union(sampled_sbj_rois,
                                            sampled_obj_rois)
    blob_dict['rel_rois'] = sampled_rel_rois

    if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.USE_SEPARATE_SO_SCORES:
        sbj_labels = roidb['max_sbj_classes'][keep_pair_inds]
        obj_labels = roidb['max_obj_classes'][keep_pair_inds]
        blob_dict['all_sbj_labels_int32'] = sbj_labels.astype(np.int32,
                                                              copy=False)
        blob_dict['all_obj_labels_int32'] = obj_labels.astype(np.int32,
                                                              copy=False)

    return blob_dict
def add_keypoint_rcnn_blobs_sigmoid(
    blobs, roidb, fg_rois_per_image, fg_inds, im_scale, batch_idx
):
    """Add Mask R-CNN keypoint specific blobs to the given blobs dictionary.

    Keypoint foreground rois are those with ``max_overlaps`` of at least
    ``cfg.TRAIN.FG_THRESH`` whose assigned gt instance has at least one
    visible keypoint inside the roi. At most ``fg_rois_per_image`` of them
    are kept (random subset, without replacement).

    Args:
        blobs: blob dict; ``keypoint_rois``, ``keypoint_locations_int32``,
            ``keypoint_weights`` and ``keypoint_fg_inds`` are written in place.
        roidb: roidb entry; it is only read, never modified.
        fg_rois_per_image: upper bound on the number of keypoint fg rois.
        fg_inds: not used by this function; kept for interface compatibility.
        im_scale: factor applied to the boxes to obtain network-input rois.
        batch_idx: value written into the first column of every roi.
    """
    # Note: gt_inds must match how they're computed in
    # datasets.json_dataset._merge_proposal_boxes_into_roidb
    gt_inds = np.where(roidb['gt_classes'] > 0)[0]
    max_overlaps = roidb['max_overlaps']
    gt_keypoints = roidb['gt_keypoints']
    M = cfg.KRCNN.HEATMAP_SIZE

    ind_kp = gt_inds[roidb['box_to_gt_ind_map']]
    within_box = _within_box(gt_keypoints[ind_kp, :, :], roidb['boxes'])
    vis_kp = gt_keypoints[ind_kp, 2, :] > 0
    is_visible = np.sum(np.logical_and(vis_kp, within_box), axis=1) > 0
    kp_fg_inds = np.where(
        np.logical_and(max_overlaps >= cfg.TRAIN.FG_THRESH, is_visible)
    )[0]

    kp_fg_rois_per_this_image = np.minimum(fg_rois_per_image, kp_fg_inds.size)
    if kp_fg_inds.size > kp_fg_rois_per_this_image:
        kp_fg_inds = np.random.choice(
            kp_fg_inds, size=kp_fg_rois_per_this_image, replace=False
        )

    if kp_fg_inds.shape[0] > 0:
        # Indexing with an index array copies, so scaling below is safe
        sampled_fg_rois = roidb['boxes'][kp_fg_inds]
        box_to_gt_ind_map = roidb['box_to_gt_ind_map'][kp_fg_inds]

        num_keypoints = gt_keypoints.shape[2]
        sampled_keypoints = -np.ones(
            (len(sampled_fg_rois), gt_keypoints.shape[1], num_keypoints),
            dtype=gt_keypoints.dtype
        )
        for ii in range(len(sampled_fg_rois)):
            ind = box_to_gt_ind_map[ii]
            if ind >= 0:
                sampled_keypoints[ii, :, :] = gt_keypoints[gt_inds[ind], :, :]
                assert np.sum(sampled_keypoints[ii, 2, :]) > 0

        heats, weights = keypoint_utils.keypoints_to_sigmoid_heatmap_labels(
            sampled_keypoints, sampled_fg_rois, M=M
        )

        shape = sampled_fg_rois.shape[0] * cfg.KRCNN.NUM_KEYPOINTS
        heats = heats.reshape((shape, M**2))
        weights = weights.reshape((shape, 1))
    else:
        # If there are no fg keypoint rois (it does happen)
        # The network cannot handle empty blobs, so we must provide a heatmap
        # We simply take the first roi with gt class 0, give it an all -1
        # heatmap, and set its weights to zero (ignore label).
        roi_inds = np.where(roidb['gt_classes'] == 0)[0]
        # Copy the row: integer indexing + reshape is a view into
        # roidb['boxes'], and the in-place scaling below would otherwise
        # rescale the stored box on every call.
        sampled_fg_rois = roidb['boxes'][roi_inds[0]].reshape((1, -1)).copy()
        # We give it a -1's blob
        heats = (-1) * blob_utils.ones((1 * cfg.KRCNN.NUM_KEYPOINTS, M**2))
        # We set weights to 0 (ignore label)
        weights = blob_utils.zeros((1 * cfg.KRCNN.NUM_KEYPOINTS, 1))

    # Scale rois and format as (batch_idx, x1, y1, x2, y2)
    sampled_fg_rois *= im_scale
    repeated_batch_idx = batch_idx * blob_utils.ones(
        (sampled_fg_rois.shape[0], 1)
    )
    sampled_fg_rois = np.hstack((repeated_batch_idx, sampled_fg_rois))

    blobs['keypoint_rois'] = sampled_fg_rois
    blobs['keypoint_locations_int32'] = heats.astype(np.int32, copy=False)
    blobs['keypoint_weights'] = weights
    # Since in this function we may random sample a subset of bbox as the roi,
    # we need to make sure it's the same subset for the refined_keypoint_rois,
    # so we pass out the inds for the subset too.
    blobs['keypoint_fg_inds'] = kp_fg_inds.astype(np.int32, copy=False)
def forward(self, frame_feat=None, obj_feat=None, human_mask=None, human_box=None, roidb=None, roi=None, batch=1, no_dropout=False, full_batch=False, binary_label=None): device_id = obj_feat.get_device() B = frame_feat.shape[0] obj_label = torch.IntTensor([db['obj_gt_cls'] for db in roidb]).cuda(device_id) prd_label = torch.IntTensor([db['prd_gt_cls'] for db in roidb]).cuda(device_id) # ------------------------------------------------------------------------------------------------------------------- # obj visual and human visual # ------------------------------------------------------------------------------------------------------------------- obj_inter = self.obj_feats(obj_feat) obj_hidden = self.obj_feats_2(obj_inter) obj_hidden_norm = F.normalize(obj_hidden, p=2, dim=1) densepose_mask = human_mask densepose_mask = densepose_mask.view(-1, 1, densepose_mask.shape[1], densepose_mask.shape[2]) densepose_mask_conv = self.human_mask_conv(densepose_mask) roi_feature = RoIAlignFunction( 7, 7, 1. / 8, 0.0)(densepose_mask_conv, Variable(torch.from_numpy(roi)).cuda(device_id)) roi_feature = roi_feature.view(-1, 64 * 49) densepose_mask_hidden = self.human_mask_feats(roi_feature) densepose_mask_hidden_norm = F.normalize(densepose_mask_hidden, p=2, dim=1) ## ---------------------------------------------------------------------------------------------------------------------- ## extract the features of densepose bounding boxes ## ---------------------------------------------------------------------------------------------------------------------- human_boxs = [] for batch_idx, tem_human_box in enumerate(human_box): repeated_batch_idx = batch_idx * blob_utils.ones( (tem_human_box.shape[0], 1)) tem_human_box = np.hstack((repeated_batch_idx, tem_human_box[:, 1:])) for tem in tem_human_box: human_boxs.append(tem) densepose_roi = np.array(human_boxs) densepose_roi_feature = RoIAlignFunction(7, 7, 1. 
/ 8, 0.0)( densepose_mask_conv, Variable(torch.from_numpy(densepose_roi)).cuda(device_id)) densepose_roi_feature = densepose_roi_feature.view(-1, 64 * 49) densepose_box_feature_hidden = self.human_mask_feats( densepose_roi_feature) densepose_box_feature_hidden_norm = F.normalize( densepose_box_feature_hidden, p=2, dim=1) # ------------------------------------------------------------------------------------------------------------------- # obj text and prd text # ------------------------------------------------------------------------------------------------------------------- ## obj text obj_text_vecs = self.obj_vecs[obj_label] obj_text_vecs = Variable( torch.from_numpy(obj_text_vecs.astype('float32'))).cuda(device_id) if obj_text_vecs.dim() == 1: obj_text_vecs = obj_text_vecs.view(1, -1) obj_text_hidden = self.obj_text_feats(obj_text_vecs) obj_text_hidden_norm = F.normalize(obj_text_hidden, p=2, dim=1) # (#prd, 1024) ## prd text prd_text_vecs = self.prd_vecs[prd_label] prd_text_vecs = Variable( torch.from_numpy(prd_text_vecs.astype('float32'))).cuda(device_id) if prd_text_vecs.dim() == 1: prd_text_vecs = prd_text_vecs.view(1, -1) prd_text_hidden = self.prd_text_feats(prd_text_vecs) prd_text_hidden_norm = F.normalize(prd_text_hidden, p=2, dim=1) # ------------------------------------------------------------------------------------------------------------------- # video binary loss, text match video # ------------------------------------------------------------------------------------------------------------------- if cfg.BINARY_LOSS: frame_feat_binary = self.frame_fc_binary(frame_feat) frame_feat_binary = torch.cat( [frame_feat_binary, obj_text_hidden, prd_text_hidden], dim=-1) frame_feat_binary = self.frame_fc_cat_binary(frame_feat_binary) frame_feat_binary = frame_feat_binary.mean(0) frame_feat_binary_pred = self.binary_classifier(frame_feat_binary) frame_feat_binary_pred = frame_feat_binary_pred.view(1, -1) video_binary_loss = 
F.cross_entropy(frame_feat_binary_pred, binary_label) video_binary_loss = 10 * video_binary_loss else: video_binary_loss = torch.tensor([0]).cuda(device_id) # ------------------------------------------------------------------------------------------------------------------- # concate visual obj + obj text + prd text --> weight # ------------------------------------------------------------------------------------------------------------------- if roi is None: obj_text_hidden_norm_expn = obj_text_hidden_norm.expand( obj_hidden_norm.shape[0], obj_text_hidden_norm.shape[1]) prd_text_hidden_norm_expn = prd_text_hidden_norm.expand( obj_hidden_norm.shape[0], prd_text_hidden_norm.shape[1]) densepose_mask_hidden_norm_expn = densepose_mask_hidden_norm.expand( obj_hidden_norm.shape[0], densepose_mask_hidden_norm.shape[1]) densepose_box_feature_hidden_norm_expn = densepose_box_feature_hidden_norm.expand( densepose_box_feature_hidden_norm.shape[0], densepose_box_feature_hidden_norm.shape[1]) else: gather_obj_index = torch.Tensor(roi[:, 0:1]).long().repeat( 1, obj_text_hidden_norm.shape[1]).cuda(device_id) gather_prd_index = torch.Tensor(roi[:, 0:1]).long().repeat( 1, prd_text_hidden_norm.shape[1]).cuda(device_id) obj_text_hidden_norm_expn = torch.gather(obj_text_hidden_norm, 0, gather_obj_index) prd_text_hidden_norm_expn = torch.gather(prd_text_hidden_norm, 0, gather_prd_index) gather_obj_index = torch.Tensor( densepose_roi[:, 0:1]).long().repeat( 1, obj_text_hidden_norm.shape[1]).cuda(device_id) gather_prd_index = torch.Tensor( densepose_roi[:, 0:1]).long().repeat( 1, prd_text_hidden_norm.shape[1]).cuda(device_id) densepose_obj_text_hidden_norm_expn = torch.gather( obj_text_hidden_norm, 0, gather_obj_index) densepose_prd_text_hidden_norm_expn = torch.gather( prd_text_hidden_norm, 0, gather_prd_index) if cfg.HUMAN_OBJ_SPATIAL: obj_hidden_norm = torch.max(obj_hidden_norm, densepose_mask_hidden_norm) obj_hidden = torch.max(obj_hidden, densepose_mask_hidden) # 
------------------------------------------------------------------------------------------------------------------- # COM_WEIGHT # ------------------------------------------------------------------------------------------------------------------- if cfg.COM_WEIGHT == 'cat_video_soft_attention': frame_feat = self.frame_fc(frame_feat) query = self.soft_attention_fc2( F.relu(self.soft_attention_fc1(frame_feat))) key = self.soft_attention_fc4( F.relu(self.soft_attention_fc3(frame_feat))) sim = query[:, None, :] * key[None, :, :] sim = F.softmax(sim.sum(dim=-1), dim=-1) value = self.soft_attention_fc6( F.relu(self.soft_attention_fc5(frame_feat))) frame_feat = (sim[:, :, None] * value[:, None, :].repeat(1, sim.size(1), 1)).sum( dim=1) frame_feat_norm = F.normalize(frame_feat, p=2, dim=1) gather_index = torch.Tensor(roi[:, 0:1]).long().repeat( 1, frame_feat_norm.shape[1]).cuda(device_id) frame_feat_norm_obj = torch.gather(frame_feat_norm, 0, gather_index) gather_index = torch.Tensor(densepose_roi[:, 0:1]).long().repeat( 1, frame_feat_norm.shape[1]).cuda(device_id) frame_feat_norm_human = torch.gather(frame_feat_norm, 0, gather_index) concated_vecs = torch.cat( (obj_hidden_norm, obj_text_hidden_norm_expn, prd_text_hidden_norm_expn, frame_feat_norm_obj), dim=1) densepose_concated_vecs = torch.cat( (densepose_box_feature_hidden_norm, densepose_obj_text_hidden_norm_expn, densepose_prd_text_hidden_norm_expn, frame_feat_norm_human), dim=1) roi_weights = self.roi_weights_net_obj(concated_vecs) roi_weights_human = self.roi_weights_net_human( densepose_concated_vecs) # ------------------------------------------------------------------------------------------------------------------- # ------------------------------------------------------------------------------------------------------------------- if roi is None: roi_weights = F.softmax(roi_weights, dim=0) roi_weights = roi_weights.view(1, -1) obj_hidden_weighted = torch.mm(roi_weights, obj_hidden) roi_weights_human = 
F.softmax(roi_weights_human, dim=0) roi_weights_human = roi_weights_human.view(1, -1) densepose_box_feature_hidden_weighted = torch.mm( roi_weights_human, densepose_box_feature_hidden) else: roi_weights_unpacked = roi_weights.view( -1, cfg.TRAIN.BATCH_SIZE_PER_IM, 1) roi_weights_human_unpacked = roi_weights_human.view( -1, cfg.MAX_NUM_HUMAN, 1) roi_weights_ori = roi_weights.view(-1, cfg.TRAIN.BATCH_SIZE_PER_IM) roi_weights_human_ori = roi_weights_human.view( -1, cfg.MAX_NUM_HUMAN) if not no_dropout: roi_weights_unpacked = self.dropout(roi_weights_unpacked) roi_weights_human_unpacked = self.dropout( roi_weights_human_unpacked) roi_weights_unpacked = F.softmax(roi_weights_unpacked, dim=1) roi_weights_human_unpacked = F.softmax(roi_weights_human_unpacked, dim=1) ## --------------------------------------------------------------------------------------------------------- ## feature ## --------------------------------------------------------------------------------------------------------- obj_hidden_unpacked = obj_hidden.view(-1, cfg.TRAIN.BATCH_SIZE_PER_IM, obj_hidden.size(1)) densepose_box_feature_hidden_unpacked = densepose_box_feature_hidden.view( -1, cfg.MAX_NUM_HUMAN, densepose_box_feature_hidden.size(1)) obj_hidden_weighted = torch.sum(roi_weights_unpacked * obj_hidden_unpacked, dim=1) obj_hidden_human_weighted = torch.sum( roi_weights_human_unpacked * densepose_box_feature_hidden_unpacked, dim=1) if cfg.VIDEO_LOSS == 'contrastive_max_plus': obj_feat = obj_hidden obj_hidden_video_unpacked = obj_feat.view( -1, cfg.VIDEO_FRAME, cfg.TRAIN.BATCH_SIZE_PER_IM, obj_feat.shape[1]) # obj_hidden_video_unpacked = F.normalize(obj_hidden_video_unpacked, p=2, dim=3) * 4 roi_weights_unpacked_batch = roi_weights_unpacked.view( -1, cfg.VIDEO_FRAME, cfg.TRAIN.BATCH_SIZE_PER_IM, 1) # idx = torch.max(roi_weights_unpacked_batch, dim=2)[1][:, :, None, :] sort_idx = torch.sort(roi_weights_unpacked_batch, dim=2)[1] idx_select = sort_idx[:, :, -1:].repeat(1, 1, 1, obj_feat.shape[1]) 
anchor_embed = torch.gather(obj_hidden_video_unpacked, 2, idx_select) # Randomly sample a positive pair of frames for positive samples permute = torch.randperm(cfg.VIDEO_FRAME).cuda(device_id) pos_embed = anchor_embed[:, permute] permute = torch.randperm( cfg.TRAIN.BATCH_SIZE_PER_IM).cuda(device_id) obj_hidden_video_permute = obj_hidden_video_unpacked[:, :, permute] neg_sample = 15 neg_embed = obj_hidden_video_unpacked[:, :, :neg_sample] pos_dot = (pos_embed * anchor_embed).sum(dim=3) neg_dot = (neg_embed * anchor_embed).sum(dim=3) neg_dot = -torch.cat([pos_dot, neg_dot], dim=-1) pos_dot = -pos_dot video_loss = pos_dot.view( -1, cfg.VIDEO_FRAME) + torch.logsumexp(-neg_dot, dim=-1) select_frames = max(int(cfg.VIDEO_FRAME * 0.7), 1) video_loss, _ = torch.sort(video_loss, dim=1) video_loss = cfg.VIDEO_WEIGHT * video_loss[:, : select_frames].mean( ) else: video_loss = torch.zeros(1)[0].cuda(device_id) if cfg.OBJ_LOSS == 'contrastive_objloss': hidden_weighted_obj = obj_hidden_weighted hidden_weighted_prd = obj_hidden_human_weighted nsample = 15 word_embed_contrast_obj = self.word_embed_contrast_obj( obj_text_vecs) word_embed_contrast_prd = self.word_embed_contrast_prd( prd_text_vecs) ## neg obj n_obj = self.obj_vecs.shape[0] neg_sample = np.random.choice(np.arange(n_obj, dtype=np.int32), size=(obj_text_vecs.shape[0] * nsample, )) neg_embed_obj = self.obj_vecs[neg_sample] neg_embed_obj = neg_embed_obj.reshape( (int(obj_text_vecs.shape[0]), nsample, 300)) neg_embed_obj = Variable( torch.from_numpy( neg_embed_obj.astype('float32'))).cuda(device_id) ## neg prd n_prd = self.prd_vecs.shape[0] neg_sample = np.random.choice(np.arange(n_prd, dtype=np.int32), size=(prd_text_vecs.shape[0] * nsample, )) neg_embed_prd = self.prd_vecs[neg_sample] neg_embed_prd = neg_embed_prd.reshape( (int(prd_text_vecs.shape[0]), nsample, 300)) neg_embed_prd = Variable( torch.from_numpy( neg_embed_prd.astype('float32'))).cuda(device_id) ## embed neg obj and prd neg_embed_contrast_obj = 
self.word_embed_contrast_obj( neg_embed_obj) neg_embed_contrast_prd = self.word_embed_contrast_prd( neg_embed_prd) pos_dot = (hidden_weighted_obj[:, None, :] * word_embed_contrast_obj[:, None, :]).sum(dim=2) neg_dot = (hidden_weighted_obj[:, None, :] * neg_embed_contrast_obj).sum(dim=2) neg_dot = -torch.cat([pos_dot, neg_dot], dim=-1) pos_dot = -pos_dot obj_loss = pos_dot.view(-1, cfg.VIDEO_FRAME) + torch.logsumexp( -neg_dot, dim=-1).view(-1, cfg.VIDEO_FRAME) pos_dot = (hidden_weighted_prd[:, None, :] * word_embed_contrast_prd[:, None, :]).sum(dim=2) neg_dot = (hidden_weighted_prd[:, None, :] * neg_embed_contrast_prd).sum(dim=2) neg_dot = -torch.cat([pos_dot, neg_dot], dim=-1) pos_dot = -pos_dot prd_loss = pos_dot.view(-1, cfg.VIDEO_FRAME) + torch.logsumexp( -neg_dot, dim=-1).view(-1, cfg.VIDEO_FRAME) # select_frames = max(int(cfg.VIDEO_FRAME * 0.5), 1) select_frames = cfg.VIDEO_FRAME obj_loss, _ = torch.sort(obj_loss, dim=-1) obj_loss = obj_loss[:, :select_frames] # obj_loss = torch.clamp(obj_loss, 0, 1e5) obj_loss = obj_loss.mean() obj_scores = None prd_loss, _ = torch.sort(prd_loss, dim=-1) prd_loss = prd_loss[:, :select_frames] prd_loss = prd_loss.mean() if cfg.WEIGHT_REG == 'L2': weight_loss = torch.norm(roi_weights_unpacked, 2.0, 1) weight_loss = -cfg.L2_WEIGHT * torch.log(weight_loss.mean()) weight_human_loss = torch.norm(roi_weights_human_unpacked, 2.0, 1) weight_human_loss = -cfg.L2_WEIGHT * torch.log( weight_human_loss.mean()) cls_prediction = {} if not self.training and cfg.BINARY_LOSS: cls_prediction['binary_pred'] = frame_feat_binary_pred if cfg.BINARY_LOSS: loss_scale = F.softmax(frame_feat_binary_pred)[0][1] obj_loss = obj_loss * loss_scale prd_loss = prd_loss * loss_scale video_loss = video_loss weight_loss = weight_loss weight_human_loss = weight_human_loss return obj_loss, prd_loss, weight_loss, weight_human_loss, video_loss, video_binary_loss, roi_weights_unpacked, roi_weights_human_unpacked, densepose_roi, roi_weights_ori, roi_weights_human_ori, 
cls_prediction