def crop_and_resize(pool_size, feature_map, boxes, box_ind):
    # Boxes are given in pixel (x1, y1, x2, y2) order, optionally with a
    # trailing fifth column that is dropped.
    if boxes.shape[1] == 5:
        x1, y1, x2, y2, _ = boxes.chunk(5, dim=1)
    else:
        x1, y1, x2, y2 = boxes.chunk(4, dim=1)
    im_h, im_w = feature_map.shape[2:4]

    # Normalize to [0, 1] by the feature-map size and reorder to
    # (y1, x1, y2, x2) as expected by CropAndResizeFunction.
    x1 = x1 / float(im_w - 1)
    x2 = x2 / float(im_w - 1)
    y1 = y1 / float(im_h - 1)
    y2 = y2 / float(im_h - 1)
    boxes = torch.cat((y1, x1, y2, x2), 1)

    return CropAndResizeFunction(pool_size[0], pool_size[1], 0)(feature_map, boxes, box_ind)
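
# Illustrative usage sketch (added, not part of the original module). Boxes are
# pixel coordinates relative to the feature map (the function normalizes them by
# the feature-map size itself). Assumes the CropAndResizeFunction extension is
# built and a GPU is available.
def _demo_crop_and_resize():
    feature_map = torch.randn(1, 256, 64, 64).cuda()           # [N, C, H, W]
    boxes = torch.tensor([[4.0, 4.0, 32.0, 32.0]]).cuda()      # pixel (x1, y1, x2, y2)
    box_ind = torch.zeros(boxes.size(0)).int().cuda()          # each box comes from image 0
    crops = crop_and_resize((7, 7), feature_map, boxes, box_ind)
    # crops: [num_boxes, 256, 7, 7]
    return crops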
def pyramid_roi_align_image(inputs, pool_size, image_shape, istrain=False):
    """Implements ROI Pooling on multiple levels of the feature pyramid.

    Params:
    - pool_size: Output size (height = width) of the pooled regions. Usually 7.
    - image_shape: [height, width, channels]. Shape of input image in pixels

    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized coordinates.
    - Feature maps: List of feature maps from different levels of the
      pyramid. Each is [batch, channels, height, width]

    Output:
    Pooled regions in the shape: [num_boxes, height, width, channels].
    The width and height are those specified in pool_size.
    """

    # Currently only supports batchsize 1
    if istrain:
        start = 1
    else:
        start = 0
    for i in range(start, len(inputs)):
        inputs[i] = inputs[i].squeeze(0)

    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]

    # Feature Maps. List of feature maps from different levels of the
    # feature pyramid. Each is [batch, height, width, channels]
    feature_maps = inputs[1:]

    # Apply ROI pooling on the first feature map only.
    ind = Variable(torch.zeros(boxes.size()[0]), requires_grad=False).int()
    if boxes.is_cuda:
        ind = ind.cuda()
    # CropAndResizeFunction needs a batch dimension
    feature_maps[0] = feature_maps[0].unsqueeze(0)
    pooled_features = CropAndResizeFunction(pool_size, pool_size,
                                            0)(feature_maps[0], boxes, ind)

    return pooled_features
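
# Illustrative sketch (added, not part of the original module): the inputs list
# is [boxes] + feature maps, with boxes in normalized (y1, x1, y2, x2). Only the
# first pyramid level is cropped by pyramid_roi_align_image. Assumes the
# CropAndResizeFunction extension and a GPU are available.
def _demo_pyramid_roi_align_image():
    boxes = torch.tensor([[[0.1, 0.1, 0.5, 0.5]]]).cuda()      # [1, num_boxes, 4]
    p2 = torch.randn(1, 256, 160, 160).cuda()                  # [batch, C, H, W]
    pooled = pyramid_roi_align_image([boxes, p2], 7, image_shape=(640, 640, 3))
    # pooled: [num_boxes, 256, 7, 7]
    return pooled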
def GridFeatures(ori_p, radius, pool_size, img_size, feature_map):
    # pool_size = 7
    # Build square boxes of side 2*radius centered on the given point(s).
    bbox1 = ori_p - radius
    bbox2 = ori_p + radius
    bbox = torch.stack([bbox1, bbox2], dim=0)
    bbox = bbox.reshape(-1, 4)

    height = img_size[0]
    width = img_size[1]
    window = np.array([0, 0, height, width]).astype(np.float32)

    # Clip boxes (y1, x1, y2, x2) to the image window
    boxes = torch.stack(
        [bbox[:, 0].clamp(float(window[0]), float(window[2])),
         bbox[:, 1].clamp(float(window[1]), float(window[3])),
         bbox[:, 2].clamp(float(window[0]), float(window[2])),
         bbox[:, 3].clamp(float(window[1]), float(window[3]))], 1).float()

    # Normalize dimensions to range of 0 to 1.
    norm = Variable(torch.from_numpy(np.array([height, width, height, width])).float(),
                    requires_grad=False).cuda()
    normalized_boxes = boxes / norm

    ind = Variable(torch.zeros(boxes.size()[0]), requires_grad=False).int().cuda()

    with torch.no_grad():
        pooled_features = CropAndResizeFunction(
            pool_size, pool_size, 0)(feature_map, normalized_boxes, ind)

    # Alternative: average-pool each crop down to a single feature vector.
    # avg_pool = nn.AdaptiveAvgPool2d(1)
    # avg_features = avg_pool(pooled_features)
    # avg_features = torch.squeeze(avg_features, 2)
    # avg_features = torch.squeeze(avg_features, 2)

    return pooled_features
def pyramid_roi_align(inputs, pool_size, image_shape):
    """Implements ROI Pooling on multiple levels of the feature pyramid.

    Params:
    - pool_size: Output size (height = width) of the pooled regions. Usually 7.
    - image_shape: [height, width, channels]. Shape of input image in pixels

    Inputs:
    - boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized coordinates.
    - Feature maps: List of feature maps from different levels of the
      pyramid. Each is [batch, channels, height, width]

    Output:
    Pooled regions in the shape: [num_boxes, height, width, channels].
    The width and height are those specified in pool_size.
    """

    # Currently only supports batchsize 1
    for i in range(len(inputs)):
        inputs[i] = inputs[i].squeeze(0)

    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]

    # Feature Maps. List of feature maps from different levels of the
    # feature pyramid. Each is [batch, height, width, channels]
    feature_maps = inputs[1:]

    # Assign each ROI to a level in the pyramid based on the ROI area.
    boxes = boxes.view(-1, 4)
    y1, x1, y2, x2 = boxes.chunk(4, dim=1)
    h = y2 - y1
    w = x2 - x1

    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # e.g. a 224x224 ROI (in pixels) maps to P4
    image_area = Variable(torch.FloatTensor(
        [float(image_shape[0] * image_shape[1])]), requires_grad=False)
    if boxes.is_cuda:
        image_area = image_area.cuda()
    roi_level = 4 + log2(torch.sqrt(h * w) / (224.0 / torch.sqrt(image_area)))
    roi_level = roi_level.round().int()
    roi_level = roi_level.clamp(2, 5)

    # Loop through levels and apply ROI pooling to each. P2 to P5.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(2, 6)):
        ix = roi_level == level
        if not ix.any():
            continue
        ix = torch.nonzero(ix)[:, 0]
        level_boxes = boxes[ix.data, :]

        # Keep track of which box is mapped to which level
        box_to_level.append(ix.data)

        # Stop gradient propagation to ROI proposals
        level_boxes = level_boxes.detach()

        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        ind = Variable(torch.zeros(level_boxes.size()[0]),
                       requires_grad=False).int()
        if level_boxes.is_cuda:
            ind = ind.cuda()
        # CropAndResizeFunction needs a batch dimension
        feature_maps[i] = feature_maps[i].unsqueeze(0)
        pooled_features = CropAndResizeFunction(pool_size, pool_size,
                                                0)(feature_maps[i],
                                                   level_boxes, ind)
        pooled.append(pooled_features)

    # Pack pooled features into one tensor
    pooled = torch.cat(pooled, dim=0)

    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = torch.cat(box_to_level, dim=0)

    # Rearrange pooled features to match the order of the original boxes
    _, box_to_level = torch.sort(box_to_level)
    pooled = pooled[box_to_level, :, :]

    return pooled
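
# Worked example of the FPN level assignment used above (added for
# illustration, not part of the original module). With normalized coordinates,
# a roughly 224x224-pixel box inside a 1024x1024 image lands on P4; doubling
# the side moves it up one level, halving it moves it down, clamped to [P2, P5].
def _demo_roi_level_assignment():
    image_area = torch.tensor([1024.0 * 1024.0])
    side = torch.tensor([448.0, 224.0, 112.0]) / 1024.0   # normalized h = w
    roi_level = 4 + torch.log2(torch.sqrt(side * side) /
                               (224.0 / torch.sqrt(image_area)))
    roi_level = roi_level.round().int().clamp(2, 5)
    # roi_level == tensor([5, 4, 3], dtype=torch.int32) -> P5, P4, P3
    return roi_level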
def detection_target_layer(proposals, gt_class_ids, gt_boxes, gt_masks, config):
    """Subsamples proposals and generates target box refinement, class_ids,
    and masks for each.

    Inputs:
    proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might
               be zero padded if there are not enough proposals.
    gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
    gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized
              coordinates.
    gt_masks: [batch, MAX_GT_INSTANCES, height, width] of boolean type

    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
    and masks.
    rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized
          coordinates
    target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
    target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, NUM_CLASSES,
                    (dy, dx, log(dh), log(dw), class_id)]
                   Class-specific bbox refinements.
    target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
                 Masks cropped to bbox boundaries and resized to neural
                 network output size.
    """

    # Currently only supports batchsize 1
    proposals = proposals.squeeze(0)
    gt_class_ids = gt_class_ids.squeeze(0)
    gt_boxes = gt_boxes.squeeze(0)
    gt_masks = gt_masks.squeeze(0)

    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    if torch.nonzero(gt_class_ids < 0).size():
        crowd_ix = torch.nonzero(gt_class_ids < 0)[:, 0]
        non_crowd_ix = torch.nonzero(gt_class_ids > 0)[:, 0]
        crowd_boxes = gt_boxes[crowd_ix.data, :]
        crowd_masks = gt_masks[crowd_ix.data, :, :]
        gt_class_ids = gt_class_ids[non_crowd_ix.data]
        gt_boxes = gt_boxes[non_crowd_ix.data, :]
        gt_masks = gt_masks[non_crowd_ix.data, :]

        # Compute overlaps with crowd boxes [anchors, crowds]
        crowd_overlaps = bbox_overlaps(proposals, crowd_boxes)
        crowd_iou_max = torch.max(crowd_overlaps, dim=1)[0]
        no_crowd_bool = crowd_iou_max < 0.001
    else:
        no_crowd_bool = Variable(torch.ByteTensor(proposals.size()[0] * [True]),
                                 requires_grad=False)
        if config.GPU_COUNT:
            no_crowd_bool = no_crowd_bool.cuda()

    # Compute overlaps matrix [proposals, gt_boxes]
    overlaps = bbox_overlaps(proposals, gt_boxes)

    # Determine positive and negative ROIs
    roi_iou_max = torch.max(overlaps, dim=1)[0]

    # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
    positive_roi_bool = roi_iou_max >= 0.5

    # Subsample ROIs. Aim for 33% positive
    # Positive ROIs
    if torch.nonzero(positive_roi_bool).size():
        positive_indices = torch.nonzero(positive_roi_bool)[:, 0]

        positive_count = int(config.TRAIN_ROIS_PER_IMAGE *
                             config.ROI_POSITIVE_RATIO)
        rand_idx = torch.randperm(positive_indices.size()[0])
        rand_idx = rand_idx[:positive_count]
        if config.GPU_COUNT:
            rand_idx = rand_idx.cuda()
        positive_indices = positive_indices[rand_idx]
        positive_count = positive_indices.size()[0]
        positive_rois = proposals[positive_indices.data, :]

        # Assign positive ROIs to GT boxes.
        positive_overlaps = overlaps[positive_indices.data, :]
        roi_gt_box_assignment = torch.max(positive_overlaps, dim=1)[1]
        roi_gt_boxes = gt_boxes[roi_gt_box_assignment.data, :]
        roi_gt_class_ids = gt_class_ids[roi_gt_box_assignment.data]

        # Compute bbox refinement for positive ROIs
        deltas = Variable(utils.box_refinement(positive_rois.data,
                                               roi_gt_boxes.data),
                          requires_grad=False)
        std_dev = Variable(torch.from_numpy(config.BBOX_STD_DEV).float(),
                           requires_grad=False)
        if config.GPU_COUNT:
            std_dev = std_dev.cuda()
        deltas /= std_dev

        # Assign positive ROIs to GT masks
        roi_masks = gt_masks[roi_gt_box_assignment.data, :, :]

        # Compute mask targets
        boxes = positive_rois
        if config.USE_MINI_MASK:
            # Transform ROI coordinates from normalized image space
            # to normalized mini-mask space.
            y1, x1, y2, x2 = positive_rois.chunk(4, dim=1)
            gt_y1, gt_x1, gt_y2, gt_x2 = roi_gt_boxes.chunk(4, dim=1)
            gt_h = gt_y2 - gt_y1
            gt_w = gt_x2 - gt_x1
            y1 = (y1 - gt_y1) / gt_h
            x1 = (x1 - gt_x1) / gt_w
            y2 = (y2 - gt_y1) / gt_h
            x2 = (x2 - gt_x1) / gt_w
            boxes = torch.cat([y1, x1, y2, x2], dim=1)
        box_ids = Variable(torch.arange(roi_masks.size()[0]),
                           requires_grad=False).int()
        if config.GPU_COUNT:
            box_ids = box_ids.cuda()
        masks = Variable(CropAndResizeFunction(config.MASK_SHAPE[0],
                                               config.MASK_SHAPE[1],
                                               0)(roi_masks.unsqueeze(1),
                                                  boxes, box_ids).data,
                         requires_grad=False)
        masks = masks.squeeze(1)

        # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
        # binary cross entropy loss.
        masks = torch.round(masks)
    else:
        positive_count = 0

    # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
    negative_roi_bool = roi_iou_max < 0.5
    negative_roi_bool = negative_roi_bool & no_crowd_bool
    # Negative ROIs. Add enough to maintain positive:negative ratio.
    if torch.nonzero(negative_roi_bool).size() and positive_count > 0:
        negative_indices = torch.nonzero(negative_roi_bool)[:, 0]
        r = 1.0 / config.ROI_POSITIVE_RATIO
        negative_count = int(r * positive_count - positive_count)
        rand_idx = torch.randperm(negative_indices.size()[0])
        rand_idx = rand_idx[:negative_count]
        if config.GPU_COUNT:
            rand_idx = rand_idx.cuda()
        negative_indices = negative_indices[rand_idx]
        negative_count = negative_indices.size()[0]
        negative_rois = proposals[negative_indices.data, :]
    else:
        negative_count = 0

    # Append negative ROIs and pad bbox deltas and masks that
    # are not used for negative ROIs with zeros.
    if positive_count > 0 and negative_count > 0:
        rois = torch.cat((positive_rois, negative_rois), dim=0)
        zeros = Variable(torch.zeros(negative_count), requires_grad=False).int()
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        roi_gt_class_ids = torch.cat([roi_gt_class_ids, zeros], dim=0)

        zeros = Variable(torch.zeros(negative_count, 4), requires_grad=False)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        deltas = torch.cat([deltas, zeros], dim=0)

        zeros = Variable(torch.zeros(negative_count, config.MASK_SHAPE[0],
                                     config.MASK_SHAPE[1]),
                         requires_grad=False)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        masks = torch.cat([masks, zeros], dim=0)
    elif positive_count > 0:
        rois = positive_rois
    elif negative_count > 0:
        rois = negative_rois
        zeros = Variable(torch.zeros(negative_count), requires_grad=False)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        roi_gt_class_ids = zeros

        zeros = Variable(torch.zeros(negative_count, 4),
                         requires_grad=False).int()
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        deltas = zeros

        zeros = Variable(torch.zeros(negative_count, config.MASK_SHAPE[0],
                                     config.MASK_SHAPE[1]),
                         requires_grad=False)
        if config.GPU_COUNT:
            zeros = zeros.cuda()
        masks = zeros
    else:
        rois = Variable(torch.FloatTensor(), requires_grad=False)
        roi_gt_class_ids = Variable(torch.IntTensor(), requires_grad=False)
        deltas = Variable(torch.FloatTensor(), requires_grad=False)
        masks = Variable(torch.FloatTensor(), requires_grad=False)
        if config.GPU_COUNT:
            rois = rois.cuda()
            roi_gt_class_ids = roi_gt_class_ids.cuda()
            deltas = deltas.cuda()
            masks = masks.cuda()

    return rois, roi_gt_class_ids, deltas, masks
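
# Call pattern for detection_target_layer (added sketch; shapes follow the
# docstring above, batch size 1). `config` is the repo's Config object with
# TRAIN_ROIS_PER_IMAGE, ROI_POSITIVE_RATIO, BBOX_STD_DEV, USE_MINI_MASK,
# MASK_SHAPE and GPU_COUNT set:
#
#   rois, target_class_ids, target_deltas, target_mask = detection_target_layer(
#       proposals,      # [1, N, 4]    normalized (y1, x1, y2, x2)
#       gt_class_ids,   # [1, M]       int class IDs, negative for COCO crowds
#       gt_boxes,       # [1, M, 4]    normalized (y1, x1, y2, x2)
#       gt_masks,       # [1, M, h, w] boolean instance masks
#       config)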
def compare_with_tf(crop_height, crop_width, is_cuda=True):
    # generate data
    image_data, boxes_data, box_index_data = generate_data(
        batch_size=2,
        depth=128,
        im_height=200,
        im_width=200,
        n_boxes=10,
        xyxy=False, box_normalize=True)
    # boxes_tf_data = np.stack((boxes_data[:, 1], boxes_data[:, 0], boxes_data[:, 3], boxes_data[:, 2]), axis=1)
    # boxes_tf_data[:, 0::2] /= (image_data.shape[2] - 1.)
    # boxes_tf_data[:, 1::2] /= (image_data.shape[3] - 1.)

    # rand conv layer
    conv_torch = nn.Conv2d(image_data.shape[1], 64, 3, padding=1, bias=False)
    if is_cuda:
        conv_torch = conv_torch.cuda()

    # pytorch forward
    image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
    boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
    box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

    print('pytorch forward and backward start')
    crops_torch = CropAndResizeFunction(crop_height, crop_width, 0)(image_torch, boxes, box_index)
    crops_torch = conv_torch(crops_torch)
    crops_torch_data = crops_torch.data.cpu().numpy()

    # pytorch backward
    loss_torch = crops_torch.sum()
    loss_torch.backward()
    grad_torch_data = image_torch.grad.data.cpu().numpy()
    print('pytorch forward and backward end')

    # tf forward & backward
    image_tf = tf.placeholder(tf.float32, (None, None, None, None), name='image')
    boxes = tf.placeholder(tf.float32, (None, 4), name='boxes')
    box_index = tf.placeholder(tf.int32, (None,), name='box_index')

    image_t = tf.transpose(image_tf, (0, 2, 3, 1))
    crops_tf = tf.image.crop_and_resize(image_t, boxes, box_index, (crop_height, crop_width))
    conv_tf = tf.nn.conv2d(crops_tf,
                           np.transpose(conv_torch.weight.data.cpu().numpy(), (2, 3, 1, 0)),
                           [1, 1, 1, 1], padding='SAME')
    trans_tf = tf.transpose(conv_tf, (0, 3, 1, 2))
    loss_tf = tf.reduce_sum(trans_tf)
    grad_tf = tf.gradients(loss_tf, image_tf)[0]

    with tf.Session() as sess:
        crops_tf_data, grad_tf_data = sess.run(
            (trans_tf, grad_tf),
            feed_dict={image_tf: image_data,
                       boxes: boxes_data,
                       box_index: box_index_data}
        )

    crops_diff = np.abs(crops_tf_data - crops_torch_data)
    print('forward (maxval, min_err, max_err, mean_err):',
          crops_tf_data.max(), crops_diff.min(), crops_diff.max(), crops_diff.mean())

    grad_diff = np.abs(grad_tf_data - grad_torch_data)
    print('backward (maxval, min_err, max_err, mean_err):',
          grad_tf_data.max(), grad_diff.min(), grad_diff.max(), grad_diff.mean())
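
# Convenience entry point (added sketch): run the PyTorch/TensorFlow parity
# check with a 7x7 crop, falling back to CPU when CUDA is unavailable. Assumes
# the TensorFlow 1.x session API used above is installed.
if __name__ == '__main__':
    compare_with_tf(7, 7, is_cuda=torch.cuda.is_available())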