def call(self, inputs):
    roi_bboxes = inputs[0]
    gt_boxes = inputs[1]
    gt_labels = inputs[2]
    gt_box_indices = inputs[3]
    total_labels = self.hyper_params["total_labels"]
    total_pos_bboxes = self.hyper_params["total_pos_bboxes"]
    total_neg_bboxes = self.hyper_params["total_neg_bboxes"]
    total_bboxes = total_pos_bboxes + total_neg_bboxes
    batch_size = tf.shape(roi_bboxes)[0]
    # Map every selected roi to its ground truth box
    gt_boxes_map = helpers.get_gt_boxes_map(gt_boxes, gt_box_indices, batch_size, total_neg_bboxes)
    # Positive rois take the label of their gt box, negatives the background label
    pos_gt_labels_map = tf.gather(gt_labels, gt_box_indices, batch_dims=1)
    neg_gt_labels_map = tf.fill((batch_size, total_neg_bboxes), total_labels - 1)
    gt_labels_map = tf.concat([pos_gt_labels_map, neg_gt_labels_map], axis=1)
    # Regression targets between rois and their gt boxes
    roi_bbox_deltas = helpers.get_deltas_from_bboxes(roi_bboxes, gt_boxes_map)
    # Scatter deltas and labels into class-wise target tensors
    flatted_batch_indices = helpers.get_tiled_indices(batch_size, total_bboxes)
    flatted_bbox_indices = tf.reshape(tf.tile(tf.range(total_bboxes), (batch_size, )), (-1, 1))
    flatted_gt_labels_indices = tf.reshape(gt_labels_map, (-1, 1))
    scatter_indices = helpers.get_scatter_indices_for_bboxes(
        [flatted_batch_indices, flatted_bbox_indices, flatted_gt_labels_indices], batch_size, total_bboxes)
    roi_bbox_deltas = tf.scatter_nd(scatter_indices, roi_bbox_deltas,
                                    (batch_size, total_bboxes, total_labels, 4))
    roi_bbox_deltas = tf.reshape(roi_bbox_deltas, (batch_size, total_bboxes, total_labels * 4))
    roi_bbox_labels = tf.scatter_nd(scatter_indices, tf.ones((batch_size, total_bboxes), tf.int32),
                                    (batch_size, total_bboxes, total_labels))
    # These are training targets, so no gradients should flow through them
    return tf.stop_gradient(roi_bbox_deltas), tf.stop_gradient(roi_bbox_labels)
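# helpers.get_tiled_indices and helpers.get_scatter_indices_for_bboxes are not shown in
# this section. The sketch below is a minimal guess at their behavior, inferred only from
# how they are called above; the repo's actual implementations may differ.
import tensorflow as tf

def get_tiled_indices(batch_size, row_size):
    # One batch index per bbox, flattened into a (batch_size * row_size, 1) column
    tiled_indices = tf.range(batch_size)
    tiled_indices = tf.tile(tf.expand_dims(tiled_indices, 1), (1, row_size))
    return tf.reshape(tiled_indices, (-1, 1))

def get_scatter_indices_for_bboxes(flatted_indices, batch_size, total_bboxes):
    # Concatenate the flattened index columns ([batch], [bbox], [label], ...) and reshape
    # them into per-bbox scatter_nd indices of shape (batch_size, total_bboxes, k)
    indices_size = len(flatted_indices)
    scatter_indices = tf.concat(flatted_indices, axis=1)
    return tf.reshape(scatter_indices, (batch_size, total_bboxes, indices_size))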
def call(self, inputs):
    roi_bboxes = inputs[0]
    gt_boxes = inputs[1]
    gt_labels = inputs[2]
    total_labels = self.hyper_params["total_labels"]
    total_pos_bboxes = self.hyper_params["total_pos_bboxes"]
    total_neg_bboxes = self.hyper_params["total_neg_bboxes"]
    variances = self.hyper_params["variances"]
    batch_size, total_bboxes = tf.shape(roi_bboxes)[0], tf.shape(roi_bboxes)[1]
    # Calculate iou values between each bbox and the ground truth boxes
    iou_map = helpers.generate_iou_map(roi_bboxes, gt_boxes)
    # Get the index of the best-matching gt box for each roi
    max_indices_each_gt_box = tf.argmax(iou_map, axis=2, output_type=tf.int32)
    # The iou map holds one value per gt box; merge them by taking the max over the gt axis
    merged_iou_map = tf.reduce_max(iou_map, axis=2)
    # Positive rois overlap a gt box with iou > 0.5; sample a fixed number per image
    pos_mask = tf.greater(merged_iou_map, 0.5)
    pos_mask = helpers.randomly_select_xyz_mask(
        pos_mask, tf.tile(tf.constant([total_pos_bboxes], dtype=tf.int32), (batch_size, )))
    # Negative rois fall in the iou range (0.1, 0.5); sample a fixed number per image
    neg_mask = tf.logical_and(tf.less(merged_iou_map, 0.5), tf.greater(merged_iou_map, 0.1))
    neg_mask = helpers.randomly_select_xyz_mask(
        neg_mask, tf.tile(tf.constant([total_neg_bboxes], dtype=tf.int32), (batch_size, )))
    # Keep the gt boxes of positive rois, zero out the rest
    gt_boxes_map = tf.gather(gt_boxes, max_indices_each_gt_box, batch_dims=1)
    expanded_gt_boxes = tf.where(tf.expand_dims(pos_mask, axis=-1), gt_boxes_map, tf.zeros_like(gt_boxes_map))
    # Positives keep their gt label, negatives become background (label 0), the rest -1
    gt_labels_map = tf.gather(gt_labels, max_indices_each_gt_box, batch_dims=1)
    pos_gt_labels = tf.where(pos_mask, gt_labels_map, tf.constant(-1, dtype=tf.int32))
    neg_gt_labels = tf.cast(neg_mask, dtype=tf.int32)
    expanded_gt_labels = pos_gt_labels + neg_gt_labels
    # Regression targets, scaled by the variances
    roi_bbox_deltas = helpers.get_deltas_from_bboxes(roi_bboxes, expanded_gt_boxes) / variances
    # Mask the deltas so only the true class channel of each roi is non-zero
    roi_bbox_labels = tf.one_hot(expanded_gt_labels, total_labels)
    scatter_indices = tf.tile(tf.expand_dims(roi_bbox_labels, -1), (1, 1, 1, 4))
    roi_bbox_deltas = scatter_indices * tf.expand_dims(roi_bbox_deltas, -2)
    roi_bbox_deltas = tf.reshape(roi_bbox_deltas, (batch_size, total_bboxes * total_labels, 4))
    # These are training targets, so no gradients should flow through them
    return tf.stop_gradient(roi_bbox_deltas), tf.stop_gradient(roi_bbox_labels)
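# helpers.randomly_select_xyz_mask is also external to this section. A sketch of the
# assumed behavior, inferred from usage: keep at most select_xyz[i] randomly chosen True
# entries in row i of the boolean mask and clear the rest, so every image contributes a
# bounded sample size.
def randomly_select_xyz_mask(mask, select_xyz):
    # Assign every True entry a random positive score; False entries score zero
    maxval = tf.reduce_max(select_xyz) * 10
    random_mask = tf.random.uniform(tf.shape(mask), minval=1, maxval=maxval, dtype=tf.int32)
    multiplied_mask = tf.cast(mask, tf.int32) * random_mask
    # Double argsort turns scores into per-row ranks; ranks below the quota survive
    sorted_mask = tf.argsort(multiplied_mask, direction="DESCENDING")
    sorted_mask_indices = tf.argsort(sorted_mask)
    selected_mask = tf.less(sorted_mask_indices, tf.expand_dims(select_xyz, 1))
    return tf.logical_and(mask, selected_mask)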
def get_step_data(image_data, hyper_params, input_processor, mode="training"):
    """Generate one step of data for training or inference. Batch operations are supported.
    inputs:
        image_data = img (batch_size, height, width, channels)
            gt_boxes (batch_size, gt_box_size, [y1, x1, y2, x2]) values are normalized to [0, 1]
            gt_labels (batch_size, gt_box_size)
        hyper_params = dictionary
        input_processor = function for preparing the image for input. It is obtained from the backbone.
        mode = "training" or "inference"
    outputs:
        input_img = (batch_size, height, width, channels) image preprocessed by input_processor
        bbox_deltas = (batch_size, output_height, output_width, anchor_count * [y1, x1, y2, x2])
            target outputs for the rpn, generated only in training mode
        bbox_labels = (batch_size, output_height, output_width, anchor_count)
            target outputs for the rpn, generated only in training mode
        anchors = (batch_size, output_height * output_width * anchor_count, [y1, x1, y2, x2])
    """
    img, gt_boxes, gt_labels = image_data
    batch_size = tf.shape(img)[0]
    input_img = input_processor(img)
    stride = hyper_params["stride"]
    anchor_count = hyper_params["anchor_count"]
    total_pos_bboxes = hyper_params["total_pos_bboxes"]
    total_neg_bboxes = hyper_params["total_neg_bboxes"]
    total_bboxes = total_pos_bboxes + total_neg_bboxes
    img_params = helpers.get_image_params(img, stride)
    height, width, output_height, output_width = img_params
    total_anchors = output_height * output_width * anchor_count
    anchors = generate_anchors(img_params, hyper_params)
    # We use the same anchors for every image, so we tile them batch_size times
    anchors = tf.reshape(tf.tile(anchors, (batch_size, 1)), (batch_size, total_anchors, 4))
    if mode != "training":
        return input_img, anchors
    ################################################################################################################
    pos_bbox_indices, neg_bbox_indices, gt_box_indices = helpers.get_selected_indices(
        anchors, gt_boxes, total_pos_bboxes, total_neg_bboxes)
    # Map every selected anchor to its ground truth box
    gt_boxes_map = helpers.get_gt_boxes_map(gt_boxes, gt_box_indices, batch_size, total_neg_bboxes)
    # Positive anchors get label 1, negative anchors label 0
    pos_labels_map = tf.ones((batch_size, total_pos_bboxes), tf.int32)
    neg_labels_map = tf.zeros((batch_size, total_neg_bboxes), tf.int32)
    gt_labels_map = tf.concat([pos_labels_map, neg_labels_map], axis=1)
    #
    bbox_indices = tf.concat([pos_bbox_indices, neg_bbox_indices], axis=1)
    # Scatter the selected gt boxes back into a tensor shaped like the anchors
    flatted_batch_indices = helpers.get_tiled_indices(batch_size, total_bboxes)
    flatted_bbox_indices = tf.reshape(bbox_indices, (-1, 1))
    scatter_indices = helpers.get_scatter_indices_for_bboxes(
        [flatted_batch_indices, flatted_bbox_indices], batch_size, total_bboxes)
    expanded_gt_boxes = tf.scatter_nd(scatter_indices, gt_boxes_map, tf.shape(anchors))
    # Regression targets between anchors and ground truth boxes
    bbox_deltas = helpers.get_deltas_from_bboxes(anchors, expanded_gt_boxes)
    # Unselected anchors keep label -1 and are ignored by the loss
    bbox_labels = tf.negative(tf.ones((batch_size, total_anchors), tf.int32))
    bbox_labels = tf.tensor_scatter_nd_update(bbox_labels, scatter_indices, gt_labels_map)
    #
    bbox_deltas = tf.reshape(bbox_deltas, (batch_size, output_height, output_width, anchor_count * 4))
    bbox_labels = tf.reshape(bbox_labels, (batch_size, output_height, output_width, anchor_count))
    #
    return input_img, bbox_deltas, bbox_labels, anchors
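# helpers.get_image_params is not defined here either; from the unpacking above it
# evidently returns the image size together with the feature map size implied by the
# backbone stride. A minimal sketch:
def get_image_params(img, stride):
    height, width = tf.shape(img)[1], tf.shape(img)[2]
    output_height, output_width = height // stride, width // stride
    return height, width, output_height, output_width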
def get_step_data(image_data, anchors, hyper_params, input_processor):
    """Generate one step of data for training or inference. Batch operations are supported.
    inputs:
        image_data = img (batch_size, height, width, channels)
            gt_boxes (batch_size, gt_box_size, [y1, x1, y2, x2]) values are normalized to [0, 1]
            gt_labels (batch_size, gt_box_size)
        anchors = (total_anchors, [y1, x1, y2, x2]) values are normalized to [0, 1]
        hyper_params = dictionary
        input_processor = function for preparing the image for input. It is obtained from the backbone.
    outputs:
        input_img = (batch_size, height, width, channels) image preprocessed by input_processor
        bbox_deltas = (batch_size, output_height, output_width, anchor_count * [delta_y, delta_x, delta_h, delta_w])
        bbox_labels = (batch_size, output_height, output_width, anchor_count)
    """
    img, gt_boxes, gt_labels = image_data
    batch_size, image_height, image_width = tf.shape(img)[0], tf.shape(img)[1], tf.shape(img)[2]
    input_img = input_processor(img)
    input_img = tf.image.convert_image_dtype(input_img, tf.float32)
    stride = hyper_params["stride"]
    anchor_count = hyper_params["anchor_count"]
    total_pos_bboxes = hyper_params["total_pos_bboxes"]
    total_neg_bboxes = hyper_params["total_neg_bboxes"]
    variances = hyper_params["variances"]
    output_height, output_width = image_height // stride, image_width // stride
    total_anchors = anchors.shape[0]
    # Calculate iou values between each bbox and the ground truth boxes
    iou_map = helpers.generate_iou_map(anchors, gt_boxes)
    # Get the index of the best-matching gt box for each anchor
    max_indices_each_row = tf.argmax(iou_map, axis=2, output_type=tf.int32)
    # Get the index of the best-matching anchor for each gt box
    max_indices_each_column = tf.argmax(iou_map, axis=1, output_type=tf.int32)
    # The iou map holds one value per gt box; merge them by taking the max over the gt axis
    merged_iou_map = tf.reduce_max(iou_map, axis=2)
    #
    pos_mask = tf.greater(merged_iou_map, 0.7)
    # Ensure every valid gt box gets at least one positive anchor
    valid_indices_cond = tf.not_equal(gt_labels, -1)
    valid_indices = tf.cast(tf.where(valid_indices_cond), tf.int32)
    valid_max_indices = max_indices_each_column[valid_indices_cond]
    #
    scatter_bbox_indices = tf.stack([valid_indices[..., 0], valid_max_indices], 1)
    max_pos_mask = tf.scatter_nd(scatter_bbox_indices,
                                 tf.fill((tf.shape(valid_indices)[0], ), True), tf.shape(pos_mask))
    pos_mask = tf.logical_or(pos_mask, max_pos_mask)
    pos_mask = helpers.randomly_select_xyz_mask(
        pos_mask, tf.tile(tf.constant([total_pos_bboxes], dtype=tf.int32), (batch_size, )))
    # Fill the remainder of the sample quota with negatives
    pos_count = tf.reduce_sum(tf.cast(pos_mask, tf.int32), axis=-1)
    neg_count = (total_pos_bboxes + total_neg_bboxes) - pos_count
    #
    neg_mask = tf.logical_and(tf.less(merged_iou_map, 0.3), tf.logical_not(pos_mask))
    neg_mask = helpers.randomly_select_xyz_mask(neg_mask, neg_count)
    # Positives get label 1, negatives 0, everything else -1 (ignored by the loss)
    pos_labels = tf.where(pos_mask, tf.ones_like(pos_mask, dtype=tf.float32), tf.constant(-1.0, dtype=tf.float32))
    neg_labels = tf.cast(neg_mask, dtype=tf.float32)
    bbox_labels = tf.add(pos_labels, neg_labels)
    #
    gt_boxes_map = tf.gather(gt_boxes, max_indices_each_row, batch_dims=1)
    # Replace the gt boxes of non-positive anchors with zeros
    expanded_gt_boxes = tf.where(tf.expand_dims(pos_mask, -1), gt_boxes_map, tf.zeros_like(gt_boxes_map))
    # Calculate delta values between anchors and ground truth bboxes
    bbox_deltas = helpers.get_deltas_from_bboxes(anchors, expanded_gt_boxes) / variances
    #
    bbox_deltas = tf.reshape(bbox_deltas, (batch_size, output_height, output_width, anchor_count * 4))
    bbox_labels = tf.reshape(bbox_labels, (batch_size, output_height, output_width, anchor_count))
    #
    return input_img, bbox_deltas, bbox_labels
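# helpers.generate_iou_map and helpers.get_deltas_from_bboxes are used throughout but not
# shown in this section. The sketches below assume [y1, x1, y2, x2] boxes and follow the
# standard pairwise-IoU and Faster R-CNN (delta_y, delta_x, delta_h, delta_w) encodings;
# the repo's own helpers may differ in details such as epsilon handling.
def generate_iou_map(bboxes, gt_boxes):
    # bboxes (..., total_bboxes, 4) vs gt_boxes (batch_size, total_gt_boxes, 4)
    # -> iou map (batch_size, total_bboxes, total_gt_boxes)
    bbox_y1, bbox_x1, bbox_y2, bbox_x2 = tf.split(bboxes, 4, axis=-1)
    gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(gt_boxes, 4, axis=-1)
    bbox_area = tf.squeeze((bbox_y2 - bbox_y1) * (bbox_x2 - bbox_x1), axis=-1)
    gt_area = tf.squeeze((gt_y2 - gt_y1) * (gt_x2 - gt_x1), axis=-1)
    # Broadcast bboxes against gt boxes to get every pairwise intersection
    y_top = tf.maximum(bbox_y1, tf.transpose(gt_y1, (0, 2, 1)))
    x_top = tf.maximum(bbox_x1, tf.transpose(gt_x1, (0, 2, 1)))
    y_bottom = tf.minimum(bbox_y2, tf.transpose(gt_y2, (0, 2, 1)))
    x_bottom = tf.minimum(bbox_x2, tf.transpose(gt_x2, (0, 2, 1)))
    intersection = tf.maximum(y_bottom - y_top, 0) * tf.maximum(x_bottom - x_top, 0)
    union = tf.expand_dims(bbox_area, -1) + tf.expand_dims(gt_area, -2) - intersection
    return intersection / (union + 1e-7)

def get_deltas_from_bboxes(bboxes, gt_boxes):
    # Encode gt boxes relative to anchors / rois as (delta_y, delta_x, delta_h, delta_w)
    bbox_height = bboxes[..., 2] - bboxes[..., 0]
    bbox_width = bboxes[..., 3] - bboxes[..., 1]
    bbox_ctr_y = bboxes[..., 0] + 0.5 * bbox_height
    bbox_ctr_x = bboxes[..., 1] + 0.5 * bbox_width
    gt_height = gt_boxes[..., 2] - gt_boxes[..., 0]
    gt_width = gt_boxes[..., 3] - gt_boxes[..., 1]
    gt_ctr_y = gt_boxes[..., 0] + 0.5 * gt_height
    gt_ctr_x = gt_boxes[..., 1] + 0.5 * gt_width
    # Guard against degenerate anchors; padded gt rows (all zeros) get zero deltas
    bbox_height = tf.maximum(bbox_height, 1e-7)
    bbox_width = tf.maximum(bbox_width, 1e-7)
    delta_y = tf.where(tf.equal(gt_height, 0.0), tf.zeros_like(gt_height),
                       (gt_ctr_y - bbox_ctr_y) / bbox_height)
    delta_x = tf.where(tf.equal(gt_width, 0.0), tf.zeros_like(gt_width),
                       (gt_ctr_x - bbox_ctr_x) / bbox_width)
    delta_h = tf.where(tf.equal(gt_height, 0.0), tf.zeros_like(gt_height),
                       tf.math.log(gt_height / bbox_height))
    delta_w = tf.where(tf.equal(gt_width, 0.0), tf.zeros_like(gt_width),
                       tf.math.log(gt_width / bbox_width))
    return tf.stack([delta_y, delta_x, delta_h, delta_w], axis=-1)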