def build_model_graph(features, labels, is_training, params):
  """Assembles the detection (and optional mask) forward graph.

  Stages are added in this order: ResNet backbone, FPN, RPN head, proposal
  generation, proposal sampling (training only), RoIAlign + box head, and
  then either detection post-processing (inference) or box-target encoding
  (training).  A mask head is appended when `params['include_mask']` is set.

  `features` is modified in place: `features['images']` is replaced by its
  transposed version when `params['transpose_input']` is set during training,
  and `features['source_ids']` is filled with -1 when the key is absent.

  Args:
    features: dict with at least 'images' and 'image_info'; 'source_ids' is
      optional.
    labels: dict; 'gt_boxes' and 'gt_classes' are read when training, plus
      'cropped_gt_masks' when masks are enabled.
    is_training: whether to build the training variant of the graph.
    params: dict of hyper-parameters, looked up by key throughout the body.

  Returns:
    A dict of outputs.  Inference: 'detections', 'box_features' (only when
    `params['output_box_features']`), and sigmoid-activated 'mask_outputs'
    when masks are enabled.  Training: 'rpn_score_outputs',
    'rpn_box_outputs', 'class_outputs', 'box_outputs', 'class_targets',
    'box_targets', 'box_rois', and, when masks are enabled, 'mask_outputs',
    'mask_targets' and 'selected_class_targets'.
  """
  model_outputs = {}

  # NOTE(review): the permutation moves the last axis to the front, so the
  # transposed input presumably arrives as [height, width, channels, batch]
  # -- confirm against the input pipeline.
  if params['transpose_input'] and is_training:
    features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
  images = features['images']
  batch_size, image_height, image_width, _ = images.get_shape().as_list()
  if 'source_ids' not in features:
    features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)

  anchor_set = anchors.Anchors(
      params['min_level'], params['max_level'], params['num_scales'],
      params['aspect_ratios'], params['anchor_scale'],
      (image_height, image_width))

  # Backbone.
  with tf.variable_scope('resnet%s' % params['resnet_depth']):
    backbone_builder = resnet.resnet_v1(
        params['resnet_depth'],
        num_batch_norm_group=params['num_batch_norm_group'])
    backbone_endpoints = backbone_builder(
        images, (params['is_training_bn'] and is_training))

  # Feature pyramid and region proposal head.
  pyramid_feats = fpn.fpn(
      backbone_endpoints, params['min_level'], params['max_level'])
  rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
      pyramid_feats, params['min_level'], params['max_level'],
      len(params['aspect_ratios'] * params['num_scales']))

  # Training and inference use separate top-N budgets around NMS.
  if is_training:
    topn_keys = ('rpn_pre_nms_topn', 'rpn_post_nms_topn')
  else:
    topn_keys = ('test_rpn_pre_nms_topn', 'test_rpn_post_nms_topn')
  pre_nms_topn, post_nms_topn = [params[key] for key in topn_keys]

  proposal_scores, proposal_rois = roi_ops.multilevel_propose_rois(
      rpn_score_outputs,
      rpn_box_outputs,
      anchor_set,
      features['image_info'],
      pre_nms_topn,
      post_nms_topn,
      params['rpn_nms_threshold'],
      params['rpn_min_size'],
      bbox_reg_weights=None,
      use_tpu=params['use_tpu'])
  proposal_rois = tf.to_float(proposal_rois)

  if is_training:
    proposal_rois = tf.stop_gradient(proposal_rois)
    # The scores are not consumed below; the op is still created so the
    # graph matches what was built before.
    proposal_scores = tf.stop_gradient(proposal_scores)

    # Sample proposals and match them to ground truth.
    box_targets, class_targets, proposal_rois, proposal_to_label_map = (
        training_ops.proposal_label_op(
            proposal_rois,
            labels['gt_boxes'],
            labels['gt_classes'],
            features['image_info'],
            batch_size_per_im=params['batch_size_per_im'],
            fg_fraction=params['fg_fraction'],
            fg_thresh=params['fg_thresh'],
            bg_thresh_hi=params['bg_thresh_hi'],
            bg_thresh_lo=params['bg_thresh_lo']))

  # Multi-level RoIAlign followed by the box head.
  box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      pyramid_feats, proposal_rois, output_size=7)
  class_outputs, box_outputs, _ = heads.box_head(
      box_roi_features,
      num_classes=params['num_classes'],
      mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

  if is_training:
    encoded_box_targets = training_ops.encode_box_targets(
        proposal_rois, box_targets, class_targets, params['bbox_reg_weights'])
    model_outputs.update({
        'rpn_score_outputs': rpn_score_outputs,
        'rpn_box_outputs': rpn_box_outputs,
        'class_outputs': class_outputs,
        'box_outputs': box_outputs,
        'class_targets': class_targets,
        'box_targets': encoded_box_targets,
        'box_rois': proposal_rois,
    })
  else:
    # Both post-processing variants take the same arguments.
    if params['use_tpu']:
      detect_fn = postprocess_ops.generate_detections_tpu
    else:
      detect_fn = postprocess_ops.generate_detections_gpu
    detections = detect_fn(
        class_outputs,
        box_outputs,
        proposal_rois,
        features['source_ids'],
        features['image_info'],
        params['test_rpn_post_nms_topn'],
        params['test_detections_per_image'],
        params['test_nms'],
        params['bbox_reg_weights'])
    model_outputs['detections'] = tf.identity(detections, 'Detections')

    if params['output_box_features']:
      # Re-run RoIAlign and the box head on the final boxes and keep only
      # the third head output.
      final_box_rois = detections[:, :, 1:5]
      final_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
          pyramid_feats, final_box_rois, output_size=7)
      _, _, final_box_features = heads.box_head(
          final_roi_features,
          num_classes=params['num_classes'],
          mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
      model_outputs['box_features'] = tf.identity(
          final_box_features, 'BoxFeatures')

  # Faster-RCNN mode: no mask branch.
  if not params['include_mask']:
    return model_outputs

  # Choose the boxes the mask head runs on.
  if is_training:
    (selected_class_targets, selected_box_targets, selected_box_rois,
     proposal_to_label_map) = training_ops.select_fg_for_masks(
         class_targets,
         box_targets,
         proposal_rois,
         proposal_to_label_map,
         max_num_fg=int(params['batch_size_per_im'] * params['fg_fraction']))
    class_indices = tf.to_int32(selected_class_targets)
  else:
    selected_box_rois = detections[:, :, 1:5]
    class_indices = tf.to_int32(detections[:, :, 6])

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      pyramid_feats, selected_box_rois, output_size=14)
  mask_outputs = heads.mask_head(
      mask_roi_features,
      class_indices,
      num_classes=params['num_classes'],
      mrcnn_resolution=params['mrcnn_resolution'])

  if is_training:
    model_outputs['mask_outputs'] = mask_outputs
    mask_targets = training_ops.get_mask_targets(
        selected_box_rois, proposal_to_label_map, selected_box_targets,
        labels['cropped_gt_masks'], params['mrcnn_resolution'])
    model_outputs.update({
        'mask_targets': mask_targets,
        'selected_class_targets': selected_class_targets,
    })
  else:
    # At inference the head output is passed through a sigmoid and exposed
    # under the op name 'Masks'.
    model_outputs['mask_outputs'] = tf.identity(
        tf.nn.sigmoid(mask_outputs), 'Masks')
  return model_outputs
def build_model_graph(features, labels, is_training, params):
  """Builds the forward model graph.

  Stages are added in this order: backbone (ResNet or MnasNet, chosen by
  `params['backbone']`), FPN, RPN head, proposal generation, proposal
  sampling (training only), RoIAlign + box head, and then either detection
  post-processing (inference) or box-target encoding (training).  A mask head
  is appended when `params['include_mask']` is set.

  `features` is modified in place: `features['images']` is replaced by its
  transposed version when training with `params['transpose_input']`, and
  `features['source_ids']` is filled with -1 when the key is absent.

  Args:
    features: dict with at least 'images' and 'image_info'; 'source_ids' is
      optional.
    labels: dict; 'gt_boxes' and 'gt_classes' are read when training, plus
      'cropped_gt_masks' when masks are enabled.
    is_training: whether to build the training variant of the graph.
    params: dict of hyper-parameters, looked up by key throughout the body.

  Returns:
    A dict of outputs that always contains 'fpn_features'.  Inference adds
    'num_detections', 'detection_boxes', 'detection_classes',
    'detection_scores', and 'detection_masks' when masks are enabled.
    Training adds 'rpn_score_outputs', 'rpn_box_outputs', 'class_outputs',
    'box_outputs', 'class_targets', 'box_targets', 'box_rois', and, when
    masks are enabled, 'mask_outputs', 'mask_targets' and
    'selected_class_targets'.

  Raises:
    ValueError: if `params['backbone']` contains neither 'resnet' nor
      'mnasnet'.
  """
  # Batched NMS is only used off-TPU; the GPU inference code paths are taken
  # only when, in addition, the graph is not built for training.
  use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
  is_gpu_inference = (not is_training and use_batched_nms)
  model_outputs = {}

  # NOTE(review): the permutation moves axis 2 to the front, so the
  # transposed input presumably arrives as [height, width, batch, channels]
  # -- confirm against the input pipeline.
  if is_training and params['transpose_input']:
    features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
  batch_size, image_height, image_width, _ = (
      features['images'].get_shape().as_list())

  # Handles space-to-depth transform.  A block size of 0 means the transform
  # is not applied (this is the value always used outside training).
  conv0_space_to_depth_block_size = 0
  if is_training:
    conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
    # Only rescale when the transform is actually enabled; multiplying by a
    # block size of 0 would hand a zero-sized image to the anchor generator.
    if conv0_space_to_depth_block_size != 0:
      image_height *= conv0_space_to_depth_block_size
      image_width *= conv0_space_to_depth_block_size

  if 'source_ids' not in features:
    features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)

  all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                params['num_scales'], params['aspect_ratios'],
                                params['anchor_scale'],
                                (image_height, image_width))

  # Backbone: the backbone name doubles as the variable scope.
  if 'resnet' in params['backbone']:
    with tf.variable_scope(params['backbone']):
      resnet_fn = resnet.resnet_v1(
          params['backbone'],
          conv0_kernel_size=params['conv0_kernel_size'],
          conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
          num_batch_norm_group=params['num_batch_norm_group'])
      backbone_feats = resnet_fn(
          features['images'],
          (params['is_training_bn'] and is_training))
  elif 'mnasnet' in params['backbone']:
    with tf.variable_scope(params['backbone']):
      _, endpoints = mnasnet_models.build_mnasnet_base(
          features['images'],
          params['backbone'],
          training=(params['is_training_bn'] and is_training),
          override_params={'use_keras': False})

      # Re-key the MnasNet endpoints by level so they match what the FPN is
      # given in the ResNet branch.
      backbone_feats = {
          2: endpoints['reduction_2'],
          3: endpoints['reduction_3'],
          4: endpoints['reduction_4'],
          5: endpoints['reduction_5'],
      }
  else:
    raise ValueError('Not a valid backbone option: %s' % params['backbone'])

  fpn_feats = fpn.fpn(backbone_feats, params['min_level'], params['max_level'])
  model_outputs.update({
      'fpn_features': fpn_feats,
  })

  rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
      fpn_feats,
      params['min_level'], params['max_level'],
      len(params['aspect_ratios'] * params['num_scales']))

  # Training and inference use separate top-N budgets around NMS.
  if is_training:
    rpn_pre_nms_topn = params['rpn_pre_nms_topn']
    rpn_post_nms_topn = params['rpn_post_nms_topn']
  else:
    rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
    rpn_post_nms_topn = params['test_rpn_post_nms_topn']

  rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
      rpn_score_outputs,
      rpn_box_outputs,
      all_anchors,
      features['image_info'],
      rpn_pre_nms_topn,
      rpn_post_nms_topn,
      params['rpn_nms_threshold'],
      params['rpn_min_size'],
      bbox_reg_weights=None,
      use_batched_nms=use_batched_nms)
  rpn_box_rois = tf.to_float(rpn_box_rois)
  if is_training:
    rpn_box_rois = tf.stop_gradient(rpn_box_rois)
    # The scores are not consumed below; the op is kept so the graph is
    # unchanged.
    rpn_box_scores = tf.stop_gradient(rpn_box_scores)

  if is_training:
    # Sampling
    box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
        training_ops.proposal_label_op(
            rpn_box_rois,
            labels['gt_boxes'],
            labels['gt_classes'],
            features['image_info'],
            batch_size_per_im=params['batch_size_per_im'],
            fg_fraction=params['fg_fraction'],
            fg_thresh=params['fg_thresh'],
            bg_thresh_hi=params['bg_thresh_hi'],
            bg_thresh_lo=params['bg_thresh_lo']))

  # Performs multi-level RoIAlign.
  box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_feats, rpn_box_rois, output_size=7,
      is_gpu_inference=is_gpu_inference)

  class_outputs, box_outputs, _ = heads.box_head(
      box_roi_features, num_classes=params['num_classes'],
      mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

  if not is_training:
    if is_gpu_inference:
      generate_detections_fn = postprocess_ops.generate_detections_gpu
    else:
      generate_detections_fn = postprocess_ops.generate_detections_tpu
    detections = generate_detections_fn(
        class_outputs,
        box_outputs,
        rpn_box_rois,
        features['image_info'],
        params['test_rpn_post_nms_topn'],
        params['test_detections_per_image'],
        params['test_nms'],
        params['bbox_reg_weights'])

    # `detections` is indexed positionally: count, boxes, classes, scores.
    model_outputs.update({
        'num_detections': detections[0],
        'detection_boxes': detections[1],
        'detection_classes': detections[2],
        'detection_scores': detections[3],
    })
  else:
    encoded_box_targets = training_ops.encode_box_targets(
        rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
    model_outputs.update({
        'rpn_score_outputs': rpn_score_outputs,
        'rpn_box_outputs': rpn_box_outputs,
        'class_outputs': class_outputs,
        'box_outputs': box_outputs,
        'class_targets': class_targets,
        'box_targets': encoded_box_targets,
        'box_rois': rpn_box_rois,
    })

  # Faster-RCNN mode.
  if not params['include_mask']:
    return model_outputs

  # Mask sampling
  if not is_training:
    selected_box_rois = model_outputs['detection_boxes']
    class_indices = model_outputs['detection_classes']
    # If using GPU for inference, delay the cast until the Gather ops show
    # up, since GPU inference supports floating point better.
    # TODO(laigd): revisit this when newer versions of the GPU libraries are
    # released.
    if not is_gpu_inference:
      class_indices = tf.to_int32(class_indices)
  else:
    (selected_class_targets, selected_box_targets, selected_box_rois,
     proposal_to_label_map) = (
         training_ops.select_fg_for_masks(
             class_targets, box_targets, rpn_box_rois,
             proposal_to_label_map,
             max_num_fg=int(
                 params['batch_size_per_im'] * params['fg_fraction'])))
    class_indices = tf.to_int32(selected_class_targets)

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      fpn_feats, selected_box_rois, output_size=14,
      is_gpu_inference=is_gpu_inference)
  mask_outputs = heads.mask_head(
      mask_roi_features,
      class_indices,
      num_classes=params['num_classes'],
      mrcnn_resolution=params['mrcnn_resolution'],
      is_gpu_inference=is_gpu_inference)

  if is_training:
    mask_targets = training_ops.get_mask_targets(
        selected_box_rois, proposal_to_label_map, selected_box_targets,
        labels['cropped_gt_masks'], params['mrcnn_resolution'])
    model_outputs.update({
        'mask_outputs': mask_outputs,
        'mask_targets': mask_targets,
        'selected_class_targets': selected_class_targets,
    })
  else:
    # At inference the head output is passed through a sigmoid.
    model_outputs.update({
        'detection_masks': tf.nn.sigmoid(mask_outputs),
    })

  return model_outputs
def _model_outputs():
  """Builds the model graph and collects its output tensors.

  Takes no arguments: `params`, `features`, `labels`, `mode` and
  `all_anchors` are read from the enclosing scope.  Stages are added in this
  order: ResNet backbone, FPN, RPN head, proposal generation, proposal
  sampling (TRAIN mode only), RoIAlign + box head, and then either detection
  post-processing (non-TRAIN modes) or box-target encoding (TRAIN mode).  A
  mask head is appended when `params['include_mask']` is set.

  Returns:
    A dict of outputs.  Non-TRAIN modes: 'detections', 'box_features' (only
    when `params['output_box_features']`), and 'mask_outputs' when masks are
    enabled.  TRAIN mode: 'rpn_score_outputs', 'rpn_box_outputs',
    'class_outputs', 'box_outputs', 'class_targets', 'box_targets',
    'box_rois', and, when masks are enabled, 'mask_outputs', 'mask_targets'
    and 'selected_class_targets'.
  """
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  model_outputs = {}

  # Backbone.
  with tf.variable_scope('resnet%s' % params['resnet_depth']):
    backbone_builder = resnet.resnet_v1(
        params['resnet_depth'],
        num_batch_norm_group=params['num_batch_norm_group'])
    backbone_endpoints = backbone_builder(
        features['images'], params['is_training_bn'])

  # Feature pyramid and region proposal head.
  pyramid_feats = fpn.fpn(
      backbone_endpoints, params['min_level'], params['max_level'])
  rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
      pyramid_feats, params['min_level'], params['max_level'],
      len(params['aspect_ratios'] * params['num_scales']))

  # TRAIN and the other modes use separate top-N budgets around NMS.
  if is_training:
    topn_keys = ('rpn_pre_nms_topn', 'rpn_post_nms_topn')
  else:
    topn_keys = ('test_rpn_pre_nms_topn', 'test_rpn_post_nms_topn')
  pre_nms_topn, post_nms_topn = [params[key] for key in topn_keys]

  proposal_scores, proposal_rois = roi_ops.multilevel_propose_rois(
      rpn_score_outputs,
      rpn_box_outputs,
      all_anchors,
      features['image_info'],
      pre_nms_topn,
      post_nms_topn,
      params['rpn_nms_threshold'],
      params['rpn_min_size'],
      bbox_reg_weights=None,
      use_tpu=params['use_tpu'])
  proposal_rois = tf.to_float(proposal_rois)

  if is_training:
    proposal_rois = tf.stop_gradient(proposal_rois)
    # The scores are not consumed below; the op is still created so the
    # graph matches what was built before.
    proposal_scores = tf.stop_gradient(proposal_scores)

    # Sample proposals and match them to ground truth.
    box_targets, class_targets, proposal_rois, proposal_to_label_map = (
        training_ops.proposal_label_op(
            proposal_rois,
            labels['gt_boxes'],
            labels['gt_classes'],
            features['image_info'],
            batch_size_per_im=params['batch_size_per_im'],
            fg_fraction=params['fg_fraction'],
            fg_thresh=params['fg_thresh'],
            bg_thresh_hi=params['bg_thresh_hi'],
            bg_thresh_lo=params['bg_thresh_lo']))

  # Multi-level RoIAlign followed by the box head.
  box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      pyramid_feats, proposal_rois, output_size=7)
  class_outputs, box_outputs, _ = heads.box_head(
      box_roi_features,
      num_classes=params['num_classes'],
      mlp_head_dim=params['fast_rcnn_mlp_head_dim'])

  if is_training:
    encoded_box_targets = training_ops.encode_box_targets(
        proposal_rois, box_targets, class_targets, params['bbox_reg_weights'])
    model_outputs.update({
        'rpn_score_outputs': rpn_score_outputs,
        'rpn_box_outputs': rpn_box_outputs,
        'class_outputs': class_outputs,
        'box_outputs': box_outputs,
        'class_targets': class_targets,
        'box_targets': encoded_box_targets,
        'box_rois': proposal_rois,
    })
  else:
    # Both post-processing variants take the same arguments.
    if params['use_tpu']:
      detect_fn = postprocess_ops.generate_detections_tpu
    else:
      detect_fn = postprocess_ops.generate_detections_gpu
    raw_detections = detect_fn(
        class_outputs,
        box_outputs,
        proposal_rois,
        features['source_ids'],
        features['image_info'],
        params['test_rpn_post_nms_topn'],
        params['test_detections_per_image'],
        params['test_nms'],
        params['bbox_reg_weights'])
    # Everything downstream reads from the named identity tensor.
    detections = tf.identity(raw_detections, 'Detections')
    model_outputs['detections'] = detections

    if params['output_box_features']:
      # Re-run RoIAlign and the box head on the final boxes and keep only
      # the third head output.
      final_box_rois = detections[:, :, 1:5]
      final_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
          pyramid_feats, final_box_rois, output_size=7)
      _, _, raw_box_features = heads.box_head(
          final_roi_features,
          num_classes=params['num_classes'],
          mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
      model_outputs['box_features'] = tf.identity(
          raw_box_features, 'BoxFeatures')

  # Faster-RCNN mode: no mask branch.
  if not params['include_mask']:
    return model_outputs

  # Choose the boxes the mask head runs on.
  if is_training:
    (selected_class_targets, selected_box_targets, selected_box_rois,
     proposal_to_label_map) = training_ops.select_fg_for_masks(
         class_targets,
         box_targets,
         proposal_rois,
         proposal_to_label_map,
         max_num_fg=int(params['batch_size_per_im'] * params['fg_fraction']))
    class_indices = tf.to_int32(selected_class_targets)
  else:
    selected_box_rois = detections[:, :, 1:5]
    class_indices = tf.to_int32(detections[:, :, 6])

  mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
      pyramid_feats, selected_box_rois, output_size=14)
  raw_mask_outputs = heads.mask_head(
      mask_roi_features,
      class_indices,
      num_classes=params['num_classes'],
      mrcnn_resolution=params['mrcnn_resolution'])
  # The head output is exposed under the op name 'Masks' in every mode.
  model_outputs['mask_outputs'] = tf.identity(raw_mask_outputs, 'Masks')

  if is_training:
    mask_targets = training_ops.get_mask_targets(
        selected_box_rois, proposal_to_label_map, selected_box_targets,
        labels['cropped_gt_masks'], params['mrcnn_resolution'])
    model_outputs.update({
        'mask_targets': mask_targets,
        'selected_class_targets': selected_class_targets,
    })
  return model_outputs