def build_model_graph(features, labels, is_training, params):
    """Builds the forward model graph."""
    model_outputs = {}
    is_gpu_inference = not is_training and params['use_batched_nms']

    batch_size, image_height, image_width, _ = features['images'].get_shape().as_list()

    if 'source_ids' not in features:
        features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)

    all_anchors = anchors.Anchors(
        params['min_level'], params['max_level'],
        params['num_scales'], params['aspect_ratios'],
        params['anchor_scale'],
        (image_height, image_width)
    )

    # ResNet-50 backbone.
    MODELS["backbone"] = resnet.Resnet_Model(
        "resnet50",
        data_format='channels_last',
        trainable=is_training,
        finetune_bn=params['finetune_bn']
    )

    backbone_feats = MODELS["backbone"](features['images'], training=is_training)

    # Feature Pyramid Network on top of the backbone features.
    MODELS["FPN"] = fpn.FPNNetwork(params['min_level'], params['max_level'], trainable=is_training)

    fpn_feats = MODELS["FPN"](backbone_feats, training=is_training)

    model_outputs.update({'fpn_features': fpn_feats})

    def rpn_head_fn(features, min_level=2, max_level=6, num_anchors=3):
        """Region Proposal Network (RPN) for Mask-RCNN."""
        scores_outputs = dict()
        box_outputs = dict()

        MODELS["RPN_Heads"] = heads.RPN_Head_Model(
            name="rpn_head",
            num_anchors=num_anchors,
            trainable=is_training
        )

        # The same head is shared across all FPN levels.
        for level in range(min_level, max_level + 1):
            scores_outputs[level], box_outputs[level] = MODELS["RPN_Heads"](
                features[level],
                training=is_training
            )

        return scores_outputs, box_outputs

    rpn_score_outputs, rpn_box_outputs = rpn_head_fn(
        features=fpn_feats,
        min_level=params['min_level'],
        max_level=params['max_level'],
        num_anchors=len(params['aspect_ratios']) * params['num_scales']  # anchors per location
    )

    if is_training:
        rpn_pre_nms_topn = params['train_rpn_pre_nms_topn']
        rpn_post_nms_topn = params['train_rpn_post_nms_topn']
        rpn_nms_threshold = params['train_rpn_nms_threshold']
    else:
        rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
        rpn_post_nms_topn = params['test_rpn_post_nms_topn']
        rpn_nms_threshold = params['test_rpn_nms_thresh']

    if params['use_custom_box_proposals_op']:
        rpn_box_scores, rpn_box_rois = roi_ops.custom_multilevel_propose_rois(
            scores_outputs=rpn_score_outputs,
            box_outputs=rpn_box_outputs,
            all_anchors=all_anchors,
            image_info=features['image_info'],
            rpn_pre_nms_topn=rpn_pre_nms_topn,
            rpn_post_nms_topn=rpn_post_nms_topn,
            rpn_nms_threshold=rpn_nms_threshold,
            rpn_min_size=params['rpn_min_size']
        )
    else:
        rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
            scores_outputs=rpn_score_outputs,
            box_outputs=rpn_box_outputs,
            all_anchors=all_anchors,
            image_info=features['image_info'],
            rpn_pre_nms_topn=rpn_pre_nms_topn,
            rpn_post_nms_topn=rpn_post_nms_topn,
            rpn_nms_threshold=rpn_nms_threshold,
            rpn_min_size=params['rpn_min_size'],
            bbox_reg_weights=None,
            use_batched_nms=params['use_batched_nms']
        )

    rpn_box_rois = tf.cast(rpn_box_rois, dtype=tf.float32)

    if is_training:
        # Proposals are treated as fixed inputs to the second stage.
        rpn_box_rois = tf.stop_gradient(rpn_box_rois)
        rpn_box_scores = tf.stop_gradient(rpn_box_scores)  # TODO(jonathan): unused -- keep?

        # Sampling: match proposals to ground-truth boxes and subsample a
        # fixed-size set of foreground/background RoIs per image.
        box_targets, class_targets, rpn_box_rois, proposal_to_label_map = training_ops.proposal_label_op(
            rpn_box_rois,
            labels['gt_boxes'],
            labels['gt_classes'],
            batch_size_per_im=params['batch_size_per_im'],
            fg_fraction=params['fg_fraction'],
            fg_thresh=params['fg_thresh'],
            bg_thresh_hi=params['bg_thresh_hi'],
            bg_thresh_lo=params['bg_thresh_lo']
        )

    # Performs multi-level RoIAlign.
    box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        features=fpn_feats,
        boxes=rpn_box_rois,
        output_size=7,
        is_gpu_inference=is_gpu_inference
    )

    MODELS["Box_Head"] = heads.Box_Head_Model(
        num_classes=params['num_classes'],
        mlp_head_dim=params['fast_rcnn_mlp_head_dim'],
        trainable=is_training
    )

    class_outputs, box_outputs, _ = MODELS["Box_Head"](inputs=box_roi_features)

    if not is_training:
        if params['use_batched_nms']:
            generate_detections_fn = postprocess_ops.generate_detections_gpu
        else:
            generate_detections_fn = postprocess_ops.generate_detections_tpu

        detections = generate_detections_fn(
            class_outputs=class_outputs,
            box_outputs=box_outputs,
            anchor_boxes=rpn_box_rois,
            image_info=features['image_info'],
            pre_nms_num_detections=params['test_rpn_post_nms_topn'],
            post_nms_num_detections=params['test_detections_per_image'],
            nms_threshold=params['test_nms'],
            bbox_reg_weights=params['bbox_reg_weights']
        )

        model_outputs.update({
            'num_detections': detections[0],
            'detection_boxes': detections[1],
            'detection_classes': detections[2],
            'detection_scores': detections[3],
        })

    else:  # is training
        encoded_box_targets = training_ops.encode_box_targets(
            boxes=rpn_box_rois,
            gt_boxes=box_targets,
            gt_labels=class_targets,
            bbox_reg_weights=params['bbox_reg_weights']
        )

        model_outputs.update({
            'rpn_score_outputs': rpn_score_outputs,
            'rpn_box_outputs': rpn_box_outputs,
            'class_outputs': class_outputs,
            'box_outputs': box_outputs,
            'class_targets': class_targets,
            'box_targets': encoded_box_targets,
            'box_rois': rpn_box_rois,
        })

    # Faster-RCNN mode.
    if not params['include_mask']:
        return model_outputs

    # Mask sampling.
    if not is_training:
        selected_box_rois = model_outputs['detection_boxes']
        class_indices = model_outputs['detection_classes']

        # If using GPU for inference, delay the cast until the Gather ops show
        # up, since GPU inference supports floating point better.
        # TODO(laigd): revisit this when newer versions of the GPU libraries
        # are released.
        if not params['use_batched_nms']:
            class_indices = tf.cast(class_indices, dtype=tf.int32)

    else:
        selected_class_targets, selected_box_targets, selected_box_rois, proposal_to_label_map = \
            training_ops.select_fg_for_masks(
                class_targets=class_targets,
                box_targets=box_targets,
                boxes=rpn_box_rois,
                proposal_to_label_map=proposal_to_label_map,
                max_num_fg=int(params['batch_size_per_im'] * params['fg_fraction'])
            )

        class_indices = tf.cast(selected_class_targets, dtype=tf.int32)

    mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
        features=fpn_feats,
        boxes=selected_box_rois,
        output_size=14,
        is_gpu_inference=is_gpu_inference
    )

    MODELS["Mask_Head"] = heads.Mask_Head_Model(
        class_indices,
        num_classes=params['num_classes'],
        mrcnn_resolution=params['mrcnn_resolution'],
        is_gpu_inference=is_gpu_inference,
        trainable=is_training,
        name="mask_head"
    )

    mask_outputs = MODELS["Mask_Head"](inputs=mask_roi_features)

    if MPI_local_rank() == 0:
        # Print the number of FLOPs in the model.
        compute_model_statistics(batch_size, is_training=is_training)

    if is_training:
        mask_targets = training_ops.get_mask_targets(
            fg_boxes=selected_box_rois,
            fg_proposal_to_label_map=proposal_to_label_map,
            fg_box_targets=selected_box_targets,
            mask_gt_labels=labels['cropped_gt_masks'],
            output_size=params['mrcnn_resolution']
        )

        model_outputs.update({
            'mask_outputs': mask_outputs,
            'mask_targets': mask_targets,
            'selected_class_targets': selected_class_targets,
        })

    else:
        model_outputs.update({
            'detection_masks': tf.nn.sigmoid(mask_outputs),
        })

    return model_outputs
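

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: one plausible way to
# wire build_model_graph() into a tf.estimator model_fn. The function name
# `example_model_fn` and the PREDICT-only wiring below are hypothetical; the
# real model_fn in this codebase also computes the RPN/Fast R-CNN/mask losses
# and sets up the optimizer.
# ---------------------------------------------------------------------------
def example_model_fn(features, labels, mode, params):
    """Hypothetical Estimator model_fn delegating to build_model_graph()."""
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    model_outputs = build_model_graph(features, labels, is_training, params)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # At inference time the graph already emits post-NMS detections.
        predictions = {
            key: model_outputs[key]
            for key in ('num_detections', 'detection_boxes',
                        'detection_classes', 'detection_scores')
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Training/eval would derive losses from the raw outputs and targets in
    # model_outputs ('rpn_score_outputs', 'box_targets', ...); that wiring is
    # deliberately omitted from this sketch.
    raise NotImplementedError("Loss/optimizer wiring is omitted in this sketch.")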
def _get_session_config(mode, use_xla, use_amp, use_tf_distributed=False, allow_xla_at_inference=False):

    assert mode in ('train', 'eval')

    rewrite_options = rewriter_config_pb2.RewriterConfig(
        # arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        # arithmetic_optimization=rewriter_config_pb2.RewriterConfig.ON,

        # constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        # constant_folding=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST

        # debug_stripper=rewriter_config_pb2.RewriterConfig.OFF,
        # debug_stripper=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST

        # dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        # dependency_optimization=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST

        # disable_model_pruning=False,  # INCOMPATIBLE with AMP

        # function_optimization=True,
        # implementation_selector=True,

        # loop_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        # loop_optimization=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST

        # The default setting (SCHEDULING and SWAPPING heuristics only):
        # memory_optimization=rewriter_config_pb2.RewriterConfig.DEFAULT_MEM_OPT,

        # Disabled in the meta-optimizer:
        # memory_optimization=rewriter_config_pb2.RewriterConfig.NO_MEM_OPT,

        # Driven by manual op-level annotations:
        # memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,

        # The swapping heuristic moves a tensor from the GPU to the CPU and
        # moves it back when needed, to reduce peak memory usage:
        # memory_optimization=rewriter_config_pb2.RewriterConfig.SWAPPING_HEURISTICS,

        # The recomputation heuristic recomputes ops (such as Relu activations)
        # during backprop instead of storing them, reducing peak memory usage:
        # memory_optimization=rewriter_config_pb2.RewriterConfig.RECOMPUTATION_HEURISTICS,

        # Scheduling splits big ops such as AddN and tries to enforce a
        # schedule of the new computations that decreases peak memory usage:
        # memory_optimization=rewriter_config_pb2.RewriterConfig.SCHEDULING_HEURISTICS,

        # Use any combination of swapping and recomputation heuristics:
        # memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS,

        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.TWO,
        # meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
        # meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.DEFAULT_NUM_ITERS,

        # pin_to_host_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        # pin_to_host_optimization=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST

        # remapping=rewriter_config_pb2.RewriterConfig.OFF,
        # remapping=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST

        # scoped_allocator_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        # scoped_allocator_optimization=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST

        # shape_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        # shape_optimization=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST
    )

    if use_amp:
        logging.info("[%s] AMP is activated - Experimental Feature" % mode)
        rewrite_options.auto_mixed_precision = True

    config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        graph_options=tf.compat.v1.GraphOptions(
            rewrite_options=rewrite_options,
            # infer_shapes=True  # Reduces throughput by ~30%
        )
    )

    if use_tf_distributed:
        config.gpu_options.force_gpu_compatible = False
    else:
        config.gpu_options.force_gpu_compatible = True  # Force pinned memory

        if MPI_is_distributed():
            config.gpu_options.visible_device_list = str(MPI_local_rank())

    if use_xla and (mode == "train" or allow_xla_at_inference):
        logging.info("[%s] XLA is activated - Experimental Feature" % mode)
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        # config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_2

    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # Avoid a pool of Eigen threads

        if MPI_is_distributed():
            config.inter_op_parallelism_threads = max(2, multiprocessing.cpu_count() // hvd.local_size())
        elif not use_tf_distributed:
            config.inter_op_parallelism_threads = 4

    return config
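

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: the two usual
# consumers of the ConfigProto returned by _get_session_config(). The helper
# name `example_build_run_config` is hypothetical.
# ---------------------------------------------------------------------------
def example_build_run_config():
    """Hypothetical helper showing how the session config is consumed."""
    session_config = _get_session_config(mode='train', use_xla=False, use_amp=True)

    # Option 1: pass the config directly to a raw TF1-style session.
    # with tf.compat.v1.Session(config=session_config) as sess:
    #     ...

    # Option 2: hand it to an Estimator, which forwards it verbatim to every
    # session it creates.
    return tf.estimator.RunConfig(session_config=session_config)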