def draw_side_by_side_evaluation_image(eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2): """Creates a side-by-side image with detections and groundtruth. Bounding boxes (and instance masks, if available) are visualized on both subimages. Args: eval_dict: The evaluation dictionary returned by eval_util.result_dict_for_single_example(). category_index: A category index (dictionary) produced from a labelmap. max_boxes_to_draw: The maximum number of boxes to draw for detections. min_score_thresh: The minimum score threshold for showing detections. Returns: A [1, H, 2 * W, C] uint8 tensor. The subimage on the left corresponds to detections, while the subimage on the right corresponds to groundtruth. """ detection_fields = fields.DetectionResultFields() input_data_fields = fields.InputDataFields() instance_masks = None if detection_fields.detection_masks in eval_dict: instance_masks = tf.cast( tf.expand_dims(eval_dict[detection_fields.detection_masks], axis=0), tf.uint8) keypoints = None if detection_fields.detection_keypoints in eval_dict: keypoints = tf.expand_dims( eval_dict[detection_fields.detection_keypoints], axis=0) groundtruth_instance_masks = None if input_data_fields.groundtruth_instance_masks in eval_dict: groundtruth_instance_masks = tf.cast( tf.expand_dims( eval_dict[input_data_fields.groundtruth_instance_masks], axis=0), tf.uint8) images_with_detections = draw_bounding_boxes_on_image_tensors( eval_dict[input_data_fields.original_image], tf.expand_dims(eval_dict[detection_fields.detection_boxes], axis=0), tf.expand_dims(eval_dict[detection_fields.detection_classes], axis=0), tf.expand_dims(eval_dict[detection_fields.detection_scores], axis=0), category_index, instance_masks=instance_masks, keypoints=keypoints, max_boxes_to_draw=max_boxes_to_draw, min_score_thresh=min_score_thresh) images_with_groundtruth = draw_bounding_boxes_on_image_tensors( eval_dict[input_data_fields.original_image], tf.expand_dims(eval_dict[input_data_fields.groundtruth_boxes], axis=0), tf.expand_dims(eval_dict[input_data_fields.groundtruth_classes], axis=0), tf.expand_dims( tf.ones_like( eval_dict[input_data_fields.groundtruth_classes], dtype=tf.float32), axis=0), category_index, instance_masks=groundtruth_instance_masks, keypoints=None, max_boxes_to_draw=None, min_score_thresh=0.0) return tf.concat([images_with_detections, images_with_groundtruth], axis=2)
def draw_side_by_side_evaluation_image(eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2, use_normalized_coordinates=True): """Creates a side-by-side image with detections and groundtruth. Bounding boxes (and instance masks, if available) are visualized on both subimages. Args: eval_dict: The evaluation dictionary returned by eval_util.result_dict_for_batched_example() or eval_util.result_dict_for_single_example(). category_index: A category index (dictionary) produced from a labelmap. max_boxes_to_draw: The maximum number of boxes to draw for detections. min_score_thresh: The minimum score threshold for showing detections. use_normalized_coordinates: Whether to assume boxes and kepoints are in normalized coordinates (as opposed to absolute coordiantes). Default is True. Returns: A list of [1, H, 2 * W, C] uint8 tensor. The subimage on the left corresponds to detections, while the subimage on the right corresponds to groundtruth. """ detection_fields = fields.DetectionResultFields() input_data_fields = fields.InputDataFields() images_with_detections_list = [] # Add the batch dimension if the eval_dict is for single example. if len(eval_dict[detection_fields.detection_classes].shape) == 1: for key in eval_dict: if key != input_data_fields.original_image: eval_dict[key] = tf.expand_dims(eval_dict[key], 0) for indx in range(eval_dict[input_data_fields.original_image].shape[0]): instance_masks = None if detection_fields.detection_masks in eval_dict: instance_masks = tf.cast( tf.expand_dims( eval_dict[detection_fields.detection_masks][indx], axis=0), tf.uint8) keypoints = None if detection_fields.detection_keypoints in eval_dict: keypoints = tf.expand_dims( eval_dict[detection_fields.detection_keypoints][indx], axis=0) groundtruth_instance_masks = None if input_data_fields.groundtruth_instance_masks in eval_dict: groundtruth_instance_masks = tf.cast( tf.expand_dims(eval_dict[ input_data_fields.groundtruth_instance_masks][indx], axis=0), tf.uint8) images_with_detections = draw_bounding_boxes_on_image_tensors( tf.expand_dims(eval_dict[input_data_fields.original_image][indx], axis=0), tf.expand_dims(eval_dict[detection_fields.detection_boxes][indx], axis=0), tf.expand_dims(eval_dict[detection_fields.detection_classes][indx], axis=0), tf.expand_dims(eval_dict[detection_fields.detection_scores][indx], axis=0), category_index, original_image_spatial_shape=tf.expand_dims(eval_dict[ input_data_fields.original_image_spatial_shape][indx], axis=0), true_image_shape=tf.expand_dims( eval_dict[input_data_fields.true_image_shape][indx], axis=0), instance_masks=instance_masks, keypoints=keypoints, max_boxes_to_draw=max_boxes_to_draw, min_score_thresh=min_score_thresh, use_normalized_coordinates=use_normalized_coordinates) images_with_groundtruth = draw_bounding_boxes_on_image_tensors( tf.expand_dims(eval_dict[input_data_fields.original_image][indx], axis=0), tf.expand_dims( eval_dict[input_data_fields.groundtruth_boxes][indx], axis=0), tf.expand_dims( eval_dict[input_data_fields.groundtruth_classes][indx], axis=0), tf.expand_dims(tf.ones_like( eval_dict[input_data_fields.groundtruth_classes][indx], dtype=tf.float32), axis=0), category_index, original_image_spatial_shape=tf.expand_dims(eval_dict[ input_data_fields.original_image_spatial_shape][indx], axis=0), true_image_shape=tf.expand_dims( eval_dict[input_data_fields.true_image_shape][indx], axis=0), instance_masks=groundtruth_instance_masks, keypoints=None, max_boxes_to_draw=None, min_score_thresh=0.0, use_normalized_coordinates=use_normalized_coordinates) images_with_detections_list.append( tf.concat([images_with_detections, images_with_groundtruth], axis=2)) return images_with_detections_list
def model_fn(features, labels, mode, params=None): """Constructs the object detection model. Args: features: Dictionary of feature tensors, returned from `input_fn`. labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL, otherwise None. mode: Mode key from tf.estimator.ModeKeys. params: Parameter dictionary passed from the estimator. Returns: An `EstimatorSpec` that encapsulates the model and its serving configurations. """ params = params or {} total_loss, train_op, detections, export_outputs = None, None, None, None is_training = mode == tf.estimator.ModeKeys.TRAIN detection_model = detection_model_fn(is_training=is_training, add_summaries=(not use_tpu)) scaffold_fn = None if mode == tf.estimator.ModeKeys.TRAIN: labels = unstack_batch( labels, unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors) elif mode == tf.estimator.ModeKeys.EVAL: labels = unstack_batch(labels, unpad_groundtruth_tensors=False) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_masks_list = None if fields.InputDataFields.groundtruth_instance_masks in labels: gt_masks_list = labels[ fields.InputDataFields.groundtruth_instance_masks] gt_keypoints_list = None if fields.InputDataFields.groundtruth_keypoints in labels: gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints] detection_model.provide_groundtruth( groundtruth_boxes_list=gt_boxes_list, groundtruth_classes_list=gt_classes_list, groundtruth_masks_list=gt_masks_list, groundtruth_keypoints_list=gt_keypoints_list) preprocessed_images = features[fields.InputDataFields.image] prediction_dict = detection_model.predict( preprocessed_images, features[fields.InputDataFields.true_image_shape]) detections = detection_model.postprocess( prediction_dict, features[fields.InputDataFields.true_image_shape]) if mode == tf.estimator.ModeKeys.TRAIN: if train_config.fine_tune_checkpoint and hparams.load_pretrained: asg_map = detection_model.restore_map( from_detection_checkpoint=train_config.from_detection_checkpoint, load_all_detection_checkpoint_vars=( train_config.load_all_detection_checkpoint_vars)) available_var_map = ( variables_helper.get_variables_available_in_checkpoint( asg_map, train_config.fine_tune_checkpoint, include_global_step=False)) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint, available_var_map) if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL): losses_dict = detection_model.loss( prediction_dict, features[fields.InputDataFields.true_image_shape]) losses = [loss_tensor for loss_tensor in losses_dict.itervalues()] total_loss = tf.add_n(losses, name='total_loss') if mode == tf.estimator.ModeKeys.TRAIN: global_step = tf.train.get_or_create_global_step() training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer) if use_tpu: training_optimizer = tpu_optimizer.CrossShardOptimizer( training_optimizer) # Optionally freeze some layers by setting their gradients to be zero. trainable_variables = None if train_config.freeze_variables: trainable_variables = tf.contrib.framework.filter_variables( tf.trainable_variables(), exclude_patterns=train_config.freeze_variables) clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm if not use_tpu: for var in optimizer_summary_vars: tf.summary.scalar(var.op.name, var) summaries = [] if use_tpu else None train_op = tf.contrib.layers.optimize_loss( loss=total_loss, global_step=global_step, learning_rate=None, clip_gradients=clip_gradients_value, optimizer=training_optimizer, variables=trainable_variables, summaries=summaries, name='') # Preventing scope prefix on all variables. if mode == tf.estimator.ModeKeys.PREDICT: export_outputs = { tf.saved_model.signature_constants.PREDICT_METHOD_NAME: tf.estimator.export.PredictOutput(detections) } eval_metric_ops = None if mode == tf.estimator.ModeKeys.EVAL: # Detection summaries during eval. class_agnostic = (fields.DetectionResultFields.detection_classes not in detections) groundtruth = _get_groundtruth_data(detection_model, class_agnostic) eval_dict = eval_util.result_dict_for_single_example( tf.expand_dims(features[fields.InputDataFields.original_image][0], 0), features[inputs.HASH_KEY][0], detections, groundtruth, class_agnostic=class_agnostic, scale_to_absolute=False) if class_agnostic: category_index = label_map_util.create_class_agnostic_category_index() else: category_index = label_map_util.create_category_index_from_labelmap( eval_input_config.label_map_path) detection_and_groundtruth = vis_utils.draw_side_by_side_evaluation_image( eval_dict, category_index, max_boxes_to_draw=20, min_score_thresh=0.2) if not use_tpu: tf.summary.image('Detections_Left_Groundtruth_Right', detection_and_groundtruth) # Eval metrics on a single image. detection_fields = fields.DetectionResultFields() input_data_fields = fields.InputDataFields() coco_evaluator = coco_evaluation.CocoDetectionEvaluator( category_index.values()) eval_metric_ops = coco_evaluator.get_estimator_eval_metric_ops( image_id=eval_dict[input_data_fields.key], groundtruth_boxes=eval_dict[input_data_fields.groundtruth_boxes], groundtruth_classes=eval_dict[input_data_fields.groundtruth_classes], detection_boxes=eval_dict[detection_fields.detection_boxes], detection_scores=eval_dict[detection_fields.detection_scores], detection_classes=eval_dict[detection_fields.detection_classes]) if use_tpu: return tf.contrib.tpu.TPUEstimatorSpec( mode=mode, scaffold_fn=scaffold_fn, predictions=detections, loss=total_loss, train_op=train_op, eval_metrics=eval_metric_ops, export_outputs=export_outputs) else: return tf.estimator.EstimatorSpec( mode=mode, predictions=detections, loss=total_loss, train_op=train_op, eval_metric_ops=eval_metric_ops, export_outputs=export_outputs)