def variable_summaries(var, scope=""): """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" with tf.name_scope(scope): with tf.name_scope("summaries"): mean = tf.reduce_mean(var) tf.summary.scalar("mean", mean) with tf.name_scope("stddev"): stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) tf.summary.scalar("stddev", stddev) tf.summary.scalar("max", tf.reduce_max(var)) tf.summary.scalar("min", tf.reduce_min(var)) tf.summary.histogram("histogram", var)
def focal_loss(logits, targets, alpha, gamma, normalizer): """Compute the focal loss between `logits` and the golden `target` values. Focal loss = -(1-alpha)^gamma * log(pt) where pt is the probability of being classified to the true class. Args: logits: A float32 tensor of size [batch, height_in, width_in, num_predictions]. targets: A float32 tensor of size [batch, height_in, width_in, num_predictions]. alpha: A float32 scalar multiplying alpha to the loss from positive examples and (1-alpha) to the loss from negative examples. gamma: A float32 scalar modulating loss from hard and easy examples. normalizer: A float32 scalar normalizes the total loss from all examples. Returns: loss: A float32 scalar representing normalized total loss. """ with tf.name_scope('focal_loss'): positive_label_mask = tf.equal(targets, 1.0) cross_entropy = ( tf.nn.sigmoid_cross_entropy_with_logits(labels=targets, logits=logits)) probs = tf.sigmoid(logits) probs_gt = tf.where(positive_label_mask, probs, 1.0 - probs) # With small gamma, the implementation could produce NaN during back prop. modulator = tf.pow(1.0 - probs_gt, gamma) loss = modulator * cross_entropy weighted_loss = tf.where(positive_label_mask, alpha * loss, (1.0 - alpha) * loss) total_loss = tf.reduce_sum(weighted_loss) total_loss /= normalizer return total_loss
def upsampling_tpu_compatible(data, scale): """Nearest neighbor upsampling TPU-compatible implementation. This implementation is TPU compatible as opposed to tf.image.resize_nearest_neighbor(). Args: data: A 4D float32 tensor of shape [batch, height, width, channels]. scale: An integer multiple to scale resolution of input data. Returns: A 4D float32 tensor of shape [batch, height*scale, width*scale, channels]. """ with tf.name_scope('upsampling_tpu_compatible'): if data.get_shape().is_fully_defined(): bs, height, width, _ = [s.value for s in data.get_shape()] else: shape = tf.shape(data) bs, height, width = shape[0], shape[1], shape[2] channels = data.get_shape().as_list()[3] # Use reshape to quickly upsample the input. The nearest pixel is selected # implicitly via broadcasting. data = tf.reshape(data, [bs, height, 1, width, 1, channels]) * tf.ones( [1, 1, scale, 1, scale, 1], dtype=data.dtype) return tf.reshape(data, [bs, height * scale, width * scale, channels])
def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets.""" with tf.name_scope('parser'): data = example_decoder.decode(value) source_id = data['source_id'] image = data['image'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) # the image normalization is identical to Cloud TPU ResNet-50 image = tf.image.convert_image_dtype(image, dtype=tf.float32) image = _normalize_image(image) if params['input_rand_hflip']: image, boxes = preprocessor.random_horizontal_flip( image, boxes=boxes) image_original_shape = tf.shape(image) image, _ = preprocessor.resize_to_range( image, min_dimension=params['image_size'], max_dimension=params['image_size']) image_scale = tf.to_float( image_original_shape[0]) / tf.to_float(tf.shape(image)[0]) image, boxes = preprocessor.scale_boxes_to_pixel_coordinates( image, boxes, keypoints=None) image = tf.image.pad_to_bounding_box(image, 0, 0, params['image_size'], params['image_size']) (cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors(boxes, classes) source_id = tf.string_to_number(source_id, out_type=tf.float32) row = (image, cls_targets, box_targets, num_positives, source_id, image_scale) return row
def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets. Args: value: A dictionary contains an image and groundtruth annotations. Returns: A list of the following elements in order: image: Image tensor that is preproessed to have normalized value and fixed dimension [image_size, image_size, 3] label: label tensor of the same spatial dimension as the image. """ with tf.name_scope('parser'): data = example_decoder.decode(value) image = data['image'] label = data['labels_class'] label = tf.to_int32(label) input_processor = SegmentationInputProcessor( image, params['image_size'], label) # The image normalization is identical to Cloud TPU ResNet. input_processor.normalize_image() if self._is_training and params['input_rand_hflip']: input_processor.random_horizontal_flip() if self._is_training: input_processor.set_training_random_scale_factors( params['train_scale_min'], params['train_scale_max']) image = input_processor.resize_and_crop_image() # Set padding to background (class=0) during training. if self._is_training: label = input_processor.resize_and_crop_label(0) else: label = input_processor.resize_and_crop_label( params['ignore_label']) if params['use_bfloat16']: image = tf.cast(image, dtype=tf.bfloat16) return image, label
def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets. Args: value: A dictionary contains an image and groundtruth annotations. Returns: features: A dictionary that contains the image and auxiliary information. The following describes {key: value} pairs in the dictionary. image: An image tensor that is preprocessed to have normalized value and fixed dimension [image_size, image_size, 3] image_info: Image information that includes the original height and width, the scale of the processed image to the original image, and the scaled height and width. source_ids: Source image id. Default value -1 if the source id is empty in the groundtruth annotation. labels: (only for training) A dictionary that contains groundtruth labels. The following describes {key: value} pairs in the dictionary. score_targets_dict: An ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensor with shape [height_l, width_l, num_anchors]. The height_l and width_l represent the dimension of objectiveness score at l-th level. box_targets_dict: An ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensor with shape [height_l, width_l, num_anchors * 4]. The height_l and width_l represent the dimension of bounding box regression output at l-th level. gt_boxes: Groundtruth bounding box annotations. The box is represented in [y1, x1, y2, x2] format. The tennsor is padded with -1 to the fixed dimension [self._max_num_instances, 4]. gt_classes: Groundtruth classes annotations. The tennsor is padded with -1 to the fixed dimension [self._max_num_instances]. cropped_gt_masks: Groundtruth masks cropped by the bounding box and resized to a fixed size determined by params['gt_mask_size'] """ with tf.name_scope('parser'): data = example_decoder.decode(value) image = data['image'] source_id = data['source_id'] source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id) source_id = tf.string_to_number(source_id) if self._mode == tf.estimator.ModeKeys.PREDICT: input_processor = InstanceSegmentationInputProcessor( image, image_size, params['short_side_image_size'], params['long_side_max_image_size']) input_processor.normalize_image() input_processor.set_scale_factors_to_mlperf_reference_size( ) image = input_processor.resize_and_crop_image() if params['use_bfloat16']: image = tf.cast(image, dtype=tf.bfloat16) image_info = input_processor.get_image_info() return { 'images': image, 'image_info': image_info, 'source_ids': source_id } # The following part is for training. instance_masks = data['groundtruth_instance_masks'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) if not params['use_category']: classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32) if (params['skip_crowd_during_training'] and self._mode == tf.estimator.ModeKeys.TRAIN): indices = tf.where( tf.logical_not(data['groundtruth_is_crowd'])) classes = tf.gather_nd(classes, indices) boxes = tf.gather_nd(boxes, indices) instance_masks = tf.gather_nd(instance_masks, indices) input_processor = InstanceSegmentationInputProcessor( image, image_size, params['short_side_image_size'], params['long_side_max_image_size'], boxes, classes, instance_masks) input_processor.normalize_image() if params['input_rand_hflip']: input_processor.random_horizontal_flip() input_processor.set_scale_factors_to_mlperf_reference_size() image = input_processor.resize_and_crop_image() boxes, classes = input_processor.resize_and_crop_boxes() cropped_gt_masks = input_processor.crop_gt_masks( params['gt_mask_size']) image_info = input_processor.get_image_info() # Assign anchors. is_height_short_side = tf.less(image_info[3], image_info[4]) score_targets, box_targets = tf.cond( is_height_short_side, lambda: anchor_labeler.label_anchors(boxes, classes), lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes)) # pylint: disable=line-too-long # Pad groundtruth data. boxes *= image_info[2] boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4]) classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1]) # Pads cropped_gt_masks. cropped_gt_masks = tf.reshape( cropped_gt_masks, [-1, (params['gt_mask_size'] + 4)**2]) cropped_gt_masks = pad_to_fixed_size( cropped_gt_masks, -1, [self._max_num_instances, (params['gt_mask_size'] + 4)**2]) cropped_gt_masks = tf.reshape(cropped_gt_masks, [ self._max_num_instances, params['gt_mask_size'] + 4, params['gt_mask_size'] + 4 ]) if params['use_bfloat16']: image = tf.cast(image, dtype=tf.bfloat16) features = {} features['images'] = image features['image_info'] = image_info features['source_ids'] = source_id labels = {} for level in range(params['min_level'], params['max_level'] + 1): labels['score_targets_%d' % level] = score_targets[level] labels['box_targets_%d' % level] = box_targets[level] labels['gt_boxes'] = boxes labels['gt_classes'] = classes labels['cropped_gt_masks'] = cropped_gt_masks return features, labels
def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets. Args: value: A dictionary contains an image and groundtruth annotations. Returns: image: Image tensor that is preproessed to have normalized value and fixed dimension [image_size, image_size, 3] cls_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensor with shape [height_l, width_l, num_anchors]. The height_l and width_l represent the dimension of class logits at l-th level. box_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensor with shape [height_l, width_l, num_anchors * 4]. The height_l and width_l represent the dimension of bounding box regression output at l-th level. num_positives: Number of positive anchors in the image. source_id: Source image id. Default value -1 if the source id is empty in the groundtruth annotation. image_scale: Scale of the proccessed image to the original image. boxes: Groundtruth bounding box annotations. The box is represented in [y1, x1, y2, x2] format. The tennsor is padded with -1 to the fixed dimension [self._max_num_instances, 4]. is_crowds: Groundtruth annotations to indicate if an annotation represents a group of instances by value {0, 1}. The tennsor is padded with 0 to the fixed dimension [self._max_num_instances]. areas: Groundtruth areas annotations. The tennsor is padded with -1 to the fixed dimension [self._max_num_instances]. classes: Groundtruth classes annotations. The tennsor is padded with -1 to the fixed dimension [self._max_num_instances]. """ with tf.name_scope('parser'): data = example_decoder.decode(value) source_id = data['source_id'] image = data['image'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) areas = data['groundtruth_area'] is_crowds = data['groundtruth_is_crowd'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) if params['skip_crowd_during_training'] and self._is_training: indices = tf.where( tf.logical_not(data['groundtruth_is_crowd'])) classes = tf.gather_nd(classes, indices) boxes = tf.gather_nd(boxes, indices) # NOTE: The autoaugment method works best when used alongside the # standard horizontal flipping of images along with size jittering # and normalization. if params.get('autoaugment_policy', None) and self._is_training: image, boxes = autoaugment.distort_image_with_autoaugment( image, boxes, params['autoaugment_policy']) input_processor = DetectionInputProcessor( image, params['image_size'], boxes, classes) input_processor.normalize_image() if self._is_training and params['input_rand_hflip']: input_processor.random_horizontal_flip() if self._is_training: input_processor.set_training_random_scale_factors( params['train_scale_min'], params['train_scale_max']) else: input_processor.set_scale_factors_to_output_size() image = input_processor.resize_and_crop_image() boxes, classes = input_processor.resize_and_crop_boxes() # Assign anchors. (cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors(boxes, classes) source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id) source_id = tf.string_to_number(source_id) # Pad groundtruth data for evaluation. image_scale = input_processor.image_scale_to_original boxes *= image_scale is_crowds = tf.cast(is_crowds, dtype=tf.float32) boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4]) is_crowds = pad_to_fixed_size(is_crowds, 0, [self._max_num_instances, 1]) areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1]) classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1]) if params['use_bfloat16']: image = tf.cast(image, dtype=tf.bfloat16) return (image, cls_targets, box_targets, num_positives, source_id, image_scale, boxes, is_crowds, areas, classes)
def estimator_metric_fn(self, detections, image_scale): """Constructs the metric function for tf.TPUEstimator. For each metric, we return the evaluation op and an update op; the update op is shared across all metrics and simply appends the set of detections to the `self.detections` list. The metric op is invoked after all examples have been seen and computes the aggregate COCO metrics. Please find details API in: https://www.tensorflow.org/api_docs/python/tf/contrib/learn/MetricSpec Args: detections: detection results in a tensor with each row representing [image_id, x, y, width, height, score, class] image_scale: a float tensor representing the scale between original image and input image for the detector. It is used to rescale detections for evaluating with the original groundtruth annotations. Returns: metrics_dict: A dictionary mapping from evaluation name to a tuple of operations (`metric_op`, `update_op`). `update_op` appends the detections for the metric to the `self.detections` list. """ def _evaluate(): """Evaluates with detections from all images with COCO API. Returns: coco_metric: float numpy array with shape [12] representing the coco-style evaluation metrics. """ detections = np.array(self.detections) image_ids = list(set(detections[:, 0])) coco_dt = self.coco_gt.loadRes(detections) coco_eval = COCOeval(self.coco_gt, coco_dt) coco_eval.params.imgIds = image_ids coco_eval.evaluate() coco_eval.accumulate() coco_eval.summarize() coco_metrics = coco_eval.stats # clean self.detections after evaluation is done. # this makes sure the next evaluation will start with an empty list of # self.detections. self.detections = [] return np.array(coco_metrics, dtype=np.float32) def _update_op(detections, image_scale): """Extends self.detections with the detection results in one image. Args: detections: detection results in a tensor with each row representing [image_id, x, y, width, height, score, class] image_scale: a float tensor representing the scale between original image and input image for the detector. It is used to rescale detections for evaluating with the original groundtruth annotations. """ detections[:, 1:5] *= image_scale self.detections.extend(detections) with tf.name_scope('coco_metric'): update_op = tf.py_func(_update_op, [detections, image_scale], []) metrics = tf.py_func(_evaluate, [], tf.float32) metrics_dict = {} for i, name in enumerate(self.metric_names): metrics_dict[name] = (metrics[i], update_op) return metrics_dict