def __init__(self, iou_loss_type, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size, **kwargs): super().__init__(**kwargs) self.iou_loss_type = iou_loss_type self.input_anchors = anchors.Anchors(min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size)
def __call__(self, params, input_context=None, batch_size=None): input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes']) example_decoder = tf_example_decoder.TfExampleDecoder( include_mask='segmentation' in params['heads'], regenerate_source_id=params['regenerate_source_id']) batch_size = batch_size or params['batch_size'] seed = params['tf_random_seed'] if self._debug else None dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=self._is_training, seed=seed) if input_context: dataset = dataset.shard(input_context.num_input_pipelines, input_context.input_pipeline_id) # Prefetch data from files. def _prefetch_dataset(filename): if params.get('dataset_type', None) == 'sstable': pass else: dataset = tf.data.TFRecordDataset(filename).prefetch(1) return dataset dataset = dataset.interleave(_prefetch_dataset, num_parallel_calls=tf.data.AUTOTUNE) dataset = dataset.with_options(self.dataset_options) if self._is_training: dataset = dataset.shuffle(64, seed=seed) # Parse the fetched records to input tensors for model function. # pylint: disable=g-long-lambda if params.get('dataset_type', None) == 'sstable': map_fn = lambda key, value: self.dataset_parser( value, example_decoder, anchor_labeler, params) else: map_fn = lambda value: self.dataset_parser(value, example_decoder, anchor_labeler, params) # pylint: enable=g-long-lambda dataset = dataset.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE) dataset = dataset.prefetch(batch_size) dataset = dataset.batch(batch_size, drop_remainder=params['drop_remainder']) dataset = dataset.map( lambda *args: self.process_example(params, batch_size, *args)) dataset = dataset.prefetch(tf.data.AUTOTUNE) if self._is_training: dataset = dataset.repeat() if self._use_fake_data: # Turn this dataset into a semi-fake dataset which always loop at the # first batch. This reduces variance in performance and is useful in # testing. dataset = dataset.take(1).cache().repeat() return dataset
def __call__(self, params): input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes']) example_decoder = tf_example_decoder.TfExampleDecoder( include_mask='segmentation' in params['heads'], regenerate_source_id=params['regenerate_source_id']) batch_size = params['batch_size'] dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=self._is_training) if self._is_training: dataset = dataset.repeat() # Prefetch data from files. def _prefetch_dataset(filename): dataset = tf.data.TFRecordDataset(filename).prefetch(1) return dataset dataset = dataset.interleave( _prefetch_dataset, num_parallel_calls=tf.data.experimental.AUTOTUNE) options = tf.data.Options() options.experimental_deterministic = not self._is_training dataset = dataset.with_options(options) if self._is_training: dataset = dataset.shuffle(64) # Parse the fetched records to input tensors for model function. dataset = dataset.map( lambda value: self.dataset_parser( # pylint: disable=g-long-lambda value, example_decoder, anchor_labeler, params), num_parallel_calls=tf.data.experimental.AUTOTUNE) dataset = dataset.prefetch(batch_size) dataset = dataset.batch(batch_size, drop_remainder=True) dataset = dataset.map( lambda *args: self.process_example(params, batch_size, *args)) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) if self._use_fake_data: # Turn this dataset into a semi-fake dataset which always loop at the # first batch. This reduces variance in performance and is useful in # testing. dataset = dataset.take(1).cache().repeat() return dataset
def tflite_pre_nms(params, cls_outputs, box_outputs): """Pre-NMS that is compatible with TFLite's custom NMS op. For details, see tensorflow/lite/kernels/detection_postprocess.cc Args: params: a dict of parameters. cls_outputs: a list of tensors for classes, each tensor denotes a level of logits with shape [1, H, W, num_class * num_anchors]. box_outputs: a list of tensors for boxes, each tensor ddenotes a level of boxes with shape [1, H, W, 4 * num_anchors]. Each box format is [y_min, x_min, y_max, x_man]. Returns: boxes: boxes encoded as {y_center, x_center, height, width} scores: scores converted from `cls_outputs` logits using sigmoid anchors: normalized anchors encoded as {y_center, x_center, height, width} """ cls_outputs = to_list(cls_outputs) box_outputs = to_list(box_outputs) cls_outputs, box_outputs = merge_class_box_level_outputs( params, cls_outputs, box_outputs) eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) # TODO(b/175166514): Consider computing Top-K boxes & anchors here. We don't # do this currently since the resultant graph does not support TFLite # delegates well. `topk_class_boxes` won't work as-is, since the outputs # will need to be modified appropriately for TFLite op's consumption. # TFLite's object detection APIs require normalized anchors. height, width = utils.parse_image_size(params['image_size']) normalize_factor = tf.constant([height, width, height, width], dtype=tf.float32) normalized_anchors = eval_anchors.boxes / normalize_factor decoded_anchors = anchors.decode_anchors_to_centersize( box_outputs, normalized_anchors) # convert logits to scores. scores = tf.math.sigmoid(cls_outputs) return box_outputs, scores, decoded_anchors
def test_parser(self): tf.random.set_seed(111111) params = hparams_config.get_detection_config( 'efficientdet-d0').as_dict() input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes']) example_decoder = tf_example_decoder.TfExampleDecoder( regenerate_source_id=params['regenerate_source_id']) tfrecord_path = self._make_fake_tfrecord() dataset = tf.data.TFRecordDataset([tfrecord_path]) value = next(iter(dataset)) reader = dataloader.InputReader(tfrecord_path, True) result = reader.dataset_parser(value, example_decoder, anchor_labeler, params) self.assertEqual(len(result), 10)
def pre_nms(params, cls_outputs, box_outputs, topk=True): """Detection post processing before nms. It takes the multi-level class and box predictions from network, merge them into unified tensors, and compute boxes, scores, and classes. Args: params: a dict of parameters. cls_outputs: a list of tensors for classes, each tensor denotes a level of logits with shape [N, H, W, num_class * num_anchors]. box_outputs: a list of tensors for boxes, each tensor ddenotes a level of boxes with shape [N, H, W, 4 * num_anchors]. topk: if True, select topk before nms (mainly to speed up nms). Returns: A tuple of (boxes, scores, classes). """ # get boxes by apply bounding box regression to anchors. eval_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) cls_outputs, box_outputs = merge_class_box_level_outputs( params, cls_outputs, box_outputs) if topk: # select topK purely based on scores before NMS, in order to speed up nms. cls_outputs, box_outputs, classes, indices = topk_class_boxes( params, cls_outputs, box_outputs) anchor_boxes = tf.gather(eval_anchors.boxes, indices) else: anchor_boxes = eval_anchors.boxes classes = None boxes = anchors.decode_box_outputs(box_outputs, anchor_boxes) # convert logits to scores. scores = tf.math.sigmoid(cls_outputs) return boxes, scores, classes
def detection_loss(cls_outputs, box_outputs, labels, params): """Computes total detection loss. Computes total detection loss including box and class loss from all levels. Args: cls_outputs: an OrderDict with keys representing levels and values representing logits in [batch_size, height, width, num_anchors]. box_outputs: an OrderDict with keys representing levels and values representing box regression targets in [batch_size, height, width, num_anchors * 4]. labels: the dictionary that returned from dataloader that includes groundtruth targets. params: the dictionary including training parameters specified in default_haprams function in this file. Returns: total_loss: an integer tensor representing total loss reducing from class and box losses from all levels. cls_loss: an integer tensor representing total class loss. box_loss: an integer tensor representing total box regression loss. box_iou_loss: an integer tensor representing total box iou loss. """ # Sum all positives in a batch for normalization and avoid zero # num_positives_sum, which would lead to inf loss during training num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0 positives_momentum = params.get('positives_momentum', None) or 0 if positives_momentum > 0: # normalize the num_positive_examples for training stability. moving_normalizer_var = tf.Variable( 0.0, name='moving_normalizer', dtype=tf.float32, synchronization=tf.VariableSynchronization.ON_READ, trainable=False, aggregation=tf.VariableAggregation.MEAN) num_positives_sum = tf.keras.backend.moving_average_update( moving_normalizer_var, num_positives_sum, momentum=params['positives_momentum']) elif positives_momentum < 0: num_positives_sum = utils.cross_replica_mean(num_positives_sum) levels = cls_outputs.keys() cls_losses = [] box_losses = [] for level in levels: # Onehot encoding for classification labels. cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level], params['num_classes']) if params['data_format'] == 'channels_first': bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list( ) cls_targets_at_level = tf.reshape(cls_targets_at_level, [bs, -1, width, height]) else: bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list( ) cls_targets_at_level = tf.reshape(cls_targets_at_level, [bs, width, height, -1]) box_targets_at_level = labels['box_targets_%d' % level] cls_loss = focal_loss(cls_outputs[level], cls_targets_at_level, params['alpha'], params['gamma'], normalizer=num_positives_sum, label_smoothing=params['label_smoothing']) if params['data_format'] == 'channels_first': cls_loss = tf.reshape( cls_loss, [bs, -1, width, height, params['num_classes']]) else: cls_loss = tf.reshape( cls_loss, [bs, width, height, -1, params['num_classes']]) cls_loss *= tf.cast( tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2), -1), tf.float32) cls_losses.append(tf.clip_by_value(tf.reduce_sum(cls_loss), 0.0, 2.0)) if params['box_loss_weight']: box_losses.append( _box_loss(box_outputs[level], box_targets_at_level, num_positives_sum, delta=params['delta'])) if params['iou_loss_type']: input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) box_output_list = [tf.reshape(box_outputs[i], [-1, 4]) for i in levels] box_outputs = tf.concat(box_output_list, axis=0) box_target_list = [ tf.reshape(labels['box_targets_%d' % level], [-1, 4]) for level in levels ] box_targets = tf.concat(box_target_list, axis=0) anchor_boxes = tf.tile(input_anchors.boxes, [params['batch_size'], 1]) box_outputs = anchors.decode_box_outputs(box_outputs, anchor_boxes) box_targets = anchors.decode_box_outputs(box_targets, anchor_boxes) box_iou_loss = _box_iou_loss(box_outputs, box_targets, num_positives_sum, params['iou_loss_type']) else: box_iou_loss = 0 # Sum per level losses to total loss. cls_loss = tf.add_n(cls_losses) box_loss = tf.add_n(box_losses) if box_losses else 0 total_loss = (cls_loss + params['box_loss_weight'] * box_loss + params['iou_loss_weight'] * box_iou_loss) return total_loss, cls_loss, box_loss, box_iou_loss
def __call__(self, params): input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes']) example_decoder = tf_example_decoder.TfExampleDecoder( regenerate_source_id=params['regenerate_source_id']) @tf.autograph.experimental.do_not_convert def _dataset_parser(value): """Parse data to a fixed dimension input image and learning targets. Args: value: A dictionary contains an image and groundtruth annotations. Returns: image: Image tensor that is preprocessed to have normalized value and fixed dimension [image_height, image_width, 3] cls_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensor with shape [height_l, width_l, num_anchors]. The height_l and width_l represent the dimension of class logits at l-th level. box_targets_dict: ordered dictionary with keys [min_level, min_level+1, ..., max_level]. The values are tensor with shape [height_l, width_l, num_anchors * 4]. The height_l and width_l represent the dimension of bounding box regression output at l-th level. num_positives: Number of positive anchors in the image. source_id: Source image id. Default value -1 if the source id is empty in the groundtruth annotation. image_scale: Scale of the processed image to the original image. boxes: Groundtruth bounding box annotations. The box is represented in [y1, x1, y2, x2] format. The tensor is padded with -1 to the fixed dimension [self._max_instances_per_image, 4]. is_crowds: Groundtruth annotations to indicate if an annotation represents a group of instances by value {0, 1}. The tensor is padded with 0 to the fixed dimension [self._max_instances_per_image]. areas: Groundtruth areas annotations. The tensor is padded with -1 to the fixed dimension [self._max_instances_per_image]. classes: Groundtruth classes annotations. The tensor is padded with -1 to the fixed dimension [self._max_instances_per_image]. """ with tf.name_scope('parser'): data = example_decoder.decode(value) source_id = data['source_id'] image = data['image'] boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) areas = data['groundtruth_area'] is_crowds = data['groundtruth_is_crowd'] classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1]) if params['skip_crowd_during_training'] and self._is_training: indices = tf.where( tf.logical_not(data['groundtruth_is_crowd'])) classes = tf.gather_nd(classes, indices) boxes = tf.gather_nd(boxes, indices) # NOTE: The autoaugment method works best when used alongside the # standard horizontal flipping of images along with size jittering # and normalization. if params.get('autoaugment_policy', None) and self._is_training: from aug import autoaugment # pylint: disable=g-import-not-at-top image, boxes = autoaugment.distort_image_with_autoaugment( image, boxes, params['autoaugment_policy'], params['use_augmix'], *params['augmix_params']) input_processor = DetectionInputProcessor( image, params['image_size'], boxes, classes) input_processor.normalize_image() if self._is_training and params['input_rand_hflip']: input_processor.random_horizontal_flip() if self._is_training: input_processor.set_training_random_scale_factors( params['train_scale_min'], params['train_scale_max'], params.get('target_size', None)) else: input_processor.set_scale_factors_to_output_size() image = input_processor.resize_and_crop_image() boxes, classes = input_processor.resize_and_crop_boxes() # Assign anchors. (cls_targets, box_targets, num_positives) = anchor_labeler.label_anchors(boxes, classes) source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id) source_id = tf.strings.to_number(source_id) # Pad groundtruth data for evaluation. image_scale = input_processor.image_scale_to_original boxes *= image_scale is_crowds = tf.cast(is_crowds, dtype=tf.float32) boxes = pad_to_fixed_size(boxes, -1, [self._max_instances_per_image, 4]) is_crowds = pad_to_fixed_size( is_crowds, 0, [self._max_instances_per_image, 1]) areas = pad_to_fixed_size(areas, -1, [self._max_instances_per_image, 1]) classes = pad_to_fixed_size(classes, -1, [self._max_instances_per_image, 1]) return (image, cls_targets, box_targets, num_positives, source_id, image_scale, boxes, is_crowds, areas, classes) batch_size = params['batch_size'] dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=self._is_training) if self._is_training: dataset = dataset.repeat() # Prefetch data from files. def _prefetch_dataset(filename): dataset = tf.data.TFRecordDataset(filename).prefetch(1) return dataset dataset = dataset.apply( tf.data.experimental.parallel_interleave(_prefetch_dataset, cycle_length=32, sloppy=self._is_training)) if self._is_training: dataset = dataset.shuffle(64) # Parse the fetched records to input tensors for model function. dataset = dataset.map(_dataset_parser, num_parallel_calls=64) dataset = dataset.prefetch(batch_size) dataset = dataset.batch(batch_size, drop_remainder=True) @tf.autograph.experimental.do_not_convert def _process_example(images, cls_targets, box_targets, num_positives, source_ids, image_scales, boxes, is_crowds, areas, classes): """Processes one batch of data.""" labels = {} # Count num_positives in a batch. num_positives_batch = tf.reduce_mean(num_positives) labels['mean_num_positives'] = tf.reshape( tf.tile(tf.expand_dims(num_positives_batch, 0), [ batch_size, ]), [batch_size, 1]) if params['data_format'] == 'channels_first': images = tf.transpose(images, [0, 3, 1, 2]) for level in range(params['min_level'], params['max_level'] + 1): labels['cls_targets_%d' % level] = cls_targets[level] labels['box_targets_%d' % level] = box_targets[level] if params['data_format'] == 'channels_first': labels['cls_targets_%d' % level] = tf.transpose( labels['cls_targets_%d' % level], [0, 3, 1, 2]) labels['box_targets_%d' % level] = tf.transpose( labels['box_targets_%d' % level], [0, 3, 1, 2]) # Concatenate groundtruth annotations to a tensor. groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2) labels['source_ids'] = source_ids labels['groundtruth_data'] = groundtruth_data labels['image_scales'] = image_scales return images, labels dataset = dataset.map(_process_example) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) if self._use_fake_data: # Turn this dataset into a semi-fake dataset which always loop at the # first batch. This reduces variance in performance and is useful in # testing. dataset = dataset.take(1).cache().repeat() return dataset
def detection_loss(cls_outputs, box_outputs, labels, params): """Computes total detection loss. Computes total detection loss including box and class loss from all levels. Args: cls_outputs: an OrderDict with keys representing levels and values representing logits in [batch_size, height, width, num_anchors]. box_outputs: an OrderDict with keys representing levels and values representing box regression targets in [batch_size, height, width, num_anchors * 4]. labels: the dictionary that returned from dataloader that includes groundtruth targets. params: the dictionary including training parameters specified in default_haprams function in this file. Returns: total_loss: an integer tensor representing total loss reducing from class and box losses from all levels. cls_loss: an integer tensor representing total class loss. box_loss: an integer tensor representing total box regression loss. box_iou_loss: an integer tensor representing total box iou loss. """ # Sum all positives in a batch for normalization and avoid zero # num_positives_sum, which would lead to inf loss during training num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0 levels = cls_outputs.keys() cls_losses = [] box_losses = [] for level in levels: # Onehot encoding for classification labels. cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level], params['num_classes']) if params['data_format'] == 'channels_first': bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list( ) cls_targets_at_level = tf.reshape(cls_targets_at_level, [bs, -1, width, height]) else: bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list( ) cls_targets_at_level = tf.reshape(cls_targets_at_level, [bs, width, height, -1]) box_targets_at_level = labels['box_targets_%d' % level] cls_loss = focal_loss(cls_outputs[level], cls_targets_at_level, params['alpha'], params['gamma'], normalizer=num_positives_sum, label_smoothing=params['label_smoothing']) if params['data_format'] == 'channels_first': cls_loss = tf.reshape( cls_loss, [bs, -1, width, height, params['num_classes']]) else: cls_loss = tf.reshape( cls_loss, [bs, width, height, -1, params['num_classes']]) cls_loss *= tf.cast( tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2), -1), tf.float32) cls_losses.append(tf.reduce_sum(cls_loss)) if params['box_loss_weight']: box_losses.append( _box_loss(box_outputs[level], box_targets_at_level, num_positives_sum, delta=params['delta'])) if params['iou_loss_type']: input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) box_coder = FasterRcnnBoxCoder() input_anchors = BoxList( tf.tile(input_anchors.boxes, [params['batch_size'], 1])) box_outputs = tf.concat( [tf.reshape(v, [-1, 4]) for v in list(box_outputs.values())], axis=0) box_targets = tf.concat([ tf.reshape(labels['box_targets_%d' % level], [-1, 4]) for level in levels ], axis=0) box_outputs = box_coder.decode(box_outputs, input_anchors) box_targets = box_coder.decode(box_targets, input_anchors) box_iou_loss = _box_iou_loss(box_outputs.data['boxes'], box_targets.data['boxes'], num_positives_sum, params['iou_loss_type']) else: box_iou_loss = 0 # Sum per level losses to total loss. cls_loss = tf.add_n(cls_losses) box_loss = tf.add_n(box_losses) if box_losses else 0 total_loss = (cls_loss + params['box_loss_weight'] * box_loss + params['iou_loss_weight'] * box_iou_loss) return total_loss, cls_loss, box_loss, box_iou_loss
def detection_loss(cls_outputs, box_outputs, labels, params): """Computes total detection loss. Computes total detection loss including box and class loss from all levels. Args: cls_outputs: an OrderDict with keys representing levels and values representing logits in [batch_size, height, width, num_anchors]. box_outputs: an OrderDict with keys representing levels and values representing box regression targets in [batch_size, height, width, num_anchors * 4]. labels: the dictionary that returned from dataloader that includes groundtruth targets. params: the dictionary including training parameters specified in default_haprams function in this file. Returns: total_loss: an integer tensor representing total loss reducing from class and box losses from all levels. cls_loss: an integer tensor representing total class loss. box_loss: an integer tensor representing total box regression loss. box_iou_loss: an integer tensor representing total box iou loss. """ # Sum all positives in a batch for normalization and avoid zero # num_positives_sum, which would lead to inf loss during training num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0 levels = cls_outputs.keys() cls_losses = [] box_losses = [] sumrule = {} if params.get('sumrule'): sumrule = params['sumrule'] # because of cls_targets -= 1 (so that bg class becomes -1, actual class then starts from 0) # we need to subtract 1 from sumrule as well. _sumrule = {} for k, v in sumrule.items(): _sumrule[k - 1] = [vv - 1 for vv in v] sumrule = _sumrule def table_lookup(values, old_onehot, cls_targets_at_level): for val in values: if sumrule.get(val): new_val = sumrule[val] #prob = 1.0/len(new_val) prob = 0.5 # try sigmoid cross entropy first so set this to 0.5, if we use softmax we should set this to 1.0/len(new_val) if len(new_val) == 1: # leaf node, prob = 1.0 prob = 1.0 _matching_onehot = old_onehot[np.where( cls_targets_at_level == val)] _matching_onehot[:, new_val] = prob _matching_onehot[:, val] = 0 old_onehot[np.where( cls_targets_at_level == val)] = _matching_onehot return old_onehot for level in levels: # Onehot encoding for classification labels. _cls_targets_at_level = tf.one_hot(labels['cls_targets_%d' % level], params['num_classes']) if params.get('sumrule'): unique_labels, _ = tf.unique( tf.reshape(labels['cls_targets_%d' % level], [-1])) # refine one-hot labels so that we map each label to it's finest leaves cls_targets_at_level = tf.numpy_function( table_lookup, [ unique_labels, _cls_targets_at_level, labels['cls_targets_%d' % level] ], _cls_targets_at_level.dtype) cls_targets_at_level = tf.reshape(cls_targets_at_level, _cls_targets_at_level.shape) else: cls_targets_at_level = _cls_targets_at_level if params['data_format'] == 'channels_first': bs, _, width, height, _ = cls_targets_at_level.get_shape().as_list( ) cls_targets_at_level = tf.reshape(cls_targets_at_level, [bs, -1, width, height]) else: bs, width, height, _, _ = cls_targets_at_level.get_shape().as_list( ) cls_targets_at_level = tf.reshape(cls_targets_at_level, [bs, width, height, -1]) box_targets_at_level = labels['box_targets_%d' % level] cls_loss = focal_loss(cls_outputs[level], cls_targets_at_level, params['alpha'], params['gamma'], normalizer=num_positives_sum, label_smoothing=params['label_smoothing']) if params['data_format'] == 'channels_first': cls_loss = tf.reshape( cls_loss, [bs, -1, width, height, params['num_classes']]) else: cls_loss = tf.reshape( cls_loss, [bs, width, height, -1, params['num_classes']]) cls_loss *= tf.cast( tf.expand_dims(tf.not_equal(labels['cls_targets_%d' % level], -2), -1), tf.float32) cls_losses.append(tf.reduce_sum(cls_loss)) if params['box_loss_weight']: box_losses.append( _box_loss(box_outputs[level], box_targets_at_level, num_positives_sum, delta=params['delta'])) if params['iou_loss_type']: input_anchors = anchors.Anchors(params['min_level'], params['max_level'], params['num_scales'], params['aspect_ratios'], params['anchor_scale'], params['image_size']) box_output_list = [tf.reshape(box_outputs[i], [-1, 4]) for i in levels] box_outputs = tf.concat(box_output_list, axis=0) box_target_list = [ tf.reshape(labels['box_targets_%d' % level], [-1, 4]) for level in levels ] box_targets = tf.concat(box_target_list, axis=0) anchor_boxes = tf.tile(input_anchors.boxes, [params['batch_size'], 1]) box_outputs = anchors.decode_box_outputs(box_outputs, anchor_boxes) box_targets = anchors.decode_box_outputs(box_targets, anchor_boxes) box_iou_loss = _box_iou_loss(box_outputs, box_targets, num_positives_sum, params['iou_loss_type']) else: box_iou_loss = 0 # Sum per level losses to total loss. cls_loss = tf.add_n(cls_losses) box_loss = tf.add_n(box_losses) if box_losses else 0 total_loss = (cls_loss + params['box_loss_weight'] * box_loss + params['iou_loss_weight'] * box_iou_loss) return total_loss, cls_loss, box_loss, box_iou_loss