Code example #1
def encode_labels(gt_boxes, gt_labels):
    """Labels anchors with ground truth inputs.

    Args:
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
        classes.
    Returns:
      encoded_classes: a tensor with shape [num_anchors, 1].
      encoded_boxes: a tensor with shape [num_anchors, 4].
      num_matched_boxes: a scalar tensor storing the number of matched
        (positive) anchors in the image.
    """
    similarity_calc = region_similarity_calculator.IouSimilarity()
    matcher = argmax_matcher.ArgMaxMatcher(
        matched_threshold=constants.MATCH_THRESHOLD,
        unmatched_threshold=constants.MATCH_THRESHOLD,
        negatives_lower_than_unmatched=True,
        force_match_for_each_row=True)

    box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
        scale_factors=constants.BOX_CODER_SCALES)

    default_boxes = box_list.BoxList(
        tf.convert_to_tensor(DefaultBoxes()('ltrb')))
    target_boxes = box_list.BoxList(gt_boxes)

    assigner = target_assigner.TargetAssigner(
        similarity_calc, matcher, box_coder)

    encoded_classes, _, encoded_boxes, _, matches = assigner.assign(
        default_boxes, target_boxes, gt_labels)
    num_matched_boxes = tf.reduce_sum(
        tf.cast(tf.not_equal(matches.match_results, -1), tf.float32))
    return encoded_classes, encoded_boxes, num_matched_boxes
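For intuition, here is a minimal standalone sketch of the argmax matching that ArgMaxMatcher performs above, written against plain NumPy rather than the object_detection API; the helper names and the 0.5 threshold are illustrative assumptions, not the project's constants. Because the matched and unmatched thresholds are equal (as in the code above), there is no "ignore" band between positives and negatives.

import numpy as np

MATCH_THRESHOLD = 0.5  # stands in for constants.MATCH_THRESHOLD (assumed)

def pairwise_iou(anchors, gt_boxes):
    """IoU between every anchor and every groundtruth box ([y0, x0, y1, x1])."""
    y0 = np.maximum(anchors[:, None, 0], gt_boxes[None, :, 0])
    x0 = np.maximum(anchors[:, None, 1], gt_boxes[None, :, 1])
    y1 = np.minimum(anchors[:, None, 2], gt_boxes[None, :, 2])
    x1 = np.minimum(anchors[:, None, 3], gt_boxes[None, :, 3])
    inter = np.clip(y1 - y0, 0, None) * np.clip(x1 - x0, 0, None)
    area_a = (anchors[:, 2] - anchors[:, 0]) * (anchors[:, 3] - anchors[:, 1])
    area_g = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])
    return inter / (area_a[:, None] + area_g[None, :] - inter)

def argmax_match(anchors, gt_boxes):
    """Returns, per anchor, the index of its matched gt box, or -1 if unmatched."""
    iou = pairwise_iou(anchors, gt_boxes)             # [num_anchors, N]
    matches = iou.argmax(axis=1)                      # best gt for each anchor
    matches[iou.max(axis=1) < MATCH_THRESHOLD] = -1   # below threshold: negative
    # force_match_for_each_row: every gt box keeps its single best anchor.
    matches[iou.argmax(axis=0)] = np.arange(gt_boxes.shape[0])
    return matches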
Code example #2
    def _decode(self, rel_codes, anchors):
        """Decode relative codes to boxes.

        Args:
          rel_codes: a tensor representing N anchor-encoded boxes.
          anchors: BoxList of anchors.

        Returns:
          boxes: BoxList holding N bounding boxes.
        """
        ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()

        ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes))
        if self._scale_factors:
            ty /= self._scale_factors[0]
            tx /= self._scale_factors[1]
            th /= self._scale_factors[2]
            tw /= self._scale_factors[3]
        w = tf.exp(tw) * wa
        h = tf.exp(th) * ha
        ycenter = ty * ha + ycenter_a
        xcenter = tx * wa + xcenter_a
        ymin = ycenter - h / 2.
        xmin = xcenter - w / 2.
        ymax = ycenter + h / 2.
        xmax = xcenter + w / 2.
        return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax])))
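For reference, the encoding that this method inverts looks like the sketch below: the box is expressed relative to its anchor in center-size space, with log-scaled height and width. The (10., 10., 5., 5.) scale factors are the common SSD defaults, assumed here in place of constants.BOX_CODER_SCALES.

import numpy as np

def encode(box, anchor, scale_factors=(10., 10., 5., 5.)):
    """[ymin, xmin, ymax, xmax] box and anchor -> rel code [ty, tx, th, tw]."""
    ya, xa = (anchor[0] + anchor[2]) / 2., (anchor[1] + anchor[3]) / 2.
    ha, wa = anchor[2] - anchor[0], anchor[3] - anchor[1]
    yc, xc = (box[0] + box[2]) / 2., (box[1] + box[3]) / 2.
    h, w = box[2] - box[0], box[3] - box[1]
    return np.array([(yc - ya) / ha * scale_factors[0],
                     (xc - xa) / wa * scale_factors[1],
                     np.log(h / ha) * scale_factors[2],
                     np.log(w / wa) * scale_factors[3]])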
Code example #3
def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
    """Scales boxes from normalized to pixel coordinates.

    Args:
      image: A 3D float32 tensor of shape [height, width, channels].
      boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding
        boxes in normalized coordinates. Each row is of the form
        [ymin, xmin, ymax, xmax].
      keypoints: (optional) rank 3 float32 tensor with shape
        [num_instances, num_keypoints, 2]. The keypoints are in y-x normalized
        coordinates.

    Returns:
      image: unchanged input image.
      scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing the
        bounding boxes in pixel coordinates.
      scaled_keypoints: a 3D float32 tensor with shape
        [num_instances, num_keypoints, 2] containing the keypoints in pixel
        coordinates.
    """
    boxlist = box_list.BoxList(boxes)
    image_height = tf.shape(image)[0]
    image_width = tf.shape(image)[1]
    scaled_boxes = box_list_scale(boxlist, image_height, image_width).get()
    result = [image, scaled_boxes]
    if keypoints is not None:
        scaled_keypoints = keypoint_scale(keypoints, image_height, image_width)
        result.append(scaled_keypoints)
    return tuple(result)
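A hypothetical usage sketch (values made up for illustration, assuming the module's imports are in scope): for a 300x400 image, the normalized y coordinates are multiplied by the image height and the x coordinates by the image width.

import tensorflow as tf

image = tf.zeros([300, 400, 3], tf.float32)              # height 300, width 400
boxes = tf.constant([[0.1, 0.2, 0.5, 0.8]], tf.float32)  # [ymin, xmin, ymax, xmax]
_, scaled_boxes = scale_boxes_to_pixel_coordinates(image, boxes)
# scaled_boxes -> [[30., 80., 150., 320.]] (y scaled by height, x by width)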
Code example #4
def box_list_scale(boxlist, y_scale, x_scale, scope=None):
    """scale box coordinates in x and y dimensions.

    Args:
      boxlist: BoxList holding N boxes
      y_scale: (float) scalar tensor
      x_scale: (float) scalar tensor
      scope: name scope.

    Returns:
      boxlist: BoxList holding N boxes
    """
    with tf.name_scope(scope, 'Scale'):
        y_scale = tf.cast(y_scale, tf.float32)
        x_scale = tf.cast(x_scale, tf.float32)
        y_min, x_min, y_max, x_max = tf.split(value=boxlist.get(),
                                              num_or_size_splits=4,
                                              axis=1)
        y_min = y_scale * y_min
        y_max = y_scale * y_max
        x_min = x_scale * x_min
        x_max = x_scale * x_max
        scaled_boxlist = box_list.BoxList(
            tf.concat([y_min, x_min, y_max, x_max], 1))
        return _copy_extra_fields(scaled_boxlist, boxlist)
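The _copy_extra_fields helper referenced on the last line is not shown in this excerpt. A plausible sketch, inferred from its use here (the actual implementation lives elsewhere in the codebase and may differ):

def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from):
    """Copies extra fields (e.g. class labels) from one BoxList to another."""
    for field in boxlist_to_copy_from.get_extra_fields():
        boxlist_to_copy_to.add_field(field,
                                     boxlist_to_copy_from.get_field(field))
    return boxlist_to_copy_to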
Code example #5
    def _create_regression_targets(self, anchors, groundtruth_boxes, match):
        """Returns a regression target for each anchor.

        Args:
          anchors: a BoxList representing N anchors
          groundtruth_boxes: a BoxList representing M groundtruth_boxes
          match: a matcher.Match object

        Returns:
          reg_targets: a float32 tensor with shape [N, box_code_dimension]
        """
        matched_gt_boxes = match.gather_based_on_match(
            groundtruth_boxes.get(),
            unmatched_value=tf.zeros(4),
            ignored_value=tf.zeros(4))
        matched_gt_boxlist = box_list.BoxList(matched_gt_boxes)
        if groundtruth_boxes.has_field(KEYPOINTS_FIELD_NAME):
            groundtruth_keypoints = groundtruth_boxes.get_field(
                KEYPOINTS_FIELD_NAME)
            matched_keypoints = match.gather_based_on_match(
                groundtruth_keypoints,
                unmatched_value=tf.zeros(
                    groundtruth_keypoints.get_shape()[1:]),
                ignored_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]))
            matched_gt_boxlist.add_field(KEYPOINTS_FIELD_NAME,
                                         matched_keypoints)
        matched_reg_targets = self._box_coder.encode(matched_gt_boxlist,
                                                     anchors)
        match_results_shape = shape_utils.combined_static_and_dynamic_shape(
            match.match_results)

        # Zero out the unmatched and ignored regression targets.
        unmatched_ignored_reg_targets = tf.tile(
            self._default_regression_target(), [match_results_shape[0], 1])
        matched_anchors_mask = match.matched_column_indicator()
        reg_targets = tf.where(matched_anchors_mask, matched_reg_targets,
                               unmatched_ignored_reg_targets)
        return reg_targets
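The _default_regression_target helper referenced above is likewise not shown. Judging from how it is tiled and substituted for unmatched and ignored anchors, it plausibly looks like the following sketch (an assumption, not the project's code):

    def _default_regression_target(self):
        """A [1, box_code_dimension] row of zeros for unmatched/ignored anchors."""
        return tf.constant([self._box_coder.code_size * [0]], tf.float32)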
Code example #6
        def _parse_example(data):
            with tf.name_scope('augmentation'):
                source_id = data['source_id']
                image = data['image']  # dtype uint8
                raw_shape = tf.shape(image)
                boxes = data['groundtruth_boxes']
                classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

                # Only 80 of the 90 COCO classes are used.
                class_map = tf.convert_to_tensor(constants.CLASS_MAP)
                classes = tf.gather(class_map, classes)
                classes = tf.cast(classes, dtype=tf.float32)

                if self._is_training:
                    image, boxes, classes = ssd_crop(image, boxes, classes)
                    # ssd_crop resizes the image and returns it as float32 without
                    # changing its range (values stay within 0--255). Dividing by 255
                    # converts it to the [0, 1] range. This is done after cropping to
                    # avoid a dtype cast (which would incur an additional memory copy).
                    image /= 255.0

                    # random_horizontal_flip() is hard coded to flip with 50% chance.
                    image, boxes = preprocessor.random_horizontal_flip(
                        image=image, boxes=boxes)

                    # TODO(shibow): Investigate the parameters for color jitter.
                    image = color_jitter(
                        image, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)

                    if params['dtype'] == 'bf16':
                        image = tf.cast(image, dtype=tf.bfloat16)

                    encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
                        boxes, classes)

                    # We transpose in the dataloader instead of in the topology to save time.
                    encoded_classes, encoded_boxes = transpose_labels(
                        encoded_classes, encoded_boxes)

                    encoded_classes = tf.cast(encoded_classes, tf.int32)

                    labels = {
                        constants.NUM_MATCHED_BOXES: num_matched_boxes,
                        constants.BOXES: encoded_boxes,
                        constants.CLASSES: tf.squeeze(encoded_classes, axis=1),
                    }
                    # This is for dataloader visualization; the actual model doesn't use it.
                    if params['visualize_dataloader']:
                        box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                            scale_factors=constants.BOX_CODER_SCALES)
                        decoded_boxes = tf.expand_dims(box_coder.decode(
                            rel_codes=tf.squeeze(encoded_boxes),
                            anchors=box_list.BoxList(
                                tf.convert_to_tensor(DefaultBoxes()('ltrb')))
                        ).get(), axis=0)
                        labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

                    return image, labels

                else:
                    image = tf.image.resize_images(
                        image, size=(constants.IMAGE_SIZE, constants.IMAGE_SIZE))
                    # resize_images returns the image as float32 and does not change its
                    # range. Divide by 255 to convert the image to the [0, 1] range.
                    image /= 255.

                    if params['dtype'] == 'bf16':
                        image = tf.cast(image, dtype=tf.bfloat16)

                    def trim_and_pad(inp_tensor, dim_1):
                        """Limit the number of boxes, and pad if necessary."""
                        inp_tensor = inp_tensor[:constants.MAX_NUM_EVAL_BOXES]
                        num_pad = constants.MAX_NUM_EVAL_BOXES - \
                            tf.shape(inp_tensor)[0]
                        inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
                        return tf.reshape(
                            inp_tensor, [constants.MAX_NUM_EVAL_BOXES, dim_1])

                    boxes, classes = trim_and_pad(
                        boxes, 4), trim_and_pad(classes, 1)

                    sample = {
                        constants.IMAGE: image,
                        constants.BOXES: boxes,
                        constants.CLASSES: classes,
                        constants.SOURCE_ID: tf.string_to_number(source_id, tf.int32),
                        constants.RAW_SHAPE: raw_shape,
                    }

                    if not self._is_training and self._count > params['eval_samples']:
                        sample[constants.IS_PADDED] = data[constants.IS_PADDED]
                    return sample
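To illustrate the class_map step near the top of _parse_example: COCO category ids run from 1 to 90 with gaps, and a dense lookup table re-indexes the 80 ids actually in use to a contiguous range, sending everything else to 0 (background). A toy sketch with made-up ids (not the project's actual CLASS_MAP):

used_ids = [1, 2, 3, 5, 7]             # pretend only these COCO ids are in use
class_map = [0] * (max(used_ids) + 1)  # unused ids map to 0 (background)
for contiguous_id, coco_id in enumerate(used_ids, start=1):
    class_map[coco_id] = contiguous_id
# tf.gather(tf.convert_to_tensor(class_map), classes) then re-indexes the labels.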
Code example #7
File: model.py  Project: HabanaAI/Model-References
def _model_fn(features, labels, mode, params, model):
    """Model defination for the SSD model based on ResNet-50.

    Args:
      features: the input image tensor with shape [batch_size, height, width, 3].
        The height and width are fixed and equal.
      labels: the input labels in a dictionary. The labels include class
        targets and box targets, which are dense label maps. The labels are
        generated by the get_input_fn function in data/dataloader.py.
      mode: the mode of TPUEstimator, including TRAIN, EVAL, and PREDICT.
      params: the dictionary that defines the hyperparameters of the model. The
        default settings are in the default_hparams function in this file.
      model: the SSD model, which outputs class logits and box regression
        outputs.

    Returns:
      spec: the EstimatorSpec or TPUEstimatorSpec to run training, evaluation,
        or prediction.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        labels = features
        features = labels.pop('image')

    features -= tf.constant(constants.NORMALIZATION_MEAN,
                            shape=[1, 1, 3],
                            dtype=features.dtype)
    COEF_STD = 1.0 / tf.constant(
        constants.NORMALIZATION_STD, shape=[1, 1, 3], dtype=features.dtype)
    features *= COEF_STD

    def _model_outputs():
        return model(features,
                     params,
                     is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))

    if params['dtype'] == 'bf16':
        with tf.compat.v1.tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        flattened_cls, flattened_box = concat_outputs(cls_outputs, box_outputs,
                                                      True)
        ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
            scale_factors=constants.BOX_CODER_SCALES)

        anchors = box_list.BoxList(
            tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))

        decoded_boxes = box_coder.batch_decode(encoded_boxes=flattened_box,
                                               box_coder=ssd_box_coder,
                                               anchors=anchors)

        pred_scores = tf.nn.softmax(flattened_cls, axis=2)

        pred_scores, indices = select_top_k_scores(
            pred_scores, constants.MAX_NUM_EVAL_BOXES)
        predictions = dict(
            labels,
            indices=indices,
            pred_scores=pred_scores,
            pred_box=decoded_boxes,
        )

        if params['visualize_dataloader']:
            # This is for inference visualization.
            predictions['image'] = features

        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % constants.RESNET_DEPTH,
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)
    # cls_loss and box_loss are for logging. Only total_loss is optimized.
    loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs, labels)

    total_loss = loss + params['weight_decay'] * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=constants.MOMENTUM)

        if params['distributed_optimizer']:
            optimizer = params['distributed_optimizer'](optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(optimizer.minimize(total_loss, global_step),
                            update_ops)

        save_summary_steps = params['save_summary_steps']
        if save_summary_steps is not None:
            tf.summary.scalar("learning_rate", learning_rate)
            tf.summary.scalar("class_loss", cls_loss)
            tf.summary.scalar("box_loss", box_loss)

        return model_fn_lib.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          training_hooks=None,
                                          scaffold=scaffold_fn() if scaffold_fn else None)

    if mode == tf.estimator.ModeKeys.EVAL:
        raise NotImplementedError
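A standalone sketch of the input normalization at the top of _model_fn: subtract a per-channel mean, then multiply by a precomputed reciprocal of the per-channel std, since a multiply is cheaper than a per-element divide. The ImageNet statistics below are an assumption standing in for constants.NORMALIZATION_MEAN and constants.NORMALIZATION_STD.

import tensorflow as tf

MEAN = tf.constant([0.485, 0.456, 0.406], shape=[1, 1, 3])            # assumed values
COEF_STD = 1.0 / tf.constant([0.229, 0.224, 0.225], shape=[1, 1, 3])  # assumed values

def normalize(features):
    """Per-channel normalization of a [batch, height, width, 3] image tensor."""
    features -= tf.cast(MEAN, features.dtype)
    features *= tf.cast(COEF_STD, features.dtype)
    return features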