Ejemplo n.º 1
0
 def backbone(images, is_training):
     """Build the feature extractor selected by ``params['backbone']``.

     `params` is not an argument; it is read from the enclosing scope.

     Arguments:
         images: passed unchanged to the selected network.
         is_training: passed unchanged to the selected network.
     Returns:
         whatever the selected network function returns.
     Raises:
         NotImplementedError: if ``params['backbone']`` is neither
             'mobilenet' nor 'shufflenet'.
     """
     name = params['backbone']

     if name == 'mobilenet':
         return mobilenet_v1(images,
                             is_training,
                             depth_multiplier=params['depth_multiplier'])

     if name == 'shufflenet':
         # this network receives the multiplier converted to a string
         return shufflenet_v2(images,
                              is_training,
                              depth_multiplier=str(
                                  params['depth_multiplier']))

     # fail loudly instead of implicitly returning None,
     # which would only surface later as an obscure error
     raise NotImplementedError(
         'unsupported backbone: {!r}'.format(name))
Ejemplo n.º 2
0
 def backbone(images, is_training):
     """Dispatch to the feature extractor named by ``params['backbone']``.

     `params` is not an argument; it is read from the enclosing scope.

     Arguments:
         images: passed unchanged to the selected network.
         is_training: passed unchanged to the selected network.
     Returns:
         whatever the selected network function returns.
     Raises:
         NotImplementedError: if the configured backbone is not one of
             'mobilenet', 'shufflenet' or 'resnet'.
     """
     kind = params['backbone']

     if kind == 'mobilenet':
         multiplier = params['depth_multiplier']
         return mobilenet_v1(images, is_training, depth_multiplier=multiplier)

     if kind == 'shufflenet':
         # this network receives the multiplier converted to a string
         multiplier = str(params['depth_multiplier'])
         return shufflenet_v2(images, is_training, depth_multiplier=multiplier)

     if kind == 'resnet':
         return resnet(images, is_training,
                       block_sizes=params['block_sizes'],
                       enableBN=params['enableBN'])

     # an 'hrnet' branch is currently disabled:
     # if kind == 'hrnet':
     #     return hrnet(images, is_training, width=params['width'])

     raise NotImplementedError
Ejemplo n.º 3
0
 def backbone(images, is_training):
     """Run `mobilenet_v1` on `images`.

     The depth multiplier comes from ``params['depth_multiplier']``,
     where `params` is read from the enclosing scope.
     """
     multiplier = params['depth_multiplier']
     return mobilenet_v1(images, is_training, depth_multiplier=multiplier)
Ejemplo n.º 4
0
def model_fn(features, labels, mode, params):
    """
    Build the training/evaluation graph for the keypoint subnet
    on top of a `mobilenet_v1` backbone.

    Arguments:
        features: a dict, only `features['images']` is read here.
        labels: a dict, the keys read here are 'heatmaps',
            'segmentation_masks', 'loss_masks' and 'num_boxes'.
        mode: a `tf.estimator.ModeKeys` value, PREDICT is not supported.
        params: a dict, the keys read here are 'depth_multiplier',
            'weight_decay', 'initial_learning_rate' and 'num_steps'.
            It is also passed as a whole to `KeypointSubnet`.
    Returns:
        a `tf.estimator.EstimatorSpec` that holds the total loss and
        either `eval_metric_ops` (EVAL mode) or `train_op` (TRAIN mode).
    """

    assert mode != tf.estimator.ModeKeys.PREDICT
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    images = features['images']
    # it has shape [b, height, width, 3]

    backbone_features = mobilenet_v1(images, is_training,
                                     params['depth_multiplier'])
    subnet = KeypointSubnet(backbone_features, is_training, params)

    # add l2 regularization
    if params['weight_decay'] > 0.0:
        add_weight_decay(params['weight_decay'])
        regularization_loss = tf.losses.get_regularization_loss()
        tf.summary.scalar('regularization_loss', regularization_loss)

    # named loss terms, each one is registered
    # with `tf.losses` and logged as a summary below
    losses = {}

    heatmaps = labels['heatmaps']
    # it has shape [b, h, w, 17],
    # where (h, w) = (height / 4, width / 4),
    # and `b` is batch size

    # all loss terms are divided by the batch size
    batch_size = tf.shape(heatmaps)[0]
    normalizer = tf.to_float(batch_size)

    segmentation_masks = tf.expand_dims(labels['segmentation_masks'], 3)
    loss_masks = tf.expand_dims(labels['loss_masks'], 3)
    # they have shape [b, h, w, 1]

    # the first 17 channels are keypoint heatmaps,
    # channel number 17 is the person segmentation mask
    predicted_heatmaps = subnet.heatmaps[:, :, :, :17]
    predicted_segmentation_masks = tf.expand_dims(subnet.heatmaps[:, :, :, 17],
                                                  3)

    focal_loss_value = focal_loss(heatmaps,
                                  labels['num_boxes'],
                                  predicted_heatmaps,
                                  alpha=2.0,
                                  beta=4.0)  # shape [b, h, w]
    # positions where the loss mask is zero do not contribute
    focal_loss_value = tf.squeeze(loss_masks, 3) * focal_loss_value
    focal_loss_value = tf.reduce_sum(focal_loss_value, axis=[0, 1, 2])
    losses['focal_loss'] = focal_loss_value / normalizer

    regression_loss = tf.nn.l2_loss(
        loss_masks * (predicted_segmentation_masks - segmentation_masks))
    losses['regression_loss'] = 1e-3 * regression_loss / normalizer

    # additional supervision
    # with person segmentation
    for level in range(2, 6):

        # only channel 0 of each enriched feature map is supervised
        x = subnet.enriched_features[f'p{level}']
        x = tf.expand_dims(x[:, :, :, 0], 3)
        # it has shape [b, height / stride, width / stride, 1],
        # where stride is equal to 2 ** level
        # (the masks start at stride 4 and are halved after every level)

        x = tf.nn.l2_loss(loss_masks * (x - segmentation_masks))
        losses[f'segmentation_loss_at_level_{level}'] = 1e-5 * x / normalizer

        # downsample the targets by two for the next level,
        # this also runs once more after the last level
        shape = tf.shape(segmentation_masks)
        height, width = shape[1], shape[2]
        new_size = [height // 2, width // 2]

        segmentation_masks = tf.image.resize_bilinear(segmentation_masks,
                                                      new_size)
        loss_masks = tf.image.resize_bilinear(loss_masks, new_size)

    for n, v in losses.items():
        tf.losses.add_loss(v)
        tf.summary.scalar(n, v)
    total_loss = tf.losses.get_total_loss(add_regularization_losses=True)

    with tf.name_scope('eval_metrics'):

        shape = tf.shape(heatmaps)
        height, width = shape[1], shape[2]
        area = tf.to_float(height * width)

        # `loss_masks` was downsampled in the loop above,
        # so it is rebuilt here at the resolution of the heatmaps
        loss_masks = tf.expand_dims(labels['loss_masks'], 3)
        predicted_heatmaps = tf.sigmoid(predicted_heatmaps)
        per_pixel_reg_loss = tf.nn.l2_loss(
            loss_masks * (predicted_heatmaps - heatmaps)) / (normalizer * area)
        tf.summary.scalar('per_pixel_reg_loss', per_pixel_reg_loss)

    if mode == tf.estimator.ModeKeys.EVAL:

        # only levels 2 and 5 of the
        # segmentation losses are reported
        eval_metric_ops = {
            'eval_regression_loss':
            tf.metrics.mean(losses['regression_loss']),
            'eval_focal_loss':
            tf.metrics.mean(losses['focal_loss']),
            'eval_per_pixel_reg_loss':
            tf.metrics.mean(per_pixel_reg_loss),
            'eval_segmentation_loss_at_level_2':
            tf.metrics.mean(losses['segmentation_loss_at_level_2']),
            'eval_segmentation_loss_at_level_5':
            tf.metrics.mean(losses['segmentation_loss_at_level_5'])
        }

        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          eval_metric_ops=eval_metric_ops)

    with tf.variable_scope('learning_rate'):
        global_step = tf.train.get_global_step()
        learning_rate = tf.train.cosine_decay(params['initial_learning_rate'],
                                              global_step,
                                              decay_steps=params['num_steps'],
                                              alpha=1e-4)
        tf.summary.scalar('learning_rate', learning_rate)

    # ops from the UPDATE_OPS collection
    # must run before the gradients are computed
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(learning_rate)
        grads_and_vars = optimizer.compute_gradients(total_loss)
        # every gradient is clipped elementwise to [-200, 200]
        # NOTE(review): `compute_gradients` may return None for a variable
        # that does not affect the loss and clipping None would fail,
        # confirm that every trainable variable contributes to `total_loss`
        grads_and_vars = [(tf.clip_by_value(g, -200, 200), v)
                          for g, v in grads_and_vars]
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)

    # `v.name[:-2]` drops the last two characters of the
    # variable name (presumably the ':0' suffix) for the summary tag
    for g, v in grads_and_vars:
        tf.summary.histogram(v.name[:-2] + '_hist', v)
        tf.summary.histogram(v.name[:-2] + '_grad_hist', g)

    return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
Ejemplo n.º 5
0
def model_fn(features, labels, mode, params):
    """
    Build the training/evaluation graph for RetinaNet
    on top of a frozen `mobilenet_v1` backbone.

    Arguments:
        features: a dict, only `features['images']` is read here.
        labels: passed to `retinanet.loss` and,
            in EVAL mode, to `Evaluator.get_metric_ops`.
        mode: a `tf.estimator.ModeKeys` value, PREDICT is not supported.
        params: a dict, the keys read here are 'depth_multiplier',
            'score_threshold', 'iou_threshold', 'max_boxes', 'weight_decay',
            'localization_loss_weight', 'classification_loss_weight',
            'initial_learning_rate' and 'num_steps'.
            It is also passed as a whole to `RetinaNet` and `retinanet.loss`.
    Returns:
        a `tf.estimator.EstimatorSpec` that holds the total loss and
        either `eval_metric_ops` (EVAL mode) or `train_op` (TRAIN mode).
    """

    assert mode != tf.estimator.ModeKeys.PREDICT
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    images = features['images']
    # the backbone is always built with `is_training=False`,
    # its variables are also excluded from training below
    backbone_features = mobilenet_v1(
        images, is_training=False, depth_multiplier=params['depth_multiplier'])
    retinanet = RetinaNet(backbone_features, tf.shape(images), is_training,
                          params)

    # add nms to the graph
    # (`predictions` exists only when not training,
    # it is used only in the EVAL branch below)
    if not is_training:
        predictions = retinanet.get_predictions(
            score_threshold=params['score_threshold'],
            iou_threshold=params['iou_threshold'],
            max_detections=params['max_boxes'])

    # add l2 regularization
    add_weight_decay(params['weight_decay'])
    regularization_loss = tf.losses.get_regularization_loss()
    tf.summary.scalar('regularization_loss', regularization_loss)

    # create localization and classification losses
    # (the weighted values go into the total loss,
    # the summaries show the unweighted values)
    losses = retinanet.loss(labels, params)
    tf.losses.add_loss(params['localization_loss_weight'] *
                       losses['localization_loss'])
    tf.losses.add_loss(params['classification_loss_weight'] *
                       losses['classification_loss'])
    tf.summary.scalar('localization_loss', losses['localization_loss'])
    tf.summary.scalar('classification_loss', losses['classification_loss'])
    total_loss = tf.losses.get_total_loss(add_regularization_losses=True)

    if mode == tf.estimator.ModeKeys.EVAL:

        # evaluation requires a static batch size equal to one
        shape = features['images'].shape
        batch_size = shape[0].value
        assert batch_size == 1

        evaluator = Evaluator()
        eval_metric_ops = evaluator.get_metric_ops(labels, predictions)

        return tf.estimator.EstimatorSpec(mode,
                                          loss=total_loss,
                                          eval_metric_ops=eval_metric_ops)

    with tf.variable_scope('learning_rate'):
        global_step = tf.train.get_global_step()
        learning_rate = tf.train.cosine_decay(params['initial_learning_rate'],
                                              global_step,
                                              decay_steps=params['num_steps'],
                                              alpha=1e-4)
        tf.summary.scalar('learning_rate', learning_rate)

    # ops from the UPDATE_OPS collection
    # must run before the gradients are computed
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # backbone network is frozen
        # (variables with 'MobilenetV1' in their name are not optimized)
        var_list = [
            v for v in tf.trainable_variables() if 'MobilenetV1' not in v.name
        ]

        grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)

    # `v.name[:-2]` drops the last two characters of the
    # variable name (presumably the ':0' suffix) for the summary tag
    for g, v in grads_and_vars:
        tf.summary.histogram(v.name[:-2] + '_hist', v)
        tf.summary.histogram(v.name[:-2] + '_grad_hist', g)

    return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
Ejemplo n.º 6
0
def create_full_graph(images, params):
    """
    Build the inference graph: backbone, keypoint subnet, RetinaNet
    detector and the `prn` network that is applied to heatmap crops.

    Batch size must be a static value.
    Image size must be divisible by 128.

    Arguments:
        images: a float tensor with shape [b, h, w, 3].
        params: a dict.
    Returns:
        a dict with one entry for every name in `OUTPUT_NAMES`,
        each tensor is wrapped in `tf.identity` and named after its key.
        The tensors that are built here are:

        boxes: a float tensor with shape [b, max_boxes, 4],
            where max_boxes = max(num_boxes).
        scores: a float tensor with shape [b, max_boxes].
        num_boxes: an int tensor with shape [b].
        keypoint_heatmaps: a float tensor with shape [b, h / 4, w / 4, 17].
        segmentation_masks: a float tensor with shape [b, h / 4, w / 4].
        keypoint_scores: a float tensor with shape [total_num_boxes, 17],
            where total_num_boxes = sum(num_boxes).
        keypoint_positions: a float tensor with shape [total_num_boxes, 17, 2],
            it holds (y, x) positions divided by the crop size.
    """

    is_training = False
    backbone_features = mobilenet_v1(images, is_training,
                                     params['depth_multiplier'])

    with tf.variable_scope('keypoint_subnet'):
        subnet = KeypointSubnet(backbone_features, is_training, params)

    with tf.variable_scope('retinanet'):
        retinanet = RetinaNet(backbone_features, tf.shape(images), is_training,
                              params)

    # the sigmoid is applied to the 17 keypoint channels only,
    # channel number 17 is returned as it is
    predictions = {
        'keypoint_heatmaps': tf.sigmoid(subnet.heatmaps[:, :, :, :17]),
        'segmentation_masks': subnet.heatmaps[:, :, :, 17]
    }
    predictions.update(
        retinanet.get_predictions(score_threshold=params['score_threshold'],
                                  iou_threshold=params['iou_threshold'],
                                  max_detections=params['max_boxes']))

    # the python loop over the batch below needs a static batch size
    batch_size = images.shape[0].value
    assert batch_size is not None

    heatmaps = predictions['keypoint_heatmaps']  # shape [b, h / 4, w / 4, 17]
    predicted_boxes = predictions['boxes']  # shape [b, max_boxes, 4]
    num_boxes = predictions['num_boxes']  # shape [b]

    # min-max normalization of every heatmap channel over its spatial
    # dimensions, channels with a maximum not above 0.2 are set to zero
    # NOTE(review): if M equals m for some channel the division is 0 / 0
    # and multiplying by the mask does not remove the NaN, confirm that
    # this cannot happen in practice
    M = tf.reduce_max(heatmaps, [1, 2], keepdims=True)
    mask = tf.to_float(M > 0.2)
    m = tf.reduce_min(heatmaps, [1, 2], keepdims=True)
    heatmaps = (heatmaps - m) / (M - m)
    heatmaps *= mask

    # keep only the first `num_boxes[i]` boxes of every image
    # and remember the image index of every kept box
    boxes, box_ind = [], []
    for i in range(batch_size):
        n = num_boxes[i]
        boxes.append(predicted_boxes[i][:n])
        box_ind.append(i * tf.ones([n], dtype=tf.int32))

    boxes = tf.concat(boxes, axis=0)  # shape [num_boxes, 4]
    box_ind = tf.concat(box_ind, axis=0)  # shape [num_boxes]
    # where num_boxes is equal to sum(num_boxes)

    crops = tf.image.crop_and_resize(
        heatmaps, boxes, box_ind,
        crop_size=CROP_SIZE)  # shape [num_boxes, 56, 36, 17]

    # from here on `num_boxes` is the total number of crops (a scalar)
    num_boxes = tf.shape(crops)[0]
    logits = prn(crops, is_training)
    # it has shape [num_boxes, 56, 36, 17]

    # softmax over all spatial positions, separately for every keypoint
    H, W = CROP_SIZE
    logits = tf.reshape(logits, [num_boxes, H * W, 17])
    probabilities = tf.nn.softmax(logits, axis=1)
    probabilities = tf.reshape(probabilities, [num_boxes, H, W, 17])

    def argmax_2d(x):
        """
        Find the spatial position of the maximum for every channel.

        Arguments:
            x: a tensor with shape [b, h, w, c].
        Returns:
            an int tensor with shape [b, c, 2],
            the last dimension holds (y, x) indices.
        """
        shape = tf.unstack(tf.shape(x))
        b, h, w, c = shape

        flat_x = tf.reshape(x, [b, h * w, c])
        argmax = tf.argmax(flat_x, axis=1, output_type=tf.int32)

        # convert the flat index back to the row and the column
        argmax_y = argmax // w
        argmax_x = argmax % w

        return tf.stack([argmax_y, argmax_x], axis=2)

    keypoint_scores = tf.reduce_max(probabilities,
                                    axis=[1, 2])  # shape [num_boxes, 17]
    keypoint_positions = tf.to_float(
        argmax_2d(probabilities))  # shape [num_boxes, 17, 2]

    # divide (y, x) by (H, W), so positions are relative to the crop
    scaler = tf.stack(CROP_SIZE, axis=0)
    keypoint_positions /= tf.to_float(scaler)

    predictions.update({
        'keypoint_scores': keypoint_scores,
        'keypoint_positions': keypoint_positions
    })

    # keep only the tensors listed in `OUTPUT_NAMES`
    # and give every output tensor the name of its key
    predictions = {
        n: tf.identity(predictions[n], name=n)
        for n in OUTPUT_NAMES
    }
    return predictions