Example 1
def train_input(config, params):
  """
  Returns a tf.data.Dataset for training from Mapillary Vistas tfrecords.

  Arguments:
    config: unused for now
    params: object with the following attributes:
      required by _train_parse_and_prebatch_processing:
        tfrecords_path: path of tfrecords using KEYS2FEATURES_v5
        training_lids2cids: ...
      required by this function
        Ntrain: number of tfexamples in tfrecords
        Nb: number of examples per batch

  Returns:
    A tf.data.Dataset containing (features, labels) tuples where:
      features = {'proimages', 'rawimagespaths', 'rawlabelspaths'}
      labels = {'prolabels'}
  """

  def _grouping(pim, pla, imp, lap):
    # group dataset elements as required by estimator
    features = {
        # 'rawimages': tf.zeros_like(pim),
        'proimages': pim,
        'rawimagespaths': imp,
        'rawlabelspaths': lap,
        }
    labels = {
        # 'rawlabels': tf.zeros_like(pla),
        'prolabels': pla,
        }

    # next lines are for distributed debugging
    # tf.string tensors are not supported for DMA read/write to GPUs (TF bug)
    if params.distribute:
      del features['rawimagespaths']
      del features['rawlabelspaths']

    return (features, labels)

  with tf.variable_scope('input_pipeline'):
    dataset = prebatch_dataset(config, params)
    # Vistas raw images and labels don't have the same spatial size and cannot be batched
    dataset = dataset.map(lambda rim, rla, pim, pla, rip, rlp: (pim, pla, rip, rlp))
    dataset = dataset.batch(get_temp_Nb(config, params.Nb))
    dataset = postbatch_dataset(dataset, config, params)
    dataset = dataset.map(_grouping, num_parallel_calls=NUM_PARALLEL_CALLS)
    dataset = dataset.prefetch(None)

  return dataset
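A minimal sketch of how the returned dataset could be inspected outside an Estimator, assuming `config` and `params` objects compatible with the pipeline above (both are stand-ins here, not part of the original example):

# Sketch (assumption): pull one batch from the pipeline and inspect its structure.
# `config` and `params` are placeholders for this project's configuration objects.
dataset = train_input(config, params)
features, labels = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    f, l = sess.run((features, labels))
    print(f['proimages'].shape)   # batched preprocessed images
    print(l['prolabels'].shape)   # batched preprocessed labels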
def train_input(config, params):
    """
  Returns a tf.data.Dataset for training from Cityscapes tfrecords.

  Arguments:
    config: unused for now
    params: object with the following attributes:
      required by this function
        tfrecords_path: path of tfrecords
        Ntrain: number of tfexamples in tfrecords
        Nb: number of examples per batch
      required by _train_prebatch_processing:
        {height, width}_feature_extractor: the spatial size of feature extractor
        training_lids2cids: the ...

  Returns:
    A tf.data.Dataset dataset containing (features, labels) tuples where:
      features = {'rawimages', 'proimages', 'rawimagespaths', 'rawlabelspaths'}
      labels = {'rawlabels', 'prolabels'}
  """
    def _grouping(rim, rla, pim, pla, imp, lap):
        # group dataset elements as required by estimator
        features = {
            'rawimages': rim,
            'proimages': pim,
            'rawimagespaths': imp,
            'rawlabelspaths': lap,
        }
        labels = {
            'rawlabels': rla,
            'prolabels': pla,
        }

        # next lines are for distributed debugging
        # tf.string tensors are not supported for DMA read/write to GPUs (TF bug)
        if params.distribute:
            del features['rawimagespaths']
            del features['rawlabelspaths']

        return (features, labels)

    with tf.variable_scope('input_pipeline'):
        dataset = prebatch_dataset(config, params)
        dataset = dataset.batch(get_temp_Nb(config, params.Nb))
        dataset = postbatch_dataset(dataset, config, params)
        dataset = dataset.map(_grouping, num_parallel_calls=NUM_PARALLEL_CALLS)
        dataset = dataset.prefetch(None)

    return dataset
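The grouped (features, labels) dicts are consumed by the Estimator's model_fn; a hedged sketch of that contract follows, with key names mirroring _grouping above (the model body itself is omitted and the function name is illustrative):

# Sketch (assumption): the grouped dicts arrive unchanged as the first two arguments
# of the Estimator's model_fn; the keys mirror those produced by _grouping above.
def model_fn(features, labels, mode, params):
    proimages = features['proimages']   # preprocessed images
    prolabels = labels['prolabels']     # preprocessed labels
    # ... build the network, the losses and an EstimatorSpec here ...
    raise NotImplementedError('illustrative stub only')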
def train_input(config, params):
    """
  Returns a tf.data.Dataset for training from Open Images data
  """
    def _grouping(pim, pla, iid):
        # group dataset elements as required by estimator
        features = {
            # 'rawimages': rim,
            'proimages': pim,
            'imageids': iid,
            # 'rawimagespaths': imp,
            # 'rawlabelspaths': lap,
        }
        labels = {
            # 'rawlabels': rla,
            'prolabels': pla,
        }

        # next line is for distributed debugging
        # tf.string tensors are not supported for DMA read/write to GPUs (TF bug)
        if params.distribute:
            # del features['rawimagespaths']
            # del features['rawlabelspaths']
            del features['imageids']

        return (features, labels)

    with tf.name_scope('input_pipeline'):
        dataset = prebatch_dataset(config, params)
        dataset = dataset.batch(get_temp_Nb(config, params.Nb))
        dataset = postbatch_dataset(dataset, config, params)
        dataset = dataset.map(_grouping, num_parallel_calls=NUM_PARALLEL_CALLS)
        options = tf.data.Options()
        options.experimental_autotune = True
        # seems that on average this gives faster results
        dataset = dataset.prefetch(None).with_options(options)

    return dataset
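As an aside, a similar autotuning intent can presumably be expressed with the AUTOTUNE constant (assumed available in the TF 1.13+ version this code targets) instead of going through tf.data.Options; a toy sketch, not part of this project's code:

import tensorflow as tf

# Toy sketch (assumption): let tf.data tune the prefetch buffer via the AUTOTUNE constant.
toy = tf.data.Dataset.range(100).batch(10)
toy = toy.prefetch(tf.data.experimental.AUTOTUNE)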
def _define_summaries(mode, config, params, summaries_data):
    # this function is only to be used for training mode

    assert mode == tf.estimator.ModeKeys.TRAIN, (
        'internal error: summaries only for training.')

    with tf.name_scope('summaries'), tf.device('/cpu:0'):
        # unpack necessary objects and tensors
        # WARNING: assumes all necessary items exist (maybe add assertions)
        # rawlabels = summaries_data['labels']['rawlabels']
        proimages = summaries_data['features']['proimages']
        prolabels_per_pixel = summaries_data['labels']['prolabels_per_pixel']
        prolabels_per_bbox = summaries_data['labels']['prolabels_per_bbox']
        l1_probs, l1_decs, l2_vehicle_probs, l2_vehicle_decs, l2_human_probs, l2_human_decs, decs = itemgetter(
            'l1_probabilities', 'l1_decisions', 'l2_vehicle_probabilities',
            'l2_vehicle_decisions', 'l2_human_probabilities',
            'l2_human_decisions', 'decisions')(summaries_data['predictions'])
        # create a new dict with the supported keys only
        # predictions = _map_predictions_to_new_cids(
        # {'probabilities': probs, 'decisions': decs}, params.training_cids2inference_cids)
        # probs, decs = itemgetter('probabilities', 'decisions')(predictions)
        tot_loss, reg_loss, l1_seg_loss, l1_seg_loss_hot, l2_vehicle_seg_loss, l2_human_seg_loss = itemgetter(
            'total', 'regularization', 'l1_segmentation',
            'l1_segmentation_hot', 'l2_vehicle_segmentation',
            'l2_human_segmentation')(summaries_data['losses'])

        # drawing
        with tf.name_scope('drawing'):
            with tf.name_scope('palette'):
                palette = tf.constant(
                    params.training_problem_def['cids2colors'], dtype=tf.uint8)

            # WARNING: assuming upsampling, that is all color_* images have the
            # same spatial dimensions
            if params.per_pixel_dataset_name == 'vistas':
                # human: 19->19, vehicle: 49->52
                l1_cids2common_cids = tf.cast([
                    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                    17, 18, 19, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
                    35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
                    50, 51, 52, 63, 64, 65
                ], tf.int32)
                per_bbox_cids2common_cids = tf.cast([
                    52, 54, 55, 57, 58, 61, 19, 19, 19, 19, 19, 48, 50, 50, 65
                ], tf.int32)
            elif params.per_pixel_dataset_name == 'cityscapes':
                l1_cids2common_cids = tf.cast(
                    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 19], tf.int32)
                per_bbox_cids2common_cids = tf.cast(
                    [18, 15, 13, 17, 16, 14, 11, 11, 11, 11, 11, 6, 7, 7, 19],
                    tf.int32)
            color_l1_decisions = _cids2col(
                tf.gather(l1_cids2common_cids, l1_decs), palette)
            color_l2_vehicle_decisions = _cids2col(l2_vehicle_decs, palette)
            color_decisions = _cids2col(decs, palette)

            # generate confidence image, preventing TF from normalizing max prob
            # to 1, by casting to tf.uint8
            color_l1_confidences = tf.stack(
                [tf.cast(tf.reduce_max(l1_probs, axis=3) * 255, tf.uint8)] * 3,
                axis=3)
            # raise to ^50 for more contrast in high probabilities
            color_l2_vehicle_confidences = tf.stack([
                tf.cast(
                    tf.reduce_max(tf.pow(l2_vehicle_probs, 50), axis=3) * 255,
                    tf.uint8)
            ] * 3,
                                                    axis=3)

            color_prolabels = _cids2col(
                tf.concat([
                    prolabels_per_pixel,
                    tf.gather(
                        per_bbox_cids2common_cids,
                        tf.argmax(
                            prolabels_per_bbox, axis=-1, output_type=tf.int32))
                ], 0), palette)

            # TODO(panos): as noted in MirrorStrategy, in a multi-gpu setting the effective
            #   batch size is num_gpus * Nb, however in the early implementation
            #   (master branch of April 1st 2018), summaries are computed per GPU
            #   and since Nb >= Nb/num_gpus (of the future implementation)
            #   no change is needed here
            # TODO(panos): here it is assumed that the input pipeline outputs proimages in [-1, 1)
            tf.summary.image(
                'proimages',
                (proimages + 1) / 2,
                # tf.image.convert_image_dtype(proimages, tf.uint8, saturate=True),
                max_outputs=100,
                family='preprocessed_data')
            tf.summary.image('prolabels',
                             color_prolabels,
                             max_outputs=100,
                             family='preprocessed_data')
            tf.summary.image('decisions',
                             color_decisions,
                             max_outputs=100,
                             family='results')
            tf.summary.image('l1_decisions',
                             color_l1_decisions,
                             max_outputs=100,
                             family='results')
            tf.summary.image('l2_vehicle_decisions',
                             color_l2_vehicle_decisions,
                             max_outputs=100,
                             family='results')
            tf.summary.image('l1_confidences',
                             color_l1_confidences,
                             max_outputs=100,
                             family='results')
            tf.summary.image('l2_vehicle_confidences_stretched',
                             color_l2_vehicle_confidences,
                             max_outputs=100,
                             family='results')

            # compute batch metrics
            m_iou_per_pixel = mean_iou(
                prolabels_per_pixel,
                decs[:get_temp_Nb(config, params.Nb_per_pixel), ...],
                num_classes=params.output_Nclasses,
                params=params)

        # TODO: in order to disable loss summary created internally by estimator this line should
        # evaluate to False:
        # not any([x.op.name == 'loss' for x in ops.get_collection(ops.GraphKeys.SUMMARIES)])
        tf.summary.scalar('total', tot_loss, family='losses')
        tf.summary.scalar('regularization', reg_loss, family='losses')
        tf.summary.scalar('l1_segmentation', l1_seg_loss, family='losses')
        # tf.summary.scalar('l1_segmentation_hot', l1_seg_loss_hot, family='losses')
        tf.summary.scalar('l2_vehicle_segmentation',
                          l2_vehicle_seg_loss,
                          family='losses')
        tf.summary.scalar('l2_human_segmentation',
                          l2_human_seg_loss,
                          family='losses')
        tf.summary.scalar('mIoU', m_iou_per_pixel, family='metrics')

        tf.summary.scalar('learning_rate',
                          summaries_data['learning_rate'],
                          family='optimizer')
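_cids2col is project-specific; a minimal sketch of the palette lookup it presumably performs (toy palette and ids, hypothetical helper name):

import tensorflow as tf

# Minimal sketch (assumption) of the palette lookup behind _cids2col: map per-pixel
# class ids to RGB colors by indexing a Nclasses x 3 uint8 palette with tf.gather.
def cids_to_colors(cids, palette):
    # cids: tf.int32, Nb x H x W; palette: tf.uint8, Nclasses x 3
    return tf.gather(palette, cids)  # Nb x H x W x 3, ready for tf.summary.image

toy_palette = tf.constant([[0, 0, 0], [128, 64, 128], [220, 20, 60]], tf.uint8)
toy_cids = tf.constant([[[0, 1], [2, 1]]], tf.int32)   # 1 x 2 x 2
toy_colors = cids_to_colors(toy_cids, toy_palette)     # 1 x 2 x 2 x 3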
def define_losses(mode, predictions, labels, config, params): # pylint: disable=unused-argument
  """
  """

  l1_logits = predictions['l1_logits']
  l1_decisions = predictions['l1_decisions']
  l2_vehicle_logits = predictions['l2_vehicle_logits']
  l2_vehicle_probabilities = predictions['l2_vehicle_probabilities']
  l2_human_logits = predictions['l2_human_logits']
  l2_human_probabilities = predictions['l2_human_probabilities']

  ## generate losses
  if mode == tf.estimator.ModeKeys.EVAL:
    tf.logging.info('Losses for evaluation are not yet implemented (set to 0 for now).')
    return {'total': tf.constant(0.),
            'segmentation': tf.constant(0.),
            'regularization': tf.constant(0.)}

  elif mode == tf.estimator.ModeKeys.TRAIN:

    Nb_per_pixel = get_temp_Nb(config, params.Nb_per_pixel)
    Nb_per_bbox = get_temp_Nb(config, params.Nb_per_bbox)
    Nb_per_image = get_temp_Nb(config, params.Nb_per_image)
    per_pixel_dataset = params.per_pixel_dataset_name
    if per_pixel_dataset == 'vistas':
      cid_l1_vehicle = 49
      cid_l1_human = 19
      per_pixel_cids2l1_cids = tf.cast([
          0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
          10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
          19, 19, 19, 20, 21, 22, 23, 24, 25, 26,
          27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
          37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
          47, 48, 49, 49, 49, 49, 49, 49, 49, 49,
          49, 49, 49, 50, 51, 52], tf.int32)
      per_bbox_cids2l1_cids = tf.cast([
          49, 49, 49, 49, 49, 49, 19, 19, 19, 19,
          19, 52, 52, 52, 52], tf.int32)
      # 0: bicycle, 1: boat, 2: bus, 3: car, 4: caravan, 5: motorcycle, 6: on rails,
      # 7: other vehicle, 8: trailer, 9: truck, 10: wheeled slow, 11: void
      per_pixel_cids2vehicle_cids = tf.cast([
          11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
          11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
          11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
          11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
          11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
          11, 11,  0,  1,  2,  3,  4,  5,  6,  7,
           8,  9, 10, 11, 11, 11], tf.int32)
      per_bbox_cids2vehicle_cids = tf.cast(
          [0, 2, 3, 5, 6, 9, 11, 11, 11, 11, 11, 11, 11, 11, 11], tf.int32)
      # 0: person, 1: bicyclist, 2: motorcyclist, 3: other rider, 4: void
      per_pixel_cids2human_cids = tf.cast([
          4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
          4, 4, 4, 4, 4, 4, 4, 4, 4, 0,
          1, 2, 3, 4, 4, 4, 4, 4, 4, 4,
          4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
          4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
          4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
          4, 4, 4, 4, 4, 4], tf.int32)
      per_bbox_cids2human_cids = tf.cast(
          [4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 4, 4, 4, 4], tf.int32)
    elif per_pixel_dataset == 'cityscapes':
      cid_l1_vehicle = 12
      cid_l1_human = 11
      per_pixel_cids2l1_cids = tf.cast([
           0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
          10,11,11,12,12,12,12,12,12,13], tf.int32)
      per_bbox_cids2l1_cids = tf.cast([
          12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
          11, 13, 13, 13, 13], tf.int32)
      per_pixel_cids2vehicle_cids = tf.cast([
          6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
          6, 6, 6, 0, 1, 2, 3, 4, 5, 6], tf.int32)
      per_bbox_cids2vehicle_cids = tf.cast(
          [5, 2, 0, 4, 3, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6], tf.int32)
      per_pixel_cids2human_cids = tf.cast([
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 0, 1, 2, 2, 2, 2, 2, 2, 2], tf.int32)
      per_bbox_cids2human_cids = tf.cast(
          [2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 2], tf.int32)

    ## labels for l1 classifier
    # l1 labels concatenate per_pixel (dense) labels with dummy labels for the weakly
    # labeled (Open Images) examples; the dummies are masked out by weights in the loss
    per_pixel_labels = labels['prolabels_per_pixel']
    per_bbox_labels = labels['prolabels_per_bbox']
    per_image_labels = labels['prolabels_per_image']
    l1_per_pixel_labels = tf.gather(per_pixel_cids2l1_cids, per_pixel_labels)
    # dummy labels, will be masked during loss computation by weights
    l1_per_bbox_labels = tf.constant(
        -1000, dtype=tf.int32, shape=(Nb_per_bbox, *l1_per_pixel_labels.shape[1:]))
    l1_per_image_labels = tf.constant(
        -1000, dtype=tf.int32, shape=(Nb_per_image, *l1_per_pixel_labels.shape[1:]))
    l1_labels = tf.concat([l1_per_pixel_labels, l1_per_bbox_labels, l1_per_image_labels], 0)
    l1_labels = tf.stop_gradient(l1_labels) # tf.int32, (H, W), with indices

    ## labels for the vehicle l2 classifier
    l2_vehicle_per_pixel_labels = tf.gather(per_pixel_cids2vehicle_cids, per_pixel_labels)
    l2_vehicle_per_pixel_labels = tf.one_hot(l2_vehicle_per_pixel_labels, tf.reduce_max(per_pixel_cids2vehicle_cids)+1)
    # _segment_sum strategy: e.g. if one pixel belongs to bbox of human and vehicle
    # for the vehicle classifier 1/2 will remain for supervision and 1/2 will go to void
    l2_vehicle_per_bbox_labels = _segment_sum(per_bbox_labels, per_bbox_cids2vehicle_cids, tf.reduce_max(per_bbox_cids2vehicle_cids)+1)
    l2_vehicle_per_image_labels = _segment_sum(per_image_labels, per_bbox_cids2vehicle_cids, tf.reduce_max(per_bbox_cids2vehicle_cids)+1)

    l2_vehicle_labels = tf.concat([l2_vehicle_per_pixel_labels, l2_vehicle_per_bbox_labels, l2_vehicle_per_image_labels], 0)
    l2_vehicle_labels = tf.stop_gradient(l2_vehicle_labels)

    ## labels for the human l2 classifier
    l2_human_per_pixel_labels = tf.gather(per_pixel_cids2human_cids, per_pixel_labels)
    l2_human_per_pixel_labels = tf.one_hot(l2_human_per_pixel_labels, tf.reduce_max(per_pixel_cids2human_cids)+1)
    l2_human_per_bbox_labels = _segment_sum(per_bbox_labels, per_bbox_cids2human_cids, tf.reduce_max(per_bbox_cids2human_cids)+1)
    l2_human_per_image_labels = _segment_sum(per_image_labels, per_bbox_cids2human_cids, tf.reduce_max(per_bbox_cids2human_cids)+1)
    l2_human_labels = tf.concat([l2_human_per_pixel_labels, l2_human_per_bbox_labels, l2_human_per_image_labels], 0)
    l2_human_labels = tf.stop_gradient(l2_human_labels)

    ## l1 loss: for per_pixel, disabled: [and high-confidence loss for open-images]
    with tf.name_scope("l1_cross_entropy_loss",
                       (l1_logits, l1_per_pixel_labels, l1_per_bbox_labels)) as l1_loss_scope:
      l1_raw_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=l1_per_pixel_labels,
          logits=l1_logits[:Nb_per_pixel, ...],
          name="l1")
      l1_per_pixel_weights = tf.cast(l1_per_pixel_labels <= tf.reduce_max(per_pixel_cids2l1_cids)-1, tf.float32)
      l1_per_bbox_weights = tf.zeros(tf.shape(per_bbox_labels)[:-1])
      l1_weights = tf.concat([l1_per_pixel_weights, l1_per_bbox_weights], 0)

      # debug summary
      tf.summary.image('l1_weights', l1_weights[..., tf.newaxis], max_outputs=100, family='debug')

    ## l2 losses per classifier: for per_pixel and weak labels (open images)
    # all examples from per_pixel must accumulate loss, but only the examples from open images
    # that are found to be vehicle by the parent (l1) classifier: this is implemented with weights
    with tf.name_scope("l2_cross_entropy_losses",
                       (l2_vehicle_logits, l2_vehicle_labels)) as l2_loss_scope:

      # vehicle l2 classifier
      l2_vehicle_raw_loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=l2_vehicle_labels,
          logits=l2_vehicle_logits,
          name="vehicle")

      l2_vehicle_per_pixel_weights = 1.0 - l2_vehicle_labels[:Nb_per_pixel, ..., -1]
      # not_void: pixels that are non_void
      # l1 correct: pixels that l1 classifier found as vehicle and the l2 gt
      #   agrees that may belong to vehicle
      # l2_vehicle_labels[Nb_per_pixel:, ..., -1] due to _segment_sum may have all type of values in [0, 1]
      not_void_weights = tf.greater(1.0 - l2_vehicle_labels[Nb_per_pixel:, ..., -1], 0.01)
      with tf.control_dependencies([l1_decisions]):
        l1_correct_weights = tf.logical_and(
            tf.equal(l1_decisions[Nb_per_pixel:, ...], cid_l1_vehicle),
            tf.greater_equal(tf.reduce_max(l2_vehicle_labels[Nb_per_pixel:, ..., :-1], axis=-1), 0.01))
      l2_vehicle_weak_weights = tf.cast(tf.logical_and(not_void_weights, l1_correct_weights), tf.float32)
      l2_vehicle_weights = tf.concat([l2_vehicle_per_pixel_weights, l2_vehicle_weak_weights], 0)

      tf.summary.image('l2_vehicle_weights', l2_vehicle_weights[..., tf.newaxis], max_outputs=100, family='debug')

      # human l2 classifier
      l2_human_raw_loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=l2_human_labels,
          logits=l2_human_logits,
          name="human")

      l2_human_per_pixel_weights = 1.0 - l2_human_labels[:Nb_per_pixel, ..., -1]
      # not_void: pixels that are non_void
      # l1 correct: pixels that l1 classifier found as human and the l2 gt
      #   agrees that may belong to human
      not_void_weights = tf.greater(1.0 - l2_human_labels[Nb_per_pixel:, ..., -1], 0.01)
      with tf.control_dependencies([l1_decisions]):
        l1_correct_weights = tf.logical_and(
            tf.equal(l1_decisions[Nb_per_pixel:, ...], cid_l1_human),
            tf.greater_equal(tf.reduce_max(l2_human_labels[Nb_per_pixel:, ..., :-1], axis=-1), 0.01))
      l2_human_weak_weights = tf.cast(tf.logical_and(not_void_weights, l1_correct_weights), tf.float32)
      l2_human_weights = tf.concat([l2_human_per_pixel_weights, l2_human_weak_weights], 0)

      tf.summary.image('l2_human_weights', l2_human_weights[..., tf.newaxis], max_outputs=100, family='debug')

    ## compute losses
    # l1 accumulates from per_pixel and selectively from Open Images
    l1_seg_loss = tf.losses.compute_weighted_loss(
        l1_raw_loss, weights=l1_per_pixel_weights, scope=l1_loss_scope, loss_collection=None)
    # l1_seg_loss_hot = tf.losses.compute_weighted_loss(
    #     l1_raw_loss_hot, weights=l1_weights_hot, scope=l1_loss_scope, loss_collection=None)
    # l2 accumulates from per_pixel and Open Images
    l2_vehicle_seg_loss = tf.losses.compute_weighted_loss(
        l2_vehicle_raw_loss, weights=l2_vehicle_weights, scope=l2_loss_scope, loss_collection=None)
    l2_human_seg_loss = tf.losses.compute_weighted_loss(
        l2_human_raw_loss, weights=l2_human_weights, scope=l2_loss_scope, loss_collection=None)
    l2_seg_loss = l2_vehicle_seg_loss + l2_human_seg_loss

    print('\n\nweak labels loss coeff. changed to 0.1.\n\n')
    seg_loss = l1_seg_loss + 0.1 * l2_seg_loss
    tf.losses.add_loss(seg_loss)
    reg_loss = tf.add_n(tf.losses.get_regularization_losses())
    tot_loss = tf.losses.get_total_loss(add_regularization_losses=True)
    losses = {'total': tot_loss,
              'l1_segmentation': l1_seg_loss,
              'l1_segmentation_hot': tf.zeros_like(l1_seg_loss), #l1_seg_loss_hot,
              'l2_vehicle_segmentation': l2_vehicle_seg_loss,
              'l2_human_segmentation': l2_human_seg_loss,
              'regularization': reg_loss}

  else:
    raise NotImplementedError(f"mode {mode} is invalid or not yet implemented.")

  return losses
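A toy sketch of the weight-masking strategy used above, assuming the default SUM_BY_NONZERO_WEIGHTS reduction of tf.losses.compute_weighted_loss (toy values, not this project's code):

import tensorflow as tf

# Toy sketch (assumption): per-element losses are averaged only over elements with
# nonzero weight, so dummy / void elements contribute nothing to the final loss.
raw = tf.constant([0.5, 2.0, 3.0])        # per-element cross-entropy values
weights = tf.constant([1.0, 1.0, 0.0])    # 0 masks the dummy / void element
masked = tf.losses.compute_weighted_loss(raw, weights=weights, loss_collection=None)
# masked == (0.5 + 2.0) / 2 == 1.25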
Example 6
def model(mode, features, labels, config, params):
    """
  Arguments:
    features: tf.float32, Nb x hf x wf x Nc, (?, ?, ?, ?)

  Effective Receptive Field: ~200x200 pixels
  Return:
    predictions: a dict containing predictions:
      logits: tf.float32, Nb x hf x wf x Nc, (?, hf, wf, Nc)
      probabilities: tf.float32, Nb x hf x wf x Nc, (?, hf, wf, Nc)
      decisions: tf.int32, Nb x hf x wf, (?, hf, wf, Nc)
  """

    _validate_params(params)

    # the extended ResNet-50 model requires a predefined channel (C) dimension
    # set channel shape to 3 (RGB colors)
    features.set_shape((None, None, None, 3))
    # if group norm is used, the batch dimension needs to be defined
    if params.norm_layer == 'group':
        features.set_shape((get_temp_Nb(config, params.Nb), None, None, None))

    # define arguments scope
    network_scope_args = {
        'norm_train_variables': params.norm_train_variables,
        'batch_norm_accumulate_statistics':
        params.batch_norm_accumulate_statistics,
        'norm_type': params.norm_layer
    }
    if mode == tf.estimator.ModeKeys.TRAIN:
        network_scope_args.update(weight_decay=params.regularization_weight,
                                  batch_norm_decay=params.batch_norm_decay,
                                  cross_replica_norm=params.cross_replica_norm)
    args_context = functools.partial(module_arg_scope, **network_scope_args)

    # build the feature extractor
    with tf.variable_scope('feature_extractor'), slim.arg_scope(
            args_context()):
        features, end_points = feature_extractor(mode, features, labels,
                                                 config, params)
        # add optionally a PSP module
        if params.psp_module:
            with tf.variable_scope('pyramid_module'):
                features = _create_psp_module(features, params)

    def _bottleneck(features, scope):
        return resnet_v1.bottleneck(features,
                                    features.shape[-1].value,
                                    features.shape[-1].value,
                                    1,
                                    scope=scope)

    with tf.variable_scope('adaptation_module'), slim.arg_scope(
            args_context()):
        # Vistas: l1 features for classifying into 50(Vistas) + 1(vehicle) + 1(human) + 1(void) classes
        #         l2 features for classifying vehicle into 11(types of vehicle) + 1(void) classes
        #         l2 features for classifying human into 4(types of human) + 1(void) classes
        l1_features = _bottleneck(features, 'l1_features')
        l2_vehicle_features = _bottleneck(features, 'l2_vehicle_features')
        l2_human_features = _bottleneck(features, 'l2_human_features')

    ## create head: logits, probabilities and top-1 decisions
    ##   First the logits are created and then upsampled for memory efficiency.
    # if group normalization then groups must be less than channels ->
    #   layer norm (1) works better than instance norm (output_Nclasses) for same hyperparameters
    with tf.variable_scope('softmax_classifier'), slim.arg_scope(
            args_context(groups=1)):

        def _conv2d(features, n_out, sc):
            return slim.conv2d(features,
                               num_outputs=n_out,
                               kernel_size=1,
                               activation_fn=None,
                               scope=sc)

        l1_logits = _conv2d(
            l1_features,
            53 if params.per_pixel_dataset_name == 'vistas' else 14,
            'l1_logits')
        l2_vehicle_logits = _conv2d(
            l2_vehicle_features,
            12 if params.per_pixel_dataset_name == 'vistas' else 7,
            'l2_vehicle_logits')
        l2_human_logits = _conv2d(
            l2_human_features,
            5 if params.per_pixel_dataset_name == 'vistas' else 3,
            'l2_human_logits')
        l1_logits = _create_upsampler(l1_logits, params)
        l2_vehicle_logits = _create_upsampler(l2_vehicle_logits, params)
        l2_human_logits = _create_upsampler(l2_human_logits, params)

        l1_probs = tf.nn.softmax(l1_logits, name='l1_probabilities')
        l1_decs = tf.cast(tf.argmax(l1_probs, 3),
                          tf.int32,
                          name='l1_decisions')
        l2_vehicle_probs = tf.nn.softmax(l2_vehicle_logits,
                                         name='l2_vehicle_probabilities')
        l2_vehicle_decs = tf.cast(tf.argmax(l2_vehicle_probs, 3),
                                  tf.int32,
                                  name='l2_vehicle_decisions')
        l2_human_probs = tf.nn.softmax(l2_human_logits,
                                       name='l2_human_probabilities')
        l2_human_decs = tf.cast(tf.argmax(l2_human_probs, 3),
                                tf.int32,
                                name='l2_human_decisions')
        # generate final decisions
        if params.per_pixel_dataset_name == 'vistas':
            # human: 19->19, vehicle: 49->52
            l1_cids2common_cids = tf.cast([
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
                18, 19, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
                37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
                63, 64, 65
            ], tf.int32)
            l2_vehicle_cids2common_cids = tf.cast(
                [52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 65], tf.int32)
            l2_human_cids2common_cids = tf.cast([19, 20, 21, 22, 65], tf.int32)
        elif params.per_pixel_dataset_name == 'cityscapes':
            l1_cids2common_cids = tf.cast(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 19], tf.int32)
            l2_vehicle_cids2common_cids = tf.cast([13, 14, 15, 16, 17, 18, 19],
                                                  tf.int32)
            l2_human_cids2common_cids = tf.cast([11, 12, 19], tf.int32)

        decs = tf.where(
            tf.equal(l1_decs,
                     49 if params.per_pixel_dataset_name == 'vistas' else 12),
            tf.gather(l2_vehicle_cids2common_cids, l2_vehicle_decs),
            tf.where(
                tf.equal(
                    l1_decs,
                    19 if params.per_pixel_dataset_name == 'vistas' else 11),
                tf.gather(l2_human_cids2common_cids, l2_human_decs),
                tf.gather(l1_cids2common_cids, l1_decs)))

    ## model outputs grouped as predictions of the Estimator
    # WARNING: 'decisions' key is used internally so it must exist for now
    predictions = {
        'l1_logits': l1_logits,
        'l1_probabilities': l1_probs,
        'l1_decisions': l1_decs,
        'l2_vehicle_logits': l2_vehicle_logits,
        'l2_vehicle_probabilities': l2_vehicle_probs,
        'l2_vehicle_decisions': l2_vehicle_decs,
        'l2_human_logits': l2_human_logits,
        'l2_human_probabilities': l2_human_probs,
        'l2_human_decisions': l2_human_decs,
        'decisions': decs
    }

    # distribute is not yet supported in evaluate and predict
    if hasattr(params, 'distribute') and params.distribute:
        tower_context = tf.contrib.distribute.get_tower_context()
        assert tower_context
        twr_str = f"Tower {tower_context.tower_id}"
    else:
        twr_str = ''
    tf.logging.info(twr_str + " predictions:\n" +
                    pprint.pformat(predictions, width=10))

    return features, end_points, predictions
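A toy sketch of the hierarchical decision merge performed above, with made-up class ids in place of the Vistas/Cityscapes mappings (none of the values are this project's):

import tensorflow as tf

# Toy sketch (assumption): where the l1 classifier predicts the 'vehicle' meta-class,
# the vehicle l2 decision is mapped into the common label space; elsewhere the l1
# decision is used directly.
l1_decs = tf.constant([[0, 2, 1]])                    # 2 == vehicle meta-class (toy ids)
l2_vehicle_decs = tf.constant([[0, 1, 0]])            # decisions of the vehicle classifier
l1_to_common = tf.constant([0, 1, 2], tf.int32)       # toy identity mapping
l2_vehicle_to_common = tf.constant([3, 4], tf.int32)  # toy vehicle-subtype mapping
decs = tf.where(tf.equal(l1_decs, 2),
                tf.gather(l2_vehicle_to_common, l2_vehicle_decs),
                tf.gather(l1_to_common, l1_decs))
# decs == [[0, 4, 1]]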