Example #1
    def preprocess(self, inputs, audio_flag=False):
        """Feature-extractor specific preprocessing.

        See base class.

        Args:
          inputs: a [batch, height_in, width_in, channels] float tensor representing
            a batch of images with values between 0 and 255.0.
          audio_flag: if True, `inputs` is treated as audio features and is resized
            to a fixed 20x9 shape instead of going through the image resizer.

        Returns:
          preprocessed_inputs: a [batch, height_out, width_out, channels] float
            tensor representing a batch of images.

        Raises:
          ValueError: if inputs tensor does not have type tf.float32.
        """
        if inputs.dtype is not tf.float32:
            raise ValueError('`preprocess` expects a tf.float32 tensor')
        with tf.name_scope('Preprocessor'):
            # TODO: revisit whether to always use batch size as the number of parallel
            # iterations vs allow for dynamic batching.

            if not audio_flag:
                resized_inputs = tf.map_fn(self._image_resizer_fn,
                                           elems=inputs,
                                           dtype=tf.float32)
                return self._feature_extractor.preprocess(resized_inputs,
                                                          normalized=False)
            else:
                print("audio shape in preprocess", inputs.get_shape())
                # No image-resizer call here.
                # TODO: the fixed 20x9 resize below is a hard-coded way of
                # converting the dynamic tensor shape to a static one;
                # revisit this later.
                resized_inputs = preprocessor.resize_image(inputs,
                                                           new_height=20,
                                                           new_width=9)

                return self._feature_extractor.preprocess(resized_inputs,
                                                          normalized=False)
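
Usage sketch (not part of the original listing): a minimal call of the method above, assuming `model` is a hypothetical instance of the meta-architecture class that defines `preprocess`, with images given as a float32 batch in [0, 255].

import tensorflow as tf

# `model` is a placeholder for an instance of the class that defines `preprocess`.
images = tf.ones([4, 300, 300, 3], dtype=tf.float32) * 127.5
preprocessed_images = model.preprocess(images)  # image branch: per-image resize + feature-extractor preprocessing

audio = tf.ones([4, 20, 9, 1], dtype=tf.float32)
preprocessed_audio = model.preprocess(audio, audio_flag=True)  # audio branch: fixed 20x9 resize
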
Example #3
def build(input_reader_config,
          model_config,
          lstm_config,
          unroll_length,
          data_augmentation_options=None,
          batch_size=1):
    """Builds a tensor dictionary based on the InputReader config.

    Args:
      input_reader_config: An input_reader_builder.InputReader object.
      model_config: A model.proto object containing the config for the desired
        DetectionModel.
      lstm_config: LSTM specific configs.
      unroll_length: Unrolled length for LSTM training.
      data_augmentation_options: A list of tuples, where each tuple contains a
        data augmentation function and a dictionary containing arguments and their
        values (see preprocessor.py).
      batch_size: Batch size for queue outputs.

    Returns:
      A dictionary of tensors based on items in the input_reader_config.

    Raises:
      ValueError: On invalid input reader proto.
      ValueError: If no input paths are specified.
    """
    if not isinstance(input_reader_config, input_reader_pb2.InputReader):
        raise ValueError('input_reader_config not of type '
                         'input_reader_pb2.InputReader.')

    external_reader_config = input_reader_config.external_input_reader
    external_input_reader_config = external_reader_config.Extensions[
        input_reader_google_pb2.GoogleInputReader.google_input_reader]
    input_reader_type = external_input_reader_config.WhichOneof('input_reader')

    if input_reader_type == 'tf_record_video_input_reader':
        config = external_input_reader_config.tf_record_video_input_reader
        reader_type_class = tf.TFRecordReader
    else:
        raise ValueError(
            'Unsupported reader in input_reader_config: %s' % input_reader_type)

    if not config.input_path:
        raise ValueError('At least one input path must be specified in '
                         '`input_reader_config`.')
    key, value = parallel_reader.parallel_read(
        config.input_path[:],  # Convert `RepeatedScalarContainer` to list.
        reader_class=reader_type_class,
        num_epochs=(input_reader_config.num_epochs
                    if input_reader_config.num_epochs else None),
        num_readers=input_reader_config.num_readers,
        shuffle=input_reader_config.shuffle,
        dtypes=[tf.string, tf.string],
        capacity=input_reader_config.queue_capacity,
        min_after_dequeue=input_reader_config.min_after_dequeue)

    # TODO(yinxiao): Add loading instance mask option.
    decoder = tf_sequence_example_decoder.TFSequenceExampleDecoder()

    keys_to_decode = [
        fields.InputDataFields.image, fields.InputDataFields.groundtruth_boxes,
        fields.InputDataFields.groundtruth_classes
    ]
    tensor_dict = decoder.decode(value, items=keys_to_decode)

    tensor_dict['image'].set_shape([None, None, None, 3])
    tensor_dict['groundtruth_boxes'].set_shape([None, None, 4])

    height = model_config.ssd.image_resizer.fixed_shape_resizer.height
    width = model_config.ssd.image_resizer.fixed_shape_resizer.width

    # If data augmentation is specified in the config file, the preprocessor
    # will be called here to augment the data as specified. Most common
    # augmentations include horizontal flip and cropping.
    if data_augmentation_options:
        images_pre = tf.split(
            tensor_dict['image'], config.video_length, axis=0)
        bboxes_pre = tf.split(
            tensor_dict['groundtruth_boxes'], config.video_length, axis=0)
        labels_pre = tf.split(
            tensor_dict['groundtruth_classes'], config.video_length, axis=0)
        images_proc, bboxes_proc, labels_proc = [], [], []
        cache = preprocessor_cache.PreprocessorCache()

        for i, _ in enumerate(images_pre):
            image_dict = {
                fields.InputDataFields.image:
                    images_pre[i],
                fields.InputDataFields.groundtruth_boxes:
                    tf.squeeze(bboxes_pre[i], axis=0),
                fields.InputDataFields.groundtruth_classes:
                    tf.squeeze(labels_pre[i], axis=0),
            }
            image_dict = preprocessor.preprocess(
                image_dict,
                data_augmentation_options,
                func_arg_map=preprocessor.get_default_func_arg_map(),
                preprocess_vars_cache=cache)
            # Pads detection count to _PADDING_SIZE.
            image_dict[fields.InputDataFields.groundtruth_boxes] = tf.pad(
                image_dict[fields.InputDataFields.groundtruth_boxes],
                [[0, _PADDING_SIZE], [0, 0]])
            image_dict[fields.InputDataFields.groundtruth_boxes] = tf.slice(
                image_dict[fields.InputDataFields.groundtruth_boxes], [0, 0],
                [_PADDING_SIZE, -1])
            image_dict[fields.InputDataFields.groundtruth_classes] = tf.pad(
                image_dict[fields.InputDataFields.groundtruth_classes],
                [[0, _PADDING_SIZE]])
            image_dict[fields.InputDataFields.groundtruth_classes] = tf.slice(
                image_dict[fields.InputDataFields.groundtruth_classes], [0],
                [_PADDING_SIZE])
            images_proc.append(image_dict[fields.InputDataFields.image])
            bboxes_proc.append(
                image_dict[fields.InputDataFields.groundtruth_boxes])
            labels_proc.append(
                image_dict[fields.InputDataFields.groundtruth_classes])
        tensor_dict['image'] = tf.concat(images_proc, axis=0)
        tensor_dict['groundtruth_boxes'] = tf.stack(bboxes_proc, axis=0)
        tensor_dict['groundtruth_classes'] = tf.stack(labels_proc, axis=0)
    else:
        # Pads detection count to _PADDING_SIZE per frame.
        tensor_dict['groundtruth_boxes'] = tf.pad(
            tensor_dict['groundtruth_boxes'], [[0, 0], [0, _PADDING_SIZE], [0, 0]])
        tensor_dict['groundtruth_boxes'] = tf.slice(
            tensor_dict['groundtruth_boxes'], [0, 0, 0], [-1, _PADDING_SIZE, -1])
        tensor_dict['groundtruth_classes'] = tf.pad(
            tensor_dict['groundtruth_classes'], [[0, 0], [0, _PADDING_SIZE]])
        tensor_dict['groundtruth_classes'] = tf.slice(
            tensor_dict['groundtruth_classes'], [0, 0], [-1, _PADDING_SIZE])

    tensor_dict['image'], _ = preprocessor.resize_image(
        tensor_dict['image'], new_height=height, new_width=width)

    # Integer division keeps the step count and state shapes integral under
    # Python 3 (plain `/` would produce floats here).
    num_steps = config.video_length // unroll_length

    init_states = {
        'lstm_state_c':
            tf.zeros([height // 32, width // 32, lstm_config.lstm_state_depth]),
        'lstm_state_h':
            tf.zeros([height // 32, width // 32, lstm_config.lstm_state_depth]),
        'lstm_state_step':
            tf.constant(num_steps, shape=[]),
    }

    # batch_sequences_with_states splits each decoded video into segments of
    # `unroll_length` frames, batches them, and carries the LSTM state entries
    # in `init_states` across consecutive segments of the same sequence.
    batch = sqss.batch_sequences_with_states(
        input_key=key,
        input_sequences=tensor_dict,
        input_context={},
        input_length=None,
        initial_states=init_states,
        num_unroll=unroll_length,
        batch_size=batch_size,
        num_threads=batch_size,
        make_keys_unique=True,
        capacity=batch_size * batch_size)

    return _build_training_batch_dict(batch, unroll_length, batch_size)
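
The pad-then-slice pattern used above (pad the ground truth with _PADDING_SIZE extra rows, then slice back to exactly _PADDING_SIZE rows) is how the builder forces a fixed per-frame detection count. A standalone sketch of the same trick, with an illustrative padding size (the real constant is defined elsewhere in the builder module):

import tensorflow as tf

_PADDING_SIZE = 4  # illustrative value only

# Two real ground-truth boxes for one frame.
boxes = tf.constant([[0.1, 0.1, 0.5, 0.5],
                     [0.2, 0.3, 0.6, 0.9]], dtype=tf.float32)

# Pad _PADDING_SIZE zero rows onto the end, then keep only the first
# _PADDING_SIZE rows: the result always has shape [_PADDING_SIZE, 4],
# whether the input had fewer or more than _PADDING_SIZE boxes.
boxes = tf.pad(boxes, [[0, _PADDING_SIZE], [0, 0]])
boxes = tf.slice(boxes, [0, 0], [_PADDING_SIZE, -1])
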
Example #4
def resize_detection_fixed_size(tensors, input_size, for_testing=False):
    tensors_out = tensors.copy()
    #ignore_regions are currently not supported in this resize mode
    if Constants.IGNORE_REGIONS in tensors_out:
        del tensors_out[Constants.IGNORE_REGIONS]
    img = tensors[Constants.UNNORMALIZED_IMG]
    original_img = img
    bboxes = tensors[Constants.BBOXES]

    # remove the padding
    n_real_detections = tf.reduce_sum(
        tf.cast(tensors[Constants.IDS] > 0, tf.int32))
    bboxes = bboxes[:n_real_detections]
    classes = tensors[Constants.CLASSES][:n_real_detections]

    # permute y1, y2, x1, x2 -> y1, x1, y2, x2
    bboxes = tf.stack(
        [bboxes[..., 0], bboxes[..., 2], bboxes[..., 1], bboxes[..., 3]],
        axis=-1)

    # normalize bboxes to [0..1]
    height = tf.shape(img)[0]
    width = tf.shape(img)[1]
    bboxes = tf.cast(bboxes, tf.float32) / tf.cast(
        tf.stack([height, width, height, width], axis=0), tf.float32)

    import object_detection.core.preprocessor as preproc
    if not for_testing:
        #crop (ssd style)
        img, bboxes, classes = preproc.ssd_random_crop(img, bboxes, classes)
        #alternative
        #img, bboxes, classes = preproc.random_crop_image(img, real_boxes, real_boxes)

        # include random horizontal flip augmentation here
        img, bboxes = preproc.random_horizontal_flip(img, bboxes)

    #resize image, note: boxes don't need resizing as they are in relative coordinates
    img = preproc.resize_image(img,
                               new_height=input_size[0],
                               new_width=input_size[1])

    if for_testing:
        _, bboxes = preproc.scale_boxes_to_pixel_coordinates(
            original_img, bboxes)
    else:
        _, bboxes = preproc.scale_boxes_to_pixel_coordinates(img, bboxes)

    # permute back y1, x1, y2, x2 -> y1, y2, x1, x2
    bboxes = tf.stack(
        [bboxes[..., 0], bboxes[..., 2], bboxes[..., 1], bboxes[..., 3]],
        axis=-1)

    # pad everything that needs padding back to the maximum size
    padded_size = smart_shape(tensors[Constants.CLASSES])[0]
    n_real_detections_after_crop = smart_shape(bboxes)[0]
    pad_size = padded_size - n_real_detections_after_crop
    paddings_bboxes = [[0, pad_size], [0, 0]]
    bboxes = tf.pad(bboxes, paddings=paddings_bboxes)
    paddings_classes_ids = [[0, pad_size]]
    classes = tf.pad(classes, paddings=paddings_classes_ids)
    ids = tf.pad(tf.range(n_real_detections_after_crop) + 1,
                 paddings=paddings_classes_ids)
    if isinstance(padded_size, int):
        bboxes.set_shape((padded_size, 4))
        classes.set_shape((padded_size, ))
        ids.set_shape((padded_size, ))
    else:
        bboxes.set_shape((None, 4))
    # note that we do not retain the original ids, but it does not matter since
    # this resize mode is only meant for isolated frames
    tensors_out[Constants.UNNORMALIZED_IMG] = img
    tensors_out[Constants.BBOXES] = bboxes
    tensors_out[Constants.CLASSES] = classes
    tensors_out[Constants.IDS] = ids
    tensors_out[Constants.RESIZED_SIZES] = tf.shape(img)[:2]
    if for_testing:
        tensors_out[Constants.ORIGINAL_SIZES] = tf.shape(original_img)[:2]
    return tensors_out
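
A hedged usage sketch for the helper above: the Constants.* keys are the ones the function reads, but the shapes, dtypes, and values below (one frame padded to a maximum of five detections, two of them real) are purely illustrative, and the snippet assumes `Constants` and `resize_detection_fixed_size` are importable from the surrounding module.

import tensorflow as tf

# Single frame, padded to a maximum of 5 detections; ids > 0 mark the 2 real ones.
tensors = {
    Constants.UNNORMALIZED_IMG: tf.zeros([480, 640, 3], dtype=tf.float32),
    Constants.BBOXES: tf.constant(
        [[10., 100., 20., 200.],   # y1, y2, x1, x2 in pixel coordinates
         [30., 150., 40., 250.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]),
    Constants.CLASSES: tf.constant([1, 2, 0, 0, 0], dtype=tf.int32),
    Constants.IDS: tf.constant([1, 2, 0, 0, 0], dtype=tf.int32),
}

out = resize_detection_fixed_size(tensors, input_size=(300, 300))
# out[Constants.BBOXES] is again padded to five rows, back in y1, y2, x1, x2
# order, in pixel coordinates of the (cropped and) resized 300x300 image.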