def _evaluate(self, session, step):
    var_name_to_value = session.run(self._var_name_to_train_var)
    logging.info('Building placeholders.')
    placeholder_to_value = {
        self._var_name_to_placeholder[v_name]: var_name_to_value[v_name]
        for v_name in var_name_to_value
    }

    def feed_variables(scaffold, session):
      del scaffold
      session.run(self._var_feed_op, feed_dict=placeholder_to_value)

    logging.info('Building scaffold.')
    scaffold = training.Scaffold(
        init_fn=feed_variables, copy_from_scaffold=self._scaffold)

    with self._graph.as_default():
      mlperf_log.resnet_print(key=mlperf_log.EVAL_START)
      eval_results = self._estimator._evaluate_run(
          checkpoint_path=None,
          scaffold=scaffold,
          update_op=self._update_op,
          eval_dict=self._eval_dict,
          all_hooks=self._all_hooks,
          output_dir=self._eval_dir)
      logging.info('Eval done.')

    self._timer.update_last_triggered_step(step)
    return eval_results
def process_record_dataset(dataset,
                           is_training,
                           batch_size,
                           shuffle_buffer,
                           parse_record_fn,
                           num_epochs=1,
                           num_gpus=None,
                           examples_per_epoch=None,
                           dtype=tf.float32):
    """Given a Dataset with raw records, return an iterator over the records.

  Args:
    dataset: A Dataset representing raw records
    is_training: A boolean denoting whether the input is for training.
    batch_size: The number of samples per batch.
    shuffle_buffer: The buffer size to use when shuffling records. A larger
      value results in better randomness, but smaller values reduce startup
      time and use less memory.
    parse_record_fn: A function that takes a raw record and returns the
      corresponding (image, label) pair.
    num_epochs: The number of epochs to repeat the dataset.
    num_gpus: The number of gpus used for training.
    examples_per_epoch: The number of examples in an epoch.
    dtype: Data type to use for images/features.

  Returns:
    Dataset of (image, label) pairs ready for iteration.
  """

    # We prefetch a batch at a time. This can help smooth out the time taken to
    # load input files as we go through shuffling and processing.
    dataset = dataset.prefetch(buffer_size=batch_size)
    if is_training:
        # Shuffle the records. Note that we shuffle before repeating to ensure
        # that the shuffling respects epoch boundaries.
        mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
        dataset = dataset.shuffle(buffer_size=shuffle_buffer)

    # If we are training over multiple epochs before evaluating, repeat the
    # dataset for the appropriate number of epochs.
    dataset = dataset.repeat(num_epochs)

    # Parse the raw records into images and labels. Testing has shown that setting
    # num_parallel_batches > 1 produces no improvement in throughput, since
    # batch_size is almost always much greater than the number of CPU cores.
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(
            lambda value: parse_record_fn(value, is_training, dtype),
            batch_size=batch_size,
            num_parallel_batches=1))

    # Operations between the final prefetch and the get_next call to the iterator
    # will happen synchronously during run time. We prefetch here again to
    # background all of the above processing work and keep it out of the
    # critical training path. Setting buffer_size to tf.contrib.data.AUTOTUNE
    # allows DistributionStrategies to adjust how many batches to fetch based
    # on how many devices are present.
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset
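
# A minimal usage sketch for process_record_dataset: building a training
# pipeline from TFRecord files. The batch size and shuffle-buffer size are
# illustrative, and `parse_record` is assumed to follow the
# (value, is_training, dtype) -> (image, label) contract documented above.
def make_train_dataset(filenames, parse_record, batch_size=128):
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.flat_map(tf.data.TFRecordDataset)
    return process_record_dataset(
        dataset=dataset,
        is_training=True,
        batch_size=batch_size,
        shuffle_buffer=10000,
        parse_record_fn=parse_record,
        num_epochs=1)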
def block_group(inputs, filters, block_fn, blocks, strides, is_training, name,
                data_format='channels_first'):
  """Creates one group of blocks for the ResNet model.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]`.
    filters: `int` number of filters for the first convolution of the layer.
    block_fn: `function` for the block to use within the model.
    blocks: `int` number of blocks contained in the layer.
    strides: `int` stride to use for the first convolution of the layer. If
        greater than 1, this layer will downsample the input.
    is_training: `bool` for whether the model is training.
    name: `str` name for the Tensor output of the block layer.
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.

  Returns:
    The output `Tensor` of the block layer.
  """
  # Drop batch size from shape logging.
  if is_training and FLAGS.mlperf_logging:
    mlperf_log.resnet_print(
        key=mlperf_log.MODEL_HP_INITIAL_SHAPE, value=inputs.shape.as_list()[1:])

  # Only the first block per block_group uses projection shortcut and strides.
  inputs = block_fn(inputs, filters, is_training, strides,
                    use_projection=True, data_format=data_format)

  for _ in range(1, blocks):
    inputs = block_fn(inputs, filters, is_training, 1,
                      data_format=data_format)

  return tf.identity(inputs, name)
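
# Example call: the first block group of a ResNet-50 (3 bottleneck blocks,
# stride 1), assuming the `bottleneck_block` function defined later in this
# file; this is a sketch, not the full model assembly.
def resnet50_block_group1(net, is_training):
  return block_group(inputs=net, filters=64, block_fn=bottleneck_block,
                     blocks=3, strides=1, is_training=is_training,
                     name='block_group1')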
def learning_rate_schedule(current_epoch):
    """Handles linear scaling rule, gradual warmup, and LR decay.

  The learning rate starts at 0, then it increases linearly per step.
  After 5 epochs we reach the base learning rate (scaled to account
    for batch size).
  After 30, 60 and 80 epochs the learning rate is divided by 10.
  After 90 epochs training stops and the LR is set to 0. This ensures
    that we train for exactly 90 epochs for reproducibility.

  Args:
    current_epoch: `Tensor` for current epoch.

  Returns:
    A scaled `Tensor` for current learning rate.
  """
    mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
    scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)

    decay_rate = (scaled_lr * LR_SCHEDULE[0][0] * current_epoch /
                  LR_SCHEDULE[0][1])
    for mult, start_epoch in LR_SCHEDULE:
        decay_rate = tf.where(current_epoch < start_epoch, decay_rate,
                              scaled_lr * mult)
    return decay_rate
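
# learning_rate_schedule reads a module-level LR_SCHEDULE of
# (multiplier, start_epoch) pairs. A sketch consistent with the docstring
# (warmup until epoch 5, then divide by 10 at epochs 30, 60, and 80); the
# exact constant in the real module may differ.
LR_SCHEDULE = [
    (1.0, 5),     # end of warmup: full scaled learning rate from epoch 5
    (0.1, 30),    # divide by 10 at epoch 30
    (0.01, 60),   # divide by 10 again at epoch 60
    (0.001, 80),  # and again at epoch 80
]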
def log_batch_norm(input_tensor, output_tensor, momentum, epsilon, center,
                   scale, training):
  assert _get_shape(input_tensor) == _get_shape(output_tensor)
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_BATCH_NORM, value={
    "shape": _get_shape(input_tensor), "momentum": momentum, "epsilon": epsilon,
    "center": center, "scale": scale, "training": training},
                          stack_offset=_STACK_OFFSET)
def _mean_image_subtraction(image, means, num_channels):
    """Subtracts the given means from each image channel.

  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means, num_channels=3)

  Note that the rank of `image` must be known.

  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.
    num_channels: number of color channels in the image that will be distorted.

  Returns:
    the centered image.

  Raises:
    ValueError: If the rank of `image` is unknown, if `image` has a rank other
      than three or if the number of channels in `image` doesn't match the
      number of values in `means`.
  """
    if image.get_shape().ndims != 3:
        raise ValueError('Input must be of size [height, width, C>0]')

    if len(means) != num_channels:
        raise ValueError('len(means) must match the number of channels')

    mlperf_log.resnet_print(key=mlperf_log.INPUT_MEAN_SUBTRACTION, value=means)

    # We have a 1-D tensor of means; convert to 3-D.
    means = tf.expand_dims(tf.expand_dims(means, 0), 0)

    return image - means
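
# Example: centering a decoded RGB image with the ImageNet channel means from
# the docstring above (a sketch; real callers pass a shared _CHANNEL_MEANS
# constant).
def center_image(image):
    channel_means = [123.68, 116.779, 103.939]  # RGB means from the docstring
    return _mean_image_subtraction(image, channel_means, num_channels=3)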
def log_begin_block(input_tensor, block_type):
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_BEGIN_BLOCK,
                            value={"block_type": block_type},
                            stack_offset=_STACK_OFFSET)
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RESNET_TOPOLOGY,
                            value=" Block Input: {}".format(
                                _get_shape(input_tensor)),
                            stack_offset=_STACK_OFFSET)
  def model(inputs, is_training):
    """Creation of the model graph."""
    inputs = conv2d_fixed_padding(
        inputs=inputs,
        filters=64,
        kernel_size=7,
        strides=2,
        is_training=is_training,
        data_format=data_format)
    inputs = tf.identity(inputs, 'initial_conv')
    inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

    pooled_inputs = tf.layers.max_pooling2d(
        inputs=inputs, pool_size=3, strides=2, padding='SAME',
        data_format=data_format)
    if is_training and FLAGS.mlperf_logging:
      resnet_log_helper.log_max_pool(input_tensor=inputs,
                                     output_tensor=pooled_inputs)
    inputs = tf.identity(pooled_inputs, 'initial_max_pool')

    inputs = block_group(
        inputs=inputs, filters=64, block_fn=block_fn, blocks=layers[0],
        strides=1, is_training=is_training, name='block_group1',
        data_format=data_format)
    inputs = block_group(
        inputs=inputs, filters=128, block_fn=block_fn, blocks=layers[1],
        strides=2, is_training=is_training, name='block_group2',
        data_format=data_format)
    inputs = block_group(
        inputs=inputs, filters=256, block_fn=block_fn, blocks=layers[2],
        strides=2, is_training=is_training, name='block_group3',
        data_format=data_format)
    inputs = block_group(
        inputs=inputs, filters=512, block_fn=block_fn, blocks=layers[3],
        strides=2, is_training=is_training, name='block_group4',
        data_format=data_format)

    # The activation is 7x7 so this is a global average pool.
    # TODO(huangyp): reduce_mean will be faster.
    pool_size = (inputs.shape[1], inputs.shape[2])
    inputs = tf.layers.average_pooling2d(
        inputs=inputs, pool_size=pool_size, strides=1, padding='VALID',
        data_format=data_format)
    inputs = tf.identity(inputs, 'final_avg_pool')
    inputs = tf.reshape(
        inputs, [-1, 2048 if block_fn is bottleneck_block else 512])
    if is_training and FLAGS.mlperf_logging:
      mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_DENSE, value=num_classes)
    inputs = tf.layers.dense(
        inputs=inputs,
        units=num_classes,
        kernel_initializer=tf.random_normal_initializer(stddev=.01))
    inputs = tf.identity(inputs, 'final_dense')
    if is_training and FLAGS.mlperf_logging:
      mlperf_log.resnet_print(
          key=mlperf_log.MODEL_HP_FINAL_SHAPE, value=inputs.shape.as_list()[1:])
    return inputs
def log_conv2d(input_tensor, output_tensor, stride, filters, initializer,
               use_bias):
  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_CONV2D_FIXED_PADDING,
                          value=_in_out_shape(input_tensor, output_tensor),
                          stack_offset=_STACK_OFFSET)
  mlperf_log.resnet_print(
      key=mlperf_log.MODEL_HP_CONV2D_FIXED_PADDING,
      value={"stride": stride, "filters": filters, "initializer": initializer,
             "use_bias": use_bias},
      stack_offset=_STACK_OFFSET)
def batch_norm_relu(inputs,
                    is_training,
                    relu=True,
                    init_zero=False,
                    data_format='channels_first'):
    """Performs a batch normalization followed by a ReLU.

  Args:
    inputs: `Tensor` of shape `[batch, channels, ...]`.
    is_training: `bool` for whether the model is training.
    relu: `bool` if False, omits the ReLU operation.
    init_zero: `bool` if True, initializes scale parameter of batch
        normalization with 0 instead of 1 (default).
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.

  Returns:
    A normalized `Tensor` with the same `data_format`.
  """
    if init_zero:
        gamma_initializer = tf.zeros_initializer()
    else:
        gamma_initializer = tf.ones_initializer()

    if data_format == 'channels_first':
        axis = 1
    else:
        axis = 3

    outputs = tf.layers.batch_normalization(
        inputs=inputs,
        axis=axis,
        momentum=BATCH_NORM_DECAY,
        epsilon=BATCH_NORM_EPSILON,
        center=True,
        scale=True,
        training=is_training,
        fused=True,
        gamma_initializer=gamma_initializer)

    if is_training:
        resnet_log_helper.log_batch_norm(input_tensor=inputs,
                                         output_tensor=outputs,
                                         momentum=BATCH_NORM_DECAY,
                                         epsilon=BATCH_NORM_EPSILON,
                                         center=True,
                                         scale=True,
                                         training=is_training)

    if relu:
        if is_training:
            mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
        outputs = tf.nn.relu(outputs)

    return outputs
  def log_train_epochs(self, num_epochs):
    """Logs all the TRAIN_EPOCH log lines."""
    num_epochs_int = int(num_epochs)
    for i in range(num_epochs_int):
      # MLPerf allows us to print all the train epochs at once instead of
      # printing them as we do them.
      mlperf_log.resnet_print(key=mlperf_log.TRAIN_EPOCH, value=i)
    if num_epochs_int != num_epochs:
      value = (
          str(num_epochs_int) +
          ', but this epoch only has {}% of the examples of a normal epoch'
          .format(100 * (num_epochs - num_epochs_int)))
      mlperf_log.resnet_print(key=mlperf_log.TRAIN_EPOCH, value=value)
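
  # For example, log_train_epochs(2.5) emits TRAIN_EPOCH for epochs 0 and 1,
  # then one more TRAIN_EPOCH whose value notes that epoch 2 only has 50.0% of
  # the examples of a normal epoch.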
  def after_run(self, run_context, run_values):  # pylint: disable=unused-argument
    """Runs evaluator."""
    step = np.asscalar(run_context.session.run(self._global_step_tensor))

    if self._timer.should_trigger_for_step(step):
      logging.info('Starting eval.')
      eval_results = self._evaluate(run_context.session, step)
      mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
      mlperf_log.resnet_print(
          key=mlperf_log.EVAL_ACCURACY,
          value={
              'epoch': max(step // self._steps_per_epoch - 1, 0),
              'value': float(eval_results[_EVAL_METRIC])
          })

      # The ImageNet eval size is hard coded.
      mlperf_log.resnet_print(key=mlperf_log.EVAL_SIZE, value=50000)
      if eval_results[_EVAL_METRIC] >= self._stop_threshold:
        self._run_success = True
        mlperf_log.resnet_print(
            key=mlperf_log.RUN_STOP, value={'success': 'true'})
        run_context.request_stop()

    if step // self._steps_per_epoch == self._eval_every_epoch_from:
      self._timer = training.SecondOrStepTimer(
          every_steps=self._steps_per_epoch)
      self._timer.reset()
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             num_gpus=None,
             dtype=tf.float32,
             mix_up=False,
             oss_load=False):
    """Input function which provides batches for train or eval.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    num_gpus: The number of gpus used for training.
    dtype: Data type to use for images/features.
    mix_up: A boolean denoting whether to apply mix-up augmentation to the
      batches (passed through to process_record_dataset).
    oss_load: A boolean denoting whether to read the input file list from OSS
      instead of the local data_dir.

  Returns:
    A dataset that can be used for iteration.
  """
    mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
    if not oss_load:
        filenames = get_filenames(is_training, data_dir)
    else:
        filenames = get_filenames_oss(is_training)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)

    if is_training:
        # Shuffle the input files
        dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

    # Convert to individual records
    dataset = dataset.flat_map(tf.data.TFRecordDataset)

    return resnet_run_loop.process_record_dataset(
        dataset=dataset,
        is_training=is_training,
        batch_size=batch_size,
        shuffle_buffer=_SHUFFLE_BUFFER,
        parse_record_fn=parse_record,
        num_epochs=num_epochs,
        num_gpus=num_gpus,
        examples_per_epoch=_NUM_IMAGES['train'] if is_training else None,
        dtype=dtype,
        mix_up=mix_up)
def block_m3(inputs, filters, training, strides, data_format):

    resnet_log_helper.log_begin_block(input_tensor=inputs,
                                      block_type=mlperf_log.BOTTLENECK_BLOCK)

    avg_pool = tf.layers.average_pooling2d(inputs=inputs,
                                           pool_size=strides,
                                           strides=strides,
                                           padding='SAME',
                                           data_format=data_format)
    shortcut = conv2d_fixed_padding(inputs=avg_pool,
                                    filters=1664,
                                    kernel_size=1,
                                    strides=1,
                                    data_format=data_format)
    resnet_log_helper.log_projection(input_tensor=inputs,
                                     output_tensor=shortcut)
    shortcut = batch_norm(inputs=shortcut,
                          training=training,
                          data_format=data_format)

    inputs = conv2d_fixed_padding(inputs=inputs,
                                  filters=256,
                                  kernel_size=1,
                                  strides=1,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training, data_format)

    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    inputs = tf.nn.relu(inputs)

    inputs = conv2d_fixed_padding(inputs=inputs,
                                  filters=256,
                                  kernel_size=3,
                                  strides=strides,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training, data_format)

    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    inputs = tf.nn.relu(inputs)

    inputs = conv2d_fixed_padding(inputs=inputs,
                                  filters=1664,
                                  kernel_size=1,
                                  strides=1,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training, data_format)

    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD)
    inputs += shortcut

    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    inputs = tf.nn.relu(inputs)

    resnet_log_helper.log_end_block(output_tensor=inputs)
    return inputs
    def make_source_dataset(self, index, num_hosts):
        """See base class."""
        if not self.data_dir:
            tf.logging.info('Undefined data_dir implies null input')
            return tf.data.Dataset.range(1).repeat().map(self._get_null_input)

        if FLAGS.mlperf_logging:
            mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
        # Shuffle the filenames to ensure better randomization.
        file_pattern = os.path.join(
            self.data_dir, 'train-*' if self.is_training else 'validation-*')

        # For multi-host training, we want each host to always process the same
        # subset of files.  Each host only sees a subset of the entire dataset,
        # allowing us to cache larger datasets in memory.
        dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False)
        dataset = dataset.shard(num_hosts, index)

        if self.is_training and not self.cache:
            dataset = dataset.repeat()

        def fetch_dataset(filename):
            buffer_size = 8 * 1024 * 1024  # 8 MiB per file
            dataset = tf.data.TFRecordDataset(filename,
                                              buffer_size=buffer_size)
            return dataset

        # Read the data from disk in parallel
        dataset = dataset.apply(
            tf.contrib.data.parallel_interleave(
                fetch_dataset,
                cycle_length=self.num_parallel_calls,
                sloppy=True))

        if FLAGS.mlperf_logging:
            mlperf_log.resnet_print(key=mlperf_log.INPUT_ORDER)
        if self.cache:
            dataset = dataset.cache()
        if self.is_training:
            # We shuffle only during training, and during training, we must produce an
            # infinite dataset, so apply the fused shuffle_and_repeat optimized
            # dataset transformation.
            dataset = dataset.apply(
                tf.contrib.data.shuffle_and_repeat(1024 * 16))
        return dataset
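
    # Sharding example: with num_hosts=4 and index=1, this host keeps files at
    # positions 1, 5, 9, ... of the (unshuffled) file list, so it always
    # processes the same fixed subset of the data.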
def preprocess_image(image_buffer,
                     bbox,
                     output_height,
                     output_width,
                     num_channels,
                     is_training=False):
    """Preprocesses the given image.

  Preprocessing includes decoding, cropping, and resizing for both training
  and eval images. Training preprocessing, however, introduces some random
  distortion of the image to improve accuracy.

  Args:
    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    num_channels: Integer depth of the image buffer for decoding.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.

  Returns:
    A preprocessed image.
  """
    if is_training:
        # For training, we want to randomize some of the distortions.
        image = _decode_crop_and_flip(image_buffer, bbox, num_channels)

        mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE,
                                value=[output_height, output_width])
        image = _resize_image(image, output_height, output_width)
    else:
        # For validation, we want to decode, resize, then just crop the middle.
        image = tf.image.decode_jpeg(image_buffer, channels=num_channels)
        image = _aspect_preserving_resize(image, _RESIZE_MIN)

        mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE,
                                value=[output_height, output_width])
        image = _central_crop(image, output_height, output_width)

    image.set_shape([output_height, output_width, num_channels])

    return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
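
# A sketch of a typical call from a record parser, assuming standard ImageNet
# dimensions (224x224 RGB; the sizes are illustrative):
def parse_train_image(image_buffer, bbox, is_training):
    return preprocess_image(image_buffer, bbox, output_height=224,
                            output_width=224, num_channels=3,
                            is_training=is_training)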
def mx_resnet_print(key,
                    val=None,
                    sync=False,
                    uniq=True,
                    stack_offset=1,
                    deferred=False):
    rank = mpiwrapper.rank()
    if sync:
        mpiwrapper.barrier()

    if uniq and (rank != 0):
        return

    mlperf_log.resnet_print(key=key,
                            value=val,
                            stack_offset=stack_offset,
                            deferred=deferred)
    return
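
# Usage sketch, assuming an initialized MPI context behind mpiwrapper:
# sync=True inserts a barrier before logging; uniq=True (the default) logs
# only on rank 0, while uniq=False logs on every rank.
# mx_resnet_print(key=mlperf_log.RUN_START, sync=True)
# mx_resnet_print(key=mlperf_log.EVAL_ACCURACY, val=0.759, uniq=False)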
def _aspect_preserving_resize(image, resize_min):
    """Resize images preserving the original aspect ratio.

  Args:
    image: A 3-D image `Tensor`.
    resize_min: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
    mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE_ASPECT_PRESERVING,
                            value={"min": resize_min})

    shape = tf.shape(image)
    height, width = shape[0], shape[1]

    new_height, new_width = _smallest_size_at_least(height, width, resize_min)

    return _resize_image(image, new_height, new_width)
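
# _smallest_size_at_least is used above but not shown; a sketch of the usual
# implementation, which scales both sides so the smaller one becomes
# resize_min (an assumption based on the call site, not the module's code):
def _smallest_size_at_least(height, width, resize_min):
    resize_min = tf.cast(resize_min, tf.float32)
    height = tf.cast(height, tf.float32)
    width = tf.cast(width, tf.float32)
    smaller_dim = tf.minimum(height, width)
    scale_ratio = resize_min / smaller_dim
    new_height = tf.cast(tf.round(height * scale_ratio), tf.int32)
    new_width = tf.cast(tf.round(width * scale_ratio), tf.int32)
    return new_height, new_width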
def main(argv):
  parser = resnet_run_loop.ResnetArgParser(
      resnet_size_choices=[18, 34, 50, 101, 152, 200])

  parser.set_defaults(
       train_epochs=90,
       version=1
  )

  flags = parser.parse_args(args=argv[2:])

  seed = int(argv[1])
  print('Setting random seed = ', seed)
  print('special seeding')
  mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed)
  random.seed(seed)
  tf.compat.v1.set_random_seed(seed)
  numpy.random.seed(seed)

  mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                          value=_NUM_IMAGES['train'])
  mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                          value=_NUM_IMAGES['validation'])
  if flags.use_synthetic_data:
    input_function = get_synth_input_fn()
  else:
    input_function = input_fn

  resnet_run_loop.resnet_main(seed,
      flags, imagenet_model_fn, input_function,
      shape=[_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS])
def main(argv):
    parser = resnet_run_loop.ResnetArgParser(
        resnet_size_choices=[18, 26, 34, 50, 101, 152, 200])

    parser.set_defaults(train_epochs=90, version=1)

    flags = parser.parse_args(args=argv[2:])

    if flags.oss_load:
        auth = oss2.Auth(_ACCESS_ID, _ACCESS_KEY)
        bucket = oss2.Bucket(auth, _HOST, _BUCKET)

    seed = int(argv[1])
    print('Setting random seed = ', seed)
    print('special seeding')
    mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed)
    random.seed(seed)
    tf.set_random_seed(seed)
    np.random.seed(seed)

    mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES,
                            value=_NUM_IMAGES['train'])
    mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES,
                            value=_NUM_IMAGES['validation'])
    input_function = input_fn

    resnet_run_loop.resnet_main(
        seed,
        flags,
        imagenet_model_fn,
        input_function,
        shape=[_DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS])
def preprocess_image(image_bytes,
                     is_training=False,
                     use_bfloat16=False,
                     image_size=IMAGE_SIZE):
    """Preprocesses the given image.

  Args:
    image_bytes: `Tensor` representing an image binary of arbitrary size.
    is_training: `bool` for whether the preprocessing is for training.
    use_bfloat16: `bool` for whether to use bfloat16.
    image_size: image size.

  Returns:
    A preprocessed image `Tensor` with value range of [0, 255].
  """
    if is_training:
        mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE,
                                value=[IMAGE_SIZE, IMAGE_SIZE])
        return preprocess_for_train(image_bytes, use_bfloat16, image_size)
    else:
        mlperf_log.resnet_print(key=mlperf_log.INPUT_RESIZE,
                                value=[IMAGE_SIZE, IMAGE_SIZE])
        return preprocess_for_eval(image_bytes, use_bfloat16, image_size)
def _central_crop(image, crop_height, crop_width):
    """Performs central crops of the given image list.

  Args:
    image: a 3-D image tensor
    crop_height: the height of the image following the crop.
    crop_width: the width of the image following the crop.

  Returns:
    3-D tensor with cropped image.
  """
    shape = tf.shape(image)
    height, width = shape[0], shape[1]

    mlperf_log.resnet_print(key=mlperf_log.INPUT_CENTRAL_CROP,
                            value=[crop_height, crop_width])

    amount_to_be_cropped_h = (height - crop_height)
    crop_top = amount_to_be_cropped_h // 2
    amount_to_be_cropped_w = (width - crop_width)
    crop_left = amount_to_be_cropped_w // 2
    return tf.slice(image, [crop_top, crop_left, 0],
                    [crop_height, crop_width, -1])
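
# For a 256x256 input with crop_height = crop_width = 224, the offsets are
# (256 - 224) // 2 = 16, so this slices the 224x224 window starting at
# (16, 16).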
def block_m4(inputs, filters, training, strides, data_format):

    resnet_log_helper.log_begin_block(input_tensor=inputs,
                                      block_type=mlperf_log.BOTTLENECK_BLOCK)

    shortcut = inputs

    inputs = conv2d_fixed_padding(inputs=inputs,
                                  filters=384,
                                  kernel_size=1,
                                  strides=1,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training, data_format)

    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    inputs = tf.nn.relu(inputs)

    inputs = conv2d_fixed_padding(
        inputs=inputs,
        filters=384,
        kernel_size=3,
        strides=strides,
        data_format=data_format)
    inputs = batch_norm(inputs, training, data_format)

    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    inputs = tf.nn.relu(inputs)

    inputs = conv2d_fixed_padding(inputs=inputs,
                                  filters=1664,
                                  kernel_size=1,
                                  strides=1,
                                  data_format=data_format)
    inputs = batch_norm(inputs, training, data_format)

    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD)
    inputs += shortcut

    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    inputs = tf.nn.relu(inputs)

    resnet_log_helper.log_end_block(output_tensor=inputs)
    return inputs
  def end(self, session):  # pylint: disable=unused-argument
    """Runs evaluator for final model."""
    # Only runs eval at the end if the highest accuracy so far
    # is less than self._stop_threshold.
    if not self._run_success:
      step = np.asscalar(session.run(self._global_step_tensor))
      logging.info('Starting eval.')
      eval_results = self._evaluate(session, step)
      mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
      mlperf_log.resnet_print(
          key=mlperf_log.EVAL_ACCURACY,
          value={
              'epoch': max(step // self._steps_per_epoch - 1, 0),
              'value': float(eval_results[_EVAL_METRIC])
          })
      if eval_results[_EVAL_METRIC] >= self._stop_threshold:
        mlperf_log.resnet_print(
            key=mlperf_log.RUN_STOP, value={'success': 'true'})
      else:
        mlperf_log.resnet_print(
            key=mlperf_log.RUN_STOP, value={'success': 'false'})
def get_train_op(loss, params):
    """Generate training operation that updates variables based on loss."""
    with tf.variable_scope("get_train_op"):
        mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                     value=params.learning_rate_warmup_steps)
        learning_rate = get_learning_rate(params.learning_rate,
                                          params.hidden_size,
                                          params.learning_rate_warmup_steps)
        log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
        learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                                   log_id=log_id,
                                                   every_n=100)

        # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
        # than the TF core Adam optimizer.
        mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                     value=mlperf_log.LAZY_ADAM)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                     value=params.optimizer_adam_beta1)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                     value=params.optimizer_adam_beta2)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                     value=params.optimizer_adam_epsilon)
        optimizer = tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params.optimizer_adam_beta1,
            beta2=params.optimizer_adam_beta2,
            epsilon=params.optimizer_adam_epsilon)

        # Calculate and apply gradients using LazyAdamOptimizer.
        global_step = tf.train.get_global_step()
        tvars = tf.trainable_variables()
        gradients = optimizer.compute_gradients(
            loss, tvars, colocate_gradients_with_ops=True)
        train_op = optimizer.apply_gradients(gradients,
                                             global_step=global_step,
                                             name="train")

        # Save gradient norm to Tensorboard
        tf.summary.scalar("global_norm/gradient_norm",
                          tf.global_norm(list(zip(*gradients))[0]))

        return train_op
  def __call__(self, inputs, training):
    """Add operations to classify a batch of input images.

    Args:
      inputs: A Tensor representing a batch of input images.
      training: A boolean. Set to True to add operations required only when
        training the classifier.

    Returns:
      A logits Tensor with shape [<batch_size>, self.num_classes].
    """

    # Drop batch size from shape logging.
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_INITIAL_SHAPE,
                            value=inputs.shape.as_list()[1:])

    with self._model_variable_scope():
      if self.data_format == 'channels_first':
        # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
        # This provides a large performance boost on GPU. See
        # https://www.tensorflow.org/performance/performance_guide#data_formats
        inputs = tf.transpose(inputs, [0, 3, 1, 2])

      if (self.resnet_version == 1) or (self.resnet_version == 2):
        inputs = conv2d_fixed_padding(
            inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
            strides=self.conv_stride, data_format=self.data_format)

      elif self.resnet_version == 14: # v1d architecture
        inputs = conv2d_fixed_padding(
            inputs=inputs, filters=self.num_filters // 2, kernel_size=3,
            strides=self.conv_stride, data_format=self.data_format)
        inputs = batch_norm(inputs, training, self.data_format)
        inputs = conv2d_fixed_padding(
            inputs=inputs, filters=self.num_filters // 2, kernel_size=3,
            strides=1, data_format=self.data_format)
        inputs = batch_norm(inputs, training, self.data_format)
        inputs = conv2d_fixed_padding(
            inputs=inputs, filters=self.num_filters, kernel_size=3,
            strides=1, data_format=self.data_format)

      elif self.resnet_version == 24: # v1-simple architecture
        inputs = conv2d_fixed_padding(
            inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
            strides=self.conv_stride, data_format=self.data_format)

      elif self.resnet_version == 34: # v1cs architecture
        inputs = conv2d_fixed_padding(
            inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
            strides=self.conv_stride, data_format=self.data_format)

      inputs = tf.identity(inputs, 'initial_conv')

      # We do not include batch normalization or activation functions in V2
      # for the initial conv1 because the first ResNet unit will perform these
      # for both the shortcut and non-shortcut paths as part of the first
      # block's projection. Cf. Appendix of [2].
      if (self.resnet_version == 1) or (self.resnet_version == 14) \
         or (self.resnet_version == 24) or (self.resnet_version == 34):
        inputs = batch_norm(inputs, training, self.data_format)

        mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
        inputs = tf.nn.relu(inputs)

      if self.first_pool_size:
        pooled_inputs = tf.layers.max_pooling2d(
            inputs=inputs, pool_size=self.first_pool_size,
            strides=self.first_pool_stride, padding='SAME',
            data_format=self.data_format)
        resnet_log_helper.log_max_pool(input_tensor=inputs, output_tensor=pooled_inputs)
        inputs = tf.identity(pooled_inputs, 'initial_max_pool')

      feat_s = list()
      if self.resnet_version == 24: # mlperf open
        import nets.manual_blocks_t as manual_blocks
        # group1
        for i, num_blocks in enumerate(self.block_sizes[0:1]):
          num_filters = self.num_filters * (2**i)
          inputs = block_layer(
              inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
              block_fn=self.block_fn, blocks=num_blocks,
              strides=self.block_strides[i], training=training,
              name='block_layer{}'.format(i + 1), data_format=self.data_format,
              version=self.resnet_version)
        # group2
        inputs = manual_blocks.block_m(inputs=inputs, filters=128,
                          training=training, strides=2,
                          data_format=self.data_format)
        inputs = manual_blocks.block_m0(inputs=inputs, filters=128,
                          training=training, strides=1,
                          data_format=self.data_format)
        # group3
        inputs = manual_blocks.block_m1(inputs=inputs, filters=256,
                          training=training, strides=2,
                          data_format=self.data_format)
        inputs = manual_blocks.block_m2(inputs=inputs, filters=256,
                          training=training, strides=1,
                          data_format=self.data_format)
        if self.enable_at:
          feat_s.append(inputs)
        # group4
        inputs = manual_blocks.block_m3(inputs=inputs, filters=512,
                          training=training, strides=2,
                          data_format=self.data_format)
        inputs = manual_blocks.block_m4(inputs=inputs, filters=512,
                          training=training, strides=1,
                          data_format=self.data_format)
        if self.enable_at:
          feat_s.append(inputs)

      elif self.resnet_version == 34: # dawnbench
        import nets.manual_blocks as manual_blocks
        # group1~2
        for i, num_blocks in enumerate(self.block_sizes[0:2]):
          num_filters = self.num_filters * (2**i)
          inputs = block_layer(
              inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
              block_fn=self.block_fn, blocks=num_blocks,
              strides=self.block_strides[i], training=training,
              name='block_layer{}'.format(i + 1), data_format=self.data_format,
              version=self.resnet_version)
        # group3
        inputs = manual_blocks.block_m1(inputs=inputs, filters=256,
                          training=training, strides=2,
                          data_format=self.data_format)
        inputs = manual_blocks.block_m2(inputs=inputs, filters=256,
                          training=training, strides=1,
                          data_format=self.data_format)
        if self.enable_at:
          feat_s.append(inputs)
        # group4
        inputs = manual_blocks.block_m3(inputs=inputs, filters=512,
                          training=training, strides=2,
                          data_format=self.data_format)
        inputs = manual_blocks.block_m4(inputs=inputs, filters=512,
                          training=training, strides=1,
                          data_format=self.data_format)
        if self.enable_at:
          feat_s.append(inputs)

      else: # standard v1, v1d, v2
        for i, num_blocks in enumerate(self.block_sizes):
          num_filters = self.num_filters * (2**i)
          inputs = block_layer(
              inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
              block_fn=self.block_fn, blocks=num_blocks,
              strides=self.block_strides[i], training=training,
              name='block_layer{}'.format(i + 1), data_format=self.data_format,
              version=self.resnet_version)
          if (i > 1) and self.enable_at:
            feat_s.append(inputs)

      # Only apply the BN and ReLU for model that does pre_activation in each
      # building/bottleneck block, eg resnet V2.
      if self.pre_activation:
        inputs = batch_norm(inputs, training, self.data_format)

        mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
        inputs = tf.nn.relu(inputs)

      # The current top layer has shape
      # `batch_size x pool_size x pool_size x final_size`.
      # ResNet does an Average Pooling layer over pool_size,
      # but that is the same as doing a reduce_mean. We do a reduce_mean
      # here because it performs better than AveragePooling2D.
      axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
      inputs = tf.reduce_mean(inputs, axes, keepdims=True)
      inputs = tf.identity(inputs, 'final_reduce_mean')

      inputs = tf.reshape(inputs, [-1, inputs.get_shape().as_list()[-1]])
      mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_DENSE,
                              value=self.num_classes)
      inputs = tf.layers.dense(
        inputs=inputs,
        units=self.num_classes,
        kernel_initializer=tf.random_normal_initializer(stddev=.01))
      inputs = tf.identity(inputs, 'final_dense')

      # Drop batch size from shape logging.
      mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_FINAL_SHAPE,
                              value=inputs.shape.as_list()[1:])
      return inputs, feat_s
def _bottleneck_block_v1(inputs, filters, training, projection_shortcut,
                         strides, data_format):
  """A single block for ResNet v1, with a bottleneck.

  Similar to _building_block_v1(), except using the "bottleneck" blocks
  described in:
    Convolution then batch normalization then ReLU as described by:
      Deep Residual Learning for Image Recognition
      https://arxiv.org/pdf/1512.03385.pdf
      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolutions.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input).
    strides: The block's stride. If greater than 1, this block will ultimately
      downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block; shape should match inputs.
  """
  resnet_log_helper.log_begin_block(
      input_tensor=inputs, block_type=mlperf_log.BOTTLENECK_BLOCK)

  shortcut = inputs

  if projection_shortcut is not None:
    shortcut = projection_shortcut(inputs)
    resnet_log_helper.log_projection(input_tensor=inputs,
                                     output_tensor=shortcut)
    shortcut = batch_norm(inputs=shortcut, training=training,
                          data_format=data_format)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=1, strides=1,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)

  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)

  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
      data_format=data_format)
  inputs = batch_norm(inputs, training, data_format)

  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD)
  inputs += shortcut

  mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
  inputs = tf.nn.relu(inputs)

  resnet_log_helper.log_end_block(output_tensor=inputs)
  return inputs
def get_train_op(loss, params):
    """Generate training operation that updates variables based on loss."""
    with tf.compat.v1.variable_scope("get_train_op"):
        mlperf_log.transformer_print(key=mlperf_log.OPT_LR_WARMUP_STEPS,
                                     value=params.learning_rate_warmup_steps)
        learning_rate = get_learning_rate(params.learning_rate,
                                          params.hidden_size,
                                          params.learning_rate_warmup_steps)
        log_id = mlperf_log.resnet_print(key=mlperf_log.OPT_LR, deferred=True)
        learning_rate = tf_mlperf_log.log_deferred(op=learning_rate,
                                                   log_id=log_id,
                                                   every_n=100)

        # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
        # than the TF core Adam optimizer.
        mlperf_log.transformer_print(key=mlperf_log.OPT_NAME,
                                     value=mlperf_log.LAZY_ADAM)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA1,
                                     value=params.optimizer_adam_beta1)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_BETA2,
                                     value=params.optimizer_adam_beta2)
        mlperf_log.transformer_print(key=mlperf_log.OPT_HP_ADAM_EPSILON,
                                     value=params.optimizer_adam_epsilon)
        # Use the v1 optimizer (from tensorflow.python.training); the
        # optimizer-v2 version of this code is commented out below. Optimizer
        # v1 does not have a LazyAdam optimizer (it lived in contrib, now
        # deprecated), so plain Adam is used instead.
        optimizer = adam.AdamOptimizer(learning_rate,
                                       beta1=params.optimizer_adam_beta1,
                                       beta2=params.optimizer_adam_beta2,
                                       epsilon=params.optimizer_adam_epsilon)

        # Calculate and apply gradients using the Adam optimizer.
        global_step = tf.compat.v1.train.get_global_step()
        tvars = tf.compat.v1.trainable_variables()
        grads_and_vars = optimizer.compute_gradients(loss, tvars)
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step,
                                             name="train")
        # Save gradient norm to Tensorboard
        tf.compat.v1.summary.scalar(
            "global_norm/gradient_norm",
            tf.linalg.global_norm(list(zip(*grads_and_vars))[0]))
        # The tfa (tensorflow_addons) optimizer below uses optimizer_v2 (from
        # tf.python.keras.optimizer_v2), which warns that the global step is
        # not updated, since apply_gradients() in optimizer_v2 does not accept
        # a global_step argument. The global step is therefore updated manually
        # and grouped with the training op. To use LazyAdam from
        # tensorflow-addons, enable the following code and remove the
        # optimizer-v1 code above. Currently both optimizer versions take
        # about the same time.
        '''
    optimizer = tfa.optimizers.LazyAdam(
        learning_rate,
        beta_1=params.optimizer_adam_beta1,
        beta_2=params.optimizer_adam_beta2,
        epsilon=params.optimizer_adam_epsilon)

    # Calculate and apply gradients using LazyAdamOptimizer.
    global_step = tf.compat.v1.train.get_global_step()
    tvars = tf.compat.v1.trainable_variables()
    tvars = tvars[0:len(tvars)-1]
    gradients = optimizer.get_gradients(
        loss, tvars)
    grads_and_vars = zip(gradients, tvars)
    train_op = optimizer.apply_gradients(
        grads_and_vars)
    # Save gradient norm to Tensorboard
    tf.compat.v1.summary.scalar("global_norm/gradient_norm",
                      tf.compat.v1.linalg.global_norm(list(gradients)))
    update_global_step = tf.compat.v1.assign(global_step, global_step + 1, name = "update_global_step")
    train_op = tf.compat.v1.group(train_op, [(update_global_step)])
    '''
        return train_op
def _decode_crop_and_flip(image_buffer, bbox, num_channels):
    """Crops the given image to a random part of the image, and randomly flips.

  We use the fused decode_and_crop op, which performs better than the two ops
  used separately in series, but note that this requires that the image be
  passed in as an un-decoded string Tensor.

  Args:
    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
    num_channels: Integer depth of the image buffer for decoding.

  Returns:
    3-D tensor with cropped image.

  """
    # A large fraction of image datasets contain a human-annotated bounding box
    # delineating the region of the image containing the object of interest.  We
    # choose to create a new bounding box for the object which is a randomly
    # distorted version of the human-annotated bounding box that obeys an
    # allowed range of aspect ratios, sizes and overlap with the human-annotated
    # bounding box. If no box is supplied, then we assume the bounding box is
    # the entire image.

    min_object_covered = 0.1
    aspect_ratio_range = [0.75, 1.33]
    area_range = [0.05, 1.0]
    max_attempts = 100

    mlperf_log.resnet_print(key=mlperf_log.INPUT_DISTORTED_CROP_MIN_OBJ_COV,
                            value=min_object_covered)
    mlperf_log.resnet_print(key=mlperf_log.INPUT_DISTORTED_CROP_RATIO_RANGE,
                            value=aspect_ratio_range)
    mlperf_log.resnet_print(key=mlperf_log.INPUT_DISTORTED_CROP_AREA_RANGE,
                            value=area_range)
    mlperf_log.resnet_print(key=mlperf_log.INPUT_DISTORTED_CROP_MAX_ATTEMPTS,
                            value=max_attempts)

    sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
        tf.image.extract_jpeg_shape(image_buffer),
        bounding_boxes=bbox,
        min_object_covered=min_object_covered,
        aspect_ratio_range=aspect_ratio_range,
        area_range=area_range,
        max_attempts=max_attempts,
        use_image_if_no_bounding_boxes=True)
    bbox_begin, bbox_size, _ = sample_distorted_bounding_box

    # Reassemble the bounding box in the format the crop op requires.
    offset_y, offset_x, _ = tf.unstack(bbox_begin)
    target_height, target_width, _ = tf.unstack(bbox_size)
    crop_window = tf.stack([offset_y, offset_x, target_height, target_width])

    # Use the fused decode and crop op here, which is faster than each in series.
    cropped = tf.image.decode_and_crop_jpeg(image_buffer,
                                            crop_window,
                                            channels=num_channels)

    # Flip to add a little more random distortion in.
    mlperf_log.resnet_print(key=mlperf_log.INPUT_RANDOM_FLIP)
    cropped = tf.image.random_flip_left_right(cropped)
    return cropped
def bottleneck_block(inputs, filters, is_training, strides,
                     use_projection=False, data_format='channels_first'):
  """Bottleneck block variant for residual networks with BN after convolutions.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]`.
    filters: `int` number of filters for the first two convolutions. Note that
        the third and final convolution will use 4 times as many filters.
    is_training: `bool` for whether the model is in training.
    strides: `int` block stride. If greater than 1, this block will ultimately
        downsample the input.
    use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually `True`
        for the first block of a block group, which may change the number of
        filters and the resolution.
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.

  Returns:
    The output `Tensor` of the block.
  """
  if is_training and FLAGS.mlperf_logging:
    mlperf_log.resnet_print(
        key=mlperf_log.MODEL_HP_BLOCK_TYPE, value=mlperf_log.BOTTLENECK_BLOCK)
    resnet_log_helper.log_begin_block(
        input_tensor=inputs, block_type=mlperf_log.BOTTLENECK_BLOCK)
  shortcut = inputs
  if use_projection:
    # Projection shortcut only in first block within a group. Bottleneck blocks
    # end with 4 times the number of filters.
    filters_out = 4 * filters
    shortcut = conv2d_fixed_padding(
        inputs=inputs,
        filters=filters_out,
        kernel_size=1,
        strides=strides,
        is_training=is_training,
        data_format=data_format)
    shortcut = batch_norm_relu(shortcut, is_training, relu=False,
                               data_format=data_format)
    if is_training and FLAGS.mlperf_logging:
      resnet_log_helper.log_projection(
          input_tensor=inputs, output_tensor=shortcut)

  inputs = conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=1,
      strides=1,
      is_training=is_training,
      data_format=data_format)
  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

  inputs = conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=3,
      strides=strides,
      is_training=is_training,
      data_format=data_format)
  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

  inputs = conv2d_fixed_padding(
      inputs=inputs,
      filters=4 * filters,
      kernel_size=1,
      strides=1,
      is_training=is_training,
      data_format=data_format)
  inputs = batch_norm_relu(inputs, is_training, relu=False, init_zero=True,
                           data_format=data_format)

  output = tf.nn.relu(inputs + shortcut)
  if is_training and FLAGS.mlperf_logging:
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD)
    mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU)
    resnet_log_helper.log_end_block(output_tensor=output)

  return output