Example 1
def lite_conv_stem(inputs,
                   filters,
                   temporal_dilation=1,
                   is_training=False,
                   data_format='channels_last'):
    """Layers for a RGB or optical flow stem, using 2D + 1D conv layers.

    Args:
      inputs: A `Tensor` of size `[batch*time, height, width, channels]`.
      filters: `int` number of filters in the convolution.
      temporal_dilation: `int` temporal dilation size for the 1D conv.
      is_training: `bool` specifying whether in training mode or not.
      data_format: `str`. Only supports "channels_last" as the data format.

    Returns:
      The output `Tensor`.
    """
    assert data_format == 'channels_last'

    if temporal_dilation < 1:
        temporal_dilation = 1

    inputs = asn.conv3d_same_padding(inputs=inputs,
                                     filters=filters,
                                     kernel_size=3,
                                     strides=2,
                                     do_2d_conv=True,
                                     data_format=data_format)
    inputs = tf.identity(inputs, 'initial_conv')
    inputs = rf.batch_norm_relu(inputs,
                                is_training,
                                bn_decay=FLAGS.bn_decay,
                                bn_epsilon=FLAGS.bn_epsilon,
                                data_format=data_format)

    inputs = conv1d_bn(inputs=inputs,
                       is_training=is_training,
                       filters=filters,
                       kernel_size=3,
                       temporal_dilation=temporal_dilation,
                       data_format=data_format)

    return inputs
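
The stem above depends on the repository's `asn` and `rf` helpers plus `FLAGS`, so it is not runnable on its own. As a point of reference, here is a minimal self-contained Keras sketch of the same (2+1)D stem idea: a strided 3x3 spatial conv with BN+ReLU, followed by a dilated 1D temporal conv with BN+ReLU. The layer choices below are illustrative assumptions, not the repository's implementation.

# Illustrative sketch only; does not reuse asn/rf/FLAGS from the snippet above.
import tensorflow as tf


def conv2plus1d_stem(filters, temporal_dilation=1):
    """Builds a (2+1)D stem for a [batch, time, height, width, channels] input."""
    return tf.keras.Sequential([
        # Spatial 3x3 conv (applied per frame via a (1, 3, 3) kernel), stride 2.
        tf.keras.layers.Conv3D(filters, kernel_size=(1, 3, 3),
                               strides=(1, 2, 2), padding='same',
                               use_bias=False),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        # Temporal conv (kernel (3, 1, 1)) with the requested dilation.
        tf.keras.layers.Conv3D(filters, kernel_size=(3, 1, 1),
                               strides=1, padding='same',
                               dilation_rate=(max(temporal_dilation, 1), 1, 1),
                               use_bias=False),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
    ])


# Example: 2 clips of 8 RGB frames at 64x64 -> (2, 8, 32, 32, 24).
stem = conv2plus1d_stem(filters=24, temporal_dilation=2)
print(stem(tf.random.normal([2, 8, 64, 64, 3]), training=False).shape)
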
Example 2
def lite_one_stream_head(inputs,
                         num_classes,
                         is_training=False,
                         data_format='channels_last'):
    """Layers for one classification head.

    Args:
      inputs: A 4D `Tensor` following the data_format.
      num_classes: `int` number of possible classes for video classification.
      is_training: `bool` specifying whether in training mode or not.
      data_format: `str` either "channels_first" for `[batch*time, channels,
        height, width]` or "channels_last" for `[batch*time, height, width,
        channels]`. Only works for "channels_last" currently.

    Returns:
      The output `Tensor`.
    """
    assert data_format == 'channels_last'

    batch_size = inputs.shape[0]
    num_frames = inputs.shape[1]

    inputs = asn.conv3d_same_padding(inputs=inputs,
                                     filters=432,
                                     kernel_size=1,
                                     strides=1,
                                     data_format=data_format)
    inputs = tf.identity(inputs, 'last_conv')
    inputs = rf.batch_norm_relu(inputs,
                                is_training,
                                bn_decay=FLAGS.bn_decay,
                                bn_epsilon=FLAGS.bn_epsilon,
                                data_format=data_format)

    inputs = asn.conv3d_same_padding(inputs=inputs,
                                     filters=2048,
                                     kernel_size=1,
                                     strides=1,
                                     data_format=data_format)
    inputs = tf.identity(inputs, 'last_conv2')
    inputs = rf.batch_norm_relu(inputs,
                                is_training,
                                bn_decay=FLAGS.bn_decay,
                                bn_epsilon=FLAGS.bn_epsilon,
                                data_format=data_format)

    if not FLAGS.max_pool_preditions:
        pool_size = (inputs.shape[1], inputs.shape[2], inputs.shape[3])
        inputs = tf.layers.average_pooling3d(inputs=inputs,
                                             pool_size=pool_size,
                                             strides=1,
                                             padding='VALID',
                                             data_format=data_format)
        inputs = tf.identity(inputs, 'final_avg_pool')

        inputs = tf.reshape(inputs, [batch_size, -1])
    else:
        pool_size = (1, inputs.shape[2], inputs.shape[3])
        inputs = tf.layers.average_pooling3d(inputs=inputs,
                                             pool_size=pool_size,
                                             strides=1,
                                             padding='VALID',
                                             data_format=data_format)
        inputs = tf.identity(inputs, 'final_avg_pool')

        inputs = tf.reshape(inputs, [batch_size, num_frames, -1])

    if FLAGS.dropout_keep_prob:
        inputs = tf.keras.layers.Dropout(FLAGS.dropout_keep_prob)(inputs, training=is_training)  # pylint: disable=line-too-long

    outputs = tf.layers.dense(
        inputs=inputs,
        units=num_classes,
        kernel_initializer=tf.random_normal_initializer(stddev=.01))
    outputs = tf.identity(outputs, 'final_dense')

    if FLAGS.max_pool_preditions:
        pre_logits = outputs / np.sqrt(num_frames)
        acts = tf.nn.softmax(pre_logits, axis=1)
        outputs = tf.math.multiply(outputs, acts)

        outputs = tf.reduce_sum(outputs, 1)

    return outputs
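
As with the stem, this head is tied to `asn`, `rf`, and `FLAGS`. For a rough, self-contained illustration of its average-pooling branch (two 1x1x1 conv + BN/ReLU stages, global average pooling over time and space, dropout, then a dense classifier), a Keras sketch might look like the following; the dropout rate and the exact initializer are assumptions.

# Illustrative sketch only; mirrors the 432/2048 filter widths of the snippet.
import tensorflow as tf


def lite_head(num_classes, dropout_rate=0.2):
    """Classification head for a [batch, time, height, width, channels] input."""
    return tf.keras.Sequential([
        tf.keras.layers.Conv3D(432, kernel_size=1, use_bias=False),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Conv3D(2048, kernel_size=1, use_bias=False),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        # Average over time, height and width -> one feature vector per clip.
        tf.keras.layers.GlobalAveragePooling3D(),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(
            num_classes,
            kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01)),
    ])


# Example: pooled backbone features for 2 clips -> (2, 400) logits.
features = tf.random.normal([2, 8, 7, 7, 256])
print(lite_head(num_classes=400)(features, training=False).shape)
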
Example 3
def inverted_bottleneck_3dblock(inputs,
                                filters,
                                is_training,
                                strides,
                                use_projection=False,
                                temporal_dilation=1,
                                data_format='channels_last'):
    """Inverted bottleneck residual block with a 3D conv layer.

    Inverted bottleneck block variant for 3D residual networks with BN after
    convolutions. It uses a (2+1)D conv instead when striding is needed.

    Args:
      inputs: 5D `Tensor` following the data_format.
      filters: `List` of `int` number of filters.
      is_training: `bool` for whether the model is in training.
      strides: `int` block stride. If greater than 1, this block will ultimately
        downsample the input spatially.
      use_projection: `bool` for whether this block should use a projection
        shortcut (versus the default identity shortcut). This is usually `True`
        for the first block of a block group, which may change the number of
        filters and the resolution.
      temporal_dilation: `int` temporal dilation size for the 1D conv.
      data_format: `str` either "channels_first" for `[batch, time, channels,
        height, width]` or "channels_last" for `[batch, time, height, width,
        channels]`.

    Returns:
      The output `Tensor` of the block.
    """
    shortcut = inputs
    if use_projection:
        # Projection shortcut, used only in the first block within a group. Its
        # output channel count matches the block output, i.e. filters[-1].

        shortcut = asn.conv3d_same_padding(inputs=inputs,
                                           filters=filters[-1],
                                           kernel_size=1,
                                           strides=strides,
                                           data_format=data_format)
        shortcut = rf.batch_norm_relu(shortcut,
                                      is_training,
                                      relu=False,
                                      bn_decay=FLAGS.bn_decay,
                                      bn_epsilon=FLAGS.bn_epsilon,
                                      data_format=data_format)

    inputs = asn.conv3d_same_padding(inputs=inputs,
                                     filters=filters[0],
                                     kernel_size=1,
                                     strides=1,
                                     data_format=data_format)
    inputs = rf.batch_norm_relu(inputs,
                                is_training,
                                bn_decay=FLAGS.bn_decay,
                                bn_epsilon=FLAGS.bn_epsilon,
                                data_format=data_format)

    if strides > 1:
        inputs = asn.conv3d_same_padding(inputs=inputs,
                                         filters=filters[1],
                                         kernel_size=3,
                                         strides=strides,
                                         do_2d_conv=True,
                                         data_format=data_format)
        inputs = rf.batch_norm_relu(inputs,
                                    is_training,
                                    bn_decay=FLAGS.bn_decay,
                                    bn_epsilon=FLAGS.bn_epsilon,
                                    data_format=data_format)
        inputs = conv1d_bn(inputs,
                           is_training,
                           filters=filters[1],
                           kernel_size=3,
                           temporal_dilation=temporal_dilation,
                           data_format=data_format)
    else:
        inputs = asn.conv3d_same_padding(inputs=inputs,
                                         filters=filters[1],
                                         kernel_size=[3, 3, 3],
                                         strides=1,
                                         temporal_dilation=temporal_dilation,
                                         data_format=data_format)
        inputs = rf.batch_norm_relu(inputs,
                                    is_training,
                                    bn_decay=FLAGS.bn_decay,
                                    bn_epsilon=FLAGS.bn_epsilon,
                                    data_format=data_format)

    inputs = asn.conv3d_same_padding(inputs=inputs,
                                     filters=filters[-1],
                                     kernel_size=1,
                                     strides=1,
                                     data_format=data_format)
    inputs = rf.batch_norm_relu(inputs,
                                is_training,
                                relu=False,
                                init_zero=True,
                                bn_decay=FLAGS.bn_decay,
                                bn_epsilon=FLAGS.bn_epsilon,
                                data_format=data_format)

    return tf.nn.relu(inputs + shortcut)
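
To make the residual structure easier to follow in isolation, here is a self-contained Keras sketch of the non-strided path of this block: 1x1 expand, 3x3x3 conv (with temporal dilation), 1x1 project with a zero-initialized BN gamma, residual add, final ReLU. The strided (2+1)D path and the FLAGS-driven BN settings are omitted, and every hyper-parameter below is an assumption.

# Illustrative sketch only; not the repository's asn/rf implementation.
import tensorflow as tf


def inverted_bottleneck_block(inputs, filters, use_projection=False,
                              temporal_dilation=1, training=False):
    """`filters` is an [expand, middle, out] triple, as in the snippet above."""
    shortcut = inputs
    if use_projection:
        # 1x1x1 projection so the shortcut matches the block's output channels.
        shortcut = tf.keras.layers.Conv3D(filters[-1], 1, use_bias=False)(inputs)
        shortcut = tf.keras.layers.BatchNormalization()(shortcut,
                                                        training=training)

    x = tf.keras.layers.Conv3D(filters[0], 1, use_bias=False)(inputs)
    x = tf.keras.layers.BatchNormalization()(x, training=training)
    x = tf.nn.relu(x)

    x = tf.keras.layers.Conv3D(filters[1], 3, padding='same', use_bias=False,
                               dilation_rate=(temporal_dilation, 1, 1))(x)
    x = tf.keras.layers.BatchNormalization()(x, training=training)
    x = tf.nn.relu(x)

    x = tf.keras.layers.Conv3D(filters[-1], 1, use_bias=False)(x)
    # Zero-init gamma so the residual branch starts close to an identity map.
    x = tf.keras.layers.BatchNormalization(gamma_initializer='zeros')(
        x, training=training)

    return tf.nn.relu(x + shortcut)


# Example: identity shortcut, so filters[-1] must equal the input channels.
video = tf.random.normal([2, 8, 14, 14, 64])
print(inverted_bottleneck_block(video, filters=[256, 256, 64]).shape)
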