def lite_conv_stem(inputs, filters, temporal_dilation=1, is_training=False,
                   data_format='channels_last'):
  """RGB/optical-flow stem: a strided 2D conv followed by a 1D temporal conv.

  Args:
    inputs: Input `Tensor` in 'channels_last' layout — presumably
      `[batch*time, height, width, channels]`; confirm against the caller.
    filters: `int` number of filters for the stem convolutions.
    temporal_dilation: `int` dilation rate for the 1D temporal conv.
      Values below 1 are clamped to 1.
    is_training: `bool` specifying whether in training mode or not.
    data_format: `str`. Only 'channels_last' is supported.

  Returns:
    The output `Tensor` of the stem.
  """
  assert data_format == 'channels_last'

  # Guard against invalid (non-positive) dilation rates.
  temporal_dilation = max(temporal_dilation, 1)

  # Spatial 2D conv with stride 2 for the initial downsampling.
  stem = asn.conv3d_same_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=3,
      strides=2,
      do_2d_conv=True,
      data_format=data_format)
  stem = tf.identity(stem, 'initial_conv')
  stem = rf.batch_norm_relu(
      stem,
      is_training,
      bn_decay=FLAGS.bn_decay,
      bn_epsilon=FLAGS.bn_epsilon,
      data_format=data_format)

  # 1D temporal conv (with BN) completes the (2+1)D stem.
  return conv1d_bn(
      inputs=stem,
      is_training=is_training,
      filters=filters,
      kernel_size=3,
      temporal_dilation=temporal_dilation,
      data_format=data_format)
def lite_one_stream_head(inputs, num_classes, is_training=False,
                         data_format='channels_last'):
  """Classification head: two 1x1x1 convs, average pooling, and a dense layer.

  Args:
    inputs: A 5D `Tensor` in 'channels_last' layout — presumably
      `[batch, time, height, width, channels]`; confirm against the caller.
    num_classes: `int` number of possible classes for video classification.
    is_training: `bool` specifying whether in training mode or not.
    data_format: `str`. Only 'channels_last' is supported.

  Returns:
    The output logits `Tensor`.
  """
  assert data_format == 'channels_last'

  batch_size = inputs.shape[0]
  num_frames = inputs.shape[1]

  # Pointwise conv to 432 channels, then expansion to 2048 channels,
  # each followed by BN + ReLU.
  net = asn.conv3d_same_padding(inputs=inputs, filters=432, kernel_size=1,
                                strides=1, data_format=data_format)
  net = tf.identity(net, 'last_conv')
  net = rf.batch_norm_relu(net, is_training,
                           bn_decay=FLAGS.bn_decay,
                           bn_epsilon=FLAGS.bn_epsilon,
                           data_format=data_format)
  net = asn.conv3d_same_padding(inputs=net, filters=2048, kernel_size=1,
                                strides=1, data_format=data_format)
  net = tf.identity(net, 'last_conv2')
  net = rf.batch_norm_relu(net, is_training,
                           bn_decay=FLAGS.bn_decay,
                           bn_epsilon=FLAGS.bn_epsilon,
                           data_format=data_format)

  if FLAGS.max_pool_preditions:
    # Pool only over space, keeping the temporal axis for per-frame logits.
    spatial_pool = (1, net.shape[2], net.shape[3])
    net = tf.layers.average_pooling3d(inputs=net, pool_size=spatial_pool,
                                      strides=1, padding='VALID',
                                      data_format=data_format)
    net = tf.identity(net, 'final_avg_pool')
    net = tf.reshape(net, [batch_size, num_frames, -1])
  else:
    # Global spatio-temporal average pooling to a single feature vector.
    full_pool = (net.shape[1], net.shape[2], net.shape[3])
    net = tf.layers.average_pooling3d(inputs=net, pool_size=full_pool,
                                      strides=1, padding='VALID',
                                      data_format=data_format)
    net = tf.identity(net, 'final_avg_pool')
    net = tf.reshape(net, [batch_size, -1])

  if FLAGS.dropout_keep_prob:
    # NOTE(review): `tf.keras.layers.Dropout` takes a *drop* rate, but the
    # flag is named `dropout_keep_prob` — confirm the intended semantics.
    net = tf.keras.layers.Dropout(FLAGS.dropout_keep_prob)(
        net, training=is_training)

  outputs = tf.layers.dense(
      inputs=net,
      units=num_classes,
      kernel_initializer=tf.random_normal_initializer(stddev=.01))
  outputs = tf.identity(outputs, 'final_dense')

  if FLAGS.max_pool_preditions:
    # Softmax-weighted aggregation of per-frame logits over the time axis;
    # logits are scaled by 1/sqrt(num_frames) before the softmax.
    pre_logits = outputs / np.sqrt(num_frames)
    acts = tf.nn.softmax(pre_logits, axis=1)
    outputs = tf.reduce_sum(tf.math.multiply(outputs, acts), 1)
  return outputs
def inverted_bottleneck_3dblock(inputs, filters, is_training, strides,
                                use_projection=False, temporal_dilation=1,
                                data_format='channels_last'):
  """Inverted bottleneck residual block with a 3D conv layer.

  Inverted bottleneck block variant for 3D residual networks, applying BN
  after each convolution. When spatial striding is needed, the middle conv
  is factorized into a (2+1)D pair: a strided 2D spatial conv followed by a
  1D temporal conv.

  Args:
    inputs: 5D `Tensor` following the data_format.
    filters: `List` of `int` filter counts; `filters[0]` is the expansion
      width, `filters[1]` the middle conv width, `filters[-1]` the output
      width.
    is_training: `bool` for whether the model is in training.
    strides: `int` block stride. If greater than 1, this block will
      ultimately downsample the input spatially.
    use_projection: `bool` for whether this block should use a projection
      shortcut (versus the default identity shortcut). This is usually
      `True` for the first block of a block group, which may change the
      number of filters and the resolution.
    temporal_dilation: `int` temporal dilation rate for the 1D conv.
    data_format: `str` either "channels_first" for
      `[batch, time, channels, height, width]` or "channels_last" for
      `[batch, time, height, width, channels]`.

  Returns:
    The output `Tensor` of the block.
  """
  shortcut = inputs
  if use_projection:
    # Projection shortcut only in the first block within a group: matches
    # the residual branch's output channels (filters[-1]) and resolution.
    shortcut = asn.conv3d_same_padding(inputs=inputs, filters=filters[-1],
                                       kernel_size=1, strides=strides,
                                       data_format=data_format)
    shortcut = rf.batch_norm_relu(shortcut, is_training, relu=False,
                                  bn_decay=FLAGS.bn_decay,
                                  bn_epsilon=FLAGS.bn_epsilon,
                                  data_format=data_format)

  # Pointwise expansion conv.
  net = asn.conv3d_same_padding(inputs=inputs, filters=filters[0],
                                kernel_size=1, strides=1,
                                data_format=data_format)
  net = rf.batch_norm_relu(net, is_training,
                           bn_decay=FLAGS.bn_decay,
                           bn_epsilon=FLAGS.bn_epsilon,
                           data_format=data_format)

  if strides > 1:
    # (2+1)D path: strided 2D spatial conv, then dilated 1D temporal conv.
    net = asn.conv3d_same_padding(inputs=net, filters=filters[1],
                                  kernel_size=3, strides=strides,
                                  do_2d_conv=True, data_format=data_format)
    net = rf.batch_norm_relu(net, is_training,
                             bn_decay=FLAGS.bn_decay,
                             bn_epsilon=FLAGS.bn_epsilon,
                             data_format=data_format)
    net = conv1d_bn(net, is_training, filters=filters[1], kernel_size=3,
                    temporal_dilation=temporal_dilation,
                    data_format=data_format)
  else:
    # Full 3x3x3 conv when no spatial downsampling is needed.
    net = asn.conv3d_same_padding(inputs=net, filters=filters[1],
                                  kernel_size=[3, 3, 3], strides=1,
                                  temporal_dilation=temporal_dilation,
                                  data_format=data_format)
    net = rf.batch_norm_relu(net, is_training,
                             bn_decay=FLAGS.bn_decay,
                             bn_epsilon=FLAGS.bn_epsilon,
                             data_format=data_format)

  # Pointwise projection back to filters[-1]; final BN uses init_zero=True
  # and no ReLU before the residual addition.
  net = asn.conv3d_same_padding(inputs=net, filters=filters[-1],
                                kernel_size=1, strides=1,
                                data_format=data_format)
  net = rf.batch_norm_relu(net, is_training, relu=False, init_zero=True,
                           bn_decay=FLAGS.bn_decay,
                           bn_epsilon=FLAGS.bn_epsilon,
                           data_format=data_format)
  return tf.nn.relu(net + shortcut)