import tensorflow as tf
from tensorflow.python.util import nest

# Assumption about the surrounding repository layout: `ops`, `tf_utils`, and
# the `lrelu`, `noise`, and `tile_concat` helpers come from sibling modules;
# adjust these import paths to match the actual package.
from video_prediction import ops
from video_prediction.ops import lrelu, noise, tile_concat
from video_prediction.utils import tf_utils


def create_acvideo_discriminator(clips, actions, ndf=64, norm_layer='instance',
                                 use_noise=False, noise_sigma=None):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    # Pad height and width by 1 on each side; batch, time, and channels are
    # left unpadded, so the time axis shrinks under the VALID convolutions.
    paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]

    # Rescale clips from [0, 1] to [-1, 1].
    clips = clips * 2 - 1
    # Pair each frame with its successor and condition on the action taken
    # between them, tiled over the spatial dimensions.
    clip_pairs = tf.concat([clips[:-1], clips[1:]], axis=-1)
    clip_pairs = tile_concat([clip_pairs, actions[..., None, None, :]], axis=-1)
    # conv3d expects batch-major inputs: (batch, time, height, width, channels).
    clip_pairs = tf_utils.transpose_batch_time(clip_pairs)

    with tf.variable_scope("acvideo_layer_1"):
        h1 = noise(clip_pairs, use_noise, noise_sigma)
        h1 = conv3d(tf.pad(h1, paddings), ndf, kernel_size=(3, 4, 4),
                    strides=(1, 2, 2), padding='VALID', use_bias=False)
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("acvideo_layer_2"):
        h2 = noise(h1, use_noise, noise_sigma)
        h2 = conv3d(tf.pad(h2, paddings), ndf * 2, kernel_size=(3, 4, 4),
                    strides=(1, 2, 2), padding='VALID', use_bias=False)
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("acvideo_layer_3"):
        h3 = noise(h2, use_noise, noise_sigma)
        h3 = conv3d(tf.pad(h3, paddings), ndf * 4, kernel_size=(3, 4, 4),
                    strides=(1, 2, 2), padding='VALID', use_bias=False)
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("acvideo_layer_4"):
        logits = conv3d(tf.pad(h3, paddings), 1, kernel_size=(3, 4, 4),
                        strides=(1, 2, 2), padding='VALID', use_bias=False)
        layers.append(logits)
    # Return all intermediate activations (usable for feature matching) plus
    # the logits, transposed back to time-major.
    return nest.map_structure(tf_utils.transpose_batch_time, layers)
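
# Usage sketch (an addition, not from the original source): shows the
# time-major input shapes this discriminator expects. The sequence length,
# batch size, image size, and action dimension are arbitrary assumptions.
def _acvideo_discriminator_example():
    # clips: (sequence_length, batch_size, height, width, channels) in [0, 1].
    clips = tf.zeros([10, 16, 64, 64, 3])
    # actions: one per frame transition, hence sequence_length - 1 time steps.
    actions = tf.zeros([9, 16, 4])
    with tf.variable_scope("acvideo_discriminator_example"):
        layers = create_acvideo_discriminator(clips, actions, ndf=64)
    # The last entry holds the patch logits for the GAN loss; earlier entries
    # can serve as features for a feature-matching loss.
    return layers[-1]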


def create_video_discriminator(clips, ndf=64, norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    # Pad height and width by 1 on each side; time is left unpadded.
    paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]

    # conv3d expects batch-major inputs: (batch, time, height, width, channels).
    clips = tf_utils.transpose_batch_time(clips)

    with tf.variable_scope("video_layer_1"):
        h1 = conv3d(tf.pad(clips, paddings), ndf, kernel_size=4,
                    strides=(1, 2, 2), padding='VALID')
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("video_layer_2"):
        h2 = conv3d(tf.pad(h1, paddings), ndf * 2, kernel_size=4,
                    strides=(1, 2, 2), padding='VALID')
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("video_layer_3"):
        h3 = conv3d(tf.pad(h2, paddings), ndf * 4, kernel_size=4,
                    strides=(1, 2, 2), padding='VALID')
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("video_layer_4"):
        # Shrink the temporal kernel if fewer than 4 time steps remain.
        if h3.shape[1].value < 4:
            kernel_size = (h3.shape[1].value, 4, 4)
        else:
            kernel_size = 4
        logits = conv3d(h3, 1, kernel_size=kernel_size, strides=1, padding='VALID')
        layers.append(logits)
    return nest.map_structure(tf_utils.transpose_batch_time, layers)
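
# Usage sketch (an addition, not from the original source): the unconditioned
# video discriminator takes only time-major clips. All sizes are arbitrary
# assumptions; note that with kernel_size=4 and no temporal padding, each of
# the first three layers shortens the time axis by 3, so 10 steps reduce to 1.
def _video_discriminator_example():
    # clips: (sequence_length, batch_size, height, width, channels).
    clips = tf.zeros([10, 16, 64, 64, 3])
    with tf.variable_scope("video_discriminator_example"):
        layers = create_video_discriminator(clips, ndf=64)
    return layers[-1]  # patch logits, transposed back to time-major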


def conv3d(inputs, *args, **kwargs):
    # Thin wrapper around ops.conv3d that defaults to VALID padding with
    # spectral normalization; call sites apply their own tf.pad beforehand.
    kwargs.setdefault('padding', 'VALID')
    kwargs.setdefault('use_spectral_norm', True)
    return ops.conv3d(inputs, *args, **kwargs)
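
# Illustration (an addition, not from the original source): the wrapper only
# sets defaults, so call sites can still override them per layer. This assumes
# ops.conv3d accepts the `use_spectral_norm` keyword, as the default implies.
def _conv3d_wrapper_example(inputs):
    # Defaults apply: VALID padding with spectral normalization.
    a = conv3d(inputs, 32, kernel_size=4, strides=2)
    # Explicit keywords win over setdefault: plain SAME convolution.
    b = conv3d(inputs, 32, kernel_size=4, strides=2,
               padding='SAME', use_spectral_norm=False)
    return a, b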