Beispiel #1
0
def create_acvideo_discriminator(clips,
                                 actions,
                                 ndf=64,
                                 norm_layer='instance',
                                 use_noise=False,
                                 noise_sigma=None):
    """Build the action-conditioned video discriminator stack.

    The clips are rescaled with ``clips * 2 - 1``, each slice along the
    leading axis is concatenated with its successor on the last axis, and
    the actions are tiled onto the result before three hidden 3-D
    convolution layers and a final single-channel logits layer are applied.

    Args:
        clips: Input clip tensor. Presumably time-major with values in
            [0, 1] (the code slices ``clips[:-1]`` / ``clips[1:]`` and
            rescales with ``* 2 - 1``) -- TODO confirm against callers.
        actions: Action tensor; two singleton axes are inserted before its
            last axis so it can be combined with the clip pairs by
            ``tile_concat``.
        ndf: Number of filters of the first layer; the following hidden
            layers use ``ndf * 2`` and ``ndf * 4``.
        norm_layer: Identifier forwarded to ``ops.get_norm_layer``.
        use_noise: Forwarded to ``noise`` in front of every hidden layer.
        noise_sigma: Forwarded to ``noise`` in front of every hidden layer.

    Returns:
        A list with the three hidden activations followed by the logits,
        each passed through ``tf_utils.transpose_batch_time``.
    """
    normalize = ops.get_norm_layer(norm_layer)
    # Pad only axes 2 and 3 by one element on each side.
    spatial_pad = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]
    # Every convolution in this network shares the same configuration.
    conv_kwargs = dict(kernel_size=(3, 4, 4),
                       strides=(1, 2, 2),
                       padding='VALID',
                       use_bias=False)

    rescaled = clips * 2 - 1
    pairs = tf.concat([rescaled[:-1], rescaled[1:]], axis=-1)
    pairs = tile_concat([pairs, actions[..., None, None, :]], axis=-1)
    net = tf_utils.transpose_batch_time(pairs)

    # (variable scope, output channels, apply normalization?)
    hidden_specs = (
        ("acvideo_layer_1", ndf, False),
        ("acvideo_layer_2", ndf * 2, True),
        ("acvideo_layer_3", ndf * 4, True),
    )

    outputs = []
    for scope_name, channels, use_norm in hidden_specs:
        with tf.variable_scope(scope_name):
            net = noise(net, use_noise, noise_sigma)
            net = conv3d(tf.pad(net, spatial_pad), channels, **conv_kwargs)
            if use_norm:
                net = normalize(net)
            net = lrelu(net, 0.2)
            outputs.append(net)

    # The output layer has no noise, normalization or activation.
    with tf.variable_scope("acvideo_layer_4"):
        logits = conv3d(tf.pad(net, spatial_pad), 1, **conv_kwargs)
        outputs.append(logits)

    return nest.map_structure(tf_utils.transpose_batch_time, outputs)
Beispiel #2
0
def create_video_discriminator(clips, ndf=64, norm_layer='instance'):
    """Build the video discriminator stack.

    Three padded, spatially strided 3-D convolution layers are followed by
    an unpadded single-channel logits convolution.

    Args:
        clips: Input clip tensor; it is passed through
            ``tf_utils.transpose_batch_time`` before the first layer.
        ndf: Number of filters of the first layer; the following hidden
            layers use ``ndf * 2`` and ``ndf * 4``.
        norm_layer: Identifier forwarded to ``ops.get_norm_layer``.

    Returns:
        A list with the three hidden activations followed by the logits,
        each passed through ``tf_utils.transpose_batch_time``.
    """
    normalize = ops.get_norm_layer(norm_layer)
    # Pad only axes 2 and 3 by one element on each side.
    spatial_pad = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]

    net = tf_utils.transpose_batch_time(clips)

    # (variable scope, output channels, apply normalization?)
    hidden_specs = (
        ("video_layer_1", ndf, False),
        ("video_layer_2", ndf * 2, True),
        ("video_layer_3", ndf * 4, True),
    )

    outputs = []
    for scope_name, channels, use_norm in hidden_specs:
        with tf.variable_scope(scope_name):
            net = conv3d(tf.pad(net, spatial_pad),
                         channels,
                         kernel_size=4,
                         strides=(1, 2, 2),
                         padding='VALID')
            if use_norm:
                net = normalize(net)
            net = lrelu(net, 0.2)
            outputs.append(net)

    with tf.variable_scope("video_layer_4"):
        # Shrink the kernel along axis 1 when fewer than 4 steps remain,
        # since this last convolution is applied without padding.
        depth = net.shape[1].value
        kernel_size = (depth, 4, 4) if depth < 4 else 4
        logits = conv3d(net,
                        1,
                        kernel_size=kernel_size,
                        strides=1,
                        padding='VALID')
        outputs.append(logits)

    return nest.map_structure(tf_utils.transpose_batch_time, outputs)
Beispiel #3
0
 def conv3d(inputs, *args, **kwargs):
     """Pad ``inputs`` with ``paddings`` and forward to ``ops.conv3d``.

     ``padding`` defaults to ``'VALID'`` and ``use_spectral_norm`` to
     ``True`` unless the caller supplies them; all other positional and
     keyword arguments are passed through unchanged. ``paddings`` is taken
     from the enclosing scope.
     """
     fallbacks = (('padding', 'VALID'), ('use_spectral_norm', True))
     for option, value in fallbacks:
         # Only fill in options the caller did not set explicitly.
         if option not in kwargs:
             kwargs[option] = value
     padded_inputs = tf.pad(inputs, paddings)
     return ops.conv3d(padded_inputs, *args, **kwargs)