Example #1
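These snippets are TensorFlow 1.x graph-mode code built on helper ops from a video-prediction codebase. A minimal sketch of the imports they rely on; the exact module layout is an assumption inferred from the identifiers used below, not confirmed by the examples:

# Assumed imports -- the module paths are a guess based on the names used below.
import tensorflow as tf
from tensorflow.python.util import nest

from video_prediction import ops
from video_prediction.utils import tf_utils
from video_prediction.ops import (conv2d, conv3d, dense, lrelu, noise,
                                  pool2d, tile_concat)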
def encoder(inputs, nef=64, n_layers=3, norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]

    with tf.variable_scope("layer_1"):
        convolved = conv2d(tf.pad(inputs, paddings),
                           nef,
                           kernel_size=4,
                           strides=2,
                           padding='VALID')
        rectified = lrelu(convolved, 0.2)
        layers.append(rectified)

    for i in range(1, n_layers):
        with tf.variable_scope("layer_%d" % (len(layers) + 1)):
            out_channels = nef * min(2**i, 4)
            convolved = conv2d(tf.pad(layers[-1], paddings),
                               out_channels,
                               kernel_size=4,
                               strides=2,
                               padding='VALID')
            normalized = norm_layer(convolved)
            rectified = lrelu(normalized, 0.2)
            layers.append(rectified)

    # global average pooling over the remaining spatial extent
    pooled = pool2d(rectified,
                    rectified.shape.as_list()[1:3],
                    padding='VALID',
                    pool_mode='avg')
    squeezed = tf.squeeze(pooled, [1, 2])
    return squeezed
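A hedged usage sketch for this encoder; the input placeholder and its shape are illustrative assumptions:

# Hypothetical usage: encode a batch of 64x64 RGB images.
# With n_layers=3 the spatial size halves three times (64 -> 8) and the
# final feature vector has nef * 4 channels.
images = tf.placeholder(tf.float32, [None, 64, 64, 3])
features = encoder(images, nef=64, n_layers=3)  # shape: [batch, 256]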
Example #2
def create_acvideo_discriminator(clips,
                                 actions,
                                 ndf=64,
                                 norm_layer='instance',
                                 use_noise=False,
                                 noise_sigma=None):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]

    # rescale from [0, 1] to [-1, 1], pair consecutive frames, and broadcast
    # the per-step action vector over each pair's spatial dimensions
    clips = clips * 2 - 1
    clip_pairs = tf.concat([clips[:-1], clips[1:]], axis=-1)
    clip_pairs = tile_concat([clip_pairs, actions[..., None, None, :]],
                             axis=-1)
    clip_pairs = tf_utils.transpose_batch_time(clip_pairs)

    with tf.variable_scope("acvideo_layer_1"):
        h1 = noise(clip_pairs, use_noise, noise_sigma)
        h1 = conv3d(tf.pad(h1, paddings),
                    ndf,
                    kernel_size=(3, 4, 4),
                    strides=(1, 2, 2),
                    padding='VALID',
                    use_bias=False)
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("acvideo_layer_2"):
        h2 = noise(h1, use_noise, noise_sigma)
        h2 = conv3d(tf.pad(h2, paddings),
                    ndf * 2,
                    kernel_size=(3, 4, 4),
                    strides=(1, 2, 2),
                    padding='VALID',
                    use_bias=False)
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("acvideo_layer_3"):
        h3 = noise(h2, use_noise, noise_sigma)
        h3 = conv3d(tf.pad(h3, paddings),
                    ndf * 4,
                    kernel_size=(3, 4, 4),
                    strides=(1, 2, 2),
                    padding='VALID',
                    use_bias=False)
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("acvideo_layer_4"):
        logits = conv3d(tf.pad(h3, paddings),
                        1,
                        kernel_size=(3, 4, 4),
                        strides=(1, 2, 2),
                        padding='VALID',
                        use_bias=False)
        layers.append(logits)
    return nest.map_structure(tf_utils.transpose_batch_time, layers)
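tf_utils.transpose_batch_time presumably swaps the leading time and batch axes so the 3-D convolutions see [batch, time, height, width, channels]; a minimal sketch of what such a helper likely does (assumed behavior, not the library's actual code):

# Assumed behavior: swap the first two axes, e.g.
# [time, batch, H, W, C] <-> [batch, time, H, W, C].
def transpose_batch_time(x):
    perm = [1, 0] + list(range(2, x.shape.ndims))
    return tf.transpose(x, perm)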
Example #3
def create_encoder(image, nef=64, norm_layer='instance', dim_z=10):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]

    with tf.variable_scope("layer_1"):
        h0 = conv2d(tf.pad(image, paddings),
                    nef,
                    kernel_size=4,
                    strides=2,
                    padding='VALID')
        h0 = norm_layer(h0)
        h0 = lrelu(h0, 0.2)
        layers.append(h0)

    with tf.variable_scope("layer_2"):
        h1 = conv2d(tf.pad(h0, paddings),
                    nef * 2,
                    kernel_size=4,
                    strides=2,
                    padding='VALID')
        h1 = norm_layer(h1)
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("layer_3"):
        h2 = conv2d(tf.pad(h1, paddings),
                    nef * 4,
                    kernel_size=4,
                    strides=2,
                    padding='VALID')
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("layer_4"):
        h3 = conv2d(tf.pad(h2, paddings),
                    nef * 8,
                    kernel_size=4,
                    strides=2,
                    padding='VALID')
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("layer_5"):
        h4 = conv2d(tf.pad(h3, paddings),
                    dim_z,
                    kernel_size=4,
                    strides=2,
                    padding='VALID')
        layers.append(h4)

    pooled = pool2d(h4,
                    h4.shape[1:3].as_list(),
                    padding='VALID',
                    pool_mode='avg')
    squeezed = tf.squeeze(pooled, [1, 2])
    return squeezed
Example #4
def create_image_discriminator(images,
                               ndf=64,
                               norm_layer='instance',
                               use_noise=False,
                               noise_sigma=None):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]

    images = images * 2 - 1

    with tf.variable_scope("image_layer_1"):
        h1 = noise(images, use_noise, noise_sigma)
        h1 = conv2d(tf.pad(h1, paddings),
                    ndf,
                    kernel_size=4,
                    strides=2,
                    padding='VALID',
                    use_bias=False)
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("image_layer_2"):
        h2 = noise(h1, use_noise, noise_sigma)
        h2 = conv2d(tf.pad(h2, paddings),
                    ndf * 2,
                    kernel_size=4,
                    strides=2,
                    padding='VALID',
                    use_bias=False)
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("image_layer_3"):
        h3 = noise(h2, use_noise, noise_sigma)
        h3 = conv2d(tf.pad(h3, paddings),
                    ndf * 4,
                    kernel_size=4,
                    strides=2,
                    padding='VALID',
                    use_bias=False)
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("image_layer_4"):
        h4 = noise(h3, use_noise, noise_sigma)
        logits = conv2d(tf.pad(h4, paddings),
                        1,
                        kernel_size=4,
                        strides=2,
                        padding='VALID',
                        use_bias=False)
        layers.append(logits)
    return layers
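The noise helper presumably adds Gaussian noise to the discriminator input when use_noise is set, a common GAN stabilization trick; a sketch under that assumption:

# Assumed behavior of noise(): optionally perturb the input with
# Gaussian noise to regularize the discriminator.
def noise(x, use_noise=False, sigma=None):
    if use_noise:
        return x + tf.random_normal(tf.shape(x), stddev=sigma)
    return x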
Example #5
def create_n_layer_discriminator(discrim_targets,
                                 discrim_inputs=None,
                                 ndf=64,
                                 n_layers=3,
                                 norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)

    layers = []
    inputs = [discrim_targets]
    if discrim_inputs is not None:
        inputs.append(discrim_inputs)
    inputs = tf.concat(inputs, axis=-1)

    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]

    # layer_1: [batch, 256, 256, in_channels * 2] => [batch, 128, 128, ndf]
    with tf.variable_scope("layer_1"):
        convolved = conv2d(tf.pad(inputs, paddings),
                           ndf,
                           kernel_size=4,
                           strides=2,
                           padding='VALID')
        rectified = lrelu(convolved, 0.2)
        layers.append(rectified)

    # layer_2: [batch, 128, 128, ndf] => [batch, 64, 64, ndf * 2]
    # layer_3: [batch, 64, 64, ndf * 2] => [batch, 32, 32, ndf * 4]
    # layer_4: [batch, 32, 32, ndf * 4] => [batch, 31, 31, ndf * 8]
    for i in range(n_layers):
        with tf.variable_scope("layer_%d" % (len(layers) + 1)):
            out_channels = ndf * min(2**(i + 1), 8)
            stride = 1 if i == n_layers - 1 else 2  # last layer here has stride 1
            convolved = conv2d(tf.pad(layers[-1], paddings),
                               out_channels,
                               kernel_size=4,
                               strides=stride,
                               padding='VALID')
            normalized = norm_layer(convolved)
            rectified = lrelu(normalized, 0.2)
            layers.append(rectified)

    # layer_5: [batch, 31, 31, ndf * 8] => [batch, 30, 30, 1]
    with tf.variable_scope("layer_%d" % (len(layers) + 1)):
        logits = conv2d(tf.pad(rectified, paddings),
                        1,
                        kernel_size=4,
                        strides=1,
                        padding='VALID')
        # don't apply sigmoid to the logits in case we want to use LSGAN
        layers.append(logits)
    return layers
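Because the sigmoid is left off, the returned logits can feed a least-squares GAN objective directly. A hedged sketch of such losses; the tensors targets, inputs, and gen_outputs are illustrative assumptions:

# Illustrative LSGAN losses on the final logits (layers[-1]); the two
# discriminator calls share variables via scope reuse.
with tf.variable_scope('discriminator'):
    real_logits = create_n_layer_discriminator(targets, inputs)[-1]
with tf.variable_scope('discriminator', reuse=True):
    fake_logits = create_n_layer_discriminator(gen_outputs, inputs)[-1]
discrim_loss = 0.5 * (tf.reduce_mean(tf.square(real_logits - 1)) +
                      tf.reduce_mean(tf.square(fake_logits)))
gen_loss = 0.5 * tf.reduce_mean(tf.square(fake_logits - 1))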
Example #6
def create_video_discriminator(clips, ndf=64, norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]

    clips = tf_utils.transpose_batch_time(clips)

    with tf.variable_scope("video_layer_1"):
        h1 = conv3d(tf.pad(clips, paddings),
                    ndf,
                    kernel_size=4,
                    strides=(1, 2, 2),
                    padding='VALID')
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("video_layer_2"):
        h2 = conv3d(tf.pad(h1, paddings),
                    ndf * 2,
                    kernel_size=4,
                    strides=(1, 2, 2),
                    padding='VALID')
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("video_layer_3"):
        h3 = conv3d(tf.pad(h2, paddings),
                    ndf * 4,
                    kernel_size=4,
                    strides=(1, 2, 2),
                    padding='VALID')
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("video_layer_4"):
        if h3.shape[1].value < 4:
            kernel_size = (h3.shape[1].value, 4, 4)
        else:
            kernel_size = 4
        logits = conv3d(h3,
                        1,
                        kernel_size=kernel_size,
                        strides=1,
                        padding='VALID')
        layers.append(logits)
    return nest.map_structure(tf_utils.transpose_batch_time, layers)
Example #7
def create_n_layer_encoder(inputs,
                           nz=8,
                           nef=64,
                           n_layers=3,
                           norm_layer='instance',
                           include_top=True):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]

    with tf.variable_scope("layer_1"):
        convolved = conv2d(tf.pad(inputs, paddings),
                           nef,
                           kernel_size=4,
                           strides=2,
                           padding='VALID')
        rectified = lrelu(convolved, 0.2)
        layers.append(rectified)

    for i in range(1, n_layers):
        with tf.variable_scope("layer_%d" % (len(layers) + 1)):
            out_channels = nef * min(2**i, 4)
            convolved = conv2d(tf.pad(layers[-1], paddings),
                               out_channels,
                               kernel_size=4,
                               strides=2,
                               padding='VALID')
            normalized = norm_layer(convolved)
            rectified = lrelu(normalized, 0.2)
            layers.append(rectified)

    pooled = pool2d(rectified,
                    rectified.shape[1:3].as_list(),
                    padding='VALID',
                    pool_mode='avg')
    squeezed = tf.squeeze(pooled, [1, 2])

    if include_top:
        with tf.variable_scope('z_mu'):
            z_mu = dense(squeezed, nz)
        with tf.variable_scope('z_log_sigma_sq'):
            z_log_sigma_sq = dense(squeezed, nz)
            z_log_sigma_sq = tf.clip_by_value(z_log_sigma_sq, -10, 10)
        outputs = {'enc_zs_mu': z_mu, 'enc_zs_log_sigma_sq': z_log_sigma_sq}
    else:
        outputs = squeezed
    return outputs
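With include_top=True the encoder parameterizes a diagonal Gaussian over z; downstream code would typically sample it with the reparameterization trick. A minimal sketch, assuming some inputs tensor:

# Sample z ~ N(mu, sigma^2) via the reparameterization trick.
enc_outputs = create_n_layer_encoder(inputs, nz=8)
z_mu = enc_outputs['enc_zs_mu']
z_log_sigma_sq = enc_outputs['enc_zs_log_sigma_sq']
eps = tf.random_normal(tf.shape(z_mu))
z = z_mu + tf.exp(0.5 * z_log_sigma_sq) * eps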
Example #8
def create_image_discriminator(images, ndf=64, norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]

    with tf.variable_scope("image_layer_1"):
        h1 = conv2d(tf.pad(images, paddings),
                    ndf,
                    kernel_size=4,
                    strides=2,
                    padding='VALID')
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("image_layer_2"):
        h2 = conv2d(tf.pad(h1, paddings),
                    ndf * 2,
                    kernel_size=4,
                    strides=2,
                    padding='VALID')
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("image_layer_3"):
        h3 = conv2d(tf.pad(h2, paddings),
                    ndf * 4,
                    kernel_size=4,
                    strides=2,
                    padding='VALID')
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("image_layer_4"):
        logits = conv2d(h3, 1, kernel_size=4, strides=1, padding='VALID')
        layers.append(logits)
    return layers
Example #9
def video_sn_discriminator(clips, ndf=64):
    clips = tf_utils.transpose_batch_time(clips)
    batch_size = clips.shape[0].value
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]]

    def conv3d(inputs, *args, **kwargs):
        # route every conv through symmetric padding and spectral normalization
        kwargs.setdefault('padding', 'VALID')
        kwargs.setdefault('use_spectral_norm', True)
        return ops.conv3d(tf.pad(inputs, paddings), *args, **kwargs)

    with tf.variable_scope("sn_conv0_0"):
        layers.append(lrelu(conv3d(clips, ndf, kernel_size=3, strides=1), 0.1))

    with tf.variable_scope("sn_conv0_1"):
        layers.append(
            lrelu(
                conv3d(layers[-1], ndf * 2, kernel_size=4, strides=(1, 2, 2)),
                0.1))

    with tf.variable_scope("sn_conv1_0"):
        layers.append(
            lrelu(conv3d(layers[-1], ndf * 2, kernel_size=3, strides=1), 0.1))

    with tf.variable_scope("sn_conv1_1"):
        layers.append(
            lrelu(
                conv3d(layers[-1], ndf * 4, kernel_size=4, strides=(1, 2, 2)),
                0.1))

    with tf.variable_scope("sn_conv2_0"):
        layers.append(
            lrelu(conv3d(layers[-1], ndf * 4, kernel_size=3, strides=1), 0.1))

    with tf.variable_scope("sn_conv2_1"):
        layers.append(
            lrelu(conv3d(layers[-1], ndf * 8, kernel_size=4, strides=2), 0.1))

    with tf.variable_scope("sn_conv3_0"):
        layers.append(
            lrelu(conv3d(layers[-1], ndf * 8, kernel_size=3, strides=1), 0.1))

    with tf.variable_scope("sn_fc4"):
        logits = dense(tf.reshape(layers[-1], [batch_size, -1]),
                       1,
                       use_spectral_norm=True)
        layers.append(logits)
    layers = nest.map_structure(tf_utils.transpose_batch_time, layers)
    return layers
Example #10
def image_sn_discriminator(images, ndf=64):
    batch_size = images.shape[0].value
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]

    def conv2d(inputs, *args, **kwargs):
        kwargs.setdefault('padding', 'VALID')
        kwargs.setdefault('use_spectral_norm', True)
        return ops.conv2d(tf.pad(inputs, paddings), *args, **kwargs)

    with tf.variable_scope("sn_conv0_0"):
        layers.append(lrelu(conv2d(images, ndf, kernel_size=3, strides=1),
                            0.1))

    with tf.variable_scope("sn_conv0_1"):
        layers.append(
            lrelu(conv2d(layers[-1], ndf * 2, kernel_size=4, strides=2), 0.1))

    with tf.variable_scope("sn_conv1_0"):
        layers.append(
            lrelu(conv2d(layers[-1], ndf * 2, kernel_size=3, strides=1), 0.1))

    with tf.variable_scope("sn_conv1_1"):
        layers.append(
            lrelu(conv2d(layers[-1], ndf * 4, kernel_size=4, strides=2), 0.1))

    with tf.variable_scope("sn_conv2_0"):
        layers.append(
            lrelu(conv2d(layers[-1], ndf * 4, kernel_size=3, strides=1), 0.1))

    with tf.variable_scope("sn_conv2_1"):
        layers.append(
            lrelu(conv2d(layers[-1], ndf * 8, kernel_size=4, strides=2), 0.1))

    with tf.variable_scope("sn_conv3_0"):
        layers.append(
            lrelu(conv2d(layers[-1], ndf * 8, kernel_size=3, strides=1), 0.1))

    with tf.variable_scope("sn_fc4"):
        logits = dense(tf.reshape(layers[-1], [batch_size, -1]),
                       1,
                       use_spectral_norm=True)
        layers.append(logits)
    return layers
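use_spectral_norm=True presumably constrains each weight to unit spectral norm (spectral normalization). A rough sketch of the usual one-step power-iteration estimate, under the assumption that this is what the flag enables; it is not the library's actual implementation:

# One power-iteration step estimating sigma_max(W) for a weight
# reshaped to [N, out]; u is a persistent [1, out] iterate.
def spectral_norm_step(w, u):
    v = tf.nn.l2_normalize(tf.matmul(u, w, transpose_b=True))    # [1, N]
    u_new = tf.nn.l2_normalize(tf.matmul(v, w))                  # [1, out]
    sigma = tf.matmul(tf.matmul(v, w), u_new, transpose_b=True)  # [1, 1]
    return w / sigma, u_new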
Example #11
def create_legacy_discriminator(discrim_targets,
                                discrim_inputs=None,
                                ndf=64,
                                norm_layer='instance',
                                downsample_layer='conv_pool2d'):
    norm_layer = ops.get_norm_layer(norm_layer)
    downsample_layer = ops.get_downsample_layer(downsample_layer)

    layers = []
    inputs = [discrim_targets]
    if discrim_inputs is not None:
        inputs.append(discrim_inputs)
    inputs = tf.concat(inputs, axis=-1)

    scale_size = min(*inputs.shape.as_list()[1:3])
    if scale_size == 256:
        layer_specs = [
            (ndf, 2),      # layer_1: [batch, 256, 256, in_channels * 2] => [batch, 128, 128, ndf]
            (ndf * 2, 2),  # layer_2: [batch, 128, 128, ndf] => [batch, 64, 64, ndf * 2]
            (ndf * 4, 2),  # layer_3: [batch, 64, 64, ndf * 2] => [batch, 32, 32, ndf * 4]
            (ndf * 8, 1),  # layer_4: [batch, 32, 32, ndf * 4] => [batch, 32, 32, ndf * 8]
            (1, 1),        # layer_5: [batch, 32, 32, ndf * 8] => [batch, 32, 32, 1]
        ]
    elif scale_size == 128:
        layer_specs = [
            (ndf, 2),
            (ndf * 2, 2),
            (ndf * 4, 1),
            (ndf * 8, 1),
            (1, 1),
        ]
    elif scale_size == 64:
        layer_specs = [
            (ndf, 2),
            (ndf * 2, 1),
            (ndf * 4, 1),
            (ndf * 8, 1),
            (1, 1),
        ]
    else:
        raise NotImplementedError

    with tf.variable_scope("layer_1"):
        out_channels, strides = layer_specs[0]
        convolved = downsample_layer(inputs,
                                     out_channels,
                                     kernel_size=4,
                                     strides=strides)
        rectified = lrelu(convolved, 0.2)
        layers.append(rectified)

    for out_channels, strides in layer_specs[1:-1]:
        with tf.variable_scope("layer_%d" % (len(layers) + 1)):
            if strides == 1:
                convolved = conv2d(layers[-1], out_channels, kernel_size=4)
            else:
                convolved = downsample_layer(layers[-1],
                                             out_channels,
                                             kernel_size=4,
                                             strides=strides)
            normalized = norm_layer(convolved)
            rectified = lrelu(normalized, 0.2)
            layers.append(rectified)

    with tf.variable_scope("layer_%d" % (len(layers) + 1)):
        out_channels, strides = layer_specs[-1]
        if strides == 1:
            logits = conv2d(rectified, out_channels, kernel_size=4)
        else:
            logits = downsample_layer(rectified,
                                      out_channels,
                                      kernel_size=4,
                                      strides=strides)
        # don't apply sigmoid to the logits in case we want to use LSGAN
        layers.append(logits)

    return layers
Example #12
def create_generator(generator_inputs,
                     output_nc=3,
                     ngf=64,
                     norm_layer='instance',
                     downsample_layer='conv_pool2d',
                     upsample_layer='upsample_conv2d'):
    norm_layer = ops.get_norm_layer(norm_layer)
    downsample_layer = ops.get_downsample_layer(downsample_layer)
    upsample_layer = ops.get_upsample_layer(upsample_layer)

    layers = []
    inputs = generator_inputs

    scale_size = min(*inputs.shape.as_list()[1:3])
    if scale_size == 256:
        layer_specs = [
            (ngf, 2),      # encoder_1: [batch, 256, 256, in_channels] => [batch, 128, 128, ngf]
            (ngf * 2, 2),  # encoder_2: [batch, 128, 128, ngf] => [batch, 64, 64, ngf * 2]
            (ngf * 4, 2),  # encoder_3: [batch, 64, 64, ngf * 2] => [batch, 32, 32, ngf * 4]
            (ngf * 8, 2),  # encoder_4: [batch, 32, 32, ngf * 4] => [batch, 16, 16, ngf * 8]
            (ngf * 8, 2),  # encoder_5: [batch, 16, 16, ngf * 8] => [batch, 8, 8, ngf * 8]
            (ngf * 8, 2),  # encoder_6: [batch, 8, 8, ngf * 8] => [batch, 4, 4, ngf * 8]
            (ngf * 8, 2),  # encoder_7: [batch, 4, 4, ngf * 8] => [batch, 2, 2, ngf * 8]
            (ngf * 8, 2),  # encoder_8: [batch, 2, 2, ngf * 8] => [batch, 1, 1, ngf * 8]
        ]
    elif scale_size == 128:
        layer_specs = [
            (ngf, 2),
            (ngf * 2, 2),
            (ngf * 4, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
        ]
    elif scale_size == 64:
        layer_specs = [
            (ngf, 2),
            (ngf * 2, 2),
            (ngf * 4, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
        ]
    else:
        raise NotImplementedError

    with tf.variable_scope("encoder_1"):
        out_channels, strides = layer_specs[0]
        if strides == 1:
            output = conv2d(inputs, out_channels, kernel_size=4)
        else:
            output = downsample_layer(inputs,
                                      out_channels,
                                      kernel_size=4,
                                      strides=strides)
        layers.append(output)

    for out_channels, strides in layer_specs[1:]:
        with tf.variable_scope("encoder_%d" % (len(layers) + 1)):
            rectified = lrelu(layers[-1], 0.2)
            # [batch, in_height, in_width, in_channels] => [batch, in_height/2, in_width/2, out_channels]
            if strides == 1:
                convolved = conv2d(rectified, out_channels, kernel_size=4)
            else:
                convolved = downsample_layer(rectified,
                                             out_channels,
                                             kernel_size=4,
                                             strides=strides)
            output = norm_layer(convolved)
            layers.append(output)

    if scale_size == 256:
        layer_specs = [
            (ngf * 8, 2, 0.5),    # decoder_8: [batch, 1, 1, ngf * 8] => [batch, 2, 2, ngf * 8 * 2]
            (ngf * 8, 2, 0.5),    # decoder_7: [batch, 2, 2, ngf * 8 * 2] => [batch, 4, 4, ngf * 8 * 2]
            (ngf * 8, 2, 0.5),    # decoder_6: [batch, 4, 4, ngf * 8 * 2] => [batch, 8, 8, ngf * 8 * 2]
            (ngf * 8, 2, 0.0),    # decoder_5: [batch, 8, 8, ngf * 8 * 2] => [batch, 16, 16, ngf * 8 * 2]
            (ngf * 4, 2, 0.0),    # decoder_4: [batch, 16, 16, ngf * 8 * 2] => [batch, 32, 32, ngf * 4 * 2]
            (ngf * 2, 2, 0.0),    # decoder_3: [batch, 32, 32, ngf * 4 * 2] => [batch, 64, 64, ngf * 2 * 2]
            (ngf, 2, 0.0),        # decoder_2: [batch, 64, 64, ngf * 2 * 2] => [batch, 128, 128, ngf * 2]
            (output_nc, 2, 0.0),  # decoder_1: [batch, 128, 128, ngf * 2] => [batch, 256, 256, output_nc]
        ]
    elif scale_size == 128:
        layer_specs = [
            (ngf * 8, 2, 0.5),
            (ngf * 8, 2, 0.5),
            (ngf * 8, 2, 0.5),
            (ngf * 4, 2, 0.0),
            (ngf * 2, 2, 0.0),
            (ngf, 2, 0.0),
            (output_nc, 2, 0.0),
        ]
    elif scale_size == 64:
        layer_specs = [
            (ngf * 8, 2, 0.5),
            (ngf * 8, 2, 0.5),
            (ngf * 4, 2, 0.0),
            (ngf * 2, 2, 0.0),
            (ngf, 2, 0.0),
            (output_nc, 2, 0.0),
        ]
    else:
        raise NotImplementedError

    num_encoder_layers = len(layers)
    for decoder_layer, (out_channels, stride,
                        dropout) in enumerate(layer_specs[:-1]):
        skip_layer = num_encoder_layers - decoder_layer - 1
        with tf.variable_scope("decoder_%d" % (skip_layer + 1)):
            if decoder_layer == 0:
                # first decoder layer doesn't have skip connections
                # since it is directly connected to the skip_layer
                decoder_input = layers[-1]
            else:
                decoder_input = tf.concat([layers[-1], layers[skip_layer]],
                                          axis=3)

            rectified = tf.nn.relu(decoder_input)
            # [batch, in_height, in_width, in_channels] => [batch, in_height*2, in_width*2, out_channels]
            if stride == 1:
                output = conv2d(rectified, out_channels, kernel_size=4)
            else:
                output = upsample_layer(rectified,
                                        out_channels,
                                        kernel_size=4,
                                        strides=stride)
            output = norm_layer(output)

            if dropout > 0.0:
                output = tf.nn.dropout(output, keep_prob=1 - dropout)

            layers.append(output)

    with tf.variable_scope("decoder_1"):
        out_channels, stride, dropout = layer_specs[-1]
        assert dropout == 0.0  # no dropout at the last layer
        decoder_input = tf.concat([layers[-1], layers[0]], axis=3)
        rectified = tf.nn.relu(decoder_input)
        if stride == 1:
            output = conv2d(rectified, out_channels, kernel_size=4)
        else:
            output = upsample_layer(rectified,
                                    out_channels,
                                    kernel_size=4,
                                    strides=stride)
        output = tf.tanh(output)
        output = (output + 1) / 2
        layers.append(output)

    return layers[-1]
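This is a pix2pix-style U-Net: each decoder layer is concatenated with the mirrored encoder activation, and the tanh output is rescaled from [-1, 1] to [0, 1]. A hedged usage sketch with an illustrative input shape:

# Hypothetical usage: translate 256x256 RGB inputs into RGB outputs in [0, 1].
gen_inputs = tf.placeholder(tf.float32, [None, 256, 256, 3])
gen_outputs = create_generator(gen_inputs, output_nc=3)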