import tensorflow as tf
from tensorflow.python.util import nest

# Project-local helpers, inferred from how they are used below (the module
# paths are assumptions based on the surrounding project layout):
from video_prediction import ops
from video_prediction.ops import conv2d, conv3d, dense, lrelu, pool2d, tile_concat
from video_prediction.utils import tf_utils


def noise(x, use_noise, sigma):
    # additive Gaussian input noise (MoCoGAN-style); a minimal definition
    # consistent with how the discriminators below call it
    if use_noise:
        return x + sigma * tf.random_normal(tf.shape(x), 0, 1)
    return x


def encoder(inputs, nef=64, n_layers=3, norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    with tf.variable_scope("layer_1"):
        convolved = conv2d(tf.pad(inputs, paddings), nef, kernel_size=4, strides=2, padding='VALID')
        rectified = lrelu(convolved, 0.2)
        layers.append(rectified)

    # each subsequent layer halves the spatial resolution and widens the channels (up to 4x)
    for i in range(1, n_layers):
        with tf.variable_scope("layer_%d" % (len(layers) + 1)):
            out_channels = nef * min(2 ** i, 4)
            convolved = conv2d(tf.pad(layers[-1], paddings), out_channels, kernel_size=4, strides=2, padding='VALID')
            normalized = norm_layer(convolved)
            rectified = lrelu(normalized, 0.2)
            layers.append(rectified)

    # global average pooling over the remaining spatial extent
    pooled = pool2d(rectified, rectified.shape.as_list()[1:3], padding='VALID', pool_mode='avg')
    squeezed = tf.squeeze(pooled, [1, 2])
    return squeezed

def create_acvideo_discriminator(clips, actions, ndf=64, norm_layer='instance',
                                 use_noise=False, noise_sigma=None):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    clips = clips * 2 - 1  # rescale from [0, 1] to [-1, 1]
    # pair each frame with its successor and broadcast the action over the spatial dims
    clip_pairs = tf.concat([clips[:-1], clips[1:]], axis=-1)
    clip_pairs = tile_concat([clip_pairs, actions[..., None, None, :]], axis=-1)
    clip_pairs = tf_utils.transpose_batch_time(clip_pairs)

    with tf.variable_scope("acvideo_layer_1"):
        h1 = noise(clip_pairs, use_noise, noise_sigma)
        h1 = conv3d(tf.pad(h1, paddings), ndf, kernel_size=(3, 4, 4), strides=(1, 2, 2),
                    padding='VALID', use_bias=False)
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("acvideo_layer_2"):
        h2 = noise(h1, use_noise, noise_sigma)
        h2 = conv3d(tf.pad(h2, paddings), ndf * 2, kernel_size=(3, 4, 4), strides=(1, 2, 2),
                    padding='VALID', use_bias=False)
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("acvideo_layer_3"):
        h3 = noise(h2, use_noise, noise_sigma)
        h3 = conv3d(tf.pad(h3, paddings), ndf * 4, kernel_size=(3, 4, 4), strides=(1, 2, 2),
                    padding='VALID', use_bias=False)
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("acvideo_layer_4"):
        logits = conv3d(tf.pad(h3, paddings), 1, kernel_size=(3, 4, 4), strides=(1, 2, 2),
                        padding='VALID', use_bias=False)
        layers.append(logits)
    return nest.map_structure(tf_utils.transpose_batch_time, layers)

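
# A minimal shape sketch for create_acvideo_discriminator. The shapes and the
# helper name below are illustrative assumptions, not fixed by this file:
# `clips` is expected time-major with values in [0, 1], and `actions` needs one
# entry per consecutive frame pair, i.e. one fewer time step than `clips`.
def _example_acvideo_discriminator():
    clips = tf.placeholder(tf.float32, [11, 8, 64, 64, 3])  # [T, batch, H, W, C]
    actions = tf.placeholder(tf.float32, [10, 8, 4])        # [T - 1, batch, dim_a]
    with tf.variable_scope('discriminator'):
        features = create_acvideo_discriminator(clips, actions)
    return features[-1]  # time-major patch logits
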
def create_encoder(image, nef=64, norm_layer='instance', dim_z=10):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    with tf.variable_scope("layer_1"):
        h0 = conv2d(tf.pad(image, paddings), nef, kernel_size=4, strides=2, padding='VALID')
        h0 = norm_layer(h0)
        h0 = lrelu(h0, 0.2)
        layers.append(h0)

    with tf.variable_scope("layer_2"):
        h1 = conv2d(tf.pad(h0, paddings), nef * 2, kernel_size=4, strides=2, padding='VALID')
        h1 = norm_layer(h1)
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("layer_3"):
        h2 = conv2d(tf.pad(h1, paddings), nef * 4, kernel_size=4, strides=2, padding='VALID')
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("layer_4"):
        h3 = conv2d(tf.pad(h2, paddings), nef * 8, kernel_size=4, strides=2, padding='VALID')
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("layer_5"):
        h4 = conv2d(tf.pad(h3, paddings), dim_z, kernel_size=4, strides=2, padding='VALID')
        layers.append(h4)

    # global average pooling down to a dim_z-dimensional latent
    pooled = pool2d(h4, h4.shape[1:3].as_list(), padding='VALID', pool_mode='avg')
    squeezed = tf.squeeze(pooled, [1, 2])
    return squeezed

def create_image_discriminator(images, ndf=64, norm_layer='instance',
                               use_noise=False, noise_sigma=None):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    images = images * 2 - 1  # rescale from [0, 1] to [-1, 1]

    with tf.variable_scope("image_layer_1"):
        h1 = noise(images, use_noise, noise_sigma)
        h1 = conv2d(tf.pad(h1, paddings), ndf, kernel_size=4, strides=2, padding='VALID', use_bias=False)
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("image_layer_2"):
        h2 = noise(h1, use_noise, noise_sigma)
        h2 = conv2d(tf.pad(h2, paddings), ndf * 2, kernel_size=4, strides=2, padding='VALID', use_bias=False)
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("image_layer_3"):
        h3 = noise(h2, use_noise, noise_sigma)
        h3 = conv2d(tf.pad(h3, paddings), ndf * 4, kernel_size=4, strides=2, padding='VALID', use_bias=False)
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("image_layer_4"):
        h4 = noise(h3, use_noise, noise_sigma)
        logits = conv2d(tf.pad(h4, paddings), 1, kernel_size=4, strides=2, padding='VALID', use_bias=False)
        layers.append(logits)
    return layers

def create_n_layer_discriminator(discrim_targets, discrim_inputs=None, ndf=64,
                                 n_layers=3, norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []

    inputs = [discrim_targets]
    if discrim_inputs is not None:
        inputs.append(discrim_inputs)
    inputs = tf.concat(inputs, axis=-1)

    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    # layer_1: [batch, 256, 256, in_channels * 2] => [batch, 128, 128, ndf]
    with tf.variable_scope("layer_1"):
        convolved = conv2d(tf.pad(inputs, paddings), ndf, kernel_size=4, strides=2, padding='VALID')
        rectified = lrelu(convolved, 0.2)
        layers.append(rectified)

    # layer_2: [batch, 128, 128, ndf] => [batch, 64, 64, ndf * 2]
    # layer_3: [batch, 64, 64, ndf * 2] => [batch, 32, 32, ndf * 4]
    # layer_4: [batch, 32, 32, ndf * 4] => [batch, 31, 31, ndf * 8]
    for i in range(n_layers):
        with tf.variable_scope("layer_%d" % (len(layers) + 1)):
            out_channels = ndf * min(2 ** (i + 1), 8)
            stride = 1 if i == n_layers - 1 else 2  # last layer here has stride 1
            convolved = conv2d(tf.pad(layers[-1], paddings), out_channels, kernel_size=4,
                               strides=stride, padding='VALID')
            normalized = norm_layer(convolved)
            rectified = lrelu(normalized, 0.2)
            layers.append(rectified)

    # layer_5: [batch, 31, 31, ndf * 8] => [batch, 30, 30, 1]
    with tf.variable_scope("layer_%d" % (len(layers) + 1)):
        logits = conv2d(tf.pad(rectified, paddings), 1, kernel_size=4, strides=1, padding='VALID')
        layers.append(logits)  # don't apply sigmoid to the logits in case we want to use LSGAN
    return layers

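
# A hedged usage sketch for create_n_layer_discriminator: with the default
# n_layers=3 and 256x256 conditional inputs this builds the usual 70x70
# PatchGAN, whose final layer is a [batch, 30, 30, 1] grid of per-patch logits
# (see the per-layer shape comments above). The placeholder shapes and scope
# name are illustrative assumptions.
def _example_patch_discriminator():
    targets = tf.placeholder(tf.float32, [4, 256, 256, 3])
    cond_inputs = tf.placeholder(tf.float32, [4, 256, 256, 3])
    with tf.variable_scope('discriminator'):
        layers = create_n_layer_discriminator(targets, discrim_inputs=cond_inputs)
    return layers[-1]  # patch logits; apply a sigmoid or LSGAN loss externally
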
def create_video_discriminator(clips, ndf=64, norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    clips = tf_utils.transpose_batch_time(clips)

    with tf.variable_scope("video_layer_1"):
        h1 = conv3d(tf.pad(clips, paddings), ndf, kernel_size=4, strides=(1, 2, 2), padding='VALID')
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("video_layer_2"):
        h2 = conv3d(tf.pad(h1, paddings), ndf * 2, kernel_size=4, strides=(1, 2, 2), padding='VALID')
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("video_layer_3"):
        h3 = conv3d(tf.pad(h2, paddings), ndf * 4, kernel_size=4, strides=(1, 2, 2), padding='VALID')
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("video_layer_4"):
        # shrink the temporal kernel if fewer than 4 time steps remain
        if h3.shape[1].value < 4:
            kernel_size = (h3.shape[1].value, 4, 4)
        else:
            kernel_size = 4
        logits = conv3d(h3, 1, kernel_size=kernel_size, strides=1, padding='VALID')
        layers.append(logits)
    return nest.map_structure(tf_utils.transpose_batch_time, layers)

def create_n_layer_encoder(inputs, nz=8, nef=64, n_layers=3,
                           norm_layer='instance', include_top=True):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    with tf.variable_scope("layer_1"):
        convolved = conv2d(tf.pad(inputs, paddings), nef, kernel_size=4, strides=2, padding='VALID')
        rectified = lrelu(convolved, 0.2)
        layers.append(rectified)

    for i in range(1, n_layers):
        with tf.variable_scope("layer_%d" % (len(layers) + 1)):
            out_channels = nef * min(2 ** i, 4)
            convolved = conv2d(tf.pad(layers[-1], paddings), out_channels, kernel_size=4, strides=2, padding='VALID')
            normalized = norm_layer(convolved)
            rectified = lrelu(normalized, 0.2)
            layers.append(rectified)

    # global average pooling over the remaining spatial extent
    pooled = pool2d(rectified, rectified.shape[1:3].as_list(), padding='VALID', pool_mode='avg')
    squeezed = tf.squeeze(pooled, [1, 2])

    if include_top:
        # predict the mean and (clipped) log-variance of the latent distribution
        with tf.variable_scope('z_mu'):
            z_mu = dense(squeezed, nz)
        with tf.variable_scope('z_log_sigma_sq'):
            z_log_sigma_sq = dense(squeezed, nz)
            z_log_sigma_sq = tf.clip_by_value(z_log_sigma_sq, -10, 10)
        outputs = {'enc_zs_mu': z_mu, 'enc_zs_log_sigma_sq': z_log_sigma_sq}
    else:
        outputs = squeezed
    return outputs

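
# The clipped log-variance returned by create_n_layer_encoder is intended for a
# VAE-style reparameterized sample. A minimal sketch of that sampling step (the
# helper name is ours; the 0.5 factor converts log sigma^2 into sigma):
def _sample_latent(outputs):
    z_mu = outputs['enc_zs_mu']
    z_log_sigma_sq = outputs['enc_zs_log_sigma_sq']
    eps = tf.random_normal(tf.shape(z_mu), 0, 1)
    return z_mu + tf.exp(0.5 * z_log_sigma_sq) * eps  # z = mu + sigma * eps
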
# note: a simpler variant without input noise; if kept in the same module as the
# version above, this definition shadows it, so one of the two should be renamed
def create_image_discriminator(images, ndf=64, norm_layer='instance'):
    norm_layer = ops.get_norm_layer(norm_layer)
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    with tf.variable_scope("image_layer_1"):
        h1 = conv2d(tf.pad(images, paddings), ndf, kernel_size=4, strides=2, padding='VALID')
        h1 = lrelu(h1, 0.2)
        layers.append(h1)

    with tf.variable_scope("image_layer_2"):
        h2 = conv2d(tf.pad(h1, paddings), ndf * 2, kernel_size=4, strides=2, padding='VALID')
        h2 = norm_layer(h2)
        h2 = lrelu(h2, 0.2)
        layers.append(h2)

    with tf.variable_scope("image_layer_3"):
        h3 = conv2d(tf.pad(h2, paddings), ndf * 4, kernel_size=4, strides=2, padding='VALID')
        h3 = norm_layer(h3)
        h3 = lrelu(h3, 0.2)
        layers.append(h3)

    with tf.variable_scope("image_layer_4"):
        logits = conv2d(h3, 1, kernel_size=4, strides=1, padding='VALID')
        layers.append(logits)
    return layers

def video_sn_discriminator(clips, ndf=64):
    clips = tf_utils.transpose_batch_time(clips)
    batch_size = clips.shape[0].value
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [1, 1], [0, 0]]  # pad time, height, and width

    def conv3d(inputs, *args, **kwargs):
        # spectrally normalized VALID conv over manually padded inputs
        kwargs.setdefault('padding', 'VALID')
        kwargs.setdefault('use_spectral_norm', True)
        return ops.conv3d(tf.pad(inputs, paddings), *args, **kwargs)

    with tf.variable_scope("sn_conv0_0"):
        layers.append(lrelu(conv3d(clips, ndf, kernel_size=3, strides=1), 0.1))
    with tf.variable_scope("sn_conv0_1"):
        layers.append(lrelu(conv3d(layers[-1], ndf * 2, kernel_size=4, strides=(1, 2, 2)), 0.1))
    with tf.variable_scope("sn_conv1_0"):
        layers.append(lrelu(conv3d(layers[-1], ndf * 2, kernel_size=3, strides=1), 0.1))
    with tf.variable_scope("sn_conv1_1"):
        layers.append(lrelu(conv3d(layers[-1], ndf * 4, kernel_size=4, strides=(1, 2, 2)), 0.1))
    with tf.variable_scope("sn_conv2_0"):
        layers.append(lrelu(conv3d(layers[-1], ndf * 4, kernel_size=3, strides=1), 0.1))
    with tf.variable_scope("sn_conv2_1"):
        layers.append(lrelu(conv3d(layers[-1], ndf * 8, kernel_size=4, strides=2), 0.1))
    with tf.variable_scope("sn_conv3_0"):
        layers.append(lrelu(conv3d(layers[-1], ndf * 8, kernel_size=3, strides=1), 0.1))
    with tf.variable_scope("sn_fc4"):
        logits = dense(tf.reshape(layers[-1], [batch_size, -1]), 1, use_spectral_norm=True)
        layers.append(logits)
    layers = nest.map_structure(tf_utils.transpose_batch_time, layers)
    return layers

def image_sn_discriminator(images, ndf=64):
    batch_size = images.shape[0].value
    layers = []
    paddings = [[0, 0], [1, 1], [1, 1], [0, 0]]  # pad height and width only

    def conv2d(inputs, *args, **kwargs):
        # spectrally normalized VALID conv over manually padded inputs
        kwargs.setdefault('padding', 'VALID')
        kwargs.setdefault('use_spectral_norm', True)
        return ops.conv2d(tf.pad(inputs, paddings), *args, **kwargs)

    with tf.variable_scope("sn_conv0_0"):
        layers.append(lrelu(conv2d(images, ndf, kernel_size=3, strides=1), 0.1))
    with tf.variable_scope("sn_conv0_1"):
        layers.append(lrelu(conv2d(layers[-1], ndf * 2, kernel_size=4, strides=2), 0.1))
    with tf.variable_scope("sn_conv1_0"):
        layers.append(lrelu(conv2d(layers[-1], ndf * 2, kernel_size=3, strides=1), 0.1))
    with tf.variable_scope("sn_conv1_1"):
        layers.append(lrelu(conv2d(layers[-1], ndf * 4, kernel_size=4, strides=2), 0.1))
    with tf.variable_scope("sn_conv2_0"):
        layers.append(lrelu(conv2d(layers[-1], ndf * 4, kernel_size=3, strides=1), 0.1))
    with tf.variable_scope("sn_conv2_1"):
        layers.append(lrelu(conv2d(layers[-1], ndf * 8, kernel_size=4, strides=2), 0.1))
    with tf.variable_scope("sn_conv3_0"):
        layers.append(lrelu(conv2d(layers[-1], ndf * 8, kernel_size=3, strides=1), 0.1))
    with tf.variable_scope("sn_fc4"):
        logits = dense(tf.reshape(layers[-1], [batch_size, -1]), 1, use_spectral_norm=True)
        layers.append(logits)
    return layers

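
# Spectrally normalized discriminators like the two above are commonly trained
# with the SNGAN-style hinge loss rather than sigmoid cross-entropy; a minimal
# sketch under that assumption (this pairing is conventional, not something
# mandated by this file):
def _hinge_discriminator_loss(real_logits, fake_logits):
    loss_real = tf.reduce_mean(tf.nn.relu(1.0 - real_logits))
    loss_fake = tf.reduce_mean(tf.nn.relu(1.0 + fake_logits))
    return loss_real + loss_fake


def _hinge_generator_loss(fake_logits):
    return -tf.reduce_mean(fake_logits)
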
def create_legacy_discriminator(discrim_targets, discrim_inputs=None, ndf=64,
                                norm_layer='instance', downsample_layer='conv_pool2d'):
    norm_layer = ops.get_norm_layer(norm_layer)
    downsample_layer = ops.get_downsample_layer(downsample_layer)
    layers = []

    inputs = [discrim_targets]
    if discrim_inputs is not None:
        inputs.append(discrim_inputs)
    inputs = tf.concat(inputs, axis=-1)

    scale_size = min(*inputs.shape.as_list()[1:3])
    if scale_size == 256:
        layer_specs = [
            (ndf, 2),      # layer_1: [batch, 256, 256, in_channels * 2] => [batch, 128, 128, ndf]
            (ndf * 2, 2),  # layer_2: [batch, 128, 128, ndf] => [batch, 64, 64, ndf * 2]
            (ndf * 4, 2),  # layer_3: [batch, 64, 64, ndf * 2] => [batch, 32, 32, ndf * 4]
            (ndf * 8, 1),  # layer_4: [batch, 32, 32, ndf * 4] => [batch, 32, 32, ndf * 8]
            (1, 1),        # layer_5: [batch, 32, 32, ndf * 8] => [batch, 32, 32, 1]
        ]
    elif scale_size == 128:
        layer_specs = [
            (ndf, 2),
            (ndf * 2, 2),
            (ndf * 4, 1),
            (ndf * 8, 1),
            (1, 1),
        ]
    elif scale_size == 64:
        layer_specs = [
            (ndf, 2),
            (ndf * 2, 1),
            (ndf * 4, 1),
            (ndf * 8, 1),
            (1, 1),
        ]
    else:
        raise NotImplementedError

    with tf.variable_scope("layer_1"):
        out_channels, strides = layer_specs[0]
        convolved = downsample_layer(inputs, out_channels, kernel_size=4, strides=strides)
        rectified = lrelu(convolved, 0.2)
        layers.append(rectified)

    for out_channels, strides in layer_specs[1:-1]:
        with tf.variable_scope("layer_%d" % (len(layers) + 1)):
            if strides == 1:
                convolved = conv2d(layers[-1], out_channels, kernel_size=4)
            else:
                convolved = downsample_layer(layers[-1], out_channels, kernel_size=4, strides=strides)
            normalized = norm_layer(convolved)
            rectified = lrelu(normalized, 0.2)
            layers.append(rectified)

    with tf.variable_scope("layer_%d" % (len(layers) + 1)):
        out_channels, strides = layer_specs[-1]
        if strides == 1:
            logits = conv2d(rectified, out_channels, kernel_size=4)
        else:
            logits = downsample_layer(rectified, out_channels, kernel_size=4, strides=strides)
        layers.append(logits)  # don't apply sigmoid to the logits in case we want to use LSGAN
    return layers

def create_generator(generator_inputs, output_nc=3, ngf=64, norm_layer='instance',
                     downsample_layer='conv_pool2d', upsample_layer='upsample_conv2d'):
    norm_layer = ops.get_norm_layer(norm_layer)
    downsample_layer = ops.get_downsample_layer(downsample_layer)
    upsample_layer = ops.get_upsample_layer(upsample_layer)
    layers = []

    inputs = generator_inputs
    scale_size = min(*inputs.shape.as_list()[1:3])
    if scale_size == 256:
        layer_specs = [
            (ngf, 2),      # encoder_1: [batch, 256, 256, in_channels] => [batch, 128, 128, ngf]
            (ngf * 2, 2),  # encoder_2: [batch, 128, 128, ngf] => [batch, 64, 64, ngf * 2]
            (ngf * 4, 2),  # encoder_3: [batch, 64, 64, ngf * 2] => [batch, 32, 32, ngf * 4]
            (ngf * 8, 2),  # encoder_4: [batch, 32, 32, ngf * 4] => [batch, 16, 16, ngf * 8]
            (ngf * 8, 2),  # encoder_5: [batch, 16, 16, ngf * 8] => [batch, 8, 8, ngf * 8]
            (ngf * 8, 2),  # encoder_6: [batch, 8, 8, ngf * 8] => [batch, 4, 4, ngf * 8]
            (ngf * 8, 2),  # encoder_7: [batch, 4, 4, ngf * 8] => [batch, 2, 2, ngf * 8]
            (ngf * 8, 2),  # encoder_8: [batch, 2, 2, ngf * 8] => [batch, 1, 1, ngf * 8]
        ]
    elif scale_size == 128:
        layer_specs = [
            (ngf, 2),
            (ngf * 2, 2),
            (ngf * 4, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
        ]
    elif scale_size == 64:
        layer_specs = [
            (ngf, 2),
            (ngf * 2, 2),
            (ngf * 4, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
            (ngf * 8, 2),
        ]
    else:
        raise NotImplementedError

    with tf.variable_scope("encoder_1"):
        out_channels, strides = layer_specs[0]
        if strides == 1:
            output = conv2d(inputs, out_channels, kernel_size=4)
        else:
            output = downsample_layer(inputs, out_channels, kernel_size=4, strides=strides)
        layers.append(output)

    for out_channels, strides in layer_specs[1:]:
        with tf.variable_scope("encoder_%d" % (len(layers) + 1)):
            rectified = lrelu(layers[-1], 0.2)
            # [batch, in_height, in_width, in_channels] => [batch, in_height/2, in_width/2, out_channels]
            if strides == 1:
                convolved = conv2d(rectified, out_channels, kernel_size=4)
            else:
                convolved = downsample_layer(rectified, out_channels, kernel_size=4, strides=strides)
            output = norm_layer(convolved)
            layers.append(output)

    if scale_size == 256:
        layer_specs = [
            (ngf * 8, 2, 0.5),    # decoder_8: [batch, 1, 1, ngf * 8] => [batch, 2, 2, ngf * 8 * 2]
            (ngf * 8, 2, 0.5),    # decoder_7: [batch, 2, 2, ngf * 8 * 2] => [batch, 4, 4, ngf * 8 * 2]
            (ngf * 8, 2, 0.5),    # decoder_6: [batch, 4, 4, ngf * 8 * 2] => [batch, 8, 8, ngf * 8 * 2]
            (ngf * 8, 2, 0.0),    # decoder_5: [batch, 8, 8, ngf * 8 * 2] => [batch, 16, 16, ngf * 8 * 2]
            (ngf * 4, 2, 0.0),    # decoder_4: [batch, 16, 16, ngf * 8 * 2] => [batch, 32, 32, ngf * 4 * 2]
            (ngf * 2, 2, 0.0),    # decoder_3: [batch, 32, 32, ngf * 4 * 2] => [batch, 64, 64, ngf * 2 * 2]
            (ngf, 2, 0.0),        # decoder_2: [batch, 64, 64, ngf * 2 * 2] => [batch, 128, 128, ngf * 2]
            (output_nc, 2, 0.0),  # decoder_1: [batch, 128, 128, ngf * 2] => [batch, 256, 256, generator_outputs_channels]
        ]
    elif scale_size == 128:
        layer_specs = [
            (ngf * 8, 2, 0.5),
            (ngf * 8, 2, 0.5),
            (ngf * 8, 2, 0.5),
            (ngf * 4, 2, 0.0),
            (ngf * 2, 2, 0.0),
            (ngf, 2, 0.0),
            (output_nc, 2, 0.0),
        ]
    elif scale_size == 64:
        layer_specs = [
            (ngf * 8, 2, 0.5),
            (ngf * 8, 2, 0.5),
            (ngf * 4, 2, 0.0),
            (ngf * 2, 2, 0.0),
            (ngf, 2, 0.0),
            (output_nc, 2, 0.0),
        ]
    else:
        raise NotImplementedError

    num_encoder_layers = len(layers)
    for decoder_layer, (out_channels, strides, dropout) in enumerate(layer_specs[:-1]):
        skip_layer = num_encoder_layers - decoder_layer - 1
        with tf.variable_scope("decoder_%d" % (skip_layer + 1)):
            if decoder_layer == 0:
                # first decoder layer doesn't have skip connections
                # since it is directly connected to the skip_layer
                input = layers[-1]
            else:
                input = tf.concat([layers[-1], layers[skip_layer]], axis=3)
            rectified = tf.nn.relu(input)
            # [batch, in_height, in_width, in_channels] => [batch, in_height*2, in_width*2, out_channels]
            if strides == 1:
                output = conv2d(rectified, out_channels, kernel_size=4)
            else:
                output = upsample_layer(rectified, out_channels, kernel_size=4, strides=strides)
            output = norm_layer(output)
            if dropout > 0.0:
                output = tf.nn.dropout(output, keep_prob=1 - dropout)
            layers.append(output)

    with tf.variable_scope("decoder_1"):
        out_channels, strides, dropout = layer_specs[-1]
        assert dropout == 0.0  # no dropout at the last layer
        input = tf.concat([layers[-1], layers[0]], axis=3)
        rectified = tf.nn.relu(input)
        if strides == 1:
            output = conv2d(rectified, out_channels, kernel_size=4)
        else:
            output = upsample_layer(rectified, out_channels, kernel_size=4, strides=strides)
        output = tf.tanh(output)
        output = (output + 1) / 2  # rescale tanh output from [-1, 1] to [0, 1]
        layers.append(output)
    return layers[-1]
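

# A rough pix2pix-style wiring of the pieces above, shown only as a sketch: the
# generator maps a conditioning image to an output in [0, 1] (note the rescaled
# final tanh), and the n-layer discriminator scores (target, input) pairs with
# shared weights across the real and fake branches. Shapes and scope names here
# are illustrative assumptions.
def _example_generator_and_discriminator():
    cond_images = tf.placeholder(tf.float32, [4, 256, 256, 3])
    real_images = tf.placeholder(tf.float32, [4, 256, 256, 3])
    with tf.variable_scope('generator'):
        gen_images = create_generator(cond_images)
    with tf.variable_scope('discriminator') as scope:
        real_logits = create_n_layer_discriminator(real_images, cond_images)[-1]
        scope.reuse_variables()
        fake_logits = create_n_layer_discriminator(gen_images, cond_images)[-1]
    return gen_images, real_logits, fake_logits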