def residual_block_layer(inputs, hparams):
  """Applies hparams.num_res_layers residual conv blocks to inputs.

  Each block is built in its own "res_conv_<i>" variable scope. It
  layer-normalizes the running activation, applies a
  res_kernel_size x res_kernel_size conv block followed by a 1x1 conv block,
  and merges the result with the block input through
  common_layers.layer_postprocess (dropout, add and normalize according to
  hparams.layer_postprocess_sequence).

  Args:
    inputs: Tensor of shape [batch_size, height, width, hidden_dim].
    hparams: Hyperparameter object; res_kernel_size, num_res_layers and
      hidden_size are read from it as attributes.

  Returns:
    Tensor of shape [batch_size, height, width, hidden_dim].
  """
  no_dilation = (1, 1)
  unit_stride = (1, 1)
  square_kernel = (hparams.res_kernel_size,) * 2
  out = inputs
  for layer_idx in range(hparams.num_res_layers):
    with tf.variable_scope("res_conv_%d" % layer_idx):
      normed = common_layers.layer_norm(
          out, hparams.hidden_size, name="lnorm")
      # kernel_size x kernel_size conv block.
      branch = common_layers.conv_block(
          normed,
          hparams.hidden_size,
          [(no_dilation, square_kernel)],
          strides=unit_stride,
          padding="SAME",
          name="residual_conv")
      # 1x1 conv block.
      branch = common_layers.conv_block(
          branch,
          hparams.hidden_size,
          [(no_dilation, (1, 1))],
          strides=unit_stride,
          padding="SAME",
          name="residual_dense")
      out = common_layers.layer_postprocess(out, branch, hparams)
  return out
def residual_block_layer(inputs, hparams):
  """Residual block over inputs.

  Stacks hparams.num_res_layers blocks. Inside its own "res_conv_<i>" scope,
  every block runs
    layer norm -> conv (res_kernel_size x res_kernel_size) -> conv (1x1)
  and then combines the block input with the conv output via
  common_layers.layer_postprocess (dropout, add and normalize according to
  hparams.layer_postprocess_sequence).

  Args:
    inputs: Tensor of shape [batch, height, width, hparams.hidden_size].
    hparams: tf.contrib.training.HParams.

  Returns:
    Tensor of shape [batch, height, width, hparams.hidden_size].
  """
  # (conv block name, kernel size), in the order the conv blocks are applied.
  conv_specs = (
      ("residual_conv", (hparams.res_kernel_size, hparams.res_kernel_size)),
      ("residual_dense", (1, 1)),
  )
  x = inputs
  for i in range(hparams.num_res_layers):
    with tf.variable_scope("res_conv_%d" % i):
      y = common_layers.layer_norm(x, hparams.hidden_size, name="lnorm")
      for conv_name, kernel in conv_specs:
        y = common_layers.conv_block(
            y,
            hparams.hidden_size,
            [((1, 1), kernel)],
            strides=(1, 1),
            padding="SAME",
            name=conv_name)
      x = common_layers.layer_postprocess(x, y, hparams)
  return x
def bottom(self, inputs):
  """Transform input from data space to model space.

  Performs the Xception "Entry flow": the standardized images go through two
  conv blocks ("conv0", strided by 2, then "conv1") followed by three
  residually connected separable convolution blocks ("block0".."block2").

  Args:
    inputs: A Tensor with shape [batch, ...]

  Returns:
    body_input: A Tensor with shape [batch, ?, ?, body_input_depth].
  """
  conv3x3 = ((1, 1), (3, 3))
  with tf.variable_scope(self.name):

    def xnet_resblock(x, filters, res_relu, name):
      # Separable-conv branch, max-pooled with stride 2, added to a strided
      # 1x1 conv shortcut computed from the block input.
      with tf.variable_scope(name):
        main = common_layers.separable_conv_block(
            x,
            filters,
            [conv3x3, conv3x3],
            first_relu=True,
            padding="SAME",
            force2d=True,
            name="sep_conv_block")
        main = common_layers.pool(main, (3, 3), "MAX", "SAME", strides=(2, 2))
        shortcut = common_layers.conv_block(
            x,
            filters,
            [((1, 1), (1, 1))],
            padding="SAME",
            strides=(2, 2),
            first_relu=res_relu,
            force2d=True,
            name="res_conv0")
        return main + shortcut

    images = common_layers.standardize_images(inputs)
    # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet.
    # tf.summary.image("inputs", inputs, max_outputs=2)
    net = common_layers.conv_block(
        images,
        32,
        [conv3x3],
        first_relu=False,
        padding="SAME",
        strides=(2, 2),
        force2d=True,
        name="conv0")
    net = common_layers.conv_block(
        net, 64, [conv3x3], padding="SAME", force2d=True, name="conv1")
    # The first two blocks are capped at 128 and 256 filters respectively.
    net = xnet_resblock(net, min(128, self._body_input_depth), True, "block0")
    net = xnet_resblock(net, min(256, self._body_input_depth), False, "block1")
    return xnet_resblock(net, self._body_input_depth, False, "block2")
def bottom_compress(self, inputs, name="bottom"):
  """Transform input from data space to model space.

  Converts RGB pixel values to real numbers, folds the last axis into the
  width axis (leaving a single channel), and then merges every group of three
  neighbouring values with a (1, 3) conv block strided by (1, 3).

  Args:
    inputs: A Tensor with shape [batch, ...]
    name: string, scope.

  Returns:
    body_input: A Tensor with shape [batch, ?, ?, body_input_depth].
  """
  with tf.variable_scope(name):
    real_valued = common_layers.convert_rgb_to_real(inputs)
    dims = common_layers.shape_list(real_valued)
    flattened = tf.reshape(real_valued, [-1, dims[1], dims[2] * dims[3], 1])
    flattened.set_shape([None, None, None, 1])
    # We compress RGB intensities for each pixel using a conv.
    return common_layers.conv_block(
        flattened,
        self._body_input_depth,
        [((1, 1), (1, 3))],
        first_relu=False,
        padding="VALID",
        strides=(1, 3),
        force2d=True,
        name="conv_input")
def bytenet_internal(inputs, targets, hparams):
  """ByteNet, main step used for training.

  The flattened inputs are zero-padded by 50% of their length, padded together
  with targets to a common length divisible by 50, and encoded with "SAME"
  padded residual dilated convolutions. The encoder output is concatenated
  with the right-shifted targets on the channel axis and decoded with "LEFT"
  padded convolutions.

  Args:
    inputs: Input tensor; flattened with common_layers.flatten4d3d.
    targets: Target tensor.
    hparams: Hyperparameters; num_block_repeat, kernel_height, kernel_width
      and hidden_size are read here.

  Returns:
    The output of the decoder stack.
  """
  with tf.variable_scope("bytenet"):
    # Flatten inputs and extend length by 50%.
    flat = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
    extra_length = tf.to_int32(0.5 * tf.to_float(tf.shape(flat)[1]))
    static_shape = flat.shape.as_list()
    padded = tf.pad(flat, [[0, 0], [0, extra_length], [0, 0], [0, 0]])
    # Don't lose the other shapes when padding: only the length is unknown.
    static_shape[1] = None
    padded.set_shape(static_shape)
    # Pad inputs and targets to be the same length, divisible by 50.
    padded, targets = common_layers.pad_to_same_length(
        padded, targets, final_length_divisible_by=50)
    encoded = residual_dilated_conv(
        padded, hparams.num_block_repeat, "SAME", "encoder", hparams)

    shifted = common_layers.shift_right(targets)
    kernel = (hparams.kernel_height, hparams.kernel_width)
    decoder_input = tf.concat([encoded, shifted], axis=3)
    decoder_start = common_layers.conv_block(
        decoder_input, hparams.hidden_size, [((1, 1), kernel)], padding="LEFT")

    return residual_dilated_conv(
        decoder_start, hparams.num_block_repeat, "LEFT", "decoder", hparams)
def slicenet_internal(inputs, targets, target_space, hparams, run_decoder=True):
  """The slicenet model, main step used for training.

  Args:
    inputs: Input tensor; projected to hparams.hidden_size channels with a
      3x3 conv block when its static last dimension differs.
    targets: Target tensor, passed on to slicenet_middle.
    target_space: Target space identifier, embedded via embed_target_space.
    hparams: Hyperparameters; hidden_size and num_hidden_layers are read here.
    run_decoder: If False, stop after the encoder.

  Returns:
    The encoder output alone when run_decoder is False; otherwise a tuple
    (decoder output, mean similarity loss).
  """
  with tf.variable_scope("slicenet"):
    # Project to hidden size if necessary
    channels = inputs.get_shape().as_list()[-1]
    if channels != hparams.hidden_size:
      inputs = common_layers.conv_block(
          inputs,
          hparams.hidden_size,
          [((1, 1), (3, 3))],
          first_relu=False,
          padding="SAME",
          force2d=True)
    # Flatten inputs and encode.
    flat = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
    inputs_mask = 1.0 - embedding_to_padding(flat)
    flat = common_layers.add_timing_signal(flat)  # Add position info.
    target_space_emb = embed_target_space(target_space, hparams.hidden_size)
    # The encoder is 1.5x as deep as the decoder.
    encoder_layers = int(hparams.num_hidden_layers * 1.5)
    inputs_encoded = multi_conv_res(
        flat, "SAME", "encoder", encoder_layers, hparams, mask=inputs_mask)
    if not run_decoder:
      return inputs_encoded
    # Do the middle part.
    decoder_start, similarity_loss = slicenet_middle(
        inputs_encoded, targets, target_space_emb, inputs_mask, hparams)
    # Decode.
    decoder_final = multi_conv_res(
        decoder_start,
        "LEFT",
        "decoder",
        hparams.num_hidden_layers,
        hparams,
        mask=inputs_mask,
        source=inputs_encoded)
    mean_similarity_loss = tf.reduce_mean(similarity_loss)
    return decoder_final, mean_similarity_loss
def project_to_hidden(inputs):
  """Projects inputs to hparams.hidden_size channels with a 3x3 conv block.

  hparams is not a parameter of this function; it is resolved from the
  enclosing scope.

  Args:
    inputs: Tensor to project.

  Returns:
    The output of the "SAME" padded conv block (no leading ReLU).
  """
  dilation_and_kernel = ((1, 1), (3, 3))
  projected = common_layers.conv_block(
      inputs,
      hparams.hidden_size,
      [dilation_and_kernel],
      first_relu=False,
      padding="SAME",
      force2d=True)
  return projected
def bottom_compress(self, inputs, name="bottom"):
  """Transform input from data space to model space.

  RGB pixel values are converted to real numbers, the last axis is merged
  into the width axis so that a single channel remains, and a (1, 3) conv
  block with stride (1, 3) then combines each consecutive triple of values
  into body_input_depth features.

  Args:
    inputs: A Tensor with shape [batch, ...]
    name: string, scope.

  Returns:
    body_input: A Tensor with shape [batch, ?, ?, body_input_depth].
  """
  with tf.variable_scope(name):
    pixels = common_layers.convert_rgb_to_real(inputs)
    pixel_shape = common_layers.shape_list(pixels)
    merged_shape = [-1, pixel_shape[1], pixel_shape[2] * pixel_shape[3], 1]
    pixels = tf.reshape(pixels, merged_shape)
    pixels.set_shape([None, None, None, 1])
    # We compress RGB intensities for each pixel using a conv.
    compressed = common_layers.conv_block(
        pixels,
        self._body_input_depth,
        [((1, 1), (1, 3))],
        first_relu=False,
        padding="VALID",
        strides=(1, 3),
        force2d=True,
        name="conv_input")
    return compressed
def project_to_hidden(inputs):
  """Runs a 3x3 "SAME" conv block mapping inputs to hparams.hidden_size.

  Note that hparams comes from the enclosing scope, not from the arguments.

  Args:
    inputs: Tensor to project.

  Returns:
    The conv block output; no ReLU is applied before the convolution.
  """
  conv_kwargs = dict(first_relu=False, padding="SAME", force2d=True)
  return common_layers.conv_block(
      inputs, hparams.hidden_size, [((1, 1), (3, 3))], **conv_kwargs)
def slicenet_internal(inputs, targets, target_space, hparams, run_decoder=True):
  """The slicenet model, main step used for training.

  Args:
    inputs: Input tensor; projected to hparams.model_d channels with a 3x3
      conv block when its static last dimension differs.
    targets: Target tensor, passed on to slicenet_middle.
    target_space: Target space identifier, embedded via embed_target_space.
    hparams: Hyperparameters; model_d and num_hidden_layers are read here.
    run_decoder: If False, stop after the encoder.

  Returns:
    The encoder output alone when run_decoder is False; otherwise a tuple
    (decoder output, mean similarity loss).
  """
  with tf.variable_scope("slicenet"):
    # Project to hidden size if necessary
    needs_projection = inputs.get_shape().as_list()[-1] != hparams.model_d
    if needs_projection:
      inputs = common_layers.conv_block(
          inputs,
          hparams.model_d,
          [((1, 1), (3, 3))],
          first_relu=False,
          padding="SAME",
          force2d=True)
    # Flatten inputs and encode.
    encoder_in = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
    inputs_mask = 1.0 - embedding_to_padding(encoder_in)
    encoder_in = common_layers.add_timing_signal(encoder_in)  # Position info.
    target_space_emb = embed_target_space(target_space, hparams.model_d)
    # The encoder is 1.5x as deep as the decoder.
    extra_layers = int(hparams.num_hidden_layers * 1.5)
    inputs_encoded = multi_conv_res(
        encoder_in, "SAME", "encoder", extra_layers, hparams, mask=inputs_mask)
    if not run_decoder:
      return inputs_encoded
    # Do the middle part.
    decoder_start, similarity_loss = slicenet_middle(
        inputs_encoded, targets, target_space_emb, inputs_mask, hparams)
    # Decode.
    decoder_final = multi_conv_res(
        decoder_start,
        "LEFT",
        "decoder",
        hparams.num_hidden_layers,
        hparams,
        mask=inputs_mask,
        source=inputs_encoded)
    return decoder_final, tf.reduce_mean(similarity_loss)
def encode(self, inputs, target_space, hparams, features=None, losses=None):
  """Add two layers strided convolutions ontop of encode.

  The inputs pass through a 3x3 conv block ("small_image_conv") and through
  transformer_vae.compress in 2-D mode with num_compress_steps forced to 2;
  the result is handed to the parent class's encode.

  Args:
    inputs: Input tensor.
    target_space: Forwarded to the parent encode.
    hparams: Hyperparameters. NOTE: num_compress_steps is overwritten in
      place on this object.
    features: Forwarded to the parent encode.
    losses: Forwarded to the parent encode.

  Returns:
    Whatever the parent class's encode returns for the compressed inputs.
  """
  projected = common_layers.conv_block(
      inputs,
      hparams.hidden_size,
      [((1, 1), (3, 3))],
      first_relu=False,
      padding="SAME",
      force2d=True,
      name="small_image_conv")

  # Mutates the caller's hparams; compress reads the step count from it.
  hparams.num_compress_steps = 2
  compressed_inputs = transformer_vae.compress(
      projected, None, is_2d=True, hparams=hparams, name="convolutions")

  parent = super(TransformerSketch, self)
  return parent.encode(
      compressed_inputs,
      target_space,
      hparams,
      features=features,
      losses=losses)
def bytenet_internal(inputs, targets, hparams):
  """ByteNet, main step used for training.

  Encoder: flattened inputs, zero-padded by half their length and then padded
  with targets to a shared length divisible by 50, go through "SAME" padded
  residual dilated convolutions. Decoder: the encoder output concatenated
  (channel axis) with the right-shifted targets goes through a "LEFT" padded
  conv block and "LEFT" padded residual dilated convolutions.

  Args:
    inputs: Input tensor; flattened with common_layers.flatten4d3d.
    targets: Target tensor.
    hparams: Hyperparameters; num_block_repeat, kernel_height, kernel_width
      and hidden_size are read here.

  Returns:
    The output of the decoder stack.
  """
  repeats = hparams.num_block_repeat
  with tf.variable_scope("bytenet"):
    # Flatten inputs and extend length by 50%.
    enc_in = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
    half_length = tf.to_int32(0.5 * tf.to_float(tf.shape(enc_in)[1]))
    known_shape = enc_in.shape.as_list()
    enc_in = tf.pad(enc_in, [[0, 0], [0, half_length], [0, 0], [0, 0]])
    known_shape[1] = None
    enc_in.set_shape(known_shape)  # Don't lose the other shapes when padding.
    # Pad inputs and targets to be the same length, divisible by 50.
    enc_in, targets = common_layers.pad_to_same_length(
        enc_in, targets, final_length_divisible_by=50)
    final_encoder = residual_dilated_conv(
        enc_in, repeats, "SAME", "encoder", hparams)

    shifted_targets = common_layers.shift_right(targets)
    kernel = (hparams.kernel_height, hparams.kernel_width)
    decoder_start = common_layers.conv_block(
        tf.concat([final_encoder, shifted_targets], axis=3),
        hparams.hidden_size,
        [((1, 1), kernel)],
        padding="LEFT")

    decoded = residual_dilated_conv(
        decoder_start, repeats, "LEFT", "decoder", hparams)
    return decoded
def ae_compress(x, is_2d, hparams, name, reuse=None):
  """Compress, then AE.

  After compress and a 1x1 conv block ("mid_conv"), one of three bottlenecks
  is applied depending on hparams: a VAE (bit_vae or vae) when do_vae is set,
  dae when use_gumbel_softmax is set, and k-means over the "z_to_dense"
  variable otherwise.

  Args:
    x: Tensor to compress.
    is_2d: Forwarded to compress.
    hparams: Hyperparameters.
    name: string, variable scope.
    reuse: Passed to tf.variable_scope.

  Returns:
    A tuple (cur, hot, loss): the compressed state, the bottleneck output and
    the bottleneck loss.
  """
  with tf.variable_scope(name, reuse=reuse):
    cur = compress(x, None, is_2d, hparams, "compress")
    # Convolve and ReLu to get state.
    cur = common_layers.conv_block(
        cur, hparams.hidden_size, [((1, 1), (1, 1))], name="mid_conv")
    if hparams.do_vae:
      means_size = hparams.z_size
    else:
      means_size = hparams.v_size
    # Created before branching, so the variable exists in every mode.
    means = tf.get_variable("z_to_dense", [means_size, hparams.hidden_size])

    if hparams.do_vae:
      if hparams.bit_vae:
        hot, loss = bit_vae(cur, hparams, "bvae")
      else:
        hot, loss, _, _ = vae(cur, hparams.z_size, "vae")
      return cur, hot, loss

    if hparams.use_gumbel_softmax:
      _, hot, loss = dae(cur, hparams, "dae")
      return cur, hot, loss

    # Using k-means part. L2-normalizing to use fast cosine distance.
    normalized = tf.nn.l2_normalize(cur, dim=3)
    cur = mix(
        normalized, cur, hparams.startup_steps // 3, mode="exp", simple=True)
    # Only a kmeans_lr_factor fraction of the gradient flows back into cur.
    lr_factor = hparams.kmeans_lr_factor
    cur_n = lr_factor * cur + (1.0 - lr_factor) * tf.stop_gradient(cur)
    hot, loss = kmeans(cur_n, means, hparams, name="kmeans")
    # We need a linear layer to undo the l2-normalization.
    cur = tf.layers.dense(cur, hparams.hidden_size, name="unnormalize")
    return cur, hot, loss
def vae_transformer_internal(inputs, targets, target_space, hparams):
  """VAE Transformer, main step used for training.

  Args:
    inputs: Input tensor; flattened with common_layers.flatten4d3d.
    targets: Target tensor; flattened with common_layers.flatten4d3d.
    target_space: Forwarded to encode.
    hparams: Hyperparameters; hidden_size, num_compress_steps and
      kl_warmup_steps are read here.

  Returns:
    A tuple (output, losses): the decoded tensor with an axis of size 1
    inserted at position 2, and a dict holding the warmed-up KL loss under
    the key "kl".
  """
  with tf.variable_scope("vae_transformer"):
    # Prepare inputs, targets, and k.
    flat_inputs = common_layers.flatten4d3d(inputs)
    input_len = tf.shape(flat_inputs)[1]  # Double input size to cover targets.
    flat_inputs = tf.pad(flat_inputs, [[0, 0], [0, input_len], [0, 0]])
    flat_inputs.set_shape([None, None, hparams.hidden_size])
    flat_targets = common_layers.flatten4d3d(targets)
    k = 2**hparams.num_compress_steps
    flat_inputs, flat_targets = common_layers.pad_to_same_length(
        flat_inputs, flat_targets, final_length_divisible_by=k)
    encoded = encode(flat_inputs, target_space, hparams, "input_enc")

    # Compress and vae.
    z, kl_loss, _, _ = vae_compress(
        tf.expand_dims(flat_targets, axis=2),
        tf.expand_dims(encoded, axis=2),
        hparams,
        "vae_compress",
        "vae_decompress")

    # Join z with inputs, run decoder.
    joined = tf.concat([z, tf.expand_dims(encoded, axis=2)], axis=3)
    to_decode = common_layers.conv_block(
        joined, hparams.hidden_size, [((1, 1), (1, 1))], name="join_z")
    ret = encode(tf.squeeze(to_decode, axis=2), target_space, hparams, "dec")

    # For experiments with one-sided decoder:
    # decoder_in = tf.squeeze(to_decode, axis=2)
    # (decoder_input, decoder_self_attention_bias) = (
    #     transformer.transformer_prepare_decoder(decoder_in, hparams))
    # ret = transformer.transformer_decoder(
    #     decoder_input, inputs, decoder_self_attention_bias, None, hparams)

    # Scale the KL term by the warm-up schedule times 3.
    kl_weight = common_layers.inverse_exp_decay(hparams.kl_warmup_steps) * 3.0
    kl_loss *= kl_weight
    return tf.expand_dims(ret, axis=2), {"kl": kl_loss}
def testConvBlock(self):
  """conv_block with two 3x3 layers maps 11 input channels to 13 filters."""
  batch = np.random.rand(5, 7, 1, 11)
  two_3x3_layers = [(1, (3, 3)), (1, (3, 3))]
  out = common_layers.conv_block(
      tf.constant(batch, dtype=tf.float32),
      13,
      two_3x3_layers,
      padding="SAME",
      normalizer_fn=common_layers.noam_norm)
  self.evaluate(tf.global_variables_initializer())
  actual = self.evaluate(out)
  # "SAME" padding keeps the leading dims; only the channel count changes.
  self.assertEqual(actual.shape, (5, 7, 1, 13))
def xception_entry(inputs, hidden_dim):
  """Xception entry flow.

  Logs an image summary of the inputs, applies two conv blocks ("conv0",
  strided by 2, then "conv1") and three residual separable-conv blocks whose
  filter counts are min(128, hidden_dim), min(256, hidden_dim) and
  hidden_dim.

  Args:
    inputs: Image tensor.
    hidden_dim: Number of filters of the final block.

  Returns:
    The output of the last residual block ("block2").
  """
  conv3x3 = ((1, 1), (3, 3))
  with tf.variable_scope("xception_entry"):

    def xnet_resblock(x, filters, res_relu, name):
      """Resblock."""
      with tf.variable_scope(name):
        main = common_layers.separable_conv_block(
            x,
            filters,
            [conv3x3, conv3x3],
            first_relu=True,
            padding="SAME",
            force2d=True,
            name="sep_conv_block")
        main = common_layers.pool(main, (3, 3), "MAX", "SAME", strides=(2, 2))
        # Strided 1x1 conv so the shortcut is downsampled like the pool.
        shortcut = common_layers.conv_block(
            x,
            filters,
            [((1, 1), (1, 1))],
            padding="SAME",
            strides=(2, 2),
            first_relu=res_relu,
            force2d=True,
            name="res_conv0")
        return main + shortcut

    tf.summary.image("inputs", inputs, max_outputs=2)
    net = common_layers.conv_block(
        inputs,
        32,
        [conv3x3],
        first_relu=False,
        padding="SAME",
        strides=(2, 2),
        force2d=True,
        name="conv0")
    net = common_layers.conv_block(
        net, 64, [conv3x3], padding="SAME", force2d=True, name="conv1")
    capped_stages = (("block0", 128, True), ("block1", 256, False))
    for stage_name, filter_cap, res_relu in capped_stages:
      net = xnet_resblock(
          net, min(filter_cap, hidden_dim), res_relu, stage_name)
    return xnet_resblock(net, hidden_dim, False, "block2")
def xception_entry(inputs, hidden_dim):
  """Xception entry flow over standardized images.

  The inputs are standardized and passed through two conv blocks ("conv0",
  strided by 2, then "conv1") and three residual separable-conv blocks with
  min(128, hidden_dim), min(256, hidden_dim) and hidden_dim filters.

  Args:
    inputs: Image tensor.
    hidden_dim: Number of filters of the final block.

  Returns:
    The output of the last residual block ("block2").
  """
  with tf.variable_scope("xception_entry"):

    def xnet_resblock(x, filters, res_relu, name):
      # Separable convs + stride-2 max pool, added to a strided 1x1 shortcut.
      with tf.variable_scope(name):
        sep_layers = [((1, 1), (3, 3)), ((1, 1), (3, 3))]
        pooled = common_layers.pool(
            common_layers.separable_conv_block(
                x,
                filters,
                sep_layers,
                first_relu=True,
                padding="SAME",
                force2d=True,
                name="sep_conv_block"),
            (3, 3),
            "MAX",
            "SAME",
            strides=(2, 2))
        residual = common_layers.conv_block(
            x,
            filters,
            [((1, 1), (1, 1))],
            padding="SAME",
            strides=(2, 2),
            first_relu=res_relu,
            force2d=True,
            name="res_conv0")
        return pooled + residual

    standardized = common_layers.standardize_images(inputs)
    # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet.
    # tf.summary.image("inputs", inputs, max_outputs=2)
    h = common_layers.conv_block(
        standardized,
        32,
        [((1, 1), (3, 3))],
        first_relu=False,
        padding="SAME",
        strides=(2, 2),
        force2d=True,
        name="conv0")
    h = common_layers.conv_block(
        h, 64, [((1, 1), (3, 3))], padding="SAME", force2d=True, name="conv1")
    h = xnet_resblock(h, min(128, hidden_dim), True, "block0")
    h = xnet_resblock(h, min(256, hidden_dim), False, "block1")
    h = xnet_resblock(h, hidden_dim, False, "block2")
    return h
def testConvBlock(self):
  """Checks the output shape of a two-layer, noam-normalized conv_block."""
  input_shape = (5, 7, 1, 11)
  num_filters = 13
  x = tf.constant(np.random.rand(*input_shape), dtype=tf.float32)
  y = common_layers.conv_block(
      x,
      num_filters,
      [(1, (3, 3)), (1, (3, 3))],
      padding="SAME",
      normalizer_fn=common_layers.noam_norm)
  self.evaluate(tf.global_variables_initializer())
  res = self.evaluate(y)
  # Same leading dims as the input, with num_filters channels.
  self.assertEqual(res.shape, (5, 7, 1, 13))
def decompress(source, hparams, name):
  """Decompression function.

  A 1x1 conv block ("decompress_conv") widens source to
  2 * hparams.hidden_size channels; the result is then reshaped so that axis 1
  is twice as long and the last axis is hparams.hidden_size again.

  Args:
    source: Tensor to decompress.
    hparams: Hyperparameters; hidden_size is read here.
    name: string, variable scope.

  Returns:
    Tensor reshaped to [shape[0], 2 * shape[1], 1, hparams.hidden_size], where
    shape is the dynamic shape of source.
  """
  with tf.variable_scope(name):
    source_shape = tf.shape(source)
    widened = common_layers.conv_block(
        source,
        hparams.hidden_size * 2,
        [((1, 1), (1, 1))],
        name="decompress_conv")
    doubled_shape = [
        source_shape[0], source_shape[1] * 2, 1, hparams.hidden_size
    ]
    return tf.reshape(widened, doubled_shape)
def decompress_step(source, c, hparams, first_relu, name):
  """Decompression function.

  Optionally attends to c, widens the result to 2 * hparams.hidden_size
  channels with a 1x1 conv block, and reshapes it so that axis 1 doubles.

  Args:
    source: Tensor to decompress.
    c: Context to attend to, or None to skip the attention step.
    hparams: Hyperparameters; hidden_size is read here.
    first_relu: Forwarded to the conv block.
    name: string, variable scope.

  Returns:
    Tensor reshaped to [shape[0], 2 * shape[1], 1, hparams.hidden_size], where
    shape is the dynamic shape of the original source.
  """
  with tf.variable_scope(name):
    # Taken before attention, i.e. from the tensor that was passed in.
    source_shape = tf.shape(source)
    attended = source
    if c is not None:
      attended = attend(source, c, hparams, "decompress_attend")
    widened = common_layers.conv_block(
        attended,
        hparams.hidden_size * 2,
        [((1, 1), (1, 1))],
        first_relu=first_relu,
        name="decompress_conv")
    doubled_shape = [
        source_shape[0], source_shape[1] * 2, 1, hparams.hidden_size
    ]
    return tf.reshape(widened, doubled_shape)
def compress_vae(inputs, hparams, name):
  """Compress, then VAE.

  Inserts an axis at position 2, runs hparams.num_compress_steps conv blocks
  with kernel (2, 1) and strides (2, 1), applies a 1x1 conv block
  ("mid_conv") and feeds the result to vae.

  Args:
    inputs: Tensor to compress.
    hparams: Hyperparameters; num_compress_steps and hidden_size are read
      here, and the object is forwarded to vae.
    name: string, variable scope.

  Returns:
    A tuple (cur, kl_loss) as returned by vae.
  """
  with tf.variable_scope(name):
    # Run compression by strided convs.
    cur = tf.expand_dims(inputs, axis=2)
    # `range` rather than `xrange`: it exists on both Python 2 and 3.
    for i in range(hparams.num_compress_steps):
      cur = common_layers.conv_block(
          cur,
          hparams.hidden_size,
          [((1, 1), (2, 1))],
          strides=(2, 1),
          name="compress_%d" % i)
    # Convolve and ReLu to get state.
    cur = common_layers.conv_block(
        cur, hparams.hidden_size, [((1, 1), (1, 1))], name="mid_conv")
    cur, kl_loss = vae(cur, hparams, name="vae")
    return cur, kl_loss
def bottom(self, inputs):
  """Standardizes the images and applies a single 3x3 conv block.

  Args:
    inputs: Image tensor.

  Returns:
    The output of the "small_image_conv" block, with body_input_depth filters.
  """
  with tf.variable_scope(self.name):
    images = common_layers.standardize_images(inputs)
    tf.summary.image("inputs", images, max_outputs=2)
    body_input = common_layers.conv_block(
        images,
        self._body_input_depth,
        [((1, 1), (3, 3))],
        first_relu=False,
        padding="SAME",
        force2d=True,
        name="small_image_conv")
    return body_input
def decompress_step(source, hparams, first_relu, is_2d, name):
  """Decompression function.

  A 1x1 conv block ("decompress_conv") widens source to
  hparams.hidden_size * 4 channels in the 2-D case or * 2 otherwise. In the
  2-D case the channels are moved into space with tf.depth_to_space (block
  size 2); otherwise the result is reshaped so that axis 1 doubles.

  Args:
    source: Tensor to decompress.
    hparams: Hyperparameters; hidden_size is read here.
    first_relu: Forwarded to the conv block.
    is_2d: Whether to upsample both spatial axes instead of axis 1 only.
    name: string, variable scope.

  Returns:
    The upsampled tensor. In the 1-D case its shape is
    [shape[0], 2 * shape[1], 1, hparams.hidden_size].
  """
  with tf.variable_scope(name):
    shape = common_layers.shape_list(source)
    multiplier = 4 if is_2d else 2
    # The kernel is 1x1 in both the 1-D and 2-D case (the former conditional
    # had identical branches).
    kernel = (1, 1)
    thicker = common_layers.conv_block(
        source,
        hparams.hidden_size * multiplier,
        [((1, 1), kernel)],
        first_relu=first_relu,
        name="decompress_conv")
    if is_2d:
      return tf.depth_to_space(thicker, 2)
    return tf.reshape(thicker, [shape[0], shape[1] * 2, 1, hparams.hidden_size])
def decompress_step(source, hparams, first_relu, is_2d, name):
  """Decompression function.

  Widens source with a 1x1 conv block ("decompress_conv") by a factor of 4
  (2-D) or 2 (1-D) relative to hparams.hidden_size, then trades the extra
  channels for resolution: tf.depth_to_space with block size 2 in the 2-D
  case, a reshape that doubles axis 1 otherwise.

  Args:
    source: Tensor to decompress.
    hparams: Hyperparameters; hidden_size is read here.
    first_relu: Forwarded to the conv block.
    is_2d: Whether to upsample both spatial axes instead of axis 1 only.
    name: string, variable scope.

  Returns:
    The upsampled tensor. In the 1-D case its shape is
    [shape[0], 2 * shape[1], 1, hparams.hidden_size].
  """
  with tf.variable_scope(name):
    shape = common_layers.shape_list(source)
    multiplier = 4 if is_2d else 2
    # Both dimensionalities use a 1x1 kernel, so no conditional is needed.
    kernel = (1, 1)
    thicker = common_layers.conv_block(
        source,
        hparams.hidden_size * multiplier,
        [((1, 1), kernel)],
        first_relu=first_relu,
        name="decompress_conv")
    if is_2d:
      return tf.depth_to_space(thicker, 2)
    return tf.reshape(thicker, [shape[0], shape[1] * 2, 1, hparams.hidden_size])
def decompress_step(source, c, hparams, first_relu, name):
  """Decompression function.

  Optionally attends to c, then computes two conv outputs: "decompress_conv1"
  from the source, and "decompress_conv2" from the source concatenated with
  the first output. The two are combined with interleave and reshaped so
  that axis 1 doubles.

  Args:
    source: Tensor to decompress.
    c: Context to attend to, or None to skip the attention step.
    hparams: Hyperparameters; hidden_size is read here.
    first_relu: Forwarded to both conv blocks.
    name: string, variable scope.

  Returns:
    Tensor reshaped to [shape[0], 2 * shape[1], 1, hparams.hidden_size], where
    shape is the dynamic shape of the original source.
  """
  two_3x1_layers = [((1, 1), (3, 1)), ((1, 1), (3, 1))]
  with tf.variable_scope(name):
    source_shape = tf.shape(source)
    if c is not None:
      source = attend(source, c, hparams, "decompress_attend")
    first_half = common_layers.conv_block(
        source,
        hparams.hidden_size,
        two_3x1_layers,
        first_relu=first_relu,
        padding="SAME",
        name="decompress_conv1")
    # The second half also sees the first half's output.
    second_input = tf.concat([source, first_half], axis=3)
    second_half = common_layers.conv_block(
        second_input,
        hparams.hidden_size,
        two_3x1_layers,
        first_relu=first_relu,
        padding="SAME",
        name="decompress_conv2")
    combined = interleave(first_half, second_half)
    doubled_shape = [
        source_shape[0], source_shape[1] * 2, 1, hparams.hidden_size
    ]
    return tf.reshape(combined, doubled_shape)
def compress(x, c, hparams, name):
  """Compress.

  Runs hparams.num_compress_steps steps. Each step optionally attends to c,
  applies one residual_conv repeat and then a conv block with kernel (2, 1)
  and strides (2, 1).

  Args:
    x: Tensor to compress.
    c: Context to attend to, or None to skip the attention steps.
    hparams: Hyperparameters; num_compress_steps and hidden_size are read
      here.
    name: string, variable scope.

  Returns:
    The compressed tensor.
  """
  with tf.variable_scope(name):
    # Run compression by strided convs.
    cur = x
    # `range` rather than `xrange`: it exists on both Python 2 and 3.
    for i in range(hparams.num_compress_steps):
      if c is not None:
        cur = attend(cur, c, hparams, "compress_attend_%d" % i)
      cur = residual_conv(cur, 1, hparams, "compress_rc_%d" % i)
      cur = common_layers.conv_block(
          cur,
          hparams.hidden_size,
          [((1, 1), (2, 1))],
          strides=(2, 1),
          name="compress_%d" % i)
    return cur
def decompress_step(source, hparams, first_relu, name):
  """Decompression function.

  A 1x1 conv block ("decompress_conv") widens source to 2 * hparams.model_d
  channels, and the result is reshaped so that axis 1 doubles.

  Args:
    source: Tensor to decompress.
    hparams: Hyperparameters; model_d is read here.
    first_relu: Forwarded to the conv block.
    name: string, variable scope.

  Returns:
    Tensor of shape [shape[0], 2 * shape[1], 1, hparams.model_d], where shape
    is the shape list of source.
  """
  with tf.variable_scope(name):
    source_shape = common_layers.shape_list(source)
    widened = common_layers.conv_block(
        source,
        hparams.model_d * 2,
        [((1, 1), (1, 1))],
        first_relu=first_relu,
        name="decompress_conv")
    batch, length = source_shape[0], source_shape[1]
    return tf.reshape(widened, [batch, length * 2, 1, hparams.model_d])
def compress(x, is_2d, hparams, name):
  """Compress.

  Applies residual_conv ("rc") with hparams.num_compress_steps repeats and a
  3x3 (2-D) or 3x1 (1-D) kernel, followed by num_compress_steps conv blocks
  whose kernel and strides are both (2, 2) in the 2-D case or (2, 1)
  otherwise.

  Args:
    x: Tensor to compress.
    is_2d: Whether to compress both spatial axes instead of axis 1 only.
    hparams: Hyperparameters; num_compress_steps and hidden_size are read
      here.
    name: string, variable scope.

  Returns:
    The compressed tensor.
  """
  with tf.variable_scope(name):
    # Run compression by strided convs.
    cur = x
    k1 = (3, 3) if is_2d else (3, 1)
    cur = residual_conv(cur, hparams.num_compress_steps, k1, hparams, "rc")
    k2 = (2, 2) if is_2d else (2, 1)
    # `range` rather than `xrange`: it exists on both Python 2 and 3.
    for i in range(hparams.num_compress_steps):
      cur = common_layers.conv_block(
          cur,
          hparams.hidden_size,
          [((1, 1), k2)],
          strides=k2,
          name="compress_%d" % i)
    return cur
def compress(x, is_2d, hparams, name):
  """Compress.

  First runs residual_conv ("rc") with hparams.num_compress_steps repeats,
  using a 3x3 kernel in the 2-D case and 3x1 otherwise. Then applies
  num_compress_steps conv blocks ("compress_<i>") whose kernel equals their
  strides: (2, 2) in the 2-D case, (2, 1) otherwise.

  Args:
    x: Tensor to compress.
    is_2d: Whether to compress both spatial axes instead of axis 1 only.
    hparams: Hyperparameters; num_compress_steps and hidden_size are read
      here.
    name: string, variable scope.

  Returns:
    The compressed tensor.
  """
  with tf.variable_scope(name):
    # Run compression by strided convs.
    cur = x
    k1 = (3, 3) if is_2d else (3, 1)
    cur = residual_conv(cur, hparams.num_compress_steps, k1, hparams, "rc")
    k2 = (2, 2) if is_2d else (2, 1)
    # Builtin `range` works on Python 2 and 3; `xrange` is Python 2 only.
    for i in range(hparams.num_compress_steps):
      cur = common_layers.conv_block(
          cur,
          hparams.hidden_size,
          [((1, 1), k2)],
          strides=k2,
          name="compress_%d" % i)
    return cur
def compress_encoder(inputs,
                     hparams,
                     strides=(2, 2),
                     kernel_size=(3, 3),
                     name=None):
  """Encoder that compresses 2-D inputs by 2**num_compress_steps.

  Args:
    inputs: Tensor of shape [batch, height, width, channels].
    hparams: HParams.
    strides: Tuple, strides for conv block.
    kernel_size: Tuple, kernel window size for conv block.
    name: string, variable scope. When None, the scope falls back to
      "compress" and the dense layer is named "compress_dense".

  Returns:
    Tensor of shape [batch, latent_length, hparams.hidden_size], where
    latent_length is
    hparams.num_latents * (height*width) / 2**(hparams.num_compress_steps).
  """
  default_name = "compress"
  with tf.variable_scope(name, default_name=default_name):
    x = inputs
    for i in range(hparams.num_compress_steps // 2):
      with tf.variable_scope("compress_conv_%d" % i):
        y = common_layers.conv_block(
            common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"),
            hparams.hidden_size,
            dilation_rates_and_kernel_sizes=[((1, 1), kernel_size)],
            strides=strides,
            padding="SAME",
            name="compress_conv_%d" % i)
        y = tf.nn.dropout(y, 1.0 - hparams.dropout)
        if hparams.do_compress_attend:
          # NOTE(review): this replaces the conv/dropout output above with
          # self-attention over x plus a residual; confirm that discarding
          # the strided conv result in this mode is intended.
          y = compress_self_attention_layer(
              x, hparams, name="compress_selfatt_%d" % i)
          y += x
        x = y

    x = residual_block_layer(x, hparams)

    # If using multiple copies of latents, blow up the hidden size and then
    # reshape to increase by num_latents.
    shape_x = common_layers.shape_list(x)
    # `name` defaults to None, so `name + "_dense"` used to raise TypeError;
    # fall back to the scope's default name in that case.
    dense_name = (name if name is not None else default_name) + "_dense"
    x = tf.layers.dense(
        x, hparams.num_latents * hparams.hidden_size, name=dense_name)
    return tf.reshape(x, [
        shape_x[0], shape_x[1] * shape_x[2] * hparams.num_latents,
        hparams.hidden_size
    ])
def xception_entry(inputs, hidden_dim):
  """Xception entry flow.

  Emits an image summary of the inputs, then runs two conv blocks ("conv0",
  strided by 2, and "conv1") followed by the residual blocks "block0",
  "block1" and "block2" with min(128, hidden_dim), min(256, hidden_dim) and
  hidden_dim filters.

  Args:
    inputs: Image tensor.
    hidden_dim: Number of filters of the final block.

  Returns:
    The output of "block2".
  """
  with tf.variable_scope("xception_entry"):

    def xnet_resblock(x, filters, res_relu, name):
      """Resblock."""
      with tf.variable_scope(name):
        branch = common_layers.separable_conv_block(
            x,
            filters,
            [((1, 1), (3, 3)), ((1, 1), (3, 3))],
            first_relu=True,
            padding="SAME",
            force2d=True,
            name="sep_conv_block")
        branch = common_layers.pool(
            branch, (3, 3), "MAX", "SAME", strides=(2, 2))
        skip = common_layers.conv_block(
            x,
            filters,
            [((1, 1), (1, 1))],
            padding="SAME",
            strides=(2, 2),
            first_relu=res_relu,
            force2d=True,
            name="res_conv0")
        return branch + skip

    tf.summary.image("inputs", inputs, max_outputs=2)
    stem = common_layers.conv_block(
        inputs,
        32,
        [((1, 1), (3, 3))],
        first_relu=False,
        padding="SAME",
        strides=(2, 2),
        force2d=True,
        name="conv0")
    stem = common_layers.conv_block(
        stem,
        64,
        [((1, 1), (3, 3))],
        padding="SAME",
        force2d=True,
        name="conv1")
    out = xnet_resblock(stem, min(128, hidden_dim), True, "block0")
    out = xnet_resblock(out, min(256, hidden_dim), False, "block1")
    out = xnet_resblock(out, hidden_dim, False, "block2")
    return out
def xception_entry(inputs, hidden_dim):
  """Xception entry flow over standardized images.

  Standardizes the inputs, applies the conv blocks "conv0" (strided by 2) and
  "conv1", then three residual separable-conv blocks whose filter counts are
  min(128, hidden_dim), min(256, hidden_dim) and hidden_dim.

  Args:
    inputs: Image tensor.
    hidden_dim: Number of filters of the final block.

  Returns:
    The output of "block2".
  """
  conv3x3 = ((1, 1), (3, 3))
  with tf.variable_scope("xception_entry"):

    def xnet_resblock(x, filters, res_relu, name):
      # Separable convs + stride-2 max pool, added to a strided 1x1 shortcut.
      with tf.variable_scope(name):
        y = common_layers.separable_conv_block(
            x,
            filters,
            [conv3x3, conv3x3],
            first_relu=True,
            padding="SAME",
            force2d=True,
            name="sep_conv_block")
        y = common_layers.pool(y, (3, 3), "MAX", "SAME", strides=(2, 2))
        shortcut = common_layers.conv_block(
            x,
            filters,
            [((1, 1), (1, 1))],
            padding="SAME",
            strides=(2, 2),
            first_relu=res_relu,
            force2d=True,
            name="res_conv0")
        return y + shortcut

    inputs = common_layers.standardize_images(inputs)
    # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet.
    # tf.summary.image("inputs", inputs, max_outputs=2)
    net = common_layers.conv_block(
        inputs,
        32,
        [conv3x3],
        first_relu=False,
        padding="SAME",
        strides=(2, 2),
        force2d=True,
        name="conv0")
    net = common_layers.conv_block(
        net, 64, [conv3x3], padding="SAME", force2d=True, name="conv1")
    capped_stages = (("block0", 128, True), ("block1", 256, False))
    for stage_name, filter_cap, res_relu in capped_stages:
      net = xnet_resblock(
          net, min(filter_cap, hidden_dim), res_relu, stage_name)
    return xnet_resblock(net, hidden_dim, False, "block2")
def compress_encoder(inputs,
                     hparams,
                     strides=(2, 2),
                     kernel_size=(3, 3),
                     name=None):
  """Encoder that compresses 2-D inputs by 2**num_compress_steps.

  Args:
    inputs: Tensor of shape [batch, height, width, channels].
    hparams: tf.contrib.training.HParams.
    strides: Tuple, strides for conv block.
    kernel_size: Tuple, kernel window size for conv block.
    name: string, variable scope. When None, the scope falls back to
      "compress" and the dense layer is named "compress_dense".

  Returns:
    Tensor of shape [batch, latent_length, hparams.hidden_size], where
    latent_length is
    hparams.num_latents * (height*width) / 2**(hparams.num_compress_steps).
  """
  default_name = "compress"
  with tf.variable_scope(name, default_name=default_name):
    x = inputs
    for i in range(hparams.num_compress_steps // 2):
      with tf.variable_scope("compress_conv_%d" % i):
        y = common_layers.conv_block(
            common_layers.layer_norm(
                x, hparams.hidden_size, name="lnorm"),
            hparams.hidden_size,
            dilation_rates_and_kernel_sizes=[((1, 1), kernel_size)],
            strides=strides,
            padding="SAME",
            name="compress_conv_%d" % i)
        y = tf.nn.dropout(y, 1.0 - hparams.dropout)
        if hparams.do_compress_attend:
          # NOTE(review): the conv/dropout output above is overwritten here by
          # self-attention over x plus a residual; confirm this is intended.
          y = compress_self_attention_layer(
              x, hparams, name="compress_selfatt_%d" % i)
          y += x
        x = y

    x = residual_block_layer(x, hparams)

    # If using multiple copies of latents, blow up the hidden size and then
    # reshape to increase by num_latents.
    shape_x = common_layers.shape_list(x)
    # With the default name=None, `name + "_dense"` raised TypeError; use the
    # scope's default name as the prefix in that case.
    dense_name = (name if name is not None else default_name) + "_dense"
    x = tf.layers.dense(
        x, hparams.num_latents * hparams.hidden_size, name=dense_name)
    return tf.reshape(x, [
        shape_x[0], shape_x[1] * shape_x[2] * hparams.num_latents,
        hparams.hidden_size
    ])
def residual_conv(x, repeat, k, hparams, name, reuse=None):
  """A stack of convolution blocks with residual connections.

  Each of the `repeat` blocks ("repeat_<i>") layer-normalizes x, applies a
  three-layer conv block with kernel k and no dilation, applies dropout, and
  adds the result back onto x.

  Args:
    x: Input tensor.
    repeat: Number of residual blocks.
    k: Kernel size used by every conv layer.
    hparams: Hyperparameters; hidden_size and dropout are read here.
    name: string, variable scope.
    reuse: Passed to tf.variable_scope.

  Returns:
    x after all residual blocks.
  """
  keep_prob = 1.0 - hparams.dropout
  with tf.variable_scope(name, reuse=reuse):
    three_undilated_layers = [((1, 1), k)] * 3
    for block_idx in range(repeat):
      with tf.variable_scope("repeat_%d" % block_idx):
        normed = common_layers.layer_norm(
            x, hparams.hidden_size, name="lnorm")
        update = common_layers.conv_block(
            normed,
            hparams.hidden_size,
            three_undilated_layers,
            padding="SAME",
            name="residual_conv")
        update = tf.nn.dropout(update, keep_prob)
        x += update
    return x
def residual_conv(x, repeat, k, hparams, name, reuse=None):
  """A stack of convolution blocks with residual connections.

  Each of the `repeat` blocks ("repeat_<i>") layer-normalizes x, applies a
  three-layer conv block with kernel k and no dilation, applies dropout, and
  adds the result back onto x.

  Args:
    x: Input tensor.
    repeat: Number of residual blocks.
    k: Kernel size used by every conv layer.
    hparams: Hyperparameters; hidden_size and dropout are read here.
    name: string, variable scope.
    reuse: Passed to tf.variable_scope.

  Returns:
    x after all residual blocks.
  """
  with tf.variable_scope(name, reuse=reuse):
    # `range` rather than `xrange`: it exists on both Python 2 and 3.
    dilations_and_kernels = [((1, 1), k) for _ in range(3)]
    for i in range(repeat):
      with tf.variable_scope("repeat_%d" % i):
        y = common_layers.conv_block(
            common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"),
            hparams.hidden_size,
            dilations_and_kernels,
            padding="SAME",
            name="residual_conv")
        y = tf.nn.dropout(y, 1.0 - hparams.dropout)
        x += y
    return x
def compress_encoder(inputs,
                     hparams,
                     strides=(2, 2),
                     kernel=(3, 3),
                     name="compress"):
  """Encoder that compresses inputs to length/2**num_compress_steps.

  Runs hparams.num_compress_steps // 2 strided conv blocks (each preceded by
  layer norm and followed by dropout), then residual_block_layer, then a
  dense layer producing num_latents * hidden_size features that are reshaped
  into num_latents separate positions.

  Args:
    inputs: Tensor of shape [batch, height, width, hidden_dim].
    hparams: Hyperparameter object; num_compress_steps, hidden_size, dropout
      and num_latents are read from it as attributes.
    strides: Tuple, strides for conv block.
    kernel: Tuple, kernel window size for conv block.
    name: string, variable scope.

  Returns:
    Tensor of shape [batch, h * w * num_latents, hidden_size], where h and w
    are the spatial dims after compression.
  """
  keep_prob = 1.0 - hparams.dropout
  with tf.variable_scope(name):
    net = inputs
    # Compress conv layers with strides and kernels as passed to the function.
    for step in range(hparams.num_compress_steps // 2):
      with tf.variable_scope("compress_conv_%d" % step):
        normed = common_layers.layer_norm(
            net, hparams.hidden_size, name="lnorm")
        conv_out = common_layers.conv_block(
            normed,
            hparams.hidden_size,
            [((1, 1), kernel)],
            strides=strides,
            padding="SAME",
            name="compress_conv_%d" % step)
        net = tf.nn.dropout(conv_out, keep_prob)

    # Residual blocks.
    net = residual_block_layer(net, hparams)

    # If using multiple copies of latents, blow up the hidden size and then
    # reshape to increase by num_latents.
    net_shape = common_layers.shape_list(net)
    net = tf.layers.dense(
        net, hparams.num_latents * hparams.hidden_size, name=name + "_dense")
    latent_length = net_shape[1] * net_shape[2] * hparams.num_latents
    return tf.reshape(net, [net_shape[0], latent_length, hparams.hidden_size])
def ae_compress(x, is_2d, hparams, name, reuse=None):
  """Compress, then AE.

  After compress and a 1x1 conv block ("mid_conv"), one of three bottlenecks
  is applied depending on hparams: a VAE (bit_vae or vae, optionally followed
  by a stochastic second-level VAE) when do_vae is set, dae when
  use_gumbel_softmax is set, and k-means over the "z_to_dense" variable
  otherwise.

  Args:
    x: Tensor to compress.
    is_2d: Forwarded to compress.
    hparams: Hyperparameters.
    name: string, variable scope.
    reuse: Passed to tf.variable_scope.

  Returns:
    A tuple (cur, hot, loss): the compressed state, the bottleneck output and
    the bottleneck loss.
  """
  with tf.variable_scope(name, reuse=reuse):
    cur = compress(x, None, is_2d, hparams, "compress")
    # Convolve and ReLu to get state.
    cur = common_layers.conv_block(cur, hparams.hidden_size,
                                   [((1, 1), (1, 1))], name="mid_conv")
    means_size = hparams.z_size if hparams.do_vae else hparams.v_size
    # Created before branching; only the k-means path below reads `means`.
    means = tf.get_variable("z_to_dense", [means_size, hparams.hidden_size])
    if hparams.do_vae:
      if hparams.bit_vae:
        hot, loss = bit_vae(cur, hparams, "bvae")
      else:
        hot, loss, _, _ = vae(cur, hparams.z_size, "vae")
      # NOTE(review): the original indentation was lost; this block is placed
      # at the do_vae level (applies after either VAE variant) -- confirm.
      # Do a second level vae with some probability.
      if hparams.z_size2 > 0:
        prob_z2 = common_layers.inverse_exp_decay(
            hparams.startup_steps * 2) * 0.8
        # Outside of training the second level is always applied.
        if hparams.mode != tf.contrib.learn.ModeKeys.TRAIN:
          prob_z2 = 1.0
        def vae2():
          # Reads `hot` from the enclosing scope; tf.cond calls this before
          # `hot` is rebound below.
          hot2, loss2, _, _ = vae(hot, hparams.z_size2, "vae2")
          ret = tf.layers.dense(hot2, hparams.z_size)
          return mix(ret, hot, hparams.startup_steps * 2), loss2
        # With probability prob_z2 use the second-level output, otherwise keep
        # `hot` unchanged with a zero extra loss.
        hot, loss2 = tf.cond(tf.less(tf.random_uniform([]), prob_z2), vae2,
                             lambda: (hot, tf.constant(0.0)))
        loss += loss2 * 0.1
      return cur, hot, loss
    if hparams.use_gumbel_softmax:
      _, hot, loss = dae(cur, hparams, "dae")
      return cur, hot, loss
    # Using k-means part. L2-normalizing to use fast cosine distance.
    cur = mix(tf.nn.l2_normalize(cur, dim=3), cur,
              hparams.startup_steps // 3, mode="exp", simple=True)
    # Only a kmeans_lr_factor fraction of the gradient flows back into cur.
    cur_n = hparams.kmeans_lr_factor * cur
    cur_n += (1.0 - hparams.kmeans_lr_factor) * tf.stop_gradient(cur)
    hot, loss = kmeans(cur_n, means, hparams, name="kmeans")
    # We need a linear layer to undo the l2-normalization.
    cur = tf.layers.dense(cur, hparams.hidden_size, name="unnormalize")
    return cur, hot, loss
def encode(self, inputs, target_space, hparams):
  """Add two layers strided convolutions ontop of encode.

  The inputs pass through a 3x3 conv block ("small_image_conv") and through
  transformer_vae.compress in 2-D mode with num_compress_steps forced to 2;
  the result is handed to the parent class's encode.

  Args:
    inputs: Input tensor.
    target_space: Forwarded to the parent encode.
    hparams: Hyperparameters. NOTE: num_compress_steps is overwritten in
      place on this object.

  Returns:
    Whatever the parent class's encode returns for the compressed inputs.
  """
  image_features = common_layers.conv_block(
      inputs,
      hparams.hidden_size,
      [((1, 1), (3, 3))],
      first_relu=False,
      padding="SAME",
      force2d=True,
      name="small_image_conv")

  # Mutates the caller's hparams; compress reads the step count from it.
  hparams.num_compress_steps = 2
  compressed_inputs = transformer_vae.compress(
      image_features, is_2d=True, hparams=hparams, name="convolutions")

  base = super(TransformerSketch, self)
  return base.encode(compressed_inputs, target_space, hparams)
def ae_compress(x, is_2d, hparams, name, reuse=None):
  """Compress, then AE.

  Compresses x, applies a 1x1 conv block ("mid_conv"), L2-normalizes the
  result along axis 3 and runs kmeans against the "z_to_dense" variable. The
  normalized state finally goes through a dense layer ("unnormalize").

  Args:
    x: Tensor to compress.
    is_2d: Forwarded to compress.
    hparams: Hyperparameters; hidden_size, kmeans_lr_factor and v_size are
      read here.
    name: string, variable scope.
    reuse: Passed to tf.variable_scope.

  Returns:
    A tuple (cur, hot, loss): the dense-projected state plus the two values
    returned by kmeans.
  """
  with tf.variable_scope(name, reuse=reuse):
    state = compress(x, None, is_2d, hparams, "compress")
    # Convolve and ReLu to get state.
    state = common_layers.conv_block(
        state, hparams.hidden_size, [((1, 1), (1, 1))], name="mid_conv")
    state = tf.nn.l2_normalize(state, dim=3)
    # Only a kmeans_lr_factor fraction of the gradient flows back into state.
    lr_factor = hparams.kmeans_lr_factor
    scaled_state = (
        lr_factor * state + (1.0 - lr_factor) * tf.stop_gradient(state))
    means = tf.get_variable(
        "z_to_dense", [hparams.v_size, hparams.hidden_size])
    hot, loss = kmeans(scaled_state, means, hparams, name="kmeans")
    # We need a linear layer to undo the l2-normalization.
    state = tf.layers.dense(state, hparams.hidden_size, name="unnormalize")
    return state, hot, loss
def residual_dilated_conv(x, repeat, padding, name, hparams):
  """A stack of convolution blocks with residual connections.

  Every block ("repeat_<i>") layer-normalizes x, applies a conv block of
  hparams.num_hidden_layers layers whose dilation along the first axis is
  1, 2, 4, ... (2**layer), applies dropout and adds the result onto x.

  Args:
    x: Input tensor.
    repeat: Number of residual blocks.
    padding: Padding mode forwarded to the conv block.
    name: string, variable scope.
    hparams: Hyperparameters; kernel_height, kernel_width, num_hidden_layers,
      hidden_size and dropout are read here.

  Returns:
    x after all residual blocks.
  """
  with tf.variable_scope(name):
    kernel = (hparams.kernel_height, hparams.kernel_width)
    layer_specs = []
    for layer_idx in range(hparams.num_hidden_layers):
      layer_specs.append(((2**layer_idx, 1), kernel))
    for block_idx in range(repeat):
      with tf.variable_scope("repeat_%d" % block_idx):
        normed = common_layers.layer_norm(
            x, hparams.hidden_size, name="lnorm")
        update = common_layers.conv_block(
            normed,
            hparams.hidden_size,
            layer_specs,
            padding=padding,
            name="residual_conv")
        update = tf.nn.dropout(update, 1.0 - hparams.dropout)
        x += update
    return x
def residual_dilated_conv(x, repeat, padding, name, hparams):
  """A stack of convolution blocks with residual connections.

  Each of the `repeat` blocks layer-normalizes `x`, applies one `conv_block`
  with `hparams.num_hidden_layers` layers whose dilation along the first
  spatial axis is 1, 2, 4, ..., applies dropout and adds the result to `x`.

  Args:
    x: input tensor; also the residual accumulator.
    repeat: number of residual blocks to stack.
    padding: forwarded to `common_layers.conv_block`.
    name: name of the enclosing variable scope.
    hparams: hyperparameters object; reads `kernel_height`, `kernel_width`,
      `num_hidden_layers`, `hidden_size` and `dropout`.

  Returns:
    `x` after all residual blocks have been added.
  """
  with tf.variable_scope(name):
    k = (hparams.kernel_height, hparams.kernel_width)
    # `range` instead of `xrange`: works on both Python 2 and 3 without
    # relying on a six.moves import, and these counts are small.
    dilations_and_kernels = [((2**i, 1), k)
                             for i in range(hparams.num_hidden_layers)]
    for i in range(repeat):
      with tf.variable_scope("repeat_%d" % i):
        y = common_layers.conv_block(
            common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"),
            hparams.hidden_size,
            dilations_and_kernels,
            padding=padding,
            name="residual_conv")
        # tf.nn.dropout takes the keep probability, hence 1 - dropout.
        y = tf.nn.dropout(y, 1.0 - hparams.dropout)
        x += y
    return x
def bottom(self, inputs):
  """Standardize the input images and apply one 3x3 conv block.

  The conv block uses stride (2, 2) when
  `self._model_hparams.compress_steps > 0` and stride (1, 1) otherwise.

  Args:
    inputs: image tensor passed to `common_layers.standardize_images`.

  Returns:
    The output of the "small_image_conv" block, built with
    `self._body_input_depth` as its filter argument.
  """
  with tf.variable_scope(self.name):
    inputs = common_layers.standardize_images(inputs)
    # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet.
    # tf.summary.image("inputs", inputs, max_outputs=2)
    compressing = self._model_hparams.compress_steps > 0
    strides = (2, 2) if compressing else (1, 1)
    return common_layers.conv_block(
        inputs,
        self._body_input_depth,
        [((1, 1), (3, 3))],
        first_relu=False,
        strides=strides,
        padding="SAME",
        force2d=True,
        name="small_image_conv")
def xnet_resblock(x, filters, res_relu, name):
  """Separable-conv block with a strided 1x1 conv shortcut.

  The main path is two 3x3 separable convolutions followed by a 3x3 max pool
  with stride (2, 2); the shortcut is a 1x1 conv block with stride (2, 2)
  applied to `x`. The two paths are summed.

  Args:
    x: input tensor fed to both paths.
    filters: filter argument passed to both convolution blocks.
    res_relu: passed as `first_relu` to the shortcut conv block.
    name: name of the variable scope for this block.

  Returns:
    The sum of the pooled main path and the shortcut.
  """
  with tf.variable_scope(name):
    sep_conv_spec = [((1, 1), (3, 3)), ((1, 1), (3, 3))]
    main_path = common_layers.separable_conv_block(
        x,
        filters,
        sep_conv_spec,
        first_relu=True,
        padding="SAME",
        force2d=True,
        name="sep_conv_block")
    main_path = common_layers.pool(
        main_path, (3, 3), "MAX", "SAME", strides=(2, 2))
    shortcut = common_layers.conv_block(
        x,
        filters,
        [((1, 1), (1, 1))],
        padding="SAME",
        strides=(2, 2),
        first_relu=res_relu,
        force2d=True,
        name="res_conv0")
    return main_path + shortcut
def xnet_resblock(x, filters, res_relu, name):
  """Residual block: separable convs + max pool, plus a strided 1x1 conv.

  Args:
    x: input tensor; used by the separable-conv path and by the shortcut.
    filters: filter argument passed to both convolution blocks.
    res_relu: passed as `first_relu` to the shortcut ("res_conv0") block.
    name: name of the variable scope for this block.

  Returns:
    Pooled separable-conv output plus the shortcut conv output.
  """
  stride = (2, 2)
  three_by_three = ((1, 1), (3, 3))
  one_by_one = ((1, 1), (1, 1))
  with tf.variable_scope(name):
    # Main path: two 3x3 separable convs, then a strided 3x3 max pool.
    y = common_layers.separable_conv_block(
        x,
        filters,
        [three_by_three, three_by_three],
        first_relu=True,
        padding="SAME",
        force2d=True,
        name="sep_conv_block")
    y = common_layers.pool(y, (3, 3), "MAX", "SAME", strides=stride)
    # Shortcut: 1x1 conv with the same stride as the pool.
    residual = common_layers.conv_block(
        x,
        filters,
        [one_by_one],
        padding="SAME",
        strides=stride,
        first_relu=res_relu,
        force2d=True,
        name="res_conv0")
    return y + residual
def vae_compress(x, c, hparams, compress_name, decompress_name, reuse=None):
  """Compress, then VAE.

  Compresses `x`, builds a latent `z` by mixing the outputs of `dvae` and
  `top_k_experts`, compares it against a prediction made from the compressed
  context `c`, and decompresses `z` again.

  Args:
    x: input forwarded to `compress`.
    c: context; compressed separately and also passed to every
      `decompress_step`.
    hparams: hyperparameters object; reads `hidden_size`, `v_size` and
      `num_compress_steps` here and is passed to the helpers.
    compress_name: variable scope name for the compression part.
    decompress_name: variable scope name for the decompression part.
    reuse: forwarded to every `tf.variable_scope`.

  Returns:
    A tuple `(z, loss, mu, log_sigma)` where `z` is the decompressed tensor,
    `loss` is `kl_loss + 0.0001 * reconstruct_loss`, and `mu` / `log_sigma`
    are always `None` (kept so the return arity matches the plain-VAE
    variant referenced in the comment below).
  """
  mix_k = 8
  with tf.variable_scope(compress_name, reuse=reuse):
    cur = compress(x, None, hparams, "compress")
    # Convolve and ReLu to get state.
    cur = common_layers.conv_block(
        cur, hparams.hidden_size, [((1, 1), (1, 1))], name="mid_conv")
    # z, kl_loss, mu, log_sigma = vae(cur, hparams, name="vae")
    z, kl_loss = dvae(cur, None, hparams, name="dvae")
    z1, kl_loss1 = top_k_experts(cur, mix_k, hparams)
    mu, log_sigma = None, None

    # Mix expert-selection and flat selection.
    alpha_p = common_layers.inverse_lin_decay(60000) + 0.001
    z = alpha_p * z1 + (1 - alpha_p) * z
    kl_loss += kl_loss1

  # Compress context.
  # The scope is deliberately re-entered rather than merged with the block
  # above: re-entering keeps the same variable scope but opens a fresh name
  # scope for the ops.
  with tf.variable_scope(compress_name, reuse=reuse):
    compress_c = compress(c, None, hparams, "compress_context")
    c_z = tf.layers.dense(compress_c, hparams.v_size, name="mask_context")
    reconstruct_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=z, logits=c_z)

  # If not training, use the predicted z instead of the autoregressive one.
  # if hparams.mode != tf.contrib.learn.ModeKeys.TRAIN:
  # z = mix(c_z, z, 50000, max_prob=0.3, mode="exp")
  # z, _ = top_k_softmax(c_z, mix_k)

  with tf.variable_scope(decompress_name, reuse=reuse):
    # Decompress.
    z = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense")
    # Leak at the beginning to help train.
    z = mix(z, cur, 30000)
    # `range` instead of `xrange`: works on both Python 2 and 3 without
    # relying on a six.moves import.
    for i in range(hparams.num_compress_steps):
      # Scopes are numbered in reverse so step j undoes compress step j.
      j = hparams.num_compress_steps - i - 1
      z = residual_conv(z, 1, hparams, "decompress_rc_%d" % j)
      z = decompress_step(z, c, hparams, i > 0, "decompress_step_%d" % j)
    return z, kl_loss + 0.0001 * reconstruct_loss, mu, log_sigma
def xnet_resblock(x, filters, res_relu, name):
  """Separable-conv residual block with stride (2, 2) on both paths.

  Args:
    x: input tensor fed to the separable-conv path and to the shortcut.
    filters: filter argument passed to both convolution blocks.
    res_relu: passed as `first_relu` to the shortcut conv block.
    name: name of the variable scope for this block.

  Returns:
    The max-pooled separable-conv output plus the 1x1 shortcut conv output.
  """
  with tf.variable_scope(name):
    # Typically audio samples are >100k samples in length and have a width
    # of 2 or 4. Mono audio has a single channel while stereo has 2.
    downsample = (2, 2)
    conv3x3 = ((1, 1), (3, 3))
    features = common_layers.separable_conv_block(
        x,
        filters,
        [conv3x3, conv3x3],
        first_relu=True,
        padding="SAME",
        force2d=True,
        name="sep_conv_block")
    pooled = common_layers.pool(
        features, (3, 3), "MAX", "SAME", strides=downsample)
    skip = common_layers.conv_block(
        x,
        filters,
        [((1, 1), (1, 1))],
        padding="SAME",
        strides=downsample,
        first_relu=res_relu,
        force2d=True,
        name="res_conv0")
    return pooled + skip
def xnet_resblock(x, filters, res_relu, name):
  """Separable-conv residual block that strides only along the first axis.

  Both the max pool on the main path and the 1x1 shortcut conv use stride
  (2, 1).

  Args:
    x: input tensor fed to the separable-conv path and to the shortcut.
    filters: filter argument passed to both convolution blocks.
    res_relu: passed as `first_relu` to the shortcut conv block.
    name: name of the variable scope for this block.

  Returns:
    The max-pooled separable-conv output plus the shortcut conv output.
  """
  with tf.variable_scope(name):
    # We only stride along the length dimension to preserve the spectral
    # bins (which are tiny in dimensionality relative to length)
    length_only_stride = (2, 1)
    conv3x3 = ((1, 1), (3, 3))
    features = common_layers.separable_conv_block(
        x,
        filters,
        [conv3x3, conv3x3],
        first_relu=True,
        padding="SAME",
        force2d=True,
        name="sep_conv_block")
    pooled = common_layers.pool(
        features, (3, 3), "MAX", "SAME", strides=length_only_stride)
    skip = common_layers.conv_block(
        x,
        filters,
        [((1, 1), (1, 1))],
        padding="SAME",
        strides=length_only_stride,
        first_relu=res_relu,
        force2d=True,
        name="res_conv0")
    return pooled + skip
def xception_internal(inputs, hparams):
  """Xception body.

  Chooses an input stem from the static size of axis 1 of `inputs`
  (`xception_entry` when it is larger than 200, a single 3x3 conv block
  otherwise), applies `hparams.num_hidden_layers` residual blocks and
  finishes with `xception_exit`.

  Args:
    inputs: input tensor; must have a known rank of at least 2 so that
      `get_shape().as_list()[1]` is defined.
    hparams: hyperparameters object; reads `hidden_size` and
      `num_hidden_layers` and is passed to `residual_block`.

  Returns:
    The output of `xception_exit`.
  """
  with tf.variable_scope("xception"):
    cur = inputs

    # The static size may be None when it is only known at run time. On
    # Python 3 `None > 200` raises TypeError, so treat an unknown size as
    # "small" explicitly, which is what the comparison yielded on Python 2.
    static_size = cur.get_shape().as_list()[1]
    if static_size is not None and static_size > 200:
      # Large image, Xception entry flow
      cur = xception_entry(cur, hparams.hidden_size)
    else:
      # Small image, conv
      cur = common_layers.conv_block(
          cur,
          hparams.hidden_size,
          [((1, 1), (3, 3))],
          first_relu=False,
          padding="SAME",
          force2d=True,
          name="small_image_conv")

    # `range` instead of `xrange`: works on both Python 2 and 3 without
    # relying on a six.moves import.
    for i in range(hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % i):
        cur = residual_block(cur, hparams)

    return xception_exit(cur)