def bottom(self, inputs):
  """Transform input from data space to model space.

  Runs the Xception "Entry flow": two convolutional filter upscalings
  ("conv0", "conv1") followed by three residually connected separable
  convolution blocks ("block0" .. "block2").

  Args:
    inputs: A Tensor with shape [batch, ...]

  Returns:
    body_input: A Tensor with shape [batch, ?, ?, body_input_depth].
  """

  def xnet_resblock(x, filters, res_relu, name):
    """Add a pooled separable-conv branch to a strided conv shortcut of x."""
    with tf.variable_scope(name):
      branch = common_layers.separable_conv_block(
          x,
          filters,
          [((1, 1), (3, 3)), ((1, 1), (3, 3))],
          first_relu=True,
          padding="SAME",
          force2d=True,
          name="sep_conv_block")
      branch = common_layers.pool(
          branch, (3, 3), "MAX", "SAME", strides=(2, 2))
      # The shortcut uses the same strides as the pool so both branches can
      # be summed.
      shortcut = common_layers.conv_block(
          x,
          filters,
          [((1, 1), (1, 1))],
          padding="SAME",
          strides=(2, 2),
          first_relu=res_relu,
          force2d=True,
          name="res_conv0")
      return branch + shortcut

  with tf.variable_scope(self.name):
    images = common_layers.standardize_images(inputs)
    # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet.
    # tf.summary.image("inputs", inputs, max_outputs=2)
    net = common_layers.conv_block(
        images,
        32,
        [((1, 1), (3, 3))],
        first_relu=False,
        padding="SAME",
        strides=(2, 2),
        force2d=True,
        name="conv0")
    net = common_layers.conv_block(
        net, 64, [((1, 1), (3, 3))], padding="SAME", force2d=True,
        name="conv1")

    # (filters, res_relu, scope name) for each residual block; the first two
    # blocks are capped at the body input depth.
    depth = self._body_input_depth
    block_specs = [
        (min(128, depth), True, "block0"),
        (min(256, depth), False, "block1"),
        (depth, False, "block2"),
    ]
    for filters, res_relu, scope_name in block_specs:
      net = xnet_resblock(net, filters, res_relu, scope_name)
    return net
def bottom(self, inputs):
  """Transform input from data space to model space.

  Performs the Xception "Entry flow", which consists of two convolutional
  filter upscalings followed by three residually connected separable
  convolution blocks.

  Args:
    inputs: A Tensor with shape [batch, ...]

  Returns:
    body_input: A Tensor with shape [batch, ?, ?, body_input_depth].
  """
  # Layer specs shared by the blocks below.
  two_sep_layers = [((1, 1), (3, 3)), ((1, 1), (3, 3))]
  downsample = (2, 2)

  def xnet_resblock(x, filters, res_relu, name):
    """Residual block: pooled separable convs plus a strided conv of x."""
    with tf.variable_scope(name):
      main = common_layers.separable_conv_block(
          x, filters, two_sep_layers, first_relu=True, padding="SAME",
          force2d=True, name="sep_conv_block")
      main = common_layers.pool(
          main, (3, 3), "MAX", "SAME", strides=downsample)
      residual = common_layers.conv_block(
          x, filters, [((1, 1), (1, 1))], padding="SAME",
          strides=downsample, first_relu=res_relu, force2d=True,
          name="res_conv0")
      return main + residual

  with tf.variable_scope(self.name):
    standardized = common_layers.standardize_images(inputs)
    # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet.
    # tf.summary.image("inputs", inputs, max_outputs=2)
    conv0 = common_layers.conv_block(
        standardized, 32, [((1, 1), (3, 3))], first_relu=False,
        padding="SAME", strides=downsample, force2d=True, name="conv0")
    conv1 = common_layers.conv_block(
        conv0, 64, [((1, 1), (3, 3))], padding="SAME", force2d=True,
        name="conv1")
    # Filter counts of the first two blocks never exceed the body depth.
    block0 = xnet_resblock(
        conv1, min(128, self._body_input_depth), True, "block0")
    block1 = xnet_resblock(
        block0, min(256, self._body_input_depth), False, "block1")
    return xnet_resblock(block1, self._body_input_depth, False, "block2")
def bytenet_internal(inputs, targets, hparams, train):
  """ByteNet, main step used for training.

  Args:
    inputs: input Tensor; flattened with `common_layers.flatten4d3d` and
      given a new axis 2 before being padded.
    targets: target Tensor; padded to the same length as the inputs.
    hparams: hyperparameters; this function reads `num_block_repeat`,
      `kernel_height`, `kernel_width` and `hidden_size`.
    train: forwarded unchanged to `residual_dilated_conv`.

  Returns:
    The output of the "decoder" `residual_dilated_conv` stack.
  """
  kernel = (hparams.kernel_height, hparams.kernel_width)
  with tf.variable_scope("bytenet"):
    # Flatten inputs and extend length by 50%.
    flat_inputs = tf.expand_dims(common_layers.flatten4d3d(inputs), axis=2)
    pad_amount = tf.to_int32(0.5 * tf.to_float(tf.shape(flat_inputs)[1]))
    static_shape = flat_inputs.shape.as_list()
    static_shape[1] = None  # Axis 1 is padded by a dynamic amount.
    padded_inputs = tf.pad(
        flat_inputs, [[0, 0], [0, pad_amount], [0, 0], [0, 0]])
    # Don't lose the other shapes when padding.
    padded_inputs.set_shape(static_shape)

    # Pad inputs and targets to be the same length, divisible by 50.
    padded_inputs, padded_targets = common_layers.pad_to_same_length(
        padded_inputs, targets, final_length_divisible_by=50)

    encoder_out = residual_dilated_conv(
        padded_inputs, hparams.num_block_repeat, "SAME", "encoder", hparams,
        train)

    # The decoder sees the encoder output concatenated (axis 3) with the
    # shifted targets.
    decoder_in = tf.concat(
        [encoder_out, common_layers.shift_left(padded_targets)], axis=3)
    decoder_start = common_layers.conv_block(
        decoder_in, hparams.hidden_size, [((1, 1), kernel)], padding="LEFT")

    return residual_dilated_conv(
        decoder_start, hparams.num_block_repeat, "LEFT", "decoder", hparams,
        train)
def bytenet_internal(inputs, targets, hparams, train):
  """ByteNet, main step used for training.

  Builds, under the "bytenet" variable scope, an "encoder" stack over the
  padded inputs and a "decoder" stack over the encoder output joined with
  the shifted targets.

  Args:
    inputs: input Tensor, flattened via `common_layers.flatten4d3d`.
    targets: target Tensor.
    hparams: hyperparameters (`num_block_repeat`, `kernel_height`,
      `kernel_width`, `hidden_size` are read here).
    train: passed through to `residual_dilated_conv`.

  Returns:
    The result of the "decoder" `residual_dilated_conv` call.
  """
  repeats = hparams.num_block_repeat
  with tf.variable_scope("bytenet"):
    # Flatten inputs and extend length by 50%.
    x = common_layers.flatten4d3d(inputs)
    x = tf.expand_dims(x, axis=2)
    current_length = tf.to_float(tf.shape(x)[1])
    extend_length = tf.to_int32(0.5 * current_length)
    known_shape = x.shape.as_list()
    x = tf.pad(x, [[0, 0], [0, extend_length], [0, 0], [0, 0]])
    # Don't lose the other shapes when padding: only axis 1 becomes unknown.
    known_shape[1] = None
    x.set_shape(known_shape)

    # Pad inputs and targets to be the same length, divisible by 50.
    x, y = common_layers.pad_to_same_length(
        x, targets, final_length_divisible_by=50)

    final_encoder = residual_dilated_conv(
        x, repeats, "SAME", "encoder", hparams, train)

    shifted_targets = common_layers.shift_left(y)
    kernel = (hparams.kernel_height, hparams.kernel_width)
    encoder_and_targets = tf.concat([final_encoder, shifted_targets], axis=3)
    decoder_start = common_layers.conv_block(
        encoder_and_targets,
        hparams.hidden_size,
        [((1, 1), kernel)],
        padding="LEFT")

    return residual_dilated_conv(
        decoder_start, repeats, "LEFT", "decoder", hparams, train)
def testConvBlock(self):
  """conv_block on a (5, 7, 1, 11) input with 13 filters yields depth 13."""
  features = np.random.rand(5, 7, 1, 11)
  expected_shape = (5, 7, 1, 13)
  with self.test_session() as session:
    block_out = common_layers.conv_block(
        tf.constant(features, dtype=tf.float32),
        13,
        [(1, (3, 3)), (1, (3, 3))],
        padding="SAME",
        normalizer_fn=common_layers.noam_norm)
    # Variables created by the block must be initialized before evaluation.
    session.run(tf.global_variables_initializer())
    actual = session.run(block_out)
  self.assertEqual(actual.shape, expected_shape)
def testConvBlock(self):
  """Checks the output shape of a two-layer conv_block with noam_norm."""
  random_input = np.random.rand(5, 7, 1, 11)
  layer_specs = [(1, (3, 3)), (1, (3, 3))]
  with self.test_session() as session:
    input_tensor = tf.constant(random_input, dtype=tf.float32)
    output_tensor = common_layers.conv_block(
        input_tensor,
        13,
        layer_specs,
        padding="SAME",
        normalizer_fn=common_layers.noam_norm)
    session.run(tf.global_variables_initializer())
    result = session.run(output_tensor)
  # "SAME" padding keeps the first three dims; the last becomes 13.
  self.assertEqual(result.shape, (5, 7, 1, 13))
def targets_bottom_simple(self, inputs):
  """Standardize target images and apply the "small_image_conv" block.

  Args:
    inputs: Tensor passed to `common_layers.standardize_images`.

  Returns:
    The output of a single-layer `conv_block` that uses
    `self._body_input_depth` as its depth argument.
  """
  with tf.variable_scope(self.name):
    standardized = common_layers.standardize_images(inputs)
    compressing = self._model_hparams.compress_steps > 0
    # Kernel and strides are always equal here. Crucial to not leak!
    window = (2, 2) if compressing else (1, 1)
    return common_layers.conv_block(
        standardized,
        self._body_input_depth,
        [((1, 1), window)],
        first_relu=False,
        strides=window,
        force2d=True,
        name="small_image_conv")
def bottom(self, inputs):
  """Standardize the inputs and apply the "small_image_conv" block.

  Args:
    inputs: Tensor passed to `common_layers.standardize_images`.

  Returns:
    The output of a single ((1, 1), (3, 3)) `conv_block` that uses
    `self._body_input_depth` as its depth argument.
  """
  with tf.variable_scope(self.name):
    standardized = common_layers.standardize_images(inputs)
    # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet.
    # tf.summary.image("inputs", inputs, max_outputs=2)
    # Stride by 2 only when the model is configured with compress steps.
    compressing = self._model_hparams.compress_steps > 0
    strides = (2, 2) if compressing else (1, 1)
    return common_layers.conv_block(
        standardized,
        self._body_input_depth,
        [((1, 1), (3, 3))],
        first_relu=False,
        strides=strides,
        padding="SAME",
        force2d=True,
        name="small_image_conv")
def residual_dilated_conv(x, repeat, padding, name, hparams, train):
  """A stack of convolution blocks with residual connections.

  Args:
    x: input Tensor.
    repeat: number of times the conv block / layer norm / dropout group is
      applied.
    padding: padding mode forwarded to `common_layers.conv_block`.
    name: variable scope name for the whole stack.
    hparams: hyperparameters; reads `kernel_height`, `kernel_width`,
      `num_hidden_layers`, `hidden_size` and `dropout`.
    train: converted with `tf.to_float` and multiplied into the dropout
      rate, so a zero value disables dropout.

  Returns:
    The Tensor produced by the last repeat.
  """
  with tf.variable_scope(name):
    kernel_size = (hparams.kernel_height, hparams.kernel_width)
    # One (dilation, kernel) pair per hidden layer; the first dilation
    # component doubles with each layer.
    dilations_and_kernels = [
        ((2**layer, 1), kernel_size)
        for layer in xrange(hparams.num_hidden_layers)
    ]
    output = x
    for step in xrange(repeat):
      with tf.variable_scope("repeat_%d" % step):
        conv_out = common_layers.conv_block(
            output,
            hparams.hidden_size,
            dilations_and_kernels,
            padding=padding,
            name="residual_conv")
        output = common_layers.layer_norm(
            output + conv_out, hparams.hidden_size, name="lnorm")
        # tf.nn.dropout takes the keep probability as its second argument.
        keep_prob = 1.0 - hparams.dropout * tf.to_float(train)
        output = tf.nn.dropout(output, keep_prob)
    return output
def bottom(self, inputs):
  """Run standardized inputs through one "small_image_conv" conv block.

  Args:
    inputs: Tensor handed to `common_layers.standardize_images`.

  Returns:
    The `conv_block` output; `self._body_input_depth` is used as the block's
    depth argument.
  """
  with tf.variable_scope(self.name):
    normalized = common_layers.standardize_images(inputs)
    # TODO(lukaszkaiser): summaries here don't work in multi-problem case yet.
    # tf.summary.image("inputs", inputs, max_outputs=2)
    # Use stride 2 when compress_steps is positive, stride 1 otherwise.
    step = 2 if self._model_hparams.compress_steps > 0 else 1
    conv_kwargs = dict(
        first_relu=False,
        strides=(step, step),
        padding="SAME",
        force2d=True,
        name="small_image_conv")
    return common_layers.conv_block(
        normalized, self._body_input_depth, [((1, 1), (3, 3))], **conv_kwargs)
def residual_dilated_conv(x, repeat, padding, name, hparams):
  """A stack of convolution blocks with residual connections.

  Args:
    x: input Tensor.
    repeat: number of times the conv block / layer norm / dropout group is
      applied.
    padding: padding mode forwarded to `common_layers.conv_block`.
    name: variable scope name for the whole stack.
    hparams: hyperparameters; reads `kernel_height`, `kernel_width`,
      `num_hidden_layers`, `hidden_size` and `dropout`. `dropout` is treated
      as the fraction of units to drop.

  Returns:
    The Tensor produced by the last repeat.
  """
  with tf.variable_scope(name):
    k = (hparams.kernel_height, hparams.kernel_width)
    # One (dilation, kernel) pair per hidden layer; the first dilation
    # component doubles with each layer.
    dilations_and_kernels = [((2**i, 1), k)
                             for i in xrange(hparams.num_hidden_layers)]
    for i in xrange(repeat):
      with tf.variable_scope("repeat_%d" % i):
        y = common_layers.conv_block(
            x,
            hparams.hidden_size,
            dilations_and_kernels,
            padding=padding,
            name="residual_conv")
        x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm")
        # The second positional argument of TF1's tf.nn.dropout is the KEEP
        # probability, so the drop rate must be inverted. Passing
        # hparams.dropout directly would keep only that fraction of units.
        x = tf.nn.dropout(x, 1.0 - hparams.dropout)
    return x
def xnet_resblock(x, filters, res_relu, name):
  """Build one residual block under the variable scope `name`.

  The main branch is a `separable_conv_block` followed by a "MAX" pool with
  strides (2, 2); the shortcut branch is a `conv_block` on `x` with the same
  strides. The two branches are added together.

  Args:
    x: input Tensor fed to both branches.
    filters: passed as the second argument to both blocks.
    res_relu: forwarded as `first_relu` to the shortcut `conv_block`.
    name: variable scope name for the block.

  Returns:
    The sum of the pooled main branch and the shortcut branch.
  """
  with tf.variable_scope(name):
    main_branch = common_layers.separable_conv_block(
        x,
        filters,
        [((1, 1), (3, 3)), ((1, 1), (3, 3))],
        first_relu=True,
        padding="SAME",
        force2d=True,
        name="sep_conv_block")
    main_branch = common_layers.pool(
        main_branch, (3, 3), "MAX", "SAME", strides=(2, 2))
    shortcut = common_layers.conv_block(
        x,
        filters,
        [((1, 1), (1, 1))],
        padding="SAME",
        strides=(2, 2),
        first_relu=res_relu,
        force2d=True,
        name="res_conv0")
    return main_branch + shortcut
def xnet_resblock(x, filters, res_relu, name):
  """Residual block: pooled separable convs plus a strided conv shortcut.

  Args:
    x: input Tensor fed to both branches.
    filters: passed as the second argument to both blocks.
    res_relu: forwarded as `first_relu` to the shortcut `conv_block`.
    name: variable scope name for the block.

  Returns:
    The pooled `separable_conv_block` output added to the "res_conv0"
    `conv_block` output.
  """
  downsample = (2, 2)
  with tf.variable_scope(name):
    # Typically audio samples are >100k samples in length and have a width
    # of 2 or 4. Mono audio has a single channel while stereo has 2.
    separable = common_layers.separable_conv_block(
        x, filters, [((1, 1), (3, 3)), ((1, 1), (3, 3))], first_relu=True,
        padding="SAME", force2d=True, name="sep_conv_block")
    pooled = common_layers.pool(
        separable, (3, 3), "MAX", "SAME", strides=downsample)
    # The shortcut is strided like the pool so both branches can be summed.
    residual = common_layers.conv_block(
        x, filters, [((1, 1), (1, 1))], padding="SAME", strides=downsample,
        first_relu=res_relu, force2d=True, name="res_conv0")
    return pooled + residual
def xnet_resblock(x, filters, res_relu, name):
  """Residual block that strides by (2, 1) in both branches.

  Args:
    x: input Tensor fed to both branches.
    filters: passed as the second argument to both blocks.
    res_relu: forwarded as `first_relu` to the shortcut `conv_block`.
    name: variable scope name for the block.

  Returns:
    The pooled `separable_conv_block` output added to the "res_conv0"
    `conv_block` output.
  """
  # We only stride along the length dimension to preserve the spectral
  # bins (which are tiny in dimensionality relative to length)
  length_only_strides = (2, 1)
  with tf.variable_scope(name):
    main_path = common_layers.separable_conv_block(
        x,
        filters,
        [((1, 1), (3, 3)), ((1, 1), (3, 3))],
        first_relu=True,
        padding="SAME",
        force2d=True,
        name="sep_conv_block")
    main_path = common_layers.pool(
        main_path, (3, 3), "MAX", "SAME", strides=length_only_strides)
    skip_path = common_layers.conv_block(
        x,
        filters,
        [((1, 1), (1, 1))],
        padding="SAME",
        strides=length_only_strides,
        first_relu=res_relu,
        force2d=True,
        name="res_conv0")
    return main_path + skip_path