Example 1
    def __call__(self, inputs, inputs_seq_len,
                 keep_prob_input, keep_prob_hidden, keep_prob_output):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size `[B, T, input_size]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob_input (placeholder, float): A probability to keep nodes
                in the input-hidden connection
            keep_prob_hidden (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            keep_prob_output (placeholder, float): A probability to keep nodes
                in the hidden-output connection
        Returns:
            logits: A tensor of size `[T, B, num_classes]`
            final_state: None (this encoder returns no recurrent state)
        """
        # inputs: 3D tensor `[batch_size, max_time, input_size * splice]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]

        # Reshape to 4D tensor
        # `[batch_size * max_time, input_size / 3, splice, 3(+Δ,ΔΔ)]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size * max_time, int(self.input_size / 3), self.splice, 3])
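        # NOTE: the trailing axis of size 3 holds the static, Δ, and ΔΔ feature
        # streams. For example, with input_size = 120 and splice = 11
        # (hypothetical values), each frame becomes a `[40, 11, 3]` "image".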

        with tf.variable_scope('VGG1'):
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 3, 96],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 96, 96],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv2')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 96, 96],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv3')
            inputs = max_pool(inputs, name='max_pool')
            # TODO(hirofumi): try batch normalization

        with tf.variable_scope('VGG2'):
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 96, 192],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 192, 192],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv2')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 192, 192],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv3')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 192, 192],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv4')
            inputs = max_pool(inputs, name='max_pool')
            # TODO(hirofumi): try batch normalization

        with tf.variable_scope('VGG3'):
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 192, 384],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 384, 384],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv2')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 384, 384],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv3')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 384, 384],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv4')
            inputs = max_pool(inputs, name='max_pool')
            # TODO(hirofumi): try batch normalization

        # Reshape to 2D tensor `[batch_size * max_time, new_h * new_w * 384]`
        new_h = math.ceil(self.input_size / (3 * 2**3))  # expected to be 5 or 6
        new_w = math.ceil(self.splice / (2**3))  # expected to be 2
        inputs = tf.reshape(
            inputs, shape=[batch_size * max_time, new_h * new_w * 384])
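        # Worked example (assuming input_size = 120 and splice = 11, hypothetical
        # values consistent with the "expected" notes above):
        # new_h = ceil(120 / 24) = 5, new_w = ceil(11 / 8) = 2, so each frame is
        # flattened to a 5 * 2 * 384 = 3840-dimensional vector.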

        with tf.variable_scope('fc1') as scope:
            inputs = tf.contrib.layers.fully_connected(
                inputs=inputs,
                num_outputs=1024,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

        with tf.variable_scope('fc2') as scope:
            inputs = tf.contrib.layers.fully_connected(
                inputs=inputs,
                num_outputs=1024,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

        with tf.variable_scope('fc3') as scope:
            logits_2d = tf.contrib.layers.fully_connected(
                inputs=inputs,
                num_outputs=self.num_classes,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

        # if self.bottleneck_dim is not None and self.bottleneck_dim != 0:
        #     with tf.variable_scope('bottleneck') as scope:
        #         outputs = tf.contrib.layers.fully_connected(
        #             outputs, self.bottleneck_dim,
        #             activation_fn=tf.nn.relu,
        #             weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
        #             biases_initializer=tf.zeros_initializer(),
        #             scope=scope)
        #
        #         # Dropout for the hidden-output connections
        #         outputs = tf.nn.dropout(
        #             outputs, keep_prob_output, name='dropout_output_bottle')

        # Reshape back to 3D tensor `[batch_size, max_time, num_classes]`
        logits = tf.reshape(
            logits_2d, shape=[batch_size, max_time, self.num_classes])

        # Convert to time-major: `[max_time, batch_size, num_classes]`
        logits = tf.transpose(logits, (1, 0, 2))

        return logits, None
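A minimal usage sketch for the encoder above (not from the original repository): `VGGCTCEncoder`, its constructor arguments, and the feature sizes are hypothetical stand-ins for whatever class exposes this `__call__`; the placeholders are fed at session run time.

import tensorflow as tf

# Hypothetical wrapper class and hyperparameters; only the call signature matters here.
encoder = VGGCTCEncoder(input_size=120, splice=11, num_classes=30, parameter_init=0.1)

# The last axis is input_size * splice, matching the reshape inside __call__.
inputs_pl = tf.placeholder(tf.float32, shape=[None, None, 120 * 11], name='inputs')
inputs_seq_len_pl = tf.placeholder(tf.int32, shape=[None], name='inputs_seq_len')
keep_prob_input_pl = tf.placeholder(tf.float32, name='keep_prob_input')
keep_prob_hidden_pl = tf.placeholder(tf.float32, name='keep_prob_hidden')
keep_prob_output_pl = tf.placeholder(tf.float32, name='keep_prob_output')

# logits: `[T, B, num_classes]` (time-major); final_state is None for this encoder
logits, final_state = encoder(inputs_pl, inputs_seq_len_pl,
                              keep_prob_input_pl, keep_prob_hidden_pl,
                              keep_prob_output_pl)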
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * splice * num_stack * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size `[T, B, output_dim]`
                otherwise, `[B, T, output_dim]`
            final_state: None
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # For debug
        # print(input_dim)
        # print(self.num_channels)
        # print(self.splice)
        # print(self.num_stack)

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size * max_time, self.num_channels, self.splice * self.num_stack, 3])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('CNN1'):
            inputs = conv_layer(inputs,
                                filter_size=[9, 9, 3, 64],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[3, 1],
                              stride=[3, 1],
                              name='max_pool')
            # TODO: try dropout

        with tf.variable_scope('CNN2'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 4, 64, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')
            # TODO: try dropout

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size * max_time, np.prod(inputs.shape.as_list()[-3:])])
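        # NOTE: np.prod over the three static trailing dims (new_h, new_w, C_out)
        # gives the flattened feature size without hard-coding it.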

        for i in range(1, 3, 1):
            with tf.variable_scope('fc%d' % (i)) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=768,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)
                outputs = tf.nn.dropout(outputs, keep_prob)

        # Reshape back to 3D tensor `[B, T, 768]`
        outputs = tf.reshape(
            outputs, shape=[batch_size, max_time, 768])

        if self.time_major:
            # Convert to time-major: `[T, B, 768]`
            outputs = tf.transpose(outputs, [1, 0, 2])

        return outputs, None
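A hedged usage sketch for this encoder (the class name `CNNEncoder` and the hyperparameter values are assumptions, not taken from the source):

import tensorflow as tf

# 40 * 5 * 2 * 3 = 1200 input features per frame (hypothetical configuration)
encoder = CNNEncoder(num_channels=40, splice=5, num_stack=2,
                     parameter_init=0.1, time_major=True)

inputs_pl = tf.placeholder(tf.float32, shape=[None, None, 1200], name='inputs')
inputs_seq_len_pl = tf.placeholder(tf.int32, shape=[None], name='inputs_seq_len')
keep_prob_pl = tf.placeholder(tf.float32, name='keep_prob')

# outputs: `[T, B, 768]` because time_major=True; final_state is None
outputs, final_state = encoder(inputs_pl, inputs_seq_len_pl,
                               keep_prob_pl, is_training=True)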
    def __call__(self, inputs, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, input_size (num_channels * splice * num_stack * 3)]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states, a tensor of size `[B, output_dim]`
        """
        # inputs: 2D tensor `[B, input_dim]`
        batch_size = tf.shape(inputs)[0]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # for debug
        # print(input_dim)  # 1200
        # print(self.num_channels)  # 40
        # print(self.splice)  # 5
        # print(self.num_stack)  # 2

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(inputs,
                            shape=[
                                batch_size, self.num_channels,
                                self.splice * self.num_stack, 3
                            ])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('CNN1'):
            inputs = conv_layer(inputs,
                                filter_size=[9, 9, 3, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[3, 1],
                              stride=[3, 1],
                              name='max_pool')

        with tf.variable_scope('CNN2'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 4, 128, 256],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')

        # Reshape to 2D tensor `[B, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size,
                           np.prod(inputs.shape.as_list()[-3:])])

        for i in range(1, 5, 1):
            with tf.variable_scope('fc%d' % (i)) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=2048,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)

        return outputs
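A hedged sketch of how this frame-level variant might be driven (the class name `CNNFrameEncoder` and the values are hypothetical); it consumes a 2D `[B, input_size]` batch and returns a single tensor:

import tensorflow as tf

encoder = CNNFrameEncoder(num_channels=40, splice=5, num_stack=2, parameter_init=0.1)

# 1200 = num_channels * splice * num_stack * 3 (see the assert above)
inputs_pl = tf.placeholder(tf.float32, shape=[None, 1200], name='inputs')
keep_prob_pl = tf.placeholder(tf.float32, name='keep_prob')

outputs = encoder(inputs_pl, keep_prob_pl, is_training=True)  # `[B, 2048]`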
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * (splice * num_stack) * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size
                    `[T, B, num_units (num_proj)]`
                otherwise, `[B, T, num_units (num_proj)]`
            final_state: A final hidden state of the encoder
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # For debug
        # print(input_dim)
        # print(self.num_channels)
        # print(self.splice)
        # print(self.num_stack)

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size * max_time, self.num_channels, self.splice * self.num_stack, 3])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('CNN1'):
            inputs = conv_layer(inputs,
                                filter_size=[11, 21, 3, 32],
                                stride=[3, 2],
                                parameter_init=self.parameter_init,
                                activation='relu')
            # inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')
            inputs = tf.nn.dropout(inputs, keep_prob)

        with tf.variable_scope('CNN2'):
            inputs = conv_layer(inputs,
                                filter_size=[11, 11, 32, 32],
                                stride=[1, 2],
                                parameter_init=self.parameter_init,
                                activation='relu')
            # inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')
            inputs = tf.nn.dropout(inputs, keep_prob)

        with tf.variable_scope('CNN3'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 32, 96],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            # inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')
            inputs = tf.nn.dropout(inputs, keep_prob)

        # Reshape to 3D tensor `[B, T, new_h * new_w * C_out]`
        inputs = tf.reshape(
            inputs, shape=[batch_size, max_time, np.prod(inputs.shape.as_list()[-3:])])

        initializer = tf.random_uniform_initializer(
            minval=-self.parameter_init, maxval=self.parameter_init)

        if self.lstm_impl == 'BasicLSTMCell':
            outputs, final_state = basiclstmcell(
                self.num_units, self.num_layers,
                inputs, inputs_seq_len, keep_prob, initializer,
                self.time_major)

        elif self.lstm_impl == 'LSTMCell':
            outputs, final_state = lstmcell(
                self.num_units, self.num_proj, self.num_layers,
                self.use_peephole, self.clip_activation,
                inputs, inputs_seq_len, keep_prob, initializer,
                self.time_major)

        elif self.lstm_impl == 'LSTMBlockCell':
            outputs, final_state = lstmblockcell(
                self.num_units, self.num_layers,
                self.use_peephole, self.clip_activation,
                inputs, inputs_seq_len, keep_prob, initializer,
                self.time_major)

        elif self.lstm_impl == 'LSTMBlockFusedCell':
            outputs, final_state = lstmblockfusedcell(
                self.num_units, self.num_layers,
                inputs, inputs_seq_len, keep_prob, initializer,
                self.time_major)

        elif self.lstm_impl == 'CudnnLSTM':
            outputs, final_state = cudnnlstm(
                self.num_units, self.num_layers,
                inputs, inputs_seq_len, keep_prob, initializer,
                self.time_major)
        else:
            raise ValueError(
                'lstm_impl must be "BasicLSTMCell", "LSTMCell", '
                '"LSTMBlockCell", "LSTMBlockFusedCell", or "CudnnLSTM".')

        # Reshape to 2D tensor `[B * T (T * B), output_dim]`
        output_dim = outputs.shape.as_list()[-1]
        outputs = tf.reshape(
            outputs, shape=[batch_size * max_time, output_dim])

        with tf.variable_scope('fc1') as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=896,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            outputs = tf.nn.dropout(outputs, keep_prob)

        with tf.variable_scope('fc2') as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=74,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

        output_dim = outputs.shape.as_list()[-1]
        if self.time_major:
            # Reshape back to 3D tensor `[T, B, 74]`
            outputs = tf.reshape(
                outputs, shape=[max_time, batch_size, output_dim])
        else:
            # Reshape back to 3D tensor `[B, T, 74]`
            outputs = tf.reshape(
                outputs, shape=[batch_size, max_time, output_dim])

        return outputs, final_state
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * (splice * num_stack) * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size `[T, B, output_dim]`
                otherwise, `[B, T, output_dim]`
            final_state: None
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # For debug
        # print(input_dim)
        # print(self.num_channels)
        # print(self.splice)
        # print(self.num_stack)

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size * max_time, self.num_channels, self.splice * self.num_stack, 3])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('VGG1'):
            for i_layer in range(1, 4, 1):
                input_channels = inputs.shape.as_list()[-1]
                inputs = conv_layer(inputs,
                                    filter_size=[3, 3, input_channels, 96],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation='relu',
                                    name='conv%d' % i_layer)
                inputs = batch_normalization(inputs, is_training=is_training)
                if i_layer == 3:
                    inputs = max_pool(inputs,
                                      pooling_size=[2, 2],
                                      stride=[2, 2],
                                      name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

        with tf.variable_scope('VGG2'):
            for i_layer in range(1, 5, 1):
                input_channels = inputs.shape.as_list()[-1]
                inputs = conv_layer(inputs,
                                    filter_size=[3, 3, input_channels, 192],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation='relu',
                                    name='conv%d' % i_layer)
                inputs = batch_normalization(inputs, is_training=is_training)
                if i_layer == 4:
                    inputs = max_pool(inputs,
                                      pooling_size=[2, 2],
                                      stride=[2, 2],
                                      name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

        with tf.variable_scope('VGG3'):
            for i_layer in range(1, 5, 1):
                input_channels = inputs.shape.as_list()[-1]
                inputs = conv_layer(inputs,
                                    filter_size=[3, 3, input_channels, 384],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation='relu',
                                    name='conv%d' % i_layer)
                inputs = batch_normalization(inputs, is_training=is_training)
                if i_layer == 4:
                    inputs = max_pool(inputs,
                                      pooling_size=[2, 2],
                                      stride=[2, 2],
                                      name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size * max_time, np.prod(inputs.shape.as_list()[-3:])])

        for i_layer in range(1, 3, 1):
            with tf.variable_scope('fc%d' % i_layer) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=1024,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)
                if i_layer == 1:
                    outputs = tf.nn.dropout(outputs, keep_prob)

        # Reshape back to 3D tensor `[B, T, 1024]`
        output_dim = outputs.shape.as_list()[-1]
        outputs = tf.reshape(
            outputs, shape=[batch_size, max_time, output_dim])

        if self.time_major:
            # Convert to time-major: `[T, B, 1024]`
            outputs = tf.transpose(outputs, [1, 0, 2])

        return outputs, None
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * splice * num_stack * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size `[T, B, output_dim]`
                otherwise, `[B, T, output_dim]`
            final_state: None
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # For debug
        # print(input_dim)
        # print(self.num_channels)
        # print(self.splice)
        # print(self.num_stack)

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size * max_time, self.num_channels, self.splice * self.num_stack, 3])

        # Choose the activation function
        activation = 'relu'
        # activation = 'prelu'
        # activation = 'maxout'
        # TODO: add prelu and maxout layers

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        # 1-4th layers
        for i_layer in range(1, 5, 1):
            with tf.variable_scope('CNN%d' % i_layer):
                input_channels = inputs.shape.as_list()[-1]
                inputs = conv_layer(inputs,
                                    filter_size=[3, 5, input_channels, 128],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv')
                # inputs = batch_normalization(inputs, is_training=is_training)
                if i_layer == 1:
                    inputs = max_pool(inputs,
                                      pooling_size=[3, 1],
                                      stride=[3, 1],
                                      name='pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

        # 5-10th layers
        for i_layer in range(5, 11, 1):
            with tf.variable_scope('CNN%d' % i_layer):
                input_channels = inputs.shape.as_list()[-1]
                inputs = conv_layer(inputs,
                                    filter_size=[3, 5, input_channels, 256],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv')
                # inputs = batch_normalization(inputs, is_training=is_training)
                # NOTE: No pooling
                inputs = tf.nn.dropout(inputs, keep_prob)

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size * max_time, np.prod(inputs.shape.as_list()[-3:])])

        # 11-14th layers
        for i_layer in range(1, 4, 1):
            with tf.variable_scope('fc%d' % i_layer) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=1024,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)
                if i_layer != 3:
                    outputs = tf.nn.dropout(outputs, keep_prob)

        # Reshape back to 3D tensor `[B, T, 1024]`
        logits = tf.reshape(
            outputs, shape=[batch_size, max_time, 1024])

        if self.time_major:
            # Convert to time-major: `[T, B, 1024]`
            logits = tf.transpose(logits, [1, 0, 2])

        return logits, None
Example 7
    def __call__(self, inputs, inputs_seq_len,
                 keep_prob_input, keep_prob_hidden, keep_prob_output):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size `[B, T, input_size]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob_input (placeholder, float): A probability to keep nodes
                in the input-hidden connection
            keep_prob_hidden (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            keep_prob_output (placeholder, float): A probability to keep nodes
                in the hidden-output connection
        Returns:
            logits: A tensor of size `[T, B, num_classes]`
            final_state: A final hidden state of the encoder
        """
        # inputs: `[B, T, input_size * splice]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]

        # Reshape to 4D tensor `[B * T, input_size / 3, splice, 3(+Δ, ΔΔ)]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size * max_time, int(self.input_size / 3), self.splice, 3])

        with tf.variable_scope('VGG1'):
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 3, 64],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 64, 64],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv2')
            inputs = max_pool(inputs, name='max_pool')
            # TODO(hirofumi): try batch normalization

        with tf.variable_scope('VGG2'):
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 64, 128],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_shape=[3, 3, 128, 128],
                                parameter_init=self.parameter_init,
                                relu=True,
                                name='conv2')
            inputs = max_pool(inputs, name='max_pool')
            # TODO(hirofumi): try batch normalization

        # Reshape to 2D tensor `[B * T, new_h * new_w * 128]`
        new_h = math.ceil(self.input_size / 3 / 4)  # expected to be 11 or 10
        new_w = math.ceil(self.splice / 4)  # expected to be 3
        inputs = tf.reshape(
            inputs, shape=[batch_size * max_time, new_h * new_w * 128])
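        # Worked example (assuming input_size = 120 and splice = 11, hypothetical):
        # new_h = ceil(120 / 3 / 4) = 10, new_w = ceil(11 / 4) = 3, so each frame
        # is flattened to a 10 * 3 * 128 = 3840-dimensional vector.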

        # Insert a linear layer to reduce the CNN output dimension
        # from (new_h * new_w * 128) to 256
        with tf.variable_scope('linear') as scope:
            inputs = tf.contrib.layers.fully_connected(
                inputs=inputs,
                num_outputs=256,
                activation_fn=tf.nn.relu,
                scope=scope)

        # Dropout for the VGG-output-hidden connection
        inputs = tf.nn.dropout(inputs,
                               keep_prob_input,
                               name='dropout_input')

        # Reshape back to 3D tensor `[B, T, 256]`
        inputs = tf.reshape(inputs, shape=[batch_size, max_time, 256])

        initializer = tf.random_uniform_initializer(
            minval=-self.parameter_init,
            maxval=self.parameter_init)

        # Hidden layers
        lstm_list = []
        with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
            for i_layer in range(1, self.num_layers + 1, 1):

                if self.lstm_impl == 'BasicLSTMCell':
                    lstm = tf.contrib.rnn.BasicLSTMCell(
                        self.num_units,
                        forget_bias=1.0,
                        state_is_tuple=True,
                        activation=tf.tanh)

                elif self.lstm_impl == 'LSTMCell':
                    lstm = tf.contrib.rnn.LSTMCell(
                        self.num_units,
                        use_peepholes=self.use_peephole,
                        cell_clip=self.clip_activation,
                        num_proj=self.num_proj,
                        forget_bias=1.0,
                        state_is_tuple=True)

                elif self.lstm_impl == 'LSTMBlockCell':
                    # NOTE: This should be faster than tf.contrib.rnn.LSTMCell
                    lstm = tf.contrib.rnn.LSTMBlockCell(
                        self.num_units,
                        forget_bias=1.0,
                        # clip_cell=True,
                        use_peephole=self.use_peephole)
                    # TODO: cell clipping (update for rc1.3)

                elif self.lstm_impl == 'LSTMBlockFusedCell':
                    raise NotImplementedError

                elif self.lstm_impl == 'CudnnLSTM':
                    raise NotImplementedError

                else:
                    raise ValueError(
                        'lstm_impl must be "BasicLSTMCell", "LSTMCell", '
                        '"LSTMBlockCell", "LSTMBlockFusedCell", or "CudnnLSTM".')

                # Dropout for the hidden-hidden connections
                lstm = tf.contrib.rnn.DropoutWrapper(
                    lstm, output_keep_prob=keep_prob_hidden)

                lstm_list.append(lstm)

            # Stack multiple cells
            stacked_lstm = tf.contrib.rnn.MultiRNNCell(
                lstm_list, state_is_tuple=True)

            # The second return value is the final state of the stacked LSTM
            outputs, final_state = tf.nn.dynamic_rnn(
                cell=stacked_lstm,
                inputs=inputs,
                sequence_length=inputs_seq_len,
                dtype=tf.float32,
                scope=scope)
            # NOTE: initial states are zero states by default

        if self.return_hidden_states:
            return outputs, final_state

        # Reshape to apply the same weights over the timesteps
        if self.num_proj is None:
            outputs = tf.reshape(outputs, shape=[-1, self.num_units])
        else:
            outputs = tf.reshape(outputs, shape=[-1, self.num_proj])

        if self.bottleneck_dim is not None and self.bottleneck_dim != 0:
            with tf.variable_scope('bottleneck') as scope:
                outputs = tf.contrib.layers.fully_connected(
                    outputs, self.bottleneck_dim,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)

                # Dropout for the hidden-output connections
                outputs = tf.nn.dropout(
                    outputs, keep_prob_output, name='dropout_output_bottle')

        with tf.variable_scope('output') as scope:
            logits_2d = tf.contrib.layers.fully_connected(
                outputs, self.num_classes,
                activation_fn=None,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

            # Reshape back to the original shape
            logits = tf.reshape(
                logits_2d, shape=[batch_size, -1, self.num_classes])

            # Convert to time-major: `[T, B, num_classes]`
            logits = tf.transpose(logits, (1, 0, 2))

            # Dropout for the hidden-output connections
            logits = tf.nn.dropout(
                logits, keep_prob_output, name='dropout_output')

            return logits, final_state
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * splice * num_stack * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states, a tensor of size
                `[T, B, num_units (num_proj)]`
            final_state: A final hidden state of the encoder
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(inputs,
                            shape=[
                                batch_size * max_time, self.num_channels,
                                self.splice * self.num_stack, 3
                            ])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('VGG1'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 3, 64],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 64, 64],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv2')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[2, 2],
                              stride=[2, 2],
                              name='max_pool')
            # TODO(hirofumi): try dropout

        with tf.variable_scope('VGG2'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 64, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 128, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv2')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[2, 2],
                              stride=[2, 2],
                              name='max_pool')
            # TODO(hirofumi): try dropout

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        new_h = math.ceil(self.num_channels / 4)
        new_w = math.ceil(self.splice * self.num_stack / 4)
        channel_out = inputs.shape.as_list()[-1]
        inputs = tf.reshape(
            inputs, shape=[batch_size * max_time, new_h * new_w * channel_out])
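        # Worked example (assuming num_channels = 40, splice = 5, num_stack = 2,
        # hypothetical): new_h = ceil(40 / 4) = 10, new_w = ceil(10 / 4) = 3,
        # channel_out = 128, so each frame is flattened to 10 * 3 * 128 = 3840 features.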

        # Insert a linear layer to reduce the CNN output dimension
        # from (new_h * new_w * C_out) to 256
        with tf.variable_scope('bridge') as scope:
            inputs = tf.contrib.layers.fully_connected(
                inputs=inputs,
                num_outputs=256,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

        # Dropout for the VGG-output-hidden connection
        inputs = tf.nn.dropout(inputs, keep_prob, name='dropout_pipe')

        # Reshape back to 3D tensor `[B, T, 256]`
        inputs = tf.reshape(inputs, shape=[batch_size, max_time, 256])

        initializer = tf.random_uniform_initializer(
            minval=-self.parameter_init, maxval=self.parameter_init)

        if self.lstm_impl == 'BasicLSTMCell':
            outputs, final_state = basiclstmcell(self.num_units,
                                                 self.num_layers, inputs,
                                                 inputs_seq_len, keep_prob,
                                                 initializer, self.time_major)

        elif self.lstm_impl == 'LSTMCell':
            outputs, final_state = lstmcell(self.num_units, self.num_proj,
                                            self.num_layers, self.use_peephole,
                                            self.clip_activation, inputs,
                                            inputs_seq_len, keep_prob,
                                            initializer, self.time_major)

        elif self.lstm_impl == 'LSTMBlockCell':
            outputs, final_state = lstmblockcell(self.num_units,
                                                 self.num_layers,
                                                 self.use_peephole,
                                                 self.clip_activation, inputs,
                                                 inputs_seq_len, keep_prob,
                                                 initializer, self.time_major)

        elif self.lstm_impl == 'LSTMBlockFusedCell':
            outputs, final_state = lstmblockfusedcell(self.num_units,
                                                      self.num_layers, inputs,
                                                      inputs_seq_len,
                                                      keep_prob, initializer,
                                                      self.time_major)

        elif self.lstm_impl == 'CudnnLSTM':
            outputs, final_state = cudnnlstm(self.num_units, self.num_layers,
                                             inputs, inputs_seq_len, keep_prob,
                                             initializer, self.time_major)
        else:
            raise ValueError('lstm_impl must be "BasicLSTMCell", "LSTMCell", '
                             '"LSTMBlockCell", "LSTMBlockFusedCell", or '
                             '"CudnnLSTM".')

        return outputs, final_state
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * (splice * num_stack) * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size
                    `[T, B, num_units (num_proj)]`
                otherwise, `[B, T, num_units (num_proj)]`
            final_state: A final hidden state of the encoder
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # For debug
        # print(input_dim)
        # print(self.num_channels)
        # print(self.splice)
        # print(self.num_stack)

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(inputs,
                            shape=[
                                batch_size * max_time, self.num_channels,
                                self.splice * self.num_stack, 3
                            ])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('CNN1'):
            inputs = conv_layer(inputs,
                                filter_size=[11, 21, 3, 32],
                                stride=[3, 2],
                                parameter_init=self.parameter_init,
                                activation='relu')
            # inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')
            inputs = tf.nn.dropout(inputs, keep_prob)

        with tf.variable_scope('CNN2'):
            inputs = conv_layer(inputs,
                                filter_size=[11, 11, 32, 32],
                                stride=[1, 2],
                                parameter_init=self.parameter_init,
                                activation='relu')
            # inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')
            inputs = tf.nn.dropout(inputs, keep_prob)

        with tf.variable_scope('CNN3'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 32, 96],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            # inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')
            inputs = tf.nn.dropout(inputs, keep_prob)

        # Reshape to 3D tensor `[B, T, new_h * new_w * C_out]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size, max_time,
                   np.prod(inputs.shape.as_list()[-3:])])

        initializer = tf.random_uniform_initializer(
            minval=-self.parameter_init, maxval=self.parameter_init)

        if self.lstm_impl == 'BasicLSTMCell':
            outputs, final_state = basiclstmcell(self.num_units,
                                                 self.num_layers, inputs,
                                                 inputs_seq_len, keep_prob,
                                                 initializer, self.time_major)

        elif self.lstm_impl == 'LSTMCell':
            outputs, final_state = lstmcell(self.num_units, self.num_proj,
                                            self.num_layers, self.use_peephole,
                                            self.clip_activation, inputs,
                                            inputs_seq_len, keep_prob,
                                            initializer, self.time_major)

        elif self.lstm_impl == 'LSTMBlockCell':
            outputs, final_state = lstmblockcell(self.num_units,
                                                 self.num_layers,
                                                 self.use_peephole,
                                                 self.clip_activation, inputs,
                                                 inputs_seq_len, keep_prob,
                                                 initializer, self.time_major)

        elif self.lstm_impl == 'LSTMBlockFusedCell':
            outputs, final_state = lstmblockfusedcell(self.num_units,
                                                      self.num_layers, inputs,
                                                      inputs_seq_len,
                                                      keep_prob, initializer,
                                                      self.time_major)

        elif self.lstm_impl == 'CudnnLSTM':
            outputs, final_state = cudnnlstm(self.num_units, self.num_layers,
                                             inputs, inputs_seq_len, keep_prob,
                                             initializer, self.time_major)
        else:
            raise ValueError('lstm_impl must be "BasicLSTMCell", "LSTMCell", '
                             '"LSTMBlockCell", "LSTMBlockFusedCell", or '
                             '"CudnnLSTM".')

        # Reshape to 2D tensor `[B * T (T * B), output_dim]`
        output_dim = outputs.shape.as_list()[-1]
        outputs = tf.reshape(outputs,
                             shape=[batch_size * max_time, output_dim])

        with tf.variable_scope('fc1') as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=896,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            outputs = tf.nn.dropout(outputs, keep_prob)

        with tf.variable_scope('fc2') as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=74,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

        output_dim = outputs.shape.as_list()[-1]
        if self.time_major:
            # Reshape back to 3D tensor `[T, B, 74]`
            outputs = tf.reshape(outputs,
                                 shape=[max_time, batch_size, output_dim])
        else:
            # Reshape back to 3D tensor `[B, T, 74]`
            outputs = tf.reshape(outputs,
                                 shape=[batch_size, max_time, output_dim])

        return outputs, final_state
Example 10
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * splice * num_stack * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size `[T, B, output_dim]`
                otherwise, `[B, T, output_dim]`
            final_state: None
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(inputs,
                            shape=[
                                batch_size * max_time, self.num_channels,
                                self.splice * self.num_stack, 3
                            ])

        # Choose the activation function
        activation = 'relu'
        # activation = 'prelu'
        # activation = 'maxout'
        # TODO: add prelu and maxout layers

        # 1-4th layers
        with tf.variable_scope('CNN1'):
            for i_layer in range(1, 5, 1):
                if i_layer == 1:
                    inputs = conv_layer(inputs,
                                        filter_size=[3, 5, 3, 128],
                                        stride=[1, 1],
                                        parameter_init=self.parameter_init,
                                        activation=activation,
                                        name='conv1')
                    inputs = max_pool(inputs,
                                      pooling_size=[3, 1],
                                      stride=[3, 1],
                                      name='pool')
                else:
                    inputs = conv_layer(inputs,
                                        filter_size=[3, 5, 128, 128],
                                        stride=[1, 1],
                                        parameter_init=self.parameter_init,
                                        activation=activation,
                                        name='conv%d' % i_layer)
                    # NOTE: No pooling

                inputs = batch_normalization(inputs, is_training=is_training)
                # inputs = tf.nn.dropout(inputs, keep_prob)
                # TODO: try Weight decay

        # 5-10th layers
        with tf.variable_scope('CNN2'):
            for i_layer in range(5, 11, 1):
                if i_layer == 5:
                    inputs = conv_layer(inputs,
                                        filter_size=[3, 5, 128, 256],
                                        stride=[1, 1],
                                        parameter_init=self.parameter_init,
                                        activation=activation,
                                        name='conv1')
                    # NOTE: No pooling
                else:
                    inputs = conv_layer(inputs,
                                        filter_size=[3, 5, 256, 256],
                                        stride=[1, 1],
                                        parameter_init=self.parameter_init,
                                        activation=activation,
                                        name='conv%d' % i_layer)
                    # NOTE: No pooling

                inputs = batch_normalization(inputs, is_training=is_training)
                # inputs = tf.nn.dropout(inputs, keep_prob)
                # TODO: try Weight decay

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        new_h = math.ceil(self.num_channels / 3)
        new_w = self.splice * self.num_stack
        channel_out = inputs.shape.as_list()[-1]
        outputs = tf.reshape(
            inputs, shape=[batch_size * max_time, new_h * new_w * channel_out])
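        # Worked example (assuming num_channels = 40, splice = 5, num_stack = 2,
        # hypothetical): new_h = ceil(40 / 3) = 14, new_w = 5 * 2 = 10,
        # channel_out = 256, so each frame is flattened to 14 * 10 * 256 = 35840 features.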

        # 11-14th layers
        for i in range(1, 4, 1):
            with tf.variable_scope('fc%d' % i) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=1024,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)
            outputs = batch_normalization(outputs, is_training=is_training)
            outputs = tf.nn.dropout(outputs, keep_prob)
            # TODO: try Weight decay

        # Reshape back to 3D tensor `[B, T, 1024]`
        logits = tf.reshape(outputs, shape=[batch_size, max_time, 1024])

        if self.time_major:
            # Convert to time-major: `[T, B, 1024]`
            logits = tf.transpose(logits, [1, 0, 2])

        return logits, None
    def __call__(self, inputs, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, input_size (num_channels * splice * num_stack * 3)]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): whether the graph is built in training mode
        Returns:
            outputs: Encoder states, a tensor of size `[B, output_dim]`
        """
        # inputs: 2D tensor `[B, input_dim]`
        batch_size = tf.shape(inputs)[0]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # for debug
        # print(input_dim)  # 1200
        # print(self.num_channels)  # 40
        # print(self.splice)  # 5
        # print(self.num_stack)  # 2

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size, self.num_channels, self.splice * self.num_stack, 3])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('CNN1'):
            inputs = conv_layer(inputs,
                                filter_size=[9, 9, 3, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[3, 1],
                              stride=[3, 1],
                              name='max_pool')

        with tf.variable_scope('CNN2'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 4, 128, 256],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')

        # Reshape to 2D tensor `[B, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size, np.prod(inputs.shape.as_list()[-3:])])

        for i in range(1, 5, 1):
            with tf.variable_scope('fc%d' % (i)) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=2048,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)

        return outputs