def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * splice * num_stack * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): Whether the graph is being built for training
                (controls batch normalization behavior)
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size `[T, B, output_dim]`
                otherwise, `[B, T, output_dim]`
            final_state: None
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # For debug
        # print(input_dim)
        # print(self.num_channels)
        # print(self.splice)
        # print(self.num_stack)

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size * max_time, self.num_channels, self.splice * self.num_stack, 3])
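        # Illustration only: with the example values noted later in this file
        # (num_channels=40, splice=5, num_stack=2), a `[B, T, 1200]` input is
        # folded into `[B * T, 40, 10, 3]`, so every time step is treated as an
        # independent 40x10 "image" with 3 channels by the 2D convolutions below.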

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('CNN1'):
            inputs = conv_layer(inputs,
                                filter_size=[9, 9, 3, 64],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[3, 1],
                              stride=[3, 1],
                              name='max_pool')
            # TODO: try dropout

        with tf.variable_scope('CNN2'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 4, 64, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')
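            # NOTE: a 1x1 pool with stride 1 leaves the feature map unchanged,
            # so this max_pool is effectively an identity op kept for symmetry
            # with CNN1.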
            # TODO: try dropout

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size * max_time, np.prod(inputs.shape.as_list()[-3:])])
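        # NOTE: np.prod over the static shape requires the conv output's height,
        # width, and channel dims to be known at graph-construction time; only
        # the batch * time axis may be dynamic here.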

        for i in range(1, 3, 1):
            with tf.variable_scope('fc%d' % (i)) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=768,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)
                outputs = tf.nn.dropout(outputs, keep_prob)

        # Reshape back to 3D tensor `[B, T, 768]`
        outputs = tf.reshape(
            outputs, shape=[batch_size, max_time, 768])

        if self.time_major:
            # Convert to time-major: `[T, B, output_dim]`
            outputs = tf.transpose(outputs, [1, 0, 2])

        return outputs, None
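    # A minimal usage sketch (hypothetical names; `CNNEncoder` stands in for
    # whichever class defines the __call__ above, and its constructor arguments
    # are assumed):
    #
    #     encoder = CNNEncoder(...)
    #     inputs = tf.placeholder(tf.float32, shape=[None, None, 40 * 5 * 2 * 3])
    #     inputs_seq_len = tf.placeholder(tf.int32, shape=[None])
    #     keep_prob = tf.placeholder(tf.float32)
    #     outputs, _ = encoder(inputs, inputs_seq_len, keep_prob, is_training=True)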
    def __call__(self, inputs, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, input_size (num_channels * splice * num_stack * 3)]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): Whether the graph is being built for training
                (controls batch normalization behavior)
        Returns:
            outputs: Encoder states, a tensor of size `[B, output_dim]`
        """
        # inputs: 2D tensor `[B, input_dim]`
        batch_size = tf.shape(inputs)[0]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # for debug
        # print(input_dim)  # 1200
        # print(self.num_channels)  # 40
        # print(self.splice)  # 5
        # print(self.num_stack)  # 2

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(inputs,
                            shape=[
                                batch_size, self.num_channels,
                                self.splice * self.num_stack, 3
                            ])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('CNN1'):
            inputs = conv_layer(inputs,
                                filter_size=[9, 9, 3, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[3, 1],
                              stride=[3, 1],
                              name='max_pool')

        with tf.variable_scope('CNN2'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 4, 128, 256],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')

        # Reshape to 2D tensor `[B, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size,
                           np.prod(inputs.shape.as_list()[-3:])])

        for i in range(1, 5, 1):
            with tf.variable_scope('fc%d' % (i)) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=2048,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)

        return outputs
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * (splice * num_stack) * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): Whether the graph is being built for training
                (controls batch normalization behavior)
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size `[T, B, output_dim]`
                otherwise, `[B, T, output_dim]`
            final_state: None
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # For debug
        # print(input_dim)
        # print(self.num_channels)
        # print(self.splice)
        # print(self.num_stack)

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size * max_time, self.num_channels, self.splice * self.num_stack, 3])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('VGG1'):
            for i_layer in range(1, 4, 1):
                input_channels = inputs.shape.as_list()[-1]
                inputs = conv_layer(inputs,
                                    filter_size=[3, 3, input_channels, 96],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation='relu',
                                    name='conv%d' % i_layer)
                inputs = batch_normalization(inputs, is_training=is_training)
                if i_layer == 3:
                    inputs = max_pool(inputs,
                                      pooling_size=[2, 2],
                                      stride=[2, 2],
                                      name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

        with tf.variable_scope('VGG2'):
            for i_layer in range(1, 5, 1):
                input_channels = inputs.shape.as_list()[-1]
                inputs = conv_layer(inputs,
                                    filter_size=[3, 3, input_channels, 192],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation='relu',
                                    name='conv%d' % i_layer)
                inputs = batch_normalization(inputs, is_training=is_training)
                if i_layer == 4:
                    inputs = max_pool(inputs,
                                      pooling_size=[2, 2],
                                      stride=[2, 2],
                                      name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

        with tf.variable_scope('VGG3'):
            for i_layer in range(1, 5, 1):
                input_channels = inputs.shape.as_list()[-1]
                inputs = conv_layer(inputs,
                                    filter_size=[3, 3, input_channels, 384],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation='relu',
                                    name='conv%d' % i_layer)
                inputs = batch_normalization(inputs, is_training=is_training)
                if i_layer == 4:
                    inputs = max_pool(inputs,
                                      pooling_size=[2, 2],
                                      stride=[2, 2],
                                      name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size * max_time, np.prod(inputs.shape.as_list()[-3:])])
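        # NOTE: each VGG block above ends in a 2x2/stride-2 max-pool, so after
        # three blocks both spatial dims have shrunk by roughly a factor of 8
        # (exact ceil-division, assuming 'SAME' padding), and C_out is 384.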

        for i_layer in range(1, 3, 1):
            with tf.variable_scope('fc%d' % i_layer) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=1024,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)
                if i_layer == 1:
                    outputs = tf.nn.dropout(outputs, keep_prob)

        # Reshape back to 3D tensor `[B, T, 1024]`
        output_dim = outputs.shape.as_list()[-1]
        outputs = tf.reshape(
            outputs, shape=[batch_size, max_time, output_dim])

        if self.time_major:
            # Convert to time-major: `[T, B, output_dim]`
            outputs = tf.transpose(outputs, [1, 0, 2])

        return outputs, None
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * splice * num_stack * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): Whether the graph is being built for training
                (controls batch normalization behavior)
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size `[T, B, num_units (num_proj)]`
                otherwise, `[B, T, num_units (num_proj)]`
            final_state: A final hidden state of the encoder
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(inputs,
                            shape=[
                                batch_size * max_time, self.num_channels,
                                self.splice * self.num_stack, 3
                            ])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('VGG1'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 3, 64],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 64, 64],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv2')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[2, 2],
                              stride=[2, 2],
                              name='max_pool')
            # TODO(hirofumi): try dropout

        with tf.variable_scope('VGG2'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 64, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv1')
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, 128, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv2')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[2, 2],
                              stride=[2, 2],
                              name='max_pool')
            # TODO(hirofumi): try dropout

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        new_h = math.ceil(self.num_channels / 4)
        new_w = math.ceil(self.splice * self.num_stack / 4)
        channel_out = inputs.shape.as_list()[-1]
        inputs = tf.reshape(
            inputs, shape=[batch_size * max_time, new_h * new_w * channel_out])
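        # NOTE: the division by 4 reflects the two 2x2/stride-2 max-pools in
        # VGG1 and VGG2, each of which halves both spatial dims (hence the
        # ceil, assuming 'SAME' padding in conv_layer and max_pool).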

        # Insert a linear layer to reduce the CNN output dimension
        # from (new_h * new_w * C_out) to 256
        with tf.variable_scope('bridge') as scope:
            inputs = tf.contrib.layers.fully_connected(
                inputs=inputs,
                num_outputs=256,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

        # Dropout for the VGG-output-hidden connection
        inputs = tf.nn.dropout(inputs, keep_prob, name='dropout_pipe')

        # Reshape back to 3D tensor `[B, T, 256]`
        inputs = tf.reshape(inputs, shape=[batch_size, max_time, 256])
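        # NOTE: after the bridge layer each time step is a 256-dim feature
        # vector, which is the per-step input size of the recurrent layers below.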

        initializer = tf.random_uniform_initializer(
            minval=-self.parameter_init, maxval=self.parameter_init)

        if self.lstm_impl == 'BasicLSTMCell':
            outputs, final_state = basiclstmcell(self.num_units,
                                                 self.num_layers, inputs,
                                                 inputs_seq_len, keep_prob,
                                                 initializer, self.time_major)

        elif self.lstm_impl == 'LSTMCell':
            outputs, final_state = lstmcell(self.num_units, self.num_proj,
                                            self.num_layers, self.use_peephole,
                                            self.clip_activation, inputs,
                                            inputs_seq_len, keep_prob,
                                            initializer, self.time_major)

        elif self.lstm_impl == 'LSTMBlockCell':
            outputs, final_state = lstmblockcell(self.num_units,
                                                 self.num_layers,
                                                 self.use_peephole,
                                                 self.clip_activation, inputs,
                                                 inputs_seq_len, keep_prob,
                                                 initializer, self.time_major)

        elif self.lstm_impl == 'LSTMBlockFusedCell':
            outputs, final_state = lstmblockfusedcell(self.num_units,
                                                      self.num_layers, inputs,
                                                      inputs_seq_len,
                                                      keep_prob, initializer,
                                                      self.time_major)

        elif self.lstm_impl == 'CudnnLSTM':
            outputs, final_state = cudnnlstm(self.num_units, self.num_layers,
                                             inputs, inputs_seq_len, keep_prob,
                                             initializer, self.time_major)
        else:
            raise ValueError(
                'lstm_impl must be "BasicLSTMCell", "LSTMCell", "LSTMBlockCell", '
                '"LSTMBlockFusedCell", or "CudnnLSTM".')

        return outputs, final_state
    def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, T, input_size (num_channels * splice * num_stack * 3)]`
            inputs_seq_len (placeholder): A tensor of size `[B]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): Whether the graph is being built for training
                (controls batch normalization behavior)
        Returns:
            outputs: Encoder states.
                if time_major is True, a tensor of size `[T, B, output_dim]`
                otherwise, `[B, T, output_dim]`
            final_state: None
        """
        # inputs: 3D tensor `[B, T, input_dim]`
        batch_size = tf.shape(inputs)[0]
        max_time = tf.shape(inputs)[1]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(inputs,
                            shape=[
                                batch_size * max_time, self.num_channels,
                                self.splice * self.num_stack, 3
                            ])

        # Choose the activation function
        activation = 'relu'
        # activation = 'prelu'
        # activation = 'maxout'
        # TODO: add prelu and maxout layers

        # 1st-4th layers
        with tf.variable_scope('CNN1'):
            for i_layer in range(1, 5, 1):
                if i_layer == 1:
                    inputs = conv_layer(inputs,
                                        filter_size=[3, 5, 3, 128],
                                        stride=[1, 1],
                                        parameter_init=self.parameter_init,
                                        activation=activation,
                                        name='conv1')
                    inputs = max_pool(inputs,
                                      pooling_size=[3, 1],
                                      stride=[3, 1],
                                      name='pool')
                else:
                    inputs = conv_layer(inputs,
                                        filter_size=[3, 5, 128, 128],
                                        stride=[1, 1],
                                        parameter_init=self.parameter_init,
                                        activation=activation,
                                        name='conv%d' % i_layer)
                    # NOTE: No pooling

                inputs = batch_normalization(inputs, is_training=is_training)
                # inputs = tf.nn.dropout(inputs, keep_prob)
                # TODO: try weight decay

        # 5th-10th layers
        with tf.variable_scope('CNN2'):
            for i_layer in range(5, 11, 1):
                if i_layer == 5:
                    inputs = conv_layer(inputs,
                                        filter_size=[3, 5, 128, 256],
                                        stride=[1, 1],
                                        parameter_init=self.parameter_init,
                                        activation=activation,
                                        name='conv1')
                    # NOTE: No pooling
                else:
                    inputs = conv_layer(inputs,
                                        filter_size=[3, 5, 256, 256],
                                        stride=[1, 1],
                                        parameter_init=self.parameter_init,
                                        activation=activation,
                                        name='conv%d' % i_layer)
                    # NOTE: No pooling

                inputs = batch_normalization(inputs, is_training=is_training)
                # inputs = tf.nn.dropout(inputs, keep_prob)
                # TODO: try weight decay

        # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
        new_h = math.ceil(self.num_channels / 3)
        new_w = self.splice * self.num_stack
        channel_out = inputs.shape.as_list()[-1]
        outputs = tf.reshape(
            inputs, shape=[batch_size * max_time, new_h * new_w * channel_out])
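        # NOTE: only the first layer of CNN1 pools (size [3, 1], stride [3, 1]),
        # so the frequency axis shrinks to ceil(num_channels / 3) while the
        # splice/stack axis keeps its full width (assuming 'SAME' padding).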

        # 11th-14th layers
        for i in range(1, 4, 1):
            with tf.variable_scope('fc%d' % i) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=1024,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)
                outputs = batch_normalization(outputs, is_training=is_training)
                outputs = tf.nn.dropout(outputs, keep_prob)
                # TODO: try weight decay

        # Reshape back to 3D tensor `[B, T, 1024]`
        outputs = tf.reshape(outputs, shape=[batch_size, max_time, 1024])

        if self.time_major:
            # Convert to time-major: `[T, B, 1024]`
            outputs = tf.transpose(outputs, [1, 0, 2])

        return outputs, None
    def __call__(self, inputs, keep_prob, is_training):
        """Construct model graph.
        Args:
            inputs (placeholder): A tensor of size
                `[B, input_size (num_channels * splice * num_stack * 3)]`
            keep_prob (placeholder, float): A probability to keep nodes
                in the hidden-hidden connection
            is_training (bool): Whether the graph is being built for training
                (controls batch normalization behavior)
        Returns:
            outputs: Encoder states, a tensor of size `[B, output_dim]`
        """
        # inputs: 2D tensor `[B, input_dim]`
        batch_size = tf.shape(inputs)[0]
        input_dim = inputs.shape.as_list()[-1]
        # NOTE: input_dim: num_channels * splice * num_stack * 3

        # for debug
        # print(input_dim)  # 1200
        # print(self.num_channels)  # 40
        # print(self.splice)  # 5
        # print(self.num_stack)  # 2

        assert input_dim == self.num_channels * self.splice * self.num_stack * 3

        # Reshape to 4D tensor `[B, num_channels, splice * num_stack, 3]`
        inputs = tf.reshape(
            inputs,
            shape=[batch_size, self.num_channels, self.splice * self.num_stack, 3])

        # NOTE: filter_size: `[H, W, C_in, C_out]`
        with tf.variable_scope('CNN1'):
            inputs = conv_layer(inputs,
                                filter_size=[9, 9, 3, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[3, 1],
                              stride=[3, 1],
                              name='max_pool')

        with tf.variable_scope('CNN2'):
            inputs = conv_layer(inputs,
                                filter_size=[3, 4, 128, 256],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu')
            inputs = batch_normalization(inputs, is_training=is_training)
            inputs = max_pool(inputs,
                              pooling_size=[1, 1],
                              stride=[1, 1],
                              name='max_pool')

        # Reshape to 2D tensor `[B, new_h * new_w * C_out]`
        outputs = tf.reshape(
            inputs, shape=[batch_size, np.prod(inputs.shape.as_list()[-3:])])

        for i in range(1, 5, 1):
            with tf.variable_scope('fc%d' % (i)) as scope:
                outputs = tf.contrib.layers.fully_connected(
                    inputs=outputs,
                    num_outputs=2048,
                    activation_fn=tf.nn.relu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=self.parameter_init),
                    biases_initializer=tf.zeros_initializer(),
                    scope=scope)

        return outputs