def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * splice * num_stack * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, output_dim]`
            otherwise, `[B, T, output_dim]`
        final_state: None
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)
    # print(self.num_channels)
    # print(self.splice)
    # print(self.num_stack)

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('CNN1'):
        inputs = conv_layer(inputs,
                            filter_size=[9, 9, 3, 64],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs,
                          pooling_size=[3, 1],
                          stride=[3, 1],
                          name='max_pool')
        # TODO: try dropout

    with tf.variable_scope('CNN2'):
        inputs = conv_layer(inputs,
                            filter_size=[3, 4, 64, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs,
                          pooling_size=[1, 1],
                          stride=[1, 1],
                          name='max_pool')
        # TODO: try dropout

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    outputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time,
               np.prod(inputs.shape.as_list()[-3:])])

    for i in range(1, 3, 1):
        with tf.variable_scope('fc%d' % (i)) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=768,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            outputs = tf.nn.dropout(outputs, keep_prob)

    # Reshape back to 3D tensor `[B, T, 768]`
    outputs = tf.reshape(outputs, shape=[batch_size, max_time, 768])

    if self.time_major:
        # Convert to time-major: `[T, B, output_dim]`
        outputs = tf.transpose(outputs, [1, 0, 2])

    return outputs, None
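# --- Illustrative sketch (not part of the encoder above) ---
# Shape bookkeeping for the CNN encoder above, assuming 'SAME' padding in
# conv_layer and max_pool and a hypothetical configuration of
# num_channels=40, splice=5, num_stack=2 (input_dim = 1200).
import math

h, w, c = 40, 5 * 2, 3        # per-frame input after the 4D reshape

# CNN1: 9x9 conv (stride 1) -> 64 maps, then a 3x1 max-pool with stride 3
# along the frequency axis.
c = 64
h = math.ceil(h / 3)          # 40 -> 14 with 'SAME' padding

# CNN2: 3x4 conv -> 128 maps; the 1x1 pooling leaves h and w unchanged.
c = 128

print(h, w, c, h * w * c)     # 14 10 128 17920 -> per-frame input to fc1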
def __call__(self, inputs, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, input_size (num_channels * splice * num_stack * 3)]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states, a tensor of size `[B, output_dim]`
    """
    # inputs: 2D tensor `[B, input_dim]`
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)          # 1200
    # print(self.num_channels)  # 40
    # print(self.splice)        # 5
    # print(self.num_stack)     # 2

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('CNN1'):
        inputs = conv_layer(inputs,
                            filter_size=[9, 9, 3, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs,
                          pooling_size=[3, 1],
                          stride=[3, 1],
                          name='max_pool')

    with tf.variable_scope('CNN2'):
        inputs = conv_layer(inputs,
                            filter_size=[3, 4, 128, 256],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs,
                          pooling_size=[1, 1],
                          stride=[1, 1],
                          name='max_pool')

    # Reshape to 2D tensor `[B, new_h * new_w * C_out]`
    outputs = tf.reshape(
        inputs,
        shape=[batch_size, np.prod(inputs.shape.as_list()[-3:])])

    for i in range(1, 5, 1):
        with tf.variable_scope('fc%d' % (i)) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=2048,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

    return outputs
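# --- Illustrative sketch (not part of the encoder above) ---
# Rough size of the fully-connected stack above, under the configuration
# shown in the debug comments (num_channels=40, splice=5, num_stack=2,
# input_dim=1200) and assuming 'SAME' padding in conv_layer and max_pool.
import math

h = math.ceil(40 / 3)                    # 3x1/stride-3 max-pool: 40 -> 14
w = 5 * 2                                # no pooling along this axis
c_out = 256                              # channels after CNN2

flat = h * w * c_out                     # 35840 inputs to fc1
fc = 2048
params = flat * fc + fc                  # fc1
params += 3 * (fc * fc + fc)             # fc2-fc4
print(flat, params)                      # 35840, ~86M parameters in the FC stack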
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * (splice * num_stack) * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, output_dim]`
            otherwise, `[B, T, output_dim]`
        final_state: None
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)
    # print(self.num_channels)
    # print(self.splice)
    # print(self.num_stack)

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('VGG1'):
        for i_layer in range(1, 4, 1):
            input_channels = inputs.shape.as_list()[-1]
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, input_channels, 96],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv%d' % i_layer)
            inputs = batch_normalization(inputs, is_training=is_training)
            if i_layer == 3:
                inputs = max_pool(inputs,
                                  pooling_size=[2, 2],
                                  stride=[2, 2],
                                  name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

    with tf.variable_scope('VGG2'):
        for i_layer in range(1, 5, 1):
            input_channels = inputs.shape.as_list()[-1]
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, input_channels, 192],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv%d' % i_layer)
            inputs = batch_normalization(inputs, is_training=is_training)
            if i_layer == 4:
                inputs = max_pool(inputs,
                                  pooling_size=[2, 2],
                                  stride=[2, 2],
                                  name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

    with tf.variable_scope('VGG3'):
        for i_layer in range(1, 5, 1):
            input_channels = inputs.shape.as_list()[-1]
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, input_channels, 384],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv%d' % i_layer)
            inputs = batch_normalization(inputs, is_training=is_training)
            if i_layer == 4:
                inputs = max_pool(inputs,
                                  pooling_size=[2, 2],
                                  stride=[2, 2],
                                  name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    outputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time,
               np.prod(inputs.shape.as_list()[-3:])])

    for i_layer in range(1, 3, 1):
        with tf.variable_scope('fc%d' % i_layer) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=1024,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            if i_layer == 1:
                outputs = tf.nn.dropout(outputs, keep_prob)

    # Reshape back to 3D tensor `[B, T, 1024]`
    output_dim = outputs.shape.as_list()[-1]
    outputs = tf.reshape(
        outputs, shape=[batch_size, max_time, output_dim])

    if self.time_major:
        # Convert to time-major: `[T, B, output_dim]`
        outputs = tf.transpose(outputs, [1, 0, 2])

    return outputs, None
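# --- Illustrative sketch (not part of the encoder above) ---
# Shape bookkeeping for the three VGG blocks above, assuming 'SAME' padding
# and a hypothetical configuration of num_channels=40, splice=5, num_stack=2
# (per-frame input [40, 10, 3]).
import math

h, w, c = 40, 10, 3
for c_out in (96, 192, 384):   # VGG1, VGG2, VGG3
    c = c_out                  # 3x3 convs with stride 1 keep h and w
    h = math.ceil(h / 2)       # the 2x2/stride-2 max-pool at the end of each
    w = math.ceil(w / 2)       # block halves both axes, rounding up

print(h, w, c, h * w * c)      # 5 2 384 3840 -> per-frame input to fc1 (1024)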
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * splice * num_stack * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states, a tensor of size
            `[T, B, num_units (num_proj)]`
        final_state: A final hidden state of the encoder
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('VGG1'):
        inputs = conv_layer(inputs,
                            filter_size=[3, 3, 3, 64],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu',
                            name='conv1')
        inputs = conv_layer(inputs,
                            filter_size=[3, 3, 64, 64],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu',
                            name='conv2')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs,
                          pooling_size=[2, 2],
                          stride=[2, 2],
                          name='max_pool')
        # TODO(hirofumi): try dropout

    with tf.variable_scope('VGG2'):
        inputs = conv_layer(inputs,
                            filter_size=[3, 3, 64, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu',
                            name='conv1')
        inputs = conv_layer(inputs,
                            filter_size=[3, 3, 128, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu',
                            name='conv2')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs,
                          pooling_size=[2, 2],
                          stride=[2, 2],
                          name='max_pool')
        # TODO(hirofumi): try dropout

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    # NOTE: the two 2x2 max-pools with stride 2 each halve both axes
    new_h = math.ceil(self.num_channels / 4)
    new_w = math.ceil(self.splice * self.num_stack / 4)
    channel_out = inputs.shape.as_list()[-1]
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, new_h * new_w * channel_out])

    # Insert a linear layer to reduce the dimensionality of the CNN output
    # from (new_h * new_w * C_out) to 256
    with tf.variable_scope('bridge') as scope:
        inputs = tf.contrib.layers.fully_connected(
            inputs=inputs,
            num_outputs=256,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)

    # Dropout for the VGG-output-hidden connection
    inputs = tf.nn.dropout(inputs, keep_prob, name='dropout_pipe')

    # Reshape back to 3D tensor `[B, T, 256]`
    inputs = tf.reshape(inputs, shape=[batch_size, max_time, 256])

    initializer = tf.random_uniform_initializer(
        minval=-self.parameter_init, maxval=self.parameter_init)

    if self.lstm_impl == 'BasicLSTMCell':
        outputs, final_state = basiclstmcell(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMCell':
        outputs, final_state = lstmcell(
            self.num_units, self.num_proj, self.num_layers,
            self.use_peephole, self.clip_activation,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMBlockCell':
        outputs, final_state = lstmblockcell(
            self.num_units, self.num_layers,
            self.use_peephole, self.clip_activation,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMBlockFusedCell':
        outputs, final_state = lstmblockfusedcell(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'CudnnLSTM':
        outputs, final_state = cudnnlstm(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    else:
        raise IndexError(
            'lstm_impl is "BasicLSTMCell" or "LSTMCell" or ' +
            '"LSTMBlockCell" or "LSTMBlockFusedCell" or ' +
            '"CudnnLSTM".')

    return outputs, final_state
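# --- Illustrative sketch (not part of the encoder above) ---
# Why the 'bridge' layer exists: after the two 2x2/stride-2 max-pools the
# per-frame CNN output is still large, so it is squeezed to 256 units before
# the recurrent layers. Hypothetical configuration (num_channels=40, splice=5,
# num_stack=2), assuming 'SAME' padding:
import math

new_h = math.ceil(40 / 4)                # 40 -> 10
new_w = math.ceil(5 * 2 / 4)             # 10 -> 3
channel_out = 128                        # C_out of the VGG2 block

cnn_dim = new_h * new_w * channel_out    # 3840 per frame
print(cnn_dim, cnn_dim * 256 + 256)      # bridge weights + biases: ~983k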
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * splice * num_stack * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, output_dim]`
            otherwise, `[B, T, output_dim]`
        final_state: None
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # Choose the activation function
    activation = 'relu'
    # activation = 'prelu'
    # activation = 'maxout'
    # TODO: add prelu and maxout layers

    # 1-4th layers
    with tf.variable_scope('CNN1'):
        for i_layer in range(1, 5, 1):
            if i_layer == 1:
                inputs = conv_layer(inputs,
                                    filter_size=[3, 5, 3, 128],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv1')
                inputs = max_pool(inputs,
                                  pooling_size=[3, 1],
                                  stride=[3, 1],
                                  name='pool')
            else:
                inputs = conv_layer(inputs,
                                    filter_size=[3, 5, 128, 128],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv%d' % i_layer)
                # NOTE: No pooling
            inputs = batch_normalization(inputs, is_training=is_training)
            # inputs = tf.nn.dropout(inputs, keep_prob)
            # TODO: try weight decay

    # 5-10th layers
    with tf.variable_scope('CNN2'):
        for i_layer in range(5, 11, 1):
            if i_layer == 5:
                inputs = conv_layer(inputs,
                                    filter_size=[3, 5, 128, 256],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv1')
                # NOTE: No pooling
            else:
                inputs = conv_layer(inputs,
                                    filter_size=[3, 5, 256, 256],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv%d' % i_layer)
                # NOTE: No pooling
            inputs = batch_normalization(inputs, is_training=is_training)
            # inputs = tf.nn.dropout(inputs, keep_prob)
            # TODO: try weight decay

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    new_h = math.ceil(self.num_channels / 3)
    new_w = self.splice * self.num_stack
    channel_out = inputs.shape.as_list()[-1]
    outputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, new_h * new_w * channel_out])

    # 11-13th layers (fully-connected)
    for i in range(1, 4, 1):
        with tf.variable_scope('fc%d' % i) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=1024,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            outputs = batch_normalization(outputs, is_training=is_training)
            outputs = tf.nn.dropout(outputs, keep_prob)
            # TODO: try weight decay

    # Reshape back to 3D tensor `[B, T, 1024]`
    logits = tf.reshape(outputs, shape=[batch_size, max_time, 1024])

    if self.time_major:
        # Convert to time-major: `[T, B, 1024]`
        logits = tf.transpose(logits, [1, 0, 2])

    return logits, None
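# --- Illustrative sketch (not part of the encoder above) ---
# The encoder above lists 'prelu' as a TODO for the activation option. A
# minimal PReLU could look like the helper below; the function name and how
# it would hook into conv_layer are assumptions, not the repository's code.
import tensorflow as tf  # TF 1.x API (tf.compat.v1.* in TF 2)


def prelu(x, name='prelu'):
    """Channel-wise PReLU: max(0, x) + alpha * min(0, x) with a learned
    negative slope `alpha` per output channel."""
    with tf.variable_scope(name):
        alpha = tf.get_variable(
            'alpha',
            shape=x.shape.as_list()[-1:],
            initializer=tf.constant_initializer(0.25),
            dtype=tf.float32)
        return tf.maximum(0.0, x) + alpha * tf.minimum(0.0, x)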