def __call__(self, inputs, inputs_seq_len,
             keep_prob_input, keep_prob_hidden, keep_prob_output):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size `[B, T, input_size]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob_input (placeholder, float): A probability to keep nodes
            in the input-hidden connection
        keep_prob_hidden (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        keep_prob_output (placeholder, float): A probability to keep nodes
            in the hidden-output connection
    Returns:
        logits: A tensor of size `[T, B, num_classes]`
        final_state: None
    """
    # inputs: 3D tensor `[batch_size, max_time, input_size * splice]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]

    # Reshape to 4D tensor
    # `[batch_size * max_time, input_size / 3, splice, 3 (+Δ, ΔΔ)]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, int(self.input_size / 3), self.splice, 3])

    with tf.variable_scope('VGG1'):
        inputs = conv_layer(inputs, filter_shape=[3, 3, 3, 96],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv1')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 96, 96],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv2')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 96, 96],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv3')
        inputs = max_pool(inputs, name='max_pool')
        # TODO(hirofumi): try batch normalization

    with tf.variable_scope('VGG2'):
        inputs = conv_layer(inputs, filter_shape=[3, 3, 96, 192],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv1')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 192, 192],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv2')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 192, 192],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv3')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 192, 192],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv4')
        inputs = max_pool(inputs, name='max_pool')
        # TODO(hirofumi): try batch normalization

    with tf.variable_scope('VGG3'):
        inputs = conv_layer(inputs, filter_shape=[3, 3, 192, 384],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv1')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 384, 384],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv2')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 384, 384],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv3')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 384, 384],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv4')
        inputs = max_pool(inputs, name='max_pool')
        # TODO(hirofumi): try batch normalization

    # Reshape to 2D tensor `[batch_size * max_time, new_h * new_w * 384]`
    new_h = math.ceil(self.input_size / (3 * 2 ** 3))  # expected to be 5 or 6
    new_w = math.ceil(self.splice / (2 ** 3))  # expected to be 2
    inputs = tf.reshape(
        inputs, shape=[batch_size * max_time, new_h * new_w * 384])

    with tf.variable_scope('fc1') as scope:
        inputs = tf.contrib.layers.fully_connected(
            inputs=inputs,
            num_outputs=1024,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)

    with tf.variable_scope('fc2') as scope:
        inputs = tf.contrib.layers.fully_connected(
            inputs=inputs,
            num_outputs=1024,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)

    with tf.variable_scope('fc3') as scope:
        logits_2d = tf.contrib.layers.fully_connected(
            inputs=inputs,
            num_outputs=self.num_classes,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)

    # if self.bottleneck_dim is not None and self.bottleneck_dim != 0:
    #     with tf.variable_scope('bottleneck') as scope:
    #         outputs = tf.contrib.layers.fully_connected(
    #             outputs, self.bottleneck_dim,
    #             activation_fn=tf.nn.relu,
    #             weights_initializer=tf.truncated_normal_initializer(stddev=0.1),
    #             biases_initializer=tf.zeros_initializer(),
    #             scope=scope)
    #
    #         # Dropout for the hidden-output connections
    #         outputs = tf.nn.dropout(
    #             outputs, keep_prob_output, name='dropout_output_bottle')

    # Reshape back to 3D tensor `[batch_size, max_time, num_classes]`
    logits = tf.reshape(
        logits_2d, shape=[batch_size, max_time, self.num_classes])

    # Convert to time-major: `[max_time, batch_size, num_classes]`
    logits = tf.transpose(logits, (1, 0, 2))

    return logits, None
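# Illustrative sketch of the `new_h` / `new_w` arithmetic used above: each of
# the three VGG blocks is assumed to end in a max-pool that halves the
# frequency and splice axes, hence the division by 2 ** 3. The concrete values
# input_size=120 (40 filterbank channels x 3 for static/Δ/ΔΔ) and splice=11
# are assumptions for this example, not taken from the source.
import math

example_input_size = 120
example_splice = 11

example_new_h = math.ceil(example_input_size / (3 * 2 ** 3))  # 40 / 8 -> 5
example_new_w = math.ceil(example_splice / (2 ** 3))           # 11 / 8 -> 2
print(example_new_h, example_new_w)  # 5 2, consistent with the "expected to be 5 or 6" / "2" comments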
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * splice * num_stack * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, output_dim]`
            otherwise, `[B, T, output_dim]`
        final_state: None
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)
    # print(self.num_channels)
    # print(self.splice)
    # print(self.num_stack)

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('CNN1'):
        inputs = conv_layer(inputs, filter_size=[9, 9, 3, 64],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[3, 1], stride=[3, 1],
                          name='max_pool')
        # TODO: try dropout

    with tf.variable_scope('CNN2'):
        inputs = conv_layer(inputs, filter_size=[3, 4, 64, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')
        # TODO: try dropout

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    outputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, np.prod(inputs.shape.as_list()[-3:])])

    for i in range(1, 3, 1):
        with tf.variable_scope('fc%d' % (i)) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=768,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            outputs = tf.nn.dropout(outputs, keep_prob)

    # Reshape back to 3D tensor `[B, T, 768]`
    outputs = tf.reshape(outputs, shape=[batch_size, max_time, 768])

    if self.time_major:
        # Convert to time-major: `[T, B, output_dim]`
        outputs = tf.transpose(outputs, [1, 0, 2])

    return outputs, None
def __call__(self, inputs, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, input_size (num_channels * splice * num_stack * 3)]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states, a tensor of size `[B, output_dim]`
    """
    # inputs: 2D tensor `[B, input_dim]`
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)  # 1200
    # print(self.num_channels)  # 40
    # print(self.splice)  # 5
    # print(self.num_stack)  # 2

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('CNN1'):
        inputs = conv_layer(inputs, filter_size=[9, 9, 3, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[3, 1], stride=[3, 1],
                          name='max_pool')

    with tf.variable_scope('CNN2'):
        inputs = conv_layer(inputs, filter_size=[3, 4, 128, 256],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')

    # Reshape to 2D tensor `[B, new_h * new_w * C_out]`
    outputs = tf.reshape(
        inputs, shape=[batch_size, np.prod(inputs.shape.as_list()[-3:])])

    for i in range(1, 5, 1):
        with tf.variable_scope('fc%d' % (i)) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=2048,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

    return outputs
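# Illustrative sketch of the input layout this encoder expects, using the
# values from the debug comments above (num_channels=40, splice=5,
# num_stack=2); the batch size is an arbitrary example. NumPy stands in for
# the TensorFlow reshape only to show the shapes involved.
import numpy as np

example_num_channels, example_splice, example_num_stack = 40, 5, 2
example_batch_size = 8

x = np.random.randn(
    example_batch_size,
    example_num_channels * example_splice * example_num_stack * 3)
assert x.shape[-1] == 1200  # 40 * 5 * 2 * 3

# Same reshape the encoder performs before the first conv layer:
# `[B, input_dim]` -> `[B, num_channels, splice * num_stack, 3]`
x_4d = x.reshape(example_batch_size, example_num_channels,
                 example_splice * example_num_stack, 3)
print(x_4d.shape)  # (8, 40, 10, 3)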
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * (splice * num_stack) * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, num_units (num_proj)]`
            otherwise, `[B, T, num_units (num_proj)]`
        final_state: A final hidden state of the encoder
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)
    # print(self.num_channels)
    # print(self.splice)
    # print(self.num_stack)

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('CNN1'):
        inputs = conv_layer(inputs, filter_size=[11, 21, 3, 32],
                            stride=[3, 2],
                            parameter_init=self.parameter_init,
                            activation='relu')
        # inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')
        inputs = tf.nn.dropout(inputs, keep_prob)

    with tf.variable_scope('CNN2'):
        inputs = conv_layer(inputs, filter_size=[11, 11, 32, 32],
                            stride=[1, 2],
                            parameter_init=self.parameter_init,
                            activation='relu')
        # inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')
        inputs = tf.nn.dropout(inputs, keep_prob)

    with tf.variable_scope('CNN3'):
        inputs = conv_layer(inputs, filter_size=[3, 3, 32, 96],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        # inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')
        inputs = tf.nn.dropout(inputs, keep_prob)

    # Reshape to 3D tensor `[B, T, new_h * new_w * C_out]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size, max_time, np.prod(inputs.shape.as_list()[-3:])])

    initializer = tf.random_uniform_initializer(
        minval=-self.parameter_init, maxval=self.parameter_init)

    if self.lstm_impl == 'BasicLSTMCell':
        outputs, final_state = basiclstmcell(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMCell':
        outputs, final_state = lstmcell(
            self.num_units, self.num_proj, self.num_layers,
            self.use_peephole, self.clip_activation,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMBlockCell':
        outputs, final_state = lstmblockcell(
            self.num_units, self.num_layers,
            self.use_peephole, self.clip_activation,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMBlockFusedCell':
        outputs, final_state = lstmblockfusedcell(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'CudnnLSTM':
        outputs, final_state = cudnnlstm(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    else:
        raise IndexError(
            'lstm_impl is "BasicLSTMCell" or "LSTMCell" or ' +
            '"LSTMBlockCell" or "LSTMBlockFusedCell" or ' +
            '"CudnnLSTM".')

    # Reshape to 2D tensor `[B * T (T * B), output_dim]`
    output_dim = outputs.shape.as_list()[-1]
    outputs = tf.reshape(
        outputs, shape=[batch_size * max_time, output_dim])

    with tf.variable_scope('fc1') as scope:
        outputs = tf.contrib.layers.fully_connected(
            inputs=outputs,
            num_outputs=896,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)
        outputs = tf.nn.dropout(outputs, keep_prob)

    with tf.variable_scope('fc2') as scope:
        outputs = tf.contrib.layers.fully_connected(
            inputs=outputs,
            num_outputs=74,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)

    output_dim = outputs.shape.as_list()[-1]

    if self.time_major:
        # Reshape back to 3D tensor `[T, B, 74]`
        outputs = tf.reshape(
            outputs, shape=[max_time, batch_size, output_dim])
    else:
        # Reshape back to 3D tensor `[B, T, 74]`
        outputs = tf.reshape(
            outputs, shape=[batch_size, max_time, output_dim])

    return outputs, final_state
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * (splice * num_stack) * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, output_dim]`
            otherwise, `[B, T, output_dim]`
        final_state: None
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)
    # print(self.num_channels)
    # print(self.splice)
    # print(self.num_stack)

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('VGG1'):
        for i_layer in range(1, 4, 1):
            input_channels = inputs.shape.as_list()[-1]
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, input_channels, 96],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv%d' % i_layer)
            inputs = batch_normalization(inputs, is_training=is_training)
            if i_layer == 3:
                inputs = max_pool(inputs, pooling_size=[2, 2],
                                  stride=[2, 2], name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

    with tf.variable_scope('VGG2'):
        for i_layer in range(1, 5, 1):
            input_channels = inputs.shape.as_list()[-1]
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, input_channels, 192],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv%d' % i_layer)
            inputs = batch_normalization(inputs, is_training=is_training)
            if i_layer == 4:
                inputs = max_pool(inputs, pooling_size=[2, 2],
                                  stride=[2, 2], name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

    with tf.variable_scope('VGG3'):
        for i_layer in range(1, 5, 1):
            input_channels = inputs.shape.as_list()[-1]
            inputs = conv_layer(inputs,
                                filter_size=[3, 3, input_channels, 384],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation='relu',
                                name='conv%d' % i_layer)
            inputs = batch_normalization(inputs, is_training=is_training)
            if i_layer == 4:
                inputs = max_pool(inputs, pooling_size=[2, 2],
                                  stride=[2, 2], name='max_pool')
                inputs = tf.nn.dropout(inputs, keep_prob)

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    outputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, np.prod(inputs.shape.as_list()[-3:])])

    for i_layer in range(1, 3, 1):
        with tf.variable_scope('fc%d' % i_layer) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=1024,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            if i_layer == 1:
                outputs = tf.nn.dropout(outputs, keep_prob)

    # Reshape back to 3D tensor `[B, T, 1024]`
    output_dim = outputs.shape.as_list()[-1]
    outputs = tf.reshape(
        outputs, shape=[batch_size, max_time, output_dim])

    if self.time_major:
        # Convert to time-major: `[T, B, output_dim]`
        outputs = tf.transpose(outputs, [1, 0, 2])

    return outputs, None
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * splice * num_stack * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, output_dim]`
            otherwise, `[B, T, output_dim]`
        final_state: None
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)
    # print(self.num_channels)
    # print(self.splice)
    # print(self.num_stack)

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # Choose the activation function
    activation = 'relu'
    # activation = 'prelu'
    # activation = 'maxout'
    # TODO: add prelu and maxout layers

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    # 1st-4th layers
    for i_layer in range(1, 5, 1):
        with tf.variable_scope('CNN%d' % i_layer):
            input_channels = inputs.shape.as_list()[-1]
            inputs = conv_layer(inputs,
                                filter_size=[3, 5, input_channels, 128],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation=activation,
                                name='conv')
            # inputs = batch_normalization(inputs, is_training=is_training)
            if i_layer == 1:
                inputs = max_pool(inputs, pooling_size=[3, 1],
                                  stride=[3, 1], name='pool')
            inputs = tf.nn.dropout(inputs, keep_prob)

    # 5th-10th layers
    for i_layer in range(5, 11, 1):
        with tf.variable_scope('CNN%d' % i_layer):
            input_channels = inputs.shape.as_list()[-1]
            inputs = conv_layer(inputs,
                                filter_size=[3, 5, input_channels, 256],
                                stride=[1, 1],
                                parameter_init=self.parameter_init,
                                activation=activation,
                                name='conv')
            # inputs = batch_normalization(inputs, is_training=is_training)
            # NOTE: No pooling
            inputs = tf.nn.dropout(inputs, keep_prob)

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    outputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, np.prod(inputs.shape.as_list()[-3:])])

    # 11th-13th layers (fully-connected)
    for i_layer in range(1, 4, 1):
        with tf.variable_scope('fc%d' % i_layer) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=1024,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            if i_layer != 3:
                outputs = tf.nn.dropout(outputs, keep_prob)

    # Reshape back to 3D tensor `[B, T, 1024]`
    logits = tf.reshape(outputs, shape=[batch_size, max_time, 1024])

    if self.time_major:
        # Convert to time-major: `[T, B, 1024]`
        logits = tf.transpose(logits, [1, 0, 2])

    return logits, None
def __call__(self, inputs, inputs_seq_len,
             keep_prob_input, keep_prob_hidden, keep_prob_output):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size `[B, T, input_size]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob_input (placeholder, float): A probability to keep nodes
            in the input-hidden connection
        keep_prob_hidden (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        keep_prob_output (placeholder, float): A probability to keep nodes
            in the hidden-output connection
    Returns:
        logits: A tensor of size `[T, B, num_classes]`
        final_state: A final hidden state of the encoder
    """
    # inputs: `[B, T, input_size * splice]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]

    # Reshape to 4D tensor `[B * T, input_size / 3, splice, 3 (+Δ, ΔΔ)]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, int(self.input_size / 3), self.splice, 3])

    with tf.variable_scope('VGG1'):
        inputs = conv_layer(inputs, filter_shape=[3, 3, 3, 64],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv1')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 64, 64],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv2')
        inputs = max_pool(inputs, name='max_pool')
        # TODO(hirofumi): try batch normalization

    with tf.variable_scope('VGG2'):
        inputs = conv_layer(inputs, filter_shape=[3, 3, 64, 128],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv1')
        inputs = conv_layer(inputs, filter_shape=[3, 3, 128, 128],
                            parameter_init=self.parameter_init,
                            relu=True, name='conv2')
        inputs = max_pool(inputs, name='max_pool')
        # TODO(hirofumi): try batch normalization

    # Reshape to 2D tensor `[B * T, new_h * new_w * 128]`
    new_h = math.ceil(self.input_size / 3 / 4)  # expected to be 11 or 10
    new_w = math.ceil(self.splice / 4)  # expected to be 3
    inputs = tf.reshape(
        inputs, shape=[batch_size * max_time, new_h * new_w * 128])

    # Insert a linear layer to reduce the CNN output dimension
    # from (new_h * new_w * 128) to 256
    with tf.variable_scope('linear') as scope:
        inputs = tf.contrib.layers.fully_connected(
            inputs=inputs,
            num_outputs=256,
            activation_fn=tf.nn.relu,
            scope=scope)

    # Dropout for the VGG-output-hidden connection
    inputs = tf.nn.dropout(inputs, keep_prob_input, name='dropout_input')

    # Reshape back to 3D tensor `[B, T, 256]`
    inputs = tf.reshape(inputs, shape=[batch_size, max_time, 256])

    initializer = tf.random_uniform_initializer(
        minval=-self.parameter_init, maxval=self.parameter_init)

    # Hidden layers
    lstm_list = []
    with tf.variable_scope('multi_lstm', initializer=initializer) as scope:
        for i_layer in range(1, self.num_layers + 1, 1):
            if self.lstm_impl == 'BasicLSTMCell':
                lstm = tf.contrib.rnn.BasicLSTMCell(
                    self.num_units,
                    forget_bias=1.0,
                    state_is_tuple=True,
                    activation=tf.tanh)
            elif self.lstm_impl == 'LSTMCell':
                lstm = tf.contrib.rnn.LSTMCell(
                    self.num_units,
                    use_peepholes=self.use_peephole,
                    cell_clip=self.clip_activation,
                    num_proj=self.num_proj,
                    forget_bias=1.0,
                    state_is_tuple=True)
            elif self.lstm_impl == 'LSTMBlockCell':
                # NOTE: This should be faster than tf.contrib.rnn.LSTMCell
                lstm = tf.contrib.rnn.LSTMBlockCell(
                    self.num_units,
                    forget_bias=1.0,
                    # clip_cell=True,
                    use_peephole=self.use_peephole)
                # TODO: cell clipping (update for rc1.3)
            elif self.lstm_impl == 'LSTMBlockFusedCell':
                raise NotImplementedError
            elif self.lstm_impl == 'CudnnLSTM':
                raise NotImplementedError
            else:
                raise IndexError(
                    'lstm_impl is "BasicLSTMCell" or "LSTMCell" or ' +
                    '"LSTMBlockCell" or "LSTMBlockFusedCell" or ' +
                    '"CudnnLSTM".')

            # Dropout for the hidden-hidden connections
            lstm = tf.contrib.rnn.DropoutWrapper(
                lstm, output_keep_prob=keep_prob_hidden)

            lstm_list.append(lstm)

        # Stack multiple cells
        stacked_lstm = tf.contrib.rnn.MultiRNNCell(
            lstm_list, state_is_tuple=True)

        outputs, final_state = tf.nn.dynamic_rnn(
            cell=stacked_lstm,
            inputs=inputs,
            sequence_length=inputs_seq_len,
            dtype=tf.float32,
            scope=scope)
        # NOTE: initial states are zero states by default

    if self.return_hidden_states:
        return outputs, final_state

    # Reshape to apply the same weights over the timesteps
    if self.num_proj is None:
        outputs = tf.reshape(outputs, shape=[-1, self.num_units])
    else:
        outputs = tf.reshape(outputs, shape=[-1, self.num_proj])

    if self.bottleneck_dim is not None and self.bottleneck_dim != 0:
        with tf.variable_scope('bottleneck') as scope:
            outputs = tf.contrib.layers.fully_connected(
                outputs, self.bottleneck_dim,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

            # Dropout for the hidden-output connections
            outputs = tf.nn.dropout(
                outputs, keep_prob_output, name='dropout_output_bottle')

    with tf.variable_scope('output') as scope:
        logits_2d = tf.contrib.layers.fully_connected(
            outputs, self.num_classes,
            activation_fn=None,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)

    # Reshape back to the original shape
    logits = tf.reshape(
        logits_2d, shape=[batch_size, -1, self.num_classes])

    # Convert to time-major: `[T, B, num_classes]`
    logits = tf.transpose(logits, (1, 0, 2))

    # Dropout for the hidden-output connections
    logits = tf.nn.dropout(
        logits, keep_prob_output, name='dropout_output')

    return logits, final_state
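# Hedged sketch: the logits returned above are time-major, `[T, B, num_classes]`,
# which is the layout tf.nn.ctc_loss expects with its default time_major=True.
# Whether this particular encoder is trained with CTC is an assumption of this
# example; the placeholder shapes below are illustrative only.
import tensorflow as tf

example_num_classes = 30  # example vocabulary size (including the CTC blank)

example_logits = tf.placeholder(
    tf.float32, shape=[None, None, example_num_classes])   # `[T, B, num_classes]`
example_labels = tf.sparse_placeholder(tf.int32)            # sparse target label indices
example_inputs_seq_len = tf.placeholder(tf.int32, [None])   # `[B]`

example_ctc_loss = tf.reduce_mean(
    tf.nn.ctc_loss(labels=example_labels,
                   inputs=example_logits,
                   sequence_length=example_inputs_seq_len,
                   time_major=True))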
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * splice * num_stack * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states, a tensor of size
            `[T, B, num_units (num_proj)]`
        final_state: A final hidden state of the encoder
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('VGG1'):
        inputs = conv_layer(inputs, filter_size=[3, 3, 3, 64],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu', name='conv1')
        inputs = conv_layer(inputs, filter_size=[3, 3, 64, 64],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu', name='conv2')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[2, 2], stride=[2, 2],
                          name='max_pool')
        # TODO(hirofumi): try dropout

    with tf.variable_scope('VGG2'):
        inputs = conv_layer(inputs, filter_size=[3, 3, 64, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu', name='conv1')
        inputs = conv_layer(inputs, filter_size=[3, 3, 128, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu', name='conv2')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[2, 2], stride=[2, 2],
                          name='max_pool')
        # TODO(hirofumi): try dropout

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    new_h = math.ceil(self.num_channels / 4)
    new_w = math.ceil(self.splice * self.num_stack / 4)
    channel_out = inputs.shape.as_list()[-1]
    inputs = tf.reshape(
        inputs, shape=[batch_size * max_time, new_h * new_w * channel_out])

    # Insert a linear layer to reduce the CNN output dimension
    # from (new_h * new_w * C_out) to 256
    with tf.variable_scope('bridge') as scope:
        inputs = tf.contrib.layers.fully_connected(
            inputs=inputs,
            num_outputs=256,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)

    # Dropout for the VGG-output-hidden connection
    inputs = tf.nn.dropout(inputs, keep_prob, name='dropout_pipe')

    # Reshape back to 3D tensor `[B, T, 256]`
    inputs = tf.reshape(inputs, shape=[batch_size, max_time, 256])

    initializer = tf.random_uniform_initializer(
        minval=-self.parameter_init, maxval=self.parameter_init)

    if self.lstm_impl == 'BasicLSTMCell':
        outputs, final_state = basiclstmcell(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMCell':
        outputs, final_state = lstmcell(
            self.num_units, self.num_proj, self.num_layers,
            self.use_peephole, self.clip_activation,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMBlockCell':
        outputs, final_state = lstmblockcell(
            self.num_units, self.num_layers,
            self.use_peephole, self.clip_activation,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMBlockFusedCell':
        outputs, final_state = lstmblockfusedcell(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'CudnnLSTM':
        outputs, final_state = cudnnlstm(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    else:
        raise IndexError(
            'lstm_impl is "BasicLSTMCell" or "LSTMCell" or ' +
            '"LSTMBlockCell" or "LSTMBlockFusedCell" or ' +
            '"CudnnLSTM".')

    return outputs, final_state
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * (splice * num_stack) * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, num_units (num_proj)]`
            otherwise, `[B, T, num_units (num_proj)]`
        final_state: A final hidden state of the encoder
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)
    # print(self.num_channels)
    # print(self.splice)
    # print(self.num_stack)

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('CNN1'):
        inputs = conv_layer(inputs, filter_size=[11, 21, 3, 32],
                            stride=[3, 2],
                            parameter_init=self.parameter_init,
                            activation='relu')
        # inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')
        inputs = tf.nn.dropout(inputs, keep_prob)

    with tf.variable_scope('CNN2'):
        inputs = conv_layer(inputs, filter_size=[11, 11, 32, 32],
                            stride=[1, 2],
                            parameter_init=self.parameter_init,
                            activation='relu')
        # inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')
        inputs = tf.nn.dropout(inputs, keep_prob)

    with tf.variable_scope('CNN3'):
        inputs = conv_layer(inputs, filter_size=[3, 3, 32, 96],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        # inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')
        inputs = tf.nn.dropout(inputs, keep_prob)

    # Reshape to 3D tensor `[B, T, new_h * new_w * C_out]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size, max_time, np.prod(inputs.shape.as_list()[-3:])])

    initializer = tf.random_uniform_initializer(
        minval=-self.parameter_init, maxval=self.parameter_init)

    if self.lstm_impl == 'BasicLSTMCell':
        outputs, final_state = basiclstmcell(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMCell':
        outputs, final_state = lstmcell(
            self.num_units, self.num_proj, self.num_layers,
            self.use_peephole, self.clip_activation,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMBlockCell':
        outputs, final_state = lstmblockcell(
            self.num_units, self.num_layers,
            self.use_peephole, self.clip_activation,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'LSTMBlockFusedCell':
        outputs, final_state = lstmblockfusedcell(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    elif self.lstm_impl == 'CudnnLSTM':
        outputs, final_state = cudnnlstm(
            self.num_units, self.num_layers,
            inputs, inputs_seq_len, keep_prob, initializer,
            self.time_major)
    else:
        raise IndexError(
            'lstm_impl is "BasicLSTMCell" or "LSTMCell" or ' +
            '"LSTMBlockCell" or "LSTMBlockFusedCell" or ' +
            '"CudnnLSTM".')

    # Reshape to 2D tensor `[B * T (T * B), output_dim]`
    output_dim = outputs.shape.as_list()[-1]
    outputs = tf.reshape(
        outputs, shape=[batch_size * max_time, output_dim])

    with tf.variable_scope('fc1') as scope:
        outputs = tf.contrib.layers.fully_connected(
            inputs=outputs,
            num_outputs=896,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)
        outputs = tf.nn.dropout(outputs, keep_prob)

    with tf.variable_scope('fc2') as scope:
        outputs = tf.contrib.layers.fully_connected(
            inputs=outputs,
            num_outputs=74,
            activation_fn=tf.nn.relu,
            weights_initializer=tf.truncated_normal_initializer(
                stddev=self.parameter_init),
            biases_initializer=tf.zeros_initializer(),
            scope=scope)

    output_dim = outputs.shape.as_list()[-1]

    if self.time_major:
        # Reshape back to 3D tensor `[T, B, 74]`
        outputs = tf.reshape(
            outputs, shape=[max_time, batch_size, output_dim])
    else:
        # Reshape back to 3D tensor `[B, T, 74]`
        outputs = tf.reshape(
            outputs, shape=[batch_size, max_time, output_dim])

    return outputs, final_state
def __call__(self, inputs, inputs_seq_len, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, T, input_size (num_channels * splice * num_stack * 3)]`
        inputs_seq_len (placeholder): A tensor of size `[B]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states.
            if time_major is True, a tensor of size `[T, B, output_dim]`
            otherwise, `[B, T, output_dim]`
        final_state: None
    """
    # inputs: 3D tensor `[B, T, input_dim]`
    batch_size = tf.shape(inputs)[0]
    max_time = tf.shape(inputs)[1]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B * T, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size * max_time, self.num_channels,
               self.splice * self.num_stack, 3])

    # Choose the activation function
    activation = 'relu'
    # activation = 'prelu'
    # activation = 'maxout'
    # TODO: add prelu and maxout layers

    # 1st-4th layers
    with tf.variable_scope('CNN1'):
        for i_layer in range(1, 5, 1):
            if i_layer == 1:
                inputs = conv_layer(inputs, filter_size=[3, 5, 3, 128],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv1')
                inputs = max_pool(inputs, pooling_size=[3, 1],
                                  stride=[3, 1], name='pool')
            else:
                inputs = conv_layer(inputs, filter_size=[3, 5, 128, 128],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv%d' % i_layer)
                # NOTE: No pooling
            inputs = batch_normalization(inputs, is_training=is_training)
            # inputs = tf.nn.dropout(inputs, keep_prob)
            # TODO: try weight decay

    # 5th-10th layers
    with tf.variable_scope('CNN2'):
        for i_layer in range(5, 11, 1):
            if i_layer == 5:
                inputs = conv_layer(inputs, filter_size=[3, 5, 128, 256],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv1')
                # NOTE: No pooling
            else:
                inputs = conv_layer(inputs, filter_size=[3, 5, 256, 256],
                                    stride=[1, 1],
                                    parameter_init=self.parameter_init,
                                    activation=activation,
                                    name='conv%d' % i_layer)
                # NOTE: No pooling
            inputs = batch_normalization(inputs, is_training=is_training)
            # inputs = tf.nn.dropout(inputs, keep_prob)
            # TODO: try weight decay

    # Reshape to 2D tensor `[B * T, new_h * new_w * C_out]`
    new_h = math.ceil(self.num_channels / 3)
    new_w = self.splice * self.num_stack
    channel_out = inputs.shape.as_list()[-1]
    outputs = tf.reshape(
        inputs, shape=[batch_size * max_time, new_h * new_w * channel_out])

    # 11th-13th layers (fully-connected)
    for i in range(1, 4, 1):
        with tf.variable_scope('fc%d' % i) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=1024,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)
            outputs = batch_normalization(outputs, is_training=is_training)
            outputs = tf.nn.dropout(outputs, keep_prob)
            # TODO: try weight decay

    # Reshape back to 3D tensor `[B, T, 1024]`
    logits = tf.reshape(outputs, shape=[batch_size, max_time, 1024])

    if self.time_major:
        # Convert to time-major: `[T, B, 1024]`
        logits = tf.transpose(logits, [1, 0, 2])

    return logits, None
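# Illustrative check of the flattened dimension computed above, assuming the
# example values num_channels=40, splice=5, num_stack=2 used elsewhere in this
# section; channel_out=256 comes from the last conv layer. Only the first
# layer pools with size/stride [3, 1], so the channel axis shrinks from 40 to
# ceil(40 / 3) = 14 while the splice * num_stack axis stays at 10.
import math

example_num_channels, example_splice, example_num_stack = 40, 5, 2
example_channel_out = 256

example_new_h = math.ceil(example_num_channels / 3)          # 14
example_new_w = example_splice * example_num_stack           # 10
print(example_new_h * example_new_w * example_channel_out)   # 35840, the fc input dimension per frame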
def __call__(self, inputs, keep_prob, is_training):
    """Construct model graph.
    Args:
        inputs (placeholder): A tensor of size
            `[B, input_size (num_channels * splice * num_stack * 3)]`
        keep_prob (placeholder, float): A probability to keep nodes
            in the hidden-hidden connection
        is_training (bool):
    Returns:
        outputs: Encoder states, a tensor of size `[B, output_dim]`
    """
    # inputs: 2D tensor `[B, input_dim]`
    batch_size = tf.shape(inputs)[0]
    input_dim = inputs.shape.as_list()[-1]
    # NOTE: input_dim: num_channels * splice * num_stack * 3

    # For debug
    # print(input_dim)  # 1200
    # print(self.num_channels)  # 40
    # print(self.splice)  # 5
    # print(self.num_stack)  # 2

    assert input_dim == self.num_channels * self.splice * self.num_stack * 3

    # Reshape to 4D tensor `[B, num_channels, splice * num_stack, 3]`
    inputs = tf.reshape(
        inputs,
        shape=[batch_size, self.num_channels,
               self.splice * self.num_stack, 3])

    # NOTE: filter_size: `[H, W, C_in, C_out]`
    with tf.variable_scope('CNN1'):
        inputs = conv_layer(inputs, filter_size=[9, 9, 3, 128],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[3, 1], stride=[3, 1],
                          name='max_pool')

    with tf.variable_scope('CNN2'):
        inputs = conv_layer(inputs, filter_size=[3, 4, 128, 256],
                            stride=[1, 1],
                            parameter_init=self.parameter_init,
                            activation='relu')
        inputs = batch_normalization(inputs, is_training=is_training)
        inputs = max_pool(inputs, pooling_size=[1, 1], stride=[1, 1],
                          name='max_pool')

    # Reshape to 2D tensor `[B, new_h * new_w * C_out]`
    outputs = tf.reshape(
        inputs, shape=[batch_size, np.prod(inputs.shape.as_list()[-3:])])

    for i in range(1, 5, 1):
        with tf.variable_scope('fc%d' % (i)) as scope:
            outputs = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=2048,
                activation_fn=tf.nn.relu,
                weights_initializer=tf.truncated_normal_initializer(
                    stddev=self.parameter_init),
                biases_initializer=tf.zeros_initializer(),
                scope=scope)

    return outputs