def __call__(self, inputs, state, scope=None):
    """Run one step of the batch-normalised recurrent cell.

    Computes ``relux(seq_batch_norm(inputs . W^T) + state . U^T + B, capping=20)``
    and hands the result back as both the cell output and the next state.

    Args:
      inputs: input tensor (batch_size x feature_size per the original notes);
        its dimension 1 sizes the second axis of ``W``.
      state: previous state tensor (batch_size x num_units per the original
        notes); its dimension 1 sizes the second axis of ``U``.
      scope: optional variable scope; defaults to the class name.

    Returns:
      A pair ``(output, new_state)`` that contains the same tensor twice.

    Variables:
      W: [num_units, inputs.shape[1]], orthogonal initializer.
      U: [num_units, state.shape[1]], orthogonal initializer.
      B: [num_units], constant 0.
      Both weight matrices are applied transposed (``transpose_b=True``).
    """
    with tf.variable_scope(scope or type(self).__name__):
        # Input projection, followed by sequence batch normalisation.
        input_dim = inputs.get_shape()[1]
        input_weights = _variable_on_cpu(
            'W', [self._num_units, input_dim],
            initializer=tf.orthogonal_initializer())
        input_proj = tf.matmul(inputs, input_weights,
                               transpose_a=False, transpose_b=True)
        normed_input_proj = seq_batch_norm(input_proj)

        # Recurrent projection of the previous state (not normalised).
        state_dim = state.get_shape()[1]
        recurrent_weights = _variable_on_cpu(
            'U', [self._num_units, state_dim],
            initializer=tf.orthogonal_initializer())
        state_proj = tf.matmul(state, recurrent_weights,
                               transpose_a=False, transpose_b=True)

        summed = tf.add(normed_input_proj, state_proj)
        bias = _variable_on_cpu('B', [self._num_units],
                                tf.constant_initializer(0))
        pre_activation = tf.add(summed, bias)

        output = relux(pre_activation, capping=20)
    return output, output
def seq_batch_norm(x, scope=None, is_train=True):
    """Sequence batch normalization for an N * D input.

    Statistics are taken over axis 0 of ``x``.  In training mode the batch
    moments are used for normalisation and the moving averages are updated
    (decay 0.997) as a side effect of evaluating the result.  Otherwise the
    stored moving averages are used.

    Args:
      x: tensor to normalise; the size of its last dimension sizes every
        parameter created here.
      scope: unused; variables always live under the "sbn" variable scope.
      is_train: Python bool selecting training (True) or inference (False)
        behaviour.  It is tested with a plain ``if``, so a tensor is not
        supported.

    Returns:
      The normalised tensor.
    """
    with tf.name_scope(None):
        with tf.variable_scope("sbn", reuse=None):
            param_shape = x.get_shape()[-1]

            # beta/gamma are created non-trainable, so they stay at 0 and 1.
            beta = _variable_on_cpu('beta', [param_shape],
                                    initializer=tf.zeros_initializer(),
                                    trainable=False)
            gamma = _variable_on_cpu('gamma', [param_shape],
                                     initializer=tf.ones_initializer(),
                                     trainable=False)

            batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')

            moving_mean = _variable_on_cpu('moving_mean', [param_shape],
                                           initializer=tf.zeros_initializer(),
                                           trainable=False)
            moving_variance = _variable_on_cpu(
                'moving_variance', [param_shape],
                initializer=tf.ones_initializer(), trainable=False)

            if is_train:
                update_mean = moving_averages.assign_moving_average(
                    moving_mean, batch_mean, 0.997)
                update_variance = moving_averages.assign_moving_average(
                    moving_variance, batch_var, 0.997)
                # The update ops only run if something depends on them, so
                # tie them to the statistics used for normalisation.
                with tf.control_dependencies([update_mean, update_variance]):
                    mean = tf.identity(batch_mean)
                    variance = tf.identity(batch_var)
            else:
                mean, variance = moving_mean, moving_variance

            normed = tf.nn.batch_normalization(x, mean, variance,
                                               beta, gamma, 1e-5)
    return normed
def seq_batch_norm(x, scope=None, is_train=True):
    """Sequence batch normalization for an N * D input.

    Moments are computed over axis 0.  ``is_train`` is cast to a boolean
    tensor and drives a graph conditional: the true branch applies an
    exponential moving average (decay 0.9997) to the batch moments and
    returns the batch moments; the false branch returns the averaged values.

    Args:
      x: tensor to normalise; its last dimension sizes ``offset``/``scale``.
      scope: unused; variables are created under the "sbn" variable scope.
      is_train: value castable to a boolean tensor.

    Returns:
      The normalised tensor (epsilon 0.001, trainable offset and scale).
    """
    with tf.variable_scope("sbn"):
        channels = x.get_shape()[-1]
        batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')
        ema = tf.train.ExponentialMovingAverage(decay=0.9997)

        def _training_statistics():
            # Returning identities under the control dependency makes the
            # average update run whenever the statistics are consumed.
            update_averages = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([update_averages]):
                return tf.identity(batch_mean), tf.identity(batch_var)

        def _inference_statistics():
            return ema.average(batch_mean), ema.average(batch_var)

        training_flag = tf.cast(is_train, "bool")
        mean, var = control_flow_ops.cond(training_flag,
                                          _training_statistics,
                                          _inference_statistics)

        offset = _variable_on_cpu('offset', [channels],
                                  initializer=tf.zeros_initializer(),
                                  trainable=True)
        scale = _variable_on_cpu('scale', [channels],
                                 initializer=tf.ones_initializer(),
                                 trainable=True)
        return tf.nn.batch_normalization(x, mean, var, offset, scale, 0.001)
def _linear(args, output_size, bias, scope=None, use_fp16=False):
    """Linear map: concat(args) * W^T (+ bias), where W is a new variable.

    The weight matrix is stored output-major, ``[output_size, total_arg_size]``,
    and applied with ``transpose_b=True``.

    Args:
      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
      output_size: int, number of output features (first dimension of W).
      bias: boolean, whether to add a bias term or not.  The bias is
        initialised to 0.
      scope: VariableScope for the created subgraph; defaults to "Linear".
      use_fp16: if True the inputs are cast to float16, otherwise float32;
        the flag is also forwarded to the variable helper.

    Returns:
      A 2D Tensor with shape [batch x output_size].

    Raises:
      ValueError: if some of the arguments has unspecified or wrong shape.
    """
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape().as_list() for a in args]
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError(
                "Linear is expecting 2D arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError(
                "Linear expects shape[1] of arguments: %s" % str(shapes))
        total_arg_size += shape[1]

    dtype = tf.float16 if use_fp16 else tf.float32

    # Now the computation.
    with tf.variable_scope(scope or "Linear"):
        # Output-major so that the transposed matmul below is valid for any
        # input width, not only when total_arg_size == output_size.
        matrix = _variable_on_cpu('Matrix', [output_size, total_arg_size],
                                  use_fp16=use_fp16)
        args = [tf.cast(x, dtype) for x in args]
        if len(args) == 1:
            stacked = args[0]
        else:
            stacked = tf.concat(args, 1)
        res = tf.matmul(stacked, matrix,
                        transpose_a=False, transpose_b=True)
        if not bias:
            return res
        bias_term = _variable_on_cpu('Bias', [output_size],
                                     tf.constant_initializer(0),
                                     use_fp16=use_fp16)
        return res + bias_term
def batch_norm2(inputs,
                decay=0.999,
                center=True,
                scale=True,
                epsilon=0.001,
                moving_vars='moving_vars',
                activation=None,
                is_training=True,
                trainable=True,
                scope=None,
                data_format='NHWC'):
    """Adds a fused Batch Normalization layer.

    Training normalises with the batch statistics and updates the moving
    mean/variance as a side effect of evaluating the output.  Inference
    normalises with the stored moving mean/variance.

    Args:
      inputs: a tensor accepted by tf.nn.fused_batch_norm; the channel axis
        is dimension 1 for 'NCHW' and the last dimension otherwise.
      decay: decay for the moving average.
      center: accepted for interface compatibility; not used, the shift
        variable is always created.
      scale: accepted for interface compatibility; not used, the scale
        variable is always created.
      epsilon: small float added to variance to avoid dividing by zero.
      moving_vars: not used by this implementation.
      activation: not used by this implementation.
      is_training: Python bool, whether or not the model is in training mode.
      trainable: not used by this implementation.
      scope: not used; variables are created under the 'bn2' variable scope.
      data_format: 'NHWC' or 'NCHW', forwarded to tf.nn.fused_batch_norm.

    Returns:
      a tensor representing the output of the operation.
    """
    inputs_shape = inputs.get_shape()
    with tf.variable_scope('bn2'):
        if data_format == 'NCHW':
            params_shape = inputs_shape[1]
        else:
            params_shape = inputs_shape[-1]

        # Named *_var so the `scale` argument is not shadowed.
        scale_var = _variable_on_cpu('scale', params_shape,
                                     initializer=tf.ones_initializer())
        shift_var = _variable_on_cpu('shift', params_shape,
                                     initializer=tf.zeros_initializer())
        moving_mean = _variable_on_cpu('moving_mean', [params_shape],
                                       initializer=tf.zeros_initializer(),
                                       trainable=False)
        moving_var = _variable_on_cpu('moving_variance', [params_shape],
                                      initializer=tf.ones_initializer(),
                                      trainable=False)

        if is_training:
            y, batch_mean, batch_var = tf.nn.fused_batch_norm(
                inputs, scale_var, shift_var,
                mean=None, variance=None,
                epsilon=epsilon,
                data_format=data_format,
                is_training=is_training)
            update_mean = moving_averages.assign_moving_average(
                moving_mean, batch_mean, decay)
            update_var = moving_averages.assign_moving_average(
                moving_var, batch_var, decay)
            # The update ops only execute if the output depends on them.
            with tf.control_dependencies([update_mean, update_var]):
                y = tf.identity(y)
        else:
            y, _, _ = tf.nn.fused_batch_norm(
                inputs, scale_var, shift_var,
                mean=moving_mean, variance=moving_var,
                epsilon=epsilon,
                data_format=data_format,
                is_training=is_training)
    return y
def batch_norm2(inputs,
                decay=0.999,
                center=True,
                scale=True,
                epsilon=0.001,
                moving_vars='moving_vars',
                activation=None,
                is_training=True,
                trainable=True,
                scope=None,
                reuse=None,
                data_format='NHWC'):
    """Adds a fused Batch Normalization layer (batch statistics only).

    Args:
      inputs: a tensor accepted by tf.nn.fused_batch_norm; the channel axis
        is dimension 1 for 'NCHW' and the last dimension otherwise.
      decay: not used by this implementation (no moving averages are kept).
      center: If True, a 'beta' offset variable is created.  If False, a
        constant zero offset is used instead.
      scale: If True, a 'gamma' scale variable is created.  If False, a
        constant scale of one is used instead.
      epsilon: small float added to variance to avoid dividing by zero.
      moving_vars: not used by this implementation.
      activation: not used by this implementation.
      is_training: forwarded to tf.nn.fused_batch_norm.
      trainable: not used by this implementation.
      scope: Optional scope for variable_scope (default name 'bn2').
      reuse: whether or not the layer and its variables should be reused.
        To be able to reuse the layer scope must be given.
      data_format: 'NHWC' or 'NCHW', forwarded to tf.nn.fused_batch_norm.

    Returns:
      a tensor representing the output of the operation.
    """
    inputs_shape = inputs.get_shape()
    with tf.variable_scope(scope, 'bn2', [inputs], reuse=reuse):
        if data_format == 'NCHW':
            params_shape = inputs_shape[1]
        else:
            params_shape = inputs_shape[-1]

        # fused_batch_norm needs real tensors for offset and scale, so a
        # disabled parameter is replaced by its neutral constant.
        if center:
            beta = _variable_on_cpu('beta', params_shape,
                                    initializer=tf.zeros_initializer())
        else:
            beta = tf.zeros([params_shape])
        if scale:
            gamma = _variable_on_cpu('gamma', params_shape,
                                     initializer=tf.ones_initializer())
        else:
            gamma = tf.ones([params_shape])

        # NOTE(review): no moving statistics are kept here, so mean/variance
        # stay None even when is_training is False; tf.nn.fused_batch_norm
        # documents population statistics as required for inference.
        # Confirm callers only use training mode.
        outputs, _, _ = tf.nn.fused_batch_norm(
            inputs, gamma, beta,
            mean=None, variance=None,
            epsilon=epsilon,
            data_format=data_format,
            is_training=is_training)
        outputs.set_shape(inputs.get_shape())
        return outputs
def batch_norm(x, scope=None, is_train=True, data_format=None):
    """Batch normalization; currently only works on NHWC input.

    Moments are taken over axes [0, 1, 2], so the last axis is treated as
    the channel axis.  An exponential moving average (decay 0.5, zero
    debiased) of the batch moments is maintained.

    Args:
      x: tensor to normalise; its last dimension sizes ``beta``/``gamma``.
      scope: optional variable scope; defaults to 'bn'.
      is_train: Python bool.  True normalises with the batch moments and
        updates the moving averages as a side effect; False normalises with
        the moving averages.
      data_format: accepted for interface compatibility; not used.

    Returns:
      The normalised tensor.
    """
    with tf.variable_scope(scope or 'bn'):
        param_shape = x.get_shape()[-1]
        beta = _variable_on_cpu('beta', [param_shape],
                                initializer=tf.zeros_initializer())
        gamma = _variable_on_cpu('gamma', [param_shape],
                                 initializer=tf.ones_initializer())

        batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
        ema = tf.train.ExponentialMovingAverage(decay=0.5, zero_debias=True)

        # apply() creates the shadow variables that average() looks up, so it
        # must be called in both modes; the op only runs in training.
        ema_apply_op = ema.apply([batch_mean, batch_var])

        if is_train:
            with tf.control_dependencies([ema_apply_op]):
                mean = tf.identity(batch_mean)
                var = tf.identity(batch_var)
        else:
            mean = ema.average(batch_mean)
            var = ema.average(batch_var)

        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-5)
    return normed
def __call__(self, inputs, state, scope=None):
    """Run one step of the most basic batch-normalised RNN cell.

    Computes ``relux(seq_batch_norm(inputs . W^T) + state . U^T + B, capping=20)``
    and returns it as both the output and the new state.

    Args:
      inputs: input tensor (batch_size x feature_size per the original
        notes); its dimension 1 sizes the second axis of ``W``.
      state: previous state tensor; its dimension 1 sizes the second axis
        of ``U``.  Its projection is added to the normalised input
        projection.
      scope: optional variable scope; defaults to the class name.

    Returns:
      A pair ``(output, new_state)`` that contains the same tensor twice.

    Variables (all created with ``use_fp16=self.use_fp16``):
      W: [num_units, inputs.shape[1]], applied transposed.
      U: [num_units, state.shape[1]], applied transposed.
      B: [num_units], constant 0.
    """
    with tf.variable_scope(scope or type(self).__name__):
        # Input path: projection followed by sequence batch normalisation.
        input_dim = inputs.get_shape()[1]
        input_weights = _variable_on_cpu('W', [self._num_units, input_dim],
                                         use_fp16=self.use_fp16)
        input_proj = tf.matmul(inputs, input_weights,
                               transpose_a=False, transpose_b=True)
        normed_input_proj = seq_batch_norm(input_proj)

        # Recurrent path: projection of the previous state.
        state_dim = state.get_shape()[1]
        recurrent_weights = _variable_on_cpu('U', [self._num_units, state_dim],
                                             use_fp16=self.use_fp16)
        state_proj = tf.matmul(state, recurrent_weights,
                               transpose_a=False, transpose_b=True)

        bias = _variable_on_cpu('B', [self._num_units],
                                tf.constant_initializer(0),
                                use_fp16=self.use_fp16)
        pre_activation = tf.add(normed_input_proj, state_proj) + bias
        output = relux(pre_activation, capping=20)
    return output, output
def __call__(self, inputs, state, scope=None, weight_size=None):
    """Run the wrapped ``self.model`` on ``inputs`` and ``state``.

    A single flat parameter variable, "rnn_weights", of length
    ``self.param_size`` is created (every entry initialised to
    ``1.0 / self.param_size``) and handed to ``self.model`` together with
    the inputs and the previous hidden state.  The model is always invoked
    with ``is_training=True``.

    Args:
      inputs: passed to the model as ``input_data``.
      state: passed to the model as ``input_h``.
      scope: optional variable scope; defaults to the class name.
      weight_size: unused.

    Returns:
      The ``(output, output_h)`` pair produced by ``self.model``.
    """
    scope_name = scope or type(self).__name__
    with tf.variable_scope(scope_name):
        uniform_init = tf.constant_initializer(1.0 / self.param_size)
        rnn_weights = _variable_on_cpu("rnn_weights", [self.param_size],
                                       uniform_init, self.use_fp16)
        output, output_h = self.model(input_data=inputs,
                                      input_h=state,
                                      params=rnn_weights,
                                      is_training=True)
    return output, output_h
def inference(feats, seq_lens, params):
    """Build the deepSpeech model: one conv layer, stacked RNN, linear output.

    Args:
      feats: MFCC features returned from distorted_inputs() or inputs().
      seq_lens: Input sequence length per utterance.
      params: parameters of the model.

    Returns:
      Logits, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    # Variables come from the shared variable helpers (not tf.Variable) so
    # that they can be shared across multiple GPU training runs.
    compute_dtype = tf.float16 if params.use_fp16 else tf.float32
    feat_len = feats.get_shape().as_list()[-1]

    # ---- convolutional layer ----
    with tf.variable_scope('conv1') as scope:
        conv_kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, feat_len, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        # Append a trailing channel axis for conv2d.
        feats_4d = tf.expand_dims(feats, dim=-1)
        conv_strides = [1, params.temporal_stride, 1, 1]
        conv_out = tf.nn.conv2d(feats_4d, conv_kernel, conv_strides,
                                padding='SAME')
        conv_biases = _variable_on_cpu('biases', [params.num_filters],
                                       tf.constant_initializer(-0.05),
                                       params.use_fp16)
        biased = tf.nn.bias_add(conv_out, conv_biases)
        conv1 = tf.nn.relu(biased, name=scope.name)
        _activation_summary(conv1)

        conv1_drop = tf.nn.dropout(conv1, params.keep_prob)

    # ---- recurrent layers ----
    with tf.variable_scope('rnn'):
        # Flatten frequency and filter axes, then go time-major for the RNN.
        batch_major_shape = [params.batch_size, -1,
                             feat_len * params.num_filters]
        batch_major = tf.reshape(conv1_drop, batch_major_shape)
        rnn_input = tf.transpose(batch_major, perm=[1, 0, 2])

        # One cell instance is built here; the same wrapped object is
        # repeated for every layer of the MultiRNNCell.
        cell = rnn_cell.CustomRNNCell(params.num_hidden,
                                      activation=tf.nn.relu6,
                                      use_fp16=params.use_fp16)
        drop_cell = tf.contrib.rnn.DropoutWrapper(
            cell, output_keep_prob=params.keep_prob)
        multi_cell = tf.contrib.rnn.MultiRNNCell(
            [drop_cell] * params.num_rnn_layers)

        # Sequence lengths shrink by the temporal stride of the convolution.
        strided_seq_lens = tf.div(seq_lens, params.temporal_stride)
        rnn_options = dict(sequence_length=strided_seq_lens,
                           dtype=compute_dtype,
                           time_major=True,
                           scope='rnn',
                           swap_memory=True)
        if params.rnn_type == 'uni-dir':
            rnn_outputs, _ = tf.nn.dynamic_rnn(multi_cell, rnn_input,
                                               **rnn_options)
        else:
            # The same multi-cell serves both directions; the two output
            # streams are summed.
            (outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                multi_cell, multi_cell, rnn_input, **rnn_options)
            rnn_outputs = outputs_fw + outputs_bw
        _activation_summary(rnn_outputs)

    # ---- linear layer (WX + b); softmax is applied by the CTC cost ----
    with tf.variable_scope('softmax_linear') as scope:
        out_weights = _variable_with_weight_decay(
            'weights', [params.num_hidden, NUM_CLASSES],
            wd_value=None,
            use_fp16=params.use_fp16)
        out_biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                      tf.constant_initializer(0.0),
                                      params.use_fp16)
        flat_outputs = tf.reshape(rnn_outputs, [-1, cell.output_size])
        flat_logits = tf.add(tf.matmul(flat_outputs, out_weights),
                             out_biases, name=scope.name)
        logits = tf.reshape(flat_logits,
                            [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
def inference(sess, feats, seq_lens, params):
    """Build the deepSpeech model: two conv layers, stacked BRNN, linear output.

    Data layout on entry is N, T, F.  Neither convolution adds a bias; each
    is followed by batch normalisation and a ReLU clipped at 20.

    Args:
      sess: unused by this function.
      feats: MFCC features returned from distorted_inputs() or inputs().
      seq_lens: Input sequence length per utterance.
      params: parameters of the model.

    Returns:
      logits, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1'):
        # N, T, F  ->  N, T, F, 1
        feats_4d = tf.expand_dims(feats, axis=3)
        kernel1 = _variable_with_weight_decay(
            'weights',
            shape=[11, 41, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        conv1_raw = tf.nn.conv2d(feats_4d, kernel1, [1, 2, 2, 1],
                                 padding='VALID')
        conv1_bn = custom_ops.batch_norm(conv1_raw)
        conv1 = custom_ops.relux(conv1_bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        kernel2 = _variable_with_weight_decay(
            'weights',
            shape=[11, 21, params.num_filters, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        conv2_raw = tf.nn.conv2d(conv1, kernel2, [1, 1, 2, 1],
                                 padding='VALID')
        conv2_bn = custom_ops.batch_norm(conv2_raw)
        conv2 = custom_ops.relux(conv2_bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    # Fold the frequency and channel axes together: N, T, F * C.
    conv2_dims = conv2.get_shape().dims
    feat_dim = conv2_dims[2].value * conv2_dims[3].value
    batch_major = tf.reshape(conv2, [params.batch_size, -1, feat_dim])
    # Time-major order for the RNN: T, N, F * C.
    rnn_input = tf.transpose(batch_major, perm=[1, 0, 2])

    # The same cell object is handed over for both directions.
    fw_cell = custom_ops.CustomRNNCell2(params.num_hidden)
    conved_seq_lens = get_rnn_seqlen(seq_lens)
    rnn_outputs = custom_ops.stacked_brnn(fw_cell, fw_cell,
                                          params.num_hidden,
                                          params.num_rnn_layers,
                                          rnn_input,
                                          params.batch_size,
                                          conved_seq_lens)
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        brnn_width = params.num_hidden * 2
        out_weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, brnn_width],
            wd_value=None,
            use_fp16=params.use_fp16)
        out_biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                      tf.constant_initializer(0.0),
                                      params.use_fp16)
        flat_outputs = tf.reshape(rnn_outputs, [-1, params.num_hidden * 2])
        projected = tf.matmul(flat_outputs, out_weights,
                              transpose_a=False, transpose_b=True)
        flat_logits = tf.add(projected, out_biases, name=scope.name)
        logits = tf.reshape(flat_logits,
                            [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
def inference(session, feats, seq_lens, params):
    """Build the deepSpeech model: two biased conv layers, MultiRNNCell, linear.

    Data layout on entry is N, T, F.  Each convolution adds a bias and is
    followed by batch normalisation and a ReLU clipped at 20.

    Args:
      session: unused by this function.
      feats: MFCC features returned from distorted_inputs() or inputs().
      seq_lens: Input sequence length per utterance.
      params: parameters of the model.

    Returns:
      Logits, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    # Variables come from the shared variable helpers (not tf.Variable) so
    # that they can be shared across multiple GPU training runs.
    compute_dtype = tf.float16 if params.use_fp16 else tf.float32

    # Not referenced below; the lookup is kept from the original.
    feat_len = feats.get_shape().as_list()[-1]

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1'):
        kernel1 = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        # N, T, F  ->  N, T, F, 1
        feats_4d = tf.expand_dims(feats, dim=-1)
        conv1_raw = tf.nn.conv2d(feats_4d, kernel1, [1, 2, 2, 1],
                                 padding='VALID')
        biases1 = _variable_on_cpu('biases', [params.num_filters],
                                   tf.constant_initializer(-0.05),
                                   params.use_fp16)
        conv1_biased = tf.nn.bias_add(conv1_raw, biases1)
        conv1_bn = custom_ops.batch_norm(conv1_biased)
        conv1 = custom_ops.relux(conv1_bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        kernel2 = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        conv2_raw = tf.nn.conv2d(conv1, kernel2, [1, 2, 1, 1],
                                 padding='VALID')
        biases2 = _variable_on_cpu('biases', [params.num_filters],
                                   tf.constant_initializer(-0.05),
                                   params.use_fp16)
        conv2_biased = tf.nn.bias_add(conv2_raw, biases2)
        conv2_bn = custom_ops.batch_norm(conv2_biased)
        conv2 = custom_ops.relux(conv2_bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    # N, T, F * 32 with a hard-coded 75 frequency bins after the two convs.
    batch_major = tf.reshape(
        conv2, [params.batch_size, -1, 75 * params.num_filters])
    # Time-major order for the RNN: T, N, F * 32.
    rnn_input = tf.transpose(batch_major, perm=[1, 0, 2])

    # One cell instance is built here; the same object is repeated for
    # every layer of the MultiRNNCell.
    cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                     use_fp16=params.use_fp16)
    multi_cell = tf.contrib.rnn.MultiRNNCell([cell] * params.num_rnn_layers)

    rnn_seq_lens = get_rnn_seqlen(seq_lens)
    if params.rnn_type == 'uni-dir':
        rnn_outputs, _ = tf.nn.dynamic_rnn(
            multi_cell, rnn_input,
            sequence_length=rnn_seq_lens,
            dtype=compute_dtype,
            time_major=True,
            swap_memory=True)
    else:
        # Same multi-cell for both directions; note swap_memory is False
        # on this path.  The two output streams are summed.
        (outputs_fw, outputs_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            multi_cell, multi_cell, rnn_input,
            sequence_length=rnn_seq_lens,
            dtype=compute_dtype,
            time_major=True,
            swap_memory=False)
        rnn_outputs = outputs_fw + outputs_bw
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        out_weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, params.num_hidden],
            wd_value=None,
            use_fp16=params.use_fp16)
        out_biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                      tf.constant_initializer(0.0),
                                      params.use_fp16)
        flat_outputs = tf.reshape(rnn_outputs, [-1, cell.output_size])
        projected = tf.matmul(flat_outputs, out_weights,
                              transpose_a=False, transpose_b=True)
        flat_logits = tf.add(projected, out_biases, name=scope.name)
        logits = tf.reshape(flat_logits,
                            [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
def inference(sess, feats, seq_lens, params):
    """Build the deepSpeech model in NCHW layout with a selectable RNN engine.

    Data layout on entry is N, T, F.  Both convolutions run with
    data_format='NCHW', add a bias, and are followed by batch_norm2 and a
    ReLU clipped at 20.

    Args:
      sess: forwarded to MkldnnRNNCell when params.engine is 'mkldnn_rnn'
        or 'cudnn_rnn'; otherwise unused.
      feats: MFCC features returned from distorted_inputs() or inputs().
      seq_lens: Input sequence length per utterance.
      params: parameters of the model.

    Returns:
      Logits, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    # Variables come from the shared variable helpers (not tf.Variable) so
    # that they can be shared across multiple GPU training runs.
    # Neither of the next two values is referenced below; both are kept
    # from the original.
    dtype = tf.float16 if params.use_fp16 else tf.float32
    feat_len = feats.get_shape().as_list()[-1]

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1'):
        kernel1 = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        # N, T, F  ->  N, 1, T, F
        feats_nchw = tf.expand_dims(feats, axis=1)
        conv1_raw = tf.nn.conv2d(feats_nchw, kernel1,
                                 strides=[1, 1, 2, 2],
                                 padding='VALID',
                                 data_format='NCHW')
        biases1 = _variable_on_cpu('biases', [params.num_filters],
                                   tf.constant_initializer(-0.05),
                                   params.use_fp16)
        conv1_biased = tf.nn.bias_add(conv1_raw, biases1, data_format='NCHW')
        conv1_bn = custom_ops.batch_norm2(conv1_biased, data_format='NCHW')
        conv1 = custom_ops.relux(conv1_bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        kernel2 = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        conv2_raw = tf.nn.conv2d(conv1, kernel2, [1, 1, 2, 1],
                                 padding='VALID',
                                 data_format='NCHW')
        biases2 = _variable_on_cpu('biases', [params.num_filters],
                                   tf.constant_initializer(-0.05),
                                   params.use_fp16)
        conv2_biased = tf.nn.bias_add(conv2_raw, biases2, data_format='NCHW')
        conv2_bn = custom_ops.batch_norm2(conv2_biased, data_format='NCHW')
        conv2 = custom_ops.relux(conv2_bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    with tf.variable_scope('rnn'):
        # N, C, T, F  ->  T, N, C, F
        time_major = tf.transpose(conv2, perm=[2, 0, 1, 3])
        # Fold channels and frequency together: T, N, 32 * F (F fixed at 75).
        rnn_input = tf.reshape(
            time_major, [-1, params.batch_size, 75 * params.num_filters])

        if params.engine in ('mkldnn_rnn', 'cudnn_rnn'):
            # One cell per layer: the first consumes the conv features, the
            # remaining ones consume the previous layer's hidden output.
            layer_input_sizes = [75 * params.num_filters]
            layer_input_sizes += (
                [params.num_hidden] * (params.num_rnn_layers - 1))
            cell_list = [
                MkldnnRNNCell(sess, params.num_hidden,
                              input_size=layer_input_size,
                              use_fp16=params.use_fp16)
                for layer_input_size in layer_input_sizes
            ]
        else:
            # A single custom cell object repeated for every layer.
            cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                             use_fp16=params.use_fp16)
            cell_list = [cell] * params.num_rnn_layers

        rnn_seq_lens = get_rnn_seqlen(seq_lens)
        # The same list is passed for both directions.
        rnn_outputs = custom_ops.stacked_brnn(cell_list, cell_list,
                                              params.num_hidden,
                                              params.num_rnn_layers,
                                              rnn_input,
                                              rnn_seq_lens,
                                              params.batch_size)
        _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        out_weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, params.num_hidden],
            wd_value=None,
            use_fp16=params.use_fp16)
        out_biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                      tf.constant_initializer(0.0),
                                      params.use_fp16)
        flat_outputs = tf.reshape(rnn_outputs, [-1, params.num_hidden])
        projected = tf.matmul(flat_outputs, out_weights,
                              transpose_a=False, transpose_b=True)
        flat_logits = tf.add(projected, out_biases, name=scope.name)
        logits = tf.reshape(flat_logits,
                            [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits