def inference(session, feats, seq_lens, params):
    """Build the deepSpeech model (NHWC convolutions + tf.nn dynamic RNN).

    Args:
      session: not used by this function; kept so callers need not change.
      feats: MFCC features returned from distorted_inputs() or inputs(),
        laid out as N, T, F.
      seq_lens: Input sequence length per utterance.
      params: parameters of the model. This function reads use_fp16,
        num_filters, batch_size, num_hidden, num_rnn_layers and rnn_type.

    Returns:
      Logits, time-major, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU
    # training runs. If we only ran this model on a single GPU,
    # we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().
    dtype = tf.float16 if params.use_fp16 else tf.float32

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1'):
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None, use_fp16=params.use_fp16)

        # N, T, F -> N, T, F, 1 (`axis` replaces the deprecated `dim` keyword)
        feats = tf.expand_dims(feats, axis=-1)
        conv = tf.nn.conv2d(feats, kernel,
                            [1, 2, 2, 1],
                            padding='VALID')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        # N, T, F, C with C == params.num_filters
        bias = tf.nn.bias_add(conv, biases)

        # batch normalization
        bn = custom_ops.batch_norm(bias)

        # clipped ReLU
        conv1 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None, use_fp16=params.use_fp16)

        # N, T, F, C
        conv = tf.nn.conv2d(conv1, kernel,
                            [1, 2, 1, 1],
                            padding='VALID')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)

        # batch normalization
        bn = custom_ops.batch_norm(bias)

        # clipped ReLU
        conv2 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    # Take the frequency width from conv2's static shape instead of assuming
    # a fixed value: the reshape below infers T with -1, so a wrong width
    # could silently mix time steps and features. 75 is the width that was
    # previously hard-coded and is only used when the static width is unknown.
    default_freq_bins = 75
    freq_bins = conv2.get_shape().as_list()[2]
    if freq_bins is None:
        freq_bins = default_freq_bins
    feat_dim = freq_bins * params.num_filters

    # Reshape conv output to fit rnn input: N, T, F * C
    rnn_input = tf.reshape(conv2, [params.batch_size, -1, feat_dim])
    # Permute into time major order for rnn: T, N, F * C
    rnn_input = tf.transpose(rnn_input, perm=[1, 0, 2])

    # Make one instance of cell on a fixed device,
    # and use copies of the weights on other devices.
    # The same cell object is repeated for every layer of the stack.
    cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                     use_fp16=params.use_fp16)
    multi_cell = tf.contrib.rnn.MultiRNNCell([cell] * params.num_rnn_layers)

    rnn_seq_lens = get_rnn_seqlen(seq_lens)
    if params.rnn_type == 'uni-dir':
        rnn_outputs, _ = tf.nn.dynamic_rnn(multi_cell, rnn_input,
                                           sequence_length=rnn_seq_lens,
                                           dtype=dtype, time_major=True,
                                           swap_memory=True)
    else:
        # The same stacked cell is passed as forward and backward cell.
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            multi_cell, multi_cell, rnn_input,
            sequence_length=rnn_seq_lens, dtype=dtype,
            time_major=True, swap_memory=False)
        outputs_fw, outputs_bw = outputs
        # Both directions are summed, so the width stays cell.output_size.
        rnn_outputs = outputs_fw + outputs_bw
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, params.num_hidden],
            wd_value=None,
            use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        logit_inputs = tf.reshape(rnn_outputs, [-1, cell.output_size])
        # weights are stored as [NUM_CLASSES, hidden], hence transpose_b.
        logits = tf.add(tf.matmul(logit_inputs, weights,
                                  transpose_a=False, transpose_b=True),
                        biases, name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
def inference(sess, feats, seq_lens, params):
    """Assemble the deepSpeech graph: two conv layers, stacked BRNN, linear.

    Args:
      sess: not referenced inside this function.
      feats: MFCC features returned from distorted_inputs() or inputs(),
        laid out as N, T, F.
      seq_lens: Input sequence length per utterance.
      params: parameters of the model. This function reads num_filters,
        use_fp16, batch_size, num_hidden and num_rnn_layers.

    Returns:
      logits reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    num_filters = params.num_filters
    batch_size = params.batch_size
    brnn_width = params.num_hidden * 2

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1') as scope:
        # N, T, F -> N, T, F, 1
        feats = tf.expand_dims(feats, axis=3)

        conv1_filter = _variable_with_weight_decay(
            'weights',
            shape=[11, 41, 1, num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        conv1_raw = tf.nn.conv2d(feats, conv1_filter, [1, 2, 2, 1],
                                 padding='VALID')

        # No bias term is added; the convolution output goes straight
        # into batch normalization followed by the clipped ReLU.
        conv1 = custom_ops.relux(custom_ops.batch_norm(conv1_raw),
                                 capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2') as scope:
        # Input here is N, T, F, C.
        conv2_filter = _variable_with_weight_decay(
            'weights',
            shape=[11, 21, num_filters, num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        conv2_raw = tf.nn.conv2d(conv1, conv2_filter, [1, 1, 2, 1],
                                 padding='VALID')

        # As in conv1: no bias, batch norm, then clipped ReLU.
        conv2 = custom_ops.relux(custom_ops.batch_norm(conv2_raw),
                                 capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    # Flatten the last two axes of the conv output (F and C) into a single
    # feature axis, using the statically known sizes.
    static_dims = conv2.get_shape().dims
    flat_features = static_dims[2].value * static_dims[3].value
    batch_major = tf.reshape(conv2, [batch_size, -1, flat_features])
    # Switch to time-major order for the rnn: T, N, F * C
    rnn_input = tf.transpose(batch_major, perm=[1, 0, 2])

    # A single cell object serves as both the forward and backward cell.
    shared_cell = custom_ops.CustomRNNCell2(params.num_hidden)

    conved_seq_lens = get_rnn_seqlen(seq_lens)
    rnn_outputs = custom_ops.stacked_brnn(
        shared_cell, shared_cell,
        params.num_hidden, params.num_rnn_layers,
        rnn_input, batch_size, conved_seq_lens)
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, brnn_width],
            wd_value=None,
            use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)

        flat_outputs = tf.reshape(rnn_outputs, [-1, brnn_width])
        # weights are stored as [NUM_CLASSES, width], hence transpose_b.
        projected = tf.matmul(flat_outputs, weights,
                              transpose_a=False, transpose_b=True)
        logits = tf.add(projected, biases, name=scope.name)
        logits = tf.reshape(logits, [-1, batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
def inference(sess, feats, seq_lens, params):
    """Build the deepSpeech model (NCHW convolutions + stacked BRNN).

    Args:
      sess: passed through to MkldnnRNNCell when params.engine selects the
        'mkldnn_rnn' or 'cudnn_rnn' path; otherwise not used.
      feats: MFCC features returned from distorted_inputs() or inputs(),
        laid out as N, T, F.
      seq_lens: Input sequence length per utterance.
      params: parameters of the model. This function reads use_fp16,
        num_filters, batch_size, engine, num_hidden and num_rnn_layers.

    Returns:
      Logits reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU
    # training runs. If we only ran this model on a single GPU,
    # we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1'):
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None, use_fp16=params.use_fp16)

        # N, T, F -> N, 1, T, F (channels-first layout)
        feats = tf.expand_dims(feats, axis=1)
        conv = tf.nn.conv2d(feats, kernel,
                            strides=[1, 1, 2, 2],
                            padding='VALID',
                            data_format='NCHW')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        # N, C, T, F with C == params.num_filters
        bias = tf.nn.bias_add(conv, biases, data_format='NCHW')

        # batch normalization
        bn = custom_ops.batch_norm2(bias, data_format='NCHW')

        # clipped ReLU
        conv1 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None, use_fp16=params.use_fp16)

        # N, C, T, F
        conv = tf.nn.conv2d(conv1, kernel,
                            [1, 1, 2, 1],
                            padding='VALID',
                            data_format='NCHW')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases, data_format='NCHW')

        # batch normalization
        bn = custom_ops.batch_norm2(bias, data_format='NCHW')

        # clipped ReLU
        conv2 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    with tf.variable_scope('rnn') as scope:
        # Take the frequency width from conv2's static shape instead of
        # assuming a fixed value: the reshape below infers T with -1, so a
        # wrong width could silently mix time steps and features. 75 is the
        # width that was previously hard-coded and is only used when the
        # static width is unknown.
        default_freq_bins = 75
        freq_bins = conv2.get_shape().as_list()[3]
        if freq_bins is None:
            freq_bins = default_freq_bins
        feat_dim = freq_bins * params.num_filters

        # N, C, T, F => T, N, C, F
        rnn_input1 = tf.transpose(conv2, perm=[2, 0, 1, 3])

        # Reshape conv output to fit rnn input: T, N, C * F
        rnn_input = tf.reshape(
            rnn_input1, [-1, params.batch_size, feat_dim])

        # Make one instance of cell on a fixed device,
        # and use copies of the weights on other devices.
        cell_list = []
        if params.engine in ('mkldnn_rnn', 'cudnn_rnn'):
            # The first layer consumes the conv features; every further
            # layer consumes the previous layer's num_hidden outputs.
            cell_list.append(
                MkldnnRNNCell(sess, params.num_hidden,
                              input_size=feat_dim,
                              use_fp16=params.use_fp16))
            for _ in range(params.num_rnn_layers - 1):
                cell_list.append(
                    MkldnnRNNCell(sess, params.num_hidden,
                                  input_size=params.num_hidden,
                                  use_fp16=params.use_fp16))
        else:
            # The same cell object is repeated for every layer.
            cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                             use_fp16=params.use_fp16)
            cell_list = [cell] * params.num_rnn_layers

        rnn_seq_lens = get_rnn_seqlen(seq_lens)
        # The same list is passed as forward and backward cells.
        rnn_outputs = custom_ops.stacked_brnn(cell_list, cell_list,
                                              params.num_hidden,
                                              params.num_rnn_layers,
                                              rnn_input, rnn_seq_lens,
                                              params.batch_size)
        _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, params.num_hidden],
            wd_value=None,
            use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        # NOTE(review): assumes stacked_brnn returns num_hidden-wide
        # outputs (not a 2 * num_hidden concatenation) - confirm.
        logit_inputs = tf.reshape(rnn_outputs, [-1, params.num_hidden])
        # weights are stored as [NUM_CLASSES, hidden], hence transpose_b.
        logits = tf.add(tf.matmul(logit_inputs, weights,
                                  transpose_a=False, transpose_b=True),
                        biases, name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits