Beispiel #1
0
def inference(feats, seq_lens, params):
    """Build the deepSpeech model graph and return its logits.

    The graph is: one strided 2-D convolution with ReLU and dropout, a stack
    of recurrent layers (uni- or bi-directional, chosen by params.rnn_type),
    and a final linear projection onto NUM_CLASSES outputs.

    Args:
      feats: MFCC features returned from distorted_inputs() or inputs().
        Presumably laid out as (batch, time, feature) -- TODO confirm against
        the input pipeline. The size of the last axis must be statically
        known because it is used as the kernel width.
      seq_lens: Input sequence length per utterance.
      params: parameters of the model. Attributes read here: use_fp16,
        num_filters, temporal_stride, keep_prob, batch_size, num_hidden,
        num_rnn_layers and rnn_type.

    Returns:
      Logits, reshaped to [-1, params.batch_size, NUM_CLASSES]. The RNN is
      run with time_major=True, so the leading axis follows the RNN output's
      time axis.
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU
    # training runs. If we only ran this model on a single GPU,
    # we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().

    dtype = tf.float16 if params.use_fp16 else tf.float32

    # Static size of the feature axis; the kernel below spans all of it.
    feat_len = feats.get_shape().as_list()[-1]

    # convolutional layers
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, feat_len, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        # Append a trailing channel axis so conv2d receives a 4-D tensor.
        # `axis` is used instead of the deprecated `dim` keyword.
        feats = tf.expand_dims(feats, axis=-1)
        # Only the second axis is strided (by params.temporal_stride).
        conv = tf.nn.conv2d(feats,
                            kernel, [1, params.temporal_stride, 1, 1],
                            padding='SAME')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv1)

        # dropout
        conv1_drop = tf.nn.dropout(conv1, params.keep_prob)

    # recurrent layers
    with tf.variable_scope('rnn') as scope:

        # Reshape conv output to fit rnn input: the feature and filter axes
        # are merged into a single axis of size feat_len * num_filters.
        rnn_input = tf.reshape(
            conv1_drop, [params.batch_size, -1, feat_len * params.num_filters])
        # Permute into time major order for rnn
        rnn_input = tf.transpose(rnn_input, perm=[1, 0, 2])
        # Make one instance of cell on a fixed device,
        # and use copies of the weights on other devices.
        cell = rnn_cell.CustomRNNCell(params.num_hidden,
                                      activation=tf.nn.relu6,
                                      use_fp16=params.use_fp16)
        drop_cell = tf.contrib.rnn.DropoutWrapper(
            cell, output_keep_prob=params.keep_prob)
        # The same wrapped cell object is repeated for every layer.
        multi_cell = tf.contrib.rnn.MultiRNNCell([drop_cell] *
                                                 params.num_rnn_layers)

        # The convolution strides the time axis, so the per-utterance lengths
        # are divided by the same stride before being handed to the RNN.
        seq_lens = tf.div(seq_lens, params.temporal_stride)
        if params.rnn_type == 'uni-dir':
            rnn_outputs, _ = tf.nn.dynamic_rnn(multi_cell,
                                               rnn_input,
                                               sequence_length=seq_lens,
                                               dtype=dtype,
                                               time_major=True,
                                               scope='rnn',
                                               swap_memory=True)
        else:
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                multi_cell,
                multi_cell,
                rnn_input,
                sequence_length=seq_lens,
                dtype=dtype,
                time_major=True,
                scope='rnn',
                swap_memory=True)
            # Forward and backward outputs are summed element-wise, so the
            # output width stays equal to that of a single direction.
            outputs_fw, outputs_bw = outputs
            rnn_outputs = outputs_fw + outputs_bw
        _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights',
                                              [params.num_hidden, NUM_CLASSES],
                                              wd_value=None,
                                              use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        # Flatten to 2-D for the matmul, then restore the batch axis.
        logit_inputs = tf.reshape(rnn_outputs, [-1, cell.output_size])
        logits = tf.add(tf.matmul(logit_inputs, weights),
                        biases,
                        name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
Beispiel #2
0
def inference(session, feats, seq_lens, params):
    """Build the deepSpeech model graph and return its logits.

    The graph is: two convolution layers (each followed by batch
    normalization and a clipped ReLU), a stack of recurrent layers (uni- or
    bi-directional, chosen by params.rnn_type), and a final linear projection
    onto NUM_CLASSES outputs.

    Args:
      session: Not used by this function; kept in the signature so existing
        callers keep working.
      feats: MFCC features returned from distorted_inputs() or inputs(),
        laid out as N, T, F according to the layout notes in this function.
      seq_lens: Input sequence length per utterance.
      params: parameters of the model. Attributes read here: use_fp16,
        num_filters, batch_size, num_hidden, num_rnn_layers and rnn_type.

    Returns:
      Logits, reshaped to [-1, params.batch_size, NUM_CLASSES]. The RNN is
      run with time_major=True, so the leading axis follows the RNN output's
      time axis.
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU
    # training runs. If we only ran this model on a single GPU,
    # we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().

    dtype = tf.float16 if params.use_fp16 else tf.float32

    # data layout: N, T, F

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1') as scope:
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        ## N, T, F
        # `axis` is used instead of the deprecated `dim` keyword.
        feats = tf.expand_dims(feats, axis=-1)
        ## N, T, F, 1
        conv = tf.nn.conv2d(feats, kernel, [1, 2, 2, 1], padding='VALID')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)
        ## N, T, F, C
        # batch normalization
        bn = custom_ops.batch_norm(bias)

        # clipped ReLU
        conv1 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2') as scope:
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        ## N, T, F, C
        conv = tf.nn.conv2d(conv1, kernel, [1, 2, 1, 1], padding='VALID')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)
        ## N, T, F, C
        # batch normalization
        bn = custom_ops.batch_norm(bias)

        # clipped ReLU
        conv2 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    # Number of frequency bins left after both VALID convolutions. Read it
    # from the static shape so that other input feature sizes work too.
    freq_bins = conv2.get_shape().as_list()[2]
    if freq_bins is None:
        # Static shape unavailable: fall back to the width that used to be
        # hard-coded here.
        freq_bins = 75

    # Reshape conv output to fit rnn input: N, T, F * C
    rnn_input = tf.reshape(
        conv2, [params.batch_size, -1, freq_bins * params.num_filters])
    # Permute into time major order for rnn: T, N, F * C
    rnn_input = tf.transpose(rnn_input, perm=[1, 0, 2])
    # Make one instance of cell on a fixed device,
    # and use copies of the weights on other devices.
    cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                     use_fp16=params.use_fp16)
    # The same cell object is repeated for every layer.
    multi_cell = tf.contrib.rnn.MultiRNNCell([cell] * params.num_rnn_layers)

    # Sequence lengths as seen by the RNN, i.e. after the convolutions.
    rnn_seq_lens = get_rnn_seqlen(seq_lens)
    if params.rnn_type == 'uni-dir':
        rnn_outputs, _ = tf.nn.dynamic_rnn(multi_cell,
                                           rnn_input,
                                           sequence_length=rnn_seq_lens,
                                           dtype=dtype,
                                           time_major=True,
                                           swap_memory=True)
    else:
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            multi_cell,
            multi_cell,
            rnn_input,
            sequence_length=rnn_seq_lens,
            dtype=dtype,
            time_major=True,
            swap_memory=False)
        # Forward and backward outputs are summed element-wise, so the
        # output width stays equal to that of a single direction.
        outputs_fw, outputs_bw = outputs
        rnn_outputs = outputs_fw + outputs_bw
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        # Weights are stored as [NUM_CLASSES, num_hidden] and transposed
        # inside the matmul below.
        weights = _variable_with_weight_decay('weights',
                                              [NUM_CLASSES, params.num_hidden],
                                              wd_value=None,
                                              use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        # Flatten to 2-D for the matmul, then restore the batch axis.
        logit_inputs = tf.reshape(rnn_outputs, [-1, cell.output_size])
        logits = tf.add(tf.matmul(logit_inputs,
                                  weights,
                                  transpose_a=False,
                                  transpose_b=True),
                        biases,
                        name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
Beispiel #3
0
def inference(sess, feats, seq_lens, params):
    """Assemble the deepSpeech graph and return the output logits.

    Two convolution stages (convolution, batch normalization, clipped ReLU;
    no bias term is added), followed by a stacked bidirectional RNN and a
    linear projection onto NUM_CLASSES outputs.

    Args:
      sess: Not referenced inside this function.
      feats: MFCC features returned from distorted_inputs() or inputs(),
        laid out as N, T, F according to the layout notes in this function.
      seq_lens: Input sequence length per utterance.
      params: parameters of the model. Attributes read here: num_filters,
        use_fp16, batch_size, num_hidden and num_rnn_layers.

    Returns:
      logits, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    num_filters = params.num_filters
    use_fp16 = params.use_fp16

    # ---- convolutional layers (data layout: N, T, F) ----
    with tf.variable_scope('conv1'):
        # N, T, F -> N, T, F, 1
        feats = tf.expand_dims(feats, axis=3)

        conv1_kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, 41, 1, num_filters],
            wd_value=None,
            use_fp16=use_fp16)
        conv1_raw = tf.nn.conv2d(feats,
                                 conv1_kernel,
                                 strides=[1, 2, 2, 1],
                                 padding='VALID')

        # The convolution output goes straight into batch normalization,
        # then through the ReLU capped at 20.
        conv1_normed = custom_ops.batch_norm(conv1_raw)
        conv1 = custom_ops.relux(conv1_normed, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        # Input here is N, T, F, C.
        conv2_kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, 21, num_filters, num_filters],
            wd_value=None,
            use_fp16=use_fp16)
        conv2_raw = tf.nn.conv2d(conv1,
                                 conv2_kernel,
                                 strides=[1, 1, 2, 1],
                                 padding='VALID')

        conv2_normed = custom_ops.batch_norm(conv2_raw)
        conv2 = custom_ops.relux(conv2_normed, capping=20)
        _activation_summary(conv2)

    # ---- recurrent layers ----
    # Merge the last two static axes (F and C) into one feature axis, then
    # switch from N, T, F * C to time-major T, N, F * C.
    static_dims = conv2.get_shape().dims
    merged_width = static_dims[2].value * static_dims[3].value
    batch_major = tf.reshape(conv2, [params.batch_size, -1, merged_width])
    time_major = tf.transpose(batch_major, perm=[1, 0, 2])

    # One cell object is handed over for both the forward and the backward
    # direction.
    shared_cell = custom_ops.CustomRNNCell2(params.num_hidden)

    # Sequence lengths as seen by the RNN, i.e. after the convolutions.
    conved_seq_lens = get_rnn_seqlen(seq_lens)

    rnn_outputs = custom_ops.stacked_brnn(shared_cell, shared_cell,
                                          params.num_hidden,
                                          params.num_rnn_layers, time_major,
                                          params.batch_size, conved_seq_lens)
    _activation_summary(rnn_outputs)

    # ---- output projection ----
    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        # The projection input is twice as wide as num_hidden.
        brnn_width = params.num_hidden * 2
        weights = _variable_with_weight_decay('weights',
                                              [NUM_CLASSES, brnn_width],
                                              wd_value=None,
                                              use_fp16=use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0), use_fp16)

        flat_outputs = tf.reshape(rnn_outputs, [-1, brnn_width])
        # Weights are stored as [NUM_CLASSES, width], hence transpose_b.
        projected = tf.matmul(flat_outputs, weights, transpose_b=True)
        logits = tf.add(projected, biases, name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
def inference(sess, feats, seq_lens, params):
    """Build the deepSpeech model graph (NCHW convolutions) and return logits.

    The graph is: two NCHW convolution layers (each followed by batch
    normalization and a clipped ReLU), a stacked bidirectional RNN whose cell
    type depends on params.engine, and a final linear projection onto
    NUM_CLASSES outputs.

    Args:
      sess: Passed to MkldnnRNNCell when params.engine is 'mkldnn_rnn' or
        'cudnn_rnn'; not used otherwise.
      feats: MFCC features returned from distorted_inputs() or inputs(),
        laid out as N, T, F according to the layout notes in this function.
      seq_lens: Input sequence length per utterance.
      params: parameters of the model. Attributes read here: use_fp16,
        num_filters, batch_size, engine, num_hidden and num_rnn_layers.

    Returns:
      Logits, reshaped to [-1, params.batch_size, NUM_CLASSES].
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU
    # training runs. If we only ran this model on a single GPU,
    # we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().

    # data layout: N, T, F

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1'):
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        ## N, T, F
        feats = tf.expand_dims(feats, axis=1)
        ## N, 1, T, F
        conv = tf.nn.conv2d(feats,
                            kernel,
                            strides=[1, 1, 2, 2],
                            padding='VALID',
                            data_format='NCHW')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases, data_format='NCHW')
        ## N, C, T, F
        # batch normalization
        bn = custom_ops.batch_norm2(bias, data_format='NCHW')

        # clipped ReLU
        conv1 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2'):
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        ## N, C, T, F
        conv = tf.nn.conv2d(conv1,
                            kernel, [1, 1, 2, 1],
                            padding='VALID',
                            data_format='NCHW')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases, data_format='NCHW')
        ## N, C, T, F
        # batch normalization
        bn = custom_ops.batch_norm2(bias, data_format='NCHW')

        # clipped ReLU
        conv2 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    with tf.variable_scope('rnn'):
        # N, C, T, F => T, N, C, F
        rnn_input1 = tf.transpose(conv2, perm=[2, 0, 1, 3])

        # Number of frequency bins left after both VALID convolutions. Read
        # it from the static shape so other input feature sizes work too.
        freq_bins = conv2.get_shape().as_list()[3]
        if freq_bins is None:
            # Static shape unavailable: fall back to the width that used to
            # be hard-coded here.
            freq_bins = 75
        rnn_input_size = freq_bins * params.num_filters

        # Reshape conv output to fit rnn input: T, N, C * F
        rnn_input = tf.reshape(rnn_input1,
                               [-1, params.batch_size, rnn_input_size])

        if params.engine in ('mkldnn_rnn', 'cudnn_rnn'):
            # One cell per layer: the first consumes the conv features, each
            # later one consumes the previous layer's num_hidden outputs.
            layer_input_sizes = ([rnn_input_size] + [params.num_hidden] *
                                 (params.num_rnn_layers - 1))
            cell_list = [
                MkldnnRNNCell(sess,
                              params.num_hidden,
                              input_size=input_size,
                              use_fp16=params.use_fp16)
                for input_size in layer_input_sizes
            ]
        else:
            # The same cell object is repeated for every layer.
            cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                             use_fp16=params.use_fp16)
            cell_list = [cell] * params.num_rnn_layers

        # Sequence lengths as seen by the RNN, i.e. after the convolutions.
        rnn_seq_lens = get_rnn_seqlen(seq_lens)
        # The same cell list is passed for both directions.
        rnn_outputs = custom_ops.stacked_brnn(cell_list, cell_list,
                                              params.num_hidden,
                                              params.num_rnn_layers, rnn_input,
                                              rnn_seq_lens, params.batch_size)
        _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        # Weights are stored as [NUM_CLASSES, num_hidden] and transposed
        # inside the matmul below.
        weights = _variable_with_weight_decay('weights',
                                              [NUM_CLASSES, params.num_hidden],
                                              wd_value=None,
                                              use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        # Flatten to 2-D for the matmul, then restore the batch axis.
        logit_inputs = tf.reshape(rnn_outputs, [-1, params.num_hidden])
        logits = tf.add(tf.matmul(logit_inputs,
                                  weights,
                                  transpose_a=False,
                                  transpose_b=True),
                        biases,
                        name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits