Code Example #1
File: custom_ops.py  Project: EmergentOrder/Lantern
 def __call__(self, inputs, state, scope=None):
     """
      output = new_state = activation(BN(W * input) + U * state + B).
        state dim: batch_size * num_units
        input dim: batch_size * feature_size
        W: feature_size * num_units
        U: num_units * num_units
     """
     with tf.variable_scope(scope or type(self).__name__):
         # print "rnn cell input size: ", inputs.get_shape().as_list()
         # print "rnn cell state size: ", state.get_shape().as_list()
         wsize = inputs.get_shape()[1]
         w = _variable_on_cpu('W', [self._num_units, wsize], initializer=tf.orthogonal_initializer())
         # print w.name
         resi = tf.matmul(inputs, w, transpose_a=False, transpose_b=True)
         # batch_size * num_units
         bn_resi = seq_batch_norm(resi)
         # bn_resi = resi
         usize = state.get_shape()[1]
         u = _variable_on_cpu('U', [self._num_units, usize], initializer=tf.orthogonal_initializer())
         resu = tf.matmul(state, u, transpose_a=False, transpose_b=True)
         # res_nb = tf.add_n([bn_resi, resu])
         res_nb = tf.add(bn_resi, resu)
         bias = _variable_on_cpu('B', [self._num_units],
                                  tf.constant_initializer(0))
         res = tf.add(res_nb, bias)
         output = relux(res, capping=20)
     return output, output
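
These snippets lean on two helpers from custom_ops.py that the listing never shows: _variable_on_cpu and relux. The sketch below is a reconstruction inferred from the call sites in these examples (name, shape, optional initializer, use_fp16, trainable), not the project's actual implementation, so treat the defaults as assumptions.

import tensorflow as tf

def _variable_on_cpu(name, shape, initializer=None, use_fp16=False,
                     trainable=True):
    """Hypothetical helper: create a shared variable pinned to host memory."""
    dtype = tf.float16 if use_fp16 else tf.float32
    with tf.device('/cpu:0'):
        var = tf.get_variable(name, shape, dtype=dtype,
                              initializer=initializer, trainable=trainable)
    return var

def relux(x, capping=None):
    """Hypothetical clipped ReLU: relu(x), optionally capped at `capping`."""
    x = tf.nn.relu(x)
    if capping is not None:
        x = tf.minimum(x, capping)
    return x
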
Code Example #2
File: custom_ops.py  Project: yao-matrix/deepSpeech2
def seq_batch_norm(x, scope=None, is_train=True):
    """sequence batch normalization, input N * D"""
    with tf.name_scope(None):
        with tf.variable_scope("sbn", reuse=None):
            inputs_shape = x.get_shape()
            param_shape = inputs_shape[-1]
            beta = _variable_on_cpu('beta', [param_shape],
                                    initializer=tf.zeros_initializer(),
                                    trainable=False)
            gamma = _variable_on_cpu('gamma', [param_shape],
                                     initializer=tf.ones_initializer(),
                                     trainable=False)
            batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')

            moving_mean = _variable_on_cpu('moving_mean', [param_shape],
                                           initializer=tf.zeros_initializer(),
                                           trainable=False)
            moving_variance = _variable_on_cpu(
                'moving_variance', [param_shape],
                initializer=tf.ones_initializer(),
                trainable=False)
            # NOTE: assign_moving_average() only returns update ops; as written
            # nothing runs them, so the moving statistics never change from
            # their initial values (see the wiring sketch after this example).
            moving_averages.assign_moving_average(moving_mean, batch_mean,
                                                  0.997)
            moving_averages.assign_moving_average(moving_variance, batch_var,
                                                  0.997)
            normed = tf.nn.batch_normalization(x, moving_mean, moving_variance,
                                               beta, gamma, 1e-5)
    return normed
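
As the comment above notes, assign_moving_average() only builds update ops; the moving statistics change only if something runs those ops. A minimal wiring sketch under the usual TF1 UPDATE_OPS pattern follows (optimizer and loss are placeholder names, and it assumes from tensorflow.python.training import moving_averages as in the snippet):

# Inside seq_batch_norm, keep the update ops instead of discarding them:
update_mean = moving_averages.assign_moving_average(moving_mean, batch_mean, 0.997)
update_var = moving_averages.assign_moving_average(moving_variance, batch_var, 0.997)
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_mean)
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_var)

# When building the training op, run them alongside the optimizer step:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss)
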
Code Example #3
def seq_batch_norm(x, scope=None, is_train=True):
    """sequence batch normalization, input N * D"""
    with tf.variable_scope("sbn"):
        inputs_shape = x.get_shape()
        param_shape = inputs_shape[-1]

        batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')

        ema = tf.train.ExponentialMovingAverage(decay=0.9997)

        def mean_var_with_update():
            ema_apply_op = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(batch_mean), tf.identity(batch_var)

        mean, var = control_flow_ops.cond(
            tf.cast(is_train, "bool"), mean_var_with_update, lambda:
            (ema.average(batch_mean), ema.average(batch_var)))

        offset = _variable_on_cpu('offset', [param_shape],
                                  initializer=tf.zeros_initializer(),
                                  trainable=True)
        scale = _variable_on_cpu('scale', [param_shape],
                                 initializer=tf.ones_initializer(),
                                 trainable=True)

        normed = tf.nn.batch_normalization(x, mean, var, offset, scale, 0.001)
    return normed
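
A minimal usage sketch for this variant, assuming 2-D activations and a boolean placeholder so the same graph can switch between batch statistics and the moving averages (the tensor names and sizes are illustrative):

x = tf.placeholder(tf.float32, [None, 256])              # N * D activations
is_train = tf.placeholder_with_default(True, shape=[])   # feed False at eval time
y = seq_batch_norm(x, is_train=is_train)
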
Code Example #4
File: custom_ops.py  Project: EmergentOrder/Lantern
def _linear(args, output_size, bias, scope=None, use_fp16=False):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    Args:
      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
      output_size: int, second dimension of W[i].
      bias: boolean, whether to add a bias term or not.
      scope: VariableScope for the created subgraph; defaults to "Linear".
      use_fp16: boolean, whether to create the variables in float16.

    Returns:
      A 2D Tensor with shape [batch x output_size] equal to
      sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
      ValueError: if any of the arguments has an unspecified or wrong shape.
    """
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape().as_list() for a in args]
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError(
                "Linear is expecting 2D arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError(
                "Linear expects shape[1] of arguments: %s" % str(shapes))
        else:
            total_arg_size += shape[1]

    dtype = [a.dtype for a in args][0]

    # Now the computation.
    with tf.variable_scope(scope or "Linear"):
        # Stored as [output_size, total_arg_size] so the matmul below can use
        # transpose_b=True, matching the weight layout used elsewhere in this file.
        matrix = _variable_on_cpu('Matrix', [output_size, total_arg_size],
                                  use_fp16=use_fp16)
        if use_fp16:
            dtype = tf.float16
        else:
            dtype = tf.float32
        args = [tf.cast(x, dtype) for x in args]
        if len(args) == 1:
            res = tf.matmul(args[0], matrix, transpose_a=False, transpose_b=True)
        else:
            res = tf.matmul(tf.concat(args, 1), matrix, transpose_a=False, transpose_b=True)
        if not bias:
            return res
        bias_term = _variable_on_cpu('Bias', [output_size],
                                     tf.constant_initializer(0),
                                     use_fp16=use_fp16)
    return res + bias_term
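
A brief usage sketch for _linear, assuming two batch-major 2-D tensors projected through a single shared weight matrix (shapes and the scope name are illustrative):

inputs = tf.placeholder(tf.float32, [32, 161])    # batch x feature
state = tf.placeholder(tf.float32, [32, 1024])    # batch x num_units
# Concatenates the args along dim 1 and applies one [1024, 161 + 1024] matrix.
projected = _linear([inputs, state], output_size=1024, bias=True,
                    scope='rnn_linear')
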
Code Example #5
File: custom_ops.py  Project: EmergentOrder/Lantern
def batch_norm2(inputs,
                decay=0.999,
                center=True,
                scale=True,
                epsilon=0.001,
                moving_vars='moving_vars',
                activation=None,
                is_training=True,
                trainable=True,
                scope=None,
                data_format='NHWC'):
  """Adds a Batch Normalization layer.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels]
            or [batch_size, channels].
    decay: decay for the moving average.
    center: If True, subtract beta. If False, beta is not created and ignored.
    scale: If True, multiply by gamma. If False, gamma is
      not used. When the next layer is linear (also e.g. ReLU), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: small float added to variance to avoid dividing by zero.
    moving_vars: collection to store the moving_mean and moving_variance.
    activation: activation function.
    is_training: whether or not the model is in training mode.
    trainable: whether or not the variables should be trainable or not.
    scope: Optional scope for variable_scope.

  Returns:
    a tensor representing the output of the operation.

  """
  inputs_shape = inputs.get_shape()
  with tf.variable_scope('bn2'):
    if data_format == 'NCHW':
      params_shape = inputs_shape[1]
    else:
      params_shape = inputs_shape[-1]

    # scale
    scale = _variable_on_cpu('scale', params_shape, initializer=tf.ones_initializer())
    # shift
    shift = _variable_on_cpu('shift', params_shape, initializer=tf.zeros_initializer())

    moving_mean = _variable_on_cpu('moving_mean', [params_shape],
                                   initializer=tf.zeros_initializer(),
                                   trainable=False)
    moving_var = _variable_on_cpu('moving_variance', [params_shape],
                                  initializer=tf.ones_initializer(),
                                  trainable=False)

    if is_training:
      y, batch_mean, batch_var = tf.nn.fused_batch_norm(
          inputs, scale, shift, mean=None, variance=None, epsilon=epsilon,
          data_format=data_format, is_training=is_training)
      # Attach the moving-average updates so they actually run with the output.
      update_mean = moving_averages.assign_moving_average(moving_mean,
                                                          batch_mean, decay)
      update_var = moving_averages.assign_moving_average(moving_var,
                                                         batch_var, decay)
      with tf.control_dependencies([update_mean, update_var]):
        y = tf.identity(y)
    else:
      y, _, _ = tf.nn.fused_batch_norm(
          inputs, scale, shift, mean=moving_mean, variance=moving_var,
          epsilon=epsilon, data_format=data_format, is_training=is_training)
    return y
Code Example #6
def batch_norm2(inputs,
               decay = 0.999,
               center = True,
               scale = True,
               epsilon = 0.001,
               moving_vars = 'moving_vars',
               activation = None,
               is_training = True,
               trainable = True,
               scope = None,
               reuse = None,
               data_format = 'NHWC'):
  """Adds a Batch Normalization layer.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels]
            or [batch_size, channels].
    decay: decay for the moving average.
    center: If True, subtract beta. If False, beta is not created and ignored.
    scale: If True, multiply by gamma. If False, gamma is
      not used. When the next layer is linear (also e.g. ReLU), this can be
      disabled since the scaling can be done by the next layer.
    epsilon: small float added to variance to avoid dividing by zero.
    moving_vars: collection to store the moving_mean and moving_variance.
    activation: activation function.
    is_training: whether or not the model is in training mode.
    trainable: whether or not the variables should be trainable or not.
    scope: Optional scope for variable_scope.
    reuse: whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.

  Returns:
    a tensor representing the output of the operation.

  """
  inputs_shape = inputs.get_shape()
  with tf.variable_scope(scope, 'bn2', [inputs], reuse = reuse):
    axis = list(range(len(inputs_shape) - 1))
    if data_format == 'NCHW':
      params_shape = inputs_shape[1]
    else:
      params_shape = inputs_shape[-1]
    # Allocate parameters for the beta and gamma of the normalization.
    beta, gamma = None, None
    if center:
      beta = _variable_on_cpu('beta', params_shape, initializer = tf.zeros_initializer())

    if scale:
      gamma = _variable_on_cpu('gamma', params_shape, initializer = tf.ones_initializer())
 
    outputs, _, _ = tf.nn.fused_batch_norm(
        inputs, gamma, beta, mean = None, variance = None, epsilon = epsilon,
        data_format = data_format, is_training = is_training)
    outputs.set_shape(inputs.get_shape())

    return outputs
Code Example #7
def batch_norm(x, scope = None, is_train = True, data_format = None):
    """batch normalization, currently only work on NHWC"""
    with tf.variable_scope(scope or 'bn'):
        inputs_shape = x.get_shape()
        param_shape = inputs_shape[-1]        
        beta = _variable_on_cpu('beta', [param_shape], initializer = tf.zeros_initializer())
        gamma = _variable_on_cpu('gamma', [param_shape], initializer = tf.ones_initializer())
        batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name = 'moments')
        ema = tf.train.ExponentialMovingAverage(decay = 0.5, zero_debias = True)
        def mean_var_with_update():
            ema_apply_op = ema.apply([batch_mean, batch_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(batch_mean), tf.identity(batch_var)
        if is_train:
            mean, var = mean_var_with_update()
        else:
            # ema.average() returns None unless ema.apply() was called earlier
            # in this graph, so this branch assumes a graph built for training.
            mean, var = ema.average(batch_mean), ema.average(batch_var)
        normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-5)
    return normed
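
A short usage sketch on a conv feature map, assuming NHWC activations (the shapes are illustrative); the moments are taken over N, H, W so each channel gets its own mean and variance:

conv_out = tf.placeholder(tf.float32, [16, 100, 40, 32])   # N, H, W, C
bn_out = batch_norm(conv_out, scope='conv1_bn', is_train=True)
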
Code Example #8
File: custom_ops.py  Project: leepaul009/deepSpeech
 def __call__(self, inputs, state, scope = None):
     """Most basic RNN:
     output = new_state = activation(BN(W * input) + U * state + B).
       state dim: batch_size * num_units
      input dim: batch_size * feature_size
      W: feature_size * num_units
      U: num_units * num_units
     """
     with tf.variable_scope(scope or type(self).__name__):
         # print "rnn cell input size: ", inputs.get_shape().as_list()
         # print "rnn cell state size: ", state.get_shape().as_list()
         wsize = inputs.get_shape()[1]
         w = _variable_on_cpu('W', [self._num_units, wsize], use_fp16 = self.use_fp16)
         resi = tf.matmul(inputs, w, transpose_a = False, transpose_b = True)
         # batch_size * num_units
         bn_resi = seq_batch_norm(resi)
         usize = state.get_shape()[1]
         u = _variable_on_cpu('U', [self._num_units, usize], use_fp16 = self.use_fp16)
         resu = tf.matmul(state, u, transpose_a = False, transpose_b = True)
         bias = _variable_on_cpu('B', [self._num_units],
                                  tf.constant_initializer(0),
                                  use_fp16 = self.use_fp16)
         output = relux(tf.add(bn_resi, resu) + bias, capping = 20)
     return output, output
Code Example #9
File: mkldnn_rnn_op.py  Project: supunab/Lantern
 def __call__(self, inputs, state, scope=None, weight_size=None):
     with tf.variable_scope(scope or type(self).__name__):
         # if len(inputs.get_shape()) == 2:
         #   inputs = tf.expand_dims(inputs, axis=0)
         # state = tf.expand_dims(state, axis=0)
         # print "input size: ", inputs.get_shape(), " state size: ", state.get_shape()
         rnn_weights = _variable_on_cpu(
             "rnn_weights", [self.param_size],
             tf.constant_initializer(1.0 / self.param_size), self.use_fp16)
         output, output_h = self.model(input_data=inputs,
                                       input_h=state,
                                       params=rnn_weights,
                                       is_training=True)
         # print "output size: ", output.get_shape(), "output h size: ", output_h.get_shape()
         # output = tf.squeeze(output, axis=0)
         # output_h = tf.squeeze(output_h, axis=0)
     return output, output_h
Code Example #10
def inference(feats, seq_lens, params):
    """Build the deepSpeech model.

    Args:
      feats: MFCC features returned from distorted_inputs() or inputs().
      seq_lens: Input sequence length per utterance.
      params: parameters of the model.

    Returns:
      Logits.
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU
    # training runs. If we only ran this model on a single GPU,
    # we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().

    if params.use_fp16:
        dtype = tf.float16
    else:
        dtype = tf.float32

    feat_len = feats.get_shape().as_list()[-1]

    # convolutional layers
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, feat_len, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        feats = tf.expand_dims(feats, axis=-1)
        conv = tf.nn.conv2d(feats,
                            kernel, [1, params.temporal_stride, 1, 1],
                            padding='SAME')
        # conv = tf.nn.atrous_conv2d(feats, kernel, rate=2, padding='SAME')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv1)

        # dropout
        conv1_drop = tf.nn.dropout(conv1, params.keep_prob)

    # recurrent layers
    with tf.variable_scope('rnn') as scope:

        # Reshape conv output to fit rnn input
        rnn_input = tf.reshape(
            conv1_drop, [params.batch_size, -1, feat_len * params.num_filters])
        # Permute into time major order for rnn
        rnn_input = tf.transpose(rnn_input, perm=[1, 0, 2])
        # Make one instance of cell on a fixed device,
        # and use copies of the weights on other devices.
        cell = rnn_cell.CustomRNNCell(params.num_hidden,
                                      activation=tf.nn.relu6,
                                      use_fp16=params.use_fp16)
        drop_cell = tf.contrib.rnn.DropoutWrapper(
            cell, output_keep_prob=params.keep_prob)
        multi_cell = tf.contrib.rnn.MultiRNNCell([drop_cell] *
                                                 params.num_rnn_layers)

        seq_lens = tf.div(seq_lens, params.temporal_stride)
        if params.rnn_type == 'uni-dir':
            rnn_outputs, _ = tf.nn.dynamic_rnn(multi_cell,
                                               rnn_input,
                                               sequence_length=seq_lens,
                                               dtype=dtype,
                                               time_major=True,
                                               scope='rnn',
                                               swap_memory=True)
        else:
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                multi_cell,
                multi_cell,
                rnn_input,
                sequence_length=seq_lens,
                dtype=dtype,
                time_major=True,
                scope='rnn',
                swap_memory=True)
            outputs_fw, outputs_bw = outputs
            rnn_outputs = outputs_fw + outputs_bw
        _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights',
                                              [params.num_hidden, NUM_CLASSES],
                                              wd_value=None,
                                              use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        logit_inputs = tf.reshape(rnn_outputs, [-1, cell.output_size])
        logits = tf.add(tf.matmul(logit_inputs, weights),
                        biases,
                        name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
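
The softmax_linear comment says the softmax is applied by the CTC cost function; below is a minimal TF1 wiring sketch, assuming time-major logits as returned by inference() and sparse integer transcripts. The label placeholder and length tensor are illustrative; the lengths must match the downsampled time axis of logits, which inference() obtains by dividing seq_lens by temporal_stride.

# logits: [max_time, batch_size, NUM_CLASSES], time major.
labels = tf.sparse_placeholder(tf.int32)                   # target transcripts
conv_seq_lens = tf.div(seq_lens, params.temporal_stride)   # post-stride lengths
ctc = tf.nn.ctc_loss(labels=labels, inputs=logits,
                     sequence_length=conv_seq_lens, time_major=True)
loss = tf.reduce_mean(ctc)
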
Code Example #11
File: deepSpeech.py  Project: KyleHai/DeepSpeech2
def inference(sess, feats, seq_lens, params):
    """Build the deepSpeech model.

    Args:
      feats: MFCC features returned from distorted_inputs() or inputs().
      seq_lens: Input sequence length per utterance.
      params: parameters of the model.

    Returns:
      logits.
    """
    # data layout: N, T, F
    # feat_len = feats.get_shape().as_list()[-1]
    # print "feat shape: ", feats.get_shape().as_list()

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1') as scope:
        ## N, T, F
        feats = tf.expand_dims(feats, axis=3)

        ## N, T, F, 1
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, 41, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        conv = tf.nn.conv2d(feats, kernel, [1, 2, 2, 1], padding='VALID')
        # biases = _variable_on_cpu('biases', [params.num_filters],
        #                           tf.constant_initializer(-0.05),
        #                          params.use_fp16)
        # bias = tf.nn.bias_add(conv, biases)

        ## N, T, F, 32
        # batch normalization
        bn = custom_ops.batch_norm(conv)

        # clipped ReLU
        conv1 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2') as scope:
        ## N, T, F, 32
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[11, 21, params.num_filters, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)
        conv = tf.nn.conv2d(conv1, kernel, [1, 1, 2, 1], padding='VALID')
        # biases = _variable_on_cpu('biases',
        #                           [params.num_filters],
        #                           tf.constant_initializer(-0.05),
        #                           params.use_fp16)
        # bias = tf.nn.bias_add(conv, biases)

        ## N, T, F, 32
        # batch normalization
        bn = custom_ops.batch_norm(conv)

        # clipped ReLU
        conv2 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    # Reshape conv output to fit rnn input: N, T, F * C
    fdim = conv2.get_shape().dims
    feat_dim = fdim[2].value * fdim[3].value
    rnn_input = tf.reshape(conv2, [params.batch_size, -1, feat_dim])

    # Permute into time major order for rnn: T, N, F * C
    rnn_input = tf.transpose(rnn_input, perm=[1, 0, 2])

    fw_cell = custom_ops.CustomRNNCell2(params.num_hidden)
    # fw_cell_list = [fw_cell] * params.num_rnn_layers

    # bw_cell = custom_ops.CustomRNNCell2(params.num_hidden)
    # bw_cell_list = [bw_cell] * params.num_rnn_layers

    conved_seq_lens = get_rnn_seqlen(seq_lens)

    rnn_outputs = custom_ops.stacked_brnn(fw_cell, fw_cell, params.num_hidden,
                                          params.num_rnn_layers, rnn_input,
                                          params.batch_size, conved_seq_lens)
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay(
            'weights', [NUM_CLASSES, params.num_hidden * 2],
            wd_value=None,
            use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        logit_inputs = tf.reshape(rnn_outputs, [-1, params.num_hidden * 2])
        logits = tf.add(tf.matmul(logit_inputs,
                                  weights,
                                  transpose_a=False,
                                  transpose_b=True),
                        biases,
                        name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
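
get_rnn_seqlen() is referenced but never shown; it maps each utterance's original length to the time steps left after the two convolutions. The sketch below is only an assumption based on this example's configuration (time kernel 11 with 'VALID' padding in both layers, time stride 2 in conv1 and 1 in conv2); the project's real helper may differ.

def get_rnn_seqlen(seq_lens):
    """Hypothetical post-conv sequence lengths for the conv layout above."""
    t = tf.cast(seq_lens, tf.float32)
    t = tf.ceil((t - 11.0 + 1.0) / 2.0)   # conv1: kernel 11, stride 2, VALID
    t = t - 11.0 + 1.0                    # conv2: kernel 11, stride 1, VALID
    return tf.cast(t, tf.int32)
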
Code Example #12
def inference(session, feats, seq_lens, params):
    """Build the deepSpeech model.

    Args:
      feats: MFCC features returned from distorted_inputs() or inputs().
      seq_lens: Input sequence length per utterance.
      params: parameters of the model.

    Returns:
      Logits.
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU
    # training runs. If we only ran this model on a single GPU,
    # we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().

    if params.use_fp16:
        dtype = tf.float16
    else:
        dtype = tf.float32

    feat_len = feats.get_shape().as_list()[-1]
    # data layout: N, T, F
    # print "feat shape: ", feats.get_shape().as_list()

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1') as scope:
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        ## N, T, F
        feats = tf.expand_dims(feats, axis=-1)
        ## N, T, F, 1
        conv = tf.nn.conv2d(feats, kernel, [1, 2, 2, 1], padding='VALID')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)
        ## N, T, F, 32
        # batch normalization
        bn = custom_ops.batch_norm(bias)

        # clipped ReLU
        conv1 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2') as scope:
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        ## N, T, F, 32
        conv = tf.nn.conv2d(conv1, kernel, [1, 2, 1, 1], padding='VALID')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases)
        ## N, T, F, 32
        # batch normalization
        bn = custom_ops.batch_norm(bias)

        # clipped ReLU
        conv2 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    # Reshape conv output to fit rnn input: N, T, F * 32
    rnn_input = tf.reshape(conv2,
                           [params.batch_size, -1, 75 * params.num_filters])
    # Permute into time major order for rnn: T, N, F * 32
    rnn_input = tf.transpose(rnn_input, perm=[1, 0, 2])
    # Make one instance of cell on a fixed device,
    # and use copies of the weights on other devices.
    cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                     use_fp16=params.use_fp16)
    multi_cell = tf.contrib.rnn.MultiRNNCell([cell] * params.num_rnn_layers)

    rnn_seq_lens = get_rnn_seqlen(seq_lens)
    if params.rnn_type == 'uni-dir':
        rnn_outputs, _ = tf.nn.dynamic_rnn(multi_cell,
                                           rnn_input,
                                           sequence_length=rnn_seq_lens,
                                           dtype=dtype,
                                           time_major=True,
                                           swap_memory=True)
    else:
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            multi_cell,
            multi_cell,
            rnn_input,
            sequence_length=rnn_seq_lens,
            dtype=dtype,
            time_major=True,
            swap_memory=False)
        outputs_fw, outputs_bw = outputs
        rnn_outputs = outputs_fw + outputs_bw
    _activation_summary(rnn_outputs)

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights',
                                              [NUM_CLASSES, params.num_hidden],
                                              wd_value=None,
                                              use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        logit_inputs = tf.reshape(rnn_outputs, [-1, cell.output_size])
        logits = tf.add(tf.matmul(logit_inputs,
                                  weights,
                                  transpose_a=False,
                                  transpose_b=True),
                        biases,
                        name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits
Code Example #13
def inference(sess, feats, seq_lens, params):
    """Build the deepSpeech model.

    Args:
      feats: MFCC features returned from distorted_inputs() or inputs().
      seq_lens: Input sequence length per utterance.
      params: parameters of the model.

    Returns:
      Logits.
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU
    # training runs. If we only ran this model on a single GPU,
    # we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().

    if params.use_fp16:
        dtype = tf.float16
    else:
        dtype = tf.float32

    feat_len = feats.get_shape().as_list()[-1]
    # data layout: N, T, F
    # print "feat shape: ", feats.get_shape().as_list()

    #########################
    #  convolutional layers
    #########################
    with tf.variable_scope('conv1') as scope:
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[20, 5, 1, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        ## N, T, F
        feats = tf.expand_dims(feats, axis=1)
        ## N, 1, T, F
        conv = tf.nn.conv2d(feats,
                            kernel,
                            strides=[1, 1, 2, 2],
                            padding='VALID',
                            data_format='NCHW')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases, data_format='NCHW')
        ## N, 32, T, F
        # batch normalization
        bn = custom_ops.batch_norm2(bias, data_format='NCHW')

        # clipped ReLU
        conv1 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv1)

    with tf.variable_scope('conv2') as scope:
        # convolution
        kernel = _variable_with_weight_decay(
            'weights',
            shape=[10, 5, params.num_filters, params.num_filters],
            wd_value=None,
            use_fp16=params.use_fp16)

        ## N, 32, T, F
        conv = tf.nn.conv2d(conv1,
                            kernel, [1, 1, 2, 1],
                            padding='VALID',
                            data_format='NCHW')
        biases = _variable_on_cpu('biases', [params.num_filters],
                                  tf.constant_initializer(-0.05),
                                  params.use_fp16)
        bias = tf.nn.bias_add(conv, biases, data_format='NCHW')
        ## N, 32, T, F
        # batch normalization
        bn = custom_ops.batch_norm2(bias, data_format='NCHW')

        # clipped ReLU
        conv2 = custom_ops.relux(bn, capping=20)
        _activation_summary(conv2)

    ######################
    # recurrent layers
    ######################
    # conv2 = tf.Print(conv2, [conv2.get_shape()], "Conved Tensor Shape: ")
    with tf.variable_scope('rnn') as scope:
        # N, C, T, F => T, N, C, F
        rnn_input1 = tf.transpose(conv2, perm=[2, 0, 1, 3])
        # Reshape conv output to fit rnn input: T, N, 32 * F
        rnn_input = tf.reshape(
            rnn_input1, [-1, params.batch_size, 75 * params.num_filters])
        # Make one instance of cell on a fixed device,
        # and use copies of the weights on other devices.
        cell_list = []
        if params.engine == 'mkldnn_rnn' or params.engine == 'cudnn_rnn':
            cell_list.append(
                MkldnnRNNCell(sess,
                              params.num_hidden,
                              input_size=75 * params.num_filters,
                              use_fp16=params.use_fp16))
            for i in range(params.num_rnn_layers - 1):
                cell_list.append(
                    MkldnnRNNCell(sess,
                                  params.num_hidden,
                                  input_size=params.num_hidden,
                                  use_fp16=params.use_fp16))
        else:
            cell = custom_ops.CustomRNNCell2(params.num_hidden,
                                             use_fp16=params.use_fp16)
            cell_list = [cell] * params.num_rnn_layers

        rnn_seq_lens = get_rnn_seqlen(seq_lens)
        rnn_outputs = custom_ops.stacked_brnn(cell_list, cell_list,
                                              params.num_hidden,
                                              params.num_rnn_layers, rnn_input,
                                              rnn_seq_lens, params.batch_size)
        _activation_summary(rnn_outputs)

    # print "rnn output:", rnn_outputs.get_shape()

    # Linear layer(WX + b) - softmax is applied by CTC cost function.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights',
                                              [NUM_CLASSES, params.num_hidden],
                                              wd_value=None,
                                              use_fp16=params.use_fp16)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0),
                                  params.use_fp16)
        logit_inputs = tf.reshape(rnn_outputs, [-1, params.num_hidden])
        logits = tf.add(tf.matmul(logit_inputs,
                                  weights,
                                  transpose_a=False,
                                  transpose_b=True),
                        biases,
                        name=scope.name)
        logits = tf.reshape(logits, [-1, params.batch_size, NUM_CLASSES])
        _activation_summary(logits)

    return logits