Example 1
def tower_loss(scope, feats, labels, seq_lens):
    """Calculate the total loss on a single tower running the deepSpeech model.

    This function builds the graph for computing the loss per tower (GPU).

    ARGS:
      scope: unique prefix string identifying the
             deepSpeech tower, e.g. 'tower_0'
      feats: Tensor of shape BxFxT representing the
             audio features (mfccs or spectrogram).
      labels: sparse tensor holding labels of each utterance.
      seq_lens: tensor of shape [batch_size] holding
              the sequence length per input utterance.
    Returns:
       Tensor of shape [batch_size] containing
       the total loss for a batch of data
    """

    # Build inference Graph.
    logits = deepSpeech.inference(feats, seq_lens, ARGS)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    strided_seq_lens = tf.div(seq_lens, ARGS.temporal_stride)
    _ = deepSpeech.loss(logits, labels, strided_seq_lens)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    # Attach a scalar summary to all individual losses and the total loss;
    # do the same for the averaged version of the losses.
    for loss in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a
        # multi-GPU training session. This helps the clarity
        # of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % helper_routines.TOWER_NAME, '',
                           loss.op.name)
        # Name each loss as '(raw)' and name the moving average
        # version of the loss as the original loss name.
        tf.summary.scalar(loss_name + '(raw)', loss)
        tf.summary.scalar(loss_name, loss_averages.average(loss))

    # The control dependency ensures loss_averages_op runs whenever
    # total_loss is evaluated.
    with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)
    return total_loss
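A per-tower loss like this is normally consumed by a multi-GPU driver loop. The sketch below shows one plausible way to build the towers; it is not taken from the original repository, and num_gpus, learning_rate and the per-tower input shards (feats_splits, labels_splits, seq_lens_splits) are assumed to be defined elsewhere.

tower_grads = []
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
with tf.variable_scope(tf.get_variable_scope()):
    for i in range(num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (helper_routines.TOWER_NAME, i)) as scope:
                # Each tower computes the loss on its shard of the batch.
                loss = tower_loss(scope, feats_splits[i], labels_splits[i],
                                  seq_lens_splits[i])
                # Share model variables between towers.
                tf.get_variable_scope().reuse_variables()
                # Keep this tower's gradients for averaging later.
                tower_grads.append(optimizer.compute_gradients(loss))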
Example 2
def train():
    """
    Train deepSpeech for a number of steps.
      This function build a set of ops required to build the model and optimize
      weights.
    """

    # Learning rate set up
    learning_rate, global_step = set_learning_rate()

    # Create an optimizer that performs gradient descent.
    # optimizer = tf.train.AdamOptimizer(learning_rate)

    # forward pass to compute loss
    freq_size = 161
    inputs = tf.placeholder(tf.float32, [ARGS.batch_size, freq_size, None])
    # CTC labels are fed as a sparse tensor of token ids (see tower_loss above).
    targets = tf.sparse_placeholder(tf.int32)
    # Per-utterance input lengths for the batch.
    max_seqlen = tf.placeholder(tf.int32, [ARGS.batch_size])

    logits = deepSpeech.inference(inputs, max_seqlen, ARGS)
    # ctcloss
    loss = deepSpeech.loss(logits, targets, max_seqlen)
    # backward optimize
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # optimizer.minimize(loss)
    grads_and_vars = optimizer.compute_gradients(loss)
    clipped_grads_and_vars = [(tf.clip_by_value(grad,
                                                clip_value_min=-400,
                                                clip_value_max=400), var)
                              for grad, var in grads_and_vars]
    apply_gradient_op = optimizer.apply_gradients(clipped_grads_and_vars,
                                                  global_step=global_step)
    train_op = apply_gradient_op

    # All ops above were added to the default graph; capture it so it can
    # be finalized below.
    g = tf.get_default_graph()
    with g.as_default(), tf.device('/device:GPU:0'):

        # Start running operations on the Graph. allow_soft_placement
        # must be set to True to build towers on GPU, as some of the
        # ops do not have GPU implementations.
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=ARGS.log_device_placement))

        print("forbid the use of checkpoint")
        sess.run(tf.global_variables_initializer())
        # Start the queue runners.
        tf.train.start_queue_runners(sess)
        g.finalize()

        # Run training loop
        # run_train_loop(sess, (loss_op, train_op, summary_op), saver)
        run_train_loop(sess, (logits, loss, train_op))
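run_train_loop is defined elsewhere in the repository; the variant below is only a sketch of what such a loop might look like, assuming an ARGS.max_steps flag, a hypothetical next_batch() helper, and that the placeholders are passed in alongside the ops.

def run_train_loop(sess, ops, placeholders):
    """Hypothetical training loop; not the repository's actual implementation."""
    logits_op, loss_op, train_op = ops
    inputs, targets, max_seqlen = placeholders
    for step in range(ARGS.max_steps):
        # next_batch() is assumed to return padded features of shape
        # [batch_size, freq, time], a tf.SparseTensorValue holding the
        # label ids, and the per-utterance input lengths.
        batch_feats, batch_labels, batch_lens = next_batch(ARGS.batch_size)
        _, loss_value = sess.run([train_op, loss_op],
                                 feed_dict={inputs: batch_feats,
                                            targets: batch_labels,
                                            max_seqlen: batch_lens})
        if step % 100 == 0:
            print('step %d: loss = %.3f' % (step, loss_value))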
Example 3
def evaluate():
    """ Evaluate deepSpeech modelfor a number of steps."""
    if ARGS.lm_model != None:
        print(ARGS.lm_model)
        lm = LM(ARGS.lm_model)
    else:
        lm = None

    with tf.Graph().as_default() as graph:

        # Get feats and labels for deepSpeech.
        feats, labels, seq_lens = deepSpeech.inputs(ARGS.eval_data,
                                                    data_dir=ARGS.data_dir,
                                                    batch_size=ARGS.batch_size,
                                                    use_fp16=ARGS.use_fp16,
                                                    shuffle=True)

        # Build ops that computes the logits predictions from the
        # inference model.
        ARGS.keep_prob = 1.0  # Disable dropout during testing.
        logits = deepSpeech.inference(feats, seq_lens, ARGS)

        # Calculate predictions.
        output_log_prob = tf.nn.softmax(logits)
        decoder = tf.nn.ctc_greedy_decoder
        strided_seq_lens = tf.div(seq_lens, ARGS.temporal_stride)
        predictions = decoder(output_log_prob, strided_seq_lens)

        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            ARGS.moving_avg_decay)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(ARGS.eval_dir, graph)

        while True:
            eval_once(saver, summary_writer, predictions, summary_op, labels,
                      output_log_prob, lm)

            if ARGS.run_once:
                break
            time.sleep(ARGS.eval_interval_secs)
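eval_once is defined elsewhere in the repository; the sketch below only illustrates the checkpoint-restore step such a helper typically performs, assuming an ARGS.checkpoint_dir flag, and it leaves the decoding/scoring details out.

def eval_once(saver, summary_writer, predictions, summary_op, labels,
              output_log_prob, lm):
    """Hypothetical single evaluation pass; not the repository's own code."""
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(ARGS.checkpoint_dir)
        if not (ckpt and ckpt.model_checkpoint_path):
            print('No checkpoint file found')
            return
        # Restore the moving-average weights listed by variables_to_restore().
        saver.restore(sess, ckpt.model_checkpoint_path)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        # decoded is a list with one SparseTensorValue of predicted label ids.
        decoded, _ = sess.run(predictions)
        # ... compare decoded against labels, optionally rescoring with lm ...
        summary_writer.add_summary(sess.run(summary_op))
        coord.request_stop()
        coord.join(threads)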
def tower_loss(sess, feats, labels, seq_lens):
    """Calculate the total loss on a single tower running the deepSpeech model.

    This function builds the graph for computing the loss per tower (GPU).

    ARGS:
      sess: TensorFlow session passed through to deepSpeech.inference.
      feats: Tensor of shape BxFxT representing the
             audio features (mfccs or spectrogram).
      labels: sparse tensor holding labels of each utterance.
      seq_lens: tensor of shape [batch_size] holding
              the sequence length per input utterance.
    Returns:
       Tensor of shape [batch_size] containing
       the total loss for a batch of data
    """

    # Build inference Graph.
    logits = deepSpeech.inference(sess, feats, seq_lens, ARGS)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    total_loss = deepSpeech.loss(logits, labels, seq_lens)

    # Compute the moving average of all individual losses and the total loss.
    # loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    # loss_averages_op = loss_averages.apply([total_loss])

    # Attach a scalar summary to all individual losses and the total loss;
    # do the same for the averaged version of the losses.
    # loss_name = total_loss.op.name
    # Name each loss as '(raw)' and name the moving average
    # version of the loss as the original loss name.
    # tf.summary.scalar(loss_name + '(raw)', total_loss)

    # Without this loss_averages_op would never run
    # with tf.control_dependencies([loss_averages_op]):
    #     total_loss = tf.identity(total_loss)

    return total_loss
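In a multi-GPU setup the per-tower gradients are usually combined before a single apply_gradients call. A common pattern, sketched below under the assumption that each tower's compute_gradients() output was collected into tower_grads (it is not code from this repository), is a gradient-averaging helper:

def average_gradients(tower_grads):
    """Average gradients across towers.

    tower_grads is a list with one entry per GPU, each a list of
    (gradient, variable) pairs as returned by compute_gradients().
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad_gpu0, var), ..., (grad_gpuN, var)).
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # The variable is shared across towers; take it from the first one.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads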