def test_time_major(self):
    """Testing time_major param.


    testing if transposing and setting time_major=False will result in the same
    loss
    """
    # [max_time x batch_size x depth tensor]
    inputs = np.random.randn(2, 2, 3).astype(np.float32)
    labels = SimpleSparseTensorFrom([[0, 1], [1, 0]])
    seq_lens = np.array([2, 2], dtype=np.int32)

    inputs_t = constant_op.constant(inputs)

    # Transposing tensor to [batch_size x max_time x depth tensor]
    inputs_t_transposed = constant_op.constant(inputs.transpose(1, 0, 2))

    with self.test_session(use_gpu=False) as sess:
      loss = ctc_ops.ctc_loss(
          inputs=inputs_t, labels=labels, sequence_length=seq_lens)
      loss_transposed = ctc_ops.ctc_loss(
          inputs=inputs_t_transposed,
          labels=labels,
          sequence_length=seq_lens,
          time_major=False)

      (tf_loss, tf_loss_transposed) = sess.run([loss, loss_transposed])
      self.assertAllEqual(tf_loss, tf_loss_transposed)
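# The SimpleSparseTensorFrom helper used by the test above is not shown here.
# Below is a minimal sketch of such a helper (an assumption based on how it is
# called, not the exact TensorFlow test utility): it turns a list of label-id
# lists into the SparseTensor that ctc_ops.ctc_loss expects as `labels`.
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import sparse_tensor

def simple_sparse_tensor_from(sequences):
  """Builds a SparseTensor from a list of label-id lists, one per batch item."""
  indices = []
  values = []
  for batch_idx, seq in enumerate(sequences):
    for time_idx, label in enumerate(seq):
      indices.append([batch_idx, time_idx])
      values.append(label)
  dense_shape = [len(sequences), max(len(seq) for seq in sequences)]
  return sparse_tensor.SparseTensor(
      indices=constant_op.constant(indices, dtypes.int64),
      values=constant_op.constant(values, dtypes.int32),
      dense_shape=constant_op.constant(dense_shape, dtypes.int64))

# Usage mirroring the test: labels = simple_sparse_tensor_from([[0, 1], [1, 0]])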
def define_logit_and_ctc(output_combined, targetY, seqLengths, nHiddenOutput,
                         nClass):
    W = tf.Variable(
        tf.truncated_normal([nHiddenOutput, nClass],
                            stddev=np.sqrt(2.0 / nHiddenOutput)))
    # Zero initialization
    # Tip: tf.zeros_initializer
    b = tf.Variable(tf.zeros([nClass]))
    batch_size = tf.shape(output_combined)[0]
    max_time = tf.shape(output_combined)[1]
    output_combined_reshape = tf.reshape(output_combined, [-1, nHiddenOutput])
    # Doing the affine projection
    logits = tf.matmul(output_combined_reshape, W) + b
    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_size, max_time, nClass])
    # Time major, this is convenient for edit distance.
    logits = tf.transpose(logits, (1, 0, 2))
    loss_individual = ctc.ctc_loss(logits, targetY, seqLengths)
    loss_overall = tf.reduce_mean(loss_individual)

    # just use this beam search.
    predictions = tf.to_int32(
        ctc.ctc_beam_search_decoder(logits, seqLengths)[0][0])
    errorRate_raw = tf.reduce_sum(
        tf.edit_distance(predictions, targetY, normalize=False))
    z_count_this = tf.size(targetY.values)
    errorRate_this_batch = errorRate_raw / tf.to_float(z_count_this)

    return (loss_overall,
            loss_individual), (errorRate_raw, z_count_this,
                               errorRate_this_batch), (logits, predictions)
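# A minimal usage sketch for define_logit_and_ctc above, assuming TF 1.x graph
# mode. The placeholder names and sizes are illustrative assumptions, not from
# the original project; output_combined is batch-major:
# [batch_size, max_time, nHiddenOutput].
import tensorflow as tf

nHiddenOutput, nClass = 128, 30
output_combined = tf.placeholder(tf.float32, [None, None, nHiddenOutput])
targetY = tf.sparse_placeholder(tf.int32)
seqLengths = tf.placeholder(tf.int32, [None])

(loss_overall, loss_individual), (err_raw, z_count, err_batch), (logits, predictions) = \
    define_logit_and_ctc(output_combined, targetY, seqLengths, nHiddenOutput, nClass)
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss_overall)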
    def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
        random_seed.set_random_seed(5)

        batch_size = 8
        num_labels = 6
        label_length = 5
        num_frames = 12
        logits = random_ops.random_uniform(
            [num_frames, batch_size, num_labels])
        labels = random_ops.random_uniform([batch_size, label_length],
                                           minval=0,
                                           maxval=num_labels - 1,
                                           dtype=dtypes.int64)

        label_lengths = random_ops.random_uniform([batch_size],
                                                  minval=2,
                                                  maxval=label_length,
                                                  dtype=dtypes.int64)
        label_mask = array_ops.sequence_mask(label_lengths,
                                             maxlen=label_length,
                                             dtype=label_lengths.dtype)
        labels *= label_mask

        logit_lengths = [num_frames] * batch_size

        tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
        tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
            tf_ctc_loss_labels, label_lengths)

        tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                          inputs=logits,
                                          sequence_length=logit_lengths,
                                          time_major=True)
        tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

        # Shift the blank logits/labels to be somewhere in the middle.
        blank_index = 2
        shifted_logits = array_ops.concat([
            logits[:, :, :blank_index],
            logits[:, :, -1:],
            logits[:, :, blank_index:-1],
        ],
                                          axis=2)
        shifted_labels = array_ops.where(labels < blank_index, labels,
                                         labels + 1)

        ctc_loss = ctc_ops.ctc_loss_dense(labels=shifted_labels,
                                          logits=shifted_logits,
                                          label_length=label_lengths,
                                          logit_length=logit_lengths,
                                          blank_index=blank_index)
        ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

        with self.cached_session() as sess:
            for _ in range(32):
                self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
                self.assertAllClose(*self.evaluate(
                    [ctc_loss_grads, tf_nn_ctc_grads]),
                                    rtol=2e-06,
                                    atol=2e-06)
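# A small NumPy illustration of the blank-index shift in the test above
# (num_labels=6 and blank_index=2, matching the test; the values are only
# illustrative). tf.nn.ctc_loss keeps the blank in the last channel, so moving
# it to index 2 means the last channel slots in at position 2 and every label
# id >= 2 shifts up by one.
import numpy as np

num_labels, blank_index = 6, 2
channels = np.arange(num_labels)                              # [0 1 2 3 4 5], blank = 5
shifted_channels = np.concatenate([channels[:blank_index],    # [0 1]
                                   channels[-1:],             # [5]  (the blank)
                                   channels[blank_index:-1]]) # [2 3 4]
print(shifted_channels)                                       # [0 1 5 2 3 4]

labels = np.array([0, 1, 2, 3, 4])
print(np.where(labels < blank_index, labels, labels + 1))     # [0 1 3 4 5]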
Example #5
    def loss(self):
        """ Define loss
        return
        """
        # ctc loss
        with tf.name_scope('loss'):
            self.avg_loss = tf.reduce_mean(
                ctc_ops.ctc_loss(self.text, self.logits, self.seq_length))
            tf.summary.scalar('loss', self.avg_loss)
        # [optimizer]
        with tf.name_scope('train'):
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.hyparam.learning_rate).minimize(
                    self.avg_loss)

        with tf.name_scope("decode"):
            self.decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
                self.logits, self.seq_length, merge_repeated=False)

        with tf.name_scope("ctc_beam_search_decode"):
            self.prob = tf.nn.softmax(self.logits, dim=0)
            # Transpose to the decoder's layout: [batch_size, time_steps, n_characters].
            self.prob = tf.transpose(self.prob, [1, 0, 2])
            self.decoder = LM_decoder(self.hyparam.alpha, self.hyparam.beta,
                                      self.hyparam.lang_model_path, self.words)

        with tf.name_scope("accuracy"):
            self.distance = tf.edit_distance(
                tf.cast(self.decoded[0], tf.int32), self.text)
            # compute label error rate (accuracy)
            self.label_err = tf.reduce_mean(self.distance,
                                            name='label_error_rate')
            tf.summary.scalar('accuracy', self.label_err)
    def _testCTCLoss(self,
                     inputs,
                     seq_lens,
                     labels,
                     loss_truth,
                     grad_truth,
                     expected_err_re=None):
        self.assertEquals(len(inputs), len(grad_truth))

        inputs_t = constant_op.constant(inputs)

        with self.test_session(use_gpu=False) as sess:
            loss = ctc_ops.ctc_loss(inputs=inputs_t,
                                    labels=labels,
                                    sequence_length=seq_lens)
            grad = gradients_impl.gradients(loss, [inputs_t])[0]

            self.assertShapeEqual(loss_truth, loss)
            self.assertShapeEqual(grad_truth, grad)

            if expected_err_re is None:
                (tf_loss, tf_grad) = sess.run([loss, grad])
                self.assertAllClose(tf_loss, loss_truth, atol=1e-6)
                self.assertAllClose(tf_grad, grad_truth, atol=1e-6)
            else:
                with self.assertRaisesOpError(expected_err_re):
                    sess.run([loss, grad])
Example #7
    def ctc_batch_cost(self, y_true, y_pred, input_length, label_length):
        """Runs CTC loss algorithm on each batch element.

        # Arguments
            y_true: tensor `(samples, max_string_length)`
                containing the truth labels.
            y_pred: tensor `(samples, time_steps, num_categories)`
                containing the prediction, or output of the softmax.
            input_length: tensor `(samples, 1)` containing the sequence length for
                each batch item in `y_pred`.
            label_length: tensor `(samples, 1)` containing the sequence length for
                each batch item in `y_true`.

        # Returns
            Tensor with shape (samples,1) containing the
                CTC loss of each element.
        """
        label_length = tf.to_int32(tf.squeeze(label_length, axis=-1))
        input_length = tf.to_int32(tf.squeeze(input_length, axis=-1))
        sparse_labels = tf.to_int32(
            K.ctc_label_dense_to_sparse(y_true, label_length))

        y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-7)

        # Note: True here is used to ignore sequences that would fail (outputs longer
        # than inputs); otherwise the loss becomes NaN until the next batch.
        return tf.expand_dims(
            ctc.ctc_loss(inputs=y_pred,
                         labels=sparse_labels,
                         sequence_length=input_length,
                         ignore_longer_outputs_than_inputs=True), 1)
def ctc_batch_cost(y_true, y_pred, input_length, label_length):
    """
    FROM KERAS - MODIFIED FOR BATCH SIZE OF ONE.
    Runs CTC loss algorithm on each batch element.
    # Arguments
        y_true: tensor `(samples, max_string_length)`
            containing the truth labels.
        y_pred: tensor `(samples, time_steps, num_categories)`
            containing the prediction, or output of the softmax.
        input_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_pred`.
        label_length: tensor `(samples, 1)` containing the sequence length for
            each batch item in `y_true`.
    # Returns
        Tensor with shape (samples,1) containing the
            CTC loss of each element.
    """
    label_length = tf.to_int32(tf.squeeze(label_length, axis=1))
    input_length = tf.to_int32(tf.squeeze(input_length, axis=1))
    sparse_labels = tf.to_int32(
        K.ctc_label_dense_to_sparse(y_true, label_length))

    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + K.epsilon())

    return tf.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length), 1)
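# A minimal usage sketch for ctc_batch_cost above, assuming the TF 1.x
# graph-mode APIs it relies on. The placeholder shapes are illustrative
# assumptions: y_pred is the softmax output over num_categories classes, and
# the returned loss has shape (samples, 1), usually averaged for training.
import tensorflow as tf

samples, time_steps, num_categories, max_string_length = 4, 50, 29, 12
y_pred = tf.placeholder(tf.float32, [samples, time_steps, num_categories])
y_true = tf.placeholder(tf.int32, [samples, max_string_length])
input_length = tf.placeholder(tf.int32, [samples, 1])
label_length = tf.placeholder(tf.int32, [samples, 1])

per_sample_loss = ctc_batch_cost(y_true, y_pred, input_length, label_length)
avg_loss = tf.reduce_mean(per_sample_loss)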
Example #9
    def build_graph(self, args, maxTimeSteps):
        self.graph = tf.Graph()
        with self.graph.as_default():
            # [maxTimeSteps, batch_size, num_feature], e.g. [maxL, 32, 39]
            self.inputX = tf.placeholder(
                tf.float32,
                shape=(maxTimeSteps, args.batch_size, args.num_feature))
            inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])
            # Split [maxTimeSteps * batch_size, num_feature] into a list of
            # maxTimeSteps tensors of shape [batch_size, num_feature].
            self.inputList = tf.split(inputXrs, maxTimeSteps, 0)

            self.targetIxs = tf.placeholder(tf.int64)
            self.targetVals = tf.placeholder(tf.int32)
            self.targetShape = tf.placeholder(tf.int64)
            self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                           self.targetShape)
            self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
            depth = 10
            width = 8
            self.config = {
                'name': 'residual network',
                'num_layer': depth,
                'num_featuremap': width,
                'num_class': args.num_class,
                'optimizer': args.optimizer,
                'learning rate': args.learning_rate
            }

            inpt = tf.reshape(
                self.inputX,
                [args.batch_size, maxTimeSteps, args.num_feature, 1])
            conv_output = build_resnet(inpt, maxTimeSteps, depth, width,
                                       args.num_class)
            self.loss = tf.reduce_mean(
                ctc.ctc_loss(self.targetY, conv_output, self.seqLengths))
            self.optimizer = args.optimizer(args.learning_rate).minimize(
                self.loss)
            self.logitsMaxTest = tf.slice(tf.argmax(conv_output, 2), [0, 0],
                                          [self.seqLengths[0], 1])
            self.predictions = tf.to_int32(
                ctc.ctc_beam_search_decoder(conv_output, self.seqLengths)[0][0])
            self.errorRate = tf.reduce_sum(
                tf.edit_distance(self.predictions, self.targetY,
                                 normalize=False)) / tf.to_float(
                                     tf.size(self.targetY.values))
            self.initial_op = tf.global_variables_initializer()
            self.saver = tf.train.Saver(tf.global_variables(),
                                        max_to_keep=2,
                                        keep_checkpoint_every_n_hours=1)
            self.logfile = args.log_dir + str(
                datetime.datetime.strftime(datetime.datetime.now(),
                                           '%Y-%m-%d %H:%M:%S') +
                '.txt').replace(' ', '').replace('/', '')
            self.var_op = tf.global_variables()
            self.var_trainable_op = tf.trainable_variables()
Example #10
    def loss(self):
        """
        定义loss
        :return:
        """
        # 调用ctc loss
        with tf.name_scope('loss'):  #损失
            self.avg_loss = tf.reduce_mean(
                ctc_ops.ctc_loss(self.text, self.logits, self.seq_length))
            tf.summary.scalar('loss', self.avg_loss)
        # [optimizer]
        with tf.name_scope('train'):  # training
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate).minimize(self.avg_loss)

        with tf.name_scope("decode"):
            self.decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
                self.logits, self.seq_length, merge_repeated=False)

        with tf.name_scope("accuracy"):
            self.distance = tf.edit_distance(
                tf.cast(self.decoded[0], tf.int32), self.text)
            # Compute the label error rate (accuracy)
            self.label_err = tf.reduce_mean(self.distance,
                                            name='label_error_rate')
            tf.summary.scalar('accuracy', self.label_err)
def my_ctc_batch_cost(y_true, y_pred, input_length, label_length):
    """Runs CTC loss algorithm on each batch element.
  Arguments:
      y_true: tensor `(samples, max_string_length)`
          containing the truth labels.
      y_pred: tensor `(samples, time_steps, num_categories)`
          containing the prediction, or output of the softmax.
      input_length: tensor `(samples, 1)` containing the sequence length for
          each batch item in `y_pred`.
      label_length: tensor `(samples, 1)` containing the sequence length for
          each batch item in `y_true`.
  Returns:
      Tensor with shape (samples,1) containing the
          CTC loss of each element.
  """
    label_length = math_ops.to_int32(array_ops.squeeze(label_length, axis=-1))
    input_length = math_ops.to_int32(array_ops.squeeze(input_length, axis=-1))
    sparse_labels = math_ops.to_int32(
        my_ctc_label_dense_to_sparse(y_true, label_length))

    y_pred = math_ops.log(
        array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon())

    return array_ops.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length,
                     ctc_merge_repeated=True), 1)
  def _testCTCLoss(self,
                   inputs,
                   seq_lens,
                   labels,
                   loss_truth,
                   grad_truth,
                   expected_err_re=None):
    self.assertEquals(len(inputs), len(grad_truth))

    inputs_t = constant_op.constant(inputs)

    with self.test_session(use_gpu=False) as sess:
      loss = ctc_ops.ctc_loss(
          inputs=inputs_t, labels=labels, sequence_length=seq_lens)
      grad = gradients_impl.gradients(loss, [inputs_t])[0]

      self.assertShapeEqual(loss_truth, loss)
      self.assertShapeEqual(grad_truth, grad)

      if expected_err_re is None:
        (tf_loss, tf_grad) = sess.run([loss, grad])
        self.assertAllClose(tf_loss, loss_truth, atol=1e-6)
        self.assertAllClose(tf_grad, grad_truth, atol=1e-6)
      else:
        with self.assertRaisesOpError(expected_err_re):
          sess.run([loss, grad])
Example #13
    def testCtcLossDenseIsSameAsCtcLoss(self):
        with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
            random_seed.set_random_seed(5)

            batch_size = 8
            num_labels = 6
            label_length = 5
            minimum_logits_length = 10
            num_frames = minimum_logits_length + batch_size
            logits = random_ops.random_uniform(
                [num_frames, batch_size, num_labels])
            labels = random_ops.random_uniform([batch_size, label_length],
                                               minval=1,
                                               maxval=num_labels,
                                               dtype=dtypes.int64)

            label_lengths = random_ops.random_uniform([batch_size],
                                                      minval=2,
                                                      maxval=label_length,
                                                      dtype=dtypes.int64)
            label_mask = array_ops.sequence_mask(label_lengths,
                                                 maxlen=label_length,
                                                 dtype=label_lengths.dtype)
            labels *= label_mask

            logit_lengths = math_ops.range(batch_size) + minimum_logits_length

            ctc_loss = ctc_ops.ctc_loss_dense(labels=labels,
                                              logits=logits,
                                              label_length=label_lengths,
                                              logit_length=logit_lengths)
            ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

            # Shift labels down by one (move blank from 0 to num_labels -1)
            tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
            tf_nn_ctc_logits = array_ops.concat([
                logits[:, :, 1:],
                logits[:, :, 0:1],
            ],
                                                axis=2)

            tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
                tf_ctc_loss_labels, label_lengths)

            tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                              inputs=tf_nn_ctc_logits,
                                              sequence_length=logit_lengths,
                                              time_major=True)
            tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss,
                                                       [logits])[0]

            with self.cached_session() as sess:
                for _ in range(32):
                    self.assertAllClose(
                        *self.evaluate([ctc_loss, tf_nn_ctc_loss]))
                    self.assertAllClose(*self.evaluate(
                        [ctc_loss_grads, tf_nn_ctc_grads]),
                                        rtol=4e-06,
                                        atol=4e-06)
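# A small NumPy illustration of the "shift labels down by one" step in the test
# above (num_labels=6, matching the test; the values are only illustrative).
# ctc_ops.ctc_loss_dense defaults to blank_index=0, while the classic
# tf.nn.ctc_loss expects the blank in the last channel, so channel 0 rotates to
# the end and every label id drops by one. Padded zeros become -1, but they lie
# beyond label_length and are dropped by dense_labels_to_sparse.
import numpy as np

num_labels = 6
channels = np.arange(num_labels)                         # [0 1 2 3 4 5], blank = 0
print(np.concatenate([channels[1:], channels[0:1]]))     # [1 2 3 4 5 0], blank last

labels = np.array([1, 3, 5, 0, 0])                       # trailing zeros are padding
print(labels - 1)                                        # [0 2 4 -1 -1]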
Example #14
  def ctc_loss(self, outputs, targets, seq_len, num_classes,
               initial_learning_rate, keep_prob=0.8, scopeN="l1-ctc_loss"):
    """Implements the CTC loss.

    @param outputs: [batch, h, w, channels]
    @param targets: sparse tensor
    @param seq_len: the lengths of the input sequences [batch]
    @param num_classes: the number of classes
    @param initial_learning_rate: learning rate
    @param keep_prob: keep probability for dropout; no dropout if None
    @param scopeN: the scope name

    @returns: list with [optimizer, cost, inaccuracy (label error rate), decoded output of the batch]
    """
    with tf.name_scope('Train'):
        with tf.variable_scope("ctc_loss-"+scopeN) as scope:
            W = tf.Variable(tf.truncated_normal([self.hidden*2,
                                                 num_classes],
                                                stddev=0.1))
            # Zero initialization
            b = tf.Variable(tf.constant(0., shape=[num_classes]))

        tf.summary.histogram('histogram-b-ctc', b)
        tf.summary.histogram('histogram-w-ctc', W)

        # Doing the affine projection
        logits = tf.matmul(outputs, W) +  b 

        if keep_prob is not None:
            logits = tf.nn.dropout(logits, keep_prob)

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [self.width, self.batch_size, num_classes])    
        #logits =  tf.transpose(logits, [1,0,2])

        with tf.name_scope('CTC-loss'):
            loss = ctc_ops.ctc_loss(logits, targets, seq_len)
            cost = tf.reduce_mean(loss)
            
        with tf.name_scope('Optimizer'):
            if self.optimizer == "ADAM":
                optimizer = tf.train.AdamOptimizer(learning_rate=initial_learning_rate,name="AdamOptimizer").minimize(cost)
            elif self.optimizer == "RMSP":
                optimizer = tf.train.RMSPropOptimizer(learning_rate=initial_learning_rate, decay=self.decay, momentum=self.momentum).minimize(cost)
            else:
                raise Exception("model type not supported: {}".format(self.optimizer))
        
        with tf.name_scope('Prediction'):
            if self.ctc_decoder == 'greedy':
                decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)
            elif self.ctc_decoder == 'beam_search':
                decoded, log_prob = ctc_ops.ctc_beam_search_decoder(logits, seq_len)
            else:
                raise Exception("model type not supported: {}".format(self.ctc_decoder))

            # Inaccuracy: label error rate
            ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                              targets))
    return optimizer, cost, ler, decoded
Example #15
    def testCtcLossDenseUniqueFastPathIsSameAsCtcLoss(self):
        random_seed.set_random_seed(5)

        batch_size = 8
        num_labels = 6
        label_length = 5
        num_frames = 12
        logits = random_ops.random_uniform(
            [num_frames, batch_size, num_labels])
        labels = random_ops.random_uniform([batch_size, label_length],
                                           minval=1,
                                           maxval=num_labels,
                                           dtype=dtypes.int64)

        label_lengths = random_ops.random_uniform([batch_size],
                                                  minval=2,
                                                  maxval=label_length,
                                                  dtype=dtypes.int64)
        label_mask = array_ops.sequence_mask(label_lengths,
                                             maxlen=label_length,
                                             dtype=label_lengths.dtype)
        labels *= label_mask

        logit_lengths = [num_frames] * batch_size

        ctc_loss = ctc_ops.ctc_loss_dense(
            labels=labels,
            logits=logits,
            label_length=label_lengths,
            logit_length=logit_lengths,
            unique=ctc_ops.ctc_unique_labels(labels))
        ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

        # Shift labels down by one (move blank from 0 to num_labels -1)
        tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
        tf_nn_ctc_logits = array_ops.concat([
            logits[:, :, 1:],
            logits[:, :, 0:1],
        ],
                                            axis=2)

        tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
            tf_ctc_loss_labels, label_lengths)

        tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                          inputs=tf_nn_ctc_logits,
                                          sequence_length=logit_lengths,
                                          time_major=True)
        tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

        with self.cached_session() as sess:
            for _ in range(32):
                self.assertAllClose(*sess.run([ctc_loss, tf_nn_ctc_loss]))
                self.assertAllClose(*sess.run(
                    [ctc_loss_grads, tf_nn_ctc_grads]),
                                    rtol=2e-06,
                                    atol=2e-06)
Example #16
def test_one_wav(wav_path, label_text):
    tf.reset_default_graph()

    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # ctc_loss requires a SparseTensor, generated via sparse_placeholder.
    targets = tf.sparse_placeholder(tf.int32, name='targets')  # text labels
    keep_dropout = tf.placeholder(tf.float32)
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')  # sequence lengths
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    logits = inference(input_tensor, words_size + 1, False, keep_dropout,
                       regularizer, tf.to_int64(seq_length))
    avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(
        targets, logits, seq_length)) + tf.add_n(tf.get_collection('losses'))
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)
    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            logits, seq_length, merge_repeated=False)
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy)
        ler = tf.reduce_mean(distance, name='label_error_rate')
    choose_cpkt = "BiRNN.cpkt-117"
    saver = tf.train.Saver(max_to_keep=5)
    re1 = ""
    re2 = ""
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, savedir + choose_cpkt)

        _, source, source_lengths, sparse_labels = next_batch(
            labels=label_text, wav_files=wav_path)
        feed = {
            input_tensor: source,
            targets: sparse_labels,
            seq_length: source_lengths,
            keep_dropout: 1.0
        }
        d, test_ler, batch_cost, _ = sess.run(
            [decoded[0], ler, avg_loss, optimizer], feed_dict=feed)
        dense_decoded = tf.sparse_tensor_to_dense(
            d, default_value=-1).eval(session=sess)
        dense_labels = base.sparse_tuple_to_texts_ch(sparse_labels, words)
        print('Label err rate: ', test_ler)
        for orig, decoded_arr in zip(dense_labels, dense_decoded):
            # convert to strings
            decoded_str = base.ndarray_to_text_ch(decoded_arr, words)
            decoded_str = decoded_str.strip().strip('龚')
            re1 = orig
            re2 = decoded_str
            print('Original: {}'.format(orig))
            print('Decoded:  {}'.format(decoded_str))
    return re1, re2, test_ler
  def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
    random_seed.set_random_seed(5)

    batch_size = 8
    num_labels = 6
    label_length = 5
    num_frames = 12
    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
    labels = random_ops.random_uniform(
        [batch_size, label_length], minval=0, maxval=num_labels-1,
        dtype=dtypes.int64)

    label_lengths = random_ops.random_uniform(
        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
    label_mask = array_ops.sequence_mask(
        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
    labels *= label_mask

    logit_lengths = [num_frames] * batch_size

    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
        tf_ctc_loss_labels, label_lengths)

    tf_nn_ctc_loss = ctc_ops.ctc_loss(
        labels=tf_ctc_loss_labels,
        inputs=logits,
        sequence_length=logit_lengths,
        time_major=True)
    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

    # Shift the blank logits/labels to be somewhere in the middle.
    blank_index = 2
    shifted_logits = array_ops.concat([
        logits[:, :, :blank_index],
        logits[:, :, -1:],
        logits[:, :, blank_index:-1],
    ], axis=2)
    shifted_labels = array_ops.where(labels < blank_index, labels, labels + 1)

    ctc_loss = ctc_ops.ctc_loss_dense(
        labels=shifted_labels,
        logits=shifted_logits,
        label_length=label_lengths,
        logit_length=logit_lengths,
        blank_index=blank_index)
    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

    with self.cached_session() as sess:
      for _ in range(32):
        self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
        self.assertAllClose(
            *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
            rtol=2e-06,
            atol=2e-06)
Example #18
    def setup_loss_function(self):
        with tf.name_scope("loss"):
            self.total_loss = ctc_ops.ctc_loss(
                self.targets, self.logits, self.seq_length,ignore_longer_outputs_than_inputs=True)
            self.avg_loss = tf.reduce_mean(self.total_loss)
            self.loss_summary = tf.summary.scalar("avg_loss", self.avg_loss)

            self.cost_placeholder = tf.placeholder(dtype=tf.float32, shape=[])

            self.train_cost_op = tf.summary.scalar(
                "train_avg_loss", self.cost_placeholder)
  def testEmptyBatch(self):
    inputs = constant_op.constant([], dtype=dtypes.float32, shape=(1, 0, 2))
    sequence_lengths = constant_op.constant([], dtype=dtypes.int32)
    labels = sparse_tensor.SparseTensor(
        indices=constant_op.constant([], shape=(0, 2), dtype=dtypes.int64),
        values=constant_op.constant([], shape=(0,), dtype=dtypes.int32),
        dense_shape=[5, 5])

    with self.test_session(use_gpu=False) as sess:
      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                   "batch_size must not be 0"):
        sess.run(ctc_ops.ctc_loss(labels, inputs, sequence_lengths))
  def testEmptyBatch(self):
    inputs = constant_op.constant([], dtype=dtypes.float32, shape=(1, 0, 2))
    sequence_lengths = constant_op.constant([], dtype=dtypes.int32)
    labels = sparse_tensor.SparseTensor(
        indices=constant_op.constant([], shape=(0, 2), dtype=dtypes.int64),
        values=constant_op.constant([], shape=(0,), dtype=dtypes.int32),
        dense_shape=[5, 5])

    with self.session(use_gpu=False) as sess:
      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                   "batch_size must not be 0"):
        sess.run(ctc_ops.ctc_loss(labels, inputs, sequence_lengths))
  def testCtcLossDenseUniqueFastPathIsSameAsCtcLoss(self):
    random_seed.set_random_seed(5)

    batch_size = 8
    num_labels = 6
    label_length = 5
    num_frames = 12
    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
    labels = random_ops.random_uniform(
        [batch_size, label_length], minval=1, maxval=num_labels,
        dtype=dtypes.int64)

    label_lengths = random_ops.random_uniform(
        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
    label_mask = array_ops.sequence_mask(
        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
    labels *= label_mask

    logit_lengths = [num_frames] * batch_size

    ctc_loss = ctc_ops.ctc_loss_dense(
        labels=labels,
        logits=logits,
        label_length=label_lengths,
        logit_length=logit_lengths,
        unique=ctc_ops.ctc_unique_labels(labels))
    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

    # Shift labels down by one (move blank from 0 to num_labels -1)
    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
    tf_nn_ctc_logits = array_ops.concat([
        logits[:, :, 1:],
        logits[:, :, 0:1],
    ], axis=2)

    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
        tf_ctc_loss_labels, label_lengths)

    tf_nn_ctc_loss = ctc_ops.ctc_loss(
        labels=tf_ctc_loss_labels,
        inputs=tf_nn_ctc_logits,
        sequence_length=logit_lengths,
        time_major=True)
    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

    with self.cached_session() as sess:
      for _ in range(32):
        self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
        self.assertAllClose(
            *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
            rtol=2e-06,
            atol=2e-06)
    def testCtcLossDenseWithNegativeBlankIndexIsSameAsCtcLoss(self):
        with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
            random_seed.set_random_seed(5)

            batch_size = 8
            num_labels = 6
            label_length = 5
            num_frames = 12
            logits = random_ops.random_uniform(
                [num_frames, batch_size, num_labels])
            labels = random_ops.random_uniform([batch_size, label_length],
                                               minval=0,
                                               maxval=num_labels - 1,
                                               dtype=dtypes.int64)

            label_lengths = random_ops.random_uniform([batch_size],
                                                      minval=2,
                                                      maxval=label_length,
                                                      dtype=dtypes.int64)
            label_mask = array_ops.sequence_mask(label_lengths,
                                                 maxlen=label_length,
                                                 dtype=label_lengths.dtype)
            labels *= label_mask

            logit_lengths = [num_frames] * batch_size

            ctc_loss = ctc_ops.ctc_loss_dense(labels=labels,
                                              logits=logits,
                                              label_length=label_lengths,
                                              logit_length=logit_lengths,
                                              blank_index=-1)
            ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

            tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
            tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
                tf_ctc_loss_labels, label_lengths)

            tf_nn_ctc_loss = ctc_ops.ctc_loss(labels=tf_ctc_loss_labels,
                                              inputs=logits,
                                              sequence_length=logit_lengths,
                                              time_major=True)
            tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss,
                                                       [logits])[0]

            with self.cached_session() as sess:
                for _ in range(32):
                    self.assertAllClose(
                        *self.evaluate([ctc_loss, tf_nn_ctc_loss]))
                    self.assertAllClose(*self.evaluate(
                        [ctc_loss_grads, tf_nn_ctc_grads]),
                                        rtol=2e-06,
                                        atol=2e-06)
Example #23
    def ctc_lambda_func(self, args):
        y_pred, y_true, input_length, label_length = args
        label_length = math_ops.to_int32(array_ops.squeeze(label_length))
        input_length = math_ops.to_int32(array_ops.squeeze(input_length))
        sparse_labels = math_ops.to_int32(
            ctc_label_dense_to_sparse(y_true, label_length))
        y_pred = math_ops.log(
            array_ops.transpose(y_pred, perm=[1, 0, 2]) + 1e-7)
        return array_ops.expand_dims(
            ctc.ctc_loss(inputs=y_pred,
                         labels=sparse_labels,
                         sequence_length=input_length,
                         ignore_longer_outputs_than_inputs=True), 1)
Example #24
def _ctc_loss_with_beam_search(logits,
                               sparse_labels,
                               seq_length,
                               top_path=1,
                               merge_repeated=False):

    ctc_loss = math_ops.reduce_mean(
        ctc_ops.ctc_loss(sparse_labels, logits, seq_length))
    pre_label_tensors, log_prob = tf.nn.ctc_beam_search_decoder(
        logits, seq_length, merge_repeated=merge_repeated, top_paths=top_path)
    top1_label_tensor = math_ops.cast(pre_label_tensors[0], dtypes.int32)
    top1_ed = math_ops.reduce_mean(
        array_ops.edit_distance(top1_label_tensor, sparse_labels))
    return ctc_loss, top1_ed, pre_label_tensors, log_prob
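# A minimal usage sketch for _ctc_loss_with_beam_search above, assuming TF 1.x
# graph mode. The placeholder names and the num_classes value are illustrative
# assumptions; logits must be time-major: [max_time, batch_size, num_classes].
import tensorflow as tf

num_classes = 30
logits = tf.placeholder(tf.float32, [None, None, num_classes])
sparse_labels = tf.sparse_placeholder(tf.int32)
seq_length = tf.placeholder(tf.int32, [None])

loss, top1_edit_distance, decoded_paths, log_prob = _ctc_loss_with_beam_search(
    logits, sparse_labels, seq_length, top_path=1)
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)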
Example #25
def ctc_batch(y_true, y_pred, input_length, label_length):

    label_length = tf.to_int32(tf.squeeze(label_length))
    input_length = tf.to_int32(tf.squeeze(input_length))
    sparse_labels = tf.to_int32(
        K.ctc_label_dense_to_sparse(y_true, label_length))

    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)

    return tf.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length,
                     ignore_longer_outputs_than_inputs=True), 1)
    def testInvalidSecondGradient(self):
        inputs = np.random.randn(2, 2, 3).astype(np.float32)
        inputs_t = constant_op.constant(inputs)
        labels = SimpleSparseTensorFrom([[0, 1], [1, 0]])
        seq_lens = np.array([2, 2], dtype=np.int32)
        v = [1.0]

        with self.test_session(use_gpu=False):
            loss = ctc_ops.ctc_loss(inputs=inputs_t,
                                    labels=labels,
                                    sequence_length=seq_lens)
            # Taking the second gradient should fail, since it is not
            # yet supported.
            with self.assertRaisesRegexp(LookupError, "explicitly disabled"):
                _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
  def testInvalidSecondGradient(self):
    inputs = np.random.randn(2, 2, 3).astype(np.float32)
    inputs_t = constant_op.constant(inputs)
    labels = SimpleSparseTensorFrom([[0, 1], [1, 0]])
    seq_lens = np.array([2, 2], dtype=np.int32)
    v = [1.0]

    with self.test_session(use_gpu=False):
      loss = ctc_ops.ctc_loss(
          inputs=inputs_t, labels=labels, sequence_length=seq_lens)
      # Taking the second gradient should fail, since it is not
      # yet supported.
      with self.assertRaisesRegexp(LookupError,
                                   "explicitly disabled"):
        _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
Example #28
    def backward(self):
        self.loss = ctc.ctc_loss(labels=self.y_,
                                 inputs=self.logits,
                                 sequence_length=self.seq_len)
        self.cost = tf.reduce_mean(self.loss)

        self.opt = tf.train.AdamOptimizer(0.05).minimize(self.loss)
        self.decoded, self.log_prob = \
            tf.nn.ctc_beam_search_decoder(self.logits,
                                          self.seq_len,
                                          merge_repeated=False)
        # Convert the sparse result to a dense tensor; shorter sequences are padded with -1.
        self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0],
                                                       default_value=-1)
        self.acc = tf.reduce_mean(
            tf.edit_distance(tf.cast(self.decoded[0], tf.int32), self.y_))
  def testCtcLossDenseWithNegativeBlankIndexIsSameAsCtcLoss(self):
    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
      random_seed.set_random_seed(5)

      batch_size = 8
      num_labels = 6
      label_length = 5
      num_frames = 12
      logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
      labels = random_ops.random_uniform(
          [batch_size, label_length], minval=0, maxval=num_labels-1,
          dtype=dtypes.int64)

      label_lengths = random_ops.random_uniform(
          [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
      label_mask = array_ops.sequence_mask(
          label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
      labels *= label_mask

      logit_lengths = [num_frames] * batch_size

      ctc_loss = ctc_ops.ctc_loss_dense(
          labels=labels,
          logits=logits,
          label_length=label_lengths,
          logit_length=logit_lengths,
          blank_index=-1)
      ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]

      tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
      tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
          tf_ctc_loss_labels, label_lengths)

      tf_nn_ctc_loss = ctc_ops.ctc_loss(
          labels=tf_ctc_loss_labels,
          inputs=logits,
          sequence_length=logit_lengths,
          time_major=True)
      tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]

      with self.cached_session() as sess:
        for _ in range(32):
          self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
          self.assertAllClose(
              *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
              rtol=2e-06,
              atol=2e-06)
    def build_graph(self, args, maxTimeSteps):
        self.graph = tf.Graph()
        with self.graph.as_default():
            # [maxTimeSteps, batch_size, num_feature], e.g. [maxL, 32, 39]
            self.inputX = tf.placeholder(
                tf.float32,
                shape=(maxTimeSteps, args.batch_size, args.num_feature))
            self.inputXX = tf.reshape(
                self.inputX,
                shape=(args.batch_size, maxTimeSteps, args.num_feature))
            inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])
            #self.inputList = tf.split(0, maxTimeSteps, inputXrs) #convert inputXrs from [32*maxL,39] to [32,maxL,39]
            #self.inputnew = tf.reshape(self.inputX, [1, 0, 2])
            self.targetIxs = tf.placeholder(tf.int64)
            self.targetVals = tf.placeholder(tf.int32)
            self.targetShape = tf.placeholder(tf.int64)
            self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                           self.targetShape)
            self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))
            self.config = {
                'name': args.model,
                'rnncell': self.cell_fn,
                'num_layer': args.num_layer,
                'num_hidden': args.num_hidden,
                'num_class': args.num_class,
                'activation': args.activation,
                'optimizer': args.optimizer,
                'learning rate': args.learning_rate
            }

            # forward layer
            forwardH1 = self.cell_fn(args.num_hidden, activation=tf.nn.relu)
            # backward layer
            backwardH1 = self.cell_fn(args.num_hidden, activation=tf.nn.relu)
            # bi-directional layer
            fbH1, state = bidirectional_dynamic_rnn(
                forwardH1, backwardH1, self.inputXX,
                sequence_length=self.seqLengths, dtype=tf.float32,
                scope='BDRNN_H1')
            fbH1 = tf.concat(2, fbH1)
            print(fbH1.get_shape)
            shape = fbH1.get_shape().as_list()
            fbH1 = tf.reshape(fbH1, [shape[0] * shape[1], -1])  # seq*batch, feature
            fbH1_list = tf.split(0, shape[1], fbH1)
            logits = [
                build_forward_layer(t, [shape[2], args.num_class],
                                    kernel='linear') for t in fbH1_list
            ]
            logits3d = tf.pack(logits)
            self.loss = tf.reduce_mean(
                ctc.ctc_loss(logits3d, self.targetY, self.seqLengths))
            self.optimizer = tf.train.AdamOptimizer(
                args.learning_rate).minimize(self.loss)
            self.logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                                          [self.seqLengths[0], 1])
            self.predictions = tf.to_int32(
                ctc.ctc_beam_search_decoder(logits3d, self.seqLengths)[0][0])
            self.errorRate = tf.reduce_sum(
                tf.edit_distance(self.predictions, self.targetY,
                                 normalize=False)) / tf.to_float(
                                     tf.size(self.targetY.values))
            self.initial_op = tf.initialize_all_variables()
            self.saver = tf.train.Saver(tf.all_variables(),
                                        max_to_keep=5,
                                        keep_checkpoint_every_n_hours=1)
            self.logfile = args.log_dir + str(
                datetime.datetime.strftime(datetime.datetime.now(),
                                           '%Y-%m-%d %H:%M:%S') +
                '.txt').replace(' ', '').replace('/', '')
            self.var_op = tf.all_variables()
            self.var_trainable_op = tf.trainable_variables()
Example #31
    def loss(self):
        """
        定义loss
        :return:
        """
        # 调用ctc loss
        with tf.name_scope('loss'): #损失
            self.avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(self.text, self.logits, self.seq_length))
            tf.summary.scalar('loss',self.avg_loss)
        # [optimizer]
        with tf.name_scope('train'):  # training
            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.avg_loss)

        with tf.name_scope("decode"):
            self.decoded, log_prob = ctc_ops.ctc_beam_search_decoder(self.logits, self.seq_length, merge_repeated=False)

        with tf.name_scope("accuracy"):
            self.distance = tf.edit_distance(tf.cast(self.decoded[0], tf.int32), self.text)
            # Compute the label error rate (accuracy)
            self.label_err = tf.reduce_mean(self.distance, name='label_error_rate')
            tf.summary.scalar('accuracy', self.label_err)
Example #32
def ctc_batch_cost(y_true, y_pred, input_length):
    """Runs CTC loss algorithm on each batch element.
    # Arguments
        y_true: tensor `(samples, max_string_length)`
                containing the truth labels.
        y_pred: tensor `(samples, time_steps, num_categories)`
                containing the prediction, or output of the softmax.
        input_length: tensor `(samples, 1)` containing the sequence
                length for each batch item in `y_pred`; typically
                `[time_step] * batch_size`, with input_length[i] = time_step (img_w // pool_size).
    # Returns
        Tensor with shape (samples,1) containing the
            CTC loss of each element.
    """
    input_length = tf.to_int32(tf.squeeze(input_length))
    sparse_labels = tf.to_int32(ctc_label_dense_to_sparse(y_true))

    y_pred = tf.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-8)

    return tf.expand_dims(
        ctc.ctc_loss(inputs=y_pred,
                     labels=sparse_labels,
                     sequence_length=input_length), 1)
Example #33
    # see https://www.tensorflow.org/versions/r0.9/api_docs/python/contrib.layers.html#initializers
    W = tf.Variable(tf.truncated_normal([num_hidden, num_classes], stddev=0.1))
    # Zero initialization
    # Tip: Is tf.zeros_initializer the same?
    b = tf.Variable(tf.constant(0., shape=[num_classes]))

    # Doing the affine projection
    logits = tf.matmul(outputs, W) + b

    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Time major
    logits = tf.transpose(logits, (1, 0, 2))

    loss = ctc_ops.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.MomentumOptimizer(initial_learning_rate,
                                           0.9).minimize(cost)

    # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
    # (it's slower but you'll get better results)
    decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)

    # Inaccuracy: label error rate
    ler = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    saver = tf.train.Saver()
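    # A hedged sketch of the beam-search option mentioned above: swap the greedy
    # decoder for ctc_beam_search_decoder on the same inputs (slower, usually
    # more accurate). The beam_width value here is an illustrative assumption.
    decoded_beam, log_prob_beam = ctc_ops.ctc_beam_search_decoder(
        logits, seq_len, beam_width=100, top_paths=1)
    ler_beam = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded_beam[0], tf.int32), targets))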
Example #34
def continue_train():
    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # ctc_loss requires a SparseTensor, generated via sparse_placeholder.
    targets = tf.sparse_placeholder(tf.int32, name='targets')  # text labels
    keep_dropout = tf.placeholder(tf.float32)
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')  # sequence lengths
    regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
    logits = inference(input_tensor, words_size + 1, True, keep_dropout,
                       regularizer, tf.to_int64(seq_length))
    avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(
        targets, logits, seq_length)) + tf.add_n(tf.get_collection('losses'))
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)
    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            logits, seq_length, merge_repeated=False)
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy)
        ler = tf.reduce_mean(distance, name='label_error_rate')
    epochs = 1000
    #ckpt = tf.train.get_checkpoint_state(savedir)
    saver = tf.train.Saver(max_to_keep=5)
    #saver2 = tf.train.Saver(max_to_keep=5)  # create the saver
    with tf.Session() as sess:
        choose_cpkt = "BiRNN.cpkt-204"
        sess.run(tf.global_variables_initializer())
        print_tensors_in_checkpoint_file(savedir + choose_cpkt, None, True)
        saver.restore(sess, savedir + choose_cpkt)
        #graph = tf.get_default_graph()
        #cur_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        startepo = 204
        train_start = time.time()
        for epoch in range(startepo, epochs):  # passes over the dataset
            epoch_start = time.time()
            #if epoch < startepo:
            #    continue

            print("epoch start:", epoch + 1, "total epochs= ", epochs)
            ##run batch##
            n_batches_per_epoch = int(np.ceil(len(labels) / batch_size))
            print("total loop ", n_batches_per_epoch, "in one epoch,",
                  batch_size, "items in one loop")

            train_cost = 0
            train_ler = 0
            next_idx = 0

            for batch in range(n_batches_per_epoch):  # number of batch_size-sized batches per epoch
                # fetch the data
                next_idx, source, source_lengths, sparse_labels = next_batch(
                    labels, next_idx, batch_size)
                feed = {
                    input_tensor: source,
                    targets: sparse_labels,
                    seq_length: source_lengths,
                    keep_dropout: keep_dropout_rate
                }

                # Run avg_loss and the optimizer step
                batch_cost, _ = sess.run([avg_loss, optimizer], feed_dict=feed)
                train_cost += batch_cost

                if (batch + 1) % 50 == 0:
                    print('loop:', batch + 1, 'Train cost: ',
                          train_cost / (batch + 1))
                    feed2 = {
                        input_tensor: source,
                        targets: sparse_labels,
                        seq_length: source_lengths,
                        keep_dropout: 1.0
                    }

                    d, train_ler = sess.run([decoded[0], ler], feed_dict=feed2)
                    dense_decoded = tf.sparse_tensor_to_dense(
                        d, default_value=-1).eval(session=sess)
                    dense_labels = base.sparse_tuple_to_texts_ch(
                        sparse_labels, words)

                    counter = 0
                    print('Label err rate: ', train_ler)
                    duration = time.time() - train_start
                    print('cost time: {:.2f} min'.format(duration / 60))
                    for orig, decoded_arr in zip(dense_labels, dense_decoded):
                        # convert to strings
                        decoded_str = base.ndarray_to_text_ch(
                            decoded_arr, words)
                        decoded_str = decoded_str.strip().strip('龚')
                        print(' file {}'.format(counter))
                        print('Original: {}'.format(orig))
                        print('Decoded:  {}'.format(decoded_str))
                        counter = counter + 1
                        break

            epoch_duration = time.time() - epoch_start

            log = 'Epoch {}/{}, train_cost: {:.3f}, train_ler: {:.3f}, time: {:.2f} sec'
            print(
                log.format(epoch + 1, epochs, train_cost, train_ler,
                           epoch_duration))
            saver.save(sess, savedir + "BiRNN.cpkt", global_step=epoch + 1)
            print("save cpkt-%s complete." % (epoch + 1))
Example #35
def CheckpointTest():
    # input_tensor holds the input audio. As analyzed earlier, its shape is
    # [batch_size, amax_stepsize, n_input + (2 * n_input * n_context)], where batch_size is the
    # batch length, amax_stepsize the number of time steps, and n_input + (2 * n_input * n_context)
    # the number of MFCC features. batch_size varies, so it is set to None; since the number of
    # time steps differs per batch, amax_stepsize is also None.
    input_tensor = tf.placeholder(tf.float32, [None, None, n_input + (2 * n_input * n_context)], name='input')
    # Use sparse_placeholder; will generate a SparseTensor, required by ctc_loss op.
    # targets holds the sparse tensor of the text that corresponds to the audio, hence sparse_placeholder.
    targets = tf.sparse_placeholder(tf.int32, name='targets')
    # seq_length holds the number of time steps for the current batch
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')
    # keep_dropout is the dropout keep probability
    keep_dropout = tf.placeholder(tf.float32)

    # logits is the non-normalized output/activations from the last layer.
    # logits will be input for the loss function.
    # nn_model is from the import statement in the load_model function
    logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input, n_context, words_size + 1, keep_dropout)

    aa = ctc_ops.ctc_loss(targets, logits, seq_length)
    # Use CTC loss to compute the loss
    avg_loss = tf.reduce_mean(aa)

    # Optimizer
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(avg_loss)

    # Use the CTC decoder
    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_length, merge_repeated=True)

    # Compute the edit distance
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy)
        ler = tf.reduce_mean(distance, name='label_error_rate')

    # Number of epochs
    epochs = 150
    # Directory where the model is saved
    savedir = "saver/"
    # Create the directory if it does not exist
    if os.path.exists(savedir) == False:
        os.mkdir(savedir)

    # Create the saver
    saver = tf.train.Saver(max_to_keep=1)
    # Create the session
    with tf.Session() as sess:
        # Initialize variables
        sess.run(tf.global_variables_initializer())
        # If there is no saved model, keep the fresh initialization; otherwise restore
        kpt = tf.train.latest_checkpoint(savedir)
        print("kpt:", kpt)
        startepo = 0
        if kpt != None:
            saver.restore(sess, kpt)
            ind = kpt.find("-")
            startepo = int(kpt[ind + 1:])

        # The audio file to recognize
        wav_file = 'input.wav'

        source, source_lengths, sparse_labels = get_speech_file(wav_file, labels)
        feed2 = {input_tensor: source, targets: sparse_labels, seq_length: source_lengths, keep_dropout: 1.0}
        d, train_ler = sess.run([decoded[0], ler], feed_dict=feed2)
        dense_decoded = tf.sparse_tensor_to_dense(d, default_value=-1).eval(session=sess)
        if (len(dense_decoded) > 0):
            decoded_str = ndarray_to_text_ch(dense_decoded[0], words)
            print('Decoded:  {}'.format(decoded_str))
    fbH1, _, _ = bidirectional_rnn(forwardH1,
                                   backwardH1,
                                   inputList,
                                   dtype=tf.float32,
                                   scope='BDLSTM_H1')
    fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
    outH1 = [
        tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) +
        biasesOutH1 for t in fbH1rs
    ]

    logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

    ####Optimizing
    logits3d = tf.pack(logits)
    loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
    optimizer = tf.train.MomentumOptimizer(learningRate,
                                           momentum).minimize(loss)

    ####Evaluating
    logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                             [seqLengths[0], 1])
    predictions = tf.to_int32(
        ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
    errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \
                tf.to_float(tf.size(targetY.values))

####Run session
with tf.Session(graph=graph) as session:
    print('Initializing')
    tf.initialize_all_variables().run()
    outputs = tf.reshape(outputs, [-1, num_lstm_hidden])

    # Weights for regression layer.
    W = tf.Variable(tf.truncated_normal([num_lstm_hidden, num_classes], stddev=0.1), name='W')
    b = tf.Variable(tf.constant(0., shape=[num_classes]), name='b')

    # Apply linear transform
    logits = tf.matmul(outputs, W) + b

    # Reshaping back to the original shape
    logits = tf.reshape(logits, [batch_s, -1, num_classes])

    # Swap dimensions to time major for CTC loss.
    logits = tf.transpose(logits, (1, 0, 2))

    loss = ctc.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    # Record the loss
    tf.contrib.deprecated.scalar_summary('loss', cost)

    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True).minimize(cost)

    decoded, log_prob = ctc.ctc_beam_search_decoder(inputs=logits, sequence_length=seq_len)

    # Label error rate using the edit distance between output and target
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

    # Record the label error rate
    tf.contrib.deprecated.scalar_summary('label error rate', ler)
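
# --- Hedged, self-contained sketch (the values are made up, not from the original code):
# tf.edit_distance with the default normalize=True divides the raw edit distance by the
# target length, which is why the mean above is a label *error rate*.
import tensorflow as tf

hyp = tf.SparseTensor(indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
truth = tf.SparseTensor(indices=[[0, 0], [0, 1], [0, 2]], values=[1, 2, 3], dense_shape=[1, 3])
demo_ler = tf.reduce_mean(tf.edit_distance(hyp, truth))
with tf.Session() as demo_sess:
    print(demo_sess.run(demo_ler))  # one missing label out of three -> ~0.333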
Example #38
input_tensor = tf.placeholder(
    tf.float32, [None, None, n_input + (2 * n_input * n_context)],
    name='input')  # speech log filter bank or MFCC features
# Use sparse_placeholder; will generate a SparseTensor, required by ctc_loss op.
targets = tf.sparse_placeholder(tf.int32, name='targets')  # transcript text
# 1d array of size [batch_size]
seq_length = tf.placeholder(tf.int32, [None], name='seq_length')  # sequence lengths
keep_dropout = tf.placeholder(tf.float32)

# logits is the non-normalized output/activations from the last layer.
# logits will be input for the loss function.
# nn_model is from the import statement in the load_model function
logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input, n_context,
                     words_size + 1, keep_dropout)

# Call ctc loss
avg_loss = tf.reduce_mean(ctc_ops.ctc_loss(targets, logits, seq_length))

#[optimizer]
learning_rate = 0.001
optimizer = tf.train.AdamOptimizer(
    learning_rate=learning_rate).minimize(avg_loss)

with tf.name_scope("decode"):
    decoded, log_prob = ctc_ops.ctc_beam_search_decoder(logits,
                                                        seq_length,
                                                        merge_repeated=False)

with tf.name_scope("accuracy"):
    distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
    # Compute the label error rate (accuracy)
    ler = tf.reduce_mean(distance, name='label_error_rate')
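
# Hedged alternative (assumption, not in the original code): ctc_greedy_decoder is
# effectively beam search with beam_width=1, so it can be swapped in for a faster,
# less accurate decode without touching the rest of the graph.
with tf.name_scope("decode_greedy"):
    decoded_greedy, log_prob_greedy = ctc_ops.ctc_greedy_decoder(
        logits, seq_length, merge_repeated=True)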
Example #39
def runCTC(batch):
    INPUT_PATH = '../TRAIN/All/mfcc/'  #directory of MFCC nFeatures x nFrames 2-D array .npy files
    TARGET_PATH = '../TRAIN/All/phone_y/'  #directory of nPhonemes 1-D array .npy files

    ####Learning Parameters
    learningRate = 0.001
    momentum = 0.9
    nEpochs = 300
    batchSize = batch.shape[1]

    ####Network Parameters
    nFeatures = 39  #12 MFCC coefficients + energy, and derivatives
    nHidden = 256
    nClasses = 30  #39 phonemes, plus the "blank" for CTC

    ####Load data
    print('Loading data')
    with open('TIMIT_data_prepared_for_CTC.pkl', 'rb') as f:
        data = pickle.load(f)
    input_list = batch
    charmap = data['chars']
    print(charmap)
    charmap.append('_')
    #batchedData, maxTimeSteps = data_lists_to_batches(input_list, target_list, batchSize)
    maxTimeSteps = 776
    totalN = len(input_list)

    ####Define graph
    print('Defining graph')
    graph = tf.Graph()
    with graph.as_default():

        ####NOTE: try variable-steps inputs and dynamic bidirectional rnn, when it's implemented in tensorflow

        ####Graph input
        inputX = tf.placeholder(tf.float32,
                                shape=(maxTimeSteps, batchSize, nFeatures))
        #Prep input data to fit requirements of rnn.bidirectional_rnn
        #  Reshape to 2-D tensor (nTimeSteps*batchSize, nfeatures)
        inputXrs = tf.reshape(inputX, [-1, nFeatures])
        #  Split to get a list of 'n_steps' tensors of shape (batch_size, n_hidden)
        inputList = tf.split(0, maxTimeSteps, inputXrs)
        targetIxs = tf.placeholder(tf.int64)
        targetVals = tf.placeholder(tf.int32)
        targetShape = tf.placeholder(tf.int64)
        targetY = tf.SparseTensor(targetIxs, targetVals, targetShape)
        seqLengths = tf.placeholder(tf.int32, shape=(batchSize))

        ####Weights & biases
        weightsOutH1 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH1 = tf.Variable(tf.zeros([nHidden]))
        weightsOutH2 = tf.Variable(
            tf.truncated_normal([2, nHidden],
                                stddev=np.sqrt(2.0 / (2 * nHidden))))
        biasesOutH2 = tf.Variable(tf.zeros([nHidden]))
        weightsClasses = tf.Variable(
            tf.truncated_normal([nHidden, nClasses],
                                stddev=np.sqrt(2.0 / nHidden)))
        biasesClasses = tf.Variable(tf.zeros([nClasses]))

        ####Network
        forwardH1 = rnn_cell.LSTMCell(nHidden,
                                      use_peepholes=True,
                                      state_is_tuple=True)
        backwardH1 = rnn_cell.LSTMCell(nHidden,
                                       use_peepholes=True,
                                       state_is_tuple=True)
        fbH1, _, _ = bidirectional_rnn(forwardH1,
                                       backwardH1,
                                       inputList,
                                       dtype=tf.float32,
                                       scope='BDLSTM_H1')
        fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
        outH1 = [
            tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) +
            biasesOutH1 for t in fbH1rs
        ]

        logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

        ####Optimizing
        logits3d = tf.pack(logits)
        loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
        optimizer = tf.train.MomentumOptimizer(learningRate,
                                               momentum).minimize(loss)

        ####Evaluating
        logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0],
                                 [seqLengths[0], 1])
        predictions = tf.to_int32(
            ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
        errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \
                    tf.to_float(tf.size(targetY.values))

    ####Run session
    with tf.Session(graph=graph) as session:
        print('Initializing')
        saver = tf.train.Saver()

        ckpt = tf.train.get_checkpoint_state('/users/TeamASR/models')
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())
        feedDict = {inputX: batch, seqLengths: (np.ones([batchSize]) * 776)}
        logit = session.run([logits3d], feed_dict=feedDict)
    return logit
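
# Hedged usage sketch (the batch construction and the naive argmax below are assumptions,
# not part of the original project): runCTC returns the time-major logits, which can be
# inspected with a simple per-frame argmax before any CTC collapsing.
#
#   logits_val = runCTC(batch)[0]                # [maxTimeSteps, batchSize, nClasses]
#   best_path = np.argmax(logits_val, axis=2)    # per-frame class ids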
	forwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
	backwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
	print("building bidirectional_rnn ... SLOW!!!")
	fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList, dtype=tf.float32, scope='BDLSTM_H1')
	print("done building rnn")
	print("building fbH1rs ")
	fbH1rs = [tf.reshape(t, [Size, 2, nHidden]) for t in fbH1]
	print("building outH1 ")
	outH1 = [tf.reduce_sum(tf.multiply(t, weightsOutH1), axis=1) + biasesOutH1 for t in fbH1rs]
	print("building logits ")
	logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]
	print("len(outH1) %d"% len(outH1))
	####Optimizing
	print("building loss")
	logits3d = tf.stack(logits)
	loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
	out = tf.identity(loss, 'ctc_loss_mean')
	optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

	####Evaluating
	print("building Evaluation")
	logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
	predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
	reduced_sum = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False))
	errorRate = reduced_sum / tf.to_float(tf.size(targetY.values))

	check_op = tf.add_check_numerics_ops()
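	# Hedged usage note (assumption, not in the original code): check_op can be run
	# together with the loss to surface NaN/Inf values in any graph tensor, e.g.
	#   _, lossVal = session.run([check_op, loss], feed_dict=feedDict)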
print("done building graph")

####Run session
with tf.Session(graph=graph) as session: