def compute_ler(self, decode_op_main, decode_op_sub,
                    labels_main, labels_sub):
        """Build the LER (Label Error Rate) ops for the main and sub tasks.

        Each LER is the mean over the batch of the edit distance between the
        decoded sequences and the target labels, with normalization enabled.
        A scalar summary of each op is registered in both
        ``self.summaries_train`` and ``self.summaries_dev``.

        Args:
            decode_op_main: operation for decoding of the main task
            decode_op_sub: operation for decoding of the sub task
            labels_main: A SparseTensor of target labels in the main task
            labels_sub: A SparseTensor of target labels in the sub task
        Return:
            ler_op_main: operation for computing LER of the main task
            ler_op_sub: operation for computing LER of the sub task
        """
        # Normalized edit distance per sequence, then averaged over the batch
        distances_main = tf.edit_distance(
            decode_op_main, labels_main, normalize=True)
        ler_op_main = tf.reduce_mean(distances_main)

        distances_sub = tf.edit_distance(
            decode_op_sub, labels_sub, normalize=True)
        ler_op_sub = tf.reduce_mean(distances_sub)

        # The same LER ops are logged under separate tags for train and dev
        self.summaries_train.extend([
            tf.summary.scalar('ler_main_train', ler_op_main),
            tf.summary.scalar('ler_sub_train', ler_op_sub),
        ])
        self.summaries_dev.extend([
            tf.summary.scalar('ler_main_dev', ler_op_main),
            tf.summary.scalar('ler_sub_dev', ler_op_sub),
        ])

        return ler_op_main, ler_op_sub
Example #2
0
def _get_testing(rnn_logits, sequence_length, label, label_length):
    """Create the scalar evaluation ops and register their summaries.

    Returns a tuple ``(loss, label_error, sequence_error)``:
       loss: value produced by ``model.ctc_loss_layer``
       label_error: total edit distance of the top beam-search path divided
           by the total of ``label_length``
       sequence_error: number of sequences with a non-zero edit distance
           divided by the first dimension of ``label_length``
    """
    with tf.name_scope("train"):
        loss = model.ctc_loss_layer(rnn_logits, label, sequence_length)

    with tf.name_scope("test"):
        decoded_paths, _log_probs = tf.nn.ctc_beam_search_decoder(
            rnn_logits,
            sequence_length,
            beam_width=128,
            top_paths=1,
            merge_repeated=True)
        # Cast the single returned path to int32 before comparing with label
        best_path = tf.cast(decoded_paths[0], tf.int32)

        # Raw (unnormalized) edit count per sequence
        edits_per_sequence = tf.edit_distance(best_path, label,
                                              normalize=False)
        wrong_sequences = tf.count_nonzero(edits_per_sequence, axis=0)

        edits_in_batch = tf.reduce_sum(edits_per_sequence)
        labels_in_batch = tf.reduce_sum(label_length)
        label_error = tf.truediv(edits_in_batch,
                                 tf.cast(labels_in_batch, tf.float32),
                                 name='label_error')

        wrong_sequences_i32 = tf.cast(wrong_sequences, tf.int32)
        batch_size = tf.shape(label_length)[0]
        sequence_error = tf.truediv(wrong_sequences_i32,
                                    batch_size,
                                    name='sequence_error')

        for tag, value in (('loss', loss),
                           ('label_error', label_error),
                           ('sequence_error', sequence_error)):
            tf.summary.scalar(tag, value)

    return loss, label_error, sequence_error
  def _testEditDistanceST(
      self, hypothesis_st, truth_st, normalize, expected_output,
      expected_shape, expected_err_re=None):
    """Check tf.edit_distance on two SparseTensors.

    When `expected_err_re` is given, evaluating the op must raise an op error
    matching it. Otherwise the op's static shape must equal `expected_shape`
    and its evaluated value must be close to `expected_output`.
    """
    distance_op = tf.edit_distance(
        hypothesis=hypothesis_st, truth=truth_st, normalize=normalize)

    # Error path: only the failure on evaluation is verified.
    if expected_err_re is not None:
      with self.assertRaisesOpError(expected_err_re):
        distance_op.eval()
      return

    self.assertEqual(distance_op.get_shape(), expected_shape)
    self.assertAllClose(distance_op.eval(), expected_output)
def test_edit_distance():
    """Print the unnormalized edit distance for two hard-coded sequence pairs.

    Builds a private graph with sparse int32 placeholders for the truth and
    the hypothesis, feeds them through ``sparse_tensor_feed`` and prints the
    result of running the ``tf.edit_distance`` op.
    """
    graph = tf.Graph()
    with graph.as_default():
        truth_ph = tf.sparse_placeholder(tf.int32)
        hyp_ph = tf.sparse_placeholder(tf.int32)
        distance_op = tf.edit_distance(hyp_ph, truth_ph, normalize=False)

    with tf.Session(graph=graph) as session:
        feed = {
            truth_ph: sparse_tensor_feed([[0,1,2], [0,1,2,3,4]]),
            hyp_ph: sparse_tensor_feed([[3,4,5], [0,1,2,2]]),
        }
        # The fetch is wrapped in a list, so a one-element list is printed.
        print(session.run([distance_op], feed_dict=feed))
    def _testEditDistance(self, hypothesis, truth, normalize, expected_output, expected_err_re=None):
        """Check tf.edit_distance on sparse inputs given as plain tuples.

        Args:
            hypothesis: (index, value, shape) tuple describing the hypothesis
                SparseTensor.
            truth: (index, value, shape) tuple describing the truth
                SparseTensor.
            normalize: forwarded to tf.edit_distance.
            expected_output: value the evaluated op must be close to (only
                used when no error is expected).
            expected_err_re: if not None, evaluating the op must raise an op
                error matching this pattern.
        """
        # hypothesis and truth are (index, value, shape) tuples
        hypothesis_st = tf.SparseTensor(*[ConstantOf(x) for x in hypothesis])
        truth_st = tf.SparseTensor(*[ConstantOf(x) for x in truth])
        edit_distance = tf.edit_distance(hypothesis=hypothesis_st, truth=truth_st, normalize=normalize)

        with self.test_session():
            if expected_err_re is None:
                # Shape inference figures out the shape from the shape variables:
                # element-wise max of the two shapes with the last axis dropped.
                # zip() returns an iterator on Python 3, so it has to be
                # materialized before it can be sliced.
                leading_dims = list(zip(hypothesis[2], truth[2]))[:-1]
                expected_shape = [max(h, t) for h, t in leading_dims]
                self.assertEqual(edit_distance.get_shape(), expected_shape)
                output = edit_distance.eval()
                self.assertAllClose(output, expected_output)
            else:
                with self.assertRaisesOpError(expected_err_re):
                    edit_distance.eval()
    def compute_ler(self, decode_op, labels):
        """Build the LER (Label Error Rate) op and register its summaries.

        The LER is the mean over the batch of the normalized edit distance
        between the decoded sequences and the target labels. The same op is
        logged as 'ler_train' in ``self.summaries_train`` and as 'ler_dev'
        in ``self.summaries_dev``.

        Args:
            decode_op: operation for decoding
            labels: A SparseTensor of target labels
        Return:
            ler_op: operation for computing LER
        """
        # Normalized edit distance per sequence, then averaged over the batch
        normalized_distances = tf.edit_distance(
            decode_op, labels, normalize=True)
        ler_op = tf.reduce_mean(normalized_distances)

        # One scalar summary per split, both tracking the same op
        train_summary = tf.summary.scalar('ler_train', ler_op)
        self.summaries_train.append(train_summary)
        dev_summary = tf.summary.scalar('ler_dev', ler_op)
        self.summaries_dev.append(dev_summary)

        return ler_op
Example #7
0
def sequence_edit_distance(predictions,
                           labels,
                           weights_fn=common_layers.weights_nonzero):
  """Edit distance over a batch relative to reference length, skipping 0s.

  Entries equal to 0 are treated as padding in both the predicted classes and
  the labels. The score is the summed edit distance divided by the number of
  non-zero label entries; that count is also returned as the weight.

  NOTE(review): the squeeze over axes (2, 3) is applied to the argmax output
  and to the labels, so both must carry singleton axes at positions 2 and 3
  -- confirm the expected input ranks against the callers.

  Args:
    predictions: float logits, 0-padded; the class is taken as the argmax
        over the last axis.
    labels: integer labels, 0-padded.
    weights_fn: must be common_layers.weights_nonzero; it is not called.

  Returns:
    (edit distance / reference length, reference length)

  Raises:
    ValueError: if weights_fn is not common_layers.weights_nonzero.
  """
  if weights_fn is not common_layers.weights_nonzero:
    raise ValueError("Only weights_nonzero can be used for this metric.")

  def _nonzero_as_sparse(dense):
    """Return (positions of non-zero entries, SparseTensor holding them)."""
    positions = tf.where(tf.not_equal(dense, 0))
    sparse = tf.SparseTensor(positions,
                             tf.gather_nd(dense, positions),
                             tf.shape(dense, out_type=tf.int64))
    return positions, sparse

  with tf.variable_scope("edit_distance", values=[predictions, labels]):
    # Pick the highest-scoring class at every step.
    class_ids = tf.to_int32(
        tf.squeeze(tf.argmax(predictions, axis=-1), axis=(2, 3)))
    _, hypothesis_sparse = _nonzero_as_sparse(class_ids)

    label_positions, truth_sparse = _nonzero_as_sparse(
        tf.squeeze(labels, axis=(2, 3)))

    total_edits = tf.reduce_sum(
        tf.edit_distance(hypothesis_sparse, truth_sparse, normalize=False))
    # One row of label_positions per non-zero label entry.
    reference_length = tf.to_float(
        common_layers.shape_list(label_positions)[0])
    return total_edits / reference_length, reference_length
Example #8
0
    def loss(self):
        """Define the loss, the training op, the decoder and the error metric.

        Sets ``self.avg_loss``, ``self.optimizer``, ``self.decoded``,
        ``self.distance`` and ``self.label_err``. ``learning_rate`` is not a
        parameter or attribute; it is looked up from the enclosing scope.

        :return: None
        """
        # CTC loss, averaged over the batch
        with tf.name_scope('loss'):
            per_example_loss = ctc_ops.ctc_loss(self.text, self.logits, self.seq_length)
            self.avg_loss = tf.reduce_mean(per_example_loss)
            tf.summary.scalar('loss', self.avg_loss)

        # Training step
        with tf.name_scope('train'):
            adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
            self.optimizer = adam.minimize(self.avg_loss)

        with tf.name_scope("decode"):
            self.decoded, _log_prob = ctc_ops.ctc_beam_search_decoder(
                self.logits, self.seq_length, merge_repeated=False)

        with tf.name_scope("accuracy"):
            best_path = tf.cast(self.decoded[0], tf.int32)
            self.distance = tf.edit_distance(best_path, self.text)
            # Label error rate: mean edit distance over the batch. It is
            # logged under the tag 'accuracy' although it is an error measure.
            self.label_err = tf.reduce_mean(self.distance, name='label_error_rate')
            tf.summary.scalar('accuracy', self.label_err)
    def compute_ler(self, labels_true, labels_pred):
        """Build the op that computes LER (Label Error Rate).

        The LER is the mean over the batch of the normalized edit distance
        from the predicted labels to the target labels. No summaries are
        registered here (see the disabled code below).

        Args:
            labels_true: A SparseTensor of target labels
            labels_pred: A SparseTensor of predicted labels
        Returns:
            ler_op: operation for computing LER
        """
        # Normalized edit distance per sequence (prediction vs. target)
        normalized_distances = tf.edit_distance(
            labels_pred, labels_true, normalize=True)
        # TODO: consider <EOS>
        ler_op = tf.reduce_mean(normalized_distances)

        # Scalar summaries for the snapshot of LER are currently disabled:
        # with tf.name_scope("ler"):
        #     self.summaries_train.append(tf.summary.scalar(
        #         'ler_train', ler_op))
        #     self.summaries_dev.append(tf.summary.scalar(
        #         'ler_dev', ler_op))
        # TODO: this raises an error because the feed_dict timing differs
        # (do it before updating global_step?)

        return ler_op
Example #10
0
    def _build_graph(self, inputs):
        """Build the stacked-LSTM CTC graph and register cost/error summaries.

        Sets ``self.cost`` to the mean CTC loss (op name 'cost') and adds a
        moving summary for it and for the mean normalized edit distance
        (op name 'error').

        Args:
            inputs: a 5-tuple ``(feat, labelidx, labelvalue, labelshape,
                seqlen)``; the three label entries are the indices, values
                and dense shape of the sparse target labels.
        """
        feat, labelidx, labelvalue, labelshape, seqlen = inputs
        label = tf.SparseTensor(labelidx, labelvalue, labelshape)

        # Create one cell object per layer. Repeating a single instance
        # ([cell] * NLAYER) makes every layer refer to the same cell, which
        # newer TensorFlow 1.x releases either reject or treat as weight
        # sharing between layers.
        cells = [tf.contrib.rnn.BasicLSTMCell(num_units=HIDDEN)
                 for _ in range(NLAYER)]
        cell = tf.contrib.rnn.MultiRNNCell(cells)

        # The batch size is read from feat at run time.
        initial = cell.zero_state(tf.shape(feat)[0], tf.float32)

        outputs, last_state = tf.nn.dynamic_rnn(cell, feat,
                                                seqlen, initial,
                                                dtype=tf.float32, scope='rnn')

        # o: b x t x HIDDEN
        output = tf.reshape(outputs, [-1, HIDDEN])  # (Bxt) x rnnsize
        logits = FullyConnected('fc', output, NR_CLASS, nl=tf.identity,
                                W_init=tf.truncated_normal_initializer(stddev=0.01))
        logits = tf.reshape(logits, (BATCH, -1, NR_CLASS))

        # Logits are still batch-major here, hence time_major=False.
        loss = tf.nn.ctc_loss(label, logits, seqlen, time_major=False)

        self.cost = tf.reduce_mean(loss, name='cost')

        # Swap the first two axes before handing the logits to the decoders.
        logits = tf.transpose(logits, [1, 0, 2])

        isTrain = get_current_tower_context().is_training
        if isTrain:
            # beam search is too slow to run in training
            predictions = tf.to_int32(
                tf.nn.ctc_greedy_decoder(logits, seqlen)[0][0])
        else:
            predictions = tf.to_int32(
                tf.nn.ctc_beam_search_decoder(logits, seqlen)[0][0])
        err = tf.edit_distance(predictions, label, normalize=True)
        # Pin the static shape to rank 1 before averaging.
        err.set_shape([None])
        err = tf.reduce_mean(err, name='error')
        summary.add_moving_summary(err, self.cost)
Example #11
0
def runCTC(batch):
    """Build the bidirectional-LSTM CTC graph and return the logits for one batch.

    The graph is rebuilt on every call. Weights are restored from the latest
    checkpoint under '/users/TeamASR/models' when one exists; otherwise all
    variables are freshly initialized.

    Args:
        batch: array fed to the `inputX` placeholder, so it has to match the
            placeholder shape (maxTimeSteps, batchSize, nFeatures), i.e.
            (776, batch.shape[1], 39).

    Returns:
        A one-element list holding the evaluated `logits3d` (the fetch is
        passed to session.run as a list).
    """
    # NOTE(review): INPUT_PATH and TARGET_PATH are not used in this function.
    INPUT_PATH = '../TRAIN/All/mfcc/' #directory of MFCC nFeatures x nFrames 2-D array .npy files
    TARGET_PATH = '../TRAIN/All/phone_y/' #directory of nPhonemes 1-D array .npy files


    ####Learning Parameters
    learningRate = 0.001
    momentum = 0.9
    nEpochs = 300
    # The batch size is taken from the second axis of the input.
    batchSize = batch.shape[1]

    ####Network Parameters
    nFeatures = 39 #12 MFCC coefficients + energy, and derivatives
    nHidden = 256
    # NOTE(review): the value is 30 while the original comment below speaks of
    # 39 phonemes plus blank -- confirm which one is intended.
    nClasses = 30 #39 phonemes, plus the "blank" for CTC

    ####Load data
    print('Loading data')
    # NOTE(review): pickle.load can execute arbitrary code; only use with a
    # trusted file.
    with open('TIMIT_data_prepared_for_CTC.pkl','rb') as f:
        data= pickle.load(f)
    input_list = batch
    charmap = data['chars']
    print(charmap)
    # Extra '_' symbol appended after the loaded characters.
    charmap.append('_')
    #batchedData, maxTimeSteps = data_lists_to_batches(input_list, target_list, batchSize)
    # Fixed number of time steps; the same value is fed as every sequence
    # length further down.
    maxTimeSteps = 776
    totalN = len(input_list)

    ####Define graph
    print('Defining graph')
    graph = tf.Graph()
    with graph.as_default():

        ####NOTE: try variable-steps inputs and dynamic bidirectional rnn, when it's implemented in tensorflow

        ####Graph input
        inputX = tf.placeholder(tf.float32, shape=(maxTimeSteps, batchSize, nFeatures))
        #Prep input data to fit requirements of rnn.bidirectional_rnn
        #  Reshape to 2-D tensor (nTimeSteps*batchSize, nfeatures)
        inputXrs = tf.reshape(inputX, [-1, nFeatures])
        #  Split to get a list of 'n_steps' tensors of shape (batch_size, n_hidden)
        inputList = tf.split(0, maxTimeSteps, inputXrs)
        # The three placeholders below are the indices, values and dense shape
        # of the sparse target labels. They are not fed by this function.
        targetIxs = tf.placeholder(tf.int64)
        targetVals = tf.placeholder(tf.int32)
        targetShape = tf.placeholder(tf.int64)
        targetY = tf.SparseTensor(targetIxs, targetVals, targetShape)
        seqLengths = tf.placeholder(tf.int32, shape=(batchSize))

        ####Weights & biases
        # NOTE: these variables get auto-generated names in creation order, so
        # their order matters for restoring a checkpoint by name.
        weightsOutH1 = tf.Variable(tf.truncated_normal([2, nHidden],
                                                       stddev=np.sqrt(2.0 / (2*nHidden))))
        biasesOutH1 = tf.Variable(tf.zeros([nHidden]))
        # weightsOutH2 / biasesOutH2 are created but not used by the network
        # defined below.
        weightsOutH2 = tf.Variable(tf.truncated_normal([2, nHidden],
                                                       stddev=np.sqrt(2.0 / (2*nHidden))))
        biasesOutH2 = tf.Variable(tf.zeros([nHidden]))
        weightsClasses = tf.Variable(tf.truncated_normal([nHidden, nClasses],
                                                         stddev=np.sqrt(2.0 / nHidden)))
        biasesClasses = tf.Variable(tf.zeros([nClasses]))

        ####Network
        forwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
        backwardH1 = rnn_cell.LSTMCell(nHidden, use_peepholes=True, state_is_tuple=True)
        fbH1, _, _ = bidirectional_rnn(forwardH1, backwardH1, inputList, dtype=tf.float32,
                                           scope='BDLSTM_H1')
        # Each step output is viewed as (batchSize, 2, nHidden), weighted
        # element-wise by weightsOutH1 and summed over the axis of size 2.
        fbH1rs = [tf.reshape(t, [batchSize, 2, nHidden]) for t in fbH1]
        outH1 = [tf.reduce_sum(tf.mul(t, weightsOutH1), reduction_indices=1) + biasesOutH1 for t in fbH1rs]

        # Per-step linear projection to class scores.
        logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]

        ####Optimizing
        # Stack the per-step logits into a single tensor.
        logits3d = tf.pack(logits)
        loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
        # The training op is defined but never run in this function.
        optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

        ####Evaluating
        # The evaluation ops below are defined but not fetched in this function.
        logitsMaxTest = tf.slice(tf.argmax(logits3d,2), [0, 0], [seqLengths[0], 1])
        predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
        # Total edit distance divided by the number of target label values.
        errorRate = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False)) / \
                    tf.to_float(tf.size(targetY.values))

    ####Run session
    with tf.Session(graph=graph) as session:
        print('Initializing')
        saver = tf.train.Saver()

        # Restore the latest checkpoint if its file exists, else start fresh.
        ckpt = tf.train.get_checkpoint_state('/users/TeamASR/models')
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.initialize_all_variables())
        # Every sequence is declared to have the full length of 776 steps
        # (same value as maxTimeSteps).
        feedDict = {inputX: batch, seqLengths: (np.ones([batchSize])*776)}
        logit = session.run([logits3d], feed_dict=feedDict)
    return logit
Example #12
0
 def create_cer(sparse_decoded, sparse_targets):
     """Return the normalized edit distance between decoded and target sequences.

     `sparse_decoded` is cast to int32 before the comparison; the result of
     tf.edit_distance (with normalize=True) is returned as is.
     """
     decoded_int32 = tf.cast(sparse_decoded, tf.int32)
     return tf.edit_distance(decoded_int32, sparse_targets, normalize=True)
import tensorflow as tf

sess = tf.Session()

#----------------------------------
# First compute the edit distance between 'bear' and 'beers'
hypothesis = list('bear')
truth = list('beers')
# The dense shape has to bound every index: the last axis holds one entry per
# character, so its size is the word length (4 and 5 here), not 1.
h1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3]], hypothesis,
                     [1, 1, 4])

t1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4]],
                     truth, [1, 1, 5])

# normalize=False reports the raw number of edits.
print(sess.run(tf.edit_distance(h1, t1, normalize=False)))

#----------------------------------
# Compute the edit distance between ('bear','beer') and 'beers':
# both hypotheses live in one SparseTensor (second axis selects the word) and
# each is compared with its own copy of 'beers'.
hypothesis2 = list('bearbeer')
truth2 = list('beersbeers')
h2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 1, 0],
                      [0, 1, 1], [0, 1, 2], [0, 1, 3]], hypothesis2, [1, 2, 4])

t2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4],
                      [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3], [0, 1, 4]],
                     truth2, [1, 2, 5])

# normalize=True divides each distance by the length of its truth sequence.
print(sess.run(tf.edit_distance(h2, t2, normalize=True)))

#----------------------------------
Example #14
0
# Flatten the RNN outputs to 2-D so that one matmul projects every step.
outputs = tf.reshape(outputs1, [-1, num_hidden])

# Dropout is applied to the RNN outputs right before the linear projection.
logits0 = tf.matmul(tf.nn.dropout(outputs, keep_prob), W) + b
# Restore the batch axis, then swap the first two axes
# (presumably to the time-major layout the CTC ops expect -- confirm).
logits1 = tf.reshape(logits0, [batch_s, -1, num_classes])
logits = tf.transpose(logits1, (1, 0, 2))
logits = tf.cast(logits, tf.float32)

# Per-example CTC loss and its mean over the batch.
loss = tf.nn.ctc_loss(labels, logits, seq_len)
cost = tf.reduce_mean(loss)
# Two decoders over the same logits: one with beam_width=1 and one with the
# default beam width.
width1_decoded, width1_log_prob = tf.nn.ctc_beam_search_decoder(
    logits, seq_len, merge_repeated=False, beam_width=1)
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits,
                                                  seq_len,
                                                  merge_repeated=False)
# NOTE: despite the "acc" names these are mean edit distances between the
# top decoded path and the labels, so lower values are better.
width1_acc = tf.reduce_mean(
    tf.edit_distance(tf.cast(width1_decoded[0], tf.int32), labels))
acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), labels))
# Only the most recent checkpoint is kept.
saver = tf.train.Saver(max_to_keep=1)

result = 0
# Both listings are sorted so that image i is paired with transcript i below.
imgFiles = sorted(glob.glob(os.path.join("test_img", "*")))
txtFiles = sorted(glob.glob(os.path.join("test_txt", "*")))
for i in range(len(imgFiles)):
    # The context manager closes the file even if reading it fails.
    with open(txtFiles[i]) as fin:
        lines = fin.readlines()
    # Reference lines for this image: an independent copy of what was read.
    goldLines = list(lines)
Example #15
0
def train():
    """Build the CTC training graph and run the training loop.

    The loop runs indefinitely: each pass trains on BATCHES batches and then
    saves a checkpoint. The function only returns when a batch cost is
    NaN/inf or when a single batch takes longer than 60 seconds.
    """
    global_step = tf.Variable(0, trainable=False)

    # learning_rate = tf.train.exponential_decay(LEARNING_RATE_INITIAL,
    #    global_step,
    #    LEARNING_RATE_DECAY_STEPS,
    #    LEARNING_RATE_DECAY_FACTOR,
    #    staircase=True, name="learning_rate")
    # Decided that a manually specified learning rate is more reliable
    curr_learning_rate = 1e-5
    # The learning rate is fed on every training step through this placeholder.
    learning_rate = tf.placeholder(tf.float32, shape=[])

    logits, inputs, labels, seq_len, keep_prob = neural_networks()

    # If time_major == True (default), this will be a Tensor shaped: [max_time x batch_size x num_classes]
    # Returns a 1-D float Tensor, size [batch], containing the negative log probabilities.
    loss = tf.nn.ctc_loss(labels=labels,
                          inputs=logits,
                          sequence_length=seq_len)
    cost = tf.reduce_mean(loss, name="cost")

    # Converged poorly
    # optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=MOMENTUM).minimize(cost, global_step=global_step)

    # Gradient clipping (did not seem to help much): keep gradients between -1 and 1
    # grads_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # grads_and_vars = grads_optimizer.compute_gradients(loss)
    # capped_grads_and_vars = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in grads_and_vars]
    # gradients, variables = zip(*grads_optimizer.compute_gradients(loss))
    # gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
    # capped_grads_and_vars = zip(gradients, variables)

    #capped_grads_and_vars = [(tf.clip_by_norm(g, 5), v) for g,v in grads_and_vars]
    # optimizer = grads_optimizer.apply_gradients(capped_grads_and_vars, global_step=global_step)

    # Minimize the loss
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        cost, global_step=global_step)
    # The ctc_greedy_decoder is a special case of the ctc_beam_search_decoder with top_paths=1 (but that decoder is faster for this special case).
    # decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len, merge_repeated=False)
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits,
                                                      seq_len,
                                                      beam_width=10,
                                                      merge_repeated=False)
    # decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)

    # Accuracy = 1 - (total edit distance / number of label values in the batch)
    acc = tf.reduce_sum(
        tf.edit_distance(tf.cast(decoded[0], tf.int32),
                         labels,
                         normalize=False))
    acc = 1 - acc / tf.to_float(tf.size(labels.values))

    init = tf.global_variables_initializer()

    def report_accuracy(decoded_list, test_labels):
        """Print each expected/decoded pair and the mean Levenshtein ratio.

        Both arguments are converted with decode_sparse_tensor. Pairs are
        compared up to the shorter of the two lists, while the final average
        divides by the number of expected sequences.
        """
        original_list = decode_sparse_tensor(test_labels)
        detected_list = decode_sparse_tensor(decoded_list)
        if len(original_list) != len(detected_list):
            print("len(original_list)", len(original_list),
                  "len(detected_list)", len(detected_list),
                  " test and detect length desn't match")
        print("T/F: original(length) <-------> detectcted(length)")
        _acc = 0.
        for idx in range(min(len(original_list), len(detected_list))):
            number = original_list[idx]
            detect_number = detected_list[idx]
            # Exact-match flag; printed only, not part of the reported score.
            hit = (number == detect_number)
            print("%6s" % hit, list_to_chars(number), "(", len(number), ")")
            print("%6s" % "", list_to_chars(detect_number), "(",
                  len(detect_number), ")")
            # Compute the Levenshtein ratio
            import Levenshtein
            _acc += Levenshtein.ratio(list_to_chars(number),
                                      list_to_chars(detect_number))
        print("Test Accuracy:", _acc / len(original_list))

    def do_report():
        """Decode one test batch and print its accuracy report.

        Only decoded[0] is fetched, with keep_prob fed as 1.0; the
        learning-rate placeholder is not fed here.
        """
        test_inputs, test_labels, test_seq_len = get_next_batch(
            TEST_BATCH_SIZE)
        test_feed = {
            inputs: test_inputs,
            labels: test_labels,
            seq_len: test_seq_len,
            keep_prob: 1.0
        }
        dd = session.run(decoded[0], test_feed)
        report_accuracy(dd, test_labels)

    def restore(sess):
        """Create the saver and restore the latest checkpoint if there is one.

        The model directory "model_ascii_res_lstm" next to this file is
        created when missing. Returns (saver, model_dir, saver_prefix).
        """
        curr_dir = os.path.dirname(__file__)
        model_dir = os.path.join(curr_dir, "model_ascii_res_lstm")
        if not os.path.exists(model_dir): os.mkdir(model_dir)
        saver_prefix = os.path.join(model_dir, "model.ckpt")
        ckpt = tf.train.get_checkpoint_state(model_dir)
        saver = tf.train.Saver(max_to_keep=5)
        if ckpt and ckpt.model_checkpoint_path:
            print("Restore Model ...")
            saver.restore(sess, ckpt.model_checkpoint_path)
        return saver, model_dir, saver_prefix

    with tf.Session() as session:
        # Initialize first; restore() then overwrites the values when a
        # checkpoint exists.
        session.run(init)
        saver, model_dir, checkpoint_path = restore(
            session)  # tf.train.Saver(tf.global_variables(), max_to_keep=100)
        while True:
            # Accumulated below but not read anywhere in this function.
            train_cost = 0
            for batch in range(BATCHES):
                start = time.time()

                train_inputs, train_labels, train_seq_len = get_next_batch(
                    BATCH_SIZE)
                feed = {
                    inputs: train_inputs,
                    labels: train_labels,
                    seq_len: train_seq_len,
                    keep_prob: 0.95,
                    learning_rate: curr_learning_rate
                }

                # l=session.run(layer,feed)
                # print(train_inputs.shape)
                # print(l.shape)
                # print(train_seq_len[0])

                # Fetching `optimizer` is what performs the training step.
                b_acc, b_loss, b_labels, b_logits, b_seq_len, b_cost, steps, b_learning_rate, _ = \
                    session.run([acc, loss, labels, logits, seq_len, cost, global_step, learning_rate, optimizer], feed)

                train_cost += b_cost * BATCH_SIZE
                seconds = round(time.time() - start, 2)
                print("step:", steps, "cost:", b_cost, "batch seconds:",
                      seconds, "acc:", b_acc, "width:", train_seq_len[0])
                # Abort training on a non-finite cost, after dumping the
                # labels of the offending batch.
                if np.isnan(b_cost) or np.isinf(b_cost):
                    print("Error: cost is nan or inf")
                    train_labels_list = decode_sparse_tensor(train_labels)
                    for i, train_label in enumerate(train_labels_list):
                        print(i, list_to_chars(train_label))
                    return

                # Abort training when a single batch takes more than a minute.
                if seconds > 60:
                    print('Exit for long time')
                    return

                if steps > 0 and steps % REPORT_STEPS == 0:
                    do_report()

            # Checkpoint once per pass over BATCHES batches.
            saver.save(session, checkpoint_path, global_step=steps)
Example #16
0
    with tf.name_scope('decoder'):
        decode, log_prob = tf.nn.ctc_beam_search_decoder(
            inputs=logits, sequence_length=seq_len, merge_repeated=True)

        targets = tf.sparse_placeholder(tf.int32, [None, None], name="target")

    with tf.name_scope('loss'):
        ctc_loss = tf.nn.ctc_loss(labels=targets,
                                  inputs=logits,
                                  sequence_length=seq_len)

        avg_loss = tf.reduce_mean(ctc_loss)
        tf.summary.histogram("avg_loss", avg_loss)

    with tf.name_scope('accuracy'):
        distance = tf.edit_distance(tf.cast(decode[0], tf.int32), targets)
        ler = tf.reduce_mean(distance, name='label_error_rate')

    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(learning_rate=alpha,
                                           beta1=beta1,
                                           beta2=beta2,
                                           epsilon=epsilon)

        optimizer = optimizer.minimize(avg_loss)

    elapsed_time = timer() - start
    print("Elapsed time : " + str(elapsed_time))


def run_model(sess, client):
Example #17
0
def _char_match_ratio(gt_label, pred):
    """Return the share of positions in ``gt_label`` that ``pred`` reproduces.

    Sequences are compared index by index. Ground-truth positions that ``pred``
    is too short to cover count as mismatches. An empty ``gt_label`` scores 1
    when ``pred`` is empty as well and 0 otherwise.
    """
    total_count = len(gt_label)
    if total_count == 0:
        return 1 if len(pred) == 0 else 0
    correct_count = sum(1 for expected, actual in zip(gt_label, pred) if expected == actual)
    return correct_count / total_count


def train_shadownet(dataset_dir, weights_path=None):
    """Train the ShadowNet CRNN with a CTC loss.

    Reads ``train_feature.tfrecords`` from ``dataset_dir``, builds the network,
    loss, decoder and optimizer, then runs ``config.cfg.TRAIN.EPOCHS`` training
    steps. A TensorBoard summary and a checkpoint are written after every step.

    :param dataset_dir: directory that contains ``train_feature.tfrecords``
    :param weights_path: checkpoint to restore before training; when ``None``
        all variables are initialised from scratch
    :return: None
    """
    # Batch size and sequence length are shared by the input queue, the CTC
    # loss and the decoder, so they are named once here.
    batch_size = 32
    seq_length = 25

    # decode the tf records to get the training data
    decoder = data_utils.TextFeatureIO().reader
    images, labels, imagenames = decoder.read_features(ops.join(dataset_dir, 'train_feature.tfrecords'),
                                                       num_epochs=None)
    inputdata, input_labels, input_imagenames = tf.train.shuffle_batch(
        tensors=[images, labels, imagenames], batch_size=batch_size, capacity=1000 + 2 * batch_size,
        min_after_dequeue=100, num_threads=1)

    inputdata = tf.cast(x=inputdata, dtype=tf.float32)

    # initialize the net model
    shadownet = crnn_model.ShadowNet(phase='Train', hidden_nums=256, layers_nums=2, seq_length=seq_length,
                                     num_classes=37)

    with tf.variable_scope('shadow', reuse=False):
        net_out = shadownet.build_shadownet(inputdata=inputdata)

    # Every sample in the batch uses the full, fixed sequence length.
    sequence_lengths = np.full(batch_size, seq_length, dtype=np.int32)

    cost = tf.reduce_mean(tf.nn.ctc_loss(labels=input_labels, inputs=net_out, sequence_length=sequence_lengths))

    decoded, log_prob = tf.nn.ctc_beam_search_decoder(net_out, sequence_lengths, merge_repeated=False)

    sequence_dist = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), input_labels))

    global_step = tf.Variable(0, name='global_step', trainable=False)

    starter_learning_rate = config.cfg.TRAIN.LEARNING_RATE
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                               config.cfg.TRAIN.LR_DECAY_STEPS, config.cfg.TRAIN.LR_DECAY_RATE,
                                               staircase=True)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # Run the collected update ops together with every optimizer step.
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate).minimize(loss=cost, global_step=global_step)

    # Set tf summary
    tboard_save_path = 'tboard/shadownet'
    if not ops.exists(tboard_save_path):
        os.makedirs(tboard_save_path)
    tf.summary.scalar(name='Cost', tensor=cost)
    tf.summary.scalar(name='Learning_Rate', tensor=learning_rate)
    tf.summary.scalar(name='Seq_Dist', tensor=sequence_dist)
    merge_summary_op = tf.summary.merge_all()

    # Set saver configuration
    saver = tf.train.Saver()
    model_save_dir = 'model/shadownet'
    if not ops.exists(model_save_dir):
        os.makedirs(model_save_dir)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    model_name = 'shadownet_{:s}.ckpt'.format(str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)

    # Set sess configuration
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.per_process_gpu_memory_fraction = config.cfg.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = config.cfg.TRAIN.TF_ALLOW_GROWTH

    # Set the training parameters
    train_epochs = config.cfg.TRAIN.EPOCHS

    sess = tf.Session(config=sess_config)
    summary_writer = tf.summary.FileWriter(tboard_save_path)

    # The outer try/finally guarantees that the session and the event file are
    # released even when a training step raises.
    try:
        summary_writer.add_graph(sess.graph)

        with sess.as_default():
            if weights_path is None:
                logger.info('Training from scratch')
                init = tf.global_variables_initializer()
                sess.run(init)
            else:
                logger.info('Restore model from {:s}'.format(weights_path))
                saver.restore(sess=sess, save_path=weights_path)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                # NOTE: each "epoch" here is a single optimisation step on one batch.
                for epoch in range(train_epochs):
                    _, c, seq_distance, preds, gt_labels, summary = sess.run(
                        [optimizer, cost, sequence_dist, decoded, input_labels, merge_summary_op])

                    # calculate the precision
                    preds = decoder.sparse_tensor_to_str(preds[0])
                    gt_labels = decoder.sparse_tensor_to_str(gt_labels)

                    accuracy = [_char_match_ratio(gt_label, preds[index])
                                for index, gt_label in enumerate(gt_labels)]
                    accuracy = np.mean(np.array(accuracy).astype(np.float32), axis=0)

                    if epoch % config.cfg.TRAIN.DISPLAY_STEP == 0:
                        logger.info('Epoch: {:d} cost= {:9f} seq distance= {:9f} train accuracy= {:9f}'.format(
                            epoch + 1, c, seq_distance, accuracy))

                    summary_writer.add_summary(summary=summary, global_step=epoch)
                    saver.save(sess=sess, save_path=model_save_path, global_step=epoch)
            finally:
                # Always stop and join the input-queue threads, otherwise they
                # keep running after a failure.
                coord.request_stop()
                coord.join(threads=threads)
    finally:
        summary_writer.close()
        sess.close()

    return
Example #18
0
    def build(self, ctc_beam_search=False, decay_steps=8000, decay_rate=0.7):
        '''Build all training and decoding ops into the object's tensorflow graph.

        A feedforward network is applied to every window of ``self.n_frames``
        consecutive rows of ``self.features`` inside a ``tf.while_loop``. Each
        window yields character logits (trained with CTC loss) and speaker-ID
        logits (trained with sparse softmax cross-entropy). The two losses are
        summed into ``self.cost``.

        Args:
            ctc_beam_search: when True, decode characters with
                ``tf.nn.ctc_beam_search_decoder``; otherwise use
                ``tf.nn.ctc_greedy_decoder``.
            decay_steps: ``decay_steps`` passed to ``tf.train.exponential_decay``.
            decay_rate: ``decay_rate`` passed to ``tf.train.exponential_decay``.

        Raises:
            RuntimeError: if ``self.built`` is already truthy.

        Sets on ``self``: ``rate``, ``features``, ``speaker``, ``targets``
        (placeholders), ``c_logits``, ``i_logits``, ``all_windows``, ``d_vecs``,
        ``cost``, ``train_step``, ``speaker_decode``, ``char_decode``, ``ler``
        and ``built``.
        '''
        if self.built:
            raise RuntimeError("Graph has already been built! Please reset.")

        # Base learning rate is fed at run time and decayed by the global step.
        self.rate = tf.placeholder(tf.float32, shape=[])

        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(self.rate,
                                                   global_step,
                                                   decay_steps,
                                                   decay_rate,
                                                   staircase=False)

        # features is rank 2; rows are sliced into windows below
        # (presumably [n_frames_total, n_features] -- TODO confirm with caller).
        self.features = tf.placeholder(tf.float32, [None, None])
        self.speaker = tf.placeholder(tf.int32, [None])
        self.targets = tf.sparse_placeholder(tf.int32)

        # Number of loop iterations, one window per iteration.
        # NOTE(review): rows - n_frames leaves the last full window unused;
        # confirm whether rows - n_frames + 1 was intended.
        n_windows = tf.shape(self.features)[0] - self.n_frames

        # Dynamically sized accumulators, written once per window.
        c_logit_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
        i_logit_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
        d_vec_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
        window_array = tf.TensorArray(tf.float32, size=0, dynamic_size=True)

        # Order matters: it must match the parameter order of cond() / body()
        # and the indices used on loop_output below.
        loop_vars = [
            0, n_windows, self.features, c_logit_array, i_logit_array,
            window_array, d_vec_array
        ]

        # define loop that applies the feedforward model over a frame sequence
        def cond(t, t_stop, *args):
            # stop iterating when the full frame sequence has been encoded
            return t < t_stop

        def body(t, t_stop, features, char_logits, id_logits, windows, d_vecs):
            # One loop iteration: run window t through the shared layers and
            # both task branches, then record the results at index t.

            # Layers after the shared stack are split evenly per task branch.
            n_per_branch = self.n_layers - self.n_shared

            shared = ['shared_layer_' + str(n) for n in range(self.n_shared)]
            char_scopes = ['char_layer_' + str(n) for n in range(n_per_branch)]
            id_scopes = ['id_layer_' + str(n) for n in range(n_per_branch)]

            char_out_scope = 'char_output'
            id_out_scope = 'id_output'

            # slice window out of feature array and flatten it
            # (reads self.features directly; the `features` loop variable is
            # only passed through unchanged)
            inp = self.features[t:t + self.n_frames, :]
            windows = windows.write(t, tf.reshape(inp, [1, self.size_in]))

            x = tf.reshape(inp, [1, self.size_in])

            # build and stack shared feedforward layers
            for i, scope in enumerate(shared):
                # only the first shared layer sees the raw window width
                size_in = self.size_in if i < 1 else self.n_per_layer
                x = self.ff_layer(x, size_in, self.n_per_layer, scope)

            if n_per_branch > 0:
                x_char = self.build_ff_branch(x, char_scopes)
                x_id = self.build_ff_branch(x, id_scopes)
            else:
                # no task-specific hidden layers: both heads read the shared output
                x_char = x
                x_id = x

            # build output layers for each task
            char_out = self.ff_layer(x_char,
                                     self.n_per_layer,
                                     self.n_chars,
                                     char_out_scope,
                                     logits=True)
            id_out = self.ff_layer(x_id,
                                   self.n_per_layer,
                                   self.n_speakers,
                                   id_out_scope,
                                   logits=True)

            # accumulate logit values for each window
            char_logits = char_logits.write(t, char_out)
            id_logits = id_logits.write(t, id_out)

            # accumulate ID d-vectors
            # (the input to the ID output layer, i.e. the last ID hidden activation)
            d_vecs = d_vecs.write(t, x_id)

            return [
                t + 1, t_stop, features, char_logits, id_logits, windows,
                d_vecs
            ]

        # note that because there are no dependencies between time steps, we
        # can run the loop iterations in parallel (doesn't make much of a diff)
        loop_output = tf.while_loop(cond,
                                    body,
                                    loop_vars,
                                    parallel_iterations=20)

        # use squeeze to create 2D instead of 3D arrays
        # (indices 3..6 follow the loop_vars order above)
        self.c_logits = loop_output[3].stack()  # can't squeeze b/c ctc loss
        self.i_logits = tf.squeeze(loop_output[4].stack())
        self.all_windows = tf.squeeze(loop_output[5].stack())
        self.d_vecs = tf.squeeze(loop_output[6].stack())

        # The stacked char logits carry a size-1 middle dimension (each write is
        # [1, n_chars]), so the sequence-length vector has a single entry.
        n_windows = tf.expand_dims(n_windows, 0)

        char_loss = tf.nn.ctc_loss(self.targets,
                                   self.c_logits,
                                   n_windows,
                                   preprocess_collapse_repeated=False,
                                   ctc_merge_repeated=True,
                                   ignore_longer_outputs_than_inputs=True,
                                   time_major=True)

        # One speaker label per window is expected here
        # (presumably the caller repeats the speaker id -- TODO confirm).
        id_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.i_logits, labels=self.speaker)

        # TODO: figure out a good weighting scheme for combining losses
        self.cost = tf.reduce_sum(id_loss) + tf.reduce_sum(char_loss)

        # build the loss and an op for doing parameter updates
        tvars = tf.trainable_variables()
        grads = tf.gradients(self.cost, tvars)
        grads, _ = tf.clip_by_global_norm(grads, 5.0)  # avoid explosions

        optimizer = tf.train.RMSPropOptimizer(learning_rate)

        self.train_step = optimizer.apply_gradients(zip(grads, tvars),
                                                    global_step=global_step)

        # Per-window speaker prediction: index of the largest ID logit.
        self.speaker_decode = tf.argmax(self.i_logits,
                                        axis=-1,
                                        output_type=tf.int32)
        if ctc_beam_search:
            self.char_decode, _ = tf.nn.ctc_beam_search_decoder(
                self.c_logits, n_windows)
        else:
            self.char_decode, _ = tf.nn.ctc_greedy_decoder(
                self.c_logits, n_windows)

        # Label error rate: mean edit distance between the top decoded
        # sequence and the targets (tf.edit_distance defaults apply).
        self.ler = tf.reduce_mean(
            tf.edit_distance(tf.cast(self.char_decode[0], tf.int32),
                             self.targets))

        self.built = True
Example #19
0
    def build_graph(self, args, maxTimeSteps):
        """Assemble the bidirectional-RNN CTC model inside a fresh ``tf.Graph``.

        Creates the input/target placeholders, the recurrent stack (via
        ``build_multi_dynamic_brnn``), a per-time-step fully connected
        classifier, the CTC loss, the Adam training op (optionally with global
        norm gradient clipping), a beam-search decoder and the saver.

        Args:
            args: option namespace; the fields read here are ``batch_size``,
                ``num_feature``, ``model``, ``num_layer``, ``num_hidden``,
                ``num_class``, ``activation``, ``optimizer``, ``learning_rate``,
                ``keep_prob``, ``grad_clip``, ``level`` and ``log_dir``.
            maxTimeSteps: number of time steps in every (padded) input batch.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Time-major input batch: [maxTimeSteps, batch_size, num_feature].
            input_shape = (maxTimeSteps, args.batch_size, args.num_feature)
            self.inputX = tf.placeholder(tf.float32, shape=input_shape)

            # Flatten to [maxTimeSteps * batch_size, num_feature] and cut it
            # into maxTimeSteps pieces along axis 0, one per time step.
            flat_frames = tf.reshape(self.inputX, [-1, args.num_feature])
            self.inputList = tf.split(flat_frames, maxTimeSteps, 0)

            # Components of the sparse target tensor consumed by the CTC loss.
            self.targetIxs = tf.placeholder(tf.int64)
            self.targetVals = tf.placeholder(tf.int32)
            self.targetShape = tf.placeholder(tf.int64)
            self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                           self.targetShape)
            self.seqLengths = tf.placeholder(tf.int32, shape=args.batch_size)

            # Snapshot of the hyper-parameters this graph was built with.
            self.config = {
                'name': args.model,
                'rnncell': self.cell_fn,
                'num_layer': args.num_layer,
                'num_hidden': args.num_hidden,
                'num_class': args.num_class,
                'activation': args.activation,
                'optimizer': args.optimizer,
                'learning rate': args.learning_rate,
                'keep prob': args.keep_prob,
                'batch size': args.batch_size
            }

            rnn_outputs = build_multi_dynamic_brnn(self.args, maxTimeSteps,
                                                   self.inputX, self.cell_fn,
                                                   self.seqLengths)

            # Shared linear classifier applied to each element of rnn_outputs.
            with tf.name_scope('fc-layer'):
                with tf.variable_scope('fc'):
                    class_weights_init = tf.truncated_normal(
                        [args.num_hidden, args.num_class],
                        name='weightsClasses')
                    class_weights = tf.Variable(class_weights_init)
                    class_biases = tf.Variable(tf.zeros([args.num_class]),
                                               name='biasesClasses')
                    logits = []
                    for step_output in rnn_outputs:
                        logits.append(
                            tf.matmul(step_output, class_weights) +
                            class_biases)
            logits3d = tf.stack(logits)

            per_sample_loss = tf.nn.ctc_loss(self.targetY, logits3d,
                                             self.seqLengths)
            self.loss = tf.reduce_mean(per_sample_loss)
            self.var_op = tf.global_variables()
            self.var_trainable_op = tf.trainable_variables()

            if args.grad_clip != -1:
                # Clip the gradients by global norm before applying them.
                raw_grads = tf.gradients(self.loss, self.var_trainable_op)
                clipped_grads, _ = tf.clip_by_global_norm(
                    raw_grads, args.grad_clip)
                adam = tf.train.AdamOptimizer(args.learning_rate)
                self.optimizer = adam.apply_gradients(
                    zip(clipped_grads, self.var_trainable_op))
            else:
                # grad_clip == -1 disables gradient clipping.
                adam = tf.train.AdamOptimizer(args.learning_rate)
                self.optimizer = adam.minimize(self.loss)

            # Best beam-search hypothesis, cast to int32 to match the targets.
            decoded_paths, _ = tf.nn.ctc_beam_search_decoder(
                logits3d, self.seqLengths, merge_repeated=False)
            self.predictions = tf.to_int32(decoded_paths[0])

            # errorRate is only defined for character-level training.
            if args.level == 'cha':
                normalized_distance = tf.edit_distance(self.predictions,
                                                       self.targetY,
                                                       normalize=True)
                self.errorRate = tf.reduce_sum(normalized_distance)

            self.initial_op = tf.global_variables_initializer()
            self.saver = tf.train.Saver(tf.global_variables(),
                                        max_to_keep=5,
                                        keep_checkpoint_every_n_hours=1)

            # Log file name: timestamp with blanks and slashes stripped,
            # appended to log_dir as given.
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            log_name = (timestamp + '.txt').replace(' ', '').replace('/', '')
            self.logfile = args.log_dir + log_name
Example #20
0
    def __init__(self,
                 stochastic=False,
                 use_slope=True,
                 variational_dropout=False,
                 vocabularySize=283,
                 label_size=50,
                 rnnSize=256,
                 n_layers=3,
                 dropout=0.5,
                 zoneout=0.1,
                 embedding_size=None,
                 dtype=tf.float32,
                 clip=0.35,
                 k_width=3,
                 name='hlstm',
                 conv_filter=3,
                 mid_filter=25,
                 batch_size=128):
        """Build the full graph: conv front-end, stacked LSTMs and two CTC heads.

        The constructor stores the hyper-parameters, creates the placeholders
        and then builds, in order: two conv + batch-norm + tanh layers,
        ``n_layers`` LSTM layers (with a stride-2 depthwise time convolution
        inserted after layer ``n_layers - 3``), a character-level CTC head and
        a word-piece-level CTC head, each with its own Adam optimizer and
        greedy decoder, and finally a ``tf.train.Saver``.

        Args:
            stochastic, use_slope, zoneout, k_width, mid_filter, name,
                batch_size: stored on ``self``; not otherwise read in this
                constructor.
            variational_dropout: forwarded to ``DropoutWrapper`` as
                ``variational_recurrent``.
            vocabularySize: word-piece vocabulary size; the word head has
                ``vocabularySize + 1`` outputs.
            label_size: number of outputs of the character head.
            rnnSize: number of units in every LSTM layer.
            n_layers: number of stacked LSTM layers.
            dropout: used as the output *keep* probability while training.
            embedding_size: accepted but not used in this constructor.
            dtype: dtype for the RNN computations.
            clip: global-norm threshold for gradient clipping.
            conv_filter: side length of the square conv kernels.

        NOTE(review): ``character_h`` is only assigned when the LSTM loop hits
        ``i == n_layers - 3`` with ``i`` in ``range(1, n_layers)``. With the
        default ``n_layers=3`` that never happens, so building the character
        head would raise ``NameError`` -- confirm the intended layer index.
        """
        self.rnnSize = rnnSize
        self.vocabularySize = vocabularySize
        self.outputSize = label_size
        self.stochastic = stochastic

        self.dtype = dtype
        self.dropout = dropout
        self.n_layers = n_layers
        self.clip = clip
        self.name = name
        self.use_slope = use_slope
        self.zoneout = zoneout
        self.batch_size = batch_size
        self.k_width = k_width
        self.conv_filter = conv_filter

        self.mid_filter = mid_filter
        # forget-gate bias for every LSTM layer except the first
        f_bias = 0.0

        # placeholders
        self.x = tf.placeholder(tf.float32, [None, None, 40, 3],
                                name='x')  # [batch, time, 40, 3]
        self.label = tf.sparse_placeholder(tf.int32,
                                           name='label')  # sparse target labels
        self.seq_len = tf.placeholder(tf.int32, [None],
                                      name='seq_len')  # [batch_size]

        self.is_train = tf.placeholder(tf.bool, [], name='train')
        self.lr = tf.placeholder(tf.float32, [], name='lr')
        # Keep probability: self.dropout while training, 1.0 (no dropout) otherwise.
        dropout_p = tf.where(self.is_train, self.dropout, 1.0)
        dropout_p = tf.cast(dropout_p, dtype=self.dtype)

        # LSTM layers
        self.lstm_cells = []
        conv_filter_size = (self.conv_filter, self.conv_filter)

        # conv0: 32 filters, stride 2 on both the time and the feature axis.
        h = tf.layers.conv2d(self.x,
                             32,
                             conv_filter_size, (2, 2),
                             'same',
                             use_bias=False,
                             name='conv0')
        h = tf.contrib.layers.batch_norm(h,
                                         center=True,
                                         scale=True,
                                         is_training=self.is_train,
                                         decay=0.9,
                                         epsilon=1e-3,
                                         scope='bn0')
        h = tf.nn.tanh(h, name='tanh0')

        # conv1: 32 filters, stride 2 on the feature axis only.
        h = tf.layers.conv2d(h,
                             32,
                             conv_filter_size, (1, 2),
                             'same',
                             use_bias=False,
                             name='conv1')
        h = tf.contrib.layers.batch_norm(h,
                                         center=True,
                                         scale=True,
                                         is_training=self.is_train,
                                         decay=0.9,
                                         epsilon=1e-3,
                                         scope='bn1')
        h = tf.nn.tanh(h, name='tanh1')
        # Stride (and kernel length) of the time convolution between LSTM layers.
        time_convolution = 2
        # Sequence lengths for the two CTC heads; the word head sees the
        # sequence after the stride-2 time convolution, hence seq_len / 2.
        # NOTE(review): conv0 already halves the time axis -- presumably
        # seq_len is fed as the post-conv length; confirm with the caller.
        _seq_len_char = self.seq_len
        _seq_len_word = tf.div(self.seq_len, 2)
        #reshape
        # ([0] : batch_size, [1] : seq_len, [2]*[3] : feature dimension)
        h_shape = tf.shape(h)
        # 320 = 10 * 32: the 40 input features are halved twice (conv0, conv1)
        # and there are 32 channels. Hard-coded to the 40-feature placeholder.
        h = tf.reshape(h, [h_shape[0], h_shape[1], 320])

        # First LSTM layer (uses the LSTMCell default forget bias).
        cell = LSTMCell(self.rnnSize,
                        initializer=tf.variance_scaling_initializer(
                            1.0, 'fan_out', 'uniform'))
        #cell = LSTMCell(self.rnnSize)
        cell = DropoutWrapper(cell,
                              output_keep_prob=dropout_p,
                              variational_recurrent=variational_dropout,
                              dtype=self.dtype)

        self.lstm_cells.append(cell)

        # Remaining LSTM layers, with forget_bias=f_bias.
        for i in range(self.n_layers - 1):
            cell = LSTMCell(self.rnnSize,
                            initializer=tf.variance_scaling_initializer(
                                1.0, 'fan_out', 'uniform'),
                            forget_bias=f_bias)
            #cell = SRUCell(self.rnnSize,  initializer = tf.variance_scaling_initializer(1.0, 'fan_out', 'uniform'))
            cell = DropoutWrapper(cell,
                                  output_keep_prob=dropout_p,
                                  variational_recurrent=variational_dropout,
                                  dtype=self.dtype)
            #cell = HLSTMCell(self.rnnSize)
            self.lstm_cells.append(cell)

#        self.h = []
#        self.gate = []
        # Each layer is unrolled in its own variable scope ('lstm0', 'lstm1', ...).
        with tf.variable_scope('lstm0'):

            _h, last_state = tf.nn.dynamic_rnn(cell=self.lstm_cells[0],
                                               inputs=h,
                                               dtype=self.dtype)

        lstm_input = _h
        for i in range(1, self.n_layers):
            with tf.variable_scope('lstm' + str(i)):
                output, last_state = tf.nn.dynamic_rnn(cell=self.lstm_cells[i],
                                                       inputs=lstm_input,
                                                       dtype=self.dtype)
                lstm_input = output

            # After layer n_layers - 3: tap the output for the character head
            # and halve the time axis for the layers above.
            # NOTE(review): never true for n_layers <= 3 (i starts at 1), which
            # leaves character_h undefined below.
            if i == self.n_layers - 3:
                character_h = output
                # [batch, time, rnnSize] -> [batch, time, 1, rnnSize] for conv2d
                lstm_input = tf.expand_dims(lstm_input, -2)
                conv_filter = tf.get_variable(
                    'lstm_time_conv_filter',
                    shape=[time_convolution, 1, self.rnnSize, 1],
                    trainable=True)
                lstm_input = tf.nn.depthwise_conv2d(
                    lstm_input,
                    conv_filter, [1, time_convolution, time_convolution, 1],
                    padding='SAME',
                    name='lstm_time_conv')
                lstm_input = tf.squeeze(lstm_input, axis=[-2])
        lstm_output = lstm_input

        # character-ctc layer
        h_shape = tf.shape(character_h)
        # Flatten batch and time so one dense layer scores every frame.
        output_h = tf.reshape(character_h, [-1, self.rnnSize])
        print(output_h)

        with tf.variable_scope('dense_character'):
            dense = tf.layers.dense(
                output_h,
                self.outputSize,
                kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))

        # Back to batch-major [batch, time, outputSize].
        self.char_logit = tf.reshape(dense,
                                     [h_shape[0], h_shape[1], self.outputSize])
        self.char_loss = tf.nn.ctc_loss(inputs=self.char_logit,
                                        labels=self.label,
                                        sequence_length=_seq_len_char,
                                        time_major=False)
        self.char_loss = tf.reduce_mean(self.char_loss)
        train_loss = self.char_loss
        char_opt = tf.train.AdamOptimizer(self.lr)
        # Run the collected update ops (e.g. batch-norm statistics) with each step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            grad, var = zip(*char_opt.compute_gradients(train_loss))
            clipped_gradients, _ = tf.clip_by_global_norm(grad, clip)
            self.char_optimizer = char_opt.apply_gradients(
                zip(clipped_gradients, var))

        # The greedy decoder expects time-major logits, hence the transpose.
        self.sentence, _ = tf.nn.ctc_greedy_decoder(
            tf.transpose(self.char_logit, (1, 0, 2)), _seq_len_char)
        # Character error rate: mean edit distance to the labels.
        self.cer = tf.reduce_mean(
            tf.edit_distance(tf.cast(self.sentence[0], tf.int32), self.label))

        # wordpiece-ctc layer
        h_shape = tf.shape(lstm_output)
        output_h = tf.reshape(lstm_output, [-1, self.rnnSize])
        print(output_h)

        with tf.variable_scope('dense_wordpiece'):
            dense = tf.layers.dense(
                output_h,
                self.vocabularySize + 1,
                kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))

        self.word_logit = tf.reshape(
            dense, [h_shape[0], h_shape[1], self.vocabularySize + 1])
        # NOTE(review): the word-piece loss is trained against self.label, the
        # same sparse tensor as the character head; the commented-out code
        # below refers to a separate wp_label -- confirm which is intended.
        self.word_loss = tf.nn.ctc_loss(inputs=self.word_logit,
                                        labels=self.label,
                                        sequence_length=_seq_len_word,
                                        time_major=False)
        self.word_loss = tf.reduce_mean(self.word_loss)
        train_loss = self.word_loss
        word_opt = tf.train.AdamOptimizer(self.lr)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            grad, var = zip(*word_opt.compute_gradients(train_loss))
            clipped_gradients, _ = tf.clip_by_global_norm(grad, clip)
            self.word_optimizer = word_opt.apply_gradients(
                zip(clipped_gradients, var))

        self.word_sentence, _ = tf.nn.ctc_greedy_decoder(
            tf.transpose(self.word_logit, (1, 0, 2)), _seq_len_word)
        print(self.word_sentence)
        # Densify the top decoded path; missing positions are filled with 2.
        self.word_sentence = tf.sparse_tensor_to_dense(self.word_sentence[0],
                                                       default_value=2)
        #self.word_sentence = tf.sparse_tensor_to_dense(self.word_sentence.indices, self.word_sentence.shape, self.word_sentence.values,default_value = 2 )
        #self.word_distance = tf.reduce_mean(tf.edit_distance(tf.cast(word_sentence[0], tf.int32),self.wp_label))
        # Per-frame log-probabilities of the word-piece head.
        self.logsoftmax = tf.nn.log_softmax(self.word_logit)

        self.saver = tf.train.Saver()
Example #21
0
def train():
    """Train the CTC OCR model and print per-epoch statistics.

    Builds the loss, optimizer, beam-search decoder and label-error-rate op
    around ``model.get_train_model()``, restores the latest checkpoint from
    ``models`` when one exists (otherwise initialises all variables), then
    runs ``num_epochs`` epochs of ``common.BATCHES`` batches each. Every
    ``common.REPORT_STEPS`` global steps the validation set is decoded,
    reported through ``report_accuracy`` and a checkpoint is saved.
    """
    test_names, test_inputs, test_targets, test_seq_len = utils.get_data_set(
        'valid.txt')
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(
        common.INITIAL_LEARNING_RATE,
        global_step,
        common.DECAY_STEPS,
        common.LEARNING_RATE_DECAY_FACTOR,
        staircase=True)
    # The three extra return values (weights/bias) are not needed here.
    logits, inputs, targets, seq_len, _, _, _ = model.get_train_model()

    # Keyword arguments: the positional order of (inputs, labels) differs
    # between TensorFlow releases, the keyword names do not.
    loss = tf.nn.ctc_loss(labels=targets,
                          inputs=logits,
                          sequence_length=seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                           momentum=common.MOMENTUM).minimize(
                                               cost, global_step=global_step)

    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits,
                                                      seq_len,
                                                      merge_repeated=False)

    # Despite the name this is an error measure: mean edit distance between
    # the best decoded path and the targets.
    acc = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    # Feed for the held-out data loaded from 'valid.txt'.
    validation_feed = {
        inputs: test_inputs,
        targets: test_targets,
        seq_len: test_seq_len
    }

    def do_report(steps):
        """Decode the validation set, report accuracy and save a checkpoint."""
        dd = session.run(decoded[0], validation_feed)
        accuracy = report_accuracy(dd, test_targets, test_names)
        saver.save(session,
                   "models/ocr.model-" + str(accuracy),
                   global_step=steps)

    def do_batch(batch_inputs, batch_targets, batch_seq_len):
        """Run one optimisation step; return (batch cost, global step)."""
        feed = {
            inputs: batch_inputs,
            targets: batch_targets,
            seq_len: batch_seq_len
        }
        b_cost, steps, _ = session.run([cost, global_step, optimizer], feed)
        if steps > 0 and steps % common.REPORT_STEPS == 0:
            # Pass the current step explicitly; relying on the enclosing
            # scope would use a stale (or not yet bound) value.
            do_report(steps)
        return b_cost, steps

    with tf.Session(config=tf.ConfigProto(
            log_device_placement=True)) as session:
        ckpt = tf.train.get_checkpoint_state("models")
        if ckpt and ckpt.model_checkpoint_path:
            saver = tf.train.Saver()
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("no checkpoint found")
            # Initialize the weights and biases
            init = tf.initialize_all_variables()
            session.run(init)
            saver = tf.train.Saver(tf.all_variables(), max_to_keep=100)
        for curr_epoch in range(num_epochs):

            print("Epoch.......", curr_epoch)
            epoch_start = time.time()
            # NOTE: train_ler is never accumulated in this function, so the
            # epoch log always shows 0 for it.
            train_cost = train_ler = 0
            for batch in range(common.BATCHES):
                start = time.time()
                train_names, train_inputs, train_targets, train_seq_len = utils.get_data_set(
                    'trainimg.txt', batch * common.BATCH_SIZE,
                    (batch + 1) * common.BATCH_SIZE)

                print("get data time", time.time() - start)
                start = time.time()
                c, steps = do_batch(train_inputs, train_targets,
                                    train_seq_len)
                train_cost += c * common.BATCH_SIZE
                seconds = time.time() - start
                print("Step:", steps, ", batch seconds:", seconds)

            train_cost /= common.TRAIN_SIZE

            # Validation metrics are computed on the validation data rather
            # than on the last training batch.
            val_cost, val_ler, lr, steps = session.run(
                [cost, acc, learning_rate, global_step],
                feed_dict=validation_feed)

            log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
            print(
                log.format(curr_epoch + 1, num_epochs, steps, train_cost,
                           train_ler, val_cost, val_ler,
                           time.time() - epoch_start, lr))
Example #22
0
    def build_graph(self, args, maxTimeSteps):
        """Build the capsule-network CTC graph in the current default graph.

        Creates the input/target placeholders, one plain convolution, a stack
        of ``args.num_layer`` convolutional capsule layers, a final 'dnn'
        capsule layer that produces the class logits, the CTC loss, the Adam
        training op (optionally with global-norm gradient clipping), a
        beam-search decoder and the saver.

        Args:
            args: option namespace; the fields read here are ``batch_size``,
                ``num_feature``, ``model``, ``num_layer``, ``num_hidden``,
                ``num_class`` (or ``num_classes``), ``activation``,
                ``optimizer``, ``learning_rate``, ``keep_prob``, ``num_iter``,
                ``grad_clip`` and ``level``.
            maxTimeSteps: number of time steps in every (padded) input batch.
        """
        self.maxTimeSteps = maxTimeSteps
        # Time-major input batch: [maxTimeSteps, batch_size, num_feature].
        self.inputX = tf.placeholder(
            tf.float32,
            shape=[maxTimeSteps, args.batch_size, args.num_feature])

        # define tf.SparseTensor for ctc loss
        self.targetIxs = tf.placeholder(tf.int64)
        self.targetVals = tf.placeholder(tf.int32)
        self.targetShape = tf.placeholder(tf.int64)
        self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                       self.targetShape)
        self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))

        self.config = {
            'name': args.model,
            'num_layer': args.num_layer,
            'num_hidden': args.num_hidden,
            'num_class': args.num_class,
            'activation': args.activation,
            'optimizer': args.optimizer,
            'learning rate': args.learning_rate,
            'keep prob': args.keep_prob,
            'batch size': args.batch_size
        }

        # Convert time-major [T, B, F] to batch-major [B, T, F, 1] for conv2d.
        # The axes must be swapped with a transpose: a plain reshape keeps the
        # memory order and would mix frames of different samples.
        inputX = tf.expand_dims(tf.transpose(self.inputX, perm=[1, 0, 2]), -1)
        print(inputX.get_shape())
        with tf.variable_scope("layer_conv1"):
            # shape of kernel: [filter_height, filter_width, in_channels, out_channels]
            kernel = tf.get_variable("kernel",
                                     shape=[3, 3, 1, 16],
                                     dtype=tf.float32)
            # shape of conv1:  [batch, height, width, channels]
            conv1 = tf.nn.conv2d(inputX, kernel, (1, 1, 1, 1), padding='VALID')

        print(conv1.get_shape())
        output = conv1
        for layer_id in range(args.num_layer):
            vars_scope = "capsule_cnn_layer_" + str(layer_id + 1)
            # (self, num_capsules, num_channels, output_vector_len, layer_type='conv', vars_scope=None):
            capLayer = CapsuleLayer(4,
                                    8,
                                    2,
                                    layer_type='conv',
                                    vars_scope=vars_scope)
            # (self, inputX, kernel_size, strides, routing=True, padding='VALID'):
            output = capLayer(output, [2, 2], (1, 1, 1, 1), args.num_iter)
            print(output.get_shape())

        # last dnn layer for classification
        vars_scope = "capsule_dnn_layer"
        # The config above reads args.num_class; honour args.num_classes when
        # the caller provides it and fall back to num_class otherwise, so a
        # namespace with only one of the two spellings does not crash here.
        num_classes = getattr(args, 'num_classes', args.num_class)
        capLayer = CapsuleLayer(8,
                                16,
                                num_classes,
                                layer_type='dnn',
                                vars_scope=vars_scope)
        logits3d = capLayer(output, [3, 3], (1, 1, 1, 1), args.num_iter)
        # Back to time-major, the default layout expected by ctc_loss and the
        # beam-search decoder.
        logits3d = tf.transpose(logits3d, perm=[1, 0, 2])
        self.loss = tf.reduce_mean(
            tf.nn.ctc_loss(self.targetY, logits3d, self.seqLengths))
        self.var_op = tf.global_variables()
        self.var_trainable_op = tf.trainable_variables()
        if args.grad_clip == -1:
            # not apply gradient clipping
            self.optimizer = tf.train.AdamOptimizer(
                args.learning_rate).minimize(self.loss)
        else:
            # apply gradient clipping
            grads, _ = tf.clip_by_global_norm(
                tf.gradients(self.loss, self.var_trainable_op), args.grad_clip)
            opti = tf.train.AdamOptimizer(args.learning_rate)
            self.optimizer = opti.apply_gradients(
                zip(grads, self.var_trainable_op))
        # Best beam-search hypothesis, cast to int32 to match the targets.
        self.predictions = tf.to_int32(
            tf.nn.ctc_beam_search_decoder(logits3d,
                                          self.seqLengths,
                                          merge_repeated=False)[0][0])
        # errorRate is only defined for character-level training.
        if args.level == 'cha':
            self.errorRate = tf.reduce_sum(
                tf.edit_distance(self.predictions,
                                 self.targetY,
                                 normalize=True))
        self.initial_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=5,
                                    keep_checkpoint_every_n_hours=1)
Example #23
0
    def _get_cer(self, pred_chars: Tensor) -> None:
        """Create the Character Error Rate (CER) op and store it in ``self._cer``.

        The CER is the batch mean of ``tf.edit_distance`` between
        ``pred_chars`` and ``self._targets``; the resulting op is named 'cer'.
        """
        per_sample_distance = tf.edit_distance(pred_chars, self._targets)
        self._cer = tf.reduce_mean(per_sample_distance, name='cer')
Example #24
0
def train():
    """Build the CTC training graph for ``CNNLSTM`` and run the training loop.

    Relies on module-level settings (``INITIAL_LEARNING_RATE``,
    ``DECAY_STEPS``, ``LEARNING_RATE_DECAY_FACTOR``, ``B_SIZE``,
    ``num_epochs``, ``train_dir``, ``val_dir``) and helpers
    (``CNNLSTM``, ``DataIterator``, ``report_accuracy``).

    Behaviour:
      * The latest checkpoint in ``models`` is restored when present;
        otherwise all global variables are initialized.
      * Every 10000 global steps one validation batch is decoded, scored
        with ``report_accuracy`` and a checkpoint is written under
        ``models/``.
      * After each epoch one validation batch is evaluated and a summary
        line is printed. ``train_cost`` in that line is the mean cost per
        training sample and ``time`` is the wall time of the whole epoch.
    """
    S = 'train'
    m = CNNLSTM(S)
    m.build_graph()
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                               global_step,
                                               DECAY_STEPS,
                                               LEARNING_RATE_DECAY_FACTOR,
                                               staircase=True)
    loss = tf.nn.ctc_loss(labels=m.labels,
                          inputs=m.logits,
                          sequence_length=m.seq_len)
    cost = tf.reduce_mean(loss)

    # NOTE(review): the optimizer is given the per-example `loss`, not the
    # scalar `cost`. Kept as in the original -- confirm which is intended.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=0.9,
                                       beta2=0.999).minimize(
                                           loss, global_step=global_step)

    # Beam-search decoding; only the best path (decoded[0]) is used below.
    decoded, _ = tf.nn.ctc_beam_search_decoder(m.logits,
                                               m.seq_len,
                                               merge_repeated=False)

    # Label error rate: mean edit distance between best path and labels.
    acc = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), m.labels))

    print('loading train data, please wait---------------------')
    train_feeder = DataIterator(data_dir=train_dir)
    print('get image: ', train_feeder.size)

    print('loading validation data, please wait---------------------')
    val_feeder = DataIterator(data_dir=val_dir)
    print('get image: ', val_feeder.size)

    num_train_samples = train_feeder.size
    num_batches_per_epoch = int(num_train_samples / B_SIZE)
    shuffle_idx = np.random.permutation(num_train_samples)

    num_val_samples = val_feeder.size
    shuffle_idx_val = np.random.permutation(num_val_samples)

    def batch_indices(shuffled, num_samples, batch_no):
        """Return the B_SIZE shuffled sample indices of batch `batch_no`.

        Positions wrap around `num_samples`, so a batch index past the end
        of the data set re-uses samples from its beginning.
        """
        first = batch_no * B_SIZE
        return [
            shuffled[i % num_samples] for i in range(first, first + B_SIZE)
        ]

    def do_report(batch_no, steps):
        """Decode one validation batch, score it and save a checkpoint."""
        indexs_val = batch_indices(shuffle_idx_val, num_val_samples, batch_no)
        val_inputs, _, val_labels = val_feeder.input_index_generate_batch(
            indexs_val)
        # NOTE(review): m.seq_len is not fed here -- presumably the model
        # supplies it itself; verify against CNNLSTM.
        test_feed = {m.inputs: val_inputs, m.labels: val_labels}
        dd = session.run(decoded[0], test_feed)
        accuracy = report_accuracy(dd, val_labels)

        saver.save(session,
                   "models/ocr.model-" + str(accuracy),
                   global_step=steps)

    def do_batch(batch_no, train_inputs, train_labels):
        """Run one optimizer step; return (batch cost, global step)."""
        feed = {m.inputs: train_inputs, m.labels: train_labels}
        b_cost, steps, _ = session.run([cost, global_step, optimizer], feed)
        if steps > 0 and steps % 10000 == 0:
            # Pass the step explicitly: the enclosing scope's value is not
            # updated until this function returns.
            do_report(batch_no, steps)
        return b_cost, steps

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.30)
    with tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options, log_device_placement=True)) as session:
        ckpt = tf.train.get_checkpoint_state("models")
        if ckpt and ckpt.model_checkpoint_path:
            saver = tf.train.Saver()
            saver.restore(session, ckpt.model_checkpoint_path)
        else:
            print("no checkpoint found")
            # Initialize the weights and biases from scratch.
            session.run(tf.global_variables_initializer())

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
        for curr_epoch in range(num_epochs):
            print("Epoch.......", curr_epoch)

            epoch_start = time.time()
            # NOTE(review): train_ler is never updated, so the log line
            # always reports 0 for it.
            train_cost = train_ler = 0
            for batch in range(num_batches_per_epoch):
                indexs = batch_indices(shuffle_idx, num_train_samples, batch)
                train_inputs, _, train_labels = (
                    train_feeder.input_index_generate_batch(indexs))

                c, steps = do_batch(batch, train_inputs, train_labels)
                train_cost += c * B_SIZE

            # Each batch contributed `mean cost * B_SIZE`; normalise by the
            # number of samples processed to get the mean cost per sample.
            train_cost /= max(num_batches_per_epoch * B_SIZE, 1)

            # Validate on the batch position of the last training batch.
            last_batch = max(num_batches_per_epoch - 1, 0)
            indexs_val = batch_indices(shuffle_idx_val, num_val_samples,
                                       last_batch)
            val_inputs, _, val_labels = val_feeder.input_index_generate_batch(
                indexs_val)
            val_feed = {m.inputs: val_inputs, m.labels: val_labels}
            val_cost, val_ler, lr, steps = session.run(
                [cost, acc, learning_rate, global_step], feed_dict=val_feed)
            log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
            print(
                log.format(curr_epoch + 1, num_epochs, steps, train_cost,
                           train_ler, val_cost, val_ler,
                           time.time() - epoch_start, lr))
Example #25
0
    def create(self, imageHeight, imageWidth, num_classes, evalFLAG):
        """Assemble the conv + BLSTM + CTC graph and return its handles.

        Five convolution stages (each: ``CNN`` -> ``max_pool`` -> dropout)
        are followed by five bidirectional LSTM stages, a linear layer, the
        CTC loss, an Adam training op, a beam-search decoder and two
        edit-distance metrics. Dropout is active only when ``evalFLAG`` is
        false.

        Args:
            imageHeight: height used for the input placeholder.
            imageWidth: (padded) width used for the input placeholder.
            num_classes: size of the last dimension of the logits.
            evalFLAG: truthy for evaluation graphs; also enables the image
                and edit-distance summaries.

        Returns:
            A tuple ``(graph, [saver, transfer_saver], inputs, seq_len,
            targets, targets_len, learning_rate, n_batches, setTotalChars,
            previousEDabs, previousEDnorm, previousCost, optimizer,
            batch_cost, cost, errors, ED, predictions, merged)`` where
            ``ED`` is ``[EDnorm, EDabs]``.
        """
        num_hidden = 256
        conv_keep_prob = 0.8
        lstm_keep_prob = 0.5
        training = not evalFLAG

        # One entry per conv stage: (filters, pooling window, dropout rate).
        conv_stages = [
            (16, [2, 2], 0.0),
            (32, [2, 2], (1 - conv_keep_prob)),
            (48, [2, 2], (1 - conv_keep_prob)),
            (64, [1, 1], (1 - conv_keep_prob)),
            (80, [1, 1], (1 - lstm_keep_prob)),
        ]
        num_blstm_stages = 5

        graph = tf.Graph()
        with graph.as_default():

            with tf.name_scope('Inputs'):
                inputs = tf.placeholder(tf.float32,
                                        [None, imageHeight, imageWidth, 1],
                                        name='inputs')
                if evalFLAG:
                    tf.summary.image('inputs', inputs, max_outputs=1)

            # seq_len should be fed with a list containing the real width
            # of each image before it was padded to imageWidth.
            seq_len = tf.placeholder(tf.int32, [None], name='seq_len')
            targets = tf.sparse_placeholder(tf.int32, name='targets')
            targets_len = tf.placeholder(tf.int32, name='targets_len')

            # Convolutional feature extractor; max_pool also hands back the
            # updated sequence lengths and image dimensions.
            features = inputs
            pooled_seq_len = seq_len
            for stage, (n_filters, pool_window,
                        drop_rate) in enumerate(conv_stages, start=1):
                with tf.name_scope('Layer_Conv_%d' % stage):
                    conv_out = CNN(
                        x=features,
                        filters=n_filters,
                        kernel_size=[3, 3],
                        strides=[1, 1],
                        name='conv%d' % stage,
                        activation=tf.nn.leaky_relu,
                        evalFLAG=evalFLAG,
                        initializer=tf.contrib.layers.xavier_initializer(
                            uniform=False))
                    (features, pooled_seq_len, imageHeight,
                     imageWidth) = max_pool(conv_out, pool_window,
                                            pooled_seq_len, imageHeight,
                                            imageWidth, evalFLAG)
                    features = tf.layers.dropout(features,
                                                 rate=drop_rate,
                                                 training=training)

            with tf.name_scope('Reshaping_step'):
                # Move the width axis first, fold height and the 80 channels
                # of the last conv stage together, then swap the first two
                # axes back.
                sequence = tf.transpose(features, (2, 0, 1, 3))
                sequence = tf.reshape(
                    sequence, (int(imageWidth), -1, int(imageHeight * 80)))
                sequence = tf.transpose(sequence, (1, 0, 2))

            # Recurrent stack: each stage concatenates the outputs returned
            # by bidirectionalLSTM along axis 2 and applies dropout.
            for stage in range(1, num_blstm_stages + 1):
                with tf.name_scope('Layer_BLSTM_%d' % stage):
                    sequence = bidirectionalLSTM(sequence, num_hidden,
                                                 pooled_seq_len, str(stage),
                                                 evalFLAG)
                    sequence = tf.concat(sequence, 2)
                    sequence = tf.layers.dropout(sequence,
                                                 rate=(1 - lstm_keep_prob),
                                                 training=training)

            with tf.name_scope('Layer_Linear') as ns:
                flat = tf.transpose(sequence, (1, 0, 2))
                flat = tf.reshape(flat, (-1, 2 * num_hidden))
                logits = FNN(flat, num_classes, ns, None, evalFLAG)

            with tf.name_scope('Logits'):
                logits = tf.reshape(logits, (int(imageWidth), -1, num_classes))

            # The lengths given to the CTC loss and decoder are never
            # shorter than the target lengths.
            ctc_seq_len = tf.maximum(pooled_seq_len, targets_len)

            n_batches = tf.placeholder(tf.float32, name='n_batches')
            previousCost = tf.placeholder(tf.float32, name='previous_cost')

            with tf.name_scope('CTC_Loss'):
                loss = tf.nn.ctc_loss(targets,
                                      logits,
                                      ctc_seq_len,
                                      preprocess_collapse_repeated=False,
                                      ctc_merge_repeated=True)
                with tf.name_scope('total'):
                    batch_cost = tf.reduce_mean(loss)
                    # Running average: caller feeds the cost accumulated so
                    # far through previousCost.
                    cost = batch_cost / n_batches + previousCost

            tf.summary.scalar('CTC_loss', cost)

            with tf.name_scope('train'):
                trainable = tf.GraphKeys.TRAINABLE_VARIABLES
                train_vars = (
                    tf.get_collection(trainable, scope='Layer_Linear') +
                    tf.get_collection(trainable, scope='BLSTM[12345]') +
                    tf.get_collection(trainable, scope='conv[12345]'))
                print(train_vars)
                learning_rate = tf.placeholder(tf.float32,
                                               name='learning_rate')
                adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
                optimizer = adam.minimize(batch_cost)

            with tf.name_scope('predictions'):
                predictions, log_prob = tf.nn.ctc_beam_search_decoder(
                    logits, ctc_seq_len, merge_repeated=False)

            with tf.name_scope('CER'):
                with tf.name_scope('Mean_CER_per_word'):
                    previousEDnorm = tf.placeholder(tf.float32,
                                                    name='previousEDnorm')
                    mean_norm_distance = tf.reduce_mean(
                        tf.edit_distance(tf.cast(predictions[0], tf.int32),
                                         targets,
                                         normalize=True))
                    EDnorm = mean_norm_distance / n_batches + previousEDnorm

                    if evalFLAG:
                        tf.summary.scalar('EDnorm', EDnorm)

                with tf.name_scope('Absolute_CER_total_set'):
                    setTotalChars = tf.placeholder(tf.float32,
                                                   name='setTotalChars')
                    previousEDabs = tf.placeholder(tf.float32,
                                                   name='previousEDabs')
                    errors = tf.edit_distance(
                        tf.cast(predictions[0], tf.int32),
                        targets,
                        normalize=False)
                    total_errors = tf.reduce_sum(errors)
                    EDabs = total_errors / setTotalChars + previousEDabs
                    if evalFLAG:
                        tf.summary.scalar('EDabs', EDabs)

            ED = [EDnorm, EDabs]

            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=5,
                                   keep_checkpoint_every_n_hours=24)

            # Second saver restricted to the BLSTM and conv variables,
            # keyed by their op names.
            global_vars = tf.GraphKeys.GLOBAL_VARIABLES
            transferred_vars = (
                tf.get_collection(global_vars, scope="BLSTM[12345]") +
                tf.get_collection(global_vars, scope="conv"))
            transferred_vars_dict = {
                var.op.name: var
                for var in transferred_vars
            }
            transfer_saver = tf.train.Saver(transferred_vars_dict)

            merged = tf.summary.merge_all()

            savers = [saver, transfer_saver]
            return (graph, savers, inputs, seq_len, targets, targets_len,
                    learning_rate, n_batches, setTotalChars, previousEDabs,
                    previousEDnorm, previousCost, optimizer, batch_cost, cost,
                    errors, ED, predictions, merged)
Example #26
0
def ler(y_true, y_pred, **kwargs):
    """Label Error Rate: the mean of ``tf.edit_distance(y_pred, y_true)``.

    Args:
        y_true: reference sequences (second argument of
            ``tf.edit_distance``).
        y_pred: hypothesis sequences (first argument of
            ``tf.edit_distance``).
        **kwargs: forwarded unchanged to ``tf.edit_distance``.

    Returns:
        The mean of all per-sequence edit distances.
    """
    distances = tf.edit_distance(y_pred, y_true, **kwargs)
    return tf.reduce_mean(distances)
Example #27
0
def main(_):
    """Train the CRNN model on SynthText and validate it periodically.

    Builds training and validation input pipelines, a shared-weight CRNN
    for both, the loss, a beam-search decoder with two edit-distance
    metrics, and an Adam training op. Inside the session, variables are
    either initialized from scratch or restored according to
    ``FLAGS.load`` / ``FLAGS.mode``. Every ``FLAGS.save_steps`` steps the
    validation metrics are averaged over ``FLAGS.sample_size`` runs,
    written as summaries, and a checkpoint is saved.

    Args:
        _: unused (conventionally the argv list passed by the app runner).
    """
    batch_size = FLAGS.batch_size
    num_epochs = FLAGS.epoch
    checkpoint_dir = FLAGS.checkpoint_dir

    with tf.Graph().as_default():

        # The step counter is a placeholder fed from the Python loop below,
        # not a graph variable.
        global_step = tf.placeholder(tf.int64, name='global_step')

        tr_file_names = [os.path.join("/mnt/sdb/mark/SynthText/", "synthtext_train.tfrecords")]
        te_file_names = [os.path.join("/mnt/sdb/mark/SynthText/", "synthtext_test.tfrecords")]

        sh_images, sh_labels, sh_length, sh_width = read_utils.inputs(
            filename=tr_file_names, batch_size=batch_size,
            num_epochs=num_epochs, preprocess=True)
        val_images, val_labels, val_length, val_width = read_utils.inputs(
            filename=te_file_names, batch_size=batch_size,
            num_epochs=10000*num_epochs, preprocess=True)

        # Build Model
        crnn = model.CRNNNet()

        with tf.variable_scope('crnn'):
            logits, seq_len = crnn.net(sh_images, sh_width, is_training=True, kp=1.0)
            # The validation network re-uses the training variables.
            tf.get_variable_scope().reuse_variables()
            val_logits, val_seq_len = crnn.net(val_images, val_width, is_training=False, kp=1.0)

        loss = crnn.losses(sh_labels, logits, seq_len)
        tf.summary.scalar("train/loss", loss)
        tf.summary.image("train/inputs", sh_images)

        val_loss = crnn.losses(val_labels, val_logits, val_seq_len)
        # TODO: BK-tree NN search
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
            tf.transpose(val_logits, perm=[1, 0, 2]), val_seq_len,
            merge_repeated=False)

        # Despite the names these are edit distances (lower is better):
        # `acc` is unnormalized, `acc_norm` uses the default normalization.
        acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels, normalize=False))
        acc_norm = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels))

        # Placeholders that carry the Python-side validation averages into
        # the "test/*" summaries.
        val_loss_sum = tf.placeholder(tf.float32, name='val_loss_sum')
        val_acc_sum = tf.placeholder(tf.float32, name='val_acc_sum')
        val_acc_norm_sum = tf.placeholder(tf.float32, name='val_acc_norm_sum')

        tf.summary.scalar("test/val_loss", val_loss_sum)
        tf.summary.scalar("test/edit_distance", val_acc_sum)
        tf.summary.scalar("test/edit_distance_norm", val_acc_norm_sum)

        starter_learning_rate = FLAGS.learning_rate
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                   FLAGS.lr_decay_step, FLAGS.lr_decay_rate, staircase=True)
        tf.summary.scalar("train/learning_rate", learning_rate)

        train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        # Run the collected update ops together with each training step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss=loss, var_list=train_vars)

        # Start Training
        # NOTE(review): `config` is not defined in this function --
        # presumably a module-level tf.ConfigProto; confirm.
        with tf.Session(config=config) as sess:
            if FLAGS.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            save = tf.train.Saver(max_to_keep=50)

            raw_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='crnn/CRNN_net/')
            init_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='crnn/ResNet/')
            # Saver for the ResNet variables only, keyed by their names
            # without the 'crnn/ResNet/' prefix and skipping Adam slots.
            pretrain = tf.train.Saver({v.op.name.replace('crnn/ResNet/', ''): v for v in init_vars if v.op.name.find('Adam') == -1})

            if not FLAGS.load:
                base_step = 0
                init_op = tf.group(tf.global_variables_initializer(),
                                   tf.local_variables_initializer())

                sess.run(init_op)
            else:
                # The step is taken from the checkpoint file name suffix
                # ("...-<step>"); fall back to 0 when it cannot be parsed.
                try:
                    base_step = int(FLAGS.ckpt_file.split('-')[-1])
                except (AttributeError, ValueError):
                    base_step = 0

                if FLAGS.mode == 'raw':
                    # Restore every variable from the checkpoint.
                    ckpt_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.ckpt_file)
                    save.restore(sess, ckpt_path)
                    sess.run(tf.local_variables_initializer())

                elif FLAGS.mode == 'pretrain':
                    # Initialize everything the `pretrain` saver does not
                    # restore, then load the ResNet weights.
                    ckpt_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.ckpt_file)
                    init_op = tf.group(tf.local_variables_initializer(), tf.variables_initializer([v for v in init_vars if v.op.name.find('Adam') != -1] + raw_vars + [v for v in tf.global_variables() if v.op.name.find('crnn/') == -1]))
                    sess.run(init_op)
                    pretrain.restore(sess, ckpt_path)

            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='train/*'))
            val_merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='test/*'))

            file_writer = tf.summary.FileWriter(FLAGS.logdir, sess.graph)

            try:

                step = 0
                while not coord.should_stop():
                    start_time = time.time()

                    _, merged_t, tr_loss, lr, step, db_labels, db_images, db_logits = sess.run(
                        [optimizer, merged, loss, learning_rate, global_step, sh_labels, sh_images, logits],
                        feed_dict={global_step: step})

                    duration = time.time() - start_time

                    print("loss", tr_loss, "time", duration)
                    file_writer.add_summary(merged_t, step)

                    # Periodic validation + checkpoint.
                    if step % FLAGS.save_steps == 0 and step > 0:

                        val_loss_s, val_acc_s, val_acc_norm_s = 0, 0, 0
                        for ite in range(FLAGS.sample_size):
                            te_loss, te_acc, te_acc_norm = sess.run([val_loss, acc, acc_norm])
                            val_loss_s += te_loss
                            val_acc_s += te_acc
                            val_acc_norm_s += te_acc_norm
                        val_loss_s /= FLAGS.sample_size
                        val_acc_s /= FLAGS.sample_size
                        val_acc_norm_s /= FLAGS.sample_size

                        print('Step %d: loss %.3f acc %.3f %.3f (%.3f sec)' % (step, val_loss_s, val_acc_s, val_acc_norm_s, duration))

                        # Add summary
                        val_sum = sess.run(val_merged, feed_dict={val_loss_sum: val_loss_s, val_acc_sum: val_acc_s, val_acc_norm_sum: val_acc_norm_s})
                        file_writer.add_summary(val_sum, step)

                        save.save(sess, os.path.join(FLAGS.checkpoint_dir, 'model.ckpt'), global_step=step+base_step)
                    step += 1

            except tf.errors.OutOfRangeError:
                print('Done training for %d epochs, %d steps.' % (num_epochs, step))
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()

            # Wait for threads to finish.
            coord.join(threads)
Example #28
0
def create_graph_for_validation_ctc(pipeline, nnet_config):
    """Build the CTC validation graph and return its tensors in a dict.

    Args:
        pipeline: mapping providing ``'nnet_input'``, ``'sequence_length'``
            and ``'nnet_target'``. Target entries equal to -1 are dropped
            when the dense target is converted to a sparse tensor.
        nnet_config: configuration; its ``'nnet_type'`` entry selects the
            logits builder via ``get_create_logits`` and the whole object is
            passed on to that builder.

    Returns:
        dict with the keys ``nnet_input``, ``sequence_length``, ``logits``
        (as returned by the builder, before the transpose), ``raw_target``,
        ``nnet_target`` (sparse int32), ``size``, ``eval_loss`` (summed CTC
        loss), ``loss`` (CTC loss plus extra losses), ``eval`` (summed
        unnormalized edit distance), ``global_step`` and ``summary``. Every
        entry is also added to a graph collection named after its key.
    """
    graph = dict()

    nnet_input = pipeline['nnet_input']
    graph['nnet_input'] = nnet_input

    sequence_length = pipeline['sequence_length']
    graph['sequence_length'] = sequence_length

    nnet_type = nnet_config.get('nnet_type')
    create_logits = get_create_logits(nnet_type)
    logits, encoder, reg_loss = create_logits(
        nnet_input=nnet_input,
        sequence_length=sequence_length,
        nnet_config=nnet_config,
    )
    graph['logits'] = logits

    # Convert from [batch, time, target] to [time, batch, target]
    logits = tf.transpose(logits, (1, 0, 2))

    # Dense -> sparse targets: keep only the entries that are not -1.
    nnet_target = pipeline['nnet_target']
    graph['raw_target'] = nnet_target
    sparse_indices = tf.where(
        tf.not_equal(nnet_target, tf.constant(-1, dtype=tf.int64)))
    sparse_values = tf.gather_nd(params=nnet_target, indices=sparse_indices)
    dense_shape = tf.cast(x=tf.shape(nnet_target), dtype=tf.int64)
    sparse = tf.SparseTensor(
        indices=sparse_indices,
        values=sparse_values,
        dense_shape=dense_shape,
    )
    nnet_target = tf.cast(x=sparse, dtype=tf.int32)
    graph['nnet_target'] = nnet_target
    # Number of kept (non -1) target entries -- this is what 'size' holds,
    # even though the local is called batch_size.
    batch_size = tf.shape(sparse_values)[0]
    graph['size'] = batch_size

    loss = tf.nn.ctc_loss(labels=nnet_target,
                          inputs=logits,
                          sequence_length=sequence_length,
                          ignore_longer_outputs_than_inputs=True)

    loss = tf.reduce_sum(loss)
    tf.summary.scalar('loss', loss)
    graph['eval_loss'] = loss

    # Accumulate the extra losses returned by the logits builder.
    # presumably each item is a (loss, weight) pair -- verify against the
    # builders returned by get_create_logits.
    other_weights = 0
    other_loss = None
    for item in reg_loss:
        if item[0] is not None and item[1] is not None \
           and item[1] > 0:
            other_weights += item[1]
            # Identity check: `==` on a tensor is not a None test.
            if other_loss is None:
                other_loss = item[0]
            else:
                other_loss += item[0]

    if other_loss is not None and other_weights != 0:
        # The extra losses are added unweighted.
        loss = loss + other_loss

    graph['loss'] = loss  # keep track of the total loss

    decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(
        inputs=logits, sequence_length=sequence_length, merge_repeated=True)
    dist = tf.reduce_sum(
        tf.edit_distance(tf.cast(decoded[0], tf.int64),
                         tf.cast(nnet_target, tf.int64),
                         normalize=False), )
    graph['eval'] = dist

    # Running this op increments the global step by one.
    global_step = tf.train.get_or_create_global_step()
    global_step = tf.assign(global_step, global_step + 1, name='global_step')
    graph['global_step'] = global_step

    summary = tf.summary.merge_all()
    graph['summary'] = summary

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for key, val in graph.items():
        tf.add_to_collection(key, val)

    return graph
Example #29
0
def train_shadownet(dataset_dir, weights_path=None, decode: bool=False, num_threads=4):
    """
    Build the ShadowNet (CRNN) CTC training graph and run the training loop.

    :param dataset_dir: directory that contains 'train_feature.tfrecords'
    :param weights_path: checkpoint to restore before training; when None the
        variables are initialised from scratch
    :param decode: when True every step also fetches the beam-search decoding
        so that the sequence distance and accuracy can be logged, and a
        'Seq_Dist' summary is recorded
    :param num_threads: number of reader threads, forwarded to
        decoder.read_features
    :return: numpy array holding the cost fetched at each training step
    """
    # Load config
    cfg = load_config().cfg

    # decode the tf records to get the training data
    decoder = data_utils.TextFeatureIO().reader
    input_images, input_labels, input_image_names = decoder.read_features(
        ops.join(dataset_dir, 'train_feature.tfrecords'), cfg.TRAIN.BATCH_SIZE, num_threads)

    # initialise the net model
    shadownet = crnn_model.ShadowNet(phase='Train',
                                     hidden_nums=cfg.ARCH.HIDDEN_UNITS,
                                     layers_nums=cfg.ARCH.HIDDEN_LAYERS,
                                     num_classes=len(decoder.char_dict) + 1)

    with tf.variable_scope('shadow', reuse=False):
        net_out = shadownet.build_shadownet(inputdata=input_images)

    # Every sample of the batch is given the same, configured sequence length.
    seq_lengths = cfg.ARCH.SEQ_LENGTH * np.ones(cfg.TRAIN.BATCH_SIZE)

    cost = tf.reduce_mean(tf.nn.ctc_loss(labels=input_labels, inputs=net_out,
                                         sequence_length=seq_lengths))

    decoded, log_prob = tf.nn.ctc_beam_search_decoder(net_out,
                                                      seq_lengths,
                                                      merge_repeated=False)

    sequence_dist = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), input_labels))

    global_step = tf.Variable(0, name='global_step', trainable=False)

    starter_learning_rate = cfg.TRAIN.LEARNING_RATE
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                               cfg.TRAIN.LR_DECAY_STEPS, cfg.TRAIN.LR_DECAY_RATE,
                                               staircase=True)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # Run the collected update ops together with every optimisation step.
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=learning_rate).minimize(
            loss=cost, global_step=global_step)

    # Setup TF summary
    tboard_save_path = 'tboard/shadownet'
    os.makedirs(tboard_save_path, exist_ok=True)
    tf.summary.scalar(name='Cost', tensor=cost)
    tf.summary.scalar(name='Learning_Rate', tensor=learning_rate)
    if decode:
        tf.summary.scalar(name='Seq_Dist', tensor=sequence_dist)

    merge_summary_op = tf.summary.merge_all()

    # Set saver configuration
    saver = tf.train.Saver()
    model_save_dir = cfg.PATH.CRNN_MODEL_SAVE_DIR
    os.makedirs(model_save_dir, exist_ok=True)
    train_start_time = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    model_name = 'shadownet_{:s}.ckpt'.format(str(train_start_time))
    model_save_path = ops.join(model_save_dir, model_name)

    # Set sess configuration
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.per_process_gpu_memory_fraction = cfg.TRAIN.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = cfg.TRAIN.TF_ALLOW_GROWTH

    sess = tf.Session(config=sess_config)
    summary_writer = tf.summary.FileWriter(tboard_save_path)

    # The session and the event writer own native resources / buffered
    # events, so release them even when training is interrupted by an error.
    try:
        summary_writer.add_graph(sess.graph)

        # Set the training parameters
        train_epochs = cfg.TRAIN.EPOCHS

        with sess.as_default():
            if weights_path is None:
                logger.info('Training from scratch')
                init = tf.global_variables_initializer()
                sess.run(init)
            else:
                logger.info('Restore model from {:s}'.format(weights_path))
                saver.restore(sess=sess, save_path=weights_path)

            cost_history = []
            # NOTE: each "epoch" below is a single sess.run, i.e. one batch.
            for epoch in range(train_epochs):
                if decode:
                    _, c, seq_distance, predictions, labels, summary = sess.run(
                        [optimizer, cost, sequence_dist, decoded, input_labels, merge_summary_op])

                    labels = decoder.sparse_tensor_to_str(labels)
                    predictions = decoder.sparse_tensor_to_str(predictions[0])
                    accuracy = compute_accuracy(labels, predictions)

                    if epoch % cfg.TRAIN.DISPLAY_STEP == 0:
                        logger.info('Epoch: {:d} cost= {:9f} seq distance= {:9f} train accuracy= {:9f}'.format(
                            epoch + 1, c, seq_distance, accuracy))

                else:
                    _, c, summary = sess.run([optimizer, cost, merge_summary_op])
                    if epoch % cfg.TRAIN.DISPLAY_STEP == 0:
                        logger.info('Epoch: {:d} cost= {:9f}'.format(epoch + 1, c))

                cost_history.append(c)
                summary_writer.add_summary(summary=summary, global_step=epoch)
                saver.save(sess=sess, save_path=model_save_path, global_step=epoch)

            return np.array(cost_history)
    finally:
        summary_writer.close()
        sess.close()
Example #30
0
    def model_fn(self, features, labels, mode, params):
        """ Model function for transformer.

        Builds a CNN feature extractor, a transformer predicting pinyin and
        viseme sequences, and a co-attention transformer predicting the final
        label sequence. The total loss is the sum of the three padded
        cross-entropy losses.

        Args:
            features: dict with keys 'video' (input clip fed to the CNN
                extractor) and 'unpadded_length' (input lengths).
            labels: dict with keys 'viseme', 'pinyin' and 'label'.
            mode: Indicate train or eval or predict. Only TRAIN and EVAL
                return a spec; any other mode falls through and returns None.
            params: dict. model parameters. Keys read here: 'feature_extractor',
                'hidden_size', 'co_attention', 'label_smoothing',
                'target_pinyin_vocab_size', 'target_viseme_vocab_size'.
                NOTE: this dict is updated in place with 'pinyin_vocab_size'
                and 'viseme_vocab_size'.

        Returns: tf.estimator.EstimatorSpec.

        """
        #learning_rate = params.get('learning_rate', 0.001)

        in_training = mode == tf.estimator.ModeKeys.TRAIN

        video = features['video']
        inputs_unpadded_length = features['unpadded_length']

        # Pick the CNN front-end; anything unrecognised falls back to LipNet.
        if params.get('feature_extractor') == 'early_fusion':
            from .cnn_extractor import EarlyFusion2D as CnnExtractor
        elif params.get('feature_extractor') == 'res':
            from .cnn_extractor import ResNet as CnnExtractor
        else:
            from .cnn_extractor import LipNet as CnnExtractor

        feature_extractor = CnnExtractor(feature_len=params.get('hidden_size'),
                                         training=in_training,
                                         scope='cnn_feature_extractor')

        inputs = feature_extractor.build(
            video)  # [batch_size, input_length, hidden_size]

        params.update({'pinyin_vocab_size': len(self.pinyin_dic)})
        params.update({'viseme_vocab_size': len(self.viseme_dic)})
        v_p_transformer = Transformer1(params,
                                       in_training,
                                       scope="v_p_transformer")

        # The label transformer gets its own copy of the parameters so the
        # vocabulary overrides do not leak into `params`.
        label_params = params.copy()
        label_params.update({'vocab_size': len(self.label_dic)})
        label_params.update({'target_vocab_size': len(self.label_dic)})
        label_params.update({'scope': "v_c_transformer"})
        label_params.update({'dic': self.label_dic})

        # Select the co-attention variant; unknown values use Transformer_co4.
        if params.get('co_attention') == 1:
            v_c_transformer = Transformer_co1(label_params,
                                              in_training,
                                              scope="v_c_transformer")
        elif params.get('co_attention') == 2:
            v_c_transformer = Transformer_co2(label_params,
                                              in_training,
                                              scope="v_c_transformer")
        elif params.get('co_attention') == 3:
            v_c_transformer = Transformer_co3(label_params,
                                              in_training,
                                              scope="v_c_transformer")
        else:
            v_c_transformer = Transformer_co4(label_params,
                                              in_training,
                                              scope="v_c_transformer")

        viseme_labels = labels['viseme']
        sparse_viseme, viseme_string = self.procesess_label(
            viseme_labels, self.viseme_dic)
        viseme_char_list_labels = label_util.string2char_list(viseme_string)

        pinyin_labels = tf.squeeze(labels['pinyin'])  # [batch_size, ]
        pinyin_char_list_labels, pinyin_labels = self.preprocess_labels(
            pinyin_labels, self.pinyin_dic)  # [batch_size, target_length]
        pinyin_string = self.id_to_string(pinyin_labels, self.pinyin_dic)

        label_targets = labels['label']
        label_index, label_string = self.procesess_label(
            label_targets, self.label_dic)

        # First stage, called with the ground-truth pinyin/viseme targets.
        pinyin_logits, viseme_logits, encode, attention_bias = v_p_transformer(
            inputs, inputs_unpadded_length, pinyin_labels, viseme_labels)
        pinyin_sequence = tf.argmax(pinyin_logits, 2)
        viseme_sequence = tf.argmax(viseme_logits, 2)
        pinyin_embedded = tf.contrib.layers.embed_sequence(
            ids=pinyin_sequence,
            vocab_size=params["target_pinyin_vocab_size"],
            embed_dim=512)
        viseme_embedded = tf.contrib.layers.embed_sequence(
            ids=viseme_sequence,
            vocab_size=params["target_viseme_vocab_size"],
            embed_dim=512)

        # Calculate model loss.
        # Each loss is the summed cross entropy divided by the summed weights
        # returned by metrics.padded_cross_entropy_loss.

        pinyin_xentropy, pinyin_weights = metrics.padded_cross_entropy_loss(
            pinyin_logits, pinyin_labels, params["label_smoothing"],
            params["pinyin_vocab_size"])
        pinyin_loss = tf.reduce_sum(pinyin_xentropy) / tf.reduce_sum(
            pinyin_weights)

        viseme_xentropy, viseme_weights = metrics.padded_cross_entropy_loss(
            viseme_logits, viseme_labels, params["label_smoothing"],
            params["viseme_vocab_size"])
        viseme_loss = tf.reduce_sum(viseme_xentropy) / tf.reduce_sum(
            viseme_weights)

        # Lengths of the predicted sequences: number of non-zero ids minus one.
        pinyin_unpadded_length = tf.cast(
            tf.count_nonzero(pinyin_sequence, 1, keepdims=True) - 1, tf.int32)
        viseme_unpadded_length = tf.cast(
            tf.count_nonzero(viseme_sequence, 1, keepdims=True) - 1, tf.int32)

        # Second stage: predict the label sequence from both embedded streams
        # plus the encoder output of the first stage.
        label_logits = v_c_transformer(tf.cast(pinyin_embedded, tf.float32),
                                       pinyin_unpadded_length,
                                       tf.cast(viseme_embedded, tf.float32),
                                       viseme_unpadded_length, encode,
                                       attention_bias, label_targets)

        label_xentropy, label_weights = metrics.padded_cross_entropy_loss(
            label_logits, label_targets, params["label_smoothing"],
            label_params["vocab_size"])
        label_loss = tf.reduce_sum(label_xentropy) / tf.reduce_sum(
            label_weights)

        loss = label_loss + pinyin_loss + viseme_loss

        # train

        if mode == tf.estimator.ModeKeys.TRAIN:

            train_op, metric_dict = get_train_op_and_metrics(loss, params)

            # if params["ckpt_path"] != "":
            # print('restore from: {}'.format(params["ckpt_path"]))
            # tf.train.init_from_checkpoint(
            # params["ckpt_path"], assignment_map={"/": "/"})

            # Epochs can be quite long. This gives some intermediate information
            # in TensorBoard.
            metric_dict["minibatch_loss"] = loss
            record_scalars(metric_dict)

            # pinyin_sequence / viseme_sequence computed above are reused here
            # instead of building identical argmax ops a second time.
            label_sequence = tf.argmax(label_logits, 2)

            sparse_pinyin_prediction, pinyin_predicted_string = self.procesess_label(
                pinyin_sequence, self.pinyin_dic)
            pinyin_predicted_char_list = label_util.string2char_list(
                pinyin_predicted_string)

            sparse_viseme_prediction, viseme_predicted_string = self.procesess_label(
                viseme_sequence, self.viseme_dic)
            viseme_predicted_char_list = label_util.string2char_list(
                viseme_predicted_string)

            label_predicted_index, label_predicted_string = self.procesess_label(
                label_sequence, self.label_dic)

            ver = self.cal_pinyin_metrics(viseme_char_list_labels,
                                          viseme_predicted_char_list)
            per = self.cal_pinyin_metrics(pinyin_char_list_labels,
                                          pinyin_predicted_char_list)
            cer = tf.edit_distance(label_index,
                                   tf.cast(label_predicted_index, tf.int64))

            logging_hook = tf.train.LoggingTensorHook(
                {
                    'loss': loss,
                    'ver': tf.reduce_mean(ver),
                    'per': tf.reduce_mean(per),
                    'cer': tf.reduce_mean(cer),
                },
                every_n_iter=1,
            )

            # Save loss as a named tensor. This statement used to sit after
            # the return below and therefore never executed.
            tf.identity(loss, "cross_entropy")

            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op,
                                              training_hooks=[logging_hook])

        # eval

        if mode == tf.estimator.ModeKeys.EVAL:
            # Re-run both transformers without targets; the results are then
            # indexed with 'outputs' rather than used as logits tensors.
            pinyin_logits, viseme_logits, encode, attention_bias = v_p_transformer(
                inputs, inputs_unpadded_length, None, None)

            label_logits = v_c_transformer(
                tf.cast(pinyin_embedded, tf.float32), pinyin_unpadded_length,
                tf.cast(viseme_embedded, tf.float32), viseme_unpadded_length,
                encode, attention_bias, None)

            predicted_pinyin = pinyin_logits['outputs']
            sparse_pinyin_prediction, pinyin_predicted_string = self.procesess_label(
                predicted_pinyin, self.pinyin_dic)
            pinyin_predicted_char_list = label_util.string2char_list(
                pinyin_predicted_string)

            predicted_viseme = viseme_logits['outputs']
            sparse_viseme_prediction, viseme_predicted_string = self.procesess_label(
                predicted_viseme, self.viseme_dic)
            viseme_predicted_char_list = label_util.string2char_list(
                viseme_predicted_string)

            label_predictions = label_logits['outputs']
            label_predicted_index, label_predicted_string = self.procesess_label(
                label_predictions, self.label_dic)

            ver = self.cal_pinyin_metrics(viseme_char_list_labels,
                                          viseme_predicted_char_list)
            per = self.cal_pinyin_metrics(pinyin_char_list_labels,
                                          pinyin_predicted_char_list)
            cer = tf.edit_distance(label_index,
                                   tf.cast(label_predicted_index, tf.int64))
            tf.summary.scalar('ver', tf.reduce_mean(ver))
            tf.summary.scalar('per', tf.reduce_mean(per))
            tf.summary.scalar('cer', tf.reduce_mean(cer))

            eval_metric_ops = {
                'ver': tf.metrics.mean(ver),
                'per': tf.metrics.mean(per),
                'cer': tf.metrics.mean(cer),
            }

            def custom_formatter(tensors):
                # The two sentence entries are decoded from bytes to UTF-8
                # text before printing; everything else is printed as-is.
                hook_list = ['predicted_sentence', 'sentence_label']
                ostrs = []
                for k, v in tensors.items():
                    if k in hook_list:
                        v = [str(vv, encoding='UTF8') for vv in v]
                    ostrs.append('{}: {}'.format(k, v))
                return '\n'.join(ostrs)

            logging_hook = tf.train.LoggingTensorHook(
                {
                    'loss': loss,
                    'ver': tf.reduce_mean(ver),
                    'per': tf.reduce_mean(per),
                    'cer': tf.reduce_mean(cer),
                    'viseme_labels': viseme_string[:5],
                    'pinyin_labels': pinyin_string[:5],
                    'sentence_label': label_string[:5],
                    'predicted_viseme': viseme_predicted_string[:5],
                    'predicted_pinyin': pinyin_predicted_string[:5],
                    'predicted_sentence': label_predicted_string[:5],
                },
                every_n_iter=10,
                formatter=custom_formatter)

            # NOTE(review): this previously read `val[0]`, but no `val` is
            # defined in this method (NameError). The decoded label outputs
            # are exposed instead -- confirm this is the intended tensor.
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": label_predictions},
                eval_metric_ops=eval_metric_ops,
                evaluation_hooks=[logging_hook])
Example #31
0
def train():
    """Build the CTC training graph and run the training loop.

    The model comes from get_train_model(). Each epoch runs BATCHES
    optimisation steps on batches from get_next_batch(BATCH_SIZE), reports
    decoding accuracy every REPORT_STEPS global steps, then evaluates cost and
    label error rate on one more batch and prints an epoch summary.
    """
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                               global_step,
                                               DECAY_STEPS,
                                               LEARNING_RATE_DECAY_FACTOR,
                                               staircase=True)
    logits, inputs, targets, seq_len, W, b = get_train_model()

    loss = tf.nn.ctc_loss(labels=targets,
                          inputs=logits,
                          sequence_length=seq_len)
    cost = tf.reduce_mean(loss)

    #optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,momentum=MOMENTUM).minimize(cost, global_step=global_step)
    # NOTE(review): this minimises the un-reduced `loss`, not the mean `cost`
    # used by the commented-out optimizer above. Left unchanged because
    # switching would rescale the gradients -- confirm which one is intended.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        loss, global_step=global_step)
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits,
                                                      seq_len,
                                                      merge_repeated=False)

    # Mean edit distance between the best decoding and the targets.
    acc = tf.reduce_mean(
        tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    init = tf.global_variables_initializer()

    def do_report():
        """Decode one fresh batch and hand the result to report_accuracy."""
        test_inputs, test_targets, test_seq_len = get_next_batch(BATCH_SIZE)
        test_feed = {
            inputs: test_inputs,
            targets: test_targets,
            seq_len: test_seq_len
        }
        dd, log_probs, accuracy = session.run([decoded[0], log_prob, acc],
                                              test_feed)
        report_accuracy(dd, test_targets)
        # decoded_list = decode_sparse_tensor(dd)

    def do_batch():
        """Run one optimisation step; return (batch cost, global step)."""
        train_inputs, train_targets, train_seq_len = get_next_batch(BATCH_SIZE)

        feed = {
            inputs: train_inputs,
            targets: train_targets,
            seq_len: train_seq_len
        }

        # Only the values that are actually used are fetched.
        b_cost, steps, _ = session.run([cost, global_step, optimizer], feed)

        # Same text as the former `print b_cost, steps` statement, written so
        # that it prints identically under Python 2 and Python 3.
        print("%s %s" % (b_cost, steps))
        if steps > 0 and steps % REPORT_STEPS == 0:
            do_report()
            #save_path = saver.save(session, "ocr.model", global_step=steps)
            # print(save_path)
        return b_cost, steps

    with tf.Session() as session:
        session.run(init)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
        for curr_epoch in xrange(num_epochs):
            print("Epoch.......", curr_epoch)
            # Timer for the whole epoch. The per-batch `start` below is reset
            # on every batch and must not be used for the epoch summary.
            epoch_start = time.time()
            # NOTE(review): train_ler is never accumulated, so the summary
            # always reports 0 for it.
            train_cost = train_ler = 0
            for batch in xrange(BATCHES):
                start = time.time()
                c, steps = do_batch()
                train_cost += c * BATCH_SIZE
                seconds = time.time() - start
                print("Step:", steps, ", batch seconds:", seconds)

            train_cost /= TRAIN_SIZE

            # The "val" figures are computed on one more batch drawn from
            # get_next_batch.
            train_inputs, train_targets, train_seq_len = get_next_batch(
                BATCH_SIZE)
            val_feed = {
                inputs: train_inputs,
                targets: train_targets,
                seq_len: train_seq_len
            }

            val_cost, val_ler, lr, steps = session.run(
                [cost, acc, learning_rate, global_step], feed_dict=val_feed)

            log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
            print(
                log.format(curr_epoch + 1, num_epochs, steps, train_cost,
                           train_ler, val_cost, val_ler,
                           time.time() - epoch_start, lr))
Example #32
0
def train_model(ENV,
                train_data=None,
                test_data=None,
                decode=False,
                file_decode=False):
    """Build a bidirectional-LSTM CTC model, then train it or decode audio.

    Args:
        ENV: object providing `output` (checkpoint directory used while
            training) and `model_path` (checkpoint directory read when
            decoding).
        train_data: sequence of batches, each indexed as
            (inputs, (target_indices, target_values, target_shape), seq_len).
            Required when `decode` is False.
        test_data: same layout as `train_data`; only its first batch is
            evaluated after each epoch.
        decode: when False run training; when True restore a checkpoint and
            loop forever decoding audio.
        file_decode: in decode mode, prompt for a wav file path instead of
            recording 'temp.wav' with `sox`.
    """
    graph = tf.Graph()
    with graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_stepsize, num_features], but the
        # batch_size and max_stepsize can vary along each step
        inputs = tf.placeholder(tf.float32, [None, None, num_features])

        # The sparse target is fed as its three components.
        targets_idx = tf.placeholder(tf.int64)
        targets_val = tf.placeholder(tf.int32)
        targets_shape = tf.placeholder(tf.int64)
        targets = tf.SparseTensor(targets_idx, targets_val, targets_shape)
        # 1d array of size [batch_size]
        seq_len = tf.placeholder(tf.int32, [None])

        # Weights & biases
        weight_classes = tf.Variable(
            tf.truncated_normal([num_hidden, num_classes],
                                mean=0,
                                stddev=0.1,
                                dtype=tf.float32))
        bias_classes = tf.Variable(tf.zeros([num_classes]), dtype=tf.float32)

        # Network
        forward_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                               use_peepholes=True,
                                               state_is_tuple=True)
        backward_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                                use_peepholes=True,
                                                state_is_tuple=True)

        stack_forward_cell = tf.nn.rnn_cell.MultiRNNCell([forward_cell] *
                                                         num_layers,
                                                         state_is_tuple=True)
        stack_backward_cell = tf.nn.rnn_cell.MultiRNNCell([backward_cell] *
                                                          num_layers,
                                                          state_is_tuple=True)

        outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            stack_forward_cell,
            stack_backward_cell,
            inputs,
            sequence_length=seq_len,
            time_major=False,  # [batch_size, max_time, num_hidden]
            dtype=tf.float32)
        inputs_shape = tf.shape(inputs)
        # Symbolic batch size: a tf.Tensor, only usable inside the graph.
        batch_size = inputs_shape[0]
        """
        outputs_concate = tf.concat_v2(outputs, 2)
        outputs_concate = tf.reshape(outputs_concate, [-1, 2*num_hidden])
        # logits = tf.matmul(outputs_concate, weight_classes) + bias_classes
        """
        # Both directions are projected with the same weight matrix and summed.
        fw_output = tf.reshape(outputs[0], [-1, num_hidden])
        bw_output = tf.reshape(outputs[1], [-1, num_hidden])
        logits = tf.add(
            tf.add(tf.matmul(fw_output, weight_classes),
                   tf.matmul(bw_output, weight_classes)), bias_classes)

        logits = tf.reshape(logits, [batch_size, -1, num_classes])
        loss = tf.reduce_mean(
            ctc_ops.ctc_loss(logits, targets, seq_len, time_major=False))
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum).minimize(loss)

        # Evaluating
        # decoded, log_prob = ctc_ops.ctc_greedy_decoder(tf.transpose(logits, perm=[1, 0, 2]), seq_len)
        # The decoder is given the logits transposed to time-major order.
        decoded, log_prob = ctc_ops.ctc_beam_search_decoder(
            tf.transpose(logits, perm=[1, 0, 2]), seq_len)
        label_error_rate = tf.reduce_mean(
            tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)

    with tf.Session(graph=graph,
                    config=tf.ConfigProto(gpu_options=gpu_options)) as session:
        session.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=0)
        if not decode:
            # Resume from the latest checkpoint when one exists.
            ckpt = tf.train.get_checkpoint_state(ENV.output)
            if ckpt:
                print('load', ckpt.model_checkpoint_path)
                saver.restore(session, ckpt.model_checkpoint_path)

            total_train_data = len(train_data)
            num_batch = total_train_data
            for curr_epoch in range(num_epochs):
                start = time.time()
                train_cost = 0
                train_ler = 0
                num_examples = 0
                # NOTE(review): the last batch is skipped (num_batch - 1);
                # kept as in the original -- confirm whether it is deliberate.
                for i in range(num_batch - 1):
                    feed = {
                        inputs: train_data[i][0],
                        targets_idx: train_data[i][1][0],
                        targets_val: train_data[i][1][1],
                        targets_shape: train_data[i][1][2],
                        seq_len: train_data[i][2]
                    }
                    batch_cost, _ = session.run([loss, optimizer], feed)
                    # Weight by the real batch size taken from the fed
                    # sequence lengths. Multiplying by the symbolic
                    # `batch_size` tensor would add new graph ops on every
                    # iteration and turn the running totals into tensors.
                    cur_batch_size = len(train_data[i][2])
                    num_examples += cur_batch_size
                    train_cost += batch_cost * cur_batch_size
                    train_ler += session.run(label_error_rate,
                                             feed_dict=feed) * cur_batch_size
                    log = "Epoch {}/{}, iter {}, batch_cost {}"
                    logging.info(
                        log.format(curr_epoch + 1, num_epochs, i, batch_cost))

                # Per-example averages over the batches actually processed.
                if num_examples:
                    train_cost /= num_examples
                    train_ler /= num_examples
                log = "Epoch {}/{}, train_cost {}, train_ler {}"
                logging.info(
                    log.format(curr_epoch + 1, num_epochs, train_cost,
                               train_ler))
                saver.save(session,
                           os.path.join(ENV.output, 'best.ckpt'),
                           global_step=curr_epoch)

                # Evaluate on the first test batch. The dense shape has to
                # come from the test batch too (it was taken from train_data).
                feed_test = {
                    inputs: test_data[0][0],
                    targets_idx: test_data[0][1][0],
                    targets_val: test_data[0][1][1],
                    targets_shape: test_data[0][1][2],
                    seq_len: test_data[0][2]
                }
                test_cost, test_ler = session.run([loss, label_error_rate],
                                                  feed_dict=feed_test)
                log = "Epoch {}/{}, test_cost {}, test_ler {}"
                logging.info(
                    log.format(curr_epoch + 1, num_epochs, test_cost,
                               test_ler))
        else:
            ckpt = tf.train.get_checkpoint_state(ENV.model_path)
            print('load', ckpt.model_checkpoint_path)
            saver = tf.train.Saver()
            saver.restore(session, ckpt.model_checkpoint_path)

            # Interactive decoding loop; it has no exit condition.
            while True:
                if file_decode:
                    wav_file = raw_input('Enter the wav file path:')
                else:
                    wav_file = 'temp.wav'
                    raw_input('Press Enter to start...')
                    try:
                        # Record 16-bit mono 16 kHz audio until interrupted.
                        sox = subprocess.Popen([
                            'sox', '-d', '-b', '16', '-c', '1', '-r', '16000',
                            'temp.wav'
                        ])
                        sox.communicate()
                    except KeyboardInterrupt:
                        os.kill(sox.pid, signal.SIGTERM)
                        if sox.poll() is None:
                            time.sleep(2)
                    print('Done recording')
                features = process_wav(wav_file)
                # The single utterance is replicated into a batch of 16.
                batch_features = np.array([features for i in range(16)])
                batch_seq_len = np.array(
                    [features.shape[0] for i in range(16)])
                print(batch_features.shape)
                feed = {inputs: batch_features, seq_len: batch_seq_len}
                d, oc = session.run([decoded[0], outputs], feed_dict=feed)
                dsp = d.shape
                res = []
                # Map decoded ids back to phoneme names; phoneme_set_39 values
                # are compared against id + 1.
                for label in d.values[:dsp[1]]:
                    for k, v in phoneme_set_39.items():
                        if v == label + 1:
                            res.append(k)
                print(res)
Example #33
0
def _get_testing(rnn_logits, sequence_length, label, label_length):
    """Build streaming label/sequence error-rate ops for evaluation.

    The logits are decoded with a beam search (width 128, top path only),
    the un-normalized edit distance to `label` is measured per sequence, and
    the per-batch counts are added to local int64 tallies so that the error
    rates can be read after any number of batches.

    Args:
        rnn_logits: logits handed to `tf.nn.ctc_beam_search_decoder`
        sequence_length: sequence lengths handed to the decoder
        label: reference labels, used as the truth for `tf.edit_distance`
        label_length: label lengths; their sum is the batch label count and
            the size of their first dimension is taken as the batch size
    Returns:
        label_error: accumulated label errors / accumulated label count
        sequence_error: accumulated sequences with at least one error /
            accumulated sequence count
        update_metrics: grouped op that adds the current batch to all tallies
        metrics: list of the tally variables, ordered label errors, labels,
            sequence errors, sequences
        predictions: decoded paths as returned by the beam search decoder
    """

    def _tally(name):
        # Registered only as a LOCAL variable so the Saver doesn't try to
        # read it from the saved model checkpoint; starts at zero.
        return tf.Variable(0,
                           trainable=False,
                           name=name,
                           dtype=tf.int64,
                           collections=[tf.GraphKeys.LOCAL_VARIABLES])

    with tf.name_scope("evaluate"):
        predictions, _ = tf.nn.ctc_beam_search_decoder(
            rnn_logits,
            sequence_length,
            beam_width=128,
            top_paths=1,
            merge_repeated=True)
        top_path = tf.cast(predictions[0], tf.int32)  # for edit_distance

        # One raw (un-normalized) edit distance per sequence
        edits_per_sequence = tf.edit_distance(top_path, label,
                                              normalize=False)

        # Counts for the current batch
        label_errors = tf.reduce_sum(edits_per_sequence)
        sequence_errors = tf.count_nonzero(edits_per_sequence, axis=0)
        num_labels = tf.reduce_sum(label_length)
        num_sequences = tf.shape(label_length)[0]

        # Widen to int64 (unsigned would be nicer, but truediv dislikes it)
        label_errors = tf.cast(label_errors, tf.int64)
        sequence_errors = tf.cast(sequence_errors, tf.int64)
        num_labels = tf.cast(num_labels, tf.int64)
        num_sequences = tf.cast(num_sequences, tf.int64)

        # Running totals across batches
        total_label_errors = _tally('total_num_label_errors')
        total_sequence_errors = _tally('total_num_sequence_errors')
        total_labels = _tally('total_num_labels')
        total_sequences = _tally('total_num_sequences')

        # "+=" ops for every total, bundled into a single update op
        add_label_errors = tf.assign_add(total_label_errors, label_errors)
        add_labels = tf.assign_add(total_labels, num_labels)
        add_sequence_errors = tf.assign_add(total_sequence_errors,
                                            sequence_errors)
        add_sequences = tf.assign_add(total_sequences, num_sequences)
        update_metrics = tf.group(add_label_errors, add_labels,
                                  add_sequence_errors, add_sequences)

        metrics = [
            total_label_errors, total_labels,
            total_sequence_errors, total_sequences
        ]

        # Final ratios, computed from the running totals
        label_error = tf.truediv(total_label_errors,
                                 total_labels,
                                 name='label_error')
        sequence_error = tf.truediv(total_sequence_errors,
                                    total_sequences,
                                    name='sequence_error')

    return label_error, sequence_error, update_metrics, metrics, predictions
Example #34
0
 def cer(decoded, targets, targets_length):
     """Return the normalized edit distance between `decoded` and `targets`.

     `decoded` is converted to a sparse tensor with `tf.sparse.from_dense`;
     `targets` is converted with `K.ctc_label_dense_to_sparse` using the
     flattened `targets_length`. Both are cast to int32 before comparison.
     NOTE(review): `tf.sparse.from_dense` presumably drops zero entries, so
     a label id of 0 in `decoded` would be ignored -- confirm against the
     label encoding.
     """
     hypothesis = tf.sparse.from_dense(decoded)
     label_lengths = math_ops.cast(K.flatten(targets_length), dtype='int32')
     truth = tf.cast(
         K.ctc_label_dense_to_sparse(targets, label_lengths), 'int32')
     hypothesis = tf.cast(hypothesis, tf.int32)
     return tf.edit_distance(hypothesis, truth, normalize=True)
Example #35
0
    def _build_graph(self, inputs):
        """Build the CNN -> optional stacked BiLSTM -> FC -> CTC graph.

        Sets `self.batch_size` and `self.cost`, and registers the mean
        normalized edit distance ('error') together with the cost as moving
        summaries. Layer sizes and switches are read from the module-level
        `cfg`.

        Args:
            inputs: five-element sequence unpacked as (image batch, label
                indices, label values, label dense shape, sequence lengths).
                The three label pieces are assembled into a
                `tf.SparseTensor`.
        """
        l, labelidx, labelvalue, labelshape, seqlen = inputs
        tf.summary.image('input_img', l)
        label = tf.SparseTensor(labelidx, labelvalue, labelshape)
        l = tf.cast(l, tf.float32)
        # Rescale so that 0 maps to -1 and 255 maps to +1
        l = l / 255.0 * 2 - 1

        self.batch_size = tf.shape(l)[0]

        # cnn part
        with tf.variable_scope('cnn'):
            feature_height = cfg.input_height
            for i, kernel_height in enumerate(cfg.cnn.kernel_heights):
                out_channel = cfg.cnn.channels[i]
                kernel_width = cfg.cnn.kernel_widths[i]
                stride = cfg.cnn.stride[i]
                l = Conv2D('conv.{}'.format(i),
                           l,
                           out_channel, (kernel_height, kernel_width),
                           cfg.cnn.padding,
                           stride=(1, stride))
                if cfg.cnn.with_bn:
                    l = BatchNorm('bn.{}'.format(i), l)
                l = tf.clip_by_value(l, 0, 20, "clipped_relu.{}".format(i))
                # Track how the convolution shrinks the feature height and
                # the per-example sequence length (width is strided).
                if cfg.cnn.padding == "VALID":
                    feature_height = feature_height - kernel_height + 1
                    seqlen = tf.cast(
                        tf.ceil(
                            (tf.cast(seqlen, tf.float32) - kernel_width + 1) /
                            stride), tf.int32)
                else:
                    seqlen = tf.cast(
                        tf.ceil((tf.cast(seqlen, tf.float32)) / stride),
                        tf.int32)

            feature_size = feature_height * out_channel

        # rnn part
        # Swap axes 1 and 2, then fold the last two axes into one feature
        # vector per step.
        l = tf.transpose(l, perm=[0, 2, 1, 3])
        l = tf.reshape(l, [self.batch_size, -1, feature_size])

        if cfg.rnn.hidden_layers_no > 0:
            cell_fw = [
                tf.nn.rnn_cell.BasicLSTMCell(cfg.rnn.hidden_size)
                for _ in range(cfg.rnn.hidden_layers_no)
            ]
            cell_bw = [
                tf.nn.rnn_cell.BasicLSTMCell(cfg.rnn.hidden_size)
                for _ in range(cfg.rnn.hidden_layers_no)
            ]
            # The call returns (outputs, states_fw, states_bw); only the
            # outputs are needed. Forward and backward outputs are
            # concatenated, hence the doubled feature size.
            l, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
                cell_fw, cell_bw, l, dtype=tf.float32)
            feature_size = 2 * cfg.rnn.hidden_size

        # fc part
        # Flatten batch and time so the FC layer is applied per step. Using
        # `feature_size` here keeps the reshape valid when no RNN layers
        # are configured (the tensor is then the CNN output itself).
        l = tf.reshape(l, [-1, feature_size])
        output = BatchNorm('bn', l)
        logits = FullyConnected(
            'fc',
            output,
            cfg.label_size,
            nl=tf.identity,
            W_init=tf.truncated_normal_initializer(stddev=0.01))
        logits = tf.reshape(logits, (self.batch_size, -1, cfg.label_size))
        # Not used below; kept so the graph exposes a softmax op named
        # 'logits'.
        softmaxed_logits = tf.nn.softmax(logits, name='logits')

        # ctc output
        loss = tf.nn.ctc_loss(inputs=logits,
                              labels=label,
                              sequence_length=seqlen,
                              ignore_longer_outputs_than_inputs=True,
                              time_major=False)
        if cfg.hard_sample_mining:
            self.cost = hard_loss(loss, self.hard_sample_num, name='cost')
        else:
            self.cost = tf.reduce_mean(loss, name='cost')

        # prediction error
        # The decoder is fed time-major logits.
        logits = tf.transpose(logits, [1, 0, 2])

        # tf.nn.ctc_beam_search_decoder(inputs=logits, sequence_length=seqlen)
        # can be swapped in here in place of the greedy decoder.
        predictions = tf.to_int32(
            tf.nn.ctc_greedy_decoder(inputs=logits,
                                     sequence_length=seqlen)[0][0])

        # Not used below; kept so the graph exposes a dense op named
        # "prediction".
        dense_pred = tf.sparse_tensor_to_dense(predictions, name="prediction")

        err = tf.edit_distance(predictions, label, normalize=True)
        err.set_shape([None])
        err = tf.reduce_mean(err, name='error')
        summary.add_moving_summary(err, self.cost)
Example #36
0
def run_ctc():
    """Train a stacked-LSTM CTC model and print one decoded sample per epoch.

    Builds the graph (placeholders, stacked LSTM, affine projection, CTC
    loss, momentum optimizer, greedy decoder, label error rate) and then
    runs `num_epochs` epochs of `num_batches_per_epoch` steps, each on one
    randomly chosen wav file and its transcript. After every epoch the last
    utterance of that epoch is decoded and printed; every tenth epoch the
    model is saved to `mfile`.

    Relies on module-level settings (`num_features`, `num_hidden`,
    `num_layers`, `num_classes`, `num_epochs`, `num_batches_per_epoch`,
    `sample_rate`, `FIRST_INDEX`, `mfile`) and helpers (`find_files`,
    `read_audio_from_filename`, `convert_inputs_to_ctc_format`).
    """
    graph = tf.Graph()
    with graph.as_default():
        # e.g: log filter bank or MFCC features
        # Has size [batch_size, max_step_size, num_features], but the
        # batch_size and max_step_size can vary along each step
        inputs = tf.placeholder(tf.float32, [None, None, num_features])

        # Here we use sparse_placeholder that will generate a
        # SparseTensor required by ctc_loss op.
        targets = tf.sparse_placeholder(tf.int32)

        # 1d array of size [batch_size]
        seq_len = tf.placeholder(tf.int32, [None])

        # One LSTMCell object per layer. Repeating a single instance
        # ([cell] * num_layers) makes every layer refer to the same cell,
        # which newer TensorFlow 1.x releases reject or weight-tie.
        cells = [
            tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
            for _ in range(num_layers)
        ]

        # Stacking rnn cells
        stack = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

        # The second output is the last state and we will not use that
        outputs, _ = tf.nn.dynamic_rnn(stack, inputs, seq_len, dtype=tf.float32)

        shape = tf.shape(inputs)
        batch_s, max_time_steps = shape[0], shape[1]

        # Reshaping to apply the same weights over the timesteps
        outputs = tf.reshape(outputs, [-1, num_hidden])

        # Truncated normal with mean 0 and stdev=0.1
        W = tf.Variable(tf.truncated_normal([num_hidden,
                                             num_classes],
                                            stddev=0.1))
        # Zero initialization
        b = tf.Variable(tf.constant(0., shape=[num_classes]))

        # Doing the affine projection
        logits = tf.matmul(outputs, W) + b

        # Reshaping back to the original shape
        logits = tf.reshape(logits, [batch_s, -1, num_classes])

        # Time major
        logits = tf.transpose(logits, (1, 0, 2))

        loss = tf.nn.ctc_loss(targets, logits, seq_len)
        cost = tf.reduce_mean(loss)

        optimizer = tf.train.MomentumOptimizer(learning_rate=0.005,
                                               momentum=0.9).minimize(cost)

        # Option 2: tf.contrib.ctc.ctc_beam_search_decoder
        # (it's slower but you'll get better results)
        decoded, log_prob = tf.nn.ctc_greedy_decoder(logits, seq_len)

        # Inaccuracy: label error rate
        ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                              targets))

    files = find_files("/home/burak/Downloads/vctk-p225-small/wav48/p225")

    with tf.Session(graph=graph) as session:

        tf.global_variables_initializer().run()

        saver = tf.train.Saver()

        for curr_epoch in range(num_epochs):
            # Sum of the per-batch label error rates seen so far this epoch
            train_ler = 0
            for batch in range(num_batches_per_epoch):
                filename = random.choice(files)
                # The transcript lives next to the audio: wav48/... -> txt/...
                txtfile = filename.replace("wav48", "txt")
                txtfile = txtfile.replace(".wav", ".txt")
                # Context manager so the transcript handle is always closed
                with open(txtfile) as txt_handle:
                    txt = txt_handle.read()
                audio = read_audio_from_filename(filename, sample_rate)
                out = convert_inputs_to_ctc_format(audio, sample_rate, txt)
                train_inputs, train_targets, train_seq_len, original = out

                feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}

                batch_cost, _ = session.run([cost, optimizer], feed)
                train_ler += session.run(ler, feed_dict=feed)

                # %-formatting prints the same text under Python 2 and 3
                print('batch_cost %s train_ler %s' % (batch_cost, train_ler))

            # Decoding (`feed` still holds the last batch of this epoch)
            d = session.run(decoded[0], feed_dict=feed)
            # d[1] holds the decoded label values; shift them back to chars
            str_decoded = ''.join([chr(x) for x in np.asarray(d[1]) + FIRST_INDEX])
            # Replacing blank label to none
            str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
            # Replacing space label to space
            str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')

            print('Original: %s' % original)
            print('Decoded: %s' % str_decoded)

            if curr_epoch % 10 == 0:
                saver.save(session, mfile)
Example #37
0
def prediction(logits, seq_length, label):
    """Return the total edit distance divided by the number of label values.

    `logits` has its first two axes swapped before being handed to the CTC
    beam search decoder (repeats are not merged). The top decoded path is
    compared with `label` using the un-normalized edit distance, summed
    over all sequences, and divided by the number of entries in
    `label.values`.
    """
    time_major_logits = tf.transpose(logits, perm=[1, 0, 2])
    decoded_paths, _ = tf.nn.ctc_beam_search_decoder(
        time_major_logits, seq_length, merge_repeated=False)
    hypothesis = tf.to_int32(decoded_paths[0])
    total_edits = tf.reduce_sum(
        tf.edit_distance(hypothesis, label, normalize=False))
    num_labels = tf.to_float(tf.size(label.values))
    return total_edits / num_labels
    # NOTE(review): every statement from here to the end of this function
    # body follows the `return` in `prediction` and is therefore never
    # executed. It also reads names that are not parameters of `prediction`
    # (`ctc`, `targets`, `seq_len`, `learning_rate`, `momentum`, `graph`),
    # so it looks like a fragment of a separate training script that lost
    # its header -- confirm before relying on or removing it.
    # Swap dimensions to time major for CTC loss.
    logits = tf.transpose(logits, (1, 0, 2))

    loss = ctc.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    # Record the loss
    tf.contrib.deprecated.scalar_summary('loss', cost)

    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=momentum, use_nesterov=True).minimize(cost)

    decoded, log_prob = ctc.ctc_beam_search_decoder(inputs=logits, sequence_length=seq_len)

    # Label error rate using the edit distance between output and target
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

    # Record the label error rate
    tf.contrib.deprecated.scalar_summary('label error rate', ler)

    # Saver, merged summary op, and one summary writer each for the train
    # and test runs.
    saver = tf.train.Saver()
    merged = tf.contrib.deprecated.merge_all_summaries()
    train_writer = tf.summary.FileWriter('./summaries/train', graph)
    test_writer = tf.summary.FileWriter('./summaries/test', graph)
    
def test_decoding(input_feed_dict, input_original):
    """
    Evaluates the module-level `decoded` op in `session` with the given feed.

    In the code shown here the fetched value is only bound to a local name
    (nothing is printed or returned), and `input_original` is not read.
    """
    decoded_output = session.run(decoded, feed_dict=input_feed_dict)
Example #39
0
    def __init__(self, is_training=True):
        """Build the recognition graph on a private `tf.Graph`.

        The graph consists of an image placeholder, a convolutional front
        end (the 'CNN' stack when `FLAGS.Use_CRNN` is set, otherwise the
        'Dense_CNN' stack), a stacked bidirectional LSTM, a linear
        projection to `num_classes`, the CTC loss with a Nesterov momentum
        optimizer, a beam search decoder and the mean edit distance.

        Args:
            is_training: forwarded as the `training` flag of the batch
                normalization layers and to `dense_block` /
                `transition_block`.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Input images: [batch, image_width, image_height, 1]
            self.inputs = tf.placeholder(
                tf.float32, [None, utils.image_width, utils.image_height, 1])
            # The string literal below is a disabled spatial-transformer
            # (STN) front end; as a bare string expression it has no effect.
            '''with tf.variable_scope('STN'):
                #Localisation net
                conv1_loc = slim.conv2d(self.inputs, 32, [3, 3], scope='conv1_loc')
                pool1_loc = slim.max_pool2d(conv1_loc, [2, 2], scope='pool1_loc')
                conv2_loc = slim.conv2d(pool1_loc, 64, [3, 3], scope='conv2_loc')
                pool2_loc = slim.max_pool2d(conv2_loc, [2, 2], scope='pool2_loc')
                pool2_loc_flat = slim.flatten(pool2_loc)
                fc1_loc = slim.fully_connected(pool2_loc_flat, 1024, scope='fc1_loc')
                fc2_loc = slim.fully_connected(fc1_loc, 128, scope='fc2_loc')
                W = tf.Variable(tf.zeros([128, 20]))
                b = tf.Variable(initial_value=[-1, -0.2, -0.5, -0.35, 0, -0.5, 0.5, -0.67, 1, -0.8,
                                               -1, 0.8, -0.5, 0.65, 0, 0.5, 0.5, 0.33, 1, 0.2], dtype=tf.float32)
                # fc3_loc=tf.layers.dense(fc2_loc,20,activation=tf.nn.tanh,kernel_initializer=tf.zeros_initializer)
                # fc3_loc = slim.fully_connected(fc2_loc, 8, activation_fn=tf.nn.tanh, scope='fc3_loc')
                # spatial transformer
                fc3_loc = tf.nn.tanh(tf.matmul(fc2_loc, W) + b)
                loc = tf.reshape(fc3_loc, [-1, 10, 2])
                # spatial transformer
                s = np.array([[-0.95, -0.95], [-0.5, -0.95], [0, -0.95], [0.5, -0.95], [0.95, -0.95], [-0.95, 0.95], [-0.5, 0.95], [0, 0.95], [0.5, 0.95],
                              [0.95,0.95]] * 256)
                s = tf.constant(s.reshape([256, 10, 2]), dtype=tf.float32)
                self.h_trans = stn(self.inputs, s, loc, (utils.image_width, utils.image_height))'''
            if FLAGS.Use_CRNN:
                with tf.variable_scope('CNN'):
                    # Dropout keep probabilities, fed at run time; one per
                    # dropout layer in this stack.
                    self.keep_prob_cv1 = tf.placeholder("float")
                    self.keep_prob_cv2 = tf.placeholder("float")
                    self.keep_prob_cv3 = tf.placeholder("float")
                    self.keep_prob_cv4 = tf.placeholder("float")
                    net = slim.conv2d(self.inputs, 64, [3, 3], scope='conv1')
                    net = tf.nn.dropout(net, self.keep_prob_cv1)
                    net = slim.max_pool2d(net, [2, 2], scope='pool1')
                    net = slim.conv2d(net, 128, [3, 3], scope='conv2')
                    net = slim.max_pool2d(net, [2, 2], scope='pool2')
                    # conv3: activation disabled so batch norm runs before
                    # the explicit ReLU.
                    net = slim.conv2d(net,
                                      256, [3, 3],
                                      activation_fn=None,
                                      scope='conv3')
                    net = tf.layers.batch_normalization(net,
                                                        training=is_training)
                    net = tf.nn.relu(net)
                    net = tf.nn.dropout(net, self.keep_prob_cv2)
                    net = slim.conv2d(net, 256, [3, 3], scope='conv4')
                    net = slim.max_pool2d(net, [2, 2], [1, 2], scope='pool3')
                    net = slim.conv2d(net,
                                      512, [3, 3],
                                      activation_fn=None,
                                      scope='conv5')
                    # NOTE(review): here (and after conv7) dropout is applied
                    # before batch norm, unlike conv3 where it follows the
                    # ReLU -- confirm this ordering is intended.
                    net = tf.nn.dropout(net, self.keep_prob_cv3)
                    net = tf.layers.batch_normalization(net,
                                                        training=is_training)
                    net = tf.nn.relu(net)
                    net = slim.conv2d(net, 512, [3, 3], scope='conv6')
                    net = slim.max_pool2d(net, [2, 2], [1, 2], scope='pool4')
                    net = slim.conv2d(net,
                                      512, [2, 2],
                                      padding='VALID',
                                      activation_fn=None,
                                      scope='conv7')
                    net = tf.nn.dropout(net, self.keep_prob_cv4)
                    net = tf.layers.batch_normalization(net,
                                                        training=is_training)
                    net = tf.nn.relu(net)
                    # Static size of axis 1 of the CNN output; used below as
                    # the time dimension of the LSTM input.
                    self.cnn_time = net.get_shape().as_list()[1]
                    # Feature size per time step (attribute name is
                    # misspelled, kept as is because it is read elsewhere
                    # in this method).
                    self.num_feauture = 512
            else:
                with tf.variable_scope('Dense_CNN'):
                    nb_filter = 64
                    net = tf.layers.conv2d(self.inputs,
                                           nb_filter,
                                           5, (2, 2),
                                           "SAME",
                                           use_bias=False)
                    # Three dense blocks with a transition block after the
                    # first and the second.
                    net, nb_filter = dense_block(net, 8, 8, nb_filter,
                                                 is_training)
                    net, nb_filter = transition_block(net,
                                                      128,
                                                      is_training,
                                                      pooltype=2)
                    net, nb_filter = dense_block(net, 8, 8, nb_filter,
                                                 is_training)
                    net, nb_filter = transition_block(net,
                                                      128,
                                                      is_training,
                                                      pooltype=3)
                    net, nb_filter = dense_block(net, 8, 8, nb_filter,
                                                 is_training)
                    #net, nb_filter = transition_block(net, 128, is_training, pooltype=3)
                    print(net)
                    #net = tf.layers.conv2d(net, nb_filter, 3, (1, 2), "SAME", use_bias=True)
                    self.cnn_time = net.get_shape().as_list()[1]
                    # NOTE(review): hard-coded feature size; presumably
                    # 4 (remaining height) x 192 (channels) -- TODO confirm
                    # against dense_block / transition_block.
                    self.num_feauture = 4 * 192

            temp_inputs = net
            with tf.variable_scope('BLSTM'):
                # Sparse target labels and per-example sequence lengths
                self.labels = tf.sparse_placeholder(tf.int32)
                self.seq_len = tf.placeholder(tf.int32, [None])
                # Reshape the CNN output to [batch, time, features]
                self.lstm_inputs = tf.reshape(
                    temp_inputs, [-1, self.cnn_time, self.num_feauture])
                outputs = stacked_bidirectional_rnn(tf.contrib.rnn.LSTMCell,
                                                    FLAGS.num_hidden, 2,
                                                    self.lstm_inputs,
                                                    self.seq_len)
            shape = tf.shape(self.lstm_inputs)
            # max_timesteps is not used below.
            batch_s, max_timesteps = shape[0], 40
            # Flatten batch and time so one weight matrix serves all steps
            outputs = tf.reshape(outputs, [-1, FLAGS.num_hidden * 2])
            self.keep_prob_fc = tf.placeholder("float")
            # NOTE(review): h_fc1_drop is never read -- the projection below
            # uses `outputs`, so this dropout does not affect the logits.
            # Confirm whether that is intended.
            h_fc1_drop = tf.nn.dropout(outputs, self.keep_prob_fc)
            W = tf.Variable(tf.truncated_normal(
                [FLAGS.num_hidden * 2, num_classes],
                stddev=0.1,
                dtype=tf.float32),
                            name='W')
            # NOTE: name='b' is passed to tf.constant, not to tf.Variable.
            b = tf.Variable(
                tf.constant(0.,
                            dtype=tf.float32,
                            shape=[num_classes],
                            name='b'))
            logits = tf.matmul(outputs, W) + b
            # Back to [batch, time, num_classes]
            logits = tf.reshape(logits, [batch_s, -1, num_classes])
            # Arg-max class index per time step, taken before CTC decoding
            self.logits_before_ctc = tf.argmax(logits, 2)
            # Time major: [time, batch, num_classes]
            logits = tf.transpose(logits, (1, 0, 2))
            self.global_step = tf.Variable(0, trainable=False)
            # Debug output of the tensors fed to the CTC loss
            print(
                "###########################################################")
            print(self.labels)
            print(logits)
            print(self.seq_len)
            self.loss = tf.nn.ctc_loss(labels=self.labels,
                                       inputs=logits,
                                       sequence_length=self.seq_len)
            self.cost = tf.reduce_mean(self.loss)
            # Stepwise (staircase) exponential decay driven by global_step
            self.learning_rate = tf.train.exponential_decay(
                FLAGS.initial_learning_rate,
                self.global_step,
                FLAGS.decay_steps,
                FLAGS.decay_rate,
                staircase=True)
            self.optimizer = tf.train.MomentumOptimizer(
                learning_rate=self.learning_rate,
                momentum=FLAGS.momentum,
                use_nesterov=True).minimize(self.cost,
                                            global_step=self.global_step)

            self.decoded, self.log_prob = tf.nn.ctc_beam_search_decoder(
                logits, self.seq_len, merge_repeated=False)
            # Dense copy of the top path; positions without a label are -1
            self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0],
                                                           default_value=-1)
            # Mean edit distance between the top path and the labels
            self.lerr = tf.reduce_mean(
                tf.edit_distance(tf.cast(self.decoded[0], tf.int32),
                                 self.labels))

            tf.summary.scalar('cost', self.cost)
            # Attribute name is misspelled ("summay"); kept as is.
            self.merged_summay = tf.summary.merge_all()
Example #40
0
def CheckpointTest():
    """Restore the latest checkpoint (if any) and decode 'input.wav'.

    Rebuilds the BiRNN + CTC graph, restores the newest checkpoint found in
    "saver/" when one exists (otherwise the freshly initialized variables
    are used), runs the greedy CTC decoder on the features of 'input.wav'
    and prints the decoded text.

    Relies on module-level names: `n_input`, `n_context`, `words_size`,
    `words`, `labels`, `BiRNN_model`, `ctc_ops`, `get_speech_file` and
    `ndarray_to_text_ch`.
    """
    # input_tensor is the audio input with structure
    # [batch_size, amax_stepsize, n_input + (2 * n_input * n_context)]:
    # batch_size is the batch length, amax_stepsize the number of time
    # steps, and n_input + (2 * n_input * n_context) the number of MFCC
    # features. batch_size varies, so it is None; the number of time steps
    # differs from batch to batch, so amax_stepsize is None as well.
    input_tensor = tf.placeholder(
        tf.float32, [None, None, n_input + (2 * n_input * n_context)],
        name='input')
    # Use sparse_placeholder; will generate a SparseTensor, required by ctc_loss op.
    # targets holds the text that corresponds to the audio as a sparse tensor.
    targets = tf.sparse_placeholder(tf.int32, name='targets')
    # seq_length holds the number of time steps of each item in the batch
    seq_length = tf.placeholder(tf.int32, [None], name='seq_length')
    # keep_dropout is the dropout parameter
    keep_dropout = tf.placeholder(tf.float32)

    # logits is the non-normalized output/activations from the last layer.
    # logits will be input for the loss function.
    logits = BiRNN_model(input_tensor, tf.to_int64(seq_length), n_input,
                         n_context, words_size + 1, keep_dropout)

    # Compute the loss with CTC
    ctc_losses = ctc_ops.ctc_loss(targets, logits, seq_length)
    avg_loss = tf.reduce_mean(ctc_losses)

    # Optimizer. It is not run in this function.
    # NOTE(review): presumably kept so the graph has the same variables
    # (including optimizer slots) as the one that wrote the checkpoint,
    # since the Saver below covers all variables -- confirm before removing.
    learning_rate = 0.001
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(avg_loss)

    # Use the CTC decoder
    with tf.name_scope("decode"):
        decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits,
                                                       seq_length,
                                                       merge_repeated=True)

    # Compute the edit distance
    with tf.name_scope("accuracy"):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets)
        # Compute the label error rate (accuracy)
        ler = tf.reduce_mean(distance, name='label_error_rate')

    # Directory the model is saved in; create it if it does not exist
    savedir = "saver/"
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    # Create the saver
    saver = tf.train.Saver(max_to_keep=1)
    # Create the session
    with tf.Session() as sess:
        # Initialize; without a checkpoint these initial values are used
        sess.run(tf.global_variables_initializer())
        kpt = tf.train.latest_checkpoint(savedir)
        print("kpt:", kpt)
        if kpt is not None:
            saver.restore(sess, kpt)

        # Speech file to recognize
        wav_file = 'input.wav'

        source, source_lengths, sparse_labels = get_speech_file(
            wav_file, labels)
        feed2 = {
            input_tensor: source,
            targets: sparse_labels,
            seq_length: source_lengths,
            keep_dropout: 1.0
        }
        # The label error rate is fetched together with the decoded path
        # but is not reported here.
        d, _ = sess.run([decoded[0], ler], feed_dict=feed2)
        # Densify the decoded sparse result; empty positions become -1
        dense_decoded = tf.sparse_tensor_to_dense(
            d, default_value=-1).eval(session=sess)
        if len(dense_decoded) > 0:
            decoded_str = ndarray_to_text_ch(dense_decoded[0], words)
            print('Decoded:  {}'.format(decoded_str))
Example #41
0
import tensorflow as tf
import numpy as np

# Hypothesis: two sequences of five characters each ("sialu" twice).
# The dense shape must cover every index, i.e. 2 rows x 5 columns; the
# previous (1, 5) left row 1 outside the declared shape.
x = tf.SparseTensor(
    [[row, col] for row in range(2) for col in range(5)],
    ["s", "i", "a", "l", "u", "s", "i", "a", "l", "u"],
    (2, 5))

# Truth: two sequences of six characters each ("handal", "syalom").
target = tf.SparseTensor(
    [[row, col] for row in range(2) for col in range(6)],
    ["h", "a", "n", "d", "a", "l", "s", "y", "a", "l", "o", "m"],
    (2, 6))

# One edit distance per sequence pair
ler = tf.edit_distance(x, target)
with tf.Session() as sess:
    _ler = sess.run(ler)
    print(_ler)
Example #42
0
    def build_graph(self, args, maxTimeSteps):
        """Assemble the DeepSpeech2 graph on a fresh `tf.Graph`.

        Creates the input/target placeholders, the network output (via
        `build_deepSpeech2`), the mean CTC loss, an Adam training op
        (with global-norm gradient clipping unless `args.grad_clip` is -1),
        beam search predictions, the summed normalized edit distance (only
        when `args.level` is 'cha'), an initializer and a saver. Everything
        is stored on `self`.

        Args:
            args: settings object; this method reads batch_size,
                num_feature, model, num_layer, num_hidden, num_class,
                activation, optimizer, learning_rate, keep_prob, grad_clip
                and level from it.
            maxTimeSteps: size of the first axis of the input placeholder.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            # according to DeepSpeech2 paper, input is the spectrogram power
            # of audio, but mfcc features can be used as the input as well.
            input_shape = (maxTimeSteps, args.batch_size, args.num_feature)
            self.inputX = tf.placeholder(tf.float32, shape=input_shape)
            # 4-D reshape of the input; created in the graph but not
            # consumed anywhere in this method.
            reshaped_input = tf.reshape(
                self.inputX,
                [args.batch_size, args.num_feature, maxTimeSteps, 1])

            # Target labels, fed as the three components of a SparseTensor
            self.targetIxs = tf.placeholder(tf.int64)
            self.targetVals = tf.placeholder(tf.int32)
            self.targetShape = tf.placeholder(tf.int64)
            self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals,
                                           self.targetShape)
            self.seqLengths = tf.placeholder(tf.int32,
                                             shape=args.batch_size)

            # Snapshot of the hyper-parameters this graph is built with
            self.config = {
                'name': args.model,
                'rnncell': self.cell_fn,
                'num_layer': args.num_layer,
                'num_hidden': args.num_hidden,
                'num_class': args.num_class,
                'activation': args.activation,
                'optimizer': args.optimizer,
                'learning rate': args.learning_rate,
                'keep prob': args.keep_prob,
                'batch size': args.batch_size
            }

            # Note: the network builder receives `self.args`, not the
            # `args` parameter of this method.
            output_fc = build_deepSpeech2(self.args, maxTimeSteps, self.inputX,
                                          self.cell_fn, self.seqLengths)
            ctc_losses = tf.nn.ctc_loss(self.targetY, output_fc,
                                        self.seqLengths)
            self.loss = tf.reduce_mean(ctc_losses)
            self.var_op = tf.global_variables()
            self.var_trainable_op = tf.trainable_variables()

            if args.grad_clip != -1:
                # clip gradients by their global norm before applying them
                raw_grads = tf.gradients(self.loss, self.var_trainable_op)
                clipped_grads, _ = tf.clip_by_global_norm(raw_grads,
                                                          args.grad_clip)
                adam = tf.train.AdamOptimizer(args.learning_rate)
                self.optimizer = adam.apply_gradients(
                    zip(clipped_grads, self.var_trainable_op))
            else:
                # -1 means: no gradient clipping
                adam = tf.train.AdamOptimizer(args.learning_rate)
                self.optimizer = adam.minimize(self.loss)

            # Top beam search path (repeats not merged), as int32
            decoded_paths, _ = tf.nn.ctc_beam_search_decoder(
                output_fc, self.seqLengths, merge_repeated=False)
            self.predictions = tf.to_int32(decoded_paths[0])

            # errorRate is only defined for the 'cha' level
            if args.level == 'cha':
                per_sequence_error = tf.edit_distance(self.predictions,
                                                      self.targetY,
                                                      normalize=True)
                self.errorRate = tf.reduce_sum(per_sequence_error)

            self.initial_op = tf.global_variables_initializer()
            self.saver = tf.train.Saver(tf.global_variables(),
                                        max_to_keep=5,
                                        keep_checkpoint_every_n_hours=1)
def train():
    """Build the CTC training graph and run the whole training loop.

    The network itself comes from ``model.get_train_model()``.  On top of it
    this function adds the CTC loss, a momentum optimizer driven by a
    staircase exponentially-decayed learning rate, a beam-search decoder and
    the mean edit distance between the decoded output and the targets.

    Every ``common.REPORT_STEPS`` global steps the model is evaluated on the
    module-level ``test_inputs`` / ``test_targets`` / ``test_seq_len`` and a
    checkpoint is written with the prefix ``models/ocr.model``.  The number
    of epochs is read from the module-level ``num_epochs``.
    """
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(common.INITIAL_LEARNING_RATE,
                                               global_step,
                                               common.DECAY_STEPS,
                                               common.LEARNING_RATE_DECAY_FACTOR,
                                               staircase=True)
    logits, inputs, targets, seq_len, W, b = model.get_train_model()

    loss = tf.nn.ctc_loss(targets, logits, seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate,
        momentum=common.MOMENTUM).minimize(cost, global_step=global_step)

    # Beam search decoding (slower than greedy decoding, but usually gives
    # better results).
    decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, merge_repeated=False)

    # Label error rate: mean edit distance between the best decoded path and
    # the targets.  Despite its name, lower is better.
    acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

    # Initialize the weights and biases
    init = tf.global_variables_initializer()

    def do_report():
        """Decode the test set and hand the result to ``report_accuracy``."""
        test_feed = {inputs: test_inputs,
                     targets: test_targets,
                     seq_len: test_seq_len}
        # log_prob and acc are still fetched (as in the original run call);
        # only the decoded sparse tensor is used afterwards.
        dd, _, _ = session.run([decoded[0], log_prob, acc], test_feed)
        report_accuracy(dd, test_targets)

    def do_batch():
        """Run one optimizer step on the current training batch.

        Returns the batch cost and the global step after the update.
        """
        feed = {inputs: train_inputs, targets: train_targets, seq_len: train_seq_len}
        b_cost, steps, _ = session.run([cost, global_step, optimizer], feed)
        if steps > 0 and steps % common.REPORT_STEPS == 0:
            do_report()
            saver.save(session, "models/ocr.model", global_step=steps)
        return b_cost, steps

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        session.run(init)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
        for curr_epoch in range(num_epochs):
            print("Epoch.......", curr_epoch)
            # Timer for the whole epoch.  The original reused the per-batch
            # timer in the epoch summary, so the logged "time" only covered
            # the last batch plus the final evaluation.
            epoch_start = time.time()
            train_cost = 0
            # NOTE(review): train_ler is never accumulated, so the summary
            # line always prints 0 for it.
            train_ler = 0
            for batch in range(common.BATCHES):
                train_inputs, train_targets, train_seq_len = utils.get_data_set(
                    'train', batch * common.BATCH_SIZE, (batch + 1) * common.BATCH_SIZE)

                batch_start = time.time()
                c, steps = do_batch()
                train_cost += c * common.BATCH_SIZE
                seconds = time.time() - batch_start
                print("Step:", steps, ", batch seconds:", seconds)

            train_cost /= common.TRAIN_SIZE

            # NOTE(review): the "validation" numbers below are computed on the
            # last *training* batch of the epoch, not on held-out data.
            val_feed = {inputs: train_inputs,
                        targets: train_targets,
                        seq_len: train_seq_len}

            val_cost, val_ler, lr, steps = session.run(
                [cost, acc, learning_rate, global_step], feed_dict=val_feed)

            log = "Epoch {}/{}, steps = {}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}s, learning_rate = {}"
            print(log.format(curr_epoch + 1, num_epochs, steps, train_cost, train_ler, val_cost, val_ler,
                             time.time() - epoch_start, lr))
Example #44
0
    def crnn(self, max_width, batch_size):
        """Assemble the CRNN graph: conv feature extractor, two BiLSTM layers and a CTC head.

        Args:
            max_width: size of the second axis of the image placeholder.
            batch_size: fixed batch size used for the placeholder and reshapes.

        Returns:
            A tuple ``(inputs, targets, seq_len, logits, dense_decoded,
            optimizer, acc, cost, max_char_count, init)`` where ``logits`` is
            time-major, ``dense_decoded`` is the best beam-search path padded
            with -1, ``acc`` is the mean edit distance to the targets and
            ``max_char_count`` is the static number of time steps produced by
            the CNN.
        """

        def bilstm_layer(layer_input, lengths, scope_default_name):
            """One bidirectional LSTM layer (256 units per direction), outputs concatenated."""
            with tf.variable_scope(None, default_name=scope_default_name):
                forward_cell = rnn.BasicLSTMCell(256)
                backward_cell = rnn.BasicLSTMCell(256)
                both_directions, _ = tf.nn.bidirectional_dynamic_rnn(
                    forward_cell, backward_cell, layer_input, lengths, dtype=tf.float32)
                return tf.concat(both_directions, 2)

        def recurrent_stack(features, lengths):
            """Two stacked bidirectional LSTM layers."""
            first = bilstm_layer(features, lengths, "bidirectional-rnn-1")
            return bilstm_layer(first, lengths, "bidirectional-rnn-2")

        def conv_relu(tensor, n_filters, kernel=(3, 3), pad="same"):
            """ReLU-activated 2-D convolution."""
            return tf.layers.conv2d(inputs=tensor, filters=n_filters, kernel_size=kernel,
                                    padding=pad, activation=tf.nn.relu)

        def max_pool(tensor, step, pad="valid"):
            """2 x 2 max pooling with the given strides."""
            return tf.layers.max_pooling2d(inputs=tensor, pool_size=[2, 2], strides=step, padding=pad)

        def feature_extractor(images):
            """Convolutional part; layers are created in the same order as before."""
            net = conv_relu(images, 64)
            net = max_pool(net, 2)
            net = conv_relu(net, 128)
            net = max_pool(net, 2)
            net = conv_relu(net, 256)
            net = tf.layers.batch_normalization(net)
            net = conv_relu(net, 256)
            net = max_pool(net, [1, 2], pad="same")
            net = conv_relu(net, 512)
            net = tf.layers.batch_normalization(net)
            net = conv_relu(net, 512)
            net = max_pool(net, [1, 2], pad="same")
            # Final 2 x 2 convolution without padding
            return conv_relu(net, 512, kernel=(2, 2), pad="valid")

        # Graph inputs: image batch, sparse target labels, per-sample sequence length
        inputs = tf.placeholder(tf.float32, [batch_size, max_width, 32, 1])
        targets = tf.sparse_placeholder(tf.int32, name='targets')
        seq_len = tf.placeholder(tf.int32, [None], name='seq_len')

        conv_features = feature_extractor(inputs)

        # Collapse the CNN output into a (batch, time, 512) sequence
        sequence_features = tf.reshape(conv_features, [batch_size, -1, 512])
        max_char_count = sequence_features.get_shape().as_list()[1]

        rnn_output = recurrent_stack(sequence_features, seq_len)

        # Per-time-step linear projection onto the class scores
        flat_rnn_output = tf.reshape(rnn_output, [-1, 512])
        W = tf.Variable(tf.truncated_normal([512, config.NUM_CLASSES], stddev=0.1), name="W")
        b = tf.Variable(tf.constant(0., shape=[config.NUM_CLASSES]), name="b")
        flat_scores = tf.matmul(flat_rnn_output, W) + b
        batch_major_scores = tf.reshape(flat_scores, [batch_size, -1, config.NUM_CLASSES])

        # CTC ops below consume time-major logits
        logits = tf.transpose(batch_major_scores, (1, 0, 2))

        # Loss and cost calculation
        per_sample_loss = tf.nn.ctc_loss(targets, logits, seq_len)
        cost = tf.reduce_mean(per_sample_loss)

        # Training step
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)

        # Beam-search decoding; only the best path is used
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len)
        best_path = decoded[0]
        dense_decoded = tf.sparse_tensor_to_dense(best_path, default_value=-1)

        # Error rate: mean edit distance between the best path and the targets
        acc = tf.reduce_mean(tf.edit_distance(tf.cast(best_path, tf.int32), targets))

        init = tf.global_variables_initializer()

        return inputs, targets, seq_len, logits, dense_decoded, optimizer, acc, cost, max_char_count, init
Example #45
0
def main(_):
    """Train the CRNN on the MJSynth TFRecords and evaluate periodically.

    Two input pipelines are built with ``read_utils.inputs`` (training and
    validation) and fed to a weight-sharing pair of ``model.CRNNNet``
    networks.  Every 10000 global steps the validation loss and the
    (un)normalized edit distance are averaged over ``FLAGS.sample_size``
    batches, written as summaries, and a checkpoint is saved to
    ``FLAGS.checkpoint_dir``.

    Args:
        _: unused positional argument.
    """
    batch_size = FLAGS.batch_size
    num_epochs = FLAGS.epoch
    checkpoint_dir = FLAGS.checkpoint_dir

    with tf.Graph().as_default():

        # Create global_step.
        global_step = tf.Variable(0, name='global_step', trainable=False)

        tr_file_name = os.path.join("/mnt/sdb/mark/mjsyth", "mjsynth_train.tfrecords")
        te_file_name = os.path.join("/mnt/sdb/mark/mjsyth", "mjsynth_val.tfrecords")

        sh_images, sh_labels, sh_length = read_utils.inputs(
            filename=[tr_file_name], batch_size=batch_size, num_epochs=num_epochs)
        val_images, val_labels, val_length = read_utils.inputs(
            filename=[te_file_name], batch_size=batch_size, num_epochs=1000)

        # Build the model; the validation network reuses the training weights.
        crnn = model.CRNNNet()
        with tf.variable_scope('crnn'):
            logits, seq_len = crnn.net(sh_images, is_training=True)
            tf.get_variable_scope().reuse_variables()
            val_logits, val_seq_len = crnn.net(val_images, is_training=False)

        loss = crnn.losses(sh_labels, logits, seq_len)
        tf.summary.scalar("train/loss", loss)

        val_loss = crnn.losses(val_labels, val_logits, val_seq_len)
        # TODO: BK-tree NN search
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
            tf.transpose(val_logits, perm=[1, 0, 2]), val_seq_len, merge_repeated=False)

        # Despite the names these are edit distances (lower is better):
        # raw and length-normalized respectively.
        acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels, normalize=False))
        acc_norm = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels))

        # Placeholders used to log values averaged in Python over several batches.
        val_loss_sum = tf.placeholder(tf.float32, name='val_loss_sum')
        val_acc_sum = tf.placeholder(tf.float32, name='val_acc_sum')
        val_acc_norm_sum = tf.placeholder(tf.float32, name='val_acc_norm_sum')

        tf.summary.scalar("test/val_loss", val_loss_sum)
        tf.summary.scalar("test/edit_distance", val_acc_sum)
        tf.summary.scalar("test/edit_distance_norm", val_acc_norm_sum)

        starter_learning_rate = FLAGS.learning_rate
        learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                                   500000, 0.5, staircase=True)
        tf.summary.scalar("train/learning_rate", learning_rate)

        # Run the collected update ops together with every optimizer step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
                loss=loss, global_step=global_step)

        # Start Training
        with tf.Session(config=config) as sess:
            save = tf.train.Saver(max_to_keep=50)

            if not FLAGS.load:
                init_op = tf.group(tf.global_variables_initializer(),
                                   tf.local_variables_initializer())
                sess.run(init_op)
            else:
                ckpt_path = os.path.join(checkpoint_dir, FLAGS.ckpt_file)
                save.restore(sess, ckpt_path)
                # Local variables are not restored from the checkpoint.
                sess.run(tf.local_variables_initializer())

            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='train/*'))
            val_merged = tf.summary.merge(tf.get_collection(tf.GraphKeys.SUMMARIES, scope='test/*'))

            file_writer = tf.summary.FileWriter(FLAGS.logdir, sess.graph)

            # Bind `step` up front so the OutOfRangeError handler can report
            # it even when the very first training run raises.
            step = sess.run(global_step)

            try:
                while not coord.should_stop():
                    start_time = time.time()

                    _, merged_t, tr_loss, lr, step, db_lables, db_images, db_logits = sess.run(
                        [optimizer, merged, loss, learning_rate, global_step,
                         sh_labels, sh_images, logits])

                    duration = time.time() - start_time

                    print("loss", tr_loss, "time", duration)
                    file_writer.add_summary(merged_t, step)

                    # Periodic validation + checkpoint.
                    if step % 10000 == 0:
                        val_loss_s, val_acc_s, val_acc_norm_s = 0, 0, 0
                        for ite in range(FLAGS.sample_size):
                            te_loss, te_acc, te_acc_norm = sess.run([val_loss, acc, acc_norm])
                            val_loss_s += te_loss
                            val_acc_s += te_acc
                            val_acc_norm_s += te_acc_norm
                        val_loss_s /= FLAGS.sample_size
                        val_acc_s /= FLAGS.sample_size
                        val_acc_norm_s /= FLAGS.sample_size

                        # NOTE: the reported duration is that of the last
                        # training step, not of the validation pass.
                        print('Step %d: loss %.3f acc %.3f %.3f (%.3f sec)' % (step, val_loss_s, val_acc_s, val_acc_norm_s, duration))

                        # Add summary
                        val_sum = sess.run(val_merged, feed_dict={val_loss_sum: val_loss_s, val_acc_sum: val_acc_s, val_acc_norm_sum: val_acc_norm_s})
                        file_writer.add_summary(val_sum, step)

                        save.save(sess, os.path.join(checkpoint_dir, 'model.ckpt'), global_step=step)

            except tf.errors.OutOfRangeError:
                print('Done training for %d epochs, %d steps.' % (num_epochs, step))
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()
                # Flush and close the event file so the last summaries are
                # not lost.
                file_writer.close()

            # Wait for threads to finish.
            coord.join(threads)
Example #46
0
    def _add_training_on_rnn(self, logits, grad_clip, learning_rate, lr_decay_factor,
                             sparse_labels, input_seq_lengths, prediction):
        """
        Attach the training ops to the Acoustic RNN.

        Gradients are not applied at every mini-batch: they are summed into
        non-trainable accumulator variables and applied (after clipping) by a
        separate op.  The following attributes are created on ``self``:
          * self.learning_rate_decay_op : multiplies the learning rate by the decay factor
          * self.accumulated_mean_loss / self.acc_mean_loss_op / self.acc_mean_loss_zero_op :
            accumulator for the mean loss, its "add current mini-batch" op and its reset op
          * self.accumulated_error_rate / self.acc_error_rate_op / self.acc_error_rate_zero_op :
            same triple for the error rate
          * self.mini_batch / self.increase_mini_batch_op / self.mini_batch_zero_op :
            mini-batch counter, its increment op and its reset op
          * self.acc_gradients_zero_op : resets the accumulated gradients
          * self.accumulate_gradients_op : adds the current gradients to the accumulators
          * self.train_step_op : clips the accumulated gradients and applies them

        Parameters
        ----------
        :param logits: the output of the RNN before the beam search
        :param grad_clip: global-norm threshold used to clip the accumulated gradients
        :param learning_rate: initial value of the learning rate variable
        :param lr_decay_factor: factor applied by ``self.learning_rate_decay_op``
        :param sparse_labels: the labels in a sparse tensor
        :param input_seq_lengths: vector containing the length of each input
        :param prediction: the predicted label given by the RNN

        Returns
        -------
        :returns: tensorflow variable keeping the current learning rate
        """
        # Learning rate lives in a variable so that it can be decayed in place
        learning_rate_var = tf.Variable(float(learning_rate), trainable=False, name='learning_rate')
        decayed_rate = tf.multiply(learning_rate_var, lr_decay_factor)
        self.learning_rate_decay_op = learning_rate_var.assign(decayed_rate)

        with tf.name_scope('CTC'):
            # Per-item CTC loss between the logits and the truth
            ctc_loss = tf.nn.ctc_loss(sparse_labels, logits, input_seq_lengths,
                                      ignore_longer_outputs_than_inputs=True)

            # Monitoring value only: each item's loss is divided by its input
            # length before averaging over the batch
            length_scaled_loss = tf.truediv(ctc_loss, tf.to_float(input_seq_lengths))
            mean_loss = tf.reduce_mean(length_scaled_loss)

            # Running sum of the mean loss across mini-batches
            self.accumulated_mean_loss = tf.Variable(0.0, trainable=False)
            self.acc_mean_loss_op = self.accumulated_mean_loss.assign_add(mean_loss)
            self.acc_mean_loss_zero_op = self.accumulated_mean_loss.assign(
                tf.zeros_like(self.accumulated_mean_loss))

        with tf.name_scope('Error_Rate'):
            # Mean normalized edit distance between the prediction and the truth
            edit_distances = tf.edit_distance(prediction, sparse_labels, normalize=True)
            error_rate = tf.reduce_mean(edit_distances)

            # Running sum of the error rate across mini-batches
            self.accumulated_error_rate = tf.Variable(0.0, trainable=False)
            self.acc_error_rate_op = self.accumulated_error_rate.assign_add(error_rate)
            self.acc_error_rate_zero_op = self.accumulated_error_rate.assign(
                tf.zeros_like(self.accumulated_error_rate))

        with tf.name_scope('Mini_batch'):
            # Mini-batch counter; kept as a float to avoid type conversion
            # errors when it is later used with tf.divide
            self.mini_batch = tf.Variable(0.0, trainable=False)
            self.increase_mini_batch_op = self.mini_batch.assign_add(1)
            self.mini_batch_zero_op = self.mini_batch.assign(tf.zeros_like(self.mini_batch))

        trainable_variables = tf.trainable_variables()
        with tf.name_scope('Gradients'):
            opt = tf.train.AdamOptimizer(learning_rate_var)
            # List of (gradient, variable) pairs, one per trainable variable
            gradients = opt.compute_gradients(ctc_loss, trainable_variables)

            # One zero-initialized accumulator per trainable variable
            accumulated_gradients = []
            for variable in trainable_variables:
                zero_like_variable = tf.zeros_like(variable.initialized_value())
                accumulated_gradients.append(tf.Variable(zero_like_variable, trainable=False))

            # Reset every accumulator to zero
            self.acc_gradients_zero_op = [accumulator.assign(tf.zeros_like(accumulator))
                                          for accumulator in accumulated_gradients]

            # Add the gradients of the current mini-batch to the accumulators
            self.accumulate_gradients_op = [accumulator.assign_add(gradient)
                                            for accumulator, (gradient, _variable)
                                            in zip(accumulated_gradients, gradients)]

            # Clip the accumulated gradients by global norm and apply them
            clipped_gradients, _norm = tf.clip_by_global_norm(accumulated_gradients, grad_clip)
            clipped_pairs = [(clipped, variable)
                             for clipped, (_gradient, variable)
                             in zip(clipped_gradients, gradients)]
            self.train_step_op = opt.apply_gradients(clipped_pairs,
                                                     global_step=self.global_step)
        return learning_rate_var
Example #47
0
def main(_):
    """Evaluate a restored CRNN checkpoint on the images listed in ``FLAGS.gt_file``.

    Each ground-truth line is parsed according to ``FLAGS.dataset`` (``'ch4'``,
    ``'coco'`` or ``'IC13'``).  The image is rotated to landscape if needed,
    scaled to ``HEIGHT`` pixels high, pasted into a ``WIDTH`` x ``HEIGHT``
    canvas, normalized and run through the network one image at a time.  At
    the end the mean raw / normalized edit distance and the exact-match
    accuracy are printed.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if ``FLAGS.dataset`` is not one of the supported names.
    """
    checkpoint_dir = FLAGS.checkpoint_dir

    with tf.Graph().as_default():

        val_images = tf.placeholder(tf.float32, shape=[1, HEIGHT, WIDTH, 3], name='input_img')
        val_labels = tf.sparse_placeholder(tf.int32, name='input_labels')
        val_width = tf.placeholder(tf.int32, shape=[1], name='input_width')

        # Build Model
        crnn = model.CRNNNet()
        with tf.variable_scope('crnn'):
            val_logits, val_seq_len = crnn.net(val_images, val_width, is_training=False, kp=1.0)

        # Built for graph parity with training; it is never fetched below.
        val_loss = crnn.losses(val_labels, val_logits, val_seq_len)
        # TODO: BK-tree NN search
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
            tf.transpose(val_logits, perm=[1, 0, 2]), val_seq_len, merge_repeated=False)

        # Despite the names these are edit distances (lower is better):
        # raw and length-normalized respectively.
        acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels, normalize=False))
        acc_norm = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), val_labels))

        with tf.Session(config=config) as sess:
            save = tf.train.Saver(max_to_keep=50)

            # Evaluation only makes sense with restored weights.  The
            # initialization branch is kept as the fallback that runs when
            # assertions are disabled (python -O).
            assert FLAGS.load
            if not FLAGS.load:
                init_op = tf.group(tf.global_variables_initializer(),
                                   tf.local_variables_initializer())
                sess.run(init_op)
            else:
                ckpt_path = os.path.join(checkpoint_dir, FLAGS.ckpt_file)
                save.restore(sess, ckpt_path)
                print("Done loading checkpoint")

            with open(FLAGS.gt_file, 'r') as f:

                # The loss is never evaluated here, so the reported loss stays 0.
                val_loss_s = 0
                val_acc_s, val_acc_norm_s = 0, 0
                counter = 0
                hit = 0
                for line in f:
                    if FLAGS.dataset in ('ch4', 'IC13'):
                        # Line format: <file>, "<label>"
                        # NOTE(review): the BOM / '\r\n' handling looks written
                        # for Python 2 byte strings; confirm before running
                        # under Python 3.
                        line = line.replace('\xef\xbb\xbf', '')
                        line = line.replace('\r\n', '')
                        fields = line.split(', ')
                        img_file = fields[0]
                        img_label = fields[1][1:-1]
                        if FLAGS.dataset == 'ch4':
                            print(img_file, img_label)
                    elif FLAGS.dataset == 'coco':
                        # Line format: <file stem>,<label>
                        line = line.replace('\r\n', '')
                        line = line.replace('\n', '')
                        if len(line) < 10:
                            continue
                        img_file = line.split(',')[0] + '.jpg'
                        start = line.find(',')
                        img_label = line[start + 1:]
                        print(img_file, img_label)
                    else:
                        # Previously an unknown dataset surfaced as a NameError
                        # on img_file.
                        raise ValueError('Unsupported dataset: %r' % (FLAGS.dataset,))

                    img = Image.open(os.path.join(FLAGS.data_dir, img_file))

                    # Bring portrait images to landscape.
                    w, h = img.size
                    if w < h:
                        img = img.rotate(-90, expand=True)
                    w, h = img.size

                    # Scale to HEIGHT keeping the aspect ratio, but never
                    # wider than the WIDTH-pixel canvas.
                    ratio = HEIGHT / float(h)
                    target_width = min(int(ratio * w), WIDTH)
                    img = img.resize([target_width, HEIGHT])
                    actual_width = [target_width]

                    # Paste at the top-left corner of a WIDTH x HEIGHT canvas.
                    container = Image.new('RGB', (WIDTH, HEIGHT))
                    container.paste(img)
                    img = container

                    img = np.asarray(img, np.float32)
                    img /= 255.
                    img = mean_image_subtraction(
                        img,
                        [_R_MEAN, _G_MEAN, _B_MEAN])
                    img = np.expand_dims(img, axis=0)

                    str_label = img_label
                    if FLAGS.case_insensitive:
                        str_label = str_label.lower()
                    img_label = str2code(img_label)
                    # Skip labels containing a character str2code maps to -1.
                    if -1 in img_label:
                        continue
                    print(img_file, str_label)

                    # Sparse (indices, values, shape) encoding of the single label.
                    indices = [(0, i) for i in range(len(img_label))]
                    values = list(img_label)
                    shape = [1, len(img_label)]

                    t1 = time.time()
                    output_label, te_acc, te_acc_norm = sess.run([decoded, acc, acc_norm], feed_dict={
                        val_images: img,
                        val_labels: (indices, values, shape),
                        val_width: actual_width
                        })
                    t2 = time.time()
                    print(t2 - t1)

                    val_acc_s += te_acc
                    val_acc_norm_s += te_acc_norm
                    counter += 1
                    output_str = code2str(output_label[0].values)

                    print(img_file, output_str)
                    print(te_acc)

                    if FLAGS.case_insensitive:
                        output_str = output_str.lower()
                    if output_str == str_label:
                        hit += 1
                        print(hit)

                # Guard the averages: an empty or fully-skipped ground-truth
                # file used to end in a ZeroDivisionError.
                if counter == 0:
                    print('No usable sample found in %s' % FLAGS.gt_file)
                    return

                val_acc_s /= counter
                val_acc_norm_s /= counter
                pred_acc = hit / float(counter)

                print(hit, counter)

                print('loss %.3f edit dist %.3f %.3f acc %.3f' % (val_loss_s, val_acc_s, val_acc_norm_s, pred_acc))
sess = tf.Session()

#----------------------------------
# First compute the edit distance between 'bear' and 'beers'.
# Both words are stored as rank-3 sparse tensors of single characters, with
# the characters laid out along the last axis.  dense_shape must bound every
# index, so the last dimension is the word length (4 and 5 respectively);
# the original declared [1,1,1] for both, which did not cover the indices.
hypothesis = list('bear')
truth = list('beers')
h1 = tf.SparseTensor([[0,0,0], [0,0,1], [0,0,2], [0,0,3]],
                     hypothesis,
                     [1,1,4])

t1 = tf.SparseTensor([[0,0,0], [0,0,1], [0,0,2], [0,0,3],[0,0,4]],
                     truth,
                     [1,1,5])

# Raw (unnormalized) edit distance
print(sess.run(tf.edit_distance(h1, t1, normalize=False)))

#----------------------------------
# Compute the edit distance between ('bear','beer') and 'beers':
# two hypothesis words along the second axis, each compared with its own
# copy of the truth word.
hypothesis2 = list('bearbeer')
truth2 = list('beersbeers')
h2 = tf.SparseTensor([[0,0,0], [0,0,1], [0,0,2], [0,0,3], [0,1,0], [0,1,1], [0,1,2], [0,1,3]],
                     hypothesis2,
                     [1,2,4])

t2 = tf.SparseTensor([[0,0,0], [0,0,1], [0,0,2], [0,0,3], [0,0,4], [0,1,0], [0,1,1], [0,1,2], [0,1,3], [0,1,4]],
                     truth2,
                     [1,2,5])

# Edit distance normalized by the length of the truth word
print(sess.run(tf.edit_distance(h2, t2, normalize=True)))
Example #49
0
# Output projection: maps the 2*hidden_size recurrent features to output_size
# class scores.
W2 = weight_variable([2*hidden_size,output_size])
b2 = bias_variable([output_size])
#n_batch, n_time_steps, n_features = l_in.input_var.shape #Unnecessary in this version. Just collecting the info so that we can reshape the output back to the original shape
# Flatten the recurrent output so the projection is a single matmul.
l_reshape3 = tf.reshape(lstm_output_tr,[-1,2*hidden_size] )
h_2 = tf.matmul(l_reshape3,W2) + b2

# h_2 already has output_size columns, so this reshape leaves it unchanged.
l_reshape4 = tf.reshape(h_2,[-1,output_size])

# Softmax over classes, then two views of the result:
# l_soft_reshaped is [-1, n_time_steps, output_size] and l_soft_tr swaps its
# first two axes.
l_soft = tf.nn.softmax(l_reshape4)
l_soft_reshaped = tf.reshape(l_soft,[-1,n_time_steps,output_size])
l_soft_tr = tf.transpose(l_soft_reshaped, [1,0,2])
# NOTE(review): the (inputs, labels, sequence_length) argument order looks like
# the pre-1.0 TensorFlow CTC API - confirm the TF version this targets.
# NOTE(review): ctc_loss is documented as applying softmax itself, so feeding
# it softmax outputs would apply softmax twice - verify whether h_2 should be
# passed instead.
loss = tf.reduce_mean(tf.nn.ctc_loss(l_soft_tr, targets,seqLengths))
optimizer = tf.train.AdamOptimizer(learningRate).minimize(loss)
# Arg-max class per position, cut down to the first seqLengths[0] rows and the
# first column.
logitsMaxTest = tf.slice(tf.argmax(l_soft_reshaped, 2), [0, 0], [seqLengths[0], 1])
# Best beam-search path as an int32 sparse tensor.
# NOTE(review): the decoder receives l_soft_reshaped while the loss receives
# the transposed l_soft_tr - confirm which layout the decoder expects.
predictions = tf.to_int32(ctc.ctc_beam_search_decoder(l_soft_reshaped , seqLengths)[0][0])
# Error rate: summed raw edit distance divided by the total number of target
# labels in the batch.
errorRate = tf.reduce_sum(tf.edit_distance(predictions, targets, normalize=False)) / \
                tf.to_float(tf.size(targets.values))
def getminibatch(x, y, bs, seq_len=776):
    """Draw a random mini-batch of at most ``bs`` samples without replacement.

    Args:
        x: indexable collection of input samples.
        y: indexable collection of target sequences, aligned with ``x``.
        bs: requested batch size; fewer samples are returned when ``x`` holds
            less than ``bs`` items.
        seq_len: sequence length reported for every sample of the batch
            (defaults to 776, the value that used to be hard-coded).

    Returns:
        dict with the keys ``'x'`` (array of the selected inputs), ``'ind'``,
        ``'val'``, ``'shape'`` (the three values returned by
        ``target_list_to_sparse_tensor`` for the selected targets) and
        ``'seqlen'`` (float array holding ``seq_len`` once per selected
        sample).
    """
    chosen = np.random.permutation(len(x))[:bs]
    batch = {}
    batch['x'] = np.array([x[i] for i in chosen])
    batch['ind'], batch['val'], batch['shape'] = target_list_to_sparse_tensor(
        [y[i] for i in chosen])
    # Sized by the samples actually drawn so it always lines up with
    # batch['x'], even when fewer than bs samples are available.
    batch['seqlen'] = np.full([len(chosen)], seq_len, dtype=np.float64)
    return batch

# Run settings.
# NOTE(review): none of these is referenced in the visible part of the file;
# presumably mini-batches per epoch, mini-batch size and number of epochs -
# confirm against the training loop.
number_of_batches = 100
batch_size_var = 38
nEpochs = 100
Example #50
0
def build_net():
    """Assemble the convolutional CTC recognizer graph.

    Returns:
        dict with the entries
            'x':        float32 input placeholder, shape [None, 40, 120, 4]
            'y':        int32 sparse placeholder holding the target labels
            'len':      int32 placeholder, one sequence length per example
            'loss':     mean CTC loss over the batch
            'train_op': Adam step minimizing 'loss'
            'decoded':  best path returned by the beam-search decoder
            'acc':      mean edit distance between 'decoded' and 'y' with
                        tf.edit_distance's default normalization (lower is
                        better, despite the key name)
    """
    net = {
        'x': tf.placeholder(tf.float32, shape=[None, 40, 120, 4], name="X"),
        'y': tf.sparse_placeholder(tf.int32, name="Y"),
        'len': tf.placeholder(tf.int32, shape=[None]),
    }

    def project(inputs, width, nonlinearity):
        # Dense layer using the initialization shared by both projections.
        return tf.layers.dense(
            inputs,
            units=width,
            activation=nonlinearity,
            use_bias=True,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
            bias_initializer=tf.constant_initializer(0.01),
        )

    # Arguments forwarded positionally to conv(), in application order
    # (presumably filters, kernel size, stride -- conv is defined elsewhere).
    conv_plan = (
        (32, 5, 2),
        (64, 3, 1),
        (128, 3, 1),
        (128, 3, 2),
        (256, 3, 1),
        (256, 3, 1),
        (512, 3, 2),
        (512, 3, 1),
        (1024, 3, 1),
    )
    features = net['x']
    for filters, kernel, stride in conv_plan:
        features = conv(features, filters, kernel, stride)

    # Shape notes carried over from the original author:
    # (?, 5, 15, 1024) -> (15, ?, 5, 1024) -> (15 * ?, 5 * 1024)
    time_first = tf.transpose(features, (2, 0, 1, 3))
    frames = tf.reshape(time_first, (-1, 5120))

    # (15 * ?, 5120) -> (15 * ?, 512) -> (15 * ?, n_class)
    # An optional GRU stage between the two projections was left disabled in
    # the original and is not reproduced here.
    hidden = project(frames, 512, tf.nn.leaky_relu)
    scores = project(hidden, labels_units, None)

    # (15 * ?, n_class) -> (15, ?, n_class), the layout handed to the CTC ops.
    logits = tf.reshape(scores, (15, -1, labels_units))

    per_example_loss = tf.nn.ctc_loss(
        labels=net['y'],
        inputs=logits,
        sequence_length=net['len'],
        ignore_longer_outputs_than_inputs=True)
    net['loss'] = tf.reduce_mean(per_example_loss)
    net['train_op'] = tf.train.AdamOptimizer(
        learning_rate=5e-6).minimize(net['loss'])

    beams, _ = tf.nn.ctc_beam_search_decoder(
        logits, net['len'], merge_repeated=False)
    net['decoded'] = beams[0]
    distances = tf.edit_distance(tf.cast(net['decoded'], tf.int32), net['y'])
    net['acc'] = tf.reduce_mean(distances)
    return net
	# Combine each tensor in fbH1rs with weightsOutH1 by an element-wise
	# product summed over axis 1, then add the bias. One result per element
	# of fbH1rs (presumably one per time step -- confirm against the code
	# that builds fbH1rs).
	outH1 = [tf.reduce_sum(tf.multiply(t, weightsOutH1), axis=1) + biasesOutH1 for t in fbH1rs]
	print("building logits ")
	# Project every element of outH1 onto the output classes.
	logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in outH1]
	print("len(outH1) %d"% len(outH1))
	####Optimizing
	print("building loss")
	# Stack the per-element logits along a new leading axis to obtain the 3-D
	# tensor consumed by the CTC ops below.
	logits3d = tf.stack(logits)
	loss = tf.reduce_mean(ctc.ctc_loss(logits3d, targetY, seqLengths))
	# Expose the mean loss under the fixed name 'ctc_loss_mean'.
	out = tf.identity(loss, 'ctc_loss_mean')
	optimizer = tf.train.MomentumOptimizer(learningRate, momentum).minimize(loss)

	####Evaluating
	print("building Evaluation")
	# Arg-max class along axis 2, sliced to the first seqLengths[0] rows of
	# axis 0 and the first column of axis 1.
	logitsMaxTest = tf.slice(tf.argmax(logits3d, 2), [0, 0], [seqLengths[0], 1])
	# Best beam-search path ([0][0] = first decoded output), cast to int32 so
	# it can be compared with targetY.
	predictions = tf.to_int32(ctc.ctc_beam_search_decoder(logits3d, seqLengths)[0][0])
	# Error rate = summed unnormalized edit distance / number of target labels.
	reduced_sum = tf.reduce_sum(tf.edit_distance(predictions, targetY, normalize=False))
	errorRate = reduced_sum / tf.to_float(tf.size(targetY.values))

	# Adds NaN/Inf checks for the floating-point tensors in the graph.
	check_op = tf.add_check_numerics_ops()
print("done building graph")

####Run session
with tf.Session(graph=graph) as session:
	# The earlier try/except wrappers around these two calls retried the very
	# same call in the except branch, so they could never recover from a
	# failure; the calls are made directly and any error surfaces once.
	merged = tf.summary.merge_all()
	writer = tf.summary.FileWriter("/tmp/basic_new", session.graph)
	try:
		saver = tf.train.Saver()  # defaults to saving all variables
	except Exception:
		# Best-effort fallback kept from the original: pass the variable list
		# explicitly when the default constructor fails. `except Exception`
		# replaces the bare except so KeyboardInterrupt/SystemExit propagate.
		print("tf.train.Saver() broken in tensorflow 0.12")
		saver = tf.train.Saver(tf.global_variables())
Example #52
0
    def __init__(self):
        """Build the CNN -> LSTM -> dense -> CTC graph inside a private tf.Graph.

        Everything is created under ``self.graph``. The method reads the
        module-level ``config`` object (fea_dim, lstm_hidden_size,
        full_connect_layer_unit, do_batchnorm, class_num,
        initial_learning_rate, out_model) and stores these attributes:

            graph, weights, biases      -- graph and convolution variables
            input_x, input_y, seq_len, lab_len, batch_size, is_train,
            keep_prob, mu, var          -- input placeholders
            input_x_bn, x_data          -- normalized / reshaped input
            cnn_result, cnn_results     -- convolution stack output
            new_seq_len                 -- sequence lengths after pooling
            h_output, f_dense           -- flattened LSTM / dense activations
            logit, logits               -- batch-major / time-major logits
            global_step, cost, optimizer -- training ops
            decoded, decoded_dense, acc -- decoding and evaluation ops
            saver                       -- only when config.out_model is truthy

        ``keep_prob`` is created as a placeholder but is not used anywhere in
        this method.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Convolution kernels, [height, width, in_channels, out_channels].
            # Width is 1, so the kernels span only axis 1 of x_data (the
            # max_step axis).
            with tf.variable_scope('weights'):
                self.weights = {
                    'W_conv1':
                    tf.get_variable(
                        'W_conv1', [10, 1, 1, 4],
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.1)),
                    'W_conv2':
                    tf.get_variable(
                        'W_conv2', [5, 1, 4, 8],
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.1)),
                    'W_conv3':
                    tf.get_variable(
                        'W_conv3', [3, 1, 8, 16],
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.1))
                }

            # One zero-initialized bias per output channel of each convolution.
            with tf.variable_scope('biases'):
                self.biases = {
                    'b_conv1':
                    tf.get_variable('b_conv1', [4],
                                    initializer=tf.constant_initializer(
                                        0, dtype=tf.float32)),
                    'b_conv2':
                    tf.get_variable('b_conv2', [8],
                                    initializer=tf.constant_initializer(
                                        0, dtype=tf.float32)),
                    'b_conv3':
                    tf.get_variable('b_conv3', [16],
                                    initializer=tf.constant_initializer(
                                        0, dtype=tf.float32))
                }

            # input_x.shape: [batch_size, max_step, fea_dim]
            self.input_x = tf.placeholder(tf.float32,
                                          shape=[None, None, config.fea_dim],
                                          name="inputs_x")
            # input_y: dense int32 label matrix (2-D); it is converted to a
            # sparse tensor together with lab_len further down.
            self.input_y = tf.placeholder(tf.int32,
                                          shape=[None, None],
                                          name="labels_y")
            # seq_len: [batch_size]
            self.seq_len = tf.placeholder(tf.int32,
                                          shape=[None],
                                          name="feature_len")  # frames per example, before pooling
            # lab_len: [batch_size]
            self.lab_len = tf.placeholder(tf.int32,
                                          shape=[None],
                                          name="label_len")  # number of labels per example
            # batch_size: scalar, fed explicitly because it is used in reshapes
            self.batch_size = tf.placeholder(tf.int32, [], name="batch_size")
            # training/testing switch, consumed by the optional batch norm
            self.is_train = tf.placeholder(tf.bool, None)
            self.keep_prob = tf.placeholder(tf.float32, name="keep_prob")

            # Per-feature mean and variance used to normalize the input; they
            # are fed from outside rather than computed in the graph.
            self.mu = tf.placeholder(tf.float32,
                                     shape=[config.fea_dim],
                                     name="mu")
            self.var = tf.placeholder(tf.float32,
                                      shape=[config.fea_dim],
                                      name="var")

            # Positional arguments are (x, mean, variance, offset, scale,
            # variance_epsilon): offset 0, scale 2, epsilon 0.001, i.e.
            # 2 * (x - mu) / sqrt(var + 0.001).
            fea_norm = tf.nn.batch_normalization(self.input_x,
                                                 self.mu,
                                                 self.var,
                                                 0,
                                                 2,
                                                 0.001,
                                                 name="normalize")
            self.input_x_bn = fea_norm

            with tf.name_scope('cnn_net'):
                # x_data.shape:[batch_size, max_step, fea_dim, 1]
                self.x_data = tf.reshape(
                    self.input_x_bn, [self.batch_size, -1, config.fea_dim, 1])
                # first convolution and pooling
                # (each pooling uses stride 2 on axis 1 only, so every stage
                # halves the max_step axis, rounding up with 'SAME' padding)
                with tf.name_scope('conv1'):
                    print('self.x_data:', self.x_data)
                    conv1 = tf.nn.conv2d(self.x_data,
                                         self.weights['W_conv1'],
                                         strides=[1, 1, 1, 1],
                                         padding='SAME')
                    h_conv1 = tf.nn.relu(
                        tf.nn.bias_add(conv1, self.biases['b_conv1']))
                    h_pool1 = tf.nn.max_pool(h_conv1,
                                             ksize=[1, 3, 1, 1],
                                             strides=[1, 2, 1, 1],
                                             padding='SAME')
                    print("h_pool1:", h_pool1)
                # second convolution and pooling
                with tf.name_scope('conv2'):
                    conv2 = tf.nn.conv2d(h_pool1,
                                         self.weights['W_conv2'],
                                         strides=[1, 1, 1, 1],
                                         padding='SAME')
                    h_conv2 = tf.nn.relu(
                        tf.nn.bias_add(conv2, self.biases['b_conv2']))
                    h_pool2 = tf.nn.max_pool(h_conv2,
                                             ksize=[1, 3, 1, 1],
                                             strides=[1, 2, 1, 1],
                                             padding='SAME')
                    print("h_pool2:", h_pool2)
                # third convolution and pooling
                with tf.name_scope('conv3'):
                    conv3 = tf.nn.conv2d(h_pool2,
                                         self.weights['W_conv3'],
                                         strides=[1, 1, 1, 1],
                                         padding='SAME')
                    h_conv3 = tf.nn.relu(
                        tf.nn.bias_add(conv3, self.biases['b_conv3']))
                    h_pool3 = tf.nn.max_pool(h_conv3,
                                             ksize=[1, 3, 1, 1],
                                             strides=[1, 2, 1, 1],
                                             padding='SAME')
                    print("h_pool3:", h_pool3)
                self.cnn_result = h_pool3
                print(
                    "self.cnn_result:",
                    self.cnn_result)  # [batch_size, pooled_steps, fea_dim, 16]

            # Merge the feature axis and the 16 channels into one feature
            # vector per time step. shape[2] is the static fea_dim dimension.
            shape = self.cnn_result.get_shape().as_list()
            print('shape:', shape)
            self.cnn_results = tf.reshape(self.cnn_result,
                                          [self.batch_size, -1, shape[2] * 16])
            print("self.cnn_results:", self.cnn_results)

            # Three stride-2 poolings shrink the time axis by a factor of 8,
            # so the valid lengths become ceil(seq_len / 8).
            self.new_seq_len = tf.ceil((tf.to_float(self.seq_len)) / 8)
            self.new_seq_len = tf.cast(self.new_seq_len, tf.int32)

            with tf.name_scope('lstm_net'):
                # One LSTM cell per entry of config.lstm_hidden_size; `count`
                # only numbers the name scopes.
                count = -1
                hidden_layer = []
                with tf.name_scope('lstm_layer'):
                    for unit_num in config.lstm_hidden_size:
                        count = count + 1
                        with tf.name_scope('lstm_cell_' + str(count)):
                            lstm_cell = tf.contrib.rnn.LSTMCell(unit_num)
                        hidden_layer.append(lstm_cell)

                stack = tf.contrib.rnn.MultiRNNCell(hidden_layer,
                                                    state_is_tuple=True)
                init_state = stack.zero_state(self.batch_size,
                                              dtype=tf.float32)
                outputs, last_states = tf.nn.dynamic_rnn(
                    stack,
                    self.cnn_results,
                    self.new_seq_len,
                    initial_state=init_state,
                    dtype=tf.float32,
                    time_major=False)  # batch-major input; lengths are the pooled ones
                print('outputs:', outputs)  # [batch_size, pooled_steps, lstm_hidden_size[-1]]
                print('last_states:', last_states)
                # Every time step's output is used (not only the last state),
                # flattened so the dense layers below run once per frame.
                h_output = tf.reshape(outputs,
                                      [-1, config.lstm_hidden_size[-1]
                                       ])  # [batch_size*pooled_steps, lstm_hidden_size[-1]]
                self.h_output = h_output
                print("self.h_output:", self.h_output)

            with tf.name_scope('dense_net'):
                # f_dense.shape: [batch_size*pooled_steps, config.full_connect_layer_unit]
                f_dense = tf.contrib.layers.fully_connected(
                    self.h_output,
                    config.full_connect_layer_unit,
                    activation_fn=None,
                    scope='full_conn')
                # Optional batch norm; updates_collections=None makes the
                # moving-average updates run in place.
                if config.do_batchnorm:
                    self.f_dense = tf.contrib.layers.batch_norm(
                        f_dense,
                        decay=0.99,
                        center=True,
                        scale=True,
                        updates_collections=None,
                        is_training=self.is_train,
                        scope='bn')
                else:
                    self.f_dense = f_dense
                # logits.shape: [batch_size*pooled_steps, config.class_num]
                logits = tf.contrib.layers.fully_connected(self.f_dense,
                                                           config.class_num,
                                                           activation_fn=None,
                                                           scope='logits')
                # back to batch-major: [batch_size, pooled_steps, config.class_num]
                self.logit = tf.reshape(
                    logits, [self.batch_size, -1, config.class_num]
                )  # [batch_size*pooled_steps, class_num] --> [batch_size, pooled_steps, class_num]
                logits = tf.transpose(self.logit, (1, 0, 2))
                self.logits = logits
                # time-major for the CTC ops: [pooled_steps, batch_size, class_num]
                print("logits:", self.logits)

            with tf.name_scope('accuracy'):
                self.global_step = tf.Variable(0, trainable=False)
                # Convert the dense label matrix plus per-example label
                # lengths into the SparseTensor that ctc_loss expects.
                targets = tf.contrib.keras.backend.ctc_label_dense_to_sparse(
                    self.input_y, self.lab_len)
                loss = tf.nn.ctc_loss(
                    labels=targets,
                    inputs=self.logits,
                    sequence_length=self.new_seq_len)  # pooled lengths, ceil(seq_len / 8)
                self.cost = tf.reduce_mean(loss)
                self.optimizer = tf.train.AdamOptimizer(
                    config.initial_learning_rate).minimize(
                        self.cost, self.global_step)
                self.decoded, log_prob = tf.nn.ctc_beam_search_decoder(
                    self.logits, self.new_seq_len, merge_repeated=False)
                # Dense view of the best path, padded with class_num - 1
                # (presumably the CTC blank index -- confirm).
                self.decoded_dense = tf.sparse_tensor_to_dense(
                    self.decoded[0], default_value=(config.class_num - 1))
                # NOTE: despite the name, `acc` is the mean edit distance
                # (tf.edit_distance normalizes by truth length by default),
                # so lower is better.
                dis = tf.edit_distance(tf.cast(self.decoded[0], tf.int32),
                                       targets)
                self.acc = tf.reduce_mean(dis)

            # A Saver keeping up to 30 checkpoints is created only when
            # model output is enabled in the config.
            if config.out_model:
                saver = tf.train.Saver(max_to_keep=30)
                self.saver = saver
# Now we can perform address matching: score one test record against a set of
# reference records by combining address edit distance with zip-code distance.
# Create the session used to evaluate the ops below
sess = tf.Session()

# Placeholders
# Addresses are fed as sparse string tensors because tf.edit_distance takes
# SparseTensor inputs. The zip placeholders are dense floats of shape
# [None, 1] (test) and [None, n] (reference); n is defined outside this excerpt.
test_address = tf.sparse_placeholder( dtype=tf.string)
test_zip = tf.placeholder(shape=[None, 1], dtype=tf.float32)
ref_address = tf.sparse_placeholder(dtype=tf.string)
ref_zip = tf.placeholder(shape=[None, n], dtype=tf.float32)

# Declare Zip code distance for a test zip and reference set:
# squared difference, with the [None, 1] test zip broadcast across the
# n reference columns.
zip_dist = tf.square(tf.subtract(ref_zip, test_zip))

# Declare Edit distance for address (normalized by the reference length)
address_dist = tf.edit_distance(test_address, ref_address, normalize=True)

# Create similarity scores
# zip_max / zip_min gather entries of the squeezed distance tensor at the
# arg-max / arg-min positions along axis 1.
# NOTE(review): tf.squeeze changes the rank when a dimension is 1, so what
# tf.gather selects depends on the fed batch size -- verify that this really
# yields the per-row max/min for the shapes that are fed.
zip_max = tf.gather(tf.squeeze(zip_dist), tf.argmax(zip_dist, 1))
zip_min = tf.gather(tf.squeeze(zip_dist), tf.argmin(zip_dist, 1))
# Min-max rescaling so the closest zip scores 1 and the farthest scores 0.
# NOTE(review): the denominator is 0 when all distances are equal.
zip_sim = tf.div(tf.subtract(zip_max, zip_dist), tf.subtract(zip_max, zip_min))
# Normalized edit distance turned into a similarity.
address_sim = tf.subtract(1., address_dist)

# Combine distance functions: equal-weight average of the two similarities.
address_weight = 0.5
zip_weight = 1. - address_weight
weighted_sim = tf.add(tf.transpose(tf.multiply(address_weight, address_sim)), tf.multiply(zip_weight, zip_sim))

# Predict: Get max similarity entry (index along axis 1)
top_match_index = tf.argmax(weighted_sim, 1)
# Sparse 3-D encodings of the hypothesis and truth character lists. Every
# entry sits at [0, 0, k]: one batch row holding one sequence of characters.
# The last dense_shape dimension is the number of characters stored (4 and 5)
# so that it bounds every index listed, matching how h2/t2 are declared below.
# tf.edit_distance only uses the leading dimensions for its output shape, so
# the printed distance is unaffected.
h1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3]], hypothesis,
                     [1, 1, 4])

t1 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4]],
                     truth, [1, 1, 5])

# Uncomment to inspect the raw sparse values:
# print(sess.run(h1))
# print(sess.run(t1))

# Both inputs of tf.edit_distance are sparse tensors. From the TensorFlow docs:
#   This operation takes variable-length sequences (`hypothesis` and `truth`),
#   each provided as a `SparseTensor`, and computes the Levenshtein distance.
#   You can normalize the edit distance by length of `truth` by setting
#   `normalize` to true.

print(sess.run(tf.edit_distance(h1, t1, normalize=False)))

#----------------------------------
# Compute the edit distance between ('bear','beer') and 'beers':
# the two hypothesis words are stored as one flat character list and told
# apart by the middle index ([0, 0, k] -> 'bear', [0, 1, k] -> 'beer').
hypothesis2 = list('bearbeer')
truth2 = list('beersbeers')

h2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 1, 0],
                      [0, 1, 1], [0, 1, 2], [0, 1, 3]], hypothesis2, [1, 2, 4])

t2 = tf.SparseTensor([[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 0, 3], [0, 0, 4],
                      [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 1, 3], [0, 1, 4]],
                     truth2, [1, 2, 5])
# normalize: boolean; when True the Levenshtein distance is divided by the
# length of the truth sequence. Defaults to True.
# 'bear' vs 'beers': edit distance 2, truth length 5 -> normalized 2/5 = 0.4
# 'beer' vs 'beers': edit distance 1, truth length 5 -> normalized 1/5 = 0.2