Example #1
    def check(self, encoder_type, attention_type, label_type='character'):

        print('==================================================')
        print('  encoder_type: %s' % encoder_type)
        print('  attention_type: %s' % attention_type)
        print('  label_type: %s' % label_type)
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 4
            inputs, labels, inputs_seq_len, labels_seq_len = generate_data(
                label_type=label_type,
                model='attention',
                batch_size=batch_size)

            # Define model graph
            num_classes = 27 if label_type == 'character' else 61
            model = AttentionSeq2Seq(input_size=inputs[0].shape[1],
                                     encoder_type=encoder_type,
                                     encoder_num_units=256,
                                     encoder_num_layers=2,
                                     encoder_num_proj=None,
                                     attention_type=attention_type,
                                     attention_dim=128,
                                     decoder_type='lstm',
                                     decoder_num_units=256,
                                     decoder_num_layers=1,
                                     embedding_dim=64,
                                     num_classes=num_classes,
                                     sos_index=num_classes,
                                     eos_index=num_classes + 1,
                                     max_decode_length=100,
                                     use_peephole=True,
                                     splice=1,
                                     parameter_init=0.1,
                                     clip_grad_norm=5.0,
                                     clip_activation_encoder=50,
                                     clip_activation_decoder=50,
                                     weight_decay=1e-8,
                                     time_major=True,
                                     sharpening_factor=1.0,
                                     logits_temperature=1.0)

            # Define placeholders
            model.create_placeholders()
            learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

            # Add each operation to the graph
            loss_op, logits, decoder_outputs_train, decoder_outputs_infer = model.compute_loss(
                model.inputs_pl_list[0],
                model.labels_pl_list[0],
                model.inputs_seq_len_pl_list[0],
                model.labels_seq_len_pl_list[0],
                model.keep_prob_encoder_pl_list[0],
                model.keep_prob_decoder_pl_list[0],
                model.keep_prob_embedding_pl_list[0])
            train_op = model.train(loss_op,
                                   optimizer='adam',
                                   learning_rate=learning_rate_pl)
            decode_op_train, decode_op_infer = model.decode(
                decoder_outputs_train, decoder_outputs_infer)
            ler_op = model.compute_ler(model.labels_st_true_pl,
                                       model.labels_st_pred_pl)

            # Define learning rate controller
            learning_rate = 1e-3
            lr_controller = Controller(learning_rate_init=learning_rate,
                                       decay_start_epoch=20,
                                       decay_rate=0.9,
                                       decay_patient_epoch=10,
                                       lower_better=True)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()),
                   "{:,}".format(total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: labels,
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.labels_seq_len_pl_list[0]: labels_seq_len,
                model.keep_prob_encoder_pl_list[0]: 0.8,
                model.keep_prob_decoder_pl_list[0]: 1.0,
                model.keep_prob_embedding_pl_list[0]: 1.0,
                learning_rate_pl: learning_rate
            }

            idx2phone = Idx2phone(map_file_path='./phone61.txt')

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 1000
                start_time_step = time.time()
                for step in range(max_steps):

                    # Compute loss
                    _, loss_train = sess.run(
                        [train_op, loss_op], feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(model.clipped_grads,
                    #                  feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Change to evaluation mode
                        feed_dict[model.keep_prob_encoder_pl_list[0]] = 1.0
                        feed_dict[model.keep_prob_decoder_pl_list[0]] = 1.0
                        feed_dict[model.keep_prob_embedding_pl_list[0]] = 1.0

                        # Predict class ids
                        predicted_ids_train, predicted_ids_infer = sess.run(
                            [decode_op_train, decode_op_infer],
                            feed_dict=feed_dict)

                        # Compute accuracy
                        try:
                            feed_dict_ler = {
                                model.labels_st_true_pl: list2sparsetensor(
                                    labels, padded_value=model.eos_index),
                                model.labels_st_pred_pl: list2sparsetensor(
                                    predicted_ids_infer, padded_value=model.eos_index)
                            }
                            ler_train = sess.run(
                                ler_op, feed_dict=feed_dict_ler)
                        except IndexError:
                            ler_train = 1

                        duration_step = time.time() - start_time_step
                        print('Step %d: loss = %.3f / ler = %.3f (%.3f sec) / lr = %.5f' %
                              (step + 1, loss_train, ler_train, duration_step, learning_rate))
                        start_time_step = time.time()

                        # Visualize
                        if label_type == 'character':
                            print('True            : %s' %
                                  idx2alpha(labels[0]))
                            print('Pred (Training) : <%s' %
                                  idx2alpha(predicted_ids_train[0]))
                            print('Pred (Inference): <%s' %
                                  idx2alpha(predicted_ids_infer[0]))
                        else:
                            print('True            : %s' %
                                  idx2phone(labels[0]))
                            print('Pred (Training) : < %s' %
                                  idx2phone(predicted_ids_train[0]))
                            print('Pred (Inference): < %s' %
                                  idx2phone(predicted_ids_infer[0]))

                        if ler_train < 0.1:
                            print('Model converged.')
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=step,
                            value=ler_train)
                        feed_dict[learning_rate_pl] = learning_rate

                        # Restore the training keep probability
                        # (otherwise dropout stays disabled after the
                        # first evaluation step)
                        feed_dict[model.keep_prob_encoder_pl_list[0]] = 0.8
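
The feed dict for the LER op above converts padded label batches with list2sparsetensor. A minimal sketch of what such a helper plausibly does (the real implementation lives in this repo; the NumPy version below, including its name, is an illustrative assumption): gather the (batch, time) coordinates of every non-padding entry into the indices/values/dense_shape triple that a tf.SparseTensorValue expects.

import numpy as np

def list2sparsetensor_sketch(labels, padded_value=-1):
    """Convert a padded 2-D label batch into SparseTensor components."""
    indices, values = [], []
    for b, seq in enumerate(labels):
        for t, label in enumerate(seq):
            if label == padded_value:  # skip padding entries
                continue
            indices.append([b, t])
            values.append(label)
    dense_shape = [len(labels), max(len(seq) for seq in labels)]
    return (np.asarray(indices, dtype=np.int64),
            np.asarray(values, dtype=np.int32),
            np.asarray(dense_shape, dtype=np.int64))

# Two sequences padded with -1:
idx, vals, shape = list2sparsetensor_sketch([[3, 1, 2, -1], [5, 4, -1, -1]])
# idx -> [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]]; vals -> [3, 1, 2, 5, 4]

sparsetensor2list, used in the later examples, would be the inverse: group values by the batch row of indices.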
Example #2

    def check(self, decoder_type):

        print('==================================================')
        print('  decoder_type: %s' % decoder_type)
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 2
            num_stack = 2
            inputs, labels, inputs_seq_len = generate_data(
                label_type='character',
                model='ctc',
                batch_size=batch_size,
                num_stack=num_stack,
                splice=1)
            max_time = inputs.shape[1]

            # Define model graph
            model = CTC(encoder_type='blstm',
                        input_size=inputs[0].shape[-1],
                        splice=1,
                        num_stack=num_stack,
                        num_units=256,
                        num_layers=2,
                        num_classes=27,
                        lstm_impl='LSTMBlockCell',
                        parameter_init=0.1,
                        clip_grad_norm=5.0,
                        clip_activation=50,
                        num_proj=256,
                        weight_decay=1e-6)

            # Define placeholders
            model.create_placeholders()

            # Add each operation to the graph
            _, logits = model.compute_loss(
                model.inputs_pl_list[0],
                model.labels_pl_list[0],
                model.inputs_seq_len_pl_list[0],
                model.keep_prob_pl_list[0])
            beam_width = 20 if 'beam_search' in decoder_type else 1
            decode_op = model.decoder(logits,
                                      model.inputs_seq_len_pl_list[0],
                                      beam_width=beam_width)
            ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
            posteriors_op = model.posteriors(logits, blank_prior=1)

            if decoder_type == 'np_greedy':
                decoder = GreedyDecoder(blank_index=model.num_classes)
            elif decoder_type == 'np_beam_search':
                decoder = BeamSearchDecoder(space_index=26,
                                            blank_index=model.num_classes - 1)

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: list2sparsetensor(labels,
                                                           padded_value=-1),
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_pl_list[0]: 1.0
            }

            # Create a saver for writing training checkpoints
            saver = tf.train.Saver()

            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state('./')

                # If a checkpoint exists
                if ckpt:
                    model_path = ckpt.model_checkpoint_path
                    saver.restore(sess, model_path)
                    print("Model restored: " + model_path)
                else:
                    raise ValueError('No checkpoint found.')

                if decoder_type in ['tf_greedy', 'tf_beam_search']:
                    # Decode
                    labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                    labels_pred = sparsetensor2list(
                        labels_pred_st, batch_size=batch_size)

                    # Compute accuracy
                    cer = sess.run(ler_op, feed_dict=feed_dict)
                else:
                    # Compute CTC posteriors
                    probs = sess.run(posteriors_op, feed_dict=feed_dict)
                    probs = probs.reshape(-1, max_time, model.num_classes)

                    if decoder_type == 'np_greedy':
                        # Decode
                        labels_pred = decoder(probs=probs,
                                              seq_len=inputs_seq_len)

                    elif decoder_type == 'np_beam_search':
                        # Decode
                        labels_pred, scores = decoder(probs=probs,
                                                      seq_len=inputs_seq_len,
                                                      beam_width=beam_width)

                    # Compute accuracy
                    cer = compute_cer(str_pred=idx2alpha(labels_pred[0]),
                                      str_true=idx2alpha(labels[0]),
                                      normalize=True)

                # Visualize
                print('CER: %.3f %%' % (cer * 100))
                print('Ref: %s' % idx2alpha(labels[0]))
                print('Hyp: %s' % idx2alpha(labels_pred[0]))
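
The np_greedy branch above relies on GreedyDecoder, which presumably performs CTC best-path decoding: take the argmax class per frame, collapse consecutive repeats, then drop blanks. A self-contained sketch under that assumption (the function below and its shapes are illustrative, not the repo's class):

import numpy as np

def ctc_greedy_decode(probs, seq_len, blank_index):
    """Best-path CTC decoding: argmax, collapse repeats, drop blanks."""
    results = []
    for b in range(probs.shape[0]):
        best_path = probs[b, :seq_len[b]].argmax(axis=-1)
        collapsed = []
        for label in best_path:
            if not collapsed or label != collapsed[-1]:  # collapse repeats
                collapsed.append(int(label))
        results.append([l for l in collapsed if l != blank_index])
    return results

# blank_index = 2: the frames decode to [blank, blank, 0, 1] -> [0, 1]
probs = np.array([[[0.1, 0.2, 0.7], [0.1, 0.2, 0.7],
                   [0.8, 0.1, 0.1], [0.1, 0.8, 0.1]]])
print(ctc_greedy_decode(probs, np.array([4]), blank_index=2))  # [[0, 1]]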
Example #3
    def check_training(self, label_type):
        print('----- ' + label_type + ' -----')
        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 1
            inputs, att_labels, inputs_seq_len, att_labels_seq_len, ctc_labels_st = generate_data(
                label_type=label_type,
                model='joint_ctc_attention',
                batch_size=batch_size)

            # Define model graph
            att_num_classes = 26 + 2 if label_type == 'character' else 61 + 2
            ctc_num_classes = 26 if label_type == 'character' else 61
            # model = load(model_type=model_type)
            network = JointCTCAttention(input_size=inputs[0].shape[1],
                                        encoder_num_unit=256,
                                        encoder_num_layer=2,
                                        attention_dim=128,
                                        attention_type='content',
                                        decoder_num_unit=256,
                                        decoder_num_layer=1,
                                        embedding_dim=20,
                                        att_num_classes=att_num_classes,
                                        ctc_num_classes=ctc_num_classes,
                                        att_task_weight=0.5,
                                        sos_index=att_num_classes - 2,
                                        eos_index=att_num_classes - 1,
                                        max_decode_length=50,
                                        attention_weights_tempareture=1.0,
                                        logits_tempareture=1.0,
                                        parameter_init=0.1,
                                        clip_grad=5.0,
                                        clip_activation_encoder=50,
                                        clip_activation_decoder=50,
                                        dropout_ratio_input=0.9,
                                        dropout_ratio_hidden=0.9,
                                        dropout_ratio_output=1.0,
                                        weight_decay=1e-8,
                                        beam_width=1,
                                        time_major=False)

            # Define placeholders
            network.create_placeholders()
            learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

            # Add each operation to the graph
            loss_op, att_logits, ctc_logits, decoder_outputs_train, decoder_outputs_infer = network.compute_loss(
                network.inputs_pl_list[0], network.att_labels_pl_list[0],
                network.inputs_seq_len_pl_list[0],
                network.att_labels_seq_len_pl_list[0],
                network.ctc_labels_pl_list[0],
                network.keep_prob_input_pl_list[0],
                network.keep_prob_hidden_pl_list[0],
                network.keep_prob_output_pl_list[0])
            train_op = network.train(loss_op,
                                     optimizer='adam',
                                     learning_rate=learning_rate_pl)
            decode_op_train, decode_op_infer = network.decoder(
                decoder_outputs_train, decoder_outputs_infer)
            ler_op = network.compute_ler(network.att_labels_st_true_pl,
                                         network.att_labels_st_pred_pl)

            # Define learning rate controller
            learning_rate = 1e-3
            lr_controller = Controller(learning_rate_init=learning_rate,
                                       decay_start_epoch=10,
                                       decay_rate=0.99,
                                       decay_patient_epoch=5,
                                       lower_better=True)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()), "{:,}".format(
                      total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                network.inputs_pl_list[0]: inputs,
                network.att_labels_pl_list[0]: att_labels,
                network.inputs_seq_len_pl_list[0]: inputs_seq_len,
                network.att_labels_seq_len_pl_list[0]: att_labels_seq_len,
                network.ctc_labels_pl_list[0]: ctc_labels_st,
                network.keep_prob_input_pl_list[0]:
                network.dropout_ratio_input,
                network.keep_prob_hidden_pl_list[0]:
                network.dropout_ratio_hidden,
                network.keep_prob_output_pl_list[0]:
                network.dropout_ratio_output,
                learning_rate_pl: learning_rate
            }

            map_file_path = '../../experiments/timit/metrics/mapping_files/attention/phone61_to_num.txt'

            with tf.Session() as sess:

                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 400
                start_time_global = time.time()
                start_time_step = time.time()
                ler_train_pre = 1
                not_improved_count = 0
                for step in range(max_steps):

                    # Compute loss
                    _, loss_train = sess.run([train_op, loss_op],
                                             feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(network.clipped_grads,
                    #                  feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Change to evaluation mode
                        feed_dict[network.keep_prob_input_pl_list[0]] = 1.0
                        feed_dict[network.keep_prob_hidden_pl_list[0]] = 1.0
                        feed_dict[network.keep_prob_output_pl_list[0]] = 1.0

                        # Predict class ids
                        predicted_ids_train, predicted_ids_infer = sess.run(
                            [decode_op_train, decode_op_infer],
                            feed_dict=feed_dict)

                        # Compute accuracy
                        try:
                            feed_dict_ler = {
                                network.att_labels_st_true_pl:
                                list2sparsetensor(att_labels, padded_value=27),
                                network.att_labels_st_pred_pl:
                                list2sparsetensor(predicted_ids_infer,
                                                  padded_value=27)
                            }
                            ler_train = sess.run(ler_op,
                                                 feed_dict=feed_dict_ler)
                        except ValueError:
                            ler_train = 1

                        duration_step = time.time() - start_time_step
                        print(
                            'Step %d: loss = %.3f / ler = %.4f (%.3f sec) / lr = %.5f'
                            % (step + 1, loss_train, ler_train, duration_step,
                               learning_rate))
                        start_time_step = time.time()

                        # Visualize
                        if label_type == 'character':
                            print('True            : %s' %
                                  idx2alpha(att_labels[0]))
                            print('Pred (Training) : <%s' %
                                  idx2alpha(predicted_ids_train[0]))
                            print('Pred (Inference): <%s' %
                                  idx2alpha(predicted_ids_infer[0]))
                        else:
                            print('True            : %s' %
                                  idx2phone(att_labels[0], map_file_path))
                            print('Pred (Training) : < %s' % idx2phone(
                                predicted_ids_train[0], map_file_path))
                            print('Pred (Inference): < %s' % idx2phone(
                                predicted_ids_infer[0], map_file_path))

                        if ler_train >= ler_train_pre:
                            not_improved_count += 1
                        else:
                            not_improved_count = 0
                        if not_improved_count >= 10:
                            print('Model converged.')
                            break
                        ler_train_pre = ler_train

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=step,
                            value=ler_train)
                        feed_dict[learning_rate_pl] = learning_rate

                        # Restore the training keep probabilities
                        # (otherwise dropout stays disabled after the
                        # first evaluation step)
                        feed_dict[network.keep_prob_input_pl_list[0]] = \
                            network.dropout_ratio_input
                        feed_dict[network.keep_prob_hidden_pl_list[0]] = \
                            network.dropout_ratio_hidden

                duration_global = time.time() - start_time_global
                print('Total time: %.3f sec' % (duration_global))
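
Every training example builds a Controller with decay_start_epoch, decay_rate, and decay_patient_epoch. A plausible reading of those knobs, sketched as an assumption rather than the repo's actual code: once past decay_start_epoch, multiply the learning rate by decay_rate whenever the monitored value has failed to improve for decay_patient_epoch consecutive calls.

class ControllerSketch(object):
    """Patience-based exponential learning-rate decay (illustrative)."""

    def __init__(self, learning_rate_init, decay_start_epoch, decay_rate,
                 decay_patient_epoch, lower_better=True):
        self.learning_rate_init = learning_rate_init
        self.decay_start_epoch = decay_start_epoch
        self.decay_rate = decay_rate
        self.decay_patient_epoch = decay_patient_epoch
        self.lower_better = lower_better
        self.best_value = float('inf') if lower_better else float('-inf')
        self.not_improved = 0

    def decay_lr(self, learning_rate, epoch, value):
        improved = (value < self.best_value if self.lower_better
                    else value > self.best_value)
        if improved:
            self.best_value = value
            self.not_improved = 0
        else:
            self.not_improved += 1
        if (epoch >= self.decay_start_epoch
                and self.not_improved >= self.decay_patient_epoch):
            learning_rate *= self.decay_rate  # decay and reset the patience
            self.not_improved = 0
        return learning_rate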
Example #4

    def check(self, encoder_type, lstm_impl, time_major=False):

        print('==================================================')
        print('  encoder_type: %s' % str(encoder_type))
        print('  lstm_impl: %s' % str(lstm_impl))
        print('  time_major: %s' % str(time_major))
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 2
            inputs, labels_char, labels_phone, inputs_seq_len = generate_data(
                label_type='multitask', model='ctc', batch_size=batch_size)

            # Define model graph
            num_classes_main = 27
            num_classes_sub = 61
            model = MultitaskCTC(
                encoder_type=encoder_type,
                input_size=inputs[0].shape[1],
                num_units=256,
                num_layers_main=2,
                num_layers_sub=1,
                num_classes_main=num_classes_main,
                num_classes_sub=num_classes_sub,
                main_task_weight=0.8,
                lstm_impl=lstm_impl,
                parameter_init=0.1,
                clip_grad_norm=5.0,
                clip_activation=50,
                num_proj=256,
                weight_decay=1e-8,
                # bottleneck_dim=50,
                bottleneck_dim=None,
                time_major=time_major)

            # Define placeholders
            model.create_placeholders()
            learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

            # Add each operation to the graph
            loss_op, logits_main, logits_sub = model.compute_loss(
                model.inputs_pl_list[0], model.labels_pl_list[0],
                model.labels_sub_pl_list[0], model.inputs_seq_len_pl_list[0],
                model.keep_prob_pl_list[0])
            train_op = model.train(loss_op,
                                   optimizer='adam',
                                   learning_rate=learning_rate_pl)
            decode_op_main, decode_op_sub = model.decoder(
                logits_main,
                logits_sub,
                model.inputs_seq_len_pl_list[0],
                beam_width=20)
            ler_op_main, ler_op_sub = model.compute_ler(
                decode_op_main, decode_op_sub, model.labels_pl_list[0],
                model.labels_sub_pl_list[0])

            # Define learning rate controller
            learning_rate = 1e-3
            lr_controller = Controller(learning_rate_init=learning_rate,
                                       decay_start_epoch=20,
                                       decay_rate=0.9,
                                       decay_patient_epoch=5,
                                       lower_better=True)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()), "{:,}".format(
                      total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]:
                inputs,
                model.labels_pl_list[0]:
                list2sparsetensor(labels_char, padded_value=-1),
                model.labels_sub_pl_list[0]:
                list2sparsetensor(labels_phone, padded_value=-1),
                model.inputs_seq_len_pl_list[0]:
                inputs_seq_len,
                model.keep_prob_pl_list[0]:
                0.9,
                learning_rate_pl:
                learning_rate
            }

            idx2phone = Idx2phone(map_file_path='./phone61.txt')

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 1000
                start_time_step = time.time()
                for step in range(max_steps):

                    # Compute loss
                    _, loss_train = sess.run([train_op, loss_op],
                                             feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(model.clipped_grads,
                    #                  feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Change to evaluation mode
                        feed_dict[model.keep_prob_pl_list[0]] = 1.0

                        # Compute accuracy
                        ler_train_char, ler_train_phone = sess.run(
                            [ler_op_main, ler_op_sub], feed_dict=feed_dict)

                        duration_step = time.time() - start_time_step
                        print(
                            'Step %d: loss = %.3f / cer = %.3f / per = %.3f (%.3f sec) / lr = %.5f'
                            % (step + 1, loss_train, ler_train_char,
                               ler_train_phone, duration_step, learning_rate))
                        start_time_step = time.time()

                        # Visualize
                        labels_pred_char_st, labels_pred_phone_st = sess.run(
                            [decode_op_main, decode_op_sub],
                            feed_dict=feed_dict)
                        labels_pred_char = sparsetensor2list(
                            labels_pred_char_st, batch_size=batch_size)
                        labels_pred_phone = sparsetensor2list(
                            labels_pred_phone_st, batch_size=batch_size)

                        print('Character')
                        try:
                            print('  Ref: %s' % idx2alpha(labels_char[0]))
                            print('  Hyp: %s' % idx2alpha(labels_pred_char[0]))
                        except IndexError:
                            print('  Ref: %s' % idx2alpha(labels_char[0]))
                            print('  Hyp: %s' % '')
                            # NOTE: reached when nothing is predicted

                        print('Phone')
                        try:
                            print('  Ref: %s' % idx2phone(labels_phone[0]))
                            print('  Hyp: %s' %
                                  idx2phone(labels_pred_phone[0]))
                        except IndexError:
                            print('  Ref: %s' % idx2phone(labels_phone[0]))
                            print('  Hyp: %s' % '')
                            # NOTE: reached when nothing is predicted
                        print('-' * 30)

                        if ler_train_char < 0.1:
                            print('Model converged.')
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=step,
                            value=ler_train_char)
                        feed_dict[learning_rate_pl] = learning_rate

                        # Restore the training keep probability
                        # (otherwise dropout stays disabled after the
                        # first evaluation step)
                        feed_dict[model.keep_prob_pl_list[0]] = 0.9
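
main_task_weight=0.8 suggests the character (main) and phone (sub) CTC losses are mixed convexly inside MultitaskCTC.compute_loss. A hedged sketch of that combination (the weighting scheme is inferred from the parameter name, not taken from the repo):

def joint_multitask_loss(loss_main, loss_sub, main_task_weight=0.8):
    """Convex mix of the main and sub task losses."""
    assert 0.0 <= main_task_weight <= 1.0
    return main_task_weight * loss_main + (1.0 - main_task_weight) * loss_sub

print(joint_multitask_loss(1.2, 3.4))  # 0.8 * 1.2 + 0.2 * 3.4 -> ~1.64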
Example #5

    def check(self, encoder_type, lstm_impl, time_major=False):

        print('==================================================')
        print('  encoder_type: %s' % str(encoder_type))
        print('  lstm_impl: %s' % str(lstm_impl))
        print('  time_major: %s' % str(time_major))
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 2
            inputs, labels_char, labels_phone, inputs_seq_len = generate_data(
                label_type='multitask',
                model='ctc',
                batch_size=batch_size)

            # Define model graph
            num_classes_main = 27
            num_classes_sub = 61
            model = MultitaskCTC(
                encoder_type=encoder_type,
                input_size=inputs[0].shape[1],
                num_units=256,
                num_layers_main=2,
                num_layers_sub=1,
                num_classes_main=num_classes_main,
                num_classes_sub=num_classes_sub,
                main_task_weight=0.8,
                lstm_impl=lstm_impl,
                parameter_init=0.1,
                clip_grad_norm=5.0,
                clip_activation=50,
                num_proj=256,
                weight_decay=1e-8,
                # bottleneck_dim=50,
                bottleneck_dim=None,
                time_major=time_major)

            # Define placeholders
            model.create_placeholders()
            learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

            # Add each operation to the graph
            loss_op, logits_main, logits_sub = model.compute_loss(
                model.inputs_pl_list[0],
                model.labels_pl_list[0],
                model.labels_sub_pl_list[0],
                model.inputs_seq_len_pl_list[0],
                model.keep_prob_pl_list[0])
            train_op = model.train(
                loss_op,
                optimizer='adam',
                learning_rate=learning_rate_pl)
            decode_op_main, decode_op_sub = model.decoder(
                logits_main,
                logits_sub,
                model.inputs_seq_len_pl_list[0],
                beam_width=20)
            ler_op_main, ler_op_sub = model.compute_ler(
                decode_op_main, decode_op_sub,
                model.labels_pl_list[0], model.labels_sub_pl_list[0])

            # Define learning rate controller
            learning_rate = 1e-3
            lr_controller = Controller(learning_rate_init=learning_rate,
                                       decay_start_epoch=20,
                                       decay_rate=0.9,
                                       decay_patient_epoch=5,
                                       lower_better=True)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()),
                   "{:,}".format(total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: list2sparsetensor(labels_char, padded_value=-1),
                model.labels_sub_pl_list[0]: list2sparsetensor(labels_phone, padded_value=-1),
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_pl_list[0]: 0.9,
                learning_rate_pl: learning_rate
            }

            idx2phone = Idx2phone(map_file_path='./phone61.txt')

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 1000
                start_time_step = time.time()
                for step in range(max_steps):

                    # Compute loss
                    _, loss_train = sess.run(
                        [train_op, loss_op], feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(model.clipped_grads,
                    #                  feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Change to evaluation mode
                        feed_dict[model.keep_prob_pl_list[0]] = 1.0

                        # Compute accuracy
                        ler_train_char, ler_train_phone = sess.run(
                            [ler_op_main, ler_op_sub], feed_dict=feed_dict)

                        duration_step = time.time() - start_time_step
                        print('Step %d: loss = %.3f / cer = %.3f / per = %.3f (%.3f sec) / lr = %.5f' %
                              (step + 1, loss_train, ler_train_char,
                               ler_train_phone, duration_step, learning_rate))
                        start_time_step = time.time()

                        # Visualize
                        labels_pred_char_st, labels_pred_phone_st = sess.run(
                            [decode_op_main, decode_op_sub],
                            feed_dict=feed_dict)
                        labels_pred_char = sparsetensor2list(
                            labels_pred_char_st, batch_size=batch_size)
                        labels_pred_phone = sparsetensor2list(
                            labels_pred_phone_st, batch_size=batch_size)

                        print('Character')
                        try:
                            print('  Ref: %s' % idx2alpha(labels_char[0]))
                            print('  Hyp: %s' % idx2alpha(labels_pred_char[0]))
                        except IndexError:
                            print('  Ref: %s' % idx2alpha(labels_char[0]))
                            print('  Hyp: %s' % '')
                            # NOTE: reached when nothing is predicted

                        print('Phone')
                        try:
                            print('  Ref: %s' % idx2phone(labels_phone[0]))
                            print('  Hyp: %s' %
                                  idx2phone(labels_pred_phone[0]))
                        except IndexError:
                            print('  Ref: %s' % idx2phone(labels_phone[0]))
                            print('  Hyp: %s' % '')
                            # NOTE: reached when nothing is predicted
                        print('-' * 30)

                        if ler_train_char < 0.1:
                            print('Model converged.')
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=step,
                            value=ler_train_char)
                        feed_dict[learning_rate_pl] = learning_rate

                        # Restore the training keep probability
                        # (otherwise dropout stays disabled after the
                        # first evaluation step)
                        feed_dict[model.keep_prob_pl_list[0]] = 0.9
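
compute_ler compares the true and predicted SparseTensors; in TF1 code this is commonly tf.edit_distance(hypothesis, truth, normalize=True). An equivalent NumPy sketch of the normalized label error rate for a single pair of sequences (an illustration, not the repo's helper):

import numpy as np

def label_error_rate(ref, hyp):
    """Levenshtein distance between ref and hyp, normalized by len(ref)."""
    d = np.zeros((len(ref) + 1, len(hyp) + 1), dtype=np.int32)
    d[:, 0] = np.arange(len(ref) + 1)
    d[0, :] = np.arange(len(hyp) + 1)
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i, j] = min(d[i - 1, j] + 1,         # deletion
                          d[i, j - 1] + 1,         # insertion
                          d[i - 1, j - 1] + cost)  # substitution
    return d[-1, -1] / float(len(ref))

print(label_error_rate([1, 2, 3, 4], [1, 3, 4]))  # 0.25 (one deletion)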
Example #6
    def check_training(self,
                       encoder_type,
                       label_type,
                       lstm_impl='LSTMBlockCell',
                       save_params=False):

        print('==================================================')
        print('  encoder_type: %s' % encoder_type)
        print('  label_type: %s' % label_type)
        print('  lstm_impl: %s' % lstm_impl)
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 1
            splice = 11 if encoder_type in [
                'vgg_blstm', 'vgg_lstm', 'vgg_wang', 'resnet_wang', 'cnn_zhang'
            ] else 1
            inputs, labels_true_st, inputs_seq_len = generate_data(
                label_type=label_type,
                model='ctc',
                batch_size=batch_size,
                splice=splice)
            # NOTE: input_size must be an even number when using CudnnLSTM

            # Define model graph
            num_classes = 26 if label_type == 'character' else 61
            model = CTC(
                encoder_type=encoder_type,
                input_size=inputs[0].shape[-1] // splice,
                splice=splice,
                num_units=256,
                num_layers=2,
                num_classes=num_classes,
                lstm_impl=lstm_impl,
                parameter_init=0.1,
                clip_grad=5.0,
                clip_activation=50,
                num_proj=256,
                # bottleneck_dim=50,
                bottleneck_dim=None,
                weight_decay=1e-8)

            # Define placeholders
            model.create_placeholders()
            learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

            # Add each operation to the graph
            loss_op, logits = model.compute_loss(
                model.inputs_pl_list[0], model.labels_pl_list[0],
                model.inputs_seq_len_pl_list[0],
                model.keep_prob_input_pl_list[0],
                model.keep_prob_hidden_pl_list[0],
                model.keep_prob_output_pl_list[0])
            train_op = model.train(loss_op,
                                   optimizer='adam',
                                   learning_rate=learning_rate_pl)
            # NOTE: Adam does not run on CudnnLSTM
            decode_op = model.decoder(logits,
                                      model.inputs_seq_len_pl_list[0],
                                      beam_width=20)
            ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])

            # Define learning rate controller
            learning_rate = 1e-3
            lr_controller = Controller(learning_rate_init=learning_rate,
                                       decay_start_epoch=10,
                                       decay_rate=0.98,
                                       decay_patient_epoch=5,
                                       lower_better=True)

            if save_params:
                # Create a saver for writing training checkpoints
                saver = tf.train.Saver(max_to_keep=None)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            if lstm_impl != 'CudnnLSTM':
                parameters_dict, total_parameters = count_total_parameters(
                    tf.trainable_variables())
                for parameter_name in sorted(parameters_dict.keys()):
                    print("%s %d" %
                          (parameter_name, parameters_dict[parameter_name]))
                print("Total %d variables, %s M parameters" %
                      (len(parameters_dict.keys()), "{:,}".format(
                          total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: labels_true_st,
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_input_pl_list[0]: 1.0,
                model.keep_prob_hidden_pl_list[0]: 1.0,
                model.keep_prob_output_pl_list[0]: 1.0,
                learning_rate_pl: learning_rate
            }

            idx2phone = Idx2phone(map_file_path='./phone61_ctc.txt')

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 1000
                start_time_global = time.time()
                start_time_step = time.time()
                ler_train_pre = 1
                not_improved_count = 0
                for step in range(max_steps):

                    # Compute loss
                    _, loss_train = sess.run([train_op, loss_op],
                                             feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(model.clipped_grads,
                    #                  feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Change to evaluation mode
                        feed_dict[model.keep_prob_input_pl_list[0]] = 1.0
                        feed_dict[model.keep_prob_hidden_pl_list[0]] = 1.0
                        feed_dict[model.keep_prob_output_pl_list[0]] = 1.0

                        # Compute accuracy
                        ler_train = sess.run(ler_op, feed_dict=feed_dict)

                        duration_step = time.time() - start_time_step
                        print(
                            'Step %d: loss = %.3f / ler = %.3f (%.3f sec) / lr = %.5f'
                            % (step + 1, loss_train, ler_train, duration_step,
                               learning_rate))
                        start_time_step = time.time()

                        # Decode
                        labels_pred_st = sess.run(decode_op,
                                                  feed_dict=feed_dict)
                        labels_true = sparsetensor2list(labels_true_st,
                                                        batch_size=batch_size)

                        # Visualize
                        try:
                            labels_pred = sparsetensor2list(
                                labels_pred_st, batch_size=batch_size)
                            if label_type == 'character':
                                print('Ref: %s' % idx2alpha(labels_true[0]))
                                print('Hyp: %s' % idx2alpha(labels_pred[0]))
                            else:
                                print('Ref: %s' % idx2phone(labels_true[0]))
                                print('Hyp: %s' % idx2phone(labels_pred[0]))

                        except IndexError:
                            if label_type == 'character':
                                print('Ref: %s' % idx2alpha(labels_true[0]))
                                print('Hyp: %s' % '')
                            else:
                                print('Ref: %s' % idx2phone(labels_true[0]))
                                print('Hyp: %s' % '')
                            # NOTE: reached when nothing is predicted

                        if ler_train >= ler_train_pre:
                            not_improved_count += 1
                        else:
                            not_improved_count = 0
                        if ler_train < 0.05:
                            print('Model converged.')
                            if save_params:
                                # Save model (check point)
                                checkpoint_file = './model.ckpt'
                                save_path = saver.save(sess,
                                                       checkpoint_file,
                                                       global_step=1)
                                print("Model saved in file: %s" % save_path)
                            break
                        ler_train_pre = ler_train

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=step,
                            value=ler_train)
                        feed_dict[learning_rate_pl] = learning_rate

                duration_global = time.time() - start_time_global
                print('Total time: %.3f sec' % (duration_global))
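
splice=11 for the CNN-style encoders, combined with input_size=inputs[0].shape[-1] // splice, suggests each frame is concatenated with 5 left and 5 right context frames. A sketch of that kind of context splicing, assuming edge frames are padded by repetition (the repo's actual splicing helper is not shown here):

import numpy as np

def splice_frames(feats, context=5):
    """Concatenate each frame with its +/-context neighbors (edge-padded).

    (time, feat_dim) -> (time, feat_dim * (2 * context + 1)).
    """
    padded = np.pad(feats, ((context, context), (0, 0)), mode='edge')
    return np.concatenate(
        [padded[i:i + len(feats)] for i in range(2 * context + 1)], axis=1)

feats = np.random.randn(100, 40).astype(np.float32)
print(splice_frames(feats).shape)  # (100, 440), i.e. 40 * 11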
Example #7

    def check(self, decoder_type):

        print('==================================================')
        print('  decoder_type: %s' % decoder_type)
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 2
            num_stack = 2
            inputs, labels, inputs_seq_len = generate_data(
                label_type='character',
                model='ctc',
                batch_size=batch_size,
                num_stack=num_stack,
                splice=1)
            max_time = inputs.shape[1]

            # Define model graph
            model = CTC(encoder_type='blstm',
                        input_size=inputs[0].shape[-1],
                        splice=1,
                        num_stack=num_stack,
                        num_units=256,
                        num_layers=2,
                        num_classes=27,
                        lstm_impl='LSTMBlockCell',
                        parameter_init=0.1,
                        clip_grad_norm=5.0,
                        clip_activation=50,
                        num_proj=256,
                        weight_decay=1e-6)

            # Define placeholders
            model.create_placeholders()

            # Add each operation to the graph
            _, logits = model.compute_loss(model.inputs_pl_list[0],
                                           model.labels_pl_list[0],
                                           model.inputs_seq_len_pl_list[0],
                                           model.keep_prob_pl_list[0])
            beam_width = 20 if 'beam_search' in decoder_type else 1
            decode_op = model.decoder(logits,
                                      model.inputs_seq_len_pl_list[0],
                                      beam_width=beam_width)
            ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
            posteriors_op = model.posteriors(logits, blank_prior=1)

            if decoder_type == 'np_greedy':
                decoder = GreedyDecoder(blank_index=model.num_classes)
            elif decoder_type == 'np_beam_search':
                decoder = BeamSearchDecoder(space_index=26,
                                            blank_index=model.num_classes - 1)

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: list2sparsetensor(labels,
                                                           padded_value=-1),
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_pl_list[0]: 1.0
            }

            # Create a saver for writing training checkpoints
            saver = tf.train.Saver()

            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state('./')

                # If a checkpoint exists
                if ckpt:
                    model_path = ckpt.model_checkpoint_path
                    saver.restore(sess, model_path)
                    print("Model restored: " + model_path)
                else:
                    raise ValueError('No checkpoint found.')

                if decoder_type in ['tf_greedy', 'tf_beam_search']:
                    # Decode
                    labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size=batch_size)

                    # Compute accuracy
                    cer = sess.run(ler_op, feed_dict=feed_dict)
                else:
                    # Compute CTC posteriors
                    probs = sess.run(posteriors_op, feed_dict=feed_dict)
                    probs = probs.reshape(-1, max_time, model.num_classes)

                    if decoder_type == 'np_greedy':
                        # Decode
                        labels_pred = decoder(probs=probs,
                                              seq_len=inputs_seq_len)

                    elif decoder_type == 'np_beam_search':
                        # Decode
                        labels_pred, scores = decoder(probs=probs,
                                                      seq_len=inputs_seq_len,
                                                      beam_width=beam_width)

                    # Compute accuracy
                    cer = compute_cer(str_pred=idx2alpha(labels_pred[0]),
                                      str_true=idx2alpha(labels[0]),
                                      normalize=True)

                # Visualize
                print('CER: %.3f %%' % (cer * 100))
                print('Ref: %s' % idx2alpha(labels[0]))
                print('Hyp: %s' % idx2alpha(labels_pred[0]))
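
model.posteriors(logits, blank_prior=1) presumably applies a frame-wise softmax and rescales the blank column by a prior before the NumPy decoders consume the result; with blank_prior=1 the rescaling is a no-op. A sketch under that assumption (function name and layout are illustrative):

import numpy as np

def ctc_posteriors(logits, blank_index, blank_prior=1.0):
    """Frame-wise softmax with the blank column divided by a prior."""
    shifted = logits - logits.max(axis=-1, keepdims=True)  # for stability
    probs = np.exp(shifted)
    probs /= probs.sum(axis=-1, keepdims=True)
    probs[..., blank_index] /= blank_prior
    return probs

logits = np.random.randn(2, 10, 28).astype(np.float32)
print(ctc_posteriors(logits, blank_index=27).sum(axis=-1)[0, 0])  # ~1.0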
Example #8

    def check(self, encoder_type, label_type='character',
              lstm_impl=None, time_major=True, save_params=False):

        print('==================================================')
        print('  encoder_type: %s' % encoder_type)
        print('  label_type: %s' % label_type)
        print('  lstm_impl: %s' % lstm_impl)
        print('  time_major: %s' % str(time_major))
        print('  save_params: %s' % str(save_params))
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 2
            splice = 11 if encoder_type in ['vgg_blstm', 'vgg_lstm', 'cnn_zhang',
                                            'vgg_wang', 'resnet_wang', 'cldnn_wang'] else 1
            num_stack = 2
            inputs, labels, inputs_seq_len = generate_data(
                label_type=label_type,
                model='ctc',
                batch_size=batch_size,
                num_stack=num_stack,
                splice=splice)
            # NOTE: input_size must be an even number when using CudnnLSTM

            # Define model graph
            num_classes = 27 if label_type == 'character' else 61
            model = CTC(encoder_type=encoder_type,
                        input_size=inputs[0].shape[-1] // splice // num_stack,
                        splice=splice,
                        num_stack=num_stack,
                        num_units=256,
                        num_layers=2,
                        num_classes=num_classes,
                        lstm_impl=lstm_impl,
                        parameter_init=0.1,
                        clip_grad_norm=5.0,
                        clip_activation=50,
                        num_proj=256,
                        weight_decay=1e-10,
                        # bottleneck_dim=50,
                        bottleneck_dim=None,
                        time_major=time_major)

            # Define placeholders
            model.create_placeholders()
            learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

            # Add each operation to the graph
            loss_op, logits = model.compute_loss(
                model.inputs_pl_list[0],
                model.labels_pl_list[0],
                model.inputs_seq_len_pl_list[0],
                model.keep_prob_pl_list[0])
            train_op = model.train(loss_op,
                                   optimizer='nestrov',
                                   learning_rate=learning_rate_pl)
            # NOTE: Adam does not run on CudnnLSTM
            decode_op = model.decoder(logits,
                                      model.inputs_seq_len_pl_list[0],
                                      beam_width=20)
            ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])

            # Define learning rate controller
            learning_rate = 1e-4
            lr_controller = Controller(learning_rate_init=learning_rate,
                                       decay_start_epoch=50,
                                       decay_rate=0.9,
                                       decay_patient_epoch=10,
                                       lower_better=True)

            if save_params:
                # Create a saver for writing training checkpoints
                saver = tf.train.Saver(max_to_keep=None)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            if lstm_impl != 'CudnnLSTM':
                parameters_dict, total_parameters = count_total_parameters(
                    tf.trainable_variables())
                for parameter_name in sorted(parameters_dict.keys()):
                    print("%s %d" %
                          (parameter_name, parameters_dict[parameter_name]))
                print("Total %d variables, %s M parameters" %
                      (len(parameters_dict.keys()),
                       "{:,}".format(total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: list2sparsetensor(labels, padded_value=-1),
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_pl_list[0]: 1.0,
                learning_rate_pl: learning_rate
            }

            idx2phone = Idx2phone(map_file_path='./phone61.txt')

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 1000
                start_time_step = time.time()
                for step in range(max_steps):

                    # for debug
                    # encoder_outputs = sess.run(
                    #     model.encoder_outputs, feed_dict)
                    # print(encoder_outputs.shape)

                    # Compute loss
                    _, loss_train = sess.run(
                        [train_op, loss_op], feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(model.clipped_grads,
                    #                  feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Change to evaluation mode
                        feed_dict[model.keep_prob_pl_list[0]] = 1.0

                        # Compute accuracy
                        ler_train = sess.run(ler_op, feed_dict=feed_dict)

                        duration_step = time.time() - start_time_step
                        print('Step %d: loss = %.3f / ler = %.3f (%.3f sec) / lr = %.5f' %
                              (step + 1, loss_train, ler_train, duration_step, learning_rate))
                        start_time_step = time.time()

                        # Decode
                        labels_pred_st = sess.run(
                            decode_op, feed_dict=feed_dict)

                        # Visualize
                        try:
                            labels_pred = sparsetensor2list(
                                labels_pred_st, batch_size=batch_size)
                            if label_type == 'character':
                                print('Ref: %s' % idx2alpha(labels[0]))
                                print('Hyp: %s' % idx2alpha(labels_pred[0]))
                            else:
                                print('Ref: %s' % idx2phone(labels[0]))
                                print('Hyp: %s' % idx2phone(labels_pred[0]))

                        except IndexError:
                            if label_type == 'character':
                                print('Ref: %s' % idx2alpha(labels[0]))
                                print('Hyp: %s' % '')
                            else:
                                print('Ref: %s' % idx2phone(labels[0]))
                                print('Hyp: %s' % '')
                            # NOTE: reached when nothing is predicted

                        if ler_train < 0.1:
                            print('Model converged.')
                            if save_params:
                                # Save model (check point)
                                checkpoint_file = './model.ckpt'
                                save_path = saver.save(
                                    sess, checkpoint_file, global_step=2)
                                print("Model saved in file: %s" % save_path)
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=step,
                            value=ler_train)
                        feed_dict[learning_rate_pl] = learning_rate
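
num_stack=2 points to consecutive-frame stacking with an equal skip, which halves the time axis and doubles the per-frame feature size; that is why input_size divides the raw feature dimension by both splice and num_stack. A sketch of such stacking, with the tail-padding behavior as an assumption:

import numpy as np

def stack_frames(feats, num_stack=2, num_skip=2):
    """Stack num_stack consecutive frames, advancing by num_skip.

    (time, feat_dim) -> (ceil(time / num_skip), feat_dim * num_stack),
    zero-padding the last window when it runs past the end.
    """
    time_len, feat_dim = feats.shape
    stacked = []
    for t in range(0, time_len, num_skip):
        window = feats[t:t + num_stack]
        if len(window) < num_stack:  # zero-pad the tail window
            pad = np.zeros((num_stack - len(window), feat_dim), feats.dtype)
            window = np.concatenate([window, pad], axis=0)
        stacked.append(window.reshape(-1))
    return np.stack(stacked)

feats = np.random.randn(100, 40).astype(np.float32)
print(stack_frames(feats).shape)  # (50, 80)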