Example #1
0
    def check_loading(self, num_gpu, is_sorted):
        """Load mini-batches of the multitask dataset and sanity-check them.

        Builds a `Dataset` with character (main) and phone61 (sub) labels,
        draws `iter_per_epoch + 1` mini-batches from its generator and, for
        the single-GPU case, verifies that no input is shorter than its
        label sequence.

        Args:
            num_gpu: int, forwarded to `Dataset`. When > 1, inputs and labels
                are indexed per GPU and only the first shard is inspected.
            is_sorted: forwarded to `Dataset` as `is_sorted`
        Raises:
            ValueError: if an input is shorter than its label sequence
                (checked only when num_gpu == 1)
        """
        print('----- num_gpu: ' + str(num_gpu) + ', is_sorted: ' +
              str(is_sorted) + ' -----')

        batch_size = 64
        dataset = Dataset(data_type='train',
                          label_type_main='character',
                          label_type_sub='phone61',
                          batch_size=batch_size,
                          num_stack=3,
                          num_skip=3,
                          is_sorted=is_sorted,
                          is_progressbar=True,
                          num_gpu=num_gpu)

        tf.reset_default_graph()
        # `with tf.Session()` installs the session as the default one AND
        # closes it on exit (`as_default()` alone never closes it).
        with tf.Session() as sess:
            print('=> Loading mini-batch...')
            map_file_path_char = '../metrics/mapping_files/ctc/char2num.txt'
            map_file_path_phone = '../metrics/mapping_files/ctc/phone2num_61.txt'

            mini_batch = dataset.next_batch(session=sess)

            iter_per_epoch = int(dataset.data_num / (batch_size * num_gpu)) + 1
            # One extra iteration on top of `iter_per_epoch`
            for i in range(iter_per_epoch + 1):
                return_tuple = next(mini_batch)
                inputs = return_tuple[0]
                labels_char_st = return_tuple[1]
                labels_phone_st = return_tuple[2]

                if num_gpu > 1:
                    for inputs_gpu in inputs:
                        print(inputs_gpu.shape)
                    # Keep only the first GPU shard. `inputs` must be reduced
                    # together with the labels, otherwise `len(inputs)` below
                    # is the number of GPUs rather than the shard batch size.
                    inputs = inputs[0]
                    labels_char_st = labels_char_st[0]
                    labels_phone_st = labels_phone_st[0]

                labels_char = sparsetensor2list(labels_char_st,
                                                batch_size=len(inputs))
                labels_phone = sparsetensor2list(labels_phone_st,
                                                 batch_size=len(inputs))

                if num_gpu == 1:
                    # An input shorter than its labels is treated as invalid
                    for task_labels in (labels_char, labels_phone):
                        for inputs_i, labels_i in zip(inputs, task_labels):
                            if len(inputs_i) < len(labels_i):
                                raise ValueError(
                                    'input length %d < label length %d' %
                                    (len(inputs_i), len(labels_i)))

                str_true_char = num2char(labels_char[0], map_file_path_char)
                str_true_char = re.sub(r'_', ' ', str_true_char)
                str_true_phone = num2phone(labels_phone[0],
                                           map_file_path_phone)
def decode_test(session, decode_op, network, dataset, label_type,
                save_path=None):
    """Visualize label outputs of CTC model.

    Decodes every utterance of `dataset` one at a time and prints the
    utterance name, the reference labels and the decoded labels.

    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        label_type: string, phone39 or phone48 or phone61 or character
        save_path: if not None, the output is written to `decode.txt` under
            `network.model_dir` instead of the console.
            NOTE(review): the value of `save_path` itself is not used as the
            destination - confirm this is intended.
    """
    # Batch size is expected to be 1
    iteration = dataset.data_num

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=1)

    # Select the mapping file and the index -> string converter once
    if label_type == 'character':
        map_file_path = '../metrics/mapping_files/ctc/char2num.txt'
        map_fn = num2char
    else:
        map_file_path = '../metrics/mapping_files/ctc/phone2num_' + \
            label_type[5:7] + '.txt'
        map_fn = num2phone

    stdout_backup = sys.stdout
    out_file = None
    if save_path is not None:
        out_file = open(join(network.model_dir, 'decode.txt'), 'w')
        sys.stdout = out_file

    try:
        for _ in range(iteration):
            # Create feed dictionary for next mini batch
            inputs, labels_true_st, inputs_seq_len, input_names = next(
                mini_batch)

            feed_dict = {
                network.inputs: inputs,
                network.inputs_seq_len: inputs_seq_len,
                network.keep_prob_input: 1.0,
                network.keep_prob_hidden: 1.0
            }

            # Visualize
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            labels_true = sparsetensor2list(labels_true_st, batch_size=1)
            labels_pred = sparsetensor2list(labels_pred_st, batch_size=1)

            print('----- wav: %s -----' % input_names[0])
            print('True: %s' % map_fn(labels_true[0], map_file_path))
            print('Pred: %s' % map_fn(labels_pred[0], map_file_path))
    finally:
        # Restore the real stdout and flush/close the result file so the
        # redirection does not leak into the caller.
        if out_file is not None:
            sys.stdout = stdout_backup
            out_file.close()
Example #3
0
    def check_training(self):
        """Train the multitask BLSTM-CTC model on one fixed mini-batch.

        Builds the graph for `Multitask_BLSTM_CTC`, then repeatedly runs the
        training op on a single batch from `generate_data` for up to 400
        steps. Every 10 steps the character / phone label error rates are
        computed and one decoded example per task is printed. Training stops
        early when the character error rate has not improved for 5
        consecutive evaluations.
        """
        print('----- multitask -----')
        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 4
            inputs, labels_true_char_st, labels_true_phone_st, inputs_seq_len = generate_data(
                label_type='multitask', model='ctc', batch_size=batch_size)

            # Define placeholders
            inputs_pl = tf.placeholder(tf.float32,
                                       shape=[None, None, inputs.shape[-1]],
                                       name='input')
            indices_pl = tf.placeholder(tf.int64, name='indices')
            values_pl = tf.placeholder(tf.int32, name='values')
            shape_pl = tf.placeholder(tf.int64, name='shape')
            labels_pl = tf.SparseTensor(indices_pl, values_pl, shape_pl)
            indices_second_pl = tf.placeholder(tf.int64, name='indices_second')
            values_second_pl = tf.placeholder(tf.int32, name='values_second')
            shape_second_pl = tf.placeholder(tf.int64, name='shape_second')
            labels_second_pl = tf.SparseTensor(indices_second_pl,
                                               values_second_pl,
                                               shape_second_pl)
            inputs_seq_len_pl = tf.placeholder(tf.int64,
                                               shape=[None],
                                               name='inputs_seq_len')
            keep_prob_input_pl = tf.placeholder(tf.float32,
                                                name='keep_prob_input')
            keep_prob_hidden_pl = tf.placeholder(tf.float32,
                                                 name='keep_prob_hidden')

            # Define model graph
            output_size_main = 26
            output_size_second = 61
            network = Multitask_BLSTM_CTC(
                batch_size=batch_size,
                input_size=inputs[0].shape[1],
                num_unit=256,
                num_layer_main=2,
                num_layer_second=1,
                output_size_main=output_size_main,
                output_size_second=output_size_second,
                main_task_weight=0.8,
                parameter_init=0.1,
                clip_grad=5.0,
                clip_activation=50,
                dropout_ratio_input=1.0,
                dropout_ratio_hidden=1.0,
                num_proj=None,
                weight_decay=1e-6)

            # Add to the graph each operation
            loss_op, logits_main, logits_second = network.compute_loss(
                inputs_pl, labels_pl, labels_second_pl, inputs_seq_len_pl,
                keep_prob_input_pl, keep_prob_hidden_pl)
            learning_rate = 1e-3
            train_op = network.train(loss_op,
                                     optimizer='rmsprop',
                                     learning_rate_init=learning_rate,
                                     is_scheduled=False)
            decode_op_main, decode_op_second = network.decoder(
                logits_main,
                logits_second,
                inputs_seq_len_pl,
                decode_type='beam_search',
                beam_width=20)
            ler_op_main, ler_op_second = network.compute_ler(
                decode_op_main, decode_op_second, labels_pl, labels_second_pl)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            parameters_dict, total_parameters = count_total_parameters(
                tf.trainable_variables())
            for parameter_name in sorted(parameters_dict.keys()):
                print("%s %d" %
                      (parameter_name, parameters_dict[parameter_name]))
            print("Total %d variables, %s M parameters" %
                  (len(parameters_dict.keys()), "{:,}".format(
                      total_parameters / 1000000)))

            # Make feed dict (training mode: dropout as configured)
            feed_dict = {
                inputs_pl: inputs,
                labels_pl: labels_true_char_st,
                labels_second_pl: labels_true_phone_st,
                inputs_seq_len_pl: inputs_seq_len,
                keep_prob_input_pl: network.dropout_ratio_input,
                keep_prob_hidden_pl: network.dropout_ratio_hidden,
                network.lr: learning_rate
            }

            # Separate feed dict for evaluation (dropout disabled). Keeping
            # it separate guarantees the training feed dict is never left in
            # evaluation mode after the first evaluation step.
            feed_dict_eval = dict(feed_dict)
            feed_dict_eval[keep_prob_input_pl] = 1.0
            feed_dict_eval[keep_prob_hidden_pl] = 1.0

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 400
                start_time_global = time.time()
                start_time_step = time.time()
                ler_train_char_pre = 1
                not_improved_count = 0
                for step in range(max_steps):

                    # Compute loss
                    _, loss_train = sess.run([train_op, loss_op],
                                             feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(network.clipped_grads, feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Compute accuracy (evaluation mode)
                        ler_train_char, ler_train_phone = sess.run(
                            [ler_op_main, ler_op_second],
                            feed_dict=feed_dict_eval)

                        duration_step = time.time() - start_time_step
                        print(
                            'Step %d: loss = %.3f / cer = %.4f / per = %.4f (%.3f sec)\n'
                            % (step + 1, loss_train, ler_train_char,
                               ler_train_phone, duration_step))
                        start_time_step = time.time()

                        # Visualize
                        labels_pred_char_st, labels_pred_phone_st = sess.run(
                            [decode_op_main, decode_op_second],
                            feed_dict=feed_dict_eval)
                        labels_true_char = sparsetensor2list(
                            labels_true_char_st, batch_size=batch_size)
                        labels_true_phone = sparsetensor2list(
                            labels_true_phone_st, batch_size=batch_size)
                        labels_pred_char = sparsetensor2list(
                            labels_pred_char_st, batch_size=batch_size)
                        labels_pred_phone = sparsetensor2list(
                            labels_pred_phone_st, batch_size=batch_size)

                        # character
                        print('Character')
                        print('  True: %s' % num2alpha(labels_true_char[0]))
                        print('  Pred: %s' % num2alpha(labels_pred_char[0]))
                        print('Phone')
                        print('  True: %s' % num2phone(labels_true_phone[0]))
                        print('  Pred: %s' % num2phone(labels_pred_phone[0]))
                        print('----------------------------------------')

                        # Early stopping on the character error rate
                        if ler_train_char >= ler_train_char_pre:
                            not_improved_count += 1
                        else:
                            not_improved_count = 0
                        if not_improved_count >= 5:
                            print('Modle is Converged.')
                            break
                        ler_train_char_pre = ler_train_char

                        # Change to training mode
                        # NOTE(review): dropout here is driven by the feed
                        # dict; this flag is kept for compatibility - confirm
                        # whether the network still reads it.
                        network.is_training = True

                duration_global = time.time() - start_time_global
                print('Total time: %.3f sec' % (duration_global))
def do_eval_per(session,
                decode_op,
                per_op,
                network,
                dataset,
                label_type,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False):
    """Compute the Phone Error Rate (PER) of a trained model on a dataset.

    Both the decoded and the reference label sequences are mapped to the
    39-phone set before the edit distance is computed.

    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate (not used in the
            body of this function)
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        label_type: string, phone39 or phone48 or phone61 (label set the
            model was trained with)
        eval_batch_size: int, the batch size when evaluating the model;
            `dataset.batch_size` is used when None
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model (the second
            label element of each mini-batch is used as reference)
    Returns:
        per_mean: An average of PER, weighted by mini-batch size
    """
    batch_size = dataset.batch_size if eval_batch_size is None else eval_batch_size

    train_label_type = label_type
    eval_label_type = dataset.label_type_sub if is_multitask else dataset.label_type

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    ratio = num_examples / batch_size
    iteration = int(ratio)
    if ratio != iteration:
        iteration += 1

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    ctc_map_prefix = '../metrics/mapping_files/ctc/'
    train_phone2num_map_file_path = ctc_map_prefix + train_label_type + '_to_num.txt'
    eval_phone2num_map_file_path = ctc_map_prefix + eval_label_type + '_to_num.txt'
    phone2num_39_map_file_path = '../metrics/mapping_files/ctc/phone39_to_num.txt'
    phone2phone_map_file_path = '../metrics/mapping_files/phone2phone.txt'

    def _to_39phone_indices(label_indices, source_label_type, source_map_path):
        # indices -> phone strings -> 39-phone strings -> 39-phone indices
        phones = num2phone(label_indices, source_map_path).split(' ')
        phones = map_to_39phone(phones, source_label_type,
                                phone2phone_map_file_path)
        return phone2num(phones, phone2num_39_map_file_path)

    weighted_per_sum = 0
    for _ in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        batch = next(mini_batch)
        if is_multitask:
            inputs, _, labels_true, inputs_seq_len, _ = batch
        else:
            inputs, labels_true, inputs_seq_len, _ = batch

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        # Evaluate by 39 phones
        decoded_st = session.run(decode_op, feed_dict=feed_dict)
        labels_pred = sparsetensor2list(decoded_st, batch_size_each)

        labels_pred_mapped = []
        labels_true_mapped = []
        for i_batch in range(batch_size_each):
            # Hypothesis
            labels_pred_mapped.append(
                _to_39phone_indices(labels_pred[i_batch], train_label_type,
                                    train_phone2num_map_file_path))
            # Reference
            labels_true_mapped.append(
                _to_39phone_indices(labels_true[i_batch], eval_label_type,
                                    eval_phone2num_map_file_path))

        # Compute edit distance
        reference_st = list2sparsetensor(labels_true_mapped, padded_value=-1)
        hypothesis_st = list2sparsetensor(labels_pred_mapped, padded_value=-1)
        per_each = compute_edit_distance(session, reference_st, hypothesis_st)
        weighted_per_sum += per_each * batch_size_each

    return weighted_per_sum / dataset.data_num
def do_eval_cer(session,
                decode_op,
                network,
                dataset,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Character Error Rate.

    For every utterance, the decoded and the reference strings are stripped
    of the symbols `_ , . ' - ? !` and compared by Levenshtein distance,
    normalized by the reference length.

    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size: int, the batch size when evaluating the model;
            `dataset.batch_size` is used when None
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model (the first
            label element of each mini-batch is used as reference)
    Return:
        cer_mean: An average of CER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1
    cer_sum = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    map_file_path = '../metrics/mapping_files/ctc/character_to_num.txt'

    # Silence (_) and punctuation to drop before scoring. The '-' is escaped:
    # unescaped, "'-?" is a character RANGE that also removes digits, ':',
    # ';', '/', '(' , ')' and other characters.
    strip_pattern = re.compile(r'[_,.\'\-?!]+')

    for _ in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        batch = next(mini_batch)
        if not is_multitask:
            inputs, labels_true, inputs_seq_len, _ = batch
        else:
            inputs, labels_true, _, inputs_seq_len, _ = batch

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)

        for i_batch in range(batch_size_each):

            # Convert from list to string
            str_true = num2char(labels_true[i_batch], map_file_path)
            str_pred = num2char(labels_pred[i_batch], map_file_path)

            # Remove silence(_) and punctuation labels
            str_true = strip_pattern.sub("", str_true)
            str_pred = strip_pattern.sub("", str_pred)

            # Compute edit distance. A reference that is empty after
            # stripping is normalized by 1 instead of raising
            # ZeroDivisionError.
            cer_each = Levenshtein.distance(str_pred, str_true) / max(
                len(str_true), 1)
            cer_sum += cer_each

    cer_mean = cer_sum / dataset.data_num

    return cer_mean
Example #6
0
def do_eval_cer(session,
                decode_op,
                network,
                dataset,
                label_type,
                is_test=None,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False,
                is_main=False):
    """Evaluate trained model by Character Error Rate.

    For every utterance, the decoded and the reference strings are stripped
    of the characters matched by `[_NZー]` and compared by Levenshtein
    distance, normalized by the reference length.

    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of `Dataset` class
        label_type: string, kanji or kana or phone
        is_test: bool, set to True when evaluating by the test set
        eval_batch_size: int, the batch size when evaluating the model;
            `dataset.batch_size` is used when None
        is_progressbar: if True, visualize progressbar
        is_multitask: if True, evaluate the multitask model
        is_main: if True, evaluate the main task (first label element of
            each mini-batch); otherwise the second one. Only read when
            is_multitask is True.
    Return:
        cer_mean: An average of CER
    Raises:
        ValueError: if `label_type` is not one of kanji, kana or phone
    """
    map_file_paths = {
        'kanji': '../metrics/mapping_files/ctc/kanji2num.txt',
        'kana': '../metrics/mapping_files/ctc/kana2num.txt',
        'phone': '../metrics/mapping_files/ctc/phone2num.txt',
    }
    # Fail early and explicitly instead of hitting an unbound
    # `map_file_path` inside the evaluation loop.
    if label_type not in map_file_paths:
        raise ValueError(
            'label_type must be kanji, kana or phone, got %r' % (label_type,))
    map_file_path = map_file_paths[label_type]

    if eval_batch_size is None:
        batch_size = dataset.batch_size
    else:
        batch_size = eval_batch_size

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1
    cer_sum = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    # Silence(_) & noise(NZ) labels (and 'ー') removed before scoring
    strip_pattern = re.compile(r'[_NZー]+')

    for _ in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        batch = next(mini_batch)
        if not is_multitask:
            inputs, labels_true, inputs_seq_len, _ = batch
        elif is_main:
            inputs, labels_true, _, inputs_seq_len, _ = batch
        else:
            inputs, _, labels_true, inputs_seq_len, _ = batch

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)

        for i_batch in range(batch_size_each):
            # Convert from list to string
            if label_type != 'phone' and is_test:
                # NOTE: for kanji and kana, the test-set labels are stored
                # as-is (not converted to indices), so they are only joined.
                str_true = ''.join(labels_true[i_batch])
            else:
                str_true = num2char(labels_true[i_batch], map_file_path)
            str_pred = num2char(labels_pred[i_batch], map_file_path)

            # Remove silence(_) & noise(NZ) labels
            str_true = strip_pattern.sub("", str_true)
            str_pred = strip_pattern.sub("", str_pred)

            # Compute edit distance. A reference that is empty after
            # stripping is normalized by 1 instead of raising
            # ZeroDivisionError.
            cer_each = Levenshtein.distance(str_pred, str_true) / max(
                len(str_true), 1)

            cer_sum += cer_each

    cer_mean = cer_sum / dataset.data_num

    return cer_mean
Example #7
0
    def check_loading(self, label_type, num_gpu, is_sorted):
        """Load mini-batches carrying attention and CTC labels and check them.

        Builds a `Dataset` for `label_type`, draws `iter_per_epoch + 1`
        mini-batches from its generator and, for the single-GPU case,
        verifies that no input is shorter than its CTC label sequence. The
        first utterance's attention and CTC labels are converted to strings.

        Args:
            label_type: string, 'character' or a phone label type whose
                characters [5:7] select the mapping file (e.g. phone61)
            num_gpu: int, forwarded to `Dataset`. When > 1, inputs and labels
                are indexed per GPU and only the first shard is inspected.
            is_sorted: forwarded to `Dataset` as `is_sorted`
        Raises:
            ValueError: if an input is shorter than its CTC label sequence
                (checked only when num_gpu == 1)
        """
        print('----- label_type: ' + label_type + ', num_gpu: ' +
              str(num_gpu) + ', is_sorted: ' + str(is_sorted) + ' -----')

        batch_size = 64
        eos_index = 2 if label_type == 'character' else 1
        dataset = Dataset(data_type='train',
                          label_type=label_type,
                          batch_size=batch_size,
                          eos_index=eos_index,
                          is_sorted=is_sorted,
                          is_progressbar=True,
                          num_gpu=num_gpu)

        tf.reset_default_graph()
        # `with tf.Session()` installs the session as the default one AND
        # closes it on exit (`as_default()` alone never closes it).
        with tf.Session() as sess:
            print('=> Loading mini-batch...')
            if label_type == 'character':
                att_map_file_path = '../metrics/mapping_files/attention/char2num.txt'
                ctc_map_file_path = '../metrics/mapping_files/ctc/char2num.txt'
                map_fn = num2char
            else:
                att_map_file_path = '../metrics/mapping_files/attention/phone2num_' + \
                    label_type[5:7] + '.txt'
                ctc_map_file_path = '../metrics/mapping_files/ctc/phone2num_' + \
                    label_type[5:7] + '.txt'
                map_fn = num2phone

            mini_batch = dataset.next_batch(session=sess)

            iter_per_epoch = int(dataset.data_num / (batch_size * num_gpu)) + 1
            # One extra iteration on top of `iter_per_epoch`
            for i in range(iter_per_epoch + 1):
                return_tuple = next(mini_batch)
                inputs = return_tuple[0]
                att_labels = return_tuple[1]
                ctc_labels_st = return_tuple[2]
                att_labels_seq_len = return_tuple[4]

                if num_gpu > 1:
                    for inputs_gpu in inputs:
                        print(inputs_gpu.shape)
                    # Keep only the first GPU shard of every element
                    inputs = inputs[0]
                    att_labels = att_labels[0]
                    ctc_labels_st = ctc_labels_st[0]
                    att_labels_seq_len = att_labels_seq_len[0]

                ctc_labels = sparsetensor2list(ctc_labels_st,
                                               batch_size=len(inputs))

                if num_gpu == 1:
                    # An input shorter than its labels is treated as invalid
                    for inputs_i, labels_i in zip(inputs, ctc_labels):
                        if len(inputs_i) < len(labels_i):
                            raise ValueError(
                                'input length %d < label length %d' %
                                (len(inputs_i), len(labels_i)))

                # Attention labels are cut to their own length before mapping
                att_str_true = map_fn(att_labels[0][0:att_labels_seq_len[0]],
                                      att_map_file_path)
                ctc_str_true = map_fn(ctc_labels[0], ctc_map_file_path)
                att_str_true = re.sub(r'_', ' ', att_str_true)
                ctc_str_true = re.sub(r'_', ' ', ctc_str_true)