def __init__(self,
                 data_type,
                 label_type,
                 batch_size,
                 eos_index,
                 is_sorted=True,
                 is_progressbar=False,
                 num_gpu=1):
        """A class for loading dataset.

        All input features and labels are loaded into memory when the
        instance is constructed.

        Args:
            data_type: string, train or dev or test
            label_type: string, phone39 or phone48 or phone61 or character
            batch_size: int, the size of mini-batch (stored multiplied by
                num_gpu)
            eos_index: int , the index of <EOS> class
            is_sorted: stored on the instance; the paths built here are
                always ordered by frame num regardless of this flag
            is_progressbar: if True, visualize progressbar
            num_gpu: int, `self.batch_size` is set to batch_size * num_gpu
        Raises:
            ValueError: if data_type is not train, dev or test
        """
        if data_type not in ['train', 'dev', 'test']:
            raise ValueError('data_type is "train" or "dev" or "test".')

        def _to_array(items):
            # Items of identical shape are stacked into a regular ndarray.
            # Items of different lengths are stored one per slot in a 1-D
            # object array, built explicitly because recent NumPy releases
            # refuse to create ragged arrays implicitly.
            if len({np.shape(item) for item in items}) <= 1:
                return np.array(items)
            ragged = np.empty(len(items), dtype=object)
            for i_item, item in enumerate(items):
                ragged[i_item] = item
            return ragged

        self.data_type = data_type
        self.label_type = label_type
        self.batch_size = batch_size * num_gpu
        self.eos_index = eos_index
        self.is_sorted = is_sorted
        self.is_progressbar = is_progressbar
        self.num_gpu = num_gpu

        self.input_size = 123
        input_path = join('/n/sd8/inaguma/corpus/timit/dataset/inputs/',
                          data_type)
        label_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/labels/attention/',
            label_type, data_type)

        # Load the frame number dictionary
        # NOTE: pickle must only be used on trusted files
        with open(join(input_path, 'frame_num.pickle'), 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Sort paths to input & label by frame num
        frame_num_tuple_sorted = sorted(self.frame_num_dict.items(),
                                        key=lambda x: x[1])
        input_paths, label_paths = [], []
        for input_name, _ in frame_num_tuple_sorted:
            input_paths.append(join(input_path, input_name + '.npy'))
            label_paths.append(join(label_path, input_name + '.npy'))
        self.input_paths = np.array(input_paths)
        self.label_paths = np.array(label_paths)
        self.data_num = len(self.input_paths)

        # Load all dataset in advance
        print('=> Loading ' + data_type + ' dataset (' + label_type + ')...')
        input_list, label_list = [], []
        for i in wrap_iterator(range(self.data_num), self.is_progressbar):
            input_list.append(np.load(self.input_paths[i]))
            label_list.append(np.load(self.label_paths[i]))
        self.input_list = _to_array(input_list)
        self.label_list = _to_array(label_list)

        # Indices of the utterances that have not been consumed yet
        self.rest = set(range(self.data_num))
def do_eval_per(session,
                decode_op,
                per_op,
                network,
                dataset,
                label_type,
                eos_index,
                eval_batch_size=None,
                is_progressbar=False):
    """Evaluate trained model by Phone Error Rate.

    Hypotheses are mapped to the 39-phone set, and so are the references
    unless the dataset is already labelled with phone39, before the edit
    distance is computed.

    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate (not used here;
            kept so that existing callers keep working)
        network: network to evaluate
        dataset: An instance of a `Dataset' class
        label_type: string, phone39 or phone48 or phone61
        eos_index: int, the index of <EOS> class
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
    Returns:
        per_global: An average of PER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    train_label_type = label_type
    data_label_type = dataset.label_type

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    full_batches, remainder = divmod(num_examples, batch_size)
    iteration = int(full_batches) + (1 if remainder else 0)
    per_global = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    map_dir = '../metrics/mapping_files/attention/'
    # Predictions are indexed by the label set the model was trained on,
    # references by the label set of the dataset itself.
    train_phone2num_map_file_path = map_dir + 'phone2num_' + \
        train_label_type[5:7] + '.txt'
    data_phone2num_map_file_path = map_dir + 'phone2num_' + \
        data_label_type[5:7] + '.txt'
    phone2num_39_map_file_path = map_dir + 'phone2num_39.txt'
    phone2phone_map_file_path = '../metrics/mapping_files/phone2phone.txt'

    def to_phone39_ids(label_ids, source_label_type, source_map_file_path):
        # indices -> phone strings -> 39-phone strings -> 39-phone indices
        phone_list = num2phone(label_ids, source_map_file_path).split(' ')
        phone_list = map_to_39phone(phone_list, source_label_type,
                                    phone2phone_map_file_path)
        return phone2num(phone_list, phone2num_39_map_file_path)

    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini-batch
        inputs, att_labels_true, _, inputs_seq_len, _, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        # Evaluate by 39 phones
        predicted_ids = session.run(decode_op, feed_dict=feed_dict)
        predicted_ids_phone39 = [
            to_phone39_ids(predicted_ids[i_batch], train_label_type,
                           train_phone2num_map_file_path)
            for i_batch in range(batch_size_each)
        ]

        if data_label_type != 'phone39':
            labels_true_phone39 = [
                to_phone39_ids(att_labels_true[i_batch], data_label_type,
                               data_phone2num_map_file_path)
                for i_batch in range(batch_size_each)
            ]
        else:
            # References are already expressed in the 39-phone set
            labels_true_phone39 = att_labels_true

        # Compute edit distance
        labels_true_st = list2sparsetensor(labels_true_phone39,
                                           padded_value=eos_index)
        labels_pred_st = list2sparsetensor(predicted_ids_phone39,
                                           padded_value=eos_index)
        per_local = compute_edit_distance(session, labels_true_st,
                                          labels_pred_st)
        # Weight the batch score by its size so the final value is a
        # per-utterance average
        per_global += per_local * batch_size_each

    per_global /= dataset.data_num

    return per_global
def do_eval_cer(session,
                decode_op,
                network,
                dataset,
                eval_batch_size=None,
                is_progressbar=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size: int, batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
    Return:
        cer_mean: An average of CER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    full_batches, remainder = divmod(num_examples, batch_size)
    iteration = int(full_batches) + (1 if remainder else 0)
    cer_sum = 0

    map_file_path = '../metrics/mapping_files/attention/char2num.txt'

    # Symbols removed before scoring: _ < > , . ' - ? !
    # The hyphen is escaped on purpose: an unescaped "'-?" is read as a
    # character range and would also strip digits and other punctuation.
    ignored_symbols = re.compile(r"[_<>,.'\-?!]+")

    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini-batch
        inputs, att_labels_true, _, inputs_seq_len, _, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        predicted_ids = session.run(decode_op, feed_dict=feed_dict)
        for i_batch in range(batch_size_each):

            # Convert from list to string
            str_true = num2char(att_labels_true[i_batch], map_file_path)
            str_pred = num2char(predicted_ids[i_batch], map_file_path)

            # Remove silence(_) and punctuation labels
            str_true = ignored_symbols.sub("", str_true)
            str_pred = ignored_symbols.sub("", str_pred)

            # Compute edit distance normalized by the reference length
            cer_each = Levenshtein.distance(str_pred, str_true) / len(
                str_true)
            cer_sum += cer_each

    cer_mean = cer_sum / dataset.data_num

    return cer_mean
Example #4
0
    def __init__(self,
                 data_type,
                 train_data_size,
                 label_type_main,
                 label_type_sub,
                 batch_size,
                 num_stack=None,
                 num_skip=None,
                 is_sorted=True,
                 is_progressbar=False,
                 num_gpu=1,
                 is_gpu=True):
        """Collect the paths of the inputs and of both label sets.

        Only paths are gathered here; no feature or label file is read.

        Args:
            data_type: string, train or dev or eval1 or eval2 or eval3
            train_data_size: string, default or large
            label_type_main: string, character or kanji
            label_type_sub: string, character or phone
            batch_size: int, the size of mini-batch (stored multiplied by
                num_gpu)
            num_stack: int, the number of frames to stack
            num_skip: int, the number of frames to skip
            is_sorted: if True, sort dataset by frame num
            is_progressbar: if True, visualize progressbar
            num_gpu: int, `self.batch_size` is set to batch_size * num_gpu
            is_gpu: bool, selects which of the two corpus locations is used
        Raises:
            ValueError: if data_type is not one of the names listed above
        """
        eval_sets = ['eval1', 'eval2', 'eval3']
        if data_type not in ['train', 'dev'] + eval_sets:
            raise ValueError(
                'data_type is "train" or "dev", "eval1", "eval2", "eval3".')

        # Keep the constructor arguments on the instance
        self.data_type = data_type
        self.train_data_size = train_data_size
        self.label_type_main = label_type_main
        self.label_type_sub = label_type_sub
        self.batch_size = batch_size * num_gpu
        self.num_stack = num_stack
        self.num_skip = num_skip
        self.is_sorted = is_sorted
        self.is_progressbar = is_progressbar
        self.num_gpu = num_gpu
        self.input_size = 123

        # Choose the corpus location
        if is_gpu:
            input_root = '/data/inaguma/csj/inputs'
            label_root = '/data/inaguma/csj/labels/ctc/'
        else:
            input_root = '/n/sd8/inaguma/corpus/csj/dataset/inputs'
            label_root = '/n/sd8/inaguma/corpus/csj/dataset/labels/ctc/'
        input_path = join(input_root, train_data_size, data_type)
        label_main_path = join(label_root, train_data_size, label_type_main,
                               data_type)
        label_sub_path = join(label_root, train_data_size, label_type_sub,
                              data_type)

        # Read the utterance name -> frame number table
        with open(join(input_path, 'frame_num.pickle'), 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Build the paths, ordered by ascending frame number
        print('=> loading paths to dataset...')
        utterances = sorted(self.frame_num_dict.items(),
                            key=lambda item: item[1])
        input_paths, label_main_paths, label_sub_paths = [], [], []
        for input_name, _ in wrap_iterator(utterances, self.is_progressbar):
            # Files are grouped in one directory per speaker
            speaker_name = input_name.split('_')[0]
            file_name = input_name + '.npy'
            input_paths.append(join(input_path, speaker_name, file_name))
            label_main_paths.append(
                join(label_main_path, speaker_name, file_name))
            label_sub_paths.append(
                join(label_sub_path, speaker_name, file_name))
        self.input_paths = np.array(input_paths)
        self.label_main_paths = np.array(label_main_paths)
        self.label_sub_paths = np.array(label_sub_paths)
        self.data_num = len(self.input_paths)

        # Stacked frames widen the feature vector
        if self.num_stack is not None and self.num_skip is not None:
            self.input_size *= num_stack
        # NOTE: Not load dataset yet

        self.rest = set(range(self.data_num))

        self.is_test = (data_type in eval_sets
                        and label_type_sub != 'phone')
def do_eval_per(session,
                decode_op,
                per_op,
                network,
                dataset,
                label_type,
                eos_index,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Phone Error Rate.

    Hypotheses and references are both mapped to the 39-phone set before
    the edit distance is computed.

    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate (not used here;
            kept so that existing callers keep working)
        network: network to evaluate
        dataset: An instance of a `Dataset' class
        label_type: string, phone39 or phone48 or phone61
        eos_index: int, the index of <EOS> class
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model
    Returns:
        per_mean: An average of PER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    train_label_type = label_type
    eval_label_type = dataset.label_type

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    full_batches, remainder = divmod(num_examples, batch_size)
    iteration = int(full_batches) + (1 if remainder else 0)
    per_mean = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    map_dir = '../metrics/mapping_files/ctc/'
    # Predictions are indexed by the label set the model was trained on,
    # references by the label set of the evaluated dataset.
    train_phone2num_map_file_path = map_dir + train_label_type + \
        '_to_num.txt'
    eval_phone2num_map_file_path = map_dir + eval_label_type + '_to_num.txt'
    phone2num_39_map_file_path = map_dir + 'phone39_to_num.txt'
    phone2phone_map_file_path = '../metrics/mapping_files/phone2phone.txt'

    def to_phone39_ids(label_ids, source_label_type, source_map_file_path):
        # indices -> phone strings -> 39-phone strings -> 39-phone indices
        phone_list = num2phone(label_ids, source_map_file_path).split(' ')
        phone_list = map_to_39phone(phone_list, source_label_type,
                                    phone2phone_map_file_path)
        return phone2num(phone_list, phone2num_39_map_file_path)

    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini-batch
        if not is_multitask:
            inputs, labels_true, inputs_seq_len, _, _ = next(mini_batch)
        else:
            # The multitask generator yields one more label set; the
            # second one is scored here
            inputs, _, labels_true, inputs_seq_len, _, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        # Evaluate by 39 phones
        predicted_ids = session.run(decode_op, feed_dict=feed_dict)

        labels_pred_mapped, labels_true_mapped = [], []
        for i_batch in range(batch_size_each):
            # Hypothesis
            labels_pred_mapped.append(
                to_phone39_ids(predicted_ids[i_batch], train_label_type,
                               train_phone2num_map_file_path))
            # Reference
            labels_true_mapped.append(
                to_phone39_ids(labels_true[i_batch], eval_label_type,
                               eval_phone2num_map_file_path))

        # Compute edit distance
        labels_true_st = list2sparsetensor(labels_true_mapped,
                                           padded_value=eos_index)
        labels_pred_st = list2sparsetensor(labels_pred_mapped,
                                           padded_value=eos_index)
        per_each = compute_edit_distance(session, labels_true_st,
                                         labels_pred_st)
        # Weight the batch score by its size so the final value is a
        # per-utterance average
        per_mean += per_each * batch_size_each

    per_mean /= dataset.data_num

    return per_mean
Example #6
0
def main():
    """Print the Character Error Rate of Julius outputs per CSJ eval set.

    The Julius result files contain phone sequences; they are converted
    to kana with the mapping file and compared with the reference
    transcripts served by `Dataset`.
    """
    # Make mapping dictionary from phone to kana
    phone2kana_dict = {}
    with open('../metrics/mapping_files/kana2phone.txt', 'r') as f:
        for line in f:
            kana, phone_seq = line.strip().split('+')
            phone = phone_seq.replace(' ', '')
            # The first kana listed for a phone sequence wins
            if phone in phone2kana_dict:
                continue
            phone2kana_dict[phone] = kana
            # Long vowel variant
            phone2kana_dict[phone + ':'] = kana + 'ー'

    # Julius Results
    for data_type in ['eval1', 'eval2', 'eval3']:
        results_paths = glob(
            '/home/lab5/inaguma/asru2017/csj_results_0710_kana/' +
            data_type + '/*.kana')
        result_dict = {}

        for path in results_paths:
            with codecs.open(path, 'r', 'euc_jp') as f:
                file_name = ''
                for line in f:
                    line = line.strip()

                    if 'wav' in line:
                        # <speaker dir>/<utterance>.wav -> speaker_utterance
                        file_name = '_'.join(line.split('/')[-2:])
                        # The dot is escaped so that only the literal
                        # extension is removed
                        file_name = re.sub(r'\.wav', '', file_name)

                    else:
                        # Recognition output for the last seen wav line,
                        # with the 'sp' markers dropped
                        result_dict[file_name] = re.sub('sp', '', line)

        label_type = 'kana'
        dataset = Dataset(data_type=data_type,
                          label_type=label_type,
                          batch_size=1,
                          train_data_size='large',
                          is_sorted=False,
                          is_progressbar=True,
                          is_gpu=False)

        num_examples = dataset.data_num
        cer_sum = 0
        num_evaluated = 0

        mini_batch = dataset.next_batch(batch_size=1)

        for _ in wrap_iterator(range(num_examples), False):
            # Fetch the next utterance (batch size is 1)
            _, labels_true, _, input_names = next(mini_batch)
            input_name = input_names[0]

            # Utterances without a Julius result are not scored
            if input_name not in result_dict:
                continue

            phones = [
                phone for phone in result_dict[input_name].split(' ')
                if phone != ''
            ]
            str_pred = ''.join(phone2kana_dict[phone] for phone in phones)

            if input_name in ['A03M0106_0057', 'A03M0016_0014']:
                print(str_pred)
                print(labels_true[0])
                print('-----')

            # Remove silence(_) & noise(NZ) labels
            str_true = re.sub(r'[_NZー・]+', "", labels_true[0])
            str_pred = re.sub(r'[_NZー・]+', "", str_pred)

            # Compute edit distance normalized by the reference length
            cer_each = Levenshtein.distance(str_pred, str_true) / len(
                str_true)

            cer_sum += cer_each
            num_evaluated += 1

        # Average over the utterances that were actually scored; skipped
        # ones must not dilute the error rate
        if num_evaluated > 0:
            cer_mean = cer_sum / num_evaluated
        else:
            cer_mean = float('nan')
        print('CER (' + data_type + '): %f' % cer_mean)
def stack_frame(input_list,
                input_paths,
                frame_num_dict,
                num_stack,
                num_skip,
                is_progressbar=False):
    """Stack & skip some frames. This implementation is based on
       https://arxiv.org/abs/1507.06947.
           Sak, Haşim, et al.
           "Fast and accurate recurrent neural network acoustic models for speech recognition."
           arXiv preprint arXiv:1507.06947 (2015).

    Each output row is the concatenation of up to `num_stack` consecutive
    input frames; the window advances by `num_skip` frames per row. Rows
    near the end of an utterance that cannot be filled are zero-padded.

    Args:
        input_list: list of input data
        input_paths: list of paths to input data
        frame_num_dict:
            key => utterance name (basename of the input path up to the
                   first '.')
            value => the number of frames
        num_stack: int, the number of frames to stack
        num_skip: int, the number of frames to skip
        is_progressbar: if True, visualize progressbar
    Returns:
        stacked_input_list: array of frame-stacked inputs, one entry per
            utterance. A regular ndarray when every utterance has the
            same stacked shape, otherwise a 1-D object array.
    Raises:
        ValueError: if num_stack is smaller than num_skip
    """
    if num_stack < num_skip:
        raise ValueError(
            'num_skip must be less than or equal to num_stack.')

    input_size = input_list[0].shape[1]
    utt_num = len(input_paths)

    stacked_input_list = []
    for i_utt in wrap_iterator(range(utt_num), is_progressbar):
        # Per utterance
        frames = input_list[i_utt]
        input_name = basename(input_paths[i_utt]).split('.')[0]
        frame_num = frame_num_dict[input_name]
        # One output row per num_skip input frames, rounded up
        frame_num_decimated = int(np.ceil(frame_num / num_skip))

        stacked_frames = np.zeros(
            (frame_num_decimated, input_size * num_stack))
        stack_count = 0  # counter for stacked_frames
        stack = []
        last_index = len(frames) - 1
        for i_frame, frame in enumerate(frames):
            #####################
            # final frame
            #####################
            if i_frame == last_index:
                # Stack the final frame
                stack.append(frame)

                # Flush what is left until every output row is written.
                # `<` (rather than `!=`) guarantees termination even if
                # the frame number in the dictionary is inconsistent.
                while stack_count < frame_num_decimated:
                    # Concatenate stacked frames
                    for i_stack, stacked in enumerate(stack):
                        begin = input_size * i_stack
                        stacked_frames[stack_count][
                            begin:begin + input_size] = stacked
                    stack_count += 1

                    # Delete some frames to skip (fewer may remain)
                    del stack[:num_skip]

            ########################
            # first & middle frames
            ########################
            elif len(stack) < num_stack:
                # Stack some frames until stack is filled
                stack.append(frame)

                if len(stack) == num_stack:
                    # Concatenate stacked frames
                    for i_stack, stacked in enumerate(stack):
                        begin = input_size * i_stack
                        stacked_frames[stack_count][
                            begin:begin + input_size] = stacked
                    stack_count += 1

                    # Delete some frames to skip
                    del stack[:num_skip]

        stacked_input_list.append(stacked_frames)

    # Utterances usually differ in length. Recent NumPy releases refuse to
    # build such ragged arrays implicitly, so the object array is created
    # explicitly in that case.
    if len({stacked.shape for stacked in stacked_input_list}) <= 1:
        return np.array(stacked_input_list)
    ragged = np.empty(len(stacked_input_list), dtype=object)
    for i_utt, stacked in enumerate(stacked_input_list):
        ragged[i_utt] = stacked
    return ragged
    def __init__(self,
                 data_type,
                 train_data_size,
                 label_type,
                 batch_size,
                 eos_index,
                 is_sorted=True,
                 is_progressbar=False,
                 num_gpu=1):
        """Collect the paths of the inputs and of the attention labels.

        Only paths are gathered here; no feature or label file is read.

        Args:
            data_type: string, train, dev, eval1, eval2, eval3
            train_data_size: string, default or large
            label_type: string, phone or character or kanji
            batch_size: int, the size of mini-batch (stored multiplied by
                num_gpu)
            eos_index: int , the index of <EOS> class
            is_sorted: if True, sort dataset by frame num
            is_progressbar: if True, visualize progressbar
            num_gpu: int, `self.batch_size` is set to batch_size * num_gpu
        Raises:
            ValueError: if data_type is not one of the names listed above
        """
        eval_sets = ['eval1', 'eval2', 'eval3']
        if data_type not in ['train', 'dev'] + eval_sets:
            raise ValueError(
                'data_type is "train" or "dev", "eval1" "eval2" "eval3".')

        # Keep the constructor arguments on the instance
        self.data_type = data_type
        self.train_data_size = train_data_size
        self.label_type = label_type
        self.batch_size = batch_size * num_gpu
        self.eos_index = eos_index
        self.is_sorted = is_sorted
        self.is_progressbar = is_progressbar
        self.num_gpu = num_gpu
        self.input_size = 123

        input_path = join('/data/inaguma/csj/inputs', train_data_size,
                          data_type)
        label_path = join('/data/inaguma/csj/labels/attention/',
                          train_data_size, label_type, data_type)

        # Read the utterance name -> frame number table
        with open(join(input_path, 'frame_num.pickle'), 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Build the paths, ordered by ascending frame number
        print('=> Loading paths to dataset...')
        utterances = sorted(self.frame_num_dict.items(),
                            key=lambda item: item[1])
        input_paths, label_paths = [], []
        for input_name, _ in wrap_iterator(utterances, self.is_progressbar):
            # Files are grouped in one directory per speaker
            speaker_name = input_name.split('_')[0]
            file_name = input_name + '.npy'
            input_paths.append(join(input_path, speaker_name, file_name))
            label_paths.append(join(label_path, speaker_name, file_name))
        self.input_paths = np.array(input_paths)
        self.label_paths = np.array(label_paths)
        self.data_num = len(self.input_paths)

        self.rest = set(range(self.data_num))

        self.is_test = data_type in eval_sets and label_type != 'phone'
    def __init__(self,
                 data_type,
                 label_type_main,
                 label_type_sub,
                 batch_size,
                 num_stack=None,
                 num_skip=None,
                 is_sorted=True,
                 is_progressbar=False,
                 num_gpu=1):
        """A class for loading dataset.

        All input features and both label sets are loaded into memory
        when the instance is constructed.

        Args:
            data_type: string, train or dev or test
            label_type_main: not used; the main label type is always
                'character'
            label_type_sub: string, phone39 or phone48 or phone61
            batch_size: int, the size of mini-batch (stored multiplied by
                num_gpu)
            num_stack: int, the number of frames to stack
            num_skip: int, the number of frames to skip
            is_sorted: stored on the instance; the paths built here are
                always ordered by frame num regardless of this flag
            is_progressbar: if True, visualize progressbar
            num_gpu: int, `self.batch_size` is set to batch_size * num_gpu
        Raises:
            ValueError: if data_type is not train, dev or test
        """
        if data_type not in ['train', 'dev', 'test']:
            raise ValueError('data_type is "train" or "dev" or "test".')

        def _to_array(items):
            # Items of identical shape are stacked into a regular ndarray.
            # Items of different lengths are stored one per slot in a 1-D
            # object array, built explicitly because recent NumPy releases
            # refuse to create ragged arrays implicitly.
            if len({np.shape(item) for item in items}) <= 1:
                return np.array(items)
            ragged = np.empty(len(items), dtype=object)
            for i_item, item in enumerate(items):
                ragged[i_item] = item
            return ragged

        self.data_type = data_type
        self.label_type_main = 'character'
        self.label_type_sub = label_type_sub
        self.batch_size = batch_size * num_gpu
        self.num_stack = num_stack
        self.num_skip = num_skip
        self.is_sorted = is_sorted
        self.is_progressbar = is_progressbar
        self.num_gpu = num_gpu

        self.input_size = 123
        input_path = join('/n/sd8/inaguma/corpus/timit/dataset/inputs/',
                          data_type)
        label_main_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/labels/ctc/character/',
            data_type)
        label_sub_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/labels/ctc/', label_type_sub,
            data_type)

        # Load the frame number dictionary
        # NOTE: pickle must only be used on trusted files
        with open(join(input_path, 'frame_num.pickle'), 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Sort paths to input & label by frame num.
        # The three lists are filled in lockstep, so they always have the
        # same length and no separate consistency check is needed.
        frame_num_tuple_sorted = sorted(self.frame_num_dict.items(),
                                        key=lambda x: x[1])
        input_paths, label_main_paths, label_sub_paths = [], [], []
        for input_name, _ in frame_num_tuple_sorted:
            input_paths.append(join(input_path, input_name + '.npy'))
            label_main_paths.append(join(label_main_path, input_name + '.npy'))
            label_sub_paths.append(join(label_sub_path, input_name + '.npy'))
        self.input_paths = np.array(input_paths)
        self.label_main_paths = np.array(label_main_paths)
        self.label_sub_paths = np.array(label_sub_paths)
        self.data_num = len(self.input_paths)

        # Load all dataset in advance
        print('=> Loading ' + data_type + ' dataset (' + label_type_sub +
              ')...')
        input_list, label_main_list, label_sub_list = [], [], []
        for i in wrap_iterator(range(self.data_num), self.is_progressbar):
            input_list.append(np.load(self.input_paths[i]))
            label_main_list.append(np.load(self.label_main_paths[i]))
            label_sub_list.append(np.load(self.label_sub_paths[i]))
        self.input_list = _to_array(input_list)
        self.label_main_list = _to_array(label_main_list)
        self.label_sub_list = _to_array(label_sub_list)

        # Frame stacking
        if (num_stack is not None) and (num_skip is not None):
            print('=> Stacking frames...')
            self.input_list = stack_frame(self.input_list, self.input_paths,
                                          self.frame_num_dict, num_stack,
                                          num_skip, is_progressbar)
            # Stacked frames widen the feature vector
            self.input_size = self.input_size * num_stack

        # Indices of the utterances that have not been consumed yet
        self.rest = set(range(self.data_num))
Example #10
0
def do_eval_cer(session,
                decode_op,
                network,
                dataset,
                label_type,
                is_test=None,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False,
                is_main=False):
    """Evaluate trained model by Character Error Rate.

    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of `Dataset` class
        label_type: string, kanji or kana or phone
        is_test: bool, set to True when evaluating by the test set
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize progressbar
        is_multitask: if True, evaluate the multitask model
        is_main: if True, evaluate the main task
    Return:
        cer_mean: An average of CER
    Raises:
        ValueError: if label_type is not kanji, kana or phone
    """
    if eval_batch_size is None:
        batch_size = dataset.batch_size
    else:
        batch_size = eval_batch_size

    map_file_paths = {
        'kanji': '../metrics/mapping_files/ctc/kanji2num.txt',
        'kana': '../metrics/mapping_files/ctc/kana2num.txt',
        'phone': '../metrics/mapping_files/ctc/phone2num.txt',
    }
    if label_type not in map_file_paths:
        raise ValueError('label_type is "kanji" or "kana" or "phone".')
    map_file_path = map_file_paths[label_type]

    # Number of mini-batches needed to cover the dataset (rounded up)
    num_examples = dataset.data_num
    full_batches, remainder = divmod(num_examples, batch_size)
    iteration = int(full_batches) + (1 if remainder else 0)
    cer_sum = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    # Silence(_) & noise(NZ) labels and the long vowel mark are ignored
    ignored_symbols = re.compile(r'[_NZー]+')

    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        if not is_multitask:
            inputs, labels_true, inputs_seq_len, _ = next(mini_batch)
        elif is_main:
            inputs, labels_true, _, inputs_seq_len, _ = next(mini_batch)
        else:
            inputs, _, labels_true, inputs_seq_len, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)

        for i_batch in range(batch_size_each):
            # Convert from list to string
            if label_type != 'phone' and is_test:
                # NOTE: for kanji and kana, the labels of the test data
                # are stored as they are (not converted to indices)
                str_true = ''.join(labels_true[i_batch])
            else:
                str_true = num2char(labels_true[i_batch], map_file_path)
            str_pred = num2char(labels_pred[i_batch], map_file_path)

            # Remove silence(_) & noise(NZ) labels
            str_true = ignored_symbols.sub("", str_true)
            str_pred = ignored_symbols.sub("", str_pred)

            # Compute edit distance normalized by the reference length
            cer_each = Levenshtein.distance(str_pred, str_true) / len(
                str_true)

            cer_sum += cer_each

    cer_mean = cer_sum / dataset.data_num

    return cer_mean