def do_eval_per(session,
                per_op,
                network,
                dataset,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Phone Error Rate.
    Args:
        session: session of training model
        per_op: operation for computing phone error rate
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize progressbar
        is_multitask: if True, evaluate the multitask model
    Returns:
        per_global: An average of PER
    """
    if eval_batch_size is None:
        batch_size = network.batch_size
    else:
        batch_size = eval_batch_size

    num_examples = dataset.data_num
    # Ceiling division: number of mini-batches to cover the dataset
    iteration = (num_examples + batch_size - 1) // batch_size
    per_global = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        if not is_multitask:
            inputs, labels_true_st, inputs_seq_len, _ = next(mini_batch)
        else:
            inputs, _, labels_true_st, inputs_seq_len, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        per_local = session.run(per_op, feed_dict=feed_dict)
        per_global += per_local * batch_size_each

    per_global /= dataset.data_num

    return per_global
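
# The loop above weights each mini-batch's PER by its actual size, so a
# smaller final batch does not skew the average. A self-contained sketch of
# that weighting with made-up per-batch values (all names below are local to
# the example):

def weighted_average(per_locals, batch_sizes):
    """Average per-batch error rates, weighted by batch size."""
    total = sum(p * b for p, b in zip(per_locals, batch_sizes))
    return total / sum(batch_sizes)

# Three batches of 32 utterances and a final batch of 4
print(weighted_average([0.20, 0.30, 0.25, 0.50], [32, 32, 32, 4]))  # 0.26
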
    def __init__(self,
                 data_type,
                 label_type,
                 batch_size,
                 eos_index,
                 max_epoch=None,
                 splice=1,
                 num_stack=1,
                 num_skip=1,
                 shuffle=False,
                 sort_utt=False,
                 sort_stop_epoch=None,
                 progressbar=False):
        """A class for loading dataset.
        Args:
            data_type (string): train or dev or test
            label_type (string): phone39 or phone48 or phone61 or
                character or character_capital_divide
            batch_size (int): the size of mini-batch
            eos_index (int): the index of <EOS> class
            max_epoch (int, optional): the max epoch. None means infinite loop.
            splice (int, optional): frames to splice. Default is 1 frame.
            num_stack (int, optional): the number of frames to stack
            num_skip (int, optional): the number of frames to skip
            shuffle (bool, optional): if True, shuffle utterances. This is
                disabled when sort_utt is True.
            sort_utt (bool, optional): if True, sort all utterances by the
                number of frames; utterances within each mini-batch are still
                shuffled. Otherwise, shuffle utterances.
            sort_stop_epoch (int, optional): after sort_stop_epoch, training
                will revert to a random order
            progressbar (bool, optional): if True, visualize progressbar
        """
        if data_type not in ['train', 'dev', 'test']:
            raise ValueError('data_type must be "train" or "dev" or "test".')
        if label_type not in [
                'phone39', 'phone48', 'phone61', 'character',
                'character_capital_divide'
        ]:
            raise ValueError(
                'label_type must be "phone39" or "phone48" or "phone61" or ' +
                '"character" or "character_capital_divide".')

        super(Dataset, self).__init__()

        self.data_type = data_type
        self.label_type = label_type
        self.batch_size = batch_size
        self.max_epoch = max_epoch
        self.eos_index = eos_index
        self.splice = splice
        self.num_stack = num_stack
        self.num_skip = num_skip
        self.shuffle = shuffle
        self.sort_utt = sort_utt
        self.sort_stop_epoch = sort_stop_epoch
        self.progressbar = progressbar
        self.padded_value = eos_index

        input_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/inputs/htk/speaker',
            data_type)
        label_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/labels/attention', label_type,
            data_type)

        # Load the frame number dictionary
        with open(join(input_path, 'frame_num.pickle'), 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Sort paths to input & label (by frame count if sort_utt,
        # otherwise by utterance name)
        axis = 1 if sort_utt else 0
        frame_num_tuple_sorted = sorted(self.frame_num_dict.items(),
                                        key=lambda x: x[axis])
        input_paths, label_paths = [], []
        for input_name, frame_num in frame_num_tuple_sorted:
            input_paths.append(join(input_path, input_name + '.npy'))
            label_paths.append(join(label_path, input_name + '.npy'))
        self.input_paths = np.array(input_paths)
        self.label_paths = np.array(label_paths)

        # Load all dataset in advance
        print('=> Loading dataset (%s, %s)...' % (data_type, label_type))
        input_list, label_list = [], []
        for i in wrap_iterator(range(len(self.input_paths)), self.progressbar):
            input_list.append(np.load(self.input_paths[i]))
            label_list.append(np.load(self.label_paths[i]))
        self.input_list = np.array(input_list)
        self.label_list = np.array(label_list)

        # Frame stacking
        print('=> Stacking frames...')
        self.input_list = stack_frame(self.input_list, self.input_paths,
                                      self.frame_num_dict, num_stack, num_skip,
                                      progressbar)

        self.rest = set(range(len(self.input_paths)))
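
# `self.rest` holds the indices not yet served in the current epoch. The
# class's next_batch generator is not shown in this excerpt; the following is
# a hypothetical sketch of how such a generator could consume `rest`
# (random.sample and the refill-on-empty step are assumptions, not the
# project's actual implementation):

import random

def next_batch_sketch(data_num, batch_size):
    rest = set(range(data_num))
    while True:
        if not rest:
            rest = set(range(data_num))  # start a new epoch
        chosen = random.sample(sorted(rest), min(batch_size, len(rest)))
        rest -= set(chosen)
        yield chosen  # indices into self.input_list / self.label_list
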
    def __init__(self, data_type, label_type_second, batch_size,
                 num_stack=None, num_skip=None,
                 is_sorted=True, is_progressbar=False, num_gpu=1):
        """
        Args:
            data_type: string, train or dev or test
            label_type_second: string, phone39 or phone48 or phone61
            batch_size: int, the size of mini-batch
            num_stack: int, the number of frames to stack
            num_skip: int, the number of frames to skip
            is_sorted: if True, sort dataset by frame num
            is_progressbar: if True, visualize progressbar
            num_gpu: int, the number of GPUs; the loader draws
                batch_size * num_gpu utterances per step so the batch can
                be split across GPUs
        """
        if data_type not in ['train', 'dev', 'test']:
            raise ValueError('data_type must be "train" or "dev" or "test".')

        self.data_type = data_type
        self.label_type_second = label_type_second
        self.batch_size = batch_size * num_gpu
        self.num_stack = num_stack
        self.num_skip = num_skip
        self.is_sorted = is_sorted
        self.is_progressbar = is_progressbar
        self.num_gpu = num_gpu

        self.input_size = 123
        self.dataset_char_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/ctc/character', data_type)
        self.dataset_phone_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/ctc/',
            label_type_second, data_type)

        # Load the frame number dictionary
        self.frame_num_dict_path = join(
            self.dataset_char_path, 'frame_num.pickle')
        with open(self.frame_num_dict_path, 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Sort paths to input & label by frame num
        self.frame_num_tuple_sorted = sorted(
            self.frame_num_dict.items(), key=lambda x: x[1])
        input_paths, label_char_paths, label_phone_paths = [], [], []
        for input_name, frame_num in self.frame_num_tuple_sorted:
            input_paths.append(join(
                self.dataset_char_path, 'input', input_name + '.npy'))
            label_char_paths.append(join(
                self.dataset_char_path, 'label', input_name + '.npy'))
            label_phone_paths.append(join(
                self.dataset_phone_path, 'label', input_name + '.npy'))
        if len(label_char_paths) != len(label_phone_paths):
            raise ValueError(
                'The numbers of character and phone labels do not match.')
        self.input_paths = np.array(input_paths)
        self.label_char_paths = np.array(label_char_paths)
        self.label_phone_paths = np.array(label_phone_paths)
        self.data_num = len(self.input_paths)

        # Load all dataset in advance
        print('=> Loading ' + data_type +
              ' dataset (' + label_type_second + ')...')
        input_list, label_char_list, label_phone_list = [], [], []
        for i in wrap_iterator(range(self.data_num), self.is_progressbar):
            input_list.append(np.load(self.input_paths[i]))
            label_char_list.append(np.load(self.label_char_paths[i]))
            label_phone_list.append(np.load(self.label_phone_paths[i]))
        self.input_list = np.array(input_list)
        self.label_char_list = np.array(label_char_list)
        self.label_phone_list = np.array(label_phone_list)

        # Frame stacking
        if (num_stack is not None) and (num_skip is not None):
            print('=> Stacking frames...')
            stacked_input_list = stack_frame(self.input_list,
                                             self.input_paths,
                                             self.frame_num_dict,
                                             num_stack,
                                             num_skip,
                                             is_progressbar)
            self.input_list = np.array(stacked_input_list)
            self.input_size = self.input_size * num_stack

        self.rest = set(range(self.data_num))
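
# With num_gpu > 1 the loader draws batch_size * num_gpu utterances per step.
# A hypothetical sketch of splitting such a global batch into per-GPU shards
# (np.array_split is an assumption; the project's actual splitting code is
# not part of this excerpt):

import numpy as np

def split_for_gpus(global_batch, num_gpu):
    """Split a (batch_size * num_gpu)-sized batch into num_gpu shards."""
    return np.array_split(np.asarray(global_batch), num_gpu)

# e.g. 8 utterance indices across 2 GPUs -> two shards of 4
print(split_for_gpus(list(range(8)), 2))
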
def stack_frame(input_list, num_stack, num_skip, progressbar=False):
    """Stack & skip some frames. This implementation is based on
       https://arxiv.org/abs/1507.06947.
           Sak, Haşim, et al.
           "Fast and accurate recurrent neural network acoustic models for speech recognition."
           arXiv preprint arXiv:1507.06947 (2015).
    Args:
        input_list (list): list of input data
        num_stack (int): the number of frames to stack
        num_skip (int): the number of frames to skip
        progressbar (bool, optional): if True, visualize progressbar
    Returns:
        input_list_new (list): list of frame-stacked inputs
    """
    if num_stack == 1 and num_skip == 1:
        return input_list

    if num_stack < num_skip:
        raise ValueError('num_skip must be less than or equal to num_stack.')

    batch_size = len(input_list)

    input_list_new = []
    for i_batch in wrap_iterator(range(batch_size), progressbar):

        frame_num, input_size = input_list[i_batch].shape
        frame_num_new = math.ceil(frame_num / num_skip)

        stacked_frames = np.zeros((frame_num_new, input_size * num_stack))
        stack_count = 0  # counter
        stack = []
        for t, frame_t in enumerate(input_list[i_batch]):
            #####################
            # final frame
            #####################
            if t == len(input_list[i_batch]) - 1:
                # Stack the final frame
                stack.append(frame_t)

                while stack_count != int(frame_num_new):
                    # Concatenate stacked frames
                    for i_stack in range(len(stack)):
                        stacked_frames[stack_count][input_size *
                                                    i_stack:input_size * (i_stack + 1)] = stack[i_stack]
                    stack_count += 1

                    # Delete some frames to skip
                    for _ in range(num_skip):
                        if len(stack) != 0:
                            stack.pop(0)

            ########################
            # first & middle frames
            ########################
            elif len(stack) < num_stack:
                # Stack some frames until stack is filled
                stack.append(frame_t)

                if len(stack) == num_stack:
                    # Concatenate stacked frames
                    for i_stack in range(num_stack):
                        stacked_frames[stack_count][input_size *
                                                    i_stack:input_size * (i_stack + 1)] = stack[i_stack]
                    stack_count += 1

                    # Delete some frames to skip
                    for _ in range(num_skip):
                        stack.pop(0)

        input_list_new.append(stacked_frames)

    return np.array(input_list_new)
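
# Functionally, output row k above is the concatenation of input frames
# [k * num_skip, k * num_skip + num_stack), zero-padded past the end of the
# utterance. A compact single-utterance sketch of that equivalence, under the
# same num_skip <= num_stack restriction (stack_frames_single is a local name
# for illustration only):

import math

import numpy as np

def stack_frames_single(x, num_stack, num_skip):
    """x: (frame_num, input_size) -> (ceil(frame_num / num_skip),
    input_size * num_stack), matching stack_frame for one utterance."""
    frame_num, input_size = x.shape
    frame_num_new = math.ceil(frame_num / num_skip)
    out = np.zeros((frame_num_new, input_size * num_stack), dtype=x.dtype)
    for k in range(frame_num_new):
        chunk = x[k * num_skip: k * num_skip + num_stack]  # short at the end
        out[k, :chunk.shape[0] * input_size] = chunk.reshape(-1)
    return out

# e.g. 7 frames, num_stack=3, num_skip=2 -> 4 rows covering frames
# (0,1,2), (2,3,4), (4,5,6), (6,pad,pad)
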
def do_eval_cer(session,
                decode_op,
                network,
                dataset,
                label_type,
                is_test=None,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False,
                is_main=False):
    """Evaluate trained model by Character Error Rate.
    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of `Dataset` class
        label_type: string, character or kanji
        is_test: set to True when evaluating on the test set
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize progressbar
        is_multitask: if True, evaluate the multitask model
        is_main: if True, evaluate the main task
    Returns:
        cer_mean: An average of CER
    """
    if eval_batch_size is None:
        batch_size = network.batch_size
    else:
        batch_size = eval_batch_size

    num_examples = dataset.data_num
    # Ceiling division: number of mini-batches to cover the dataset
    iteration = (num_examples + batch_size - 1) // batch_size
    cer_sum = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    if label_type == 'character':
        map_file_path = '../metric/mapping_files/ctc/char2num.txt'
    elif label_type == 'kanji':
        map_file_path = '../metric/mapping_files/ctc/kanji2num.txt'
    else:
        raise ValueError('label_type must be "character" or "kanji".')

    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        if not is_multitask:
            inputs, labels_true_st, inputs_seq_len, _ = next(mini_batch)
        else:
            if is_main:
                inputs, labels_true_st, _, inputs_seq_len, _ = next(mini_batch)
            else:
                inputs, _, labels_true_st, inputs_seq_len, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
        labels_true = sparsetensor2list(labels_true_st, batch_size_each)
        labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)
        for i_batch in range(batch_size_each):
            # Convert from list to string
            str_pred = num2char(labels_pred[i_batch], map_file_path)
            # TODO: change in case of character
            if label_type == 'kanji' and is_test:
                str_true = ''.join(labels_true[i_batch])
                # NOTE: for kanji, test-set labels are stored as raw strings
            else:
                str_true = num2char(labels_true[i_batch], map_file_path)

            # Remove silence (_) labels
            str_true = re.sub(r'[_]+', "", str_true)
            str_pred = re.sub(r'[_]+', "", str_pred)

            # Compute edit distance normalized by reference length
            cer_each = Levenshtein.distance(str_pred, str_true) / len(str_true)

            cer_sum += cer_each

    cer_mean = cer_sum / dataset.data_num

    return cer_mean
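
# The per-utterance metric above is the Levenshtein (edit) distance divided
# by the reference length. A self-contained sketch with python-Levenshtein,
# the same library used above (cer is a local name for illustration):

import Levenshtein

def cer(str_pred, str_true):
    """Character error rate of one hypothesis against one reference."""
    return Levenshtein.distance(str_pred, str_true) / len(str_true)

print(cer('helxo', 'hello'))  # one substitution over 5 chars -> 0.2
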
    def __init__(self,
                 data_type,
                 train_data_size,
                 label_type_main,
                 label_type_second,
                 batch_size,
                 num_stack=None,
                 num_skip=None,
                 is_sorted=True,
                 is_progressbar=False,
                 num_gpu=1):
        """
        Args:
            data_type: string, train or dev or eval1 or eval2 or eval3
            train_data_size: string, default or large
            label_type_main: string, character or kanji
            label_type_second: string, character or phone
            batch_size: int, the size of mini-batch
            num_stack: int, the number of frames to stack
            num_skip: int, the number of frames to skip
            is_sorted: if True, sort dataset by frame num
            is_progressbar: if True, visualize progressbar
            num_gpu: int, the number of GPUs; the loader draws
                batch_size * num_gpu utterances per step so the batch can
                be split across GPUs
        """
        if data_type not in ['train', 'dev', 'eval1', 'eval2', 'eval3']:
            raise ValueError(
                'data_type must be "train", "dev", "eval1", "eval2" or '
                '"eval3".')

        self.data_type = data_type
        self.train_data_size = train_data_size
        self.label_type_main = label_type_main
        self.label_type_second = label_type_second
        self.batch_size = batch_size * num_gpu
        self.num_stack = num_stack
        self.num_skip = num_skip
        self.is_sorted = is_sorted
        self.is_progressbar = is_progressbar
        self.num_gpu = num_gpu

        self.input_size = 123
        self.dataset_main_path = join(
            '/n/sd8/inaguma/corpus/csj/dataset/monolog/ctc/', label_type_main,
            train_data_size, data_type)
        self.dataset_second_path = join(
            '/n/sd8/inaguma/corpus/csj/dataset/monolog/ctc/',
            label_type_second, train_data_size, data_type)

        # Load the frame number dictionary
        self.frame_num_dict_path = join(self.dataset_main_path,
                                        'frame_num.pickle')
        with open(self.frame_num_dict_path, 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Sort paths to input & label by frame num
        print('=> Loading paths to dataset...')
        self.frame_num_tuple_sorted = sorted(self.frame_num_dict.items(),
                                             key=lambda x: x[1])
        input_paths, label_main_paths, label_second_paths = [], [], []
        for input_name, frame_num in wrap_iterator(self.frame_num_tuple_sorted,
                                                   self.is_progressbar):
            speaker_name = input_name.split('_')[0]
            input_paths.append(
                join(self.dataset_main_path, 'input', speaker_name,
                     input_name + '.npy'))
            label_main_paths.append(
                join(self.dataset_main_path, 'label', speaker_name,
                     input_name + '.npy'))
            label_second_paths.append(
                join(self.dataset_second_path, 'label', speaker_name,
                     input_name + '.npy'))
        self.input_paths = np.array(input_paths)
        self.label_main_paths = np.array(label_main_paths)
        self.label_second_paths = np.array(label_second_paths)
        self.data_num = len(self.input_paths)

        if (self.num_stack is not None) and (self.num_skip is not None):
            self.input_size = self.input_size * num_stack
        # NOTE: Not load dataset yet

        self.rest = set(range(self.data_num))
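
# Unlike the TIMIT loaders above, this class keeps only the .npy paths in
# memory (see the NOTE above). A hypothetical sketch of materializing a batch
# lazily from those paths (the actual next_batch implementation is not part
# of this excerpt):

import numpy as np

def load_batch(input_paths, label_paths, indices):
    """Load only the selected utterances from disk at batch time."""
    inputs = [np.load(input_paths[i]) for i in indices]
    labels = [np.load(label_paths[i]) for i in indices]
    return inputs, labels
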
def do_eval_per(session,
                decode_op,
                per_op,
                network,
                dataset,
                train_label_type,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Phone Error Rate.
    Args:
        session: session of training model
        decode_op: operation for decoding
        per_op: operation for computing phone error rate
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        train_label_type: string, phone39 or phone48 or phone61
        eval_batch_size: int, the batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model
    Returns:
        per_global: An average of PER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    data_label_type = dataset.label_type

    num_examples = dataset.data_num
    # Ceiling division: number of mini-batches to cover the dataset
    iteration = (num_examples + batch_size - 1) // batch_size
    per_global = 0

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    # e.g. train_label_type 'phone61' -> '.../ctc/phone2num_61.txt'
    phone2num_map_file_path = '../metric/mapping_files/ctc/phone2num_' + \
        train_label_type[5:7] + '.txt'
    phone2num_39_map_file_path = '../metric/mapping_files/ctc/phone2num_39.txt'
    phone2phone_map_file_path = '../metric/mapping_files/phone2phone.txt'
    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        if not is_multitask:
            inputs, labels_true_st, inputs_seq_len, _ = next(mini_batch)
        else:
            inputs, _, labels_true_st, inputs_seq_len, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        # NOTE: this branch is disabled; evaluation is always mapped down
        # to 39 phones below
        if False:
            # Evaluate with the same phone set used for training
            per_local = session.run(per_op, feed_dict=feed_dict)
            per_global += per_local * batch_size_each

        else:
            # Evaluate by 39 phones
            labels_pred_st = session.run(decode_op, feed_dict=feed_dict)
            labels_true = sparsetensor2list(labels_true_st, batch_size_each)
            labels_pred = sparsetensor2list(labels_pred_st, batch_size_each)
            for i_batch in range(batch_size_each):
                # Convert from num to phone (-> list of phone strings)
                phone_pred_seq = num2phone(labels_pred[i_batch],
                                           phone2num_map_file_path)
                phone_pred_list = phone_pred_seq.split(' ')

                # Mapping to 39 phones (-> list of phone strings)
                phone_pred_list = map_to_39phone(phone_pred_list,
                                                 train_label_type,
                                                 phone2phone_map_file_path)

                # Convert from phone to num (-> list of phone indices)
                phone_pred_list = phone2num(phone_pred_list,
                                            phone2num_39_map_file_path)
                labels_pred[i_batch] = phone_pred_list

                if data_label_type != 'phone39':
                    # Convert from num to phone (-> list of phone strings)
                    phone_true_seq = num2phone(labels_true[i_batch],
                                               phone2num_map_file_path)
                    phone_true_list = phone_true_seq.split(' ')

                    # Mapping to 39 phones (-> list of phone strings)
                    phone_true_list = map_to_39phone(
                        phone_true_list, data_label_type,
                        phone2phone_map_file_path)

                    # Convert from phone to num (-> list of phone indices)
                    phone_true_list = phone2num(phone_true_list,
                                                phone2num_39_map_file_path)
                    labels_true[i_batch] = phone_true_list

            # Compute edit distance
            labels_true_st = list2sparsetensor(labels_true)
            labels_pred_st = list2sparsetensor(labels_pred)
            per_local = compute_edit_distance(session, labels_true_st,
                                              labels_pred_st)
            per_global += per_local * batch_size_each

    per_global /= dataset.data_num

    return per_global
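
# Collapsing 61 (or 48) phones down to 39 for scoring is the standard TIMIT
# evaluation protocol. map_to_39phone is not shown in this excerpt; below is
# a hypothetical sketch that assumes phone2phone.txt lists one source phone
# and its 39-phone class per line (the file format is an assumption):

def map_to_39phone_sketch(phone_list, map_file_path):
    """Collapse a list of phone strings to the 39-phone set."""
    mapping = {}
    with open(map_file_path) as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue  # skip blank/malformed lines
            mapping[parts[0]] = parts[1]
    # Unlisted phones are kept as-is in this sketch
    return [mapping.get(p, p) for p in phone_list]
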
    def __init__(self,
                 data_type,
                 label_type,
                 batch_size,
                 eos_index,
                 is_sorted=True,
                 is_progressbar=False,
                 num_gpu=1):
        """
        Args:
            data_type: string, train or dev or test
            label_type: string, phone39 or phone48 or phone61 or character
            batch_size: int, the size of mini-batch
            eos_index: int, the index of <EOS> class
            is_sorted: if True, sort dataset by frame num
            is_progressbar: if True, visualize progressbar
            num_gpu: int, the number of GPUs; the loader draws
                batch_size * num_gpu utterances per step so the batch can
                be split across GPUs
        """
        if data_type not in ['train', 'dev', 'test']:
            raise ValueError('data_type must be "train" or "dev" or "test".')

        self.data_type = data_type
        self.label_type = label_type
        self.batch_size = batch_size * num_gpu
        self.eos_index = eos_index
        self.is_sorted = is_sorted
        self.is_progressbar = is_progressbar
        self.num_gpu = num_gpu

        self.input_size = 123
        self.dataset_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/attention/', label_type,
            data_type)

        # Load the frame number dictionary
        self.frame_num_dict_path = join(self.dataset_path,
                                        'frame_num_dict.pickle')
        with open(self.frame_num_dict_path, 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Sort paths to input & label by frame num
        self.frame_num_tuple_sorted = sorted(self.frame_num_dict.items(),
                                             key=lambda x: x[1])
        input_paths, label_paths = [], []
        for input_name, frame_num in self.frame_num_tuple_sorted:
            input_paths.append(
                join(self.dataset_path, 'input', input_name + '.npy'))
            label_paths.append(
                join(self.dataset_path, 'label', input_name + '.npy'))
        self.input_paths = np.array(input_paths)
        self.label_paths = np.array(label_paths)
        self.data_num = len(self.input_paths)

        # Load all dataset in advance
        print('=> Loading ' + data_type + ' dataset (' + label_type + ')...')
        input_list, label_list = [], []
        for i in wrap_iterator(range(self.data_num), self.is_progressbar):
            input_list.append(np.load(self.input_paths[i]))
            label_list.append(np.load(self.label_paths[i]))
        self.input_list = np.array(input_list)
        self.label_list = np.array(label_list)

        self.rest = set(range(self.data_num))
def stack_frame(input_list,
                input_paths,
                frame_num_dict,
                num_stack,
                num_skip,
                progressbar=False):
    """Stack & skip some frames. This implementation is based on
       https://arxiv.org/abs/1507.06947.
           Sak, Haşim, et al.
           "Fast and accurate recurrent neural network acoustic models for speech recognition."
           arXiv preprint arXiv:1507.06947 (2015).
    Args:
        input_list (list): list of input data
        input_paths (list): paths to input data. This is used to get the
            number of frames from frame_num_dict.
        frame_num_dict (dict):
            key (string) => utterance index
            value (int) => the number of frames
        num_stack (int): the number of frames to stack
        num_skip (int): the number of frames to skip
        progressbar (bool, optional): if True, visualize progressbar
    Returns:
        stacked_input_list (list): list of frame-stacked inputs
    """
    if num_stack == 1 and num_skip == 1:
        return input_list

    if num_stack < num_skip:
        raise ValueError('num_skip must be less than or equal to num_stack.')

    input_size = input_list[0].shape[1]
    utt_num = len(input_paths)

    stacked_input_list = []
    for i_utt in wrap_iterator(range(utt_num), progressbar):
        # Per utterance
        input_name = basename(input_paths[i_utt]).split('.')[0]
        frame_num = frame_num_dict[input_name]
        # Ceiling division: number of frames after skipping
        frame_num_decimated = (frame_num + num_skip - 1) // num_skip

        stacked_frames = np.zeros(
            (frame_num_decimated, input_size * num_stack))
        stack_count = 0  # counter for stacked_frames
        stack = []
        for i_frame, frame in enumerate(input_list[i_utt]):
            #####################
            # final frame
            #####################
            if i_frame == len(input_list[i_utt]) - 1:
                # Stack the final frame
                stack.append(frame)

                while stack_count != int(frame_num_decimated):
                    # Concatenate stacked frames
                    for i_stack in range(len(stack)):
                        stacked_frames[stack_count][input_size *
                                                    i_stack:input_size *
                                                    (i_stack +
                                                     1)] = stack[i_stack]
                    stack_count += 1

                    # Delete some frames to skip
                    for _ in range(num_skip):
                        if len(stack) != 0:
                            stack.pop(0)

            ########################
            # first & middle frames
            ########################
            elif len(stack) < num_stack:
                # Stack some frames until stack is filled
                stack.append(frame)

                if len(stack) == num_stack:
                    # Concatenate stacked frames
                    for i_stack in range(num_stack):
                        stacked_frames[stack_count][input_size *
                                                    i_stack:input_size *
                                                    (i_stack +
                                                     1)] = stack[i_stack]
                    stack_count += 1

                    # Delete some frames to skip
                    for _ in range(num_skip):
                        stack.pop(0)

        stacked_input_list.append(stacked_frames)

    return np.array(stacked_input_list)
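
# A quick shape check for this variant with synthetic data, assuming the
# module's own imports (numpy as np, math, os.path.basename, wrap_iterator)
# are available; all names below are local to the example:

inputs = [np.random.randn(7, 3), np.random.randn(8, 3)]
paths = ['utt0.npy', 'utt1.npy']
frame_num_dict = {'utt0': 7, 'utt1': 8}

stacked = stack_frame(inputs, paths, frame_num_dict,
                      num_stack=3, num_skip=3)
print(stacked.shape)  # (2, 3, 9): 2 utterances, ceil(frames/3) rows, 3*3 dims
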
def do_eval_cer(session,
                decode_op,
                network,
                dataset,
                eval_batch_size=None,
                is_progressbar=False,
                is_multitask=False):
    """Evaluate trained model by Character Error Rate.
    Args:
        session: session of training model
        decode_op: operation for decoding
        network: network to evaluate
        dataset: An instance of a `Dataset` class
        eval_batch_size: int, batch size when evaluating the model
        is_progressbar: if True, visualize the progressbar
        is_multitask: if True, evaluate the multitask model
    Returns:
        cer_mean: An average of CER
    """
    if eval_batch_size is not None:
        batch_size = eval_batch_size
    else:
        batch_size = dataset.batch_size

    # Make data generator
    mini_batch = dataset.next_batch(batch_size=batch_size)

    num_examples = dataset.data_num
    # Ceiling division: number of mini-batches to cover the dataset
    iteration = (num_examples + batch_size - 1) // batch_size
    cer_sum = 0

    map_file_path = '../metric/mapping_files/attention/char2num.txt'
    for step in wrap_iterator(range(iteration), is_progressbar):
        # Create feed dictionary for next mini batch
        if not is_multitask:
            inputs, labels_true, inputs_seq_len, _, _ = next(mini_batch)
        else:
            inputs, labels_true, _, inputs_seq_len, _, _ = next(mini_batch)

        feed_dict = {
            network.inputs: inputs,
            network.inputs_seq_len: inputs_seq_len,
            network.keep_prob_input: 1.0,
            network.keep_prob_hidden: 1.0
        }

        batch_size_each = len(inputs_seq_len)

        predicted_ids = session.run(decode_op, feed_dict=feed_dict)
        for i_batch in range(batch_size_each):

            # Convert from list to string
            str_true = num2char(labels_true[i_batch], map_file_path)
            str_pred = num2char(predicted_ids[i_batch], map_file_path)

            # Remove silence (_) labels and angle-bracket markers
            str_true = re.sub(r'[_<>]+', "", str_true)
            str_pred = re.sub(r'[_]+', "", str_pred)

            # Compute edit distance normalized by reference length
            cer_each = Levenshtein.distance(str_pred, str_true) / len(str_true)
            cer_sum += cer_each

    cer_mean = cer_sum / dataset.data_num

    return cer_mean