def __init__(self,
                 data_type,
                 label_type,
                 batch_size,
                 eos_index,
                 max_epoch=None,
                 splice=1,
                 num_stack=1,
                 num_skip=1,
                 shuffle=False,
                 sort_utt=False,
                 sort_stop_epoch=None,
                 progressbar=False):
        """A class for loading dataset.
        Args:
            data_type (string): train or dev or test
            label_type (string): phone39 or phone48 or phone61 or
                character or character_capital_divide
            batch_size (int): the size of mini-batch
            eos_index (int): the index of <EOS> class
            max_epoch (int, optional): the max epoch. None means infinite loop.
            splice (int, optional): frames to splice. Default is 1 frame.
            num_stack (int, optional): the number of frames to stack
            num_skip (int, optional): the number of frames to skip
            shuffle (bool, optional): if True, shuffle utterances. This is
                disabled when sort_utt is True.
            sort_utt (bool, optional): if True, sort all utterances by the
                number of frames and shuffle utterances within each
                mini-batch. Otherwise, shuffle utterances.
            sort_stop_epoch (int, optional): after sort_stop_epoch, training
                reverts to a random order
            progressbar (bool, optional): if True, visualize progressbar
        """
        if data_type not in ['train', 'dev', 'test']:
            raise ValueError('data_type must be "train", "dev", or "test".')
        if label_type not in [
                'phone39', 'phone48', 'phone61', 'character',
                'character_capital_divide'
        ]:
            raise ValueError(
                'label_type must be "phone39", "phone48", "phone61", '
                '"character", or "character_capital_divide".')

        super(Dataset, self).__init__()

        self.data_type = data_type
        self.label_type = label_type
        self.batch_size = batch_size
        self.max_epoch = max_epoch
        self.eos_index = eos_index
        self.splice = splice
        self.num_stack = num_stack
        self.num_skip = num_skip
        self.shuffle = shuffle
        self.sort_utt = sort_utt
        self.sort_stop_epoch = sort_stop_epoch
        self.progressbar = progressbar
        self.padded_value = eos_index

        input_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/inputs/htk/speaker',
            data_type)
        label_path = join(
            '/n/sd8/inaguma/corpus/timit/dataset/labels/attention', label_type,
            data_type)

        # Load the frame number dictionary
        with open(join(input_path, 'frame_num.pickle'), 'rb') as f:
            self.frame_num_dict = pickle.load(f)

        # Sort paths to input & label
        axis = 1 if sort_utt else 0
        frame_num_tuple_sorted = sorted(self.frame_num_dict.items(),
                                        key=lambda x: x[axis])
        input_paths, label_paths = [], []
        for input_name, frame_num in frame_num_tuple_sorted:
            input_paths.append(join(input_path, input_name + '.npy'))
            label_paths.append(join(label_path, input_name + '.npy'))
        self.input_paths = np.array(input_paths)
        self.label_paths = np.array(label_paths)

        # Load the whole dataset in advance
        print('=> Loading dataset (%s, %s)...' % (data_type, label_type))
        input_list, label_list = [], []
        for i in wrap_iterator(range(len(self.input_paths)), self.progressbar):
            input_list.append(np.load(self.input_paths[i]))
            label_list.append(np.load(self.label_paths[i]))
        self.input_list = np.array(input_list)
        self.label_list = np.array(label_list)

        # Frame stacking
        print('=> Stacking frames...')
        self.input_list = stack_frame(self.input_list, self.input_paths,
                                      self.frame_num_dict, num_stack, num_skip,
                                      progressbar)

        self.rest = set(range(len(self.input_paths)))
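A minimal usage sketch for the loader above (hypothetical: it assumes this
__init__ belongs to the Dataset class whose __next__ is shown below, and that
the hard-coded TIMIT feature/label paths exist; eos_index=1 is arbitrary):

    dataset = Dataset(data_type='train', label_type='phone61',
                      batch_size=32, eos_index=1, max_epoch=10,
                      num_stack=3, num_skip=3, sort_utt=True,
                      sort_stop_epoch=5, progressbar=True)
    (inputs, labels, inputs_seq_len, input_names), is_new_epoch = next(dataset)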
    def make_batch(self, data_indices):
        """Create mini-batch per step.
        Args:
            data_indices (np.ndarray): indices of the utterances to include
                in the mini-batch
        Returns:
            batch (dict):
                xs (np.ndarray): input data of size
                    `[B, T_in, input_size]`
                ys (np.ndarray): target labels in the main task of size
                    `[B, T_out]`
                x_lens (np.ndarray): lengths of inputs of size
                    `[B]`
                y_lens (np.ndarray): lengths of target labels in the main task of size
                    `[B]`
                input_names (np.ndarray): file names of input data of size
                    `[B]`
        """
        input_path_list = np.array(self.df['input_path'][data_indices])
        str_indices_list = np.array(self.df['transcript'][data_indices])

        if not hasattr(self, 'input_size'):
            if self.use_double_delta:
                self.input_size = self.input_freq * 3
            elif self.use_delta:
                self.input_size = self.input_freq * 2
            else:
                self.input_size = self.input_freq
            self.input_size *= self.num_stack
            self.input_size *= self.splice

        # Compute max frame num in mini-batch
        max_frame_num = max(self.df['frame_num'][data_indices])
        max_frame_num = math.ceil(max_frame_num / self.num_skip)

        # Compute max target label length in mini-batch
        max_label_num = max(
            map(lambda x: len(str(x).split(' ')), str_indices_list))
        # TODO: fix POS tag (nan -> 'nan')

        # Initialization
        if self.backend == 'pytorch':
            xs = np.zeros((len(data_indices), max_frame_num, self.input_size),
                          dtype=np.float32)
        elif self.backend == 'chainer':
            xs = [None] * len(data_indices)
        if self.is_test:
            ys = np.array([[self.pad_value] * max_label_num] *
                          len(data_indices))
        else:
            ys = np.array([[self.pad_value] * max_label_num] *
                          len(data_indices),
                          dtype=np.int32)
        x_lens = np.zeros((len(data_indices), ), dtype=np.int32)
        y_lens = np.zeros((len(data_indices), ), dtype=np.int32)
        input_names = np.array(
            list(
                map(lambda path: basename(path).split('.')[0],
                    np.array(self.df['input_path'][data_indices]))))

        # Set values of each data in mini-batch
        for b in range(len(data_indices)):
            # Load input data, falling back across storage locations
            try:
                data_i_tmp = self.load(input_path_list[b].replace(
                    '/n/sd8/inaguma/corpus', '/data/inaguma'))
            except OSError:
                try:
                    data_i_tmp = self.load(input_path_list[b].replace(
                        '/n/sd8/inaguma/corpus', '/tmp/inaguma'))
                except OSError:
                    data_i_tmp = self.load(input_path_list[b])

            if self.use_double_delta:
                data_i = data_i_tmp
            elif self.use_delta:
                data_i = data_i_tmp[:, :self.input_freq * 2]
            else:
                data_i = data_i_tmp[:, :self.input_freq]

            # Frame stacking
            if self.num_stack > 1:
                data_i = stack_frame(data_i,
                                     self.num_stack,
                                     self.num_skip,
                                     dtype=np.float32)
            frame_num = data_i.shape[0]

            # Splicing
            if self.splice > 1:
                data_i = do_splice(data_i,
                                   self.splice,
                                   self.num_stack,
                                   dtype=np.float32)

            if self.backend == 'pytorch':
                xs[b, :frame_num, :] = data_i
            elif self.backend == 'chainer':
                xs[b] = data_i.astype(np.float32)
            x_lens[b] = frame_num
            if self.is_test:
                ys[b, 0] = self.df['transcript'][data_indices[b]]
                # NOTE: transcript is not tokenized
            else:
                indices = list(map(int, str_indices_list[b].split(' ')))
                ys[b, :len(indices)] = indices
                y_lens[b] = len(indices)

        batch = {
            'xs': xs,
            'ys': ys,
            'x_lens': x_lens,
            'y_lens': y_lens,
            'input_names': input_names
        }

        return batch
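A hedged sketch of how make_batch is typically driven (it assumes the object
exposes a pandas DataFrame self.df as used above; the index values below are
illustrative only):

    import numpy as np

    data_indices = np.arange(32)          # first 32 utterances (illustrative)
    batch = dataset.make_batch(data_indices)
    print(batch['xs'].shape)              # (B, T_in, input_size) for pytorch
    print(batch['ys'].shape)              # (B, T_out)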
    def __next__(self, batch_size=None):
        """Generate each mini-batch.
        Args:
            batch_size (int, optional): the size of mini-batch
        Returns:
            A tuple of `(inputs, labels, inputs_seq_len, input_names)`
                inputs: list of input data of size
                    `[num_gpu, B, T_in, input_size]`
                labels: list of target labels of size
                    `[num_gpu, B, T_out]`
                inputs_seq_len: list of length of inputs of size
                    `[num_gpu, B]`
                input_names: list of file name of input data of size
                    `[num_gpu, B]`
            is_new_epoch (bool): if True, one epoch has finished
        """
        if self.max_epoch is not None and self.epoch >= self.max_epoch:
            raise StopIteration
        # NOTE: max_epoch = None means infinite loop

        if batch_size is None:
            batch_size = self.batch_size

        # reset
        if self.is_new_epoch:
            self.is_new_epoch = False

        if not self.is_test:
            self.padded_value = -1
        else:
            self.padded_value = None
        # TODO(hirofumi): move this

        if self.sort_utt:
            # Sort all utterances by length
            if len(self.rest) > batch_size:
                data_indices = sorted(list(self.rest))[:batch_size]
                self.rest -= set(data_indices)
                # NOTE: rest is in utterance-length order
            else:
                # Last mini-batch
                data_indices = list(self.rest)
                self.reset()
                self.is_new_epoch = True
                self.epoch += 1
                if self.epoch == self.sort_stop_epoch:
                    self.sort_utt = False
                    self.shuffle = True

            # Shuffle data in the mini-batch
            random.shuffle(data_indices)

        elif self.shuffle:
            # Randomly sample utterances
            if len(self.rest) > batch_size:
                data_indices = random.sample(list(self.rest), batch_size)
                self.rest -= set(data_indices)
            else:
                # Last mini-batch
                data_indices = list(self.rest)
                self.reset()
                self.is_new_epoch = True
                self.epoch += 1

                # Shuffle selected mini-batch
                random.shuffle(data_indices)

        else:
            if len(self.rest) > batch_size:
                data_indices = sorted(list(self.rest))[:batch_size]
                self.rest -= set(data_indices)
                # NOTE: rest is in name order
            else:
                # Last mini-batch
                data_indices = list(self.rest)
                self.reset()
                self.is_new_epoch = True
                self.epoch += 1

        # Load dataset in mini-batch
        input_list = np.array(
            list(
                map(lambda path: np.load(path),
                    np.take(self.input_paths, data_indices, axis=0))))
        label_list = np.array(
            list(
                map(lambda path: np.load(path),
                    np.take(self.label_paths, data_indices, axis=0))))

        if not hasattr(self, 'input_size'):
            self.input_size = input_list[0].shape[1]
            if self.num_stack is not None and self.num_skip is not None:
                self.input_size *= self.num_stack

        # Frame stacking
        input_list = stack_frame(input_list,
                                 self.num_stack,
                                 self.num_skip,
                                 progressbar=False)

        # Compute max frame num in mini-batch
        max_frame_num = max(map(lambda x: x.shape[0], input_list))

        # Compute max target label length in mini-batch
        max_seq_len = max(map(len, label_list))

        # Initialization
        inputs = np.zeros(
            (len(data_indices), max_frame_num, self.input_size * self.splice),
            dtype=np.float32)
        labels = np.array([[self.padded_value] * max_seq_len] *
                          len(data_indices))
        inputs_seq_len = np.zeros((len(data_indices), ), dtype=np.int32)
        input_names = list(
            map(lambda path: basename(path).split('.')[0],
                np.take(self.input_paths, data_indices, axis=0)))

        # Set values of each data in mini-batch
        for i_batch in range(len(data_indices)):
            data_i = input_list[i_batch]
            frame_num, input_size = data_i.shape

            # Splicing
            data_i = data_i.reshape(1, frame_num, input_size)
            data_i = do_splice(data_i, splice=self.splice,
                               batch_size=1).reshape(frame_num, -1)

            inputs[i_batch, :frame_num, :] = data_i
            if self.is_test:
                labels[i_batch, 0] = label_list[i_batch]
            else:
                labels[
                    i_batch, :len(label_list[i_batch])] = label_list[i_batch]
            inputs_seq_len[i_batch] = frame_num

        ###############
        # Multi-GPUs
        ###############
        if self.num_gpu > 1:
            # Now we split the mini-batch data by num_gpu
            inputs = np.array_split(inputs, self.num_gpu, axis=0)
            labels = np.array_split(labels, self.num_gpu, axis=0)
            inputs_seq_len = np.array_split(inputs_seq_len,
                                            self.num_gpu,
                                            axis=0)
            input_names = np.array_split(input_names, self.num_gpu, axis=0)
        else:
            inputs = inputs[np.newaxis, :, :, :]
            labels = labels[np.newaxis, :, :]
            inputs_seq_len = inputs_seq_len[np.newaxis, :]
            input_names = np.array(input_names)[np.newaxis, :]

        self.iteration += len(data_indices)

        # Clean up
        del input_list
        del label_list

        return (inputs, labels, inputs_seq_len, input_names), self.is_new_epoch
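stack_frame above is an imported helper and its exact signature differs
between snippets. As a rough sketch of the frame-stacking idea only (not the
project's implementation): concatenate num_stack consecutive frames into one
feature vector and advance num_skip frames between outputs:

    import numpy as np

    def stack_frame_sketch(x, num_stack, num_skip):
        """x: `[T, input_size]` -> `[ceil(T / num_skip), input_size * num_stack]`."""
        frame_num, input_size = x.shape
        stacked = []
        for t in range(0, frame_num, num_skip):
            window = x[t:t + num_stack]
            # Zero-pad the last window if it runs past the final frame
            pad = np.zeros((num_stack - len(window), input_size), dtype=x.dtype)
            stacked.append(np.concatenate([window, pad], axis=0).reshape(-1))
        return np.array(stacked, dtype=x.dtype)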
    def __next__(self, batch_size=None):
        """Generate each mini-batch.
        Args:
            batch_size (int, optional): the size of mini-batch
        Returns:
            A tuple of `(inputs, labels, inputs_seq_len, labels_seq_len, input_names)`
                inputs: list of input data of size
                    `[num_gpu, B, T_in, input_size]`
                labels: list of target labels of size
                    `[num_gpu, B, T_out]`
                inputs_seq_len: list of length of inputs of size
                    `[num_gpu, B]`
                labels_seq_len: list of length of target labels of size
                    `[num_gpu, B]`
                input_names: list of file name of input data of size
                    `[num_gpu, B]`
            is_new_epoch (bool): if True, one epoch has finished
        """
        if self.max_epoch is not None and self.epoch >= self.max_epoch:
            raise StopIteration
        # NOTE: max_epoch = None means infinite loop

        if batch_size is None:
            batch_size = self.batch_size

        # reset
        if self.is_new_epoch:
            self.is_new_epoch = False

        if not self.is_test:
            self.padded_value = self.eos_index
        else:
            self.padded_value = None
        # TODO(hirofumi): move this

        if self.sort_utt:
            # Sort all utterances by length
            if len(self.rest) > batch_size:
                data_indices = sorted(list(self.rest))[:batch_size]
                self.rest -= set(data_indices)
                # NOTE: rest is in utterance-length order
            else:
                # Last mini-batch
                data_indices = list(self.rest)
                self.reset()
                self.is_new_epoch = True
                self.epoch += 1
                if self.epoch == self.sort_stop_epoch:
                    self.sort_utt = False
                    self.shuffle = True

            # Shuffle data in the mini-batch
            random.shuffle(data_indices)

        elif self.shuffle:
            # Randomly sample utterances
            if len(self.rest) > batch_size:
                data_indices = random.sample(list(self.rest), batch_size)
                self.rest -= set(data_indices)
            else:
                # Last mini-batch
                data_indices = list(self.rest)
                self.reset()
                self.is_new_epoch = True
                self.epoch += 1

                # Shuffle selected mini-batch
                random.shuffle(data_indices)

        else:
            if len(self.rest) > batch_size:
                data_indices = sorted(list(self.rest))[:batch_size]
                self.rest -= set(data_indices)
                # NOTE: rest is in name order
            else:
                # Last mini-batch
                data_indices = list(self.rest)
                self.reset()
                self.is_new_epoch = True
                self.epoch += 1

        # Load dataset in mini-batch
        input_list = np.array(list(
            map(lambda path: np.load(path),
                np.take(self.input_paths, data_indices, axis=0))))
        label_list = np.array(list(
            map(lambda path: np.load(path),
                np.take(self.label_paths, data_indices, axis=0))))

        if not hasattr(self, 'input_size'):
            self.input_size = input_list[0].shape[1]
            if self.num_stack is not None and self.num_skip is not None:
                self.input_size *= self.num_stack

        # Frame stacking
        input_list = stack_frame(input_list,
                                 self.num_stack,
                                 self.num_skip,
                                 progressbar=False)

        # Compute max frame num in mini-batch
        max_frame_num = max(map(lambda x: x.shape[0], input_list))

        # Compute max target label length in mini-batch
        max_seq_len = max(map(len, label_list)) + 2
        # NOTE: + <SOS> and <EOS>

        # Initialization
        inputs = np.zeros(
            (len(data_indices), max_frame_num, self.input_size * self.splice),
            dtype=np.float32)
        labels = np.array(
            [[self.padded_value] * max_seq_len] * len(data_indices))
        inputs_seq_len = np.zeros((len(data_indices),), dtype=np.int32)
        labels_seq_len = np.zeros((len(data_indices),), dtype=np.int32)
        input_names = list(
            map(lambda path: basename(path).split('.')[0],
                np.take(self.input_paths, data_indices, axis=0)))

        # Set values of each data in mini-batch
        for i_batch in range(len(data_indices)):
            data_i = input_list[i_batch]
            frame_num, input_size = data_i.shape

            # Splicing
            data_i = data_i.reshape(1, frame_num, input_size)
            data_i = do_splice(data_i,
                               splice=self.splice,
                               batch_size=1,
                               num_stack=self.num_stack)
            data_i = data_i.reshape(frame_num, -1)

            inputs[i_batch, : frame_num, :] = data_i
            if self.is_test:
                labels[i_batch, 0] = label_list[i_batch]
                # NOTE: transcript is saved as string
            else:
                labels[i_batch, 0] = self.sos_index
                labels[i_batch, 1:len(label_list[i_batch]) +
                       1] = label_list[i_batch]
                labels[i_batch, len(label_list[i_batch]) + 1] = self.eos_index
            inputs_seq_len[i_batch] = frame_num
            labels_seq_len[i_batch] = len(label_list[i_batch]) + 2
            # NOTE: +2 accounts for the prepended <SOS> and appended <EOS>

        ###############
        # Multi-GPUs
        ###############
        if self.num_gpu > 1:
            # Now we split the mini-batch data by num_gpu
            inputs = np.array_split(inputs, self.num_gpu, axis=0)
            labels = np.array_split(labels, self.num_gpu, axis=0)
            inputs_seq_len = np.array_split(
                inputs_seq_len, self.num_gpu, axis=0)
            labels_seq_len = np.array_split(
                labels_seq_len, self.num_gpu, axis=0)
            input_names = np.array_split(input_names, self.num_gpu, axis=0)
        else:
            inputs = inputs[np.newaxis, :, :, :]
            labels = labels[np.newaxis, :, :]
            inputs_seq_len = inputs_seq_len[np.newaxis, :]
            labels_seq_len = labels_seq_len[np.newaxis, :]
            input_names = np.array(input_names)[np.newaxis, :]

        self.iteration += len(data_indices)

        # Clean up
        del input_list
        del label_list

        return (inputs, labels, inputs_seq_len, labels_seq_len,
                input_names), self.is_new_epoch
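Likewise, a rough sketch of what splicing does (not the project's do_splice;
it assumes an odd splice width): each frame is concatenated with its
(splice - 1) // 2 left and right context frames, clamped at the edges:

    import numpy as np

    def do_splice_sketch(x, splice):
        """x: `[T, input_size]` -> `[T, input_size * splice]`."""
        frame_num, input_size = x.shape
        context = (splice - 1) // 2
        spliced = np.zeros((frame_num, input_size * splice), dtype=x.dtype)
        for t in range(frame_num):
            for i, offset in enumerate(range(-context, context + 1)):
                tt = min(max(t + offset, 0), frame_num - 1)  # clamp at edges
                spliced[t, i * input_size:(i + 1) * input_size] = x[tt]
        return spliced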
def generate_data(label_type='char',
                  batch_size=1,
                  num_stack=1,
                  splice=1,
                  backend='pytorch'):
    """Generate dataset for unit test.
    Args:
        label_type (string, optional): char or word or word_char
        batch_size (int, optional): the size of mini-batch
        num_stack (int, optional): the number of frames to stack
        splice (int, optional): frames to splice. Default is 1 frame.
        backend (string, optional): pytorch or chainer
    Returns:
        xs (np.ndarray): A tensor of size `[B, T, input_size]`
        ys (np.ndarray): A tensor of size `[B, max_label_seq_len]`
        x_lens (np.ndarray): A tensor of size `[B]`
        y_lens (np.ndarray): A tensor of size `[B]`
        (if label_type is word_char, ys_sub and y_lens_sub are also returned)
    """
    # Make input data
    _xs, x_lens = wav2feature(['../../sample/LDC93S1.wav'] * batch_size,
                              feature_type='logfbank',
                              feature_dim=40,
                              energy=False,
                              delta1=True,
                              delta2=True,
                              dtype=np.float32)

    max_frame_num = math.ceil(x_lens[0] / num_stack)
    if backend == 'pytorch':
        xs = np.zeros(
            (batch_size, max_frame_num, _xs.shape[-1] * num_stack * splice),
            dtype=np.float32)
    elif backend == 'chainer':
        xs = [None] * batch_size

    for b in range(batch_size):
        # Frame stacking
        data_i = stack_frame(_xs[b],
                             num_stack=num_stack,
                             num_skip=num_stack,
                             dtype=np.float32)

        # Splice
        data_i = do_splice(data_i,
                           splice=splice,
                           num_stack=num_stack,
                           dtype=np.float32)

        xs[b] = data_i
        x_lens[b] = len(data_i)

    # Make transcripts
    trans = _read_text('../../sample/LDC93S1.txt')
    trans = trans.replace('.', '').replace(' ', SPACE)
    if label_type == 'char':
        ys = np.array([char2idx(trans)] * batch_size, dtype=np.int32)
        y_lens = np.array([len(char2idx(trans))] * batch_size, dtype=np.int32)
        return xs, ys, x_lens, y_lens

    elif label_type == 'word':
        ys = np.array([word2idx(trans)] * batch_size, dtype=np.int32)
        y_lens = np.array([len(word2idx(trans))] * batch_size, dtype=np.int32)
        return xs, ys, x_lens, y_lens

    elif label_type == 'word_char':
        ys = np.array([word2idx(trans)] * batch_size, dtype=np.int32)
        ys_sub = np.array([char2idx(trans)] * batch_size, dtype=np.int32)
        y_lens = np.array([len(word2idx(trans))] * batch_size, dtype=np.int32)
        y_lens_sub = np.array([len(char2idx(trans))] * batch_size,
                              dtype=np.int32)
        return xs, ys, ys_sub, x_lens, y_lens, y_lens_sub

    else:
        raise NotImplementedError
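A possible call for unit tests (hypothetical: it assumes ../../sample/LDC93S1.wav,
../../sample/LDC93S1.txt, and the char2idx helper exist as referenced above):

    xs, ys, x_lens, y_lens = generate_data(label_type='char',
                                           batch_size=2,
                                           num_stack=2,
                                           splice=1,
                                           backend='pytorch')
    assert xs.shape[0] == 2 and len(x_lens) == 2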
def generate_data(label_type, model, batch_size=1, num_stack=1, splice=1):
    """
    Args:
        label_type (string): character or phone or multitask
        model (string): ctc or attention or joint_ctc_attention
        batch_size (int, optional): the size of mini-batch
        num_stack (int, optional): the number of frames to stack
        splice (int, optional): frames to splice. Default is 1 frame.
    Returns:
        inputs: `[B, T, input_size]`
        labels: `[B, max_label_seq_len]`
        inputs_seq_len: `[B]`
        labels_seq_len: `[B]` (if model is attention)
    """
    # Make input data
    inputs, inputs_seq_len = wav2feature(
        ['./sample/LDC93S1.wav'] * batch_size,
        feature_type='logfbank', feature_dim=40,
        energy=False, delta1=True, delta2=True)

    # Frame stacking
    inputs = stack_frame(inputs,
                         num_stack=num_stack,
                         num_skip=num_stack,
                         progressbar=False)
    if num_stack != 1:
        for i in range(len(inputs_seq_len)):
            inputs_seq_len[i] = len(inputs[i])

    # Splice
    inputs = do_splice(inputs,
                       splice=splice,
                       batch_size=batch_size,
                       num_stack=num_stack)

    phone2idx = Phone2idx(map_file_path='./phone61.txt')

    trans_char = _read_text('./sample/LDC93S1.txt')
    trans_char = trans_char.replace('.', '')
    trans_phone = _read_phone('./sample/LDC93S1.phn')

    # Make transcripts
    if model == 'ctc':
        if label_type == 'character':
            labels = [alpha2idx(trans_char)] * batch_size
            return inputs, labels, inputs_seq_len

        elif label_type == 'phone':
            labels = [phone2idx(trans_phone.split(' '))] * batch_size
            return inputs, labels, inputs_seq_len

        elif label_type == 'multitask':
            labels_char = [alpha2idx(trans_char)] * batch_size
            labels_phone = [phone2idx(trans_phone.split(' '))] * batch_size
            return inputs, labels_char, labels_phone, inputs_seq_len

    elif model == 'attention':
        if label_type == 'character':
            trans_char = SOS + trans_char + EOS
            labels = [alpha2idx(trans_char)] * batch_size
            labels_seq_len = [len(labels[0])] * batch_size
            return inputs, labels, inputs_seq_len, labels_seq_len

        elif label_type == 'phone':
            trans_phone = SOS + ' ' + trans_phone + ' ' + EOS
            labels = [phone2idx(trans_phone.split(' '))] * batch_size
            labels_seq_len = [len(labels[0])] * batch_size
            return inputs, labels, inputs_seq_len, labels_seq_len

        elif label_type == 'multitask':
            trans_char = SOS + trans_char + EOS
            trans_phone = SOS + ' ' + trans_phone + ' ' + EOS
            labels_char = [alpha2idx(trans_char)] * batch_size
            labels_phone = [phone2idx(trans_phone.split(' '))] * batch_size
            target_len_char = [len(labels_char[0])] * batch_size
            target_len_phone = [len(labels_phone[0])] * batch_size
            return (inputs, labels_char, labels_phone,
                    inputs_seq_len, target_len_char, target_len_phone)

    elif model == 'joint_ctc_attention':
        if label_type == 'character':
            att_trans_char = SOS + trans_char + EOS
            att_labels = [alpha2idx(att_trans_char)] * batch_size
            labels_seq_len = [len(att_labels[0])] * batch_size
            ctc_labels = [alpha2idx(trans_char)] * batch_size
        elif label_type == 'phone':
            att_trans_phone = SOS + ' ' + trans_phone + ' ' + EOS
            att_labels = [phone2idx(att_trans_phone.split(' '))] * batch_size
            labels_seq_len = [len(att_labels[0])] * batch_size
            ctc_labels = [phone2idx(trans_phone.split(' '))] * batch_size
        return inputs, att_labels, ctc_labels, inputs_seq_len, labels_seq_len
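And a matching sketch for this variant (it assumes ./sample/LDC93S1.wav,
./sample/LDC93S1.txt, ./sample/LDC93S1.phn, and ./phone61.txt are present):

    inputs, labels, inputs_seq_len, labels_seq_len = generate_data(
        label_type='phone', model='attention', batch_size=1)
    # Labels are framed as <SOS> ... <EOS>, so labels_seq_len == num_phones + 2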