def __init__(self, data_type, label_type, batch_size, eos_index,
             max_epoch=None, splice=1, num_stack=1, num_skip=1,
             shuffle=False, sort_utt=False, sort_stop_epoch=None,
             progressbar=False):
    """Set up the loader and read the whole dataset into memory.

    Args:
        data_type (string): train or dev or test
        label_type (string): phone39 or phone48 or phone61 or character
            or character_capital_divide
        batch_size (int): the size of mini-batch
        eos_index (int): the index of <EOS> class. It is also stored as
            the padding value.
        max_epoch (int, optional): the max epoch. None means infinite loop.
        splice (int, optional): frames to splice. Default is 1 frame.
        num_stack (int, optional): the number of frames to stack
        num_skip (int, optional): the number of frames to skip
        shuffle (bool, optional): if True, shuffle utterances. This is
            disabled when sort_utt is True.
        sort_utt (bool, optional): if True, sort all utterances by the
            number of frames and utterances in each mini-batch are
            shuffled. Otherwise, shuffle utterances.
        sort_stop_epoch (int, optional): After sort_stop_epoch, training
            will revert back to a random order
        progressbar (bool, optional): if True, visualize progressbar
    Raises:
        TypeError: if data_type or label_type is not an accepted name
    """
    accepted_data_types = ('train', 'dev', 'test')
    accepted_label_types = ('phone39', 'phone48', 'phone61',
                            'character', 'character_capital_divide')
    if data_type not in accepted_data_types:
        raise TypeError('data_type must be "train" or "dev" or "test".')
    if label_type not in accepted_label_types:
        raise TypeError(
            'label_type must be "phone39" or "phone48" or "phone61" or ' +
            '"character" or "character_capital_divide".')

    super(Dataset, self).__init__()

    # Keep the configuration on the instance
    self.data_type = data_type
    self.label_type = label_type
    self.batch_size = batch_size
    self.max_epoch = max_epoch
    self.eos_index = eos_index
    self.splice = splice
    self.num_stack = num_stack
    self.num_skip = num_skip
    self.shuffle = shuffle
    self.sort_utt = sort_utt
    self.sort_stop_epoch = sort_stop_epoch
    self.progressbar = progressbar
    self.padded_value = eos_index

    input_path = join(
        '/n/sd8/inaguma/corpus/timit/dataset/inputs/htk/speaker',
        data_type)
    label_path = join(
        '/n/sd8/inaguma/corpus/timit/dataset/labels/attention',
        label_type, data_type)

    # Load the frame number dictionary
    # NOTE: pickle.load can execute arbitrary code; the file is read from
    # the fixed corpus directory above, never from user-supplied input.
    with open(join(input_path, 'frame_num.pickle'), 'rb') as f:
        self.frame_num_dict = pickle.load(f)

    # Order utterances by frame count (item[1]) when sort_utt is set,
    # otherwise by utterance name (item[0])
    sort_field = 1 if sort_utt else 0
    ordered_items = sorted(self.frame_num_dict.items(),
                           key=lambda item: item[sort_field])
    utt_names = [name for name, _ in ordered_items]
    self.input_paths = np.array(
        [join(input_path, name + '.npy') for name in utt_names])
    self.label_paths = np.array(
        [join(label_path, name + '.npy') for name in utt_names])

    # Load all dataset in advance
    print('=> Loading dataset (%s, %s)...' % (data_type, label_type))
    loaded_inputs, loaded_labels = [], []
    for utt_idx in wrap_iterator(range(len(self.input_paths)),
                                 self.progressbar):
        loaded_inputs.append(np.load(self.input_paths[utt_idx]))
        loaded_labels.append(np.load(self.label_paths[utt_idx]))
    self.input_list = np.array(loaded_inputs)
    self.label_list = np.array(loaded_labels)

    # Frame stacking
    print('=> Stacking frames...')
    self.input_list = stack_frame(self.input_list,
                                  self.input_paths,
                                  self.frame_num_dict,
                                  num_stack,
                                  num_skip,
                                  progressbar)

    # Indices of the utterances not yet consumed
    self.rest = set(range(len(self.input_paths)))
def make_batch(self, data_indices):
    """Create mini-batch per step.

    Rows of `self.df` (columns 'input_path', 'transcript' and 'frame_num')
    are selected with `data_indices`; each input is loaded through
    `self.load`, optionally frame-stacked and spliced, and padded into a
    single batch.

    Args:
        data_indices (np.ndarray): indices of the utterances in this
            mini-batch
    Returns:
        batch (dict):
            xs (np.ndarray): input data of size `[B, T_in, input_size]`
                (a list of `B` arrays when the backend is chainer)
            ys (np.ndarray): target labels in the main task of size
                `[B, T_out]`. In test mode only column 0 is filled, with
                the untokenized transcript.
            x_lens (np.ndarray): lengths of inputs of size `[B]`
            y_lens (np.ndarray): lengths of target labels in the main
                task of size `[B]` (left at zero in test mode)
            input_names (np.ndarray): file names of input data of
                size `[B]`
    Raises:
        ValueError: if `self.backend` is neither "pytorch" nor "chainer"
    """
    batch_size = len(data_indices)
    input_path_list = np.array(self.df['input_path'][data_indices])
    str_indices_list = np.array(self.df['transcript'][data_indices])

    # The feature dimension is computed once and cached on the instance
    if not hasattr(self, 'input_size'):
        if self.use_double_delta:
            self.input_size = self.input_freq * 3
        elif self.use_delta:
            self.input_size = self.input_freq * 2
        else:
            self.input_size = self.input_freq
        self.input_size *= self.num_stack
        self.input_size *= self.splice

    # Compute max frame num in mini-batch
    max_frame_num = max(self.df['frame_num'][data_indices])
    max_frame_num = math.ceil(max_frame_num / self.num_skip)

    # Compute max target label length in mini-batch
    max_label_num = max(len(str(x).split(' ')) for x in str_indices_list)
    # TODO: fix POS tag (nan -> 'nan')

    # Initialization
    if self.backend == 'pytorch':
        xs = np.zeros((batch_size, max_frame_num, self.input_size),
                      dtype=np.float32)
    elif self.backend == 'chainer':
        xs = [None] * batch_size
    else:
        # Without this check `xs` would stay unbound and the failure would
        # only surface after every input had been loaded.
        raise ValueError('backend must be "pytorch" or "chainer".')

    if self.is_test:
        ys = np.array([[self.pad_value] * max_label_num] * batch_size)
    else:
        ys = np.array([[self.pad_value] * max_label_num] * batch_size,
                      dtype=np.int32)
    x_lens = np.zeros((batch_size, ), dtype=np.int32)
    y_lens = np.zeros((batch_size, ), dtype=np.int32)
    input_names = np.array(
        [basename(path).split('.')[0] for path in input_path_list])

    corpus_root = '/n/sd8/inaguma/corpus'
    mirror_roots = ('/data/inaguma', '/tmp/inaguma')

    # Set values of each data in mini-batch
    for b in range(batch_size):
        # Load input data: try the mirrored locations first, then fall
        # back to the original path (whose error, if any, propagates).
        # `except Exception` lets KeyboardInterrupt/SystemExit through
        # instead of swallowing them and retrying.
        original_path = input_path_list[b]
        for mirror_root in mirror_roots:
            try:
                data_i_tmp = self.load(
                    original_path.replace(corpus_root, mirror_root))
                break
            except Exception:
                continue
        else:
            data_i_tmp = self.load(original_path)

        # Keep only the feature columns that are in use
        if self.use_double_delta:
            data_i = data_i_tmp
        elif self.use_delta:
            data_i = data_i_tmp[:, :self.input_freq * 2]
        else:
            data_i = data_i_tmp[:, :self.input_freq]

        # Frame stacking
        if self.num_stack > 1:
            data_i = stack_frame(data_i, self.num_stack, self.num_skip,
                                 dtype=np.float32)
        frame_num = data_i.shape[0]

        # Splicing
        if self.splice > 1:
            data_i = do_splice(data_i, self.splice, self.num_stack,
                               dtype=np.float32)

        if self.backend == 'pytorch':
            xs[b, :frame_num, :] = data_i
        elif self.backend == 'chainer':
            xs[b] = data_i.astype(np.float32)
        x_lens[b] = frame_num

        if self.is_test:
            ys[b, 0] = self.df['transcript'][data_indices[b]]
            # NOTE: transcript is not tokenized
        else:
            indices = list(map(int, str_indices_list[b].split(' ')))
            ys[b, :len(indices)] = indices
            y_lens[b] = len(indices)

    batch = {
        'xs': xs,
        'ys': ys,
        'x_lens': x_lens,
        'y_lens': y_lens,
        'input_names': input_names
    }
    return batch
def __next__(self, batch_size=None):
    """Generate each mini-batch.

    Picks the next utterance indices out of `self.rest` (sorted, randomly
    sampled, or in index order depending on `self.sort_utt` and
    `self.shuffle`), loads the matching `.npy` inputs and labels, pads
    them to the longest item of the batch and adds a leading GPU axis.

    Args:
        batch_size (int, optional): the size of mini-batch. When None,
            `self.batch_size` is used.
    Returns:
        A pair `((inputs, labels, inputs_seq_len, input_names),
        is_new_epoch)`
            inputs: list of input data of size
                `[num_gpu, B, T_in, input_size]`
            labels: list of target labels of size `[num_gpu, B, T_out]`
            inputs_seq_len: list of length of inputs of size `[num_gpu, B]`
            input_names: list of file name of input data of size
                `[num_gpu, B]`
            is_new_epoch (bool): If true, 1 epoch is finished
    Raises:
        StopIteration: when `self.max_epoch` is set and has been reached
    """
    if self.max_epoch is not None and self.epoch >= self.max_epoch:
        raise StopIteration
    # NOTE: max_epoch = None means infinite loop

    if batch_size is None:
        batch_size = self.batch_size

    # reset
    # The flag set by the previous call is cleared here; it is set again
    # below only if this call consumes the last utterances of the epoch.
    if self.is_new_epoch:
        self.is_new_epoch = False

    # Labels are padded with -1 for training/dev and with None in test mode
    if not self.is_test:
        self.padded_value = -1
    else:
        self.padded_value = None
    # TODO(hirofumi): move this

    if self.sort_utt:
        # Sort all utterances by length
        if len(self.rest) > batch_size:
            # Take the smallest remaining indices
            data_indices = sorted(list(self.rest))[:batch_size]
            self.rest -= set(data_indices)
            # NOTE: rest is utterance length order
        else:
            # Last mini-batch
            data_indices = list(self.rest)
            # presumably reset() refills self.rest for the next epoch;
            # it is defined outside this block -- verify
            self.reset()
            self.is_new_epoch = True
            self.epoch += 1
            # From sort_stop_epoch on, switch from sorted to shuffled order
            if self.epoch == self.sort_stop_epoch:
                self.sort_utt = False
                self.shuffle = True

        # Shuffle data in the mini-batch
        random.shuffle(data_indices)

    elif self.shuffle:
        # Randomly sample utterances
        if len(self.rest) > batch_size:
            data_indices = random.sample(list(self.rest), batch_size)
            self.rest -= set(data_indices)
        else:
            # Last mini-batch
            data_indices = list(self.rest)
            self.reset()
            self.is_new_epoch = True
            self.epoch += 1

            # Shuffle selected mini-batch
            # NOTE(review): the nesting of this call was reconstructed
            # (the sampled branch is already in random order) -- confirm
            random.shuffle(data_indices)

    else:
        if len(self.rest) > batch_size:
            data_indices = sorted(list(self.rest))[:batch_size]
            self.rest -= set(data_indices)
            # NOTE: rest is in name order
        else:
            # Last mini-batch
            data_indices = list(self.rest)
            self.reset()
            self.is_new_epoch = True
            self.epoch += 1

    # Load dataset in mini-batch
    # Each selected path is read from disk with np.load on every call
    input_list = np.array(
        list(
            map(lambda path: np.load(path),
                np.take(self.input_paths, data_indices, axis=0))))
    label_list = np.array(
        list(
            map(lambda path: np.load(path),
                np.take(self.label_paths, data_indices, axis=0))))

    # The feature dimension is taken from the first loaded input and
    # cached on the instance the first time a batch is built
    if not hasattr(self, 'input_size'):
        self.input_size = input_list[0].shape[1]
        if self.num_stack is not None and self.num_skip is not None:
            self.input_size *= self.num_stack

    # Frame stacking
    input_list = stack_frame(input_list,
                             self.num_stack,
                             self.num_skip,
                             progressbar=False)

    # Compute max frame num in mini-batch
    max_frame_num = max(map(lambda x: x.shape[0], input_list))

    # Compute max target label length in mini-batch
    max_seq_len = max(map(len, label_list))

    # Initialization
    # inputs are zero-padded; labels are pre-filled with the padding value
    inputs = np.zeros(
        (len(data_indices), max_frame_num, self.input_size * self.splice),
        dtype=np.float32)
    labels = np.array([[self.padded_value] * max_seq_len] *
                      len(data_indices))
    inputs_seq_len = np.zeros((len(data_indices), ), dtype=np.int32)
    # File name up to the first '.', without the directory part
    input_names = list(
        map(lambda path: basename(path).split('.')[0],
            np.take(self.input_paths, data_indices, axis=0)))

    # Set values of each data in mini-batch
    for i_batch in range(len(data_indices)):
        data_i = input_list[i_batch]
        frame_num, input_size = data_i.shape

        # Splicing
        # do_splice is called on a batch of one, then flattened back to 2-D
        data_i = data_i.reshape(1, frame_num, input_size)
        data_i = do_splice(data_i, splice=self.splice,
                           batch_size=1).reshape(frame_num, -1)

        inputs[i_batch, :frame_num, :] = data_i
        if self.is_test:
            # In test mode the whole label is stored in column 0
            labels[i_batch, 0] = label_list[i_batch]
        else:
            labels[
                i_batch, :len(label_list[i_batch])] = label_list[i_batch]
        inputs_seq_len[i_batch] = frame_num

    ###############
    # Multi-GPUs
    ###############
    if self.num_gpu > 1:
        # Now we split the mini-batch data by num_gpu
        inputs = np.array_split(inputs, self.num_gpu, axis=0)
        labels = np.array_split(labels, self.num_gpu, axis=0)
        inputs_seq_len = np.array_split(inputs_seq_len,
                                        self.num_gpu,
                                        axis=0)
        input_names = np.array_split(input_names, self.num_gpu, axis=0)
    else:
        # Single device: add a leading axis of length 1
        inputs = inputs[np.newaxis, :, :, :]
        labels = labels[np.newaxis, :, :]
        inputs_seq_len = inputs_seq_len[np.newaxis, :]
        input_names = np.array(input_names)[np.newaxis, :]

    # The counter advances by the number of utterances, not by one
    self.iteration += len(data_indices)

    # Clean up
    del input_list
    del label_list

    return (inputs, labels, inputs_seq_len, input_names), self.is_new_epoch
def __next__(self, batch_size=None):
    """Generate each mini-batch.

    Picks the next utterance indices out of `self.rest` (sorted, randomly
    sampled, or in index order depending on `self.sort_utt` and
    `self.shuffle`), loads the matching `.npy` inputs and labels, wraps
    every label sequence in <SOS> ... <EOS>, pads the batch and adds a
    leading GPU axis.

    Args:
        batch_size (int, optional): the size of mini-batch. When None,
            `self.batch_size` is used.
    Returns:
        A pair `((inputs, labels, inputs_seq_len, labels_seq_len,
        input_names), is_new_epoch)`
            inputs: list of input data of size
                `[num_gpu, B, T_in, input_size]`
            labels: list of target labels of size `[num_gpu, B, T_out]`
            inputs_seq_len: list of length of inputs of size `[num_gpu, B]`
            labels_seq_len: list of length of target labels of size
                `[num_gpu, B]`
            input_names: list of file name of input data of size
                `[num_gpu, B]`
            is_new_epoch (bool): If true, 1 epoch is finished
    Raises:
        StopIteration: when `self.max_epoch` is set and has been reached
    """
    if self.max_epoch is not None and self.epoch >= self.max_epoch:
        raise StopIteration
    # NOTE: max_epoch = None means infinite loop

    if batch_size is None:
        batch_size = self.batch_size

    # reset
    # The flag set by the previous call is cleared here; it is set again
    # below only if this call consumes the last utterances of the epoch.
    if self.is_new_epoch:
        self.is_new_epoch = False

    # Labels are padded with the <EOS> index for training/dev and with
    # None in test mode
    if not self.is_test:
        self.padded_value = self.eos_index
    else:
        self.padded_value = None
    # TODO(hirofumi): move this

    if self.sort_utt:
        # Sort all utterances by length
        if len(self.rest) > batch_size:
            # Take the smallest remaining indices
            data_indices = sorted(list(self.rest))[:batch_size]
            self.rest -= set(data_indices)
            # NOTE: rest is utterance length order
        else:
            # Last mini-batch
            data_indices = list(self.rest)
            # presumably reset() refills self.rest for the next epoch;
            # it is defined outside this block -- verify
            self.reset()
            self.is_new_epoch = True
            self.epoch += 1
            # From sort_stop_epoch on, switch from sorted to shuffled order
            if self.epoch == self.sort_stop_epoch:
                self.sort_utt = False
                self.shuffle = True

        # Shuffle data in the mini-batch
        random.shuffle(data_indices)

    elif self.shuffle:
        # Randomly sample utterances
        if len(self.rest) > batch_size:
            data_indices = random.sample(list(self.rest), batch_size)
            self.rest -= set(data_indices)
        else:
            # Last mini-batch
            data_indices = list(self.rest)
            self.reset()
            self.is_new_epoch = True
            self.epoch += 1

            # Shuffle selected mini-batch
            # NOTE(review): the nesting of this call was reconstructed
            # (the sampled branch is already in random order) -- confirm
            random.shuffle(data_indices)

    else:
        if len(self.rest) > batch_size:
            data_indices = sorted(list(self.rest))[:batch_size]
            self.rest -= set(data_indices)
            # NOTE: rest is in name order
        else:
            # Last mini-batch
            data_indices = list(self.rest)
            self.reset()
            self.is_new_epoch = True
            self.epoch += 1

    # Load dataset in mini-batch
    # Each selected path is read from disk with np.load on every call
    input_list = np.array(list(
        map(lambda path: np.load(path),
            np.take(self.input_paths, data_indices, axis=0))))
    label_list = np.array(list(
        map(lambda path: np.load(path),
            np.take(self.label_paths, data_indices, axis=0))))

    # The feature dimension is taken from the first loaded input and
    # cached on the instance the first time a batch is built
    if not hasattr(self, 'input_size'):
        self.input_size = input_list[0].shape[1]
        if self.num_stack is not None and self.num_skip is not None:
            self.input_size *= self.num_stack

    # Frame stacking
    input_list = stack_frame(input_list,
                             self.num_stack,
                             self.num_skip,
                             progressbar=False)

    # Compute max frame num in mini-batch
    max_frame_num = max(map(lambda x: x.shape[0], input_list))

    # Compute max target label length in mini-batch
    max_seq_len = max(map(len, label_list)) + 2
    # NOTE: + <SOS> and <EOS>

    # Initialization
    # inputs are zero-padded; labels are pre-filled with the padding value
    inputs = np.zeros(
        (len(data_indices), max_frame_num, self.input_size * self.splice),
        dtype=np.float32)
    labels = np.array(
        [[self.padded_value] * max_seq_len] * len(data_indices))
    inputs_seq_len = np.zeros((len(data_indices),), dtype=np.int32)
    labels_seq_len = np.zeros((len(data_indices),), dtype=np.int32)
    # File name up to the first '.', without the directory part
    input_names = list(
        map(lambda path: basename(path).split('.')[0],
            np.take(self.input_paths, data_indices, axis=0)))

    # Set values of each data in mini-batch
    for i_batch in range(len(data_indices)):
        data_i = input_list[i_batch]
        frame_num, input_size = data_i.shape

        # Splicing
        # do_splice is called on a batch of one, then flattened back to 2-D
        data_i = data_i.reshape(1, frame_num, input_size)
        data_i = do_splice(data_i,
                           splice=self.splice,
                           batch_size=1,
                           num_stack=self.num_stack)
        data_i = data_i.reshape(frame_num, -1)

        inputs[i_batch, : frame_num, :] = data_i
        if self.is_test:
            labels[i_batch, 0] = label_list[i_batch]
            # NOTE: transcript is saved as string
        else:
            # Layout: <SOS>, label indices, <EOS>, then padding
            labels[i_batch, 0] = self.sos_index
            labels[i_batch, 1:len(label_list[i_batch]) + 1] = label_list[i_batch]
            labels[i_batch, len(label_list[i_batch]) + 1] = self.eos_index
        inputs_seq_len[i_batch] = frame_num
        # NOTE(review): this line also runs in test mode, where the label
        # is the raw transcript -- confirm the length is meaningful there
        labels_seq_len[i_batch] = len(label_list[i_batch]) + 2
        # TODO: +2 ??

    ###############
    # Multi-GPUs
    ###############
    if self.num_gpu > 1:
        # Now we split the mini-batch data by num_gpu
        inputs = np.array_split(inputs, self.num_gpu, axis=0)
        labels = np.array_split(labels, self.num_gpu, axis=0)
        inputs_seq_len = np.array_split(
            inputs_seq_len, self.num_gpu, axis=0)
        labels_seq_len = np.array_split(
            labels_seq_len, self.num_gpu, axis=0)
        input_names = np.array_split(input_names, self.num_gpu, axis=0)
    else:
        # Single device: add a leading axis of length 1
        inputs = inputs[np.newaxis, :, :, :]
        labels = labels[np.newaxis, :, :]
        inputs_seq_len = inputs_seq_len[np.newaxis, :]
        labels_seq_len = labels_seq_len[np.newaxis, :]
        input_names = np.array(input_names)[np.newaxis, :]

    # The counter advances by the number of utterances, not by one
    self.iteration += len(data_indices)

    # Clean up
    del input_list
    del label_list

    return (inputs, labels, inputs_seq_len, labels_seq_len, input_names), self.is_new_epoch
def generate_data(label_type='char', batch_size=1, num_stack=1, splice=1,
                  backend='pytorch'):
    """Generate dataset for unit test.

    The same sample utterance is repeated `batch_size` times.

    Args:
        label_type (string, optional): char or word or word_char
        batch_size (int): the size of mini-batch
        num_stack (int, optional): the number of frames to stack. It is
            also used as the number of frames to skip.
        splice (int): frames to splice. Default is 1 frame.
        backend (string, optional): pytorch or chainer
    Returns:
        xs (np.ndarray): A tensor of size `[B, T, input_size]`
            (a list of `B` arrays when backend is chainer)
        ys (np.ndarray): `[B, max_label_seq_len]`
        x_lens (np.ndarray): A tensor of size `[B]`
        y_lens (np.ndarray): A tensor of size `[B]`

        When label_type is word_char, the tuple is
        `(xs, ys, ys_sub, x_lens, y_lens, y_lens_sub)`, where `ys` holds
        the word labels and `ys_sub` the character labels.
    Raises:
        NotImplementedError: if label_type is not supported
        ValueError: if backend is neither "pytorch" nor "chainer"
    """
    # Reject bad arguments before any feature extraction or file access
    if label_type not in ('char', 'word', 'word_char'):
        raise NotImplementedError
    if backend not in ('pytorch', 'chainer'):
        raise ValueError('backend must be "pytorch" or "chainer".')

    # Make input data
    _xs, x_lens = wav2feature(
        ['../../sample/LDC93S1.wav'] * batch_size,
        feature_type='logfbank', feature_dim=40,
        energy=False, delta1=True, delta2=True, dtype=np.float32)

    max_frame_num = math.ceil(x_lens[0] / num_stack)
    if backend == 'pytorch':
        xs = np.zeros(
            (batch_size, max_frame_num,
             _xs.shape[-1] * num_stack * splice),
            dtype=np.float32)
    else:
        xs = [None] * batch_size

    for b in range(batch_size):
        # Frame stacking
        data_i = stack_frame(_xs[b], num_stack=num_stack,
                             num_skip=num_stack, dtype=np.float32)

        # Splice
        data_i = do_splice(data_i, splice=splice, num_stack=num_stack,
                           dtype=np.float32)

        xs[b] = data_i
        x_lens[b] = len(data_i)

    # Make transcripts
    trans = _read_text('../../sample/LDC93S1.txt')
    trans = trans.replace('.', '').replace(' ', SPACE)

    # Each transcript is converted once and reused for both the label
    # array and its length
    if label_type == 'char':
        char_indices = char2idx(trans)
        ys = np.array([char_indices] * batch_size, dtype=np.int32)
        y_lens = np.array([len(char_indices)] * batch_size, dtype=np.int32)
        return xs, ys, x_lens, y_lens

    if label_type == 'word':
        word_indices = word2idx(trans)
        ys = np.array([word_indices] * batch_size, dtype=np.int32)
        y_lens = np.array([len(word_indices)] * batch_size, dtype=np.int32)
        return xs, ys, x_lens, y_lens

    # label_type == 'word_char'
    word_indices = word2idx(trans)
    char_indices = char2idx(trans)
    ys = np.array([word_indices] * batch_size, dtype=np.int32)
    ys_sub = np.array([char_indices] * batch_size, dtype=np.int32)
    y_lens = np.array([len(word_indices)] * batch_size, dtype=np.int32)
    y_lens_sub = np.array([len(char_indices)] * batch_size, dtype=np.int32)
    return xs, ys, ys_sub, x_lens, y_lens, y_lens_sub
def generate_data(label_type, model, batch_size=1, num_stack=1, splice=1):
    """Generate dataset for unit test.

    The same sample utterance is repeated `batch_size` times.

    Args:
        label_type (string): character or phone or multitask
            (multitask is not available for joint_ctc_attention)
        model (string): ctc or attention or joint_ctc_attention
        batch_size (int, optional): the size of mini-batch
        num_stack (int, optional): the number of frames to stack. It is
            also used as the number of frames to skip.
        splice (int, optional): frames to splice. Default is 1 frame.
    Returns:
        The tuple depends on `model` and `label_type`:
            ctc: `(inputs, labels, inputs_seq_len)`, or
                `(inputs, labels_char, labels_phone, inputs_seq_len)`
                for multitask
            attention: `(inputs, labels, inputs_seq_len, labels_seq_len)`,
                or `(inputs, labels_char, labels_phone, inputs_seq_len,
                target_len_char, target_len_phone)` for multitask
            joint_ctc_attention: `(inputs, att_labels, ctc_labels,
                inputs_seq_len, labels_seq_len)`
        inputs: output of the feature pipeline, one entry per utterance
        labels: list of `B` index sequences (attention labels are wrapped
            in SOS/EOS)
        inputs_seq_len: number of frames of each input, `[B]`
        labels_seq_len: length of each attention label sequence, `[B]`
    Raises:
        NotImplementedError: if the combination of model and label_type
            is not supported
    """
    # Reject unsupported combinations before any work is done. Without
    # this check they silently returned None (or hit an unbound local for
    # joint_ctc_attention + multitask).
    supported = {
        'ctc': ('character', 'phone', 'multitask'),
        'attention': ('character', 'phone', 'multitask'),
        'joint_ctc_attention': ('character', 'phone'),
    }
    if label_type not in supported.get(model, ()):
        raise NotImplementedError(
            'Unsupported combination: model=%s, label_type=%s' %
            (model, label_type))

    # Make input data
    inputs, inputs_seq_len = wav2feature(
        ['./sample/LDC93S1.wav'] * batch_size,
        feature_type='logfbank', feature_dim=40,
        energy=False, delta1=True, delta2=True)

    # Frame stacking
    inputs = stack_frame(inputs,
                         num_stack=num_stack,
                         num_skip=num_stack,
                         progressbar=False)
    if num_stack != 1:
        # Stacking changes the number of frames, so refresh the lengths
        for i in range(len(inputs_seq_len)):
            inputs_seq_len[i] = len(inputs[i])

    # Splice
    inputs = do_splice(inputs,
                       splice=splice,
                       batch_size=batch_size,
                       num_stack=num_stack)

    phone2idx = Phone2idx(map_file_path='./phone61.txt')

    trans_char = _read_text('./sample/LDC93S1.txt')
    trans_char = trans_char.replace('.', '')
    trans_phone = _read_phone('./sample/LDC93S1.phn')

    # Make transcripts
    if model == 'ctc':
        if label_type == 'character':
            labels = [alpha2idx(trans_char)] * batch_size
            return inputs, labels, inputs_seq_len
        if label_type == 'phone':
            labels = [phone2idx(trans_phone.split(' '))] * batch_size
            return inputs, labels, inputs_seq_len
        # label_type == 'multitask'
        labels_char = [alpha2idx(trans_char)] * batch_size
        labels_phone = [phone2idx(trans_phone.split(' '))] * batch_size
        return inputs, labels_char, labels_phone, inputs_seq_len

    # Attention targets are wrapped in SOS ... EOS; the phone transcript
    # is space-separated, so the markers are joined with spaces there.
    att_trans_char = SOS + trans_char + EOS
    att_trans_phone = SOS + ' ' + trans_phone + ' ' + EOS

    if model == 'attention':
        if label_type == 'character':
            labels = [alpha2idx(att_trans_char)] * batch_size
            labels_seq_len = [len(labels[0])] * batch_size
            return inputs, labels, inputs_seq_len, labels_seq_len
        if label_type == 'phone':
            labels = [phone2idx(att_trans_phone.split(' '))] * batch_size
            labels_seq_len = [len(labels[0])] * batch_size
            return inputs, labels, inputs_seq_len, labels_seq_len
        # label_type == 'multitask'
        labels_char = [alpha2idx(att_trans_char)] * batch_size
        labels_phone = [phone2idx(att_trans_phone.split(' '))] * batch_size
        target_len_char = [len(labels_char[0])] * batch_size
        target_len_phone = [len(labels_phone[0])] * batch_size
        return (inputs, labels_char, labels_phone, inputs_seq_len,
                target_len_char, target_len_phone)

    # model == 'joint_ctc_attention'
    if label_type == 'character':
        att_labels = [alpha2idx(att_trans_char)] * batch_size
        labels_seq_len = [len(att_labels[0])] * batch_size
        ctc_labels = [alpha2idx(trans_char)] * batch_size
    else:
        # label_type == 'phone'
        att_labels = [phone2idx(att_trans_phone.split(' '))] * batch_size
        labels_seq_len = [len(att_labels[0])] * batch_size
        ctc_labels = [phone2idx(trans_phone.split(' '))] * batch_size
    return inputs, att_labels, ctc_labels, inputs_seq_len, labels_seq_len