def prepare_minibatch(self, audio_paths, texts, overwrite=False,
                      is_bi_graphemes=False, seq_length=-1,
                      save_feature_as_csvfile=False):
    """Featurize a minibatch of audio, zero-pad it and return a dictionary.

    Params:
        audio_paths (list(str)): List of paths to audio files
        texts (list(str)): List of texts corresponding to the audio files
        overwrite (bool): recompute features even if cached on disk
        is_bi_graphemes (bool): encode labels as bi-grapheme units
        seq_length (int): fixed time dimension for padding; -1 means use
            self.max_seq_length
        save_feature_as_csvfile (bool): persist computed features as CSV

    Returns:
        dict: keys 'x', 'y', 'texts', 'input_lengths', 'label_lengths'
            (see inline comments on the return value below)
    """
    assert len(audio_paths) == len(texts), \
        "Inputs and outputs to the network must be of the same number"
    # Features is a list of (timesteps, feature_dim) arrays
    # Calculate the features for each audio clip, as the log of the
    # Fourier Transform of the audio
    features = [self.featurize(a, overwrite=overwrite,
                               save_feature_as_csvfile=save_feature_as_csvfile)
                for a in audio_paths]
    input_lengths = [f.shape[0] for f in features]
    feature_dim = features[0].shape[1]
    mb_size = len(features)
    # Pad all the inputs so that they are all the same length
    if seq_length == -1:
        x = np.zeros((mb_size, self.max_seq_length, feature_dim))
    else:
        x = np.zeros((mb_size, seq_length, feature_dim))
    y = np.zeros((mb_size, self.max_label_length))
    labelUtil = LabelUtil.getInstance()
    label_lengths = []
    for i in range(mb_size):
        feat = self.normalize(features[i])  # Center using means and std
        x[i, :feat.shape[0], :] = feat
        if is_bi_graphemes:
            label = generate_bi_graphemes_label(texts[i])
            label = labelUtil.convert_bi_graphemes_to_num(label)
        else:
            label = labelUtil.convert_word_to_num(texts[i])
        # Fix: slice by the label's own length. The original non-bi-grapheme
        # branch used len(texts[i]), which is wrong whenever the numeric
        # label length differs from the raw text length, and is inconsistent
        # with label_lengths below.
        y[i, :len(label)] = label
        label_lengths.append(len(label))
    return {
        'x': x,  # 0-padded features of shape (mb_size, timesteps, feat_dim)
        'y': y,  # 0-padded integer label sequences
        'texts': texts,  # list(str) Original texts
        'input_lengths': input_lengths,  # list(int) Length of each input
        'label_lengths': label_lengths,  # list(int) Length of each label
    }
def get_max_label_length(self, partition, is_bi_graphemes=False):
    """Compute and cache the longest label length for a data partition.

    Params:
        partition (str): 'train' (train + validation texts) or 'test'
        is_bi_graphemes (bool): measure bi-grapheme label length instead
            of raw character length

    Returns:
        int: the maximum label length (also stored on self.max_label_length)

    Raises:
        Exception: if partition is neither 'train' nor 'test'
    """
    if partition == 'train':
        corpus = self.train_texts + self.val_texts
    elif partition == 'test':
        # NOTE(review): this reads train_texts, not test_texts — looks
        # suspicious; confirm against the rest of the data loader.
        corpus = self.train_texts
    else:
        raise Exception("Invalid partition to load metadata. "
                        "Must be train/validation/test")
    if is_bi_graphemes:
        lengths = (len(generate_bi_graphemes_label(text)) for text in corpus)
    else:
        lengths = (len(text) for text in corpus)
    self.max_label_length = max(lengths)
    return self.max_label_length
def get_max_label_length(self, partition, is_bi_graphemes=False):
    """Return (and cache on self.max_label_length) the longest label in a
    partition.

    NOTE(review): this definition duplicates an earlier, identical
    get_max_label_length in the same file; the later one wins at class
    creation time — consider removing one.

    Params:
        partition (str): 'train' (train + validation texts) or 'test'
        is_bi_graphemes (bool): use bi-grapheme label length rather than
            raw text length

    Returns:
        int: the maximum label length

    Raises:
        Exception: for any partition other than 'train' or 'test'
    """
    if partition == 'train':
        texts = self.train_texts + self.val_texts
    elif partition == 'test':
        texts = self.train_texts
    else:
        raise Exception("Invalid partition to load metadata. "
                        "Must be train/validation/test")
    if is_bi_graphemes:
        measure = lambda t: len(generate_bi_graphemes_label(t))
    else:
        measure = len
    self.max_label_length = max(measure(t) for t in texts)
    return self.max_label_length
def prepare_minibatch(self, audio_paths, texts, overwrite=False,
                      is_bi_graphemes=False, seq_length=-1,
                      save_feature_as_csvfile=False):
    """Featurize a minibatch of audio, zero-pad it and return a dictionary.

    NOTE(review): this definition duplicates an earlier prepare_minibatch
    in the same file; the later one wins — consider removing one.

    Params:
        audio_paths (list(str)): List of paths to audio files
        texts (list(str)): List of texts corresponding to the audio files
        overwrite (bool): recompute features even if cached on disk
        is_bi_graphemes (bool): encode labels as bi-grapheme units
        seq_length (int): fixed time dimension for padding; -1 means use
            self.max_seq_length
        save_feature_as_csvfile (bool): persist computed features as CSV

    Returns:
        dict: keys 'x', 'y', 'texts', 'input_lengths', 'label_lengths'
    """
    assert len(audio_paths) == len(texts), \
        "Inputs and outputs to the network must be of the same number"
    # Features is a list of (timesteps, feature_dim) arrays
    # Calculate the features for each audio clip, as the log of the
    # Fourier Transform of the audio
    features = [self.featurize(a, overwrite=overwrite,
                               save_feature_as_csvfile=save_feature_as_csvfile)
                for a in audio_paths]
    input_lengths = [f.shape[0] for f in features]
    feature_dim = features[0].shape[1]
    mb_size = len(features)
    # Pad all the inputs so that they are all the same length
    if seq_length == -1:
        x = np.zeros((mb_size, self.max_seq_length, feature_dim))
    else:
        x = np.zeros((mb_size, seq_length, feature_dim))
    y = np.zeros((mb_size, self.max_label_length))
    labelUtil = LabelUtil.getInstance()
    label_lengths = []
    for i in range(mb_size):
        feat = self.normalize(features[i])  # Center using means and std
        x[i, :feat.shape[0], :] = feat
        if is_bi_graphemes:
            label = labelUtil.convert_bi_graphemes_to_num(
                generate_bi_graphemes_label(texts[i]))
        else:
            label = labelUtil.convert_word_to_num(texts[i])
        # Fix: slice by the label's own length. The original non-bi-grapheme
        # branch used len(texts[i]), which breaks whenever the numeric label
        # length differs from the raw text length, and disagrees with
        # label_lengths below.
        y[i, :len(label)] = label
        label_lengths.append(len(label))
    return {
        'x': x,  # 0-padded features of shape (mb_size, timesteps, feat_dim)
        'y': y,  # 0-padded integer label sequences
        'texts': texts,  # list(str) Original texts
        'input_lengths': input_lengths,  # list(int) Length of each input
        'label_lengths': label_lengths,  # list(int) Length of each label
    }
def get_max_label_length(self, partition, is_bi_graphemes=False,
                         language="zh", zh_type="zi"):
    """Compute and cache the longest label length for a partition,
    with language-specific label units.

    Params:
        partition (str): 'train' (train + validation texts) or 'test'
        is_bi_graphemes (bool): (English only) bi-grapheme labels
        language (str): "en" or "zh"
        zh_type (str): "zi", "phone" or "py" label units for Chinese

    Returns:
        int: the maximum label length (also stored on
        self.max_label_length). Any language/zh_type combination not
        matched below falls back to raw text length.

    Raises:
        Exception: for any partition other than 'train' or 'test'
    """
    if partition == 'train':
        texts = self.train_texts + self.val_texts
    elif partition == 'test':
        # NOTE(review): reads train_texts, not test_texts — confirm.
        texts = self.train_texts
    else:
        raise Exception("Invalid partition to load metadata. "
                        "Must be train/validation/test")
    if language == "en" and is_bi_graphemes:
        labeler = generate_bi_graphemes_label
    elif language == "zh" and zh_type == "phone":
        labeler = generate_phone_label
    elif language == "zh" and zh_type == "py":
        labeler = generate_py_label
    else:
        labeler = None
    if labeler is None:
        self.max_label_length = max(len(text) for text in texts)
    else:
        self.max_label_length = max(len(labeler(text)) for text in texts)
    return self.max_label_length
def prepare_minibatch_fbank(self, audio_paths, texts, overwrite=False,
                            is_bi_graphemes=False, seq_length=-1,
                            save_feature_as_csvfile=False, language="en",
                            zh_type="zi", noise_percent=0.4):
    """Featurize a minibatch of audio with fbank features, zero-pad it
    and return a dictionary.

    Params:
        audio_paths (list(str)): List of paths to audio files
        texts (list(str)): List of texts corresponding to the audio files
        overwrite (bool): recompute features even if cached on disk
        is_bi_graphemes (bool): (English only) bi-grapheme labels
        seq_length (int): fixed time dimension for padding; -1 means use
            self.max_seq_length
        save_feature_as_csvfile (bool): persist computed features as CSV
        language (str): "en" or "zh"
        zh_type (str): "zi", "phone" or "py" label units for Chinese
        noise_percent (float): forwarded to featurize_fbank

    Returns:
        dict: keys 'x', 'y', 'texts', 'input_lengths', 'label_lengths'

    Raises:
        ValueError: for an unsupported language/zh_type combination
            (the original fell through and crashed with NameError on
            an unbound `label`).
    """
    assert len(audio_paths) == len(texts), \
        "Inputs and outputs to the network must be of the same number"
    # Each feature is a (channel(3), timesteps, feature_dim(41)) array.
    # Calculate the features for each audio clip, as the log of the
    # Fourier Transform of the audio
    features = [self.featurize_fbank(
                    a, overwrite=overwrite,
                    save_feature_as_csvfile=save_feature_as_csvfile,
                    noise_percent=noise_percent, seq_length=seq_length)
                for a in audio_paths]
    input_lengths = [f.shape[1] for f in features]  # timesteps per clip
    channel, timesteps, feature_dim = features[0].shape
    mb_size = len(features)
    # Pad all the inputs so that they are all the same length
    if seq_length == -1:
        x = np.zeros((mb_size, channel, self.max_seq_length, feature_dim))
    else:
        x = np.zeros((mb_size, channel, seq_length, feature_dim))
    y = np.zeros((mb_size, self.max_label_length))
    # NOTE(review): other minibatch code uses LabelUtil.getInstance();
    # confirm that constructing LabelUtil() directly is intended here.
    labelUtil = LabelUtil()
    label_lengths = []
    for i in range(mb_size):
        feat = self.normalize_fbank(features[i])  # Center using means and std
        # zero-pad on the time axis (padding with noise is an open question
        # per the original comment)
        x[i, :, :feat.shape[1], :] = feat
        if language == "en" and is_bi_graphemes:
            label = labelUtil.convert_bi_graphemes_to_num(
                generate_bi_graphemes_label(texts[i]))
        elif language == "en":
            label = labelUtil.convert_word_to_num(texts[i])
        elif language == "zh" and zh_type == "phone":
            label = labelUtil.convert_bi_graphemes_to_num(
                generate_phone_label(texts[i]))
        elif language == "zh" and zh_type == "py":
            label = labelUtil.convert_bi_graphemes_to_num(
                generate_py_label(texts[i]))
        elif language == "zh" and zh_type == "zi":
            label = labelUtil.convert_bi_graphemes_to_num(
                generate_zi_label(texts[i]))
        else:
            # Fix: explicit error instead of the NameError the original
            # raised on `label` when no branch matched.
            raise ValueError("Unsupported language/zh_type combination: "
                             "%r/%r" % (language, zh_type))
        # Fix: slice by the label's own length (the original plain-English
        # branch used len(texts[i]), inconsistent with label_lengths below).
        y[i, :len(label)] = label
        label_lengths.append(len(label))
    return {
        'x': x,  # 0-padded features (mb_size, channel, timesteps, feat_dim)
        'y': y,  # 0-padded integer label sequences
        'texts': texts,  # list(str) Original texts
        'input_lengths': input_lengths,  # list(int) Length of each input
        'label_lengths': label_lengths,  # list(int) Length of each label
    }