Esempio n. 1
0
 def prepare_minibatch(self,
                       audio_paths,
                       texts,
                       overwrite=False,
                       is_bi_graphemes=False,
                       seq_length=-1,
                       save_feature_as_csvfile=False):
     """Featurize a minibatch of audio, zero-pad it and return a dictionary.

     Params:
         audio_paths (list(str)): List of paths to audio files
         texts (list(str)): List of texts corresponding to the audio files
         overwrite (bool): Forwarded unchanged to ``self.featurize``
         is_bi_graphemes (bool): If true, labels are produced with
             ``generate_bi_graphemes_label`` followed by
             ``convert_bi_graphemes_to_num``; otherwise with
             ``convert_word_to_num``
         seq_length (int): Number of timesteps every sample is padded to;
             -1 means "use ``self.max_seq_length``"
         save_feature_as_csvfile (bool): Forwarded unchanged to
             ``self.featurize``
     Returns:
         dict: See below for contents
     """
     assert len(audio_paths) == len(
         texts
     ), "Inputs and outputs to the network must be of the same number"
     # Features is a list of 2-D arrays indexed as (timesteps, feature_dim),
     # one per audio clip, as returned by self.featurize
     features = [
         self.featurize(a,
                        overwrite=overwrite,
                        save_feature_as_csvfile=save_feature_as_csvfile)
         for a in audio_paths
     ]
     input_lengths = [f.shape[0] for f in features]
     # The feature dimension is read from the first sample only
     feature_dim = features[0].shape[1]
     mb_size = len(features)
     # Pad all the inputs so that they are all the same length
     padded_length = self.max_seq_length if seq_length == -1 else seq_length
     x = np.zeros((mb_size, padded_length, feature_dim))
     y = np.zeros((mb_size, self.max_label_length))
     labelUtil = LabelUtil.getInstance()
     label_lengths = []
     for i, feat in enumerate(features):
         feat = self.normalize(feat)
         x[i, :feat.shape[0], :] = feat
         if is_bi_graphemes:
             label = generate_bi_graphemes_label(texts[i])
             label = labelUtil.convert_bi_graphemes_to_num(label)
         else:
             label = labelUtil.convert_word_to_num(texts[i])
         # Slice by the length of the converted label (not of the raw text)
         # so the written span always agrees with label_lengths.
         y[i, :len(label)] = label
         label_lengths.append(len(label))
     return {
         'x': x,  # 0-padded features, shape (mb_size, timesteps, feat_dim)
         'y': y,  # 0-padded labels, shape (mb_size, max_label_length)
         'texts': texts,  # list(str) Original texts
         'input_lengths': input_lengths,  # list(int) Length of each input
         'label_lengths': label_lengths,  # list(int) Length of each label
     }
Esempio n. 2
0
 def get_max_label_length(self, partition, is_bi_graphemes=False):
     """Compute, store and return the longest label length of a partition.

     Params:
         partition (str): 'train' measures ``self.train_texts`` plus
             ``self.val_texts``; 'test' measures ``self.train_texts``
         is_bi_graphemes (bool): If true, each text is measured after
             ``generate_bi_graphemes_label``; otherwise by ``len(text)``
     Returns:
         int: The maximum length, also stored in ``self.max_label_length``
     Raises:
         Exception: If ``partition`` is neither 'train' nor 'test'
     """
     if partition not in ('train', 'test'):
         raise Exception("Invalid partition to load metadata. "
                         "Must be train/validation/test")
     # NOTE(review): 'test' reads the training texts, and 'validation' is
     # rejected despite the error message -- confirm this is intended.
     corpus = self.train_texts
     if partition == 'train':
         corpus = corpus + self.val_texts
     if is_bi_graphemes:
         lengths = [len(generate_bi_graphemes_label(sample))
                    for sample in corpus]
     else:
         lengths = [len(sample) for sample in corpus]
     self.max_label_length = max(lengths)
     return self.max_label_length
Esempio n. 3
0
 def get_max_label_length(self, partition, is_bi_graphemes=False):
     """Find the longest label in a partition and remember it on ``self``.

     Params:
         partition (str): 'train' (``self.train_texts`` together with
             ``self.val_texts``) or 'test' (``self.train_texts`` only)
         is_bi_graphemes (bool): Measure the output of
             ``generate_bi_graphemes_label`` instead of the raw text
     Returns:
         int: Longest length found; also saved as ``self.max_label_length``
     Raises:
         Exception: For any other ``partition`` value
     """
     # NOTE(review): the 'test' partition is measured on the training
     # texts -- verify against the caller that this is deliberate.
     if partition == 'train':
         candidates = self.train_texts + self.val_texts
     elif partition == 'test':
         candidates = self.train_texts
     else:
         raise Exception("Invalid partition to load metadata. "
                         "Must be train/validation/test")

     longest = max([
         len(generate_bi_graphemes_label(entry)) if is_bi_graphemes
         else len(entry)
         for entry in candidates
     ])
     self.max_label_length = longest
     return self.max_label_length
 def prepare_minibatch(self, audio_paths, texts, overwrite=False,
                       is_bi_graphemes=False, seq_length=-1, save_feature_as_csvfile=False):
     """Featurize a minibatch of audio, zero-pad it and return a dictionary.

     Params:
         audio_paths (list(str)): List of paths to audio files
         texts (list(str)): List of texts corresponding to the audio files
         overwrite (bool): Passed through to ``self.featurize``
         is_bi_graphemes (bool): Selects bi-grapheme labels
             (``generate_bi_graphemes_label`` +
             ``convert_bi_graphemes_to_num``) instead of
             ``convert_word_to_num``
         seq_length (int): Timesteps to pad every sample to; -1 selects
             ``self.max_seq_length``
         save_feature_as_csvfile (bool): Passed through to
             ``self.featurize``
     Returns:
         dict: See below for contents
     """
     assert len(audio_paths) == len(texts),\
         "Inputs and outputs to the network must be of the same number"
     # Features is a list of 2-D arrays indexed as (timesteps, feature_dim),
     # one per audio clip, as returned by self.featurize
     features = [
         self.featurize(a, overwrite=overwrite,
                        save_feature_as_csvfile=save_feature_as_csvfile)
         for a in audio_paths
     ]
     input_lengths = [f.shape[0] for f in features]
     # The feature dimension is read from the first sample only
     feature_dim = features[0].shape[1]
     mb_size = len(features)
     # Pad all the inputs so that they are all the same length
     padded_length = self.max_seq_length if seq_length == -1 else seq_length
     x = np.zeros((mb_size, padded_length, feature_dim))
     y = np.zeros((mb_size, self.max_label_length))
     labelUtil = LabelUtil.getInstance()
     label_lengths = []
     for i, feat in enumerate(features):
         feat = self.normalize(feat)
         x[i, :feat.shape[0], :] = feat
         if is_bi_graphemes:
             label = generate_bi_graphemes_label(texts[i])
             label = labelUtil.convert_bi_graphemes_to_num(label)
         else:
             label = labelUtil.convert_word_to_num(texts[i])
         # Use the converted label's own length for the slice; slicing by
         # len(texts[i]) breaks (or silently broadcasts) when they differ.
         y[i, :len(label)] = label
         label_lengths.append(len(label))
     return {
         'x': x,  # 0-padded features, shape (mb_size, timesteps, feat_dim)
         'y': y,  # 0-padded labels, shape (mb_size, max_label_length)
         'texts': texts,  # list(str) Original texts
         'input_lengths': input_lengths,  # list(int) Length of each input
         'label_lengths': label_lengths,  # list(int) Length of each label
     }
Esempio n. 5
0
 def get_max_label_length(self,
                          partition,
                          is_bi_graphemes=False,
                          language="zh",
                          zh_type="zi"):
     """Compute, store and return the longest label length of a partition.

     Params:
         partition (str): 'train' measures ``self.train_texts`` plus
             ``self.val_texts``; 'test' measures ``self.train_texts``
         is_bi_graphemes (bool): Only consulted when ``language`` is "en";
             selects ``generate_bi_graphemes_label`` for measuring
         language (str): "en" or "zh"
         zh_type (str): For "zh": "phone" measures
             ``generate_phone_label`` output, "py" measures
             ``generate_py_label`` output
     Returns:
         int: The maximum length, also stored in ``self.max_label_length``.
         Every combination not listed above measures ``len(text)``.
     Raises:
         Exception: If ``partition`` is neither 'train' nor 'test'
     """
     if partition not in ('train', 'test'):
         raise Exception("Invalid partition to load metadata. "
                         "Must be train/validation/test")
     # NOTE(review): 'test' reads the training texts -- confirm intended.
     corpus = self.train_texts
     if partition == 'train':
         corpus = corpus + self.val_texts

     chinese = language == "zh"
     if language == "en" and is_bi_graphemes:
         lengths = [len(generate_bi_graphemes_label(item)) for item in corpus]
     elif chinese and zh_type == "phone":
         lengths = [len(generate_phone_label(item)) for item in corpus]
     elif chinese and zh_type == "py":
         lengths = [len(generate_py_label(item)) for item in corpus]
     else:
         lengths = [len(item) for item in corpus]
     self.max_label_length = max(lengths)
     return self.max_label_length
Esempio n. 6
0
 def prepare_minibatch_fbank(self,
                             audio_paths,
                             texts,
                             overwrite=False,
                             is_bi_graphemes=False,
                             seq_length=-1,
                             save_feature_as_csvfile=False,
                             language="en",
                             zh_type="zi",
                             noise_percent=0.4):
     """Featurize a minibatch of audio, zero-pad it and return a dictionary.

     Params:
         audio_paths (list(str)): List of paths to audio files
         texts (list(str)): List of texts corresponding to the audio files
         overwrite (bool): Forwarded to ``self.featurize_fbank``
         is_bi_graphemes (bool): Only consulted when ``language`` is "en";
             selects bi-grapheme labels instead of ``convert_word_to_num``
         seq_length (int): Timesteps every sample is padded to (-1 selects
             ``self.max_seq_length``); also forwarded to
             ``self.featurize_fbank``
         save_feature_as_csvfile (bool): Forwarded to
             ``self.featurize_fbank``
         language (str): "en" or "zh"
         zh_type (str): Label unit for "zh": "phone", "py" or "zi"
         noise_percent (float): Forwarded to ``self.featurize_fbank``
     Returns:
         dict: See below for contents
     Raises:
         ValueError: If ``language``/``zh_type`` is not a supported
             combination
     """
     assert len(audio_paths) == len(texts), \
         "Inputs and outputs to the network must be of the same number"
     # Fail fast: previously an unsupported combination matched no branch in
     # the loop below and surfaced as an unbound `label` only after all the
     # featurization work had been done.
     if language != "en" and not (language == "zh"
                                  and zh_type in ("phone", "py", "zi")):
         raise ValueError(
             "Unsupported language/zh_type combination: "
             "language=%r, zh_type=%r" % (language, zh_type))
     # Features is a list of 3-D arrays, one per audio clip, unpacked below
     # as (channel, timesteps, feature_dim)
     features = [
         self.featurize_fbank(
             a,
             overwrite=overwrite,
             save_feature_as_csvfile=save_feature_as_csvfile,
             noise_percent=noise_percent,
             seq_length=seq_length) for a in audio_paths
     ]
     input_lengths = [f.shape[1] for f in features]
     # Channel count and feature dimension are read from the first sample
     channel, _, feature_dim = features[0].shape
     mb_size = len(features)
     # Pad all the inputs so that they are all the same length
     padded_length = self.max_seq_length if seq_length == -1 else seq_length
     x = np.zeros((mb_size, channel, padded_length, feature_dim))
     y = np.zeros((mb_size, self.max_label_length))
     labelUtil = LabelUtil()
     label_lengths = []
     for i, feat in enumerate(features):
         feat = self.normalize_fbank(feat)
         # Positions past the sample's own timesteps keep their zero padding
         x[i, :, :feat.shape[1], :] = feat
         text = texts[i]
         if language == "en":
             if is_bi_graphemes:
                 label = generate_bi_graphemes_label(text)
                 label = labelUtil.convert_bi_graphemes_to_num(label)
             else:
                 label = labelUtil.convert_word_to_num(text)
         else:
             # language == "zh" with a validated zh_type
             if zh_type == "phone":
                 label = generate_phone_label(text)
             elif zh_type == "py":
                 label = generate_py_label(text)
             else:
                 label = generate_zi_label(text)
             label = labelUtil.convert_bi_graphemes_to_num(label)
         # Slice by the converted label's length so the written span always
         # agrees with label_lengths.
         y[i, :len(label)] = label
         label_lengths.append(len(label))
     return {
         # 0-padded features, shape (mb_size, channel, timesteps, feat_dim)
         'x': x,
         'y': y,  # 0-padded labels, shape (mb_size, max_label_length)
         'texts': texts,  # list(str) Original texts
         'input_lengths': input_lengths,  # list(int) Length of each input
         'label_lengths': label_lengths,  # list(int) Length of each label
     }