def __init__(self,
             vocab_filepath,
             mean_std_filepath,
             augmentation_config='{}',
             max_duration=float('inf'),
             min_duration=0.0,
             stride_ms=10.0,
             window_ms=20.0,
             max_freq=None,
             specgram_type='linear',
             use_dB_normalization=True,
             num_threads=multiprocessing.cpu_count() // 2,
             random_seed=0):
    self._max_duration = max_duration
    self._min_duration = min_duration
    self._normalizer = FeatureNormalizer(mean_std_filepath)
    self._augmentation_pipeline = AugmentationPipeline(
        augmentation_config=augmentation_config, random_seed=random_seed)
    self._speech_featurizer = SpeechFeaturizer(
        vocab_filepath=vocab_filepath,
        specgram_type=specgram_type,
        stride_ms=stride_ms,
        window_ms=window_ms,
        max_freq=max_freq,
        use_dB_normalization=use_dB_normalization)
    self._num_threads = num_threads
    self._rng = random.Random(random_seed)
    self._epoch = 0
    # for caching tar files info
    self._local_data = local()
    self._local_data.tar2info = {}
    self._local_data.tar2object = {}
def compute_mean_std(manifest_path, num_samples, output_path):
    normalizer = FeatureNormalizer(mean_std_filepath=None,
                                   manifest_path=manifest_path,
                                   num_samples=num_samples)
    # save the computed statistics to file
    normalizer.write_to_file(output_path)
    print('The computed mean and std have been saved to %s!' % output_path)
def __init__(self,
             vocab_filepath,
             mean_std_filepath,
             augmentation_config='{}',
             max_duration=float('inf'),
             min_duration=0.0,
             stride_ms=10.0,
             window_ms=20.0,
             use_dB_normalization=True,
             random_seed=0,
             keep_transcription_text=False,
             place=paddle.CPUPlace(),
             is_training=True):
    self._max_duration = max_duration
    self._min_duration = min_duration
    self._normalizer = FeatureNormalizer(mean_std_filepath)
    self._augmentation_pipeline = AugmentationPipeline(
        augmentation_config=augmentation_config, random_seed=random_seed)
    self._speech_featurizer = SpeechFeaturizer(
        vocab_filepath=vocab_filepath,
        stride_ms=stride_ms,
        window_ms=window_ms,
        use_dB_normalization=use_dB_normalization)
    self._rng = random.Random(random_seed)
    self._keep_transcription_text = keep_transcription_text
    self.epoch = 0
    self._is_training = is_training
    # for caching tar files info
    self._local_data = local()
    self._local_data.tar2info = {}
    self._local_data.tar2object = {}
    self._place = place
def get_audio_mfcc_features(txt_files,
                            wav_files,
                            n_input,
                            n_context,
                            word_num_map,
                            txt_labels=None,
                            specgram_type='mfcc',
                            mean_std_filepath='data/aishell/mean_std.npz'):
    """Get MFCC/linear specgram features. The dim of MFCC is 39, which
    contains 13 mfcc + 13 delta1 + 13 delta2. Linear specgram contains
    161 features in different frequency sections.

    :param txt_files: list of transcript filepaths (or None if txt_labels
                      is given)
    :param wav_files: list of audio filepaths
    :param n_input: number of MFCC features per frame
    :param n_context: number of context frames on each side
    :param word_num_map: mapping from character to token id
    :param txt_labels: transcript label strings, used when txt_files is None
    :return: audio features, their lengths, text vectors and their lengths
    """
    audio_features = []
    audio_features_len = []
    text_vector = []
    text_vector_len = []
    if txt_files is not None:
        txt_labels = txt_files
    get_feature = AudioFeaturizer(specgram_type)
    normalizer = FeatureNormalizer(mean_std_filepath)
    for txt_obj, wav_file in zip(txt_labels, wav_files):
        # turn inputs into features
        if specgram_type == 'mfcc':
            audio_data = audiofile_to_input_vector(
                wav_file, n_input, n_context)  # get mfcc feature (?, 741)
        elif specgram_type == 'linear':
            speech_segment = SpeechSegment.from_file(wav_file, "")
            specgram = get_feature.featurize(speech_segment)
            audio_data = normalizer.apply(specgram)
            audio_data = np.transpose(
                audio_data)  # get linear specgram feature, (?, 161)
        audio_data = audio_data.astype('float32')
        audio_features.append(audio_data)
        audio_features_len.append(np.int32(len(audio_data)))
        if txt_files is not None:
            # txt_obj is a transcript file
            target = trans_text_ch_to_vector(txt_obj, word_num_map)
        else:
            # txt_obj is a label string
            target = trans_text_ch_to_vector(None, word_num_map, txt_obj)
        text_vector.append(target)
        text_vector_len.append(len(target))
    audio_features = np.asarray(audio_features)
    audio_features_len = np.asarray(audio_features_len)
    text_vector = np.asarray(text_vector)
    text_vector_len = np.asarray(text_vector_len)
    return audio_features, audio_features_len, text_vector, text_vector_len
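A hypothetical call to get_audio_mfcc_features follows; the file paths and the toy word_num_map are assumptions, not values from the source. With n_input=39 and n_context=9, the per-frame feature width is 39 * (2 * 9 + 1) = 741, matching the shape comment above.

# Hypothetical usage sketch (paths and hyper-parameters are assumptions)
wav_files = ['data/aishell/wav/train/S0002/BAC009S0002W0122.wav']
txt_files = ['data/aishell/transcript/S0002.txt']
word_num_map = {'<blank>': 0, '你': 1, '好': 2}  # toy char-to-id map

feats, feat_lens, texts, text_lens = get_audio_mfcc_features(
    txt_files, wav_files,
    n_input=39,    # assumed: 13 mfcc + 13 delta1 + 13 delta2 per frame
    n_context=9,   # assumed: 9 context frames on each side -> 39 * 19 = 741
    word_num_map=word_num_map,
    specgram_type='mfcc')
print(feats.shape, feat_lens, text_lens)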
def compute_mean_std(manifest_path, num_samples, output_path):
    # randomly sample the specified number of utterances to compute
    # the normalization statistics
    normalizer = FeatureNormalizer(mean_std_filepath=None,
                                   manifest_path=manifest_path,
                                   num_samples=num_samples,
                                   num_workers=args.num_workers)
    # save the computed statistics to file
    normalizer.write_to_file(output_path)
    print('The computed mean and std have been saved to %s!' % output_path)
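A hypothetical invocation of compute_mean_std; the paths and sample count are examples only. Note the function reads `args.num_workers` from module scope, so an argparse `args` object is assumed to exist.

# Hypothetical usage (manifest and output paths are assumptions)
compute_mean_std(manifest_path='dataset/manifest.train',
                 num_samples=5000,
                 output_path='dataset/mean_std.npz')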
def __init__(self,
             vocab_filepath,
             mean_std_filepath,
             stride_ms=10.0,
             window_ms=20.0,
             use_dB_normalization=True):
    self._normalizer = FeatureNormalizer(mean_std_filepath)
    self._speech_featurizer = SpeechFeaturizer(
        vocab_filepath=vocab_filepath,
        stride_ms=stride_ms,
        window_ms=window_ms,
        use_dB_normalization=use_dB_normalization)
def __init__(self,
             manifest,
             vocab_filepath,
             mean_std_filepath,
             augmentation_config='{}',
             max_duration=float('inf'),
             min_duration=0.0,
             stride_ms=10.0,
             window_ms=20.0,
             max_freq=None,
             specgram_type='linear',
             use_dB_normalization=True,
             random_seed=0,
             keep_transcription_text=False,
             segmented=False):
    self._max_duration = max_duration
    self._min_duration = min_duration
    self._segmented = segmented
    self._keep_transcription_text = keep_transcription_text
    if isinstance(manifest, str) and os.path.isfile(manifest):
        self.manifest = pd.read_csv(manifest)
    elif isinstance(manifest, pd.DataFrame):
        self.manifest = manifest
    else:
        raise ValueError(
            "{} is neither a valid filepath nor a pandas DataFrame".format(
                manifest))
    # duration filtering
    self.manifest = self.manifest[
        (self.manifest.duration >= self._min_duration)
        & (self.manifest.duration <= self._max_duration)]
    self.manifest = self.manifest.sort_values(by=["duration"],
                                              ascending=True)
    self._normalizer = FeatureNormalizer(mean_std_filepath)
    self._augmentation_pipeline = AugmentationPipeline(
        augmentation_config=augmentation_config, random_seed=random_seed)
    self._speech_featurizer = SpeechFeaturizer(
        vocab_filepath=vocab_filepath,
        specgram_type=specgram_type,
        stride_ms=stride_ms,
        window_ms=window_ms,
        max_freq=max_freq,
        use_dB_normalization=use_dB_normalization)
def main():
    print_arguments(args)
    audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type)

    def augment_and_featurize(audio_segment):
        return audio_featurizer.featurize(audio_segment)

    normalizer = FeatureNormalizer(mean_std_filepath=None,
                                   manifest_path=args.manifest_path,
                                   featurize_func=augment_and_featurize,
                                   num_samples=args.num_samples)
    normalizer.write_to_file(args.output_path)
def main():
    print_arguments(args)
    augmentation_pipeline = AugmentationPipeline('{}')
    audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type)

    def augment_and_featurize(audio_segment):
        augmentation_pipeline.transform_audio(audio_segment)
        return audio_featurizer.featurize(audio_segment)

    # randomly sample the specified number of utterances to compute
    # the normalization statistics
    normalizer = FeatureNormalizer(mean_std_filepath=None,
                                   manifest_path=args.manifest_path,
                                   featurize_func=augment_and_featurize,
                                   num_samples=args.num_samples)
    # save the computed statistics to file
    normalizer.write_to_file(args.output_path)
class AudioInferProcess(object):
    """Audio preprocessing utility used by the recognition (inference)
    program.

    :param vocab_filepath: Vocabulary filepath.
    :type vocab_filepath: str
    :param mean_std_filepath: Filepath of the pre-computed mean and stddev.
    :type mean_std_filepath: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
                                 before extracting the features.
    :type use_dB_normalization: bool
    """

    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 stride_ms=10.0,
                 window_ms=20.0,
                 use_dB_normalization=True):
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            stride_ms=stride_ms,
            window_ms=window_ms,
            use_dB_normalization=use_dB_normalization)

    def process_utterance(self, audio_file):
        """Load and preprocess speech data.

        :param audio_file: Filepath or file object of the audio file.
        :type audio_file: str | file
        :return: Preprocessed audio data.
        :rtype: 2darray
        """
        speech_segment = SpeechSegment.from_file(audio_file, "")
        specgram, _ = self._speech_featurizer.featurize(speech_segment, False)
        specgram = self._normalizer.apply(specgram)
        return specgram

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list
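A minimal usage sketch for AudioInferProcess; the file paths below are assumptions, not values from the source.

# Hypothetical usage (paths are assumptions)
data_process = AudioInferProcess(vocab_filepath='dataset/vocabulary.txt',
                                 mean_std_filepath='dataset/mean_std.npz')
# featurize one utterance for inference; returns a (freq_bins, frames) array
specgram = data_process.process_utterance('dataset/test.wav')
print(specgram.shape, data_process.vocab_size)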
class DataGenerator(object):
    """DataGenerator provides basic audio data preprocessing pipeline, and
    offers data reader interfaces of PaddlePaddle requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: basestring
    :param mean_std_filepath: File containing the pre-computed mean and
                              stddev.
    :type mean_std_filepath: None|basestring
    :param augmentation_config: Augmentation configuration in json string.
                                Details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :type max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
                                 before extracting the features.
    :type use_dB_normalization: bool
    :param random_seed: Random seed.
    :type random_seed: int
    :param keep_transcription_text: If set to True, transcription text will
                                    be passed forward directly without
                                    converting to index sequence.
    :type keep_transcription_text: bool
    :param place: The place to run the program.
    :type place: CPUPlace or CUDAPlace
    :param is_training: If set to True, generate text data for training,
                        otherwise, generate text data for infer.
    :type is_training: bool
    """

    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 random_seed=0,
                 keep_transcription_text=False,
                 place=fluid.CPUPlace(),
                 is_training=True):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)
        self._rng = random.Random(random_seed)
        self._keep_transcription_text = keep_transcription_text
        self._epoch = 0
        self._is_training = is_training
        # for caching tar files info
        self._local_data = local()
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}
        self._place = place

    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and data of transcription
                 part, where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        try:
            is_str = isinstance(audio_file, basestring)  # Python 2
        except NameError:
            is_str = isinstance(audio_file, str)  # Python 3
        if is_str and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             shuffle_method="batch_shuffle"):
        """Batch data reader creator for audio data. Return a callable
        generator function to produce batches of data.

        Audio features within one batch will be padded with zeros to have
        the same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: basestring
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param padding_to: If set -1, the maximum shape in the batch will
                           be used as the target shape for padding.
                           Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set True, audio features will be flattened to a
                        1darray.
        :type flatten: bool
        :param sortagrad: If set True, sort the instances by audio duration
                          in the first epoch to speed up training.
        :type sortagrad: bool
        :param shuffle_method: Shuffle method. Options:
                               '' or None: no shuffle.
                               'instance_shuffle': instance-wise shuffle.
                               'batch_shuffle': similarly-sized instances are
                               put into batches, and then the batches are
                               shuffled batch-wise. For more details, please
                               see ``_batch_shuffle.__doc__``.
                               'batch_shuffle_clipped': 'batch_shuffle' with
                               head shift and tail clipping. For more
                               details, please see ``_batch_shuffle``.
                               If sortagrad is True, shuffle is disabled for
                               the first epoch.
        :type shuffle_method: None|str
        :return: Batch reader function, producing batches of data when
                 called.
        :rtype: callable
        """

        def batch_reader():
            # read manifest
            manifest = read_manifest(manifest_path=manifest_path,
                                     max_duration=self._max_duration,
                                     min_duration=self._min_duration)
            # sort (by duration) or batch-wise shuffle the manifest
            if self._epoch == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
                manifest.reverse()
            else:
                if shuffle_method == "batch_shuffle":
                    manifest = self._batch_shuffle(manifest, batch_size,
                                                   clipped=False)
                elif shuffle_method == "batch_shuffle_clipped":
                    manifest = self._batch_shuffle(manifest, batch_size,
                                                   clipped=True)
                elif shuffle_method == "instance_shuffle":
                    self._rng.shuffle(manifest)
                elif shuffle_method is None:
                    pass
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     shuffle_method)
            # prepare batches
            batch = []
            instance_reader = self._instance_reader_creator(manifest)
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            if len(batch) >= 1:
                yield self._padding_batch(batch, padding_to, flatten)
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """Returns data reader's feeding dict.

        :return: Data feeding dict.
        :rtype: dict
        """
        feeding_dict = {"audio_spectrogram": 0, "transcript_text": 1}
        return feeding_dict

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object and a map containing
        tarinfos.
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get subfile object from tar.

        It returns a subfile object from the tar file and caches the tar
        file info for the next reading request.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            object, infos = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infos
            self._local_data.tar2object[tarpath] = object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

    def _instance_reader_creator(self, manifest):
        """Instance reader creator. Create a callable function to produce
        instances of data.

        Instance: a tuple of ndarray of audio spectrogram and a list of
        token indices for transcript.
        """

        def reader():
            for instance in manifest:
                yield self.process_utterance(instance["audio_filepath"],
                                             instance["text"])

        return reader

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """Padding audio features with zeros to make them have the same
        shape (or a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).

        If `flatten` is True, features will be flattened to a 1darray.
        """
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError(
                    "If padding_to is not -1, it should be larger "
                    "than any instance's shape in the batch")
            max_length = padding_to
        # padding
        padded_audios = []
        texts, text_lens = [], []
        audio_lens = []
        masks = []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
            if self._is_training:
                texts += text
            else:
                texts.append(text)
            text_lens.append(len(text))
            audio_lens.append(audio.shape[1])
            # build a (32, freq', time') mask: ones over valid frames,
            # zeros over padding, repeated 32 times along the channel axis
            mask_shape0 = (audio.shape[0] - 1) // 2 + 1
            mask_shape1 = (audio.shape[1] - 1) // 3 + 1
            mask_max_len = (max_length - 1) // 3 + 1
            mask_ones = np.ones((mask_shape0, mask_shape1))
            mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
            mask = np.repeat(np.reshape(
                np.concatenate((mask_ones, mask_zeros), axis=1),
                (1, mask_shape0, mask_max_len)),
                             32,
                             axis=0)
            masks.append(mask)
        padded_audios = np.array(padded_audios).astype('float32')
        if self._is_training:
            texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1)
            texts = fluid.create_lod_tensor(texts,
                                            recursive_seq_lens=[text_lens],
                                            place=self._place)
        audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1])
        masks = np.array(masks).astype('float32')
        return padded_audios, texts, audio_lens, masks

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better
        efficiency and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used to generate a
                           random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and
                        trailing (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled manifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size))
        self._rng.shuffle(batch_manifest)
        batch_manifest = [item for batch in batch_manifest for item in batch]
        if not clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            if res_len > 0:  # guard: manifest[-0:] would duplicate the list
                batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest
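A hypothetical end-to-end usage sketch for this DataGenerator; the vocabulary, mean/std and manifest paths are assumptions, not from the source.

# Hypothetical usage (paths are assumptions)
train_generator = DataGenerator(vocab_filepath='dataset/vocab.txt',
                                mean_std_filepath='dataset/mean_std.npz',
                                augmentation_config='{}')
batch_reader = train_generator.batch_reader_creator(
    manifest_path='dataset/manifest.train',
    batch_size=16,
    sortagrad=True,
    shuffle_method='batch_shuffle_clipped')
for padded_audios, texts, audio_lens, masks in batch_reader():
    pass  # feed one padded batch to the training program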
class SpecgramGenerator(DynamicLengthGenerator):
    """Audio specgram generator."""

    def __init__(self,
                 manifest,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 random_seed=0,
                 keep_transcription_text=False,
                 segmented=False):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._segmented = segmented
        self._keep_transcription_text = keep_transcription_text
        if isinstance(manifest, str) and os.path.isfile(manifest):
            self.manifest = pd.read_csv(manifest)
        elif isinstance(manifest, pd.DataFrame):
            self.manifest = manifest
        else:
            raise ValueError(
                "{} is neither a valid filepath nor a pandas DataFrame".
                format(manifest))
        # duration filtering
        self.manifest = self.manifest[
            (self.manifest.duration >= self._min_duration)
            & (self.manifest.duration <= self._max_duration)]
        self.manifest = self.manifest.sort_values(by=["duration"],
                                                  ascending=True)
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)

    def __len__(self):
        return len(self.manifest)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        instance = self.manifest.iloc[idx]
        if self._segmented:
            specgram, transcript = self.process_utterance(
                instance["audio_path"], instance["text"], segments_info=None)
        else:
            specgram, transcript = self.process_utterance(
                instance["audio_path"],
                instance["text"],
                segments_info={
                    "start": instance["st"],
                    "end": instance["et"]
                })
        uttid = instance["uttid"]
        sample = {
            "uttid": uttid,
            "specgrams": specgram,
            "text": transcript,
            "trans": instance["text"]
        }
        return sample

    def process_utterance(self,
                          audio_file,
                          transcript,
                          uttid=None,
                          segments_info=None):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :param segments_info: Optional dict with "start" and "end" keys for
                              slicing a segment out of the audio file.
        :type segments_info: None|dict
        :return: Tuple of audio feature tensor and data of transcription
                 part, where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        elif segments_info is None:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        else:
            speech_segment = SpeechSegment.slice_from_file(
                audio_file, transcript, **segments_info)
        # augment speech, e.g. add noise, speed up
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part
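Since SpecgramGenerator implements `__len__` and `__getitem__`, it can be wrapped in a standard PyTorch DataLoader. A minimal sketch follows; the paths are assumptions, and variable-length spectrograms would need a custom padding collate_fn for batch_size > 1.

from torch.utils.data import DataLoader

# Hypothetical usage (paths are assumptions)
dataset = SpecgramGenerator(manifest='dataset/manifest.csv',
                            vocab_filepath='dataset/vocab.txt',
                            mean_std_filepath='dataset/mean_std.npz')
# batch_size=1 sidesteps padding in this sketch; larger batches require
# a collate_fn that pads the variable-length specgrams
loader = DataLoader(dataset, batch_size=1, shuffle=False)
for sample in loader:
    print(sample["uttid"], sample["specgrams"].shape)
    break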
class DataGenerator(object):
    """DataGenerator provides basic audio data preprocessing pipeline, and
    offers data reader interfaces of PaddlePaddle requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: str
    :param mean_std_filepath: File containing the pre-computed mean and
                              stddev.
    :type mean_std_filepath: None|str
    :param augmentation_config: Augmentation configuration in json string.
                                Details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :type max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
                                 before extracting the features.
    :type use_dB_normalization: bool
    :param random_seed: Random seed.
    :type random_seed: int
    :param keep_transcription_text: If set to True, transcription text will
                                    be passed forward directly without
                                    converting to index sequence.
    :type keep_transcription_text: bool
    :param place: The place to run the program.
    :type place: CPUPlace or CUDAPlace
    :param is_training: If set to True, generate text data for training,
                        otherwise, generate text data for infer.
    :type is_training: bool
    """

    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 random_seed=0,
                 keep_transcription_text=False,
                 place=fluid.CPUPlace(),
                 is_training=True):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)
        self._rng = random.Random(random_seed)
        self._keep_transcription_text = keep_transcription_text
        self._epoch = 0
        self._is_training = is_training
        # for caching tar files info
        self._local_data = local()
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}
        self._place = place

    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize speech data.

        :param audio_file: Filepath or file object of the audio file.
        :type audio_file: str | file
        :param transcript: Transcription text of the audio.
        :type transcript: str
        :return: Preprocessed (augmented, featurized, normalized) audio data
                 and the token ids of the corresponding transcript.
        :rtype: tuple of (2darray, list)
        """
        speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             shuffle_method="batch_shuffle"):
        """Batch data reader creator for audio data. Return a callable
        generator function to produce batches of data.

        Audio features within one batch will be padded with zeros to have
        the same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: str
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param padding_to: If set -1, the maximum shape in the batch will
                           be used as the target shape for padding.
                           Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set True, audio features will be flattened to a
                        1darray.
        :type flatten: bool
        :param shuffle_method: Shuffle method. Options:
                               '' or None: no shuffle.
                               'instance_shuffle': instance-wise shuffle.
                               'batch_shuffle': similarly-sized instances are
                               put into batches, and then the batches are
                               shuffled batch-wise. For more details, please
                               see ``_batch_shuffle.__doc__``.
                               'batch_shuffle_clipped': 'batch_shuffle' with
                               head shift and tail clipping. For more
                               details, please see ``_batch_shuffle``.
                               Shuffle is disabled for the first epoch.
        :type shuffle_method: None|str
        :return: Batch reader function, producing batches of data when
                 called.
        :rtype: callable
        """

        def batch_reader():
            # read the manifest
            manifest = read_manifest(manifest_path=manifest_path,
                                     max_duration=self._max_duration,
                                     min_duration=self._min_duration)
            # sort the manifest from long to short in the first epoch
            if self._epoch == 0:
                manifest.sort(key=lambda x: x["duration"])
                manifest.reverse()
            else:
                if shuffle_method == "batch_shuffle":
                    manifest = self._batch_shuffle(manifest, batch_size,
                                                   clipped=False)
                elif shuffle_method == "batch_shuffle_clipped":
                    manifest = self._batch_shuffle(manifest, batch_size,
                                                   clipped=True)
                elif shuffle_method == "instance_shuffle":
                    self._rng.shuffle(manifest)
                elif shuffle_method is None:
                    pass
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     shuffle_method)
            # prepare batches
            batch = []
            instance_reader = self._instance_reader_creator(manifest)
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            if len(batch) >= 1:
                yield self._padding_batch(batch, padding_to, flatten)
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """Return the data reader's feeding dict for the executor.

        :return: Data feeding dict.
        :rtype: dict
        """
        feeding_dict = {"audio_spectrogram": 0, "transcript_text": 1}
        return feeding_dict

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _instance_reader_creator(self, manifest):
        """Create a data reader (generator).

        Instance: each yielded item is a tuple of the preprocessed audio
        data and the token ids of the corresponding transcript.
        """

        def reader():
            for instance in manifest:
                yield self.process_utterance(instance["audio_filepath"],
                                             instance["text"])

        return reader

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """Pad audio features with zeros so that they have the same shape
        (or a user-defined shape) within one batch.

        If `padding_to` is -1, the maximum shape in the batch is used as the
        target shape for padding. Otherwise, `padding_to` is the target shape
        (only refers to the second axis).

        If `flatten` is True, the features are flattened to a 1darray.
        """
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError(
                    "If padding_to is not -1, it should be larger "
                    "than any instance's shape in the batch")
            max_length = padding_to
        # padding
        padded_audios = []
        texts, text_lens = [], []
        audio_lens = []
        masks = []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
            if self._is_training:
                texts += text
            else:
                texts.append(text)
            text_lens.append(len(text))
            audio_lens.append(audio.shape[1])
            # build a (32, freq', time') mask: ones over valid frames,
            # zeros over padding, repeated 32 times along the channel axis
            mask_shape0 = (audio.shape[0] - 1) // 2 + 1
            mask_shape1 = (audio.shape[1] - 1) // 3 + 1
            mask_max_len = (max_length - 1) // 3 + 1
            mask_ones = np.ones((mask_shape0, mask_shape1))
            mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
            mask = np.repeat(np.reshape(
                np.concatenate((mask_ones, mask_zeros), axis=1),
                (1, mask_shape0, mask_max_len)),
                             32,
                             axis=0)
            masks.append(mask)
        padded_audios = np.array(padded_audios).astype('float32')
        if self._is_training:
            texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1)
            texts = fluid.create_lod_tensor(texts,
                                            recursive_seq_lens=[text_lens],
                                            place=self._place)
        audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1])
        masks = np.array(masks).astype('float32')
        return padded_audios, texts, audio_lens, masks

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better
        efficiency, and batch-wise shuffle them.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances to create different batches for
           different epochs.
        4. Shuffle the minibatches.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used to generate a
                           random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and
                        trailing (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled manifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size))
        self._rng.shuffle(batch_manifest)
        batch_manifest = [item for batch in batch_manifest for item in batch]
        if not clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            if res_len > 0:  # guard: manifest[-0:] would duplicate the list
                batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest
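To make the mask arithmetic in `_padding_batch` concrete, a worked example follows; the spectrogram shape and padded length are assumptions. The //2 and //3 factors mirror the subsampling on the frequency and time axes.

# Worked example of the mask shapes, assuming a (161, 200) spectrogram
# padded to max_length=300
audio_shape = (161, 200)
max_length = 300
mask_shape0 = (audio_shape[0] - 1) // 2 + 1   # 81 frequency rows
mask_shape1 = (audio_shape[1] - 1) // 3 + 1   # 67 valid time columns
mask_max_len = (max_length - 1) // 3 + 1      # 100 padded time columns
print(mask_shape0, mask_shape1, mask_max_len)  # 81 67 100
# final mask per instance: (32, 81, 100), with ones in the first 67 columns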
        'ctc_beam_search',
        'Decoding method for the results',
        choices=['ctc_beam_search', 'ctc_greedy'])
add_arg('lang_model_path', str,
        'lm/zh_giga.no_cna_cmn.prune01244.klm',
        "Language model filepath")
args = parser.parse_args()
print_arguments(args)
# load the vocabulary
with open(args.dataset_vocab, 'r', encoding='utf-8') as f:
    labels = eval(f.read())
vocabulary = [labels[i] for i in range(len(labels))]
# create the audio featurizer and the normalizer
audio_featurizer = AudioFeaturizer()
normalizer = FeatureNormalizer(mean_std_filepath=args.mean_std_path)
# create the model
model = DeepSpeech2Model(feat_size=audio_featurizer.feature_dim(),
                         dict_size=len(vocabulary),
                         num_conv_layers=args.num_conv_layers,
                         num_rnn_layers=args.num_rnn_layers,
                         rnn_size=args.rnn_layer_size)
model.set_state_dict(
    paddle.load(os.path.join(args.model_path, 'model.pdparams')))
model.eval()
# handle the beam search decoding method
if args.decoder == "ctc_beam_search":
    try:
        from decoders.beam_search_decoder import BeamSearchDecoder
class DataGenerator(object):
    """DataGenerator provides basic audio data preprocessing pipeline, and
    offers data reader interfaces of PaddlePaddle requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: basestring
    :param mean_std_filepath: File containing the pre-computed mean and
                              stddev.
    :type mean_std_filepath: None|basestring
    :param augmentation_config: Augmentation configuration in json string.
                                Details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :type max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
                                 before extracting the features.
    :type use_dB_normalization: bool
    :param num_threads: Number of CPU threads for processing data.
    :type num_threads: int
    :param random_seed: Random seed.
    :type random_seed: int
    """

    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 num_threads=multiprocessing.cpu_count() // 2,
                 random_seed=0):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)
        self._num_threads = num_threads
        self._rng = random.Random(random_seed)
        self._epoch = 0
        # for caching tar files info
        self._local_data = local()
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}

    def process_utterance(self, filename, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param filename: Audio filepath or file object.
        :type filename: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and list of token ids for
                 transcription.
        :rtype: tuple of (2darray, list)
        """
        speech_segment = SpeechSegment.from_file(filename, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
        specgram = self._normalizer.apply(specgram)
        return specgram, text_ids

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             min_batch_size=1,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             shuffle_method="batch_shuffle"):
        """Batch data reader creator for audio data. Return a callable
        generator function to produce batches of data.

        Audio features within one batch will be padded with zeros to have
        the same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: basestring
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param min_batch_size: Any batch with batch size smaller than this
                               will be discarded. (To be deprecated in the
                               future.)
        :type min_batch_size: int
        :param padding_to: If set -1, the maximum shape in the batch will
                           be used as the target shape for padding.
                           Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set True, audio features will be flattened to a
                        1darray.
        :type flatten: bool
        :param sortagrad: If set True, sort the instances by audio duration
                          in the first epoch to speed up training.
        :type sortagrad: bool
        :param shuffle_method: Shuffle method. Options:
                               '' or None: no shuffle.
                               'instance_shuffle': instance-wise shuffle.
                               'batch_shuffle': similarly-sized instances are
                               put into batches, and then the batches are
                               shuffled batch-wise. For more details, please
                               see ``_batch_shuffle.__doc__``.
                               'batch_shuffle_clipped': 'batch_shuffle' with
                               head shift and tail clipping. For more
                               details, please see ``_batch_shuffle``.
                               If sortagrad is True, shuffle is disabled for
                               the first epoch.
        :type shuffle_method: None|str
        :return: Batch reader function, producing batches of data when
                 called.
        :rtype: callable
        """

        def batch_reader():
            # read manifest
            manifest = read_manifest(manifest_path=manifest_path,
                                     max_duration=self._max_duration,
                                     min_duration=self._min_duration)
            # sort (by duration) or batch-wise shuffle the manifest
            if self._epoch == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
            else:
                if shuffle_method == "batch_shuffle":
                    manifest = self._batch_shuffle(manifest, batch_size,
                                                   clipped=False)
                elif shuffle_method == "batch_shuffle_clipped":
                    manifest = self._batch_shuffle(manifest, batch_size,
                                                   clipped=True)
                elif shuffle_method == "instance_shuffle":
                    self._rng.shuffle(manifest)
                elif shuffle_method is None:
                    pass
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     shuffle_method)
            # prepare batches
            instance_reader = self._instance_reader_creator(manifest)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            if len(batch) >= min_batch_size:
                yield self._padding_batch(batch, padding_to, flatten)
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """Returns data reader's feeding dict.

        :return: Data feeding dict.
        :rtype: dict
        """
        return {"audio_spectrogram": 0, "transcript_text": 1}

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object and a map containing
        tarinfos.
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _get_file_object(self, file):
        """Get a file object by file path.

        If the path starts with 'tar:', it returns a subfile object from the
        tar archive and caches the tar file info for the next reading
        request; otherwise it opens the file directly.
        """
        if file.startswith('tar:'):
            tarpath, filename = file.split(':', 1)[1].split('#', 1)
            if 'tar2info' not in self._local_data.__dict__:
                self._local_data.tar2info = {}
            if 'tar2object' not in self._local_data.__dict__:
                self._local_data.tar2object = {}
            if tarpath not in self._local_data.tar2info:
                object, infos = self._parse_tar(tarpath)
                self._local_data.tar2info[tarpath] = infos
                self._local_data.tar2object[tarpath] = object
            return self._local_data.tar2object[tarpath].extractfile(
                self._local_data.tar2info[tarpath][filename])
        else:
            return open(file, 'rb')  # binary mode for audio data

    def _instance_reader_creator(self, manifest):
        """Instance reader creator. Create a callable function to produce
        instances of data.

        Instance: a tuple of ndarray of audio spectrogram and a list of
        token indices for transcript.
        """

        def reader():
            for instance in manifest:
                yield instance

        def mapper(instance):
            return self.process_utterance(
                self._get_file_object(instance["audio_filepath"]),
                instance["text"])

        return paddle.reader.xmap_readers(mapper,
                                          reader,
                                          self._num_threads,
                                          1024,
                                          order=True)

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """Padding audio features with zeros to make them have the same
        shape (or a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).

        If `flatten` is True, features will be flattened to a 1darray.
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError(
                    "If padding_to is not -1, it should be larger "
                    "than any instance's shape in the batch")
            max_length = padding_to
        # padding
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            new_batch.append((padded_audio, text))
        return new_batch

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better
        efficiency and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used to generate a
                           random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and
                        trailing (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled manifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size))
        self._rng.shuffle(batch_manifest)
        batch_manifest = list(sum(batch_manifest, ()))
        if not clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            if res_len > 0:  # guard: manifest[-0:] would duplicate the list
                batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest