Example #1
0
class DataGenerator(object):
    """
    DataGenerator provides basic audio data preprocessing pipeline, and offers
    data reader interfaces of PaddlePaddle requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: basestring
    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|basestring
    :param augmentation_config: Augmentation configuration in json string.
                                Details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :types max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
                                before extracting the features.
    :type use_dB_normalization: bool
    :param random_seed: Random seed.
    :type random_seed: int
    :param keep_transcription_text: If set to True, transcription text will
                                    be passed forward directly without
                                    converting to index sequence.
    :type keep_transcription_text: bool
    :param place: The place to run the program.
    :type place: CPUPlace or CUDAPlace
    :param is_training: If set to True, generate text data for training, 
                        otherwise,  generate text data for infer.
    :type is_training: bool 
    """
    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 random_seed=0,
                 keep_transcription_text=False,
                 place=fluid.CPUPlace(),
                 is_training=True):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)
        self._rng = random.Random(random_seed)
        self._keep_transcription_text = keep_transcription_text
        self._epoch = 0
        self._is_training = is_training
        # for caching tar files info
        self._local_data = local()
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}
        self._place = place

    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and data of transcription part,
                 where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        try:
            is_str = isinstance(audio_file, basestring)
        except:
            is_str = isinstance(audio_file, str)
        if is_str and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             shuffle_method="batch_shuffle"):
        """
        Batch data reader creator for audio data. Return a callable generator
        function to produce batches of data.

        Audio features within one batch will be padded with zeros to have the
        same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: basestring
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param padding_to:  If set -1, the maximun shape in the batch
                            will be used as the target shape for padding.
                            Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set True, audio features will be flatten to 1darray.
        :type flatten: bool
        :param sortagrad: If set True, sort the instances by audio duration
                          in the first epoch for speed up training.
        :type sortagrad: bool
        :param shuffle_method: Shuffle method. Options:
                                '' or None: no shuffle.
                                'instance_shuffle': instance-wise shuffle.
                                'batch_shuffle': similarly-sized instances are
                                                 put into batches, and then
                                                 batch-wise shuffle the batches.
                                                 For more details, please see
                                                 ``_batch_shuffle.__doc__``.
                                'batch_shuffle_clipped': 'batch_shuffle' with
                                                         head shift and tail
                                                         clipping. For more
                                                         details, please see
                                                         ``_batch_shuffle``.
                              If sortagrad is True, shuffle is disabled
                              for the first epoch.
        :type shuffle_method: None|str
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """
        def batch_reader():
            # read manifest
            manifest = read_manifest(manifest_path=manifest_path,
                                     max_duration=self._max_duration,
                                     min_duration=self._min_duration)
            # sort (by duration) or batch-wise shuffle the manifest
            if self._epoch == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
                manifest.reverse()
            else:
                if shuffle_method == "batch_shuffle":
                    manifest = self._batch_shuffle(manifest,
                                                   batch_size,
                                                   clipped=False)
                elif shuffle_method == "batch_shuffle_clipped":
                    manifest = self._batch_shuffle(manifest,
                                                   batch_size,
                                                   clipped=True)
                elif shuffle_method == "instance_shuffle":
                    self._rng.shuffle(manifest)
                elif shuffle_method is None:
                    pass
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     shuffle_method)
            # prepare batches
            batch = []
            instance_reader = self._instance_reader_creator(manifest)

            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            if len(batch) >= 1:
                yield self._padding_batch(batch, padding_to, flatten)
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """Returns data reader's feeding dict.

        :return: Data feeding dict.
        :rtype: dict
        """
        feeding_dict = {"audio_spectrogram": 0, "transcript_text": 1}
        return feeding_dict

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map containing tarinfoes
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get subfile object from tar.

        It will return a subfile object from tar file
        and cached tar file info for next reading request.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            object, infoes = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infoes
            self._local_data.tar2object[tarpath] = object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator. Create a callable function to produce
        instances of data.

        Instance: a tuple of ndarray of audio spectrogram and a list of
        token indices for transcript.
        """
        def reader():
            for instance in manifest:
                inst = self.process_utterance(instance["audio_filepath"],
                                              instance["text"]),
                yield inst[0]

        return reader

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one bach.

        If ``padding_to`` is -1, the maximun shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).

        If `flatten` is True, features will be flatten to 1darray.
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError(
                    "If padding_to is not -1, it should be larger "
                    "than any instance's shape in the batch")
            max_length = padding_to
        # padding
        padded_audios = []
        texts, text_lens = [], []
        audio_lens = []
        masks = []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
            if self._is_training:
                texts += text
            else:
                texts.append(text)
            text_lens.append(len(text))
            audio_lens.append(audio.shape[1])
            mask_shape0 = (audio.shape[0] - 1) // 2 + 1
            mask_shape1 = (audio.shape[1] - 1) // 3 + 1
            mask_max_len = (max_length - 1) // 3 + 1
            mask_ones = np.ones((mask_shape0, mask_shape1))
            mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
            mask = np.repeat(np.reshape(
                np.concatenate((mask_ones, mask_zeros), axis=1),
                (1, mask_shape0, mask_max_len)),
                             32,
                             axis=0)
            masks.append(mask)
        padded_audios = np.array(padded_audios).astype('float32')
        if self._is_training:
            texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1)
            texts = fluid.create_lod_tensor(texts,
                                            recursive_seq_lens=[text_lens],
                                            place=self._place)
        audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1])
        masks = np.array(masks).astype('float32')
        return padded_audios, texts, audio_lens, masks

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used for generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled mainifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
        self._rng.shuffle(batch_manifest)
        batch_manifest = [item for batch in batch_manifest for item in batch]
        if clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest
class DataGenerator(object):
    """
    DataGenerator provides basic audio data preprocessing pipeline, and offers
    data reader interfaces of PaddlePaddle requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: str
    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|str
    :param augmentation_config: Augmentation configuration in json string.
                                Details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :types max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
                                before extracting the features.
    :type use_dB_normalization: bool
    :param random_seed: Random seed.
    :type random_seed: int
    :param keep_transcription_text: If set to True, transcription text will
                                    be passed forward directly without
                                    converting to index sequence.
    :type keep_transcription_text: bool
    :param place: The place to run the program.
    :type place: CPUPlace or CUDAPlace
    :param is_training: If set to True, generate text data for training,
                        otherwise,  generate text data for infer.
    :type is_training: bool
    """
    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 random_seed=0,
                 keep_transcription_text=False,
                 place=fluid.CPUPlace(),
                 is_training=True):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)
        self._rng = random.Random(random_seed)
        self._keep_transcription_text = keep_transcription_text
        self._epoch = 0
        self._is_training = is_training
        # for caching tar files info
        self._local_data = local()
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}
        self._place = place

    def process_utterance(self, audio_file, transcript):
        """对语音数据加载、扩充、特征化和归一化

        :param audio_file: 音频文件的文件路径或文件对象
        :type audio_file: str | file
        :param transcript: 音频对应的文本
        :type transcript: str
        :return: 经过归一化等预处理的音频数据,音频文件对应文本的ID
        :rtype: tuple of (2darray, list)
        """
        speech_segment = SpeechSegment.from_file(audio_file, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             shuffle_method="batch_shuffle"):
        """
        Batch data reader creator for audio data. Return a callable generator
        function to produce batches of data.

        Audio features within one batch will be padded with zeros to have the
        same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: str
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param padding_to:  If set -1, the maximun shape in the batch
                            will be used as the target shape for padding.
                            Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set True, audio features will be flatten to 1darray.
        :type flatten: bool
        :param shuffle_method: Shuffle method. Options:
                                '' or None: no shuffle.
                                'instance_shuffle': instance-wise shuffle.
                                'batch_shuffle': similarly-sized instances are
                                                 put into batches, and then
                                                 batch-wise shuffle the batches.
                                                 For more details, please see
                                                 ``_batch_shuffle.__doc__``.
                                'batch_shuffle_clipped': 'batch_shuffle' with
                                                         head shift and tail
                                                         clipping. For more
                                                         details, please see
                                                         ``_batch_shuffle``.
                              If sortagrad is True, shuffle is disabled
                              for the first epoch.
        :type shuffle_method: None|str
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """
        def batch_reader():
            # 读取数据列表
            manifest = read_manifest(manifest_path=manifest_path,
                                     max_duration=self._max_duration,
                                     min_duration=self._min_duration)
            # 将数据列表长到短排序
            if self._epoch == 0:
                manifest.sort(key=lambda x: x["duration"])
                manifest.reverse()
            else:
                if shuffle_method == "batch_shuffle":
                    manifest = self._batch_shuffle(manifest,
                                                   batch_size,
                                                   clipped=False)
                elif shuffle_method == "batch_shuffle_clipped":
                    manifest = self._batch_shuffle(manifest,
                                                   batch_size,
                                                   clipped=True)
                elif shuffle_method == "instance_shuffle":
                    self._rng.shuffle(manifest)
                elif shuffle_method is None:
                    pass
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     shuffle_method)
            # 准备批量数据
            batch = []
            instance_reader = self._instance_reader_creator(manifest)

            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            if len(batch) >= 1:
                yield self._padding_batch(batch, padding_to, flatten)
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """返回数据读取器的exe读取字典

        :return: 数据读取字典
        :rtype: dict
        """
        feeding_dict = {"audio_spectrogram": 0, "transcript_text": 1}
        return feeding_dict

    @property
    def vocab_size(self):
        """返回词汇表大小

        :return: 词汇表大小
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """返回词汇表列表

        :return: 词汇表列表
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _instance_reader_creator(self, manifest):
        """
        创建一个数据生成器reader

        Instance: 生成器得到的数据是一个元组,包含了经过预处理音频数据和音频对应文本的ID
        """
        def reader():
            for instance in manifest:
                inst = self.process_utterance(instance["audio_filepath"],
                                              instance["text"])
                yield inst

        return reader

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """
        用零填充音频功能,使它们在同一个batch具有相同的形状(或一个用户定义的形状)

        如果padding_to为-1,则批处理中的最大形状将被使用 作为填充的目标形状。
        否则,' padding_to '将是目标形状(仅指第二轴)。

        如果“flatten”为True,特征将被flatten为一维数据
        """
        # 获取目标形状
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("如果padding_to不是-1,它应该大于批处理中任何实例的形状")
            max_length = padding_to
        # 填充操作
        padded_audios = []
        texts, text_lens = [], []
        audio_lens = []
        masks = []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            padded_audios.append(padded_audio)
            if self._is_training:
                texts += text
            else:
                texts.append(text)
            text_lens.append(len(text))
            audio_lens.append(audio.shape[1])
            mask_shape0 = (audio.shape[0] - 1) // 2 + 1
            mask_shape1 = (audio.shape[1] - 1) // 3 + 1
            mask_max_len = (max_length - 1) // 3 + 1
            mask_ones = np.ones((mask_shape0, mask_shape1))
            mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1))
            mask = np.repeat(np.reshape(
                np.concatenate((mask_ones, mask_zeros), axis=1),
                (1, mask_shape0, mask_max_len)),
                             32,
                             axis=0)
            masks.append(mask)
        padded_audios = np.array(padded_audios).astype('float32')
        if self._is_training:
            texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1)
            texts = fluid.create_lod_tensor(texts,
                                            recursive_seq_lens=[text_lens],
                                            place=self._place)
        audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1])
        masks = np.array(masks).astype('float32')
        return padded_audios, texts, audio_lens, masks

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """将大小相似的实例放入小批量中可以提高效率,并进行批量打乱

        1. 按持续时间对音频剪辑进行排序
        2. 生成一个随机数k, k的范围[0,batch_size)
        3. 随机移动k实例,为不同的epoch训练创建不同的批次
        4. 打乱minibatches.

        :param manifest: 数据列表
        :type manifest: list
        :param batch_size: 批量大小。这个大小还用于为批量洗牌生成一个随机数。
        :type batch_size: int
        :param clipped: 是否剪辑头部(小移位)和尾部(不完整批处理)实例。
        :type clipped: bool
        :return: Batch shuffled mainifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        batch_manifest = list(zip(*[iter(manifest[shift_len:])] * batch_size))
        self._rng.shuffle(batch_manifest)
        batch_manifest = [item for batch in batch_manifest for item in batch]
        if not clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest
class SpecgramGenerator(DynamicLengthGenerator):
    """audio specgram generator"""
    def __init__(self,
                 manifest,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 random_seed=0,
                 keep_transcription_text=False,
                 segmented=False):

        self._max_duration = max_duration
        self._min_duration = min_duration
        self._segmented = segmented
        self._keep_transcription_text = keep_transcription_text

        if isinstance(manifest, str) and os.path.isfile(manifest):
            self.manifest = pd.read_csv(manifest)
        elif isinstance(manifest, pd.DataFrame):
            self.manifest = manifest
        else:
            raise BaseException(
                "{} is neither an valide path or a pandas DataFrame object".
                format(manifest))

        # duration filtering
        self.manifest = self.manifest[
            (self.manifest.duration >= self._min_duration)
            & (self.manifest.duration <= self._max_duration)]

        self.manifest = self.manifest.sort_values(by=["duration"],
                                                  ascending=True)

        self._normalizer = FeatureNormalizer(mean_std_filepath)

        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)

        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)

    def __len__(self):
        return len(self.manifest)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        instance = self.manifest.iloc[idx]
        if self._segmented is True:
            specgram, transcript = self.process_utterance(
                instance["audio_path"], instance["text"], segments_info=None)
        else:
            specgram, transcript = self.process_utterance(
                instance["audio_path"],
                instance["text"],
                segments_info={
                    "start": instance["st"],
                    "end": instance["et"]
                })

        uttid = instance["uttid"]
        sample = {
            "uttid": uttid,
            "specgrams": specgram,
            "text": transcript,
            "trans": instance["text"]
        }

        return sample

    def process_utterance(self,
                          audio_file,
                          transcript,
                          uttid=None,
                          segments_info=None):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and data of transcription part,
                 where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """

        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        elif segments_info is None:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        else:
            speech_segment = SpeechSegment.slice_from_file(
                audio_file, transcript, **segments_info)

        # augment speech. i.e. add noise, speedup
        self._augmentation_pipeline.transform_audio(speech_segment)

        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)

        specgram = self._normalizer.apply(specgram)
        return specgram, transcript_part
Example #4
0
class DataGenerator(object):
    """
    DataGenerator provides basic audio data preprocessing pipeline, and offers
    data reader interfaces of PaddlePaddle requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: basestring
    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|basestring
    :param augmentation_config: Augmentation configuration in json string.
                                Details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :types max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param use_dB_normalization: Whether to normalize the audio to -20 dB
                                before extracting the features.
    :type use_dB_normalization: bool
    :param num_threads: Number of CPU threads for processing data.
    :type num_threads: int
    :param random_seed: Random seed.
    :type random_seed: int
    """
    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 use_dB_normalization=True,
                 num_threads=multiprocessing.cpu_count() // 2,
                 random_seed=0):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            use_dB_normalization=use_dB_normalization)
        self._num_threads = num_threads
        self._rng = random.Random(random_seed)
        self._epoch = 0
        # for caching tar files info
        self._local_data = local()
        self._local_data.tar2info = {}
        self._local_data.tar2object = {}

    def process_utterance(self, filename, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param filename: Audio filepath
        :type filename: basestring | file
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and list of token ids for
                 transcription.
        :rtype: tuple of (2darray, list)
        """
        speech_segment = SpeechSegment.from_file(filename, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
        specgram = self._normalizer.apply(specgram)
        return specgram, text_ids

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             min_batch_size=1,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             shuffle_method="batch_shuffle"):
        """
        Batch data reader creator for audio data. Return a callable generator
        function to produce batches of data.

        Audio features within one batch will be padded with zeros to have the
        same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: basestring
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param min_batch_size: Any batch with batch size smaller than this will
                               be discarded. (To be deprecated in the future.)
        :type min_batch_size: int
        :param padding_to:  If set -1, the maximun shape in the batch
                            will be used as the target shape for padding.
                            Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set True, audio features will be flatten to 1darray.
        :type flatten: bool
        :param sortagrad: If set True, sort the instances by audio duration
                          in the first epoch for speed up training.
        :type sortagrad: bool
        :param shuffle_method: Shuffle method. Options:
                                '' or None: no shuffle.
                                'instance_shuffle': instance-wise shuffle.
                                'batch_shuffle': similarly-sized instances are
                                                 put into batches, and then
                                                 batch-wise shuffle the batches.
                                                 For more details, please see
                                                 ``_batch_shuffle.__doc__``.
                                'batch_shuffle_clipped': 'batch_shuffle' with
                                                         head shift and tail
                                                         clipping. For more
                                                         details, please see
                                                         ``_batch_shuffle``.
                              If sortagrad is True, shuffle is disabled
                              for the first epoch.
        :type shuffle_method: None|str
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """
        def batch_reader():
            # read manifest
            manifest = read_manifest(manifest_path=manifest_path,
                                     max_duration=self._max_duration,
                                     min_duration=self._min_duration)
            # sort (by duration) or batch-wise shuffle the manifest
            if self._epoch == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
            else:
                if shuffle_method == "batch_shuffle":
                    manifest = self._batch_shuffle(manifest,
                                                   batch_size,
                                                   clipped=False)
                elif shuffle_method == "batch_shuffle_clipped":
                    manifest = self._batch_shuffle(manifest,
                                                   batch_size,
                                                   clipped=True)
                elif shuffle_method == "instance_shuffle":
                    self._rng.shuffle(manifest)
                elif shuffle_method == None:
                    pass
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     shuffle_method)
            # prepare batches
            instance_reader = self._instance_reader_creator(manifest)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            if len(batch) >= min_batch_size:
                yield self._padding_batch(batch, padding_to, flatten)
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """Returns data reader's feeding dict.

        :return: Data feeding dict.
        :rtype: dict
        """
        return {"audio_spectrogram": 0, "transcript_text": 1}

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map containing tarinfoes
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _get_file_object(self, file):
        """Get file object by file path.

        If file startwith tar, it will return a tar file object
        and cached tar file info for next reading request.
        It will return file directly, if the type of file is not str.
        """
        if file.startswith('tar:'):
            tarpath, filename = file.split(':', 1)[1].split('#', 1)
            if 'tar2info' not in self._local_data.__dict__:
                self._local_data.tar2info = {}
            if 'tar2object' not in self._local_data.__dict__:
                self._local_data.tar2object = {}
            if tarpath not in self._local_data.tar2info:
                object, infoes = self._parse_tar(tarpath)
                self._local_data.tar2info[tarpath] = infoes
                self._local_data.tar2object[tarpath] = object
            return self._local_data.tar2object[tarpath].extractfile(
                self._local_data.tar2info[tarpath][filename])
        else:
            return open(file, 'r')

    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator. Create a callable function to produce
        instances of data.

        Instance: a tuple of ndarray of audio spectrogram and a list of
        token indices for transcript.
        """
        def reader():
            for instance in manifest:
                yield instance

        def mapper(instance):
            return self.process_utterance(
                self._get_file_object(instance["audio_filepath"]),
                instance["text"])

        return paddle.reader.xmap_readers(mapper,
                                          reader,
                                          self._num_threads,
                                          1024,
                                          order=True)

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one bach.

        If ``padding_to`` is -1, the maximun shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).

        If `flatten` is True, features will be flatten to 1darray.
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError(
                    "If padding_to is not -1, it should be larger "
                    "than any instance's shape in the batch")
            max_length = padding_to
        # padding
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            new_batch.append((padded_audio, text))
        return new_batch

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used for generate
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled mainifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
        self._rng.shuffle(batch_manifest)
        batch_manifest = list(sum(batch_manifest, ()))
        if not clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest