Example #1
    def load(
            self,
            start: Optional[Seconds] = None,
            duration: Optional[Seconds] = None,
    ) -> np.ndarray:
        # noinspection PyArgumentList
        storage = get_reader(self.storage_type)(self.storage_path)
        left_offset_frames, right_offset_frames = 0, None

        if start is None:
            start = self.start
        # In case the caller requested only a sub-span of the features, trim them.
        # Left trim
        if start < self.start - 1e-5:
            raise ValueError(f"Cannot load features for recording {self.recording_id} starting from {start}s. "
                             f"The available range is ({self.start}, {self.end}) seconds.")
        if not isclose(start, self.start):
            left_offset_frames = compute_num_frames(start - self.start, frame_shift=self.frame_shift,
                                                    sampling_rate=self.sampling_rate)
        # Right trim
        end = start + duration if duration is not None else None
        if duration is not None and not isclose(end, self.end):
            right_offset_frames = left_offset_frames + compute_num_frames(duration, frame_shift=self.frame_shift,
                                                                          sampling_rate=self.sampling_rate)

        # Load and return the features (subset) from the storage
        return storage.read(
            self.storage_key,
            left_offset_frames=left_offset_frames,
            right_offset_frames=right_offset_frames
        )
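A minimal sketch of the trimming arithmetic above, assuming only that compute_num_frames rounds a time span to a whole number of frames; the concrete values and variable names are illustrative, not taken from the source:

from lhotse.utils import compute_num_frames

frame_shift = 0.01         # 10 ms frames
sampling_rate = 16000
feats_start = 2.0          # the manifest says the stored features begin at 2.0 s
requested_start = 2.5      # the caller asks for a sub-span starting at 2.5 s
requested_duration = 1.0

# Left trim: how many frames to skip at the beginning of the stored matrix.
left = compute_num_frames(requested_start - feats_start,
                          frame_shift=frame_shift, sampling_rate=sampling_rate)
# Right trim: one past the last frame to keep.
right = left + compute_num_frames(requested_duration,
                                  frame_shift=frame_shift, sampling_rate=sampling_rate)
print(left, right)  # expected: 50 150, i.e. rows [50:150) of the stored feature matrix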
Example #2
    def __call__(self, cuts: CutSet) -> Tuple[torch.Tensor, torch.IntTensor]:
        """
        Reads the audio samples from recordings on disk/other storage
        and computes their features.
        The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :return: a tensor with collated features, and a tensor of ``num_frames`` of each cut before padding.
        """
        audio, _ = collate_audio(cuts)

        for tfnm in self.wave_transforms:
            audio = tfnm(audio)

        features_single = []
        for idx, cut in enumerate(cuts):
            samples = audio[idx].numpy()
            try:
                features = self.extractor.extract(samples,
                                                  cuts[idx].sampling_rate)
            except Exception:
                logging.error(
                    f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                )
                raise
            features_single.append(torch.from_numpy(features))
        features_batch = torch.stack(features_single)

        feature_lens = torch.tensor([
            compute_num_frames(cut.duration, self.extractor.frame_shift,
                               cut.sampling_rate) for cut in cuts
        ],
                                    dtype=torch.int32)

        return features_batch, feature_lens
Example #3
def validate_features(f: Features,
                      read_data: bool = False,
                      feats_data: Optional[np.ndarray] = None) -> None:
    assert f.start >= 0, \
        f'Features: start has to be greater than or equal to 0 (is {f.start})'
    assert f.duration > 0, \
        f'Features: duration has to be greater than 0 (is {f.duration})'
    assert f.num_frames > 0, \
        f'Features: num_frames has to be greater than 0 (is {f.num_frames})'
    assert f.num_features > 0, \
        f'Features: num_features has to be greater than 0 (is {f.num_features})'
    assert f.sampling_rate > 0, \
        f'Features: sampling_rate has to be greater than 0 (is {f.sampling_rate})'
    assert f.frame_shift > 0, \
        f'Features: frame_shift has to be greater than 0 (is {f.frame_shift})'
    window_hop = round(f.frame_shift * f.sampling_rate, ndigits=12)
    assert float(int(window_hop)) == window_hop, \
        f'Features: frame_shift of {f.frame_shift} is incorrect because it is physically impossible; ' \
        f'multiplying it by a sampling rate of {f.sampling_rate} results in a fractional window hop ' \
        f'of {window_hop} samples.'
    expected_num_frames = compute_num_frames(duration=f.duration,
                                             frame_shift=f.frame_shift,
                                             sampling_rate=f.sampling_rate)
    assert expected_num_frames == f.num_frames, \
        f'Features: manifest is inconsistent: declared num_frames is {f.num_frames}, ' \
        f'but duration ({f.duration}s) / frame_shift ({f.frame_shift}s) results in {expected_num_frames} frames. ' \
        f'If you\'re using a custom feature extractor, you might need to ensure that it preserves ' \
        f'this relationship between duration, frame_shift and num_frames (use rounding up if needed - ' \
        f'see lhotse.utils.compute_num_frames).'
    if read_data or feats_data is not None:
        if read_data:
            feats_data = f.load()
        n_fr, n_ft = feats_data.shape
        assert f.num_frames == n_fr, f'Features: expected num_frames: {f.num_frames}, actual: {n_fr}'
        assert f.num_features == n_ft, f'Features: expected num_features: {f.num_features}, actual: {n_ft}'
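A concrete illustration of the window-hop check above, using made-up frame_shift / sampling_rate pairs:

# 10 ms frames at 16 kHz hop exactly 160 samples, so validation passes.
window_hop = round(0.01 * 16000, ndigits=12)
assert float(int(window_hop)) == window_hop      # 160.0
# 10 ms frames at 22.05 kHz would hop 220.5 samples, which is physically impossible,
# so this frame_shift / sampling_rate combination would be rejected.
window_hop = round(0.01 * 22050, ndigits=12)
assert float(int(window_hop)) != window_hop      # 220.5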
Example #4
def test_cut_with_temporal_array_move_to_memory_large_offset():
    path = "test/fixtures/libri/cuts.json"
    cut = CutSet.from_file(path)[0]
    cut.start = 10.0
    cut.duration = 1.5

    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(f.name) as w:
        arr = np.array(
            np.arange(
                compute_num_frames(cut.duration,
                                   frame_shift=0.01,
                                   sampling_rate=16000)))
        cut.custom_array = w.store_array(
            key="dummy-key",
            value=arr,
            frame_shift=0.01,
            temporal_dim=0,
            start=cut.start,
        )

        cut_mem = cut.move_to_memory()
        arr_mem = cut_mem.load_custom_array()

        assert arr.dtype == arr_mem.dtype
        np.testing.assert_equal(arr, arr_mem)

        arr_trunc = cut.truncate(duration=0.5).load_custom_array()
        arr_mem_trunc = cut_mem.truncate(duration=0.5).load_custom_array()

        assert arr_trunc.dtype == arr_mem_trunc.dtype
        np.testing.assert_equal(arr_trunc, arr_mem_trunc)
Example #5
    def __call__(
        self, cuts: CutSet
    ) -> Union[
        Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
    ]:
        """
        Reads the audio samples from recordings on disk/other storage
        and computes their features.
        The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :return: a tensor with collated features, a tensor of ``num_frames`` of each cut before padding,
            and, when ``fault_tolerant=True``, the (possibly filtered) ``CutSet`` of successfully read cuts.
        """
        audios, cuts = read_audio_from_cuts(
            cuts,
            executor=_get_executor(self.num_workers, executor_type=self._executor_type),
            suppress_errors=self.fault_tolerant,
        )

        for tfnm in self.wave_transforms:
            for idx in range(len(audios)):
                audios[idx] = tfnm(audios[idx])

        if self.use_batch_extract:
            # Batch extraction is possibly faster depending on the implementation
            # of the feature extractor.
            assert all(c.sampling_rate == cuts[0].sampling_rate for c in cuts)
            features_single = self.extractor.extract_batch(
                audios, sampling_rate=cuts[0].sampling_rate
            )
        else:
            # Sequential extraction allows the sampling rates to be different.
            features_single = []
            for idx, cut in enumerate(cuts):
                samples = audios[idx].numpy()
                try:
                    features = self.extractor.extract(samples, cuts[idx].sampling_rate)
                except Exception:
                    logging.error(
                        f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                    )
                    raise
                features_single.append(torch.from_numpy(features))

        features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

        feature_lens = torch.tensor(
            [
                compute_num_frames(
                    cut.duration, self.extractor.frame_shift, cut.sampling_rate
                )
                for cut in cuts
            ],
            dtype=torch.int32,
        )

        if self.fault_tolerant:
            return features_batch, feature_lens, cuts
        else:
            return features_batch, feature_lens
Example #6
 def asserted_num_frames(start: Seconds, duration: Seconds, frame_shift: Seconds) -> int:
     """
     This closure will compute the num_frames, correct off-by-one errors in edge cases,
     and assert that the supervision does not exceed the feature matrix temporal dimension.
     """
     offset = compute_num_frames(start, frame_shift=frame_shift)
     num_frames = compute_num_frames(duration, frame_shift=frame_shift)
     diff = features.shape[1] - (offset + num_frames)
     # Note: we tolerate off-by-ones because some mixed cuts could have one frame more
     # than their duration suggests (we will try to change this eventually).
     if diff == -1:
         num_frames -= 1
     assert offset + num_frames <= features.shape[1], \
         f"Unexpected num_frames ({offset + num_frames}) exceeding features time dimension for a supervision " \
         f"({features.shape[1]}) when constructing a batch; please report this in Lhotse's GitHub issues, " \
         "ideally providing the Cut data that triggered this."
     return num_frames
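Toy numbers for the off-by-one tolerance above (hypothetical values, not from the source): the supervision nominally ends one frame past the feature matrix, so its num_frames is clipped by one.

T = 100                              # features.shape[1]
offset, num_frames = 20, 81          # pretend outputs of compute_num_frames
diff = T - (offset + num_frames)     # -1: one frame too many
if diff == -1:
    num_frames -= 1                  # 80; now offset + num_frames == T
assert offset + num_frames <= T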
Example #7
    def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """
        Return a new batch, with the batch size automatically determined using the constraints
        of max_frames and max_cuts.
        """
        from torch.utils.data._utils.collate import default_collate

        # Collect the cuts that will form a batch, satisfying the criteria of max_cuts and max_frames.
        # The returned object is a CutSet that we can keep on modifying (e.g. padding, mixing, etc.)
        cuts: CutSet = self._collect_batch()

        # For now, we'll just pad it with low energy values to match the longest Cut's
        # duration in the batch. We might want to do something more interesting here
        # later on - padding/mixing with noises, etc.
        cuts = cuts.sort_by_duration().pad()

        # Get a tensor with batched feature matrices, shape (B, T, F)
        features = _collate_features(cuts)

        def asserted_num_frames(start: Seconds, duration: Seconds,
                                frame_shift: Seconds) -> int:
            """
            This closure will compute the num_frames, correct off-by-one errors in edge cases,
            and assert that the supervision does not exceed the feature matrix temporal dimension.
            """
            offset = compute_num_frames(start, frame_shift=frame_shift)
            num_frames = compute_num_frames(duration, frame_shift=frame_shift)
            diff = features.shape[1] - (offset + num_frames)
            # Note: we tolerate off-by-ones because some mixed cuts could have one frame more
            # than their duration suggests (we will try to change this eventually).
            if diff == -1:
                num_frames -= 1
            assert offset + num_frames <= features.shape[1], \
                f"Unexpected num_frames ({offset + num_frames}) exceeding features time dimension for a supervision " \
                f"({features.shape[1]}) when constructing a batch; please report this in Lhotse's GitHub issues, " \
                "ideally providing the Cut data that triggered this."
            return num_frames

        return {
            'features': features,
            'supervisions': default_collate([
                {
                    'cut_id': cut.id,
                    'sequence_idx': sequence_idx,
                    'text': supervision.text,
                    'start_frame': compute_num_frames(supervision.start, cut.frame_shift),
                    'num_frames': asserted_num_frames(supervision.start, supervision.duration,
                                                      cut.frame_shift),
                }
                for sequence_idx, cut in enumerate(cuts)
                for supervision in cut.supervisions
            ])
        }
Example #8
    def __next__(self) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """
        Return a new batch, with the batch size automatically determined using the constraints
        of max_frames and max_cuts.
        """
        # Collect the cuts that will form a batch, satisfying the criteria of max_cuts and max_frames.
        # The returned object is a CutSet that we can keep on modifying (e.g. padding, mixing, etc.)
        cuts: CutSet = self._collect_batch()

        # For now, we'll just pad it with low energy values to match the longest Cut's
        # duration in the batch. We might want to do something more interesting here
        # later on - padding/mixing with noises, etc.
        cuts = cuts.sort_by_duration().pad()

        # Get a tensor with batched feature matrices, shape (B, T, F)
        features = collate_features(cuts)

        batch = {
            'features': features,
            'supervisions': default_collate([
                {
                    'sequence_idx': sequence_idx,
                    'text': supervision.text,
                    'start_frame': compute_num_frames(
                        supervision.start,
                        frame_shift=cut.frame_shift,
                        # Note: Rounding "floor" can sometimes result in one extra frame being included
                        # in the left context; but it guarantees that we will never go out-of-bounds when
                        # summing start_frame + num_frames.
                        rounding=ROUND_FLOOR
                    ),
                    'num_frames': compute_num_frames(
                        supervision.duration,
                        frame_shift=cut.frame_shift
                    )
                }
                for sequence_idx, cut in enumerate(cuts)
                for supervision in cut.supervisions
            ])
        }
        if self.return_cuts:
            batch['supervisions']['cut'] = [cut for cut in cuts for sup in cut.supervisions]

        return batch
Example #9
 def _pad_frames(self, samples: np.ndarray, feats: np.ndarray,
                 sampling_rate: int) -> np.ndarray:
     """Adds last diff frames to the end of feats matrix to fit lhotse.utils.compute_num_frames."""
     duration = np.shape(samples)[1] / sampling_rate
     diff = (compute_num_frames(duration, self.frame_shift, sampling_rate) -
             np.shape(feats)[0])
     if abs(diff) >= 6:
         warnings.warn(f"Unusual difference in number of frames: {diff}")
     if diff > 0:
         # Too few frames: repeat the last `diff` frames at the end.
         feats = np.append(feats, feats[-diff:, :], axis=0)
     elif diff < 0:
         # Too many frames: drop the trailing extra frames (note: diff is negative here).
         feats = feats[:diff, :]
     return feats
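A tiny self-contained demonstration of the pad/truncate mechanics above, using made-up frame counts rather than real extractor output:

import numpy as np

feats = np.zeros((98, 40))          # pretend the extractor produced 98 frames
expected = 100                      # pretend compute_num_frames expects 100
diff = expected - feats.shape[0]    # +2: repeat the last two frames
if diff > 0:
    feats = np.append(feats, feats[-diff:, :], axis=0)
elif diff < 0:
    feats = feats[:diff, :]
assert feats.shape[0] == expected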
Example #10
def test_num_frames(
    feature_set,
    feature_level,
):
    sr = 8000
    duration = 12.059
    config = OpenSmileConfig(
        feature_set=feature_set,
        feature_level=feature_level,
        sampling_rate=sr,
        resample=True,
    )
    feature_extractor = OpenSmileExtractor(config=config)

    num_frames = compute_num_frames(duration, feature_extractor.frame_shift,
                                    sr)
    num_samples = compute_num_samples(duration, sr)

    signal = np.random.rand(1, num_samples)
    y = feature_extractor.extract(signal, sr)
    assert np.shape(y)[0] == num_frames
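Rough arithmetic behind the assertions above, assuming a hypothetical 10 ms frame shift (the real value depends on the chosen OpenSmile feature_set and feature_level):

duration, sr, frame_shift = 12.059, 8000, 0.01
num_samples = round(duration * sr)            # 96472 samples
num_frames = round(duration / frame_shift)    # 1206 frames (12.059 s / 0.01 s)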
Example #11
    def __getitem__(self,
                    cuts: CutSet) -> Dict[str, Union[torch.Tensor, List[str]]]:
        """
        Return a new batch, with the batch size automatically determined using the constraints
        of max_frames and max_cuts.
        """
        validate_for_asr(cuts)

        self.hdf5_fix.update()

        # Sort the cuts by duration so that the first one determines the batch time dimensions.
        cuts = cuts.sort_by_duration(ascending=False)

        # Optional CutSet transforms - e.g. padding, or speed perturbation that adjusts
        # the supervision boundaries.
        for tnfm in self.cut_transforms:
            cuts = tnfm(cuts)

        # Get a tensor with batched feature matrices, shape (B, T, F)
        # Collation performs auto-padding, if necessary.
        input_tpl = self.input_strategy(cuts)
        if len(input_tpl) == 3:
            # An input strategy with fault tolerant audio reading mode.
            # "cuts" may be a subset of the original "cuts" variable,
            # that only has cuts for which we successfully read the audio.
            inputs, _, cuts = input_tpl
        else:
            inputs, _ = input_tpl

        # Get a dict of tensors that encode the positional information about supervisions
        # in the batch of feature matrices. The tensors are named "sequence_idx",
        # "start_frame/sample" and "num_frames/samples".
        supervision_intervals = self.input_strategy.supervision_intervals(cuts)

        # Apply all available transforms on the inputs, i.e. either audio or features.
        # This could be feature extraction, global MVN, SpecAugment, etc.
        segments = torch.stack(list(supervision_intervals.values()), dim=1)
        for tnfm in self.input_transforms:
            inputs = tnfm(inputs, supervision_segments=segments)

        batch = {
            "inputs": inputs,
            "supervisions": default_collate([
                {"text": supervision.text}
                for sequence_idx, cut in enumerate(cuts)
                for supervision in cut.supervisions
            ]),
        }
        # Update the 'supervisions' field with sequence_idx and start/num frames/samples
        batch["supervisions"].update(supervision_intervals)
        if self.return_cuts:
            batch["supervisions"]["cut"] = [
                cut for cut in cuts for sup in cut.supervisions
            ]

        has_word_alignments = all(
            s.alignment is not None and "word" in s.alignment for c in cuts
            for s in c.supervisions)
        if has_word_alignments:
            # TODO: might need to refactor BatchIO API to move the following conditional logic
            #       into these objects (e.g. use like: self.input_strategy.convert_timestamp(),
            #       that returns either num_frames or num_samples depending on the strategy).
            words, starts, ends = [], [], []
            frame_shift = cuts[0].frame_shift
            sampling_rate = cuts[0].sampling_rate
            if frame_shift is None:
                try:
                    frame_shift = self.input_strategy.extractor.frame_shift
                except AttributeError:
                    raise ValueError(
                        "Can't determine the frame_shift -- it is not present in either the cuts or the input_strategy."
                    )
            for c in cuts:
                for s in c.supervisions:
                    words.append(
                        [aliword.symbol for aliword in s.alignment["word"]])
                    starts.append([
                        compute_num_frames(
                            aliword.start,
                            frame_shift=frame_shift,
                            sampling_rate=sampling_rate,
                        ) for aliword in s.alignment["word"]
                    ])
                    ends.append([
                        compute_num_frames(
                            aliword.end,
                            frame_shift=frame_shift,
                            sampling_rate=sampling_rate,
                        ) for aliword in s.alignment["word"]
                    ])
            batch["supervisions"]["word"] = words
            batch["supervisions"]["word_start"] = starts
            batch["supervisions"]["word_end"] = ends

        return batch
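Illustrative arithmetic for the word-alignment conversion above (plain Python with made-up timestamps, not the lhotse API): with a 10 ms frame shift, a word aligned to 1.23-1.57 s maps to frames 123-157.

frame_shift = 0.01                                     # 10 ms frames
word_start_s, word_end_s = 1.23, 1.57                  # hypothetical alignment of one word
word_start_frame = round(word_start_s / frame_shift)   # 123
word_end_frame = round(word_end_s / frame_shift)       # 157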
Example #12
    def add_to_mix(
        self,
        feats: np.ndarray,
        sampling_rate: int,
        snr: Optional[Decibels] = None,
        offset: Seconds = 0.0,
    ):
        """
        Add feature matrix of a new track into the mix.
        :param feats: A 2D feature matrix to be mixed in.
        :param sampling_rate: The sampling rate of ``feats``
        :param snr: Signal-to-noise ratio, assuming ``feats`` represents noise (positive SNR - lower ``feats`` energy,
            negative SNR - higher ``feats`` energy).
        :param offset: How many seconds to shift ``feats`` in time. For mixing, the signal will be padded before
            the start with low energy values.
        """
        assert offset >= 0.0, "Negative offset in mixing is not supported."

        reference_feats = self.tracks[0]
        num_frames_offset = compute_num_frames(duration=offset,
                                               frame_shift=self.frame_shift,
                                               sampling_rate=sampling_rate)
        current_num_frames = reference_feats.shape[0]
        incoming_num_frames = feats.shape[0] + num_frames_offset
        mix_num_frames = max(current_num_frames, incoming_num_frames)

        feats_to_add = feats

        # When the existing frames are less than what we anticipate after the mix,
        # we need to pad after the end of the existing features mixed so far.
        if current_num_frames < mix_num_frames:
            for idx in range(len(self.tracks)):
                padded_track = np.vstack([
                    self.tracks[idx],
                    self.padding_value * np.ones(
                        (mix_num_frames - current_num_frames,
                         self.num_features),
                        dtype=self.dtype,
                    ),
                ])
                self.tracks[idx] = padded_track

        # When there is an offset, we need to pad before the start of the features we're adding.
        if offset > 0:
            feats_to_add = np.vstack([
                self.padding_value * np.ones(
                    (num_frames_offset, self.num_features), dtype=self.dtype),
                feats_to_add,
            ])

        # When the features we're mixing in are shorter than the anticipated mix length,
        # we need to pad after their end.
        # Note: we're doing that inefficiently, as we potentially re-allocate numpy arrays twice,
        # during this padding and the offset padding before. If that's a bottleneck, we'll optimize.
        if incoming_num_frames < mix_num_frames:
            feats_to_add = np.vstack([
                feats_to_add,
                self.padding_value * np.ones(
                    (mix_num_frames - incoming_num_frames, self.num_features),
                    dtype=self.dtype,
                ),
            ])

        # When SNR is requested, find what gain is needed to satisfy the SNR
        gain = 1.0
        if snr is not None:
            # Compute the added signal energy before it was padded
            added_feats_energy = self.feature_extractor.compute_energy(feats)
            if added_feats_energy <= 0.0:
                raise NonPositiveEnergyError(
                    f"To perform mix, energy must be positive (got {added_feats_energy})."
                )
            target_energy = self.reference_energy * (10.0**(-snr / 10))
            gain = target_energy / added_feats_energy

        self.tracks.append(feats_to_add)
        self.gains.append(gain)
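A worked example of the SNR-to-gain computation above, with illustrative energies:

reference_energy = 1.0        # energy of the reference (first) track
added_feats_energy = 4.0      # energy of the track being mixed in
snr = 10                      # dB; positive SNR means the added track ends up quieter
target_energy = reference_energy * (10.0 ** (-snr / 10))    # 0.1
gain = target_energy / added_feats_energy                   # 0.025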
Example #13
    def __call__(
        self, cuts: CutSet
    ) -> Union[
        Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, CutSet]
    ]:
        """
        Reads the audio samples from recordings on disk/other storage
        and computes their features.
        The returned shape is ``(B, T, F) => (batch_size, num_frames, num_features)``.

        :return: a tuple of objects: ``(feats, feat_lens, [audios, audio_lens], [cuts])``.
            Tensors ``audios`` and ``audio_lens`` are returned when ``return_audio=True``.
            CutSet ``cuts`` is returned when ``fault_tolerant=True``.
        """
        audios, cuts = read_audio_from_cuts(
            cuts,
            executor=_get_executor(self.num_workers, executor_type=self._executor_type),
            suppress_errors=self.fault_tolerant,
        )

        for tfnm in self.wave_transforms:
            for idx in range(len(audios)):
                audios[idx] = tfnm(audios[idx])

        if self.use_batch_extract:
            # Batch extraction is possibly faster depending on the implementation
            # of the feature extractor.
            assert all(c.sampling_rate == cuts[0].sampling_rate for c in cuts)
            features_single = self.extractor.extract_batch(
                audios, sampling_rate=cuts[0].sampling_rate
            )
        else:
            # Sequential extraction allows the sampling rates to be different.
            features_single = []
            for idx, cut in enumerate(cuts):
                samples = audios[idx].numpy()
                try:
                    features = self.extractor.extract(samples, cuts[idx].sampling_rate)
                except Exception:
                    logging.error(
                        f"Error while extracting the features for cut with ID {cut.id} -- details:\n{cut}"
                    )
                    raise
                features_single.append(torch.from_numpy(features))

        features_batch = collate_matrices(features_single, padding_value=LOG_EPSILON)

        feature_lens = torch.tensor(
            [
                compute_num_frames(
                    cut.duration, self.extractor.frame_shift, cut.sampling_rate
                )
                for cut in cuts
            ],
            dtype=torch.int64,
        )

        out = (features_batch, feature_lens)

        if self.return_audio:
            audios = [a.squeeze(0) for a in audios]  # (1, T) -> (T, )
            audio_lens = torch.tensor([a.size(0) for a in audios], dtype=torch.int64)
            audios = collate_vectors(audios, padding_value=0)

            out = out + (audios, audio_lens)

        if self.fault_tolerant:
            out = out + (cuts,)

        return out
Example #14
def logmelfilterbank(
    audio: np.ndarray,
    sampling_rate: int,
    fft_size: int = 1024,
    hop_size: int = 256,
    win_length: int = None,
    window: str = "hann",
    num_mel_bins: int = 80,
    fmin: int = 80,
    fmax: int = 7600,
    eps: float = EPSILON,
):
    """Compute log-Mel filterbank feature.

    Args:
        audio (ndarray): Audio signal (T,).
        sampling_rate (int): Sampling rate.
        fft_size (int): FFT size.
        hop_size (int): Hop size.
        win_length (int): Window length. If set to None, it will be the same as fft_size.
        window (str): Window function type.
        num_mel_bins (int): Number of mel basis.
        fmin (int): Minimum frequency in mel basis calculation.
        fmax (int): Maximum frequency in mel basis calculation.
        eps (float): Epsilon value to avoid inf in log calculation.
    Returns:
        ndarray: Log Mel filterbank feature (num_frames, num_mel_bins).
    """
    if is_module_available("librosa"):
        import librosa
    else:
        raise ImportError(
            "Librosa is not installed. Please install librosa before using LibrosaFbank extractor."
        )

    if len(audio.shape) == 2:
        assert (
            audio.shape[0] == 1
        ), f"LibrosaFbank works only with single-channel recordings (shape: {audio.shape})"
        audio = audio[0]
    else:
        assert (
            len(audio.shape) == 1
        ), f"LibrosaFbank works only with single-channel recordings (shape: {audio.shape})"

    x_stft = librosa.stft(
        audio,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window,
        pad_mode="reflect",
    )
    spc = np.abs(x_stft).T

    fmin = 0 if fmin is None else fmin
    fmax = sampling_rate / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=fft_size, n_mels=num_mel_bins,
                                    fmin=fmin, fmax=fmax)

    feats = np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

    expected_num_frames = compute_num_frames(
        duration=len(audio) / sampling_rate,
        frame_shift=hop_size / sampling_rate,
        sampling_rate=sampling_rate,
    )
    feats = pad_or_truncate_features(feats, expected_num_frames)
    return feats