Example 1
    def perturb_speed(self,
                      factor: float,
                      sampling_rate: int,
                      affix_id: bool = True) -> 'SupervisionSegment':
        """
        Return a ``SupervisionSegment`` that has time boundaries matching the
        recording/cut perturbed with the same factor.

        :param factor: The speed will be adjusted this many times (e.g. factor=1.1 means 1.1x faster).
        :param sampling_rate: The sampling rate is necessary to accurately perturb the start
            and duration (going through the sample counts).
        :param affix_id: When true, we will modify the ``id`` and ``recording_id`` fields
            by affixing it with "_sp{factor}".
        :return: a modified copy of the current ``SupervisionSegment``.
        """
        start_sample = compute_num_samples(self.start, sampling_rate)
        num_samples = compute_num_samples(self.duration, sampling_rate)
        new_start = perturb_num_samples(start_sample, factor) / sampling_rate
        new_duration = perturb_num_samples(num_samples, factor) / sampling_rate
        return fastcopy(self,
                        id=f'{self.id}_sp{factor}' if affix_id else self.id,
                        recording_id=f'{self.recording_id}_sp{factor}'
                        if affix_id else self.recording_id,
                        start=new_start,
                        duration=new_duration,
                        alignment={
                            ali_type: [
                                item.perturb_speed(factor=factor,
                                                   sampling_rate=sampling_rate)
                                for item in ali
                            ]
                            for ali_type, ali in self.alignment.items()
                        } if self.alignment else None)
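A minimal usage sketch for the method above (hypothetical IDs and times; assumes Lhotse is installed):

from lhotse import SupervisionSegment

seg = SupervisionSegment(id='utt1', recording_id='rec1',
                         start=2.0, duration=3.0, channel=0)
sp = seg.perturb_speed(factor=1.1, sampling_rate=16000)
print(sp.id)                  # 'utt1_sp1.1'
print(sp.start, sp.duration)  # roughly 2.0 / 1.1 and 3.0 / 1.1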
Example 2
def read_audio(
    path_or_fd: Union[Pathlike, FileObject],
    offset: Seconds = 0.0,
    duration: Optional[Seconds] = None,
    force_audioread: Optional[bool] = False,
) -> Tuple[np.ndarray, int]:
    try:
        if force_audioread:
            # Skip soundfile entirely and jump to the audioread fallback below.
            raise Exception
        import soundfile as sf
        with sf.SoundFile(path_or_fd) as sf_desc:
            sampling_rate = sf_desc.samplerate
            if offset > 0:
                # Seek to the start of the target read
                sf_desc.seek(compute_num_samples(offset, sampling_rate))
            if duration is not None:
                frame_duration = compute_num_samples(duration, sampling_rate)
            else:
                frame_duration = -1
            # Load the target number of frames, and transpose to match librosa form
            return sf_desc.read(frames=frame_duration,
                                dtype=np.float32,
                                always_2d=False).T, sampling_rate
    except Exception:
        # soundfile is unavailable or failed to decode the file; fall back to audioread.
        return _audioread_load(path_or_fd, offset=offset, duration=duration)
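A minimal call sketch (hypothetical file path); with ``always_2d=False``, mono audio comes back as a 1-D float32 array:

samples, sampling_rate = read_audio('speech.wav', offset=1.0, duration=2.5)
assert samples.dtype == np.float32
# len(samples) should equal compute_num_samples(2.5, sampling_rate).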
Example 3
    def reverse_timestamps(
            self, offset: Seconds, duration: Optional[Seconds],
            sampling_rate: int) -> Tuple[Seconds, Optional[Seconds]]:
        """
        This method helps estimate the original offset and duration for a recording
        before resampling was applied.
        We need this estimate to know how much audio to actually load from disk during the
        call to ``load_audio()``.

        In case of resampling, the timestamps might change slightly when using non-trivial
        pairs of sampling rates, e.g. 16kHz -> 22.05kHz, because the rounded number of samples
        in the resampled audio may correspond to a slightly larger or smaller duration.
        E.g. 16kHz, 235636 samples correspond to 14.72725s duration; after resampling to 22.05kHz,
        it is 324736 samples which correspond to 14.727256235827664s duration.
        """
        # Note: ``sampling_rate`` refers to the resampled recording; the original
        # timestamps are reconstructed using the source sampling rate.
        old_num_samples = compute_num_samples(offset,
                                              self.source_sampling_rate,
                                              rounding=ROUND_HALF_UP)
        old_offset = old_num_samples / self.source_sampling_rate
        if duration is not None:
            old_num_samples = compute_num_samples(duration,
                                                  self.source_sampling_rate,
                                                  rounding=ROUND_HALF_UP)
            old_duration = old_num_samples / self.source_sampling_rate
        else:
            old_duration = None
        return old_offset, old_duration
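The numbers in the docstring check out with plain arithmetic; a standalone sketch (no Lhotse required):

orig_sr, new_sr = 16000, 22050
orig_num_samples = 235636
duration = orig_num_samples / orig_sr       # 14.72725 s
new_num_samples = round(duration * new_sr)  # 324736 (rounded from 324735.8625)
print(new_num_samples / new_sr)             # 14.727256235827664 s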
Example 4
    def reverse_timestamps(
        self,
        offset: Seconds,
        duration: Optional[Seconds],
        sampling_rate: int,
    ) -> Tuple[Seconds, Optional[Seconds]]:
        """
        This method helps estimate the original offset and duration for a recording
        before tempo perturbation was applied.
        We need this estimate to know how much audio to actually load from disk during the
        call to ``load_audio()``.
        """
        start_sample = compute_num_samples(offset, sampling_rate)
        num_samples = (
            compute_num_samples(duration, sampling_rate)
            if duration is not None
            else None
        )
        start_sample = perturb_num_samples(start_sample, 1 / self.factor)
        num_samples = (
            perturb_num_samples(num_samples, 1 / self.factor)
            if num_samples is not None
            else None
        )
        return (
            start_sample / sampling_rate,
            num_samples / sampling_rate if num_samples is not None else None,
        )
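To see the inverse mapping concretely, here is a standalone sketch with a simplified perturb_num_samples (the real Lhotse helper applies careful Decimal rounding):

def perturb_num_samples(num_samples: int, factor: float) -> int:
    # Simplified stand-in: scale the sample count by 1 / factor.
    return round(num_samples / factor)

sr, tempo_factor = 16000, 1.1
offset = 2.0  # seconds, relative to the perturbed (sped-up) recording
start_sample = round(offset * sr)                                  # 32000
orig_sample = perturb_num_samples(start_sample, 1 / tempo_factor)  # 35200
print(orig_sample / sr)  # 2.2 s of the original audio must be loaded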
Example 5
    def perturb_speed(self, factor: float, sampling_rate: int) -> 'AlignmentItem':
        """
        Return an ``AlignmentItem`` that has time boundaries matching the
        recording/cut perturbed with the same factor.
        See :meth:`SupervisionSegment.perturb_speed` for details.
        """
        start_sample = compute_num_samples(self.start, sampling_rate)
        num_samples = compute_num_samples(self.duration, sampling_rate)
        new_start = perturb_num_samples(start_sample, factor) / sampling_rate
        new_duration = perturb_num_samples(num_samples, factor) / sampling_rate
        return AlignmentItem(self.symbol, new_start, new_duration)
Example 6
def read_audio(path: Pathlike, offset: Seconds, duration: Optional[Seconds]) -> Tuple[np.ndarray, int]:
    import soundfile as sf
    with sf.SoundFile(path) as sf_desc:
        sampling_rate = sf_desc.samplerate
        if offset:
            # Seek to the start of the target read
            sf_desc.seek(compute_num_samples(offset, sampling_rate))
        if duration is not None:
            frame_duration = compute_num_samples(duration, sampling_rate)
        else:
            frame_duration = -1
        # Load the target number of frames, and transpose to match librosa form
        return sf_desc.read(frames=frame_duration, dtype=np.float32, always_2d=False).T, sampling_rate
Example 7
def assert_and_maybe_fix_num_samples(audio: np.ndarray, offset: Seconds,
                                     duration: Optional[Seconds],
                                     recording: Recording) -> np.ndarray:
    # When resampling in high sampling rates (48k -> 44.1k)
    # it is difficult to estimate how sox will perform rounding;
    # we will just add/remove one sample to be consistent with
    # what we have estimated.
    # This effect is exacerbated by chaining multiple augmentations together.
    expected_num_samples = compute_num_samples(
        duration=duration if duration is not None else recording.duration -
        offset,
        sampling_rate=recording.sampling_rate)
    diff = expected_num_samples - audio.shape[1]
    if diff == 0:
        return audio  # this is the normal condition
    allowed_diff = int(
        ceil(AUGMENTATION_DURATION_TOLERANCE * recording.sampling_rate))
    if 0 < diff <= allowed_diff:
        # pad by repeating the last ``diff`` samples; the slice ``-diff:``
        # (note the colon) preserves the 2-D shape
        audio = np.append(audio, audio[:, -diff:], axis=1)
        return audio
    elif -allowed_diff <= diff < 0:
        audio = audio[:, :diff]
        return audio
    else:
        raise ValueError(
            "The number of declared samples in the recording diverged from the one obtained "
            f"when loading audio (offset={offset}, duration={duration}). "
            "This could be an internal Lhotse error or a faulty transform implementation. "
            "Please report this issue in Lhotse and show the "
            f"following: diff={diff}, audio.shape={audio.shape}, recording={recording}"
        )
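The pad/trim branch is easy to exercise in isolation; a self-contained demo with made-up numbers:

import numpy as np

audio = np.zeros((1, 16000), dtype=np.float32)  # 1 channel, 1 s at 16 kHz
expected_num_samples = 16002                    # estimate is off by 2 samples
diff = expected_num_samples - audio.shape[1]
padded = np.append(audio, audio[:, -diff:], axis=1)  # repeat the last 2 samples
print(padded.shape)  # (1, 16002)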
Example 8
    def resample(self, sampling_rate: int) -> 'Recording':
        """
        Return a new ``Recording`` that will be lazily resampled while loading audio.

        :param sampling_rate: The new sampling rate.
        :return: A resampled ``Recording``.
        """
        resampling = [
            Resample(source_sampling_rate=self.sampling_rate,
                     target_sampling_rate=sampling_rate).to_dict()
        ]
        new_num_samples = compute_num_samples(self.duration, sampling_rate,
                                              rounding=ROUND_HALF_UP)
        # Duration might need an adjustment when doing a non-trivial resampling
        # (e.g. 16000 -> 22050), where the resulting number of samples cannot
        # correspond to the old duration exactly.
        new_duration = new_num_samples / sampling_rate
        return fastcopy(
            self,
            duration=new_duration,
            num_samples=new_num_samples,
            sampling_rate=sampling_rate,
            transforms=(self.transforms or []) + resampling,
        )
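A brief usage sketch (assumes an existing 16 kHz Recording named ``rec``); the resample is stored as a transform and only executed inside ``load_audio()``:

resampled = rec.resample(22050)  # no audio is touched yet
print(resampled.num_samples, resampled.duration)
samples = resampled.load_audio()  # the Resample transform runs here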
Example 9
def test_num_frames(
    feature_set,
    feature_level,
):
    sr = 8000
    duration = 12.059
    config = OpenSmileConfig(
        feature_set=feature_set,
        feature_level=feature_level,
        sampling_rate=sr,
        resample=True,
    )
    feature_extractor = OpenSmileExtractor(config=config)

    num_frames = compute_num_frames(duration, feature_extractor.frame_shift, sr)
    num_samples = compute_num_samples(duration, sr)

    signal = np.random.rand(1, num_samples)
    y = feature_extractor.extract(signal, sr)
    assert np.shape(y)[0] == num_frames
Example 10
def memmap_raw_audio(wav_scp,
                     f_memmapped,
                     utt_list,
                     dtype=np.float32,
                     sampling_rate=16000,
                     do_normalize=True):
    '''
        Maps the wav.scp file from Kaldi to a memory-mapped numpy object.
        This allows for fast I/O when creating window minibatches from slices
        of training data.

        input args: wav_scp, f_memmapped
        output:
            utt_lens = {'utt_n': # of samples in utt_n, ...}
            offsets = {'utt_n': utt_n offset in memory mapped numpy file}
            data_shape = (#samples, feature_dimension)
    '''
    import os
    import subprocess
    dataset = os.path.dirname(wav_scp)

    print(dataset)
    if not os.path.exists(os.path.join(dataset, 'reco2dur')):
        p = subprocess.Popen(['./utils/data/get_reco2dur.sh', dataset],
                             stdout=subprocess.PIPE)
        out = p.communicate()

    # Import lhotse and install if not available
    try:
        from lhotse import kaldi, CutSet
    except ImportError:
        from pip._internal import main as pip
        pip(['install', 'lhotse'])
        from lhotse import kaldi, CutSet
    from lhotse.utils import compute_num_samples

    data = kaldi.load_kaldi_data_dir(dataset, sampling_rate)
    cuts = CutSet.from_manifests(data[0], data[1])
    dim = 1

    utt_lens = {}
    for cut in cuts:
        sr = cut.recording.sampling_rate
        for sup in cut.supervisions:
            if sup.id not in utt_list:
                continue
            utt_lens[sup.id.encode()] = compute_num_samples(sup.duration, sr)
    data_shape = (sum(utt_lens.values()), dim)

    f = np.memmap(f_memmapped, mode='w+', dtype=dtype, shape=data_shape)
    offsets = {}
    offset = 0
    for cut in cuts:
        x_ = cut.recording.load_audio().T
        # Mean and variance normalize
        if do_normalize:
            x = (x_ - x_.mean()) / x_.std()
        else:
            x = x_
        sr = cut.recording.sampling_rate
        for i, supervision in enumerate(cut.supervisions):
            k = supervision.id
            print('Utterance ', i, ' : ', k, ' : ', sr)
            start, dur = supervision.start, supervision.duration
            if k not in utt_list:
                continue
            start_sample = compute_num_samples(start, sr)
            end_sample = start_sample + utt_lens[k.encode()]
            m = x[start_sample:end_sample]
            offsets[k.encode()] = offset
            utt_lens[k.encode()] = m.shape[0]
            new_offset = offset + utt_lens[k.encode()]
            f[offset:new_offset, :] = m
            offset = new_offset
        print()
    del f
    return utt_lens, offsets, data_shape
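A hypothetical invocation (paths and utterance IDs are placeholders for illustration):

utt_lens, offsets, data_shape = memmap_raw_audio(
    wav_scp='data/train/wav.scp',
    f_memmapped='data/train/audio.mmap',
    utt_list={'utt1', 'utt2'},
)
# Slice one utterance back out of the memory-mapped file:
f = np.memmap('data/train/audio.mmap', mode='r', dtype=np.float32,
              shape=data_shape)
start = offsets['utt1'.encode()]
x = f[start:start + utt_lens['utt1'.encode()], :]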
Example 11
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might not be handled yet.
    In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    for recording_id, path_or_cmd in recordings.items():
        duration = get_duration(path_or_cmd)
        durations[recording_id] = duration

    recording_set = RecordingSet.from_recordings(
        Recording(id=recording_id,
                  sources=[
                      AudioSource(type='command' if path_or_cmd.endswith('|') else 'file',
                                  channels=[0],
                                  source=path_or_cmd[:-1] if path_or_cmd.endswith('|')
                                  else path_or_cmd)
                  ],
                  sampling_rate=sampling_rate,
                  num_samples=compute_num_samples(durations[recording_id],
                                                  sampling_rate),
                  duration=durations[recording_id])
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [line.strip().split() for line in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(id=segment_id,
                               recording_id=recording_id,
                               start=float(start),
                               duration=float(end) - float(start),
                               channel=0,
                               text=texts[segment_id],
                               language=languages[segment_id],
                               speaker=speakers[segment_id],
                               gender=genders[speakers[segment_id]])
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(type='kaldiio',
                         num_frames=mat.shape[0],
                         num_features=mat.shape[1],
                         frame_shift=frame_shift,
                         sampling_rate=sampling_rate,
                         start=0,
                         duration=mat.shape[0] * frame_shift,
                         storage_type=KaldiReader.name,
                         storage_path=str(feats_scp),
                         storage_key=utt_id,
                         recording_id=supervision_set[utt_id].recording_id
                         if supervision_set is not None else utt_id,
                         channels=0)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp)))
        else:
            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
                          f"frame_shift must be not None. "
                          f"Feature import omitted.")

    return recording_set, supervision_set, feature_set
Example 12
    def _expected_num_samples(self, offset: Seconds,
                              duration: Optional[Seconds]) -> int:
        if offset == 0 and duration is None:
            return self.num_samples
        duration = duration if duration is not None else self.duration - offset
        return compute_num_samples(duration, sampling_rate=self.sampling_rate)
Example 13
def prepare_peoples_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare :class:`~lhotse.RecordingSet` and :class:`~lhotse.SupervisionSet` manifests
    for The People's Speech.

    The metadata is read lazily and written to manifests in a stream to minimize
    the CPU RAM usage. If you want to convert this data to a :class:`~lhotse.CutSet`
    without using excessive memory, we suggest to call it like::

        >>> peoples_speech = prepare_peoples_speech(corpus_dir=..., output_dir=...)
        >>> cuts = CutSet.from_manifests(
        ...     recordings=peoples_speech["recordings"],
        ...     supervisions=peoples_speech["supervisions"],
        ...     output_path=...,
        ...     lazy=True,
        ... )

    :param corpus_dir: Pathlike, the path of the main data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "recordings" and "supervisions" with lazily opened manifests.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    recs_path = output_dir / "peoples-speech_recordings_all.jsonl.gz"
    sups_path = output_dir / "peoples-speech_supervisions_all.jsonl.gz"

    if recs_path.is_file() and sups_path.is_file():
        # Nothing to do: just open the manifests in lazy mode.
        return {
            "recordings": RecordingSet.from_jsonl_lazy(recs_path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sups_path),
        }

    exist = 0
    tot = 0
    err = 0
    with RecordingSet.open_writer(recs_path) as rec_writer, \
            SupervisionSet.open_writer(sups_path) as sup_writer:
        for item in tqdm(
                # Note: People's Speech manifest.json is really a JSONL.
                load_jsonl(corpus_dir / "manifest.json"),
                desc="Converting People's Speech manifest.json to Lhotse manifests",
        ):
            for duration_ms, text, audio_path in zip(
                    *item["training_data"].values()):
                full_path = corpus_dir / audio_path

                tot += 1
                if not full_path.exists():
                    # If we can't find some data, we just continue; the number
                    # of missing items is reported in the warning at the end.
                    continue
                exist += 1

                try:
                    audio_info = info(full_path)
                    duration = duration_ms / 1000
                    r = Recording(
                        id=full_path.stem,
                        sampling_rate=audio_info.samplerate,
                        num_samples=compute_num_samples(
                            duration, audio_info.samplerate),
                        duration=duration,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[0],
                                source=str(full_path),
                            )
                        ],
                    )
                    s = SupervisionSegment(
                        id=r.id,
                        recording_id=r.id,
                        start=0,
                        duration=r.duration,
                        channel=0,
                        text=text,
                        language="English",
                        custom={"session_id": item["identifier"]},
                    )

                    validate_recordings_and_supervisions(recordings=r,
                                                         supervisions=s)

                    rec_writer.write(r)
                    sup_writer.write(s)

                except Exception as e:
                    # If some files are missing (e.g. somebody is working on a subset
                    # of the full 30,000 hours), we won't interrupt processing; we
                    # will only do so for violated assertions.
                    if isinstance(e, AssertionError):
                        raise
                    err += 1
                    continue

    if exist < tot or err > 0:
        warnings.warn(
            f"We finished preparing The People's Speech Lhotse manifests. "
            f"Out of {tot} entries in the original manifest, {exist} audio files "
            f"were found, out of which {err} raised errors during processing."
        )

    return {
        "recordings": rec_writer.open_manifest(),
        "supervisions": sup_writer.open_manifest(),
    }
Example 14
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.

    :param map_string_to_underscores: optional string; when specified, we will replace
        all instances of this string in SupervisionSegment IDs with underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1]
                    if path_or_cmd.endswith("|")
                    else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id], sampling_rate),
            duration=durations[recording_id],
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [sup_string.strip().split() for sup_string in f]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate
                ),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            )
            for segment_id, recording_id, start, end in supervision_segments
        )

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            import kaldi_native_io
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat_shape.num_rows,
                    num_features=mat_shape.num_cols,
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat_shape.num_rows * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None
                    else utt_id,
                    channels=0,
                )
                for utt_id, mat_shape in kaldi_native_io.SequentialMatrixShapeReader(
                    f"scp:{feats_scp}"
                )
            )
        else:
            warnings.warn(
                "Failed to import Kaldi 'feats.scp' to Lhotse: "
                "frame_shift must not be None. "
                "Feature import omitted."
            )

    return recording_set, supervision_set, feature_set
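All of these examples funnel through compute_num_samples to turn a duration in seconds into an integer sample count. A simplified stand-in for that helper (a sketch of the rounding behavior, not the canonical Lhotse source):

from decimal import ROUND_HALF_UP, Decimal

def compute_num_samples(duration: float, sampling_rate: int,
                        rounding: str = ROUND_HALF_UP) -> int:
    # Going through Decimal sidesteps float artifacts when duration * rate
    # lands very close to an integer boundary.
    return int(
        Decimal(round(duration * sampling_rate, ndigits=8)).quantize(
            Decimal(0), rounding=rounding
        )
    )

assert compute_num_samples(12.059, 8000) == 96472  # matches Example 9's setup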