Example #1
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={
                    "text_punct": meta["text_normalized"],
                    "split": clean_or_other
                },
            ))
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)
    return recordings, supervisions
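For context, a minimal driver sketch for this helper. The paths, speaker ID, and the final to_file() calls are illustrative assumptions, not part of the original recipe; ID2SPEAKER and ID2GENDER are module-level lookup tables defined alongside the function in its source module.

from pathlib import Path

# Hypothetical inputs; adjust to the actual corpus layout.
corpus_dir = Path("/data/corpus")
recordings, supervisions = prepare_single_partition(
    raw_manifest_path=corpus_dir / "manifest_clean.json",
    corpus_dir=corpus_dir,
    speaker_id="92",  # assumed speaker ID
    clean_or_other="clean",
)
# Lhotse manifests can be persisted as (optionally gzipped) JSONL.
recordings.to_file(corpus_dir / "recordings_clean.jsonl.gz")
supervisions.to_file(corpus_dir / "supervisions_clean.jsonl.gz")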
Example #2
def test_jsonl_save_load_roundtrip(extension):
    data = [{"some": ["data"]}]
    with NamedTemporaryFile() as f:
        # The temp file only supplies a unique base path; the data is actually
        # written to `path`, which swaps in the requested suffix.
        path = Path(f.name).with_suffix(extension)
        save_to_jsonl(data, path)
        f.flush()
        data_deserialized = list(load_jsonl(path))
    assert data == data_deserialized
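The bare `extension` argument suggests this test is driven by a pytest parametrization that was trimmed from the snippet. A plausible sketch of the missing decorator; the exact extension values are an assumption (plain and gzipped JSONL are the formats load_jsonl/save_to_jsonl commonly handle):

import pytest

@pytest.mark.parametrize("extension", [".jsonl", ".jsonl.gz"])  # assumed values
def test_jsonl_save_load_roundtrip(extension):
    ...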
Example #3
def test_cut_set_decompose_output_dir_doesnt_duplicate_recording():
    c = dummy_cut(0)
    c2 = dummy_cut(0)
    c2.id = "dummy-cut-0001"  # override cut ID, retain identical recording ID as `c`
    cuts = CutSet.from_cuts([c, c2])

    with TemporaryDirectory() as td:
        td = Path(td)
        cuts.decompose(output_dir=td)

        # load_jsonl lazily yields the raw dicts from the (gzipped) JSONL file.
        text = load_jsonl(td / "recordings.jsonl.gz")
        print(list(text))

        recs = load_manifest(td / "recordings.jsonl.gz")
        assert isinstance(recs, RecordingSet)
        # deduplicated recording
        assert len(recs) == 1
        assert recs[0].id == "dummy-recording-0000"
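A sketch of the imports this test relies on; the module paths are my best guess at lhotse's layout (dummy_cut lives in the testing helpers, while load_manifest, CutSet, and RecordingSet are top-level exports):

from pathlib import Path
from tempfile import TemporaryDirectory

from lhotse import CutSet, RecordingSet, load_manifest
from lhotse.serialization import load_jsonl
from lhotse.testing.dummies import dummy_cut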
Example #4
import warnings
from pathlib import Path
from typing import Dict, Union

from soundfile import info
from tqdm import tqdm

from lhotse import (
    AudioSource,
    Recording,
    RecordingSet,
    SupervisionSegment,
    SupervisionSet,
    validate_recordings_and_supervisions,
)
from lhotse.serialization import load_jsonl
from lhotse.utils import Pathlike, compute_num_samples


def prepare_peoples_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare :class:`~lhotse.RecordingSet` and :class:`~lhotse.SupervisionSet` manifests
    for The People's Speech.

    The metadata is read lazily and written to the manifests in a stream to minimize
    CPU RAM usage. If you want to convert this data to a :class:`~lhotse.CutSet`
    without using excessive memory, we suggest calling it like::

        >>> peoples_speech = prepare_peoples_speech(corpus_dir=..., output_dir=...)
        >>> cuts = CutSet.from_manifests(
        ...     recordings=peoples_speech["recordings"],
        ...     supervisions=peoples_speech["supervisions"],
        ...     output_path=...,
        ...     lazy=True,
        ... )

    :param corpus_dir: Pathlike, the path of the main data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "recordings" and "supervisions" with lazily opened manifests.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    recs_path = output_dir / "peoples-speech_recordings_all.jsonl.gz"
    sups_path = output_dir / "peoples-speech_supervisions_all.jsonl.gz"

    if recs_path.is_file() and sups_path.is_file():
        # Nothing to do: just open the manifests in lazy mode.
        return {
            "recordings": RecordingSet.from_jsonl_lazy(recs_path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sups_path),
        }

    exist = 0
    tot = 0
    err = 0
    with RecordingSet.open_writer(recs_path) as rec_writer, SupervisionSet.open_writer(
        sups_path
    ) as sup_writer:
        for item in tqdm(
            # Note: People's Speech manifest.json is really a JSONL.
            load_jsonl(corpus_dir / "manifest.json"),
            desc="Converting People's Speech manifest.json to Lhotse manifests",
        ):
            for duration_ms, text, audio_path in zip(*item["training_data"].values()):
                full_path = corpus_dir / audio_path

                tot += 1
                if not full_path.exists():
                    # If we can't find some data, we'll just skip it and report
                    # the number of missing items at the end.
                    continue
                exist += 1

                try:
                    audio_info = info(full_path)
                    duration = duration_ms / 1000
                    r = Recording(
                        id=full_path.stem,
                        sampling_rate=audio_info.samplerate,
                        num_samples=compute_num_samples(
                            duration, audio_info.samplerate
                        ),
                        duration=duration,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[0],
                                source=str(full_path),
                            )
                        ],
                    )
                    s = SupervisionSegment(
                        id=r.id,
                        recording_id=r.id,
                        start=0,
                        duration=r.duration,
                        channel=0,
                        text=text,
                        language="English",
                        custom={"session_id": item["identifier"]},
                    )

                    validate_recordings_and_supervisions(recordings=r, supervisions=s)

                    rec_writer.write(r)
                    sup_writer.write(s)

                except Exception as e:
                    # If some files fail to process (e.g. somebody is working on a
                    # subset of the full 30,000 hours), we won't interrupt the run;
                    # we only re-raise violated assertions.
                    if isinstance(e, AssertionError):
                        raise
                    err += 1
                    continue

    if exist < tot or err > 0:
        warnings.warn(
            f"Finished preparing The People's Speech Lhotse manifests. "
            f"Out of {tot} entries in the original manifest, {exist} audio files "
            f"were found, of which {err} failed during processing."
        )

    return {
        "recordings": rec_writer.open_manifest(),
        "supervisions": sup_writer.open_manifest(),
    }
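Following the docstring's suggestion, a minimal end-to-end driver (the paths are placeholders):

from lhotse import CutSet

manifests = prepare_peoples_speech(
    corpus_dir="/data/peoples_speech",
    output_dir="/data/manifests",
)
# Combine the lazily opened manifests into a CutSet without loading
# everything into memory.
cuts = CutSet.from_manifests(
    recordings=manifests["recordings"],
    supervisions=manifests["supervisions"],
    output_path="/data/manifests/peoples-speech_cuts_all.jsonl.gz",
    lazy=True,
)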