Example #1
0
def validate_recordings_and_supervisions(
    recordings: Union[RecordingSet, Recording],
    supervisions: Union[SupervisionSet, SupervisionSegment],
    read_data: bool = False,
) -> None:
    """
    Validate the recording and supervision manifests separately,
    and then check if they are consistent with each other.

    This method will emit warnings, instead of errors, when some recordings or supervisions
    are missing their counterparts.
    These items will be discarded by default when creating a CutSet.
    """
    if isinstance(recordings, Recording):
        recordings = RecordingSet.from_recordings([recordings])
    if isinstance(supervisions, SupervisionSegment):
        supervisions = SupervisionSet.from_segments([supervisions])

    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))
    if supervisions.is_lazy:
        supervisions = SupervisionSet.from_segments(iter(supervisions))

    validate(recordings, read_data=read_data)
    validate(supervisions)
    # Errors
    for s in supervisions:
        r = recordings[s.recording_id]
        assert -1e-3 <= s.start <= s.end <= r.duration + 1e-3, (
            f"Supervision {s.id}: exceeded the bounds of its corresponding recording "
            f"(supervision spans [{s.start}, {s.end}]; recording spans [0, {r.duration}])"
        )
        assert s.channel in r.channel_ids, (
            f"Supervision {s.id}: channel {s.channel} does not exist in its corresponding Recording "
            f"(recording channels: {r.channel_ids})"
        )
    # Warnings
    recording_ids = frozenset(r.id for r in recordings)
    recording_ids_in_sups = frozenset(s.recording_id for s in supervisions)
    only_in_recordings = recording_ids - recording_ids_in_sups
    if only_in_recordings:
        logging.warning(
            f"There are {len(only_in_recordings)} recordings that "
            f"do not have any corresponding supervisions in the SupervisionSet."
        )
    only_in_supervisions = recording_ids_in_sups - recording_ids
    if only_in_supervisions:
        logging.warning(
            f"There are {len(only_in_supervisions)} supervisions that "
            f"are missing their corresponding recordings in the RecordingSet."
        )
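# A minimal usage sketch for the function above (not part of the original snippet):
# the Recording below is a stand-in with a placeholder path, which is never opened
# because read_data=False.
from lhotse import AudioSource, Recording, SupervisionSegment

_rec = Recording(
    id="rec1",
    sources=[AudioSource(type="file", channels=[0], source="dummy.wav")],
    sampling_rate=16000,
    num_samples=160000,
    duration=10.0,
)
_sup = SupervisionSegment(
    id="rec1-sup0", recording_id="rec1", start=0.0, duration=10.0, channel=0
)
# Single items are accepted and wrapped into RecordingSet/SupervisionSet internally.
validate_recordings_and_supervisions(_rec, _sup, read_data=False)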
Example #2
0
def test_serialization():
    audio_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channels=[1],
                    source='cat test/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5
        )
    ])
    with NamedTemporaryFile() as f:
        audio_set.to_yaml(f.name)
        deserialized = RecordingSet.from_yaml(f.name)
    assert deserialized == audio_set
Example #3
0
def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = False).
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    groups = []
    name_to_text = {p.stem.split('-')[0]: p for p in text_paths}
    for ap in audio_paths:
        name = ap.stem.replace('sw0', 'sw')
        groups.append({'audio': ap, 'text-0': name_to_text[f'{name}A'], 'text-1': name_to_text[f'{name}B']})

    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(group['audio'], relative_path_depth=None if absolute_paths else 3)
        for group in groups
    )
    supervisions = SupervisionSet.from_segments(chain.from_iterable(
        make_segments(
            transcript_path=group[f'text-{channel}'],
            recording=recording,
            channel=channel,
            omit_silence=omit_silence
        )
        for group, recording in zip(groups, recordings)
        for channel in [0, 1]
    ))

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
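# A hypothetical usage sketch (not from the original source; the LDC and output
# paths are placeholders): prepare the Switchboard manifests, letting the
# transcripts be downloaded automatically since transcripts_dir is omitted.
swbd = prepare_switchboard(
    audio_dir="corpora/LDC97S62",
    output_dir="swbd_manifests",
    absolute_paths=True,
)
print(len(swbd["recordings"]), "recordings,", len(swbd["supervisions"]), "supervisions")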
Example #4
0
def test_serialization(format, compressed):
    recording_set = RecordingSet.from_recordings([
        Recording(
            id='x',
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source='test/fixtures/mono_c0.wav'
                ),
                AudioSource(
                    type='command',
                    channels=[1],
                    source='cat test/fixtures/mono_c1.wav'
                )
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5
        )
    ])
    with NamedTemporaryFile(suffix='.gz' if compressed else '') as f:
        if format == 'yaml':
            recording_set.to_yaml(f.name)
            deserialized = RecordingSet.from_yaml(f.name)
        if format == 'json':
            recording_set.to_json(f.name)
            deserialized = RecordingSet.from_json(f.name)
    assert deserialized == recording_set
Example #5
0
def prepare_audio_single(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in tqdm(audio_paths, desc="Preparing audio"):
        session_name = audio_path.parts[-2]
        if audio_path.suffix == ".wav":
            audio_sf = sf.SoundFile(str(audio_path))
            num_frames = audio_sf.frames
            num_channels = audio_sf.channels
            samplerate = audio_sf.samplerate
        else:
            audio_sf, samplerate = read_sph(audio_path)
            num_channels, num_frames = audio_sf.shape
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=list(range(num_channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=samplerate,
                num_samples=num_frames,
                duration=num_frames / samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #6
0
def to_manifest(items: Iterable[ManifestItem]) -> Optional[Manifest]:
    """
    Take an iterable of Lhotse data types such as Recording, SupervisionSegment or Cut,
    and create a manifest of the corresponding type. When the iterable is empty, returns None.
    """
    items = iter(items)
    try:
        first_item = next(items)
    except StopIteration:
        return None
    items = chain([first_item], items)

    if isinstance(first_item, Recording):
        return RecordingSet.from_recordings(items)
    if isinstance(first_item, SupervisionSegment):
        return SupervisionSet.from_segments(items)
    if isinstance(first_item, (Cut, MixedCut)):
        return CutSet.from_cuts(items)
    if isinstance(first_item, Features):
        raise ValueError(
            "FeatureSet generic construction from iterable is not possible, as the config information "
            "would have been lost. Call FeatureSet.from_features() directly instead."
        )

    raise ValueError(f"Unknown type of manifest item: {first_item}")
Example #7
0
def prepare_audio_grouped(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    recordings = []
    for session_name, channel_paths in channel_wavs.items():
        audio_sf = sf.SoundFile(str(channel_paths[0]))

        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(type="file",
                                channels=[idx],
                                source=str(audio_path))
                    for idx, audio_path in enumerate(sorted(channel_paths))
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #8
0
def trim_supervisions_to_recordings(
    recordings: RecordingSet, supervisions: SupervisionSet
) -> SupervisionSet:
    """
    Return a new :class:`~lhotse.supervision.SupervisionSet` with supervisions that are
    not exceeding the duration of their corresponding :class:`~lhotse.audio.Recording`.
    """
    if recordings.is_lazy:
        recordings = RecordingSet.from_recordings(iter(recordings))

    sups = []
    removed = 0
    trimmed = 0
    for s in supervisions:
        end = recordings[s.recording_id].duration
        if s.start > end:
            removed += 1
            continue
        if s.end > end:
            trimmed += 1
            s = s.trim(recordings[s.recording_id].duration)
        sups.append(s)
    if removed:
        logging.warning(
            f"Removed {removed} supervisions starting after the end of the recording."
        )
    if trimmed:
        logging.warning(
            f"Trimmed {trimmed} supervisions exceeding the end of the recording."
        )
    return SupervisionSet.from_segments(sups)
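# A usage sketch with assumed setup (not from the original source): the "long"
# supervision overshoots the 10 s recording and gets trimmed, while "late" starts
# after the recording ends and is dropped; "dummy.wav" is a placeholder and is never read.
from lhotse import AudioSource, Recording, RecordingSet, SupervisionSegment, SupervisionSet

_recs = RecordingSet.from_recordings([
    Recording(
        id="rec1",
        sources=[AudioSource(type="file", channels=[0], source="dummy.wav")],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
    )
])
_sups = SupervisionSet.from_segments([
    SupervisionSegment(id="ok", recording_id="rec1", start=0.0, duration=5.0),
    SupervisionSegment(id="long", recording_id="rec1", start=8.0, duration=5.0),
    SupervisionSegment(id="late", recording_id="rec1", start=11.0, duration=1.0),
])
assert len(trim_supervisions_to_recordings(_recs, _sups)) == 2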
Example #9
0
def test_cut_set_reverb_rir_doesnt_duplicate_transforms(cut_with_supervision, rir):
    rirs = RecordingSet.from_recordings([rir])
    cuts = CutSet.from_cuts(
        [cut_with_supervision, cut_with_supervision.with_id("other-id")]
    )
    cuts_vp = cuts.reverb_rir(rir_recordings=rirs)
    for cut in cuts_vp:
        # This prevents a bug regression where multiple cuts referencing the same recording would
        # attach transforms to the same manifest
        assert len(cut.recording.transforms) == 1
Example #10
0
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the 1997 English Broadcast News corpus.
    We create three manifests: one with recordings, one with segments supervisions,
    and one with section supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = False).
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    audio_paths = check_and_rglob(audio_dir, "*.sph")
    sgml_paths = check_and_rglob(transcripts_dir, "*.sgml")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths
    )

    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive implementation;
    # on some systems the recursion limit needs to be raised for this to work.
    with recursion_limit(5000):
        supervisions_list = [
            make_supervisions(p, r) for p, r in zip(sgml_paths, recordings)
        ]
    section_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups["sections"] for sups in supervisions_list)
    )
    segment_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(sups["segments"] for sups in supervisions_list)
    )

    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "broadcast-news_recordings_all.jsonl.gz")
        section_supervisions.to_file(
            output_dir / "broadcast-news_sections_all.jsonl.gz"
        )
        segment_supervisions.to_file(
            output_dir / "broadcast-news_segments_all.jsonl.gz"
        )

    return {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }
Example #11
0
def DummyManifest(type_: Type, *, begin_id: int, end_id: int) -> Manifest:
    if type_ == RecordingSet:
        return RecordingSet.from_recordings(
            dummy_recording(idx) for idx in range(begin_id, end_id))
    if type_ == SupervisionSet:
        return SupervisionSet.from_segments(
            dummy_supervision(idx) for idx in range(begin_id, end_id))
    if type_ == FeatureSet:
        # noinspection PyTypeChecker
        return FeatureSet.from_features(
            dummy_features(idx) for idx in range(begin_id, end_id))
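# A usage sketch (assumes the dummy_recording/dummy_supervision/dummy_features
# helpers from the same test-utility module): the ids span [begin_id, end_id).
recs = DummyManifest(RecordingSet, begin_id=0, end_id=10)
sups = DummyManifest(SupervisionSet, begin_id=0, end_id=5)
assert len(recs) == 10 and len(sups) == 5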
Example #12
0
def test_cut_set_reverb_rir(libri_cut_set, rir, affix_id):
    rirs = RecordingSet.from_recordings([rir])
    perturbed_rvb_cs = libri_cut_set.reverb_rir(rirs, affix_id=affix_id)
    for original, perturbed_rvb in zip(libri_cut_set, perturbed_rvb_cs):
        if affix_id:
            assert original.id != perturbed_rvb.id
            assert perturbed_rvb.id.endswith("_rvb")
        else:
            assert original.id == perturbed_rvb.id
        assert original.sampling_rate == perturbed_rvb.sampling_rate
        assert original.num_samples == perturbed_rvb.num_samples
        assert original.load_audio().shape == perturbed_rvb.load_audio().shape
Example #13
0
def dummy_recording_set():
    return RecordingSet.from_recordings([
        Recording(
            id="rec1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10,
            sources=[
                AudioSource(type="file", channels=[0], source="dummy.wav")
            ],
        )
    ])
Example #14
0
def prepare_ljspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"
    recordings = []
    supervisions = []
    with open(metadata_csv_path) as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
Example #15
0
def dummy_recording_set():
    return RecordingSet.from_recordings([
        Recording(id='rec1',
                  sampling_rate=16000,
                  num_samples=160000,
                  duration=10,
                  sources=[
                      AudioSource(type='file',
                                  channels=[0],
                                  source='dummy.wav')
                  ])
    ])
Example #16
0
def prepare_yesno(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It's expected to
        contain wave files with the pattern x_x_x_x_x_x_x_x.wav, where there
        are 8 x's and each x is either 1 or 0.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is either "train" or "test", and the value is
        Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    wave_files = list(corpus_dir.glob("*.wav"))
    assert len(wave_files) == 60

    wave_files.sort()
    train_set = wave_files[::2]
    test_set = wave_files[1::2]

    assert len(train_set) == 30
    assert len(test_set) == 30

    manifests = defaultdict(dict)
    for name, dataset in zip(["train", "test"], [train_set, test_set]):
        recordings, supervisions = _prepare_dataset(dataset)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{name}.json")
            recording_set.to_json(output_dir / f"recordings_{name}.json")

        manifests[name] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
Example #17
0
def prepare_gigaspeech(
        gigaspeech: Any,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    if is_module_available('speechcolab'):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            'To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab')

    subsets = ('{XL}', '{DEV}', '{TEST}') if dataset_parts == 'auto' else dataset_parts

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(dataset_parts=dataset_parts, output_dir=output_dir, suffix='jsonl')
        if maybe_manifests is not None:
            return maybe_manifests

    manifests = defaultdict(dict)
    with ThreadPoolExecutor(num_jobs) as ex:
        for part in subsets:
            futures = []
            for audio in tqdm(gigaspeech.audios(part), desc='Distributing tasks', leave=False):
                futures.append(ex.submit(parse_utterance, audio, gigaspeech.root_path))

            recordings = []
            supervisions = []
            for future in tqdm(futures, desc='Processing', leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segments = result
                recordings.append(recording)
                supervisions += segments

            manifests[part] = {
                'recordings': RecordingSet.from_recordings(recordings),
                'supervisions': SupervisionSet.from_segments(supervisions)
            }

            if output_dir is not None:
                manifests[part]['recordings'].to_file(output_dir / f'recordings_{part}.jsonl')
                manifests[part]['supervisions'].to_file(output_dir / f'supervisions_{part}.jsonl')

    return dict(manifests)
Example #18
0
def dummy_recording_set_lazy():
    with NamedTemporaryFile(suffix=".jsonl.gz") as f:
        recs = RecordingSet.from_recordings([
            Recording(
                id="rec1",
                sampling_rate=16000,
                num_samples=160000,
                duration=10,
                sources=[
                    AudioSource(type="file", channels=[0], source="dummy.wav")
                ],
            )
        ])
        recs.to_file(f.name)
        f.flush()
        yield RecordingSet.from_jsonl_lazy(f.name)
Example #19
0
def prepare_audio_grouped(
    audio_paths: List[Pathlike],
    channel_to_idx_map: Optional[Dict[str, Dict[str, int]]] = None,
) -> RecordingSet:

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-2], audio_paths)

    if channel_to_idx_map is None:
        channel_to_idx_map = defaultdict(dict)
    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(),
                                            desc="Preparing audio"):
        if session_name not in channel_to_idx_map:
            channel_to_idx_map[session_name] = {
                c: idx
                for idx, c in enumerate(["chanE", "chanF", "chan6", "chan7"])
            }
        audio_sf, samplerate = read_sph(channel_paths[0])

        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=[
                            channel_to_idx_map[session_name][audio_path.stem]
                        ],
                        source=str(audio_path),
                    ) for audio_path in sorted(channel_paths)
                    if audio_path.stem in channel_to_idx_map[session_name]
                ],
                sampling_rate=samplerate,
                num_samples=audio_sf.shape[1],
                duration=audio_sf.shape[1] / samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #20
0
def prepare_audio_grouped(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby

    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    recordings = []
    for session_name, channel_paths in tqdm(channel_wavs.items(),
                                            desc="Processing audio files"):
        audio_sf = sf.SoundFile(str(channel_paths[0]))

        sources = []
        all_mono = True
        for idx, audio_path in enumerate(sorted(channel_paths)):
            audio = sf.SoundFile(str(audio_path))
            if audio.channels > 1:
                logging.warning(
                    f"Skipping recording {session_name} since it has a stereo channel"
                )
                all_mono = False
                break
            sources.append(
                AudioSource(type="file",
                            channels=[idx],
                            source=str(audio_path)))

        if not all_mono:
            continue

        recordings.append(
            Recording(
                id=session_name,
                sources=sources,
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #21
0
def prepare_audio_single(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in tqdm(audio_paths, desc="Processing audio files"):
        session_name = audio_path.parts[-3]
        audio_sf = sf.SoundFile(str(audio_path))
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type="file",
                        channels=list(range(audio_sf.channels)),
                        source=str(audio_path),
                    )
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #22
0
def prepare_audio_single(audio_paths: List[Pathlike]) -> RecordingSet:
    import soundfile as sf

    recordings = []
    for audio_path in audio_paths:
        session_name = audio_path.parts[-3]
        audio_sf = sf.SoundFile(str(audio_path))
        recordings.append(
            Recording(
                id=session_name,
                sources=[
                    AudioSource(type='file',
                                channels=list(range(audio_sf.channels)),
                                source=str(audio_path))
                ],
                sampling_rate=audio_sf.samplerate,
                num_samples=audio_sf.frames,
                duration=audio_sf.frames / audio_sf.samplerate,
            ))
    return RecordingSet.from_recordings(recordings)
Example #23
0
def prepare_librispeech(
        corpus_dir: Pathlike,
        dataset_parts: Optional[Tuple[str]] = dataset_parts_mini,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace('-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=metadata[idx].audio_info.length / metadata[idx].audio_info.rate
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='English',
                speaker=re.sub(r'-.*', r'', idx),
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
Example #24
0
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might not be handled yet.
    In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=int(durations[recording_id] * sampling_rate),
            duration=durations[recording_id])
        for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / 'segments'
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [l.strip().split() for l in f]

        texts = load_kaldi_text_mapping(path / 'text')
        speakers = load_kaldi_text_mapping(path / 'utt2spk')
        genders = load_kaldi_text_mapping(path / 'spk2gender')
        languages = load_kaldi_text_mapping(path / 'utt2lang')

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(id=segment_id,
                               recording_id=recording_id,
                               start=float(start),
                               duration=float(end) - float(start),
                               channel=0,
                               text=texts[segment_id],
                               language=languages[segment_id],
                               speaker=speakers[segment_id],
                               gender=genders[speakers[segment_id]])
            for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / 'feats.scp'
    if feats_scp.exists() and is_module_available('kaldiio'):
        if frame_shift is not None:
            import kaldiio
            from lhotse.features.io import KaldiReader
            feature_set = FeatureSet.from_features(
                Features(type='kaldiio',
                         num_frames=mat.shape[0],
                         num_features=mat.shape[1],
                         frame_shift=frame_shift,
                         sampling_rate=sampling_rate,
                         start=0,
                         duration=mat.shape[0] * frame_shift,
                         storage_type=KaldiReader.name,
                         storage_path=str(feats_scp),
                         storage_key=utt_id,
                         recording_id=supervision_set[utt_id].recording_id
                         if supervision_set is not None else utt_id,
                         channels=0)
                for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp)))
        else:
            warnings.warn(f"Failed to import Kaldi 'feats.scp' to Lhotse: "
                          f"frame_shift must be not None. "
                          f"Feature import omitted.")

    return recording_set, supervision_set, feature_set
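# A usage sketch (the data directory is a placeholder): convert a standard Kaldi
# data dir (wav.scp, reco2dur, and optionally segments/text/utt2spk) into Lhotse
# manifests and persist them.
recording_set, supervision_set, feature_set = load_kaldi_data_dir(
    "data/train", sampling_rate=16000, frame_shift=0.01
)
recording_set.to_file("train_recordings.jsonl.gz")
if supervision_set is not None:
    supervision_set.to_file("train_supervisions.jsonl.gz")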
Example #25
0
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of the dataset to prepare; "all" prepares
                          all the parts.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: Number of workers used to extract the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
             the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts

    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech : {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}

    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file: {raw_manifests_path}"
    logging.info(f"Loading raw manifests from: {raw_manifests_path}")
    raw_manifests = json.load(open(raw_manifests_path, "r", encoding="utf8"))

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
                ex.map(
                    parse_utterance,
                    raw_manifests["audios"],
                    repeat(corpus_dir),
                    repeat(subsets),
                ),
                desc="Processing WenetSpeech JSON entries",
        ):
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(
                manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(
                manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(recordings=recordings,
                                             supervisions=supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir / f"supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir / f"recordings_{sub}.jsonl.gz")

        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
Example #26
0
def prepare_cmu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CMU Kids corpus. The prepared supervisions contain the
    prompt text as the `text`. Additionally, in the `custom` tag, we provide the
    following data: speaker grade/age, population where the speaker came from
    (SIM95/FP), spoken transcript, and transcription bin (1/2).

    Here, bin `1` means utterances where the speaker followed the prompt and no
    noise/mispronunciation is present, and `2` refers to noisy utterances.

    The tag `spoken_transcript` is the transcription that was actually spoken. It
    contains noise tags and phone transcription in case the pronunciation differed
    from that in CMU Dict.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir, str) else corpus_dir
    corpus_dir = corpus_dir.parent if corpus_dir.stem == "cmu_kids" else corpus_dir

    recordings = []
    supervisions = []

    # Get transcripts for all utterances
    utterances = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'sentence.tbl', 'r') as f:
        for line in f:
            utt, count, text = line.strip().split('\t')
            utterances[utt] = text

    # Get speaker metadata
    speaker_info = {}
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'speaker.tbl', 'r') as f:
        for _ in range(2):
            next(f)
        for line in f:
            # ID    LOC     GR/AGE  TOT     BIN2
            # fabm    SUM95   3/9     100     62
            # facs    SUM95   2/8     90      55
            spk, pop, gr_age, _, _ = line.strip().split('\t')
            grade, age = gr_age.split('/')
            speaker_info[spk] = (pop, grade, age)

    # Iterate through all transcriptions and add to supervisions
    with open(corpus_dir / 'cmu_kids' / 'tables' / 'transcrp.tbl', 'r') as f:
        for line in f:
            trn_id, transcript = line.strip().split(maxsplit=1)
            spk = trn_id[0:4]
            utt = trn_id[4:7]
            bin = int(trn_id[7])
            pop, grade, age = speaker_info[spk]

            audio_path = (corpus_dir / 'cmu_kids' / 'kids' / spk / 'signal' /
                          f'{trn_id}.sph')
            recording = Recording.from_file(
                audio_path, relative_path_depth=None if absolute_paths else 3)
            recordings.append(recording)

            supervisions.append(
                SupervisionSegment(
                    id=trn_id,
                    recording_id=trn_id,
                    start=0,
                    duration=recording.duration,
                    speaker=spk,
                    gender="Male" if spk[0] == 'm' else "Female",
                    language='English',
                    text=utterances[utt],
                    custom={
                        'speaker_grade': grade if grade != "NA" else None,
                        'speaker_age': int(age) if age != "NA" else None,
                        'speaker_population': pop,
                        'bin': bin,
                        'spoken_transcript': transcript,
                    },
                ))

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / 'recordings.json')
        manifests["supervisions"].to_json(output_dir / 'supervisions.json')

    return manifests
Example #27
0
def prepare_aishell(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt'
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            idx_transcript = line.split()
            transcript_dict[idx_transcript[0]] = ' '.join(idx_transcript[1:])
    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        wav_path = corpus_dir / 'data_aishell' / 'wav' / f'{part}'
        for audio_path in wav_path.rglob('**/*.wav'):
            idx = audio_path.stem
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f'No transcript: {idx}')
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f'No such file: {audio_path}')
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            segment = SupervisionSegment(id=idx,
                                         recording_id=idx,
                                         start=0.0,
                                         duration=recording.duration,
                                         channel=0,
                                         language='Chinese',
                                         speaker=speaker,
                                         text=text.strip())
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    return manifests
Example #28
0
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*"))
    elif dataset_parts == "auto":
        dataset_parts = (set(LIBRISPEECH).union(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")))
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir)

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(
                    f"LibriSpeech subset: {part} already prepared - skipping.")
                continue
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(part_path.rglob("*.trans.txt"),
                                   desc="Distributing tasks",
                                   leave=False):
                alignments = {}
                ali_path = trans_path.parent / (trans_path.stem.split(".")[0] +
                                                ".alignment.txt")
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
                # "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line,
                                      alignments))

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set,
                                                 supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir /
                                        f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
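# A hypothetical usage sketch (corpus path is a placeholder): prepare a single
# part with several workers; cached manifests in output_dir are reused on
# subsequent runs.
from lhotse import CutSet

libri = prepare_librispeech(
    "corpora/LibriSpeech",
    dataset_parts="dev-clean",
    output_dir="librispeech_manifests",
    num_jobs=4,
)
dev_cuts = CutSet.from_manifests(**libri["dev-clean"])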
Example #29
0
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile(r'Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile(r'Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile(r'Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+")
    usma_native_path_pattern = re.compile(r'usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)

    transcripts = defaultdict(dict)
    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir,
                                          heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='answers',
                                             utterance_id=utt_id,
                                             transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='heroico-recitations',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            trans_id = '-'.join(['heroico-recitations-repeats', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(f'No such file: {wav_file}')

    audio_files = list(speech_dir.rglob('*.wav'))

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # info: metadata of the audio file (e.g. sample rate, number of frames, duration ...)
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join(
                [uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.samplerate),
                      num_samples=metadata[idx].audio_info.frames,
                      duration=metadata[idx].audio_info.duration)
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='Spanish',
                               speaker=idx.split('-')[-2],
                               text=metadata[idx].text)
            for idx in audio.recordings)

        validate_recordings_and_supervisions(audio, supervision)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
Example #30
0
def prepare_gale_mandarin(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    segment_words: Optional[bool] = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for GALE Mandarin Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param segment_words: Use `jieba` package to perform word segmentation (default = False)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "The same number of audio and transcript corpus directories must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we make a dict to avoid adding them
    # twice.
    audio_paths = defaultdict(
        Path,
        {
            p.stem: p
            for p in chain.from_iterable(
                [
                    check_and_rglob(dir, ext, strict=False)
                    for dir in audio_dirs
                    for ext in ["*.wav", "*.flac"]
                ]
            )
        },
    )
    transcript_paths = chain.from_iterable(
        [check_and_rglob(dir, "*.tdf") for dir in transcript_dirs]
    )

    logging.info("Preparing recordings manifest")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths, segment_words=segment_words)
    ).filter(lambda s: s.recording_id in audio_paths)

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    TEST = [
        line.decode("utf-8").strip() for url in TEST_FILE_URLS for line in urlopen(url)
    ]

    manifests = defaultdict(dict)
    manifests["dev"] = {
        "recordings": recordings.filter(lambda r: r.id in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id in TEST),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id not in TEST),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "dev"]:
            manifests[part]["recordings"].to_file(
                output_dir / f"gale-mandarin_recordings_{part}.jsonl.gz"
            )
            manifests[part]["supervisions"].to_file(
                output_dir / f"gale-mandarin_supervisions_{part}.jsonl.gz"
            )

    return manifests