Code Example #1
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={
                    "text_punct": meta["text_normalized"],
                    "split": clean_or_other
                },
            ))
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)
    return recordings, supervisions
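A brief usage sketch (not part of the original recipe): invoking prepare_single_partition with hypothetical paths and speaker ID; load_jsonl, ID2SPEAKER and ID2GENDER are helpers assumed to be defined alongside the function.

from pathlib import Path

# All inputs below are hypothetical placeholders.
recordings, supervisions = prepare_single_partition(
    raw_manifest_path=Path("manifests/train_clean.jsonl"),
    corpus_dir=Path("corpus_root"),
    speaker_id="92",
    clean_or_other="clean",
)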
Code Example #2
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "spontaneous".
        Each hold another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav)
        for wav in (corpus_dir / "wav48").rglob("*.wav"))
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Code Example #3
File: tedlium.py Project: AmirHussein96/lhotse
def prepare_tedlium(
    tedlium_root: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Path where the manifests should be written; can be omitted to skip writing.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph"))
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM...")
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        ))
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir /
                                 f"tedlium_supervisions_{split}.jsonl.gz")

    return corpus
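The split-level manifests returned above pair naturally with lhotse's CutSet. A minimal sketch, assuming the standard CutSet.from_manifests API and a hypothetical corpus path:

from lhotse import CutSet

corpus = prepare_tedlium("TEDLIUM_release-3")
cuts = {
    split: CutSet.from_manifests(**manifests)  # keys match: recordings, supervisions
    for split, manifests in corpus.items()
}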
Code Example #4
def test_resample_opus():
    # Test that reading OPUS files after resampling
    # does not raise an exception.
    r = Recording.from_file("test/fixtures/mono_c0.opus")
    r.load_audio()
    r1 = r.resample(24000)
    r1.load_audio()
Code Example #5
def test_audio_caching_disabled_works():
    lhotse.set_caching_enabled(False)  # Disable caching.

    np.random.seed(89)  # Reproducibility.

    # Prepare two different waveforms.
    noise1 = np.random.rand(1, 32000).astype(np.float32)
    noise2 = np.random.rand(1, 32000).astype(np.float32)
    # Sanity check -- the noises are different
    assert np.abs(noise1 - noise2).sum() != 0

    # Save the first waveform in a file.
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(noise1), sample_rate=16000)
        recording = Recording.from_file(f.name)

        # Read the audio -- should be equal to noise1.
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise1)

        # Save noise2 to the same location.
        torchaudio.save(f.name, torch.from_numpy(noise2), sample_rate=16000)

        # Read the audio again -- it should be equal to noise2,
        # because caching is disabled (no stale waveform is served).
        audio = recording.load_audio()
        np.testing.assert_almost_equal(audio, noise2)
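For contrast, a sketch of the caching-enabled path (an assumption mirroring the test above with the flag flipped): the second read should then return the stale cached waveform.

import numpy as np
import torch
import torchaudio
from tempfile import NamedTemporaryFile

import lhotse
from lhotse import Recording

lhotse.set_caching_enabled(True)  # flag flipped relative to the test above

noise1 = np.random.rand(1, 32000).astype(np.float32)
noise2 = np.random.rand(1, 32000).astype(np.float32)

with NamedTemporaryFile(suffix=".wav") as f:
    torchaudio.save(f.name, torch.from_numpy(noise1), sample_rate=16000)
    recording = Recording.from_file(f.name)
    audio = recording.load_audio()  # first read populates the cache

    # Overwrite the file; with caching on, we expect the old audio back.
    torchaudio.save(f.name, torch.from_numpy(noise2), sample_rate=16000)
    audio = recording.load_audio()
    np.testing.assert_almost_equal(audio, noise1)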
Code Example #6
File: voxceleb.py Project: glynpu/lhotse
def _process_file(
    file_path: Pathlike,
    speaker_metadata: Dict[str, SpeakerMetadata],
) -> Tuple[Recording, SupervisionSegment]:
    """
    Process a single wav file and return a Recording and a SupervisionSegment.
    """
    speaker_id = file_path.parent.parent.stem
    session_id = file_path.parent.stem
    uttid = file_path.stem
    recording_id = f"{speaker_id}-{session_id}-{uttid}"
    recording = Recording.from_file(file_path, recording_id=recording_id)
    supervision = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        speaker=speaker_id,
        gender=speaker_metadata[speaker_id].gender,
        start=0.0,
        duration=recording.duration,
        custom={
            "speaker_name": speaker_metadata[speaker_id].name,
            "nationality": speaker_metadata[speaker_id].nationality,
            "split": speaker_metadata[speaker_id].split,
        },
    )
    return recording, supervision
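A sketch of fanning _process_file out over a corpus in parallel; the wav layout and the speaker_metadata dict are assumptions (the real recipe builds them from the VoxCeleb metadata files).

from concurrent.futures import ProcessPoolExecutor
from functools import partial
from pathlib import Path

from lhotse import RecordingSet, SupervisionSet

wav_paths = list(Path("voxceleb1/wav").rglob("*.wav"))  # hypothetical layout
# speaker_metadata: Dict[str, SpeakerMetadata], assumed to be built elsewhere.
with ProcessPoolExecutor(max_workers=4) as ex:
    results = list(
        ex.map(partial(_process_file, speaker_metadata=speaker_metadata), wav_paths)
    )

recordings = RecordingSet.from_recordings(r for r, _ in results)
supervisions = SupervisionSet.from_segments(s for _, s in results)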
Code Example #7
def test_cut_load_custom_recording_pad_left():
    sampling_rate = 16000
    duration = 52.4
    audio = np.random.randn(1, compute_num_samples(
        duration, sampling_rate)).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute
        #       or a "load_my_favorite_song()" method.
        #       We are dynamically extending it.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording

        cut_pad = cut.pad(duration=60.0, direction="left")

        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60

        np.testing.assert_almost_equal(0, restored_audio[:, :-audio.shape[1]])
        np.testing.assert_almost_equal(audio, restored_audio[:,
                                                             -audio.shape[1]:])
Code Example #8
def process_file(
    file_path: Pathlike,
    transcript_dict: Dict[str, str],
) -> Tuple[Recording, SupervisionSegment]:
    """
    Process a single wav file and return a Recording and a SupervisionSegment.
    """
    try:
        speaker_id = Path(file_path).stem
        uttid = speaker_id
        recording_id = speaker_id
        recording = Recording.from_file(file_path)
        text = transcript_dict[uttid]
        supervision = SupervisionSegment(
            id=recording_id,
            recording_id=recording_id,
            speaker=speaker_id,
            start=0.0,
            duration=recording.duration,
            channel=0,
            language="Chinese",
            text=text.strip(),
        )
        return recording, supervision
    except Exception as e:
        logging.warning(f"process_file error: {file_path} ({e})")
        return None, None
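Sketch of driving process_file over a corpus and skipping failures; the transcript format ("<utt-id> <text>" per line) and the paths are assumptions.

from pathlib import Path

from lhotse import RecordingSet, SupervisionSet

transcript_dict = {}
for line in Path("transcripts.txt").read_text().splitlines():  # hypothetical file
    uttid, text = line.split(maxsplit=1)
    transcript_dict[uttid] = text

results = [process_file(p, transcript_dict) for p in Path("wavs").rglob("*.wav")]
recordings = RecordingSet.from_recordings(r for r, _ in results if r is not None)
supervisions = SupervisionSet.from_segments(s for _, s in results if s is not None)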
Code Example #9
def prepare_tedlium(
        tedlium_root: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Path where the manifests should be written; can be omitted to skip writing.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), f'Mismatch: found {len(recordings)} ' \
                                             f'sphere files and {len(stms)} STM files. ' \
                                             f'You might be missing some parts of TEDLIUM...'
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')

    return corpus
Code Example #10
def test_cut_with_audio_move_to_memory():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.5).drop_recording()
    cut.recording = Recording.from_file(path)

    memory_cut = cut.move_to_memory()

    np.testing.assert_equal(memory_cut.load_audio(), cut.load_audio())
Code Example #11
File: cmu_arctic.py Project: underdogliu/lhotse
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Code Example #12
def test_cut_with_audio_move_to_memory_large_offset():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.1).drop_recording()
    cut.recording = Recording.from_file(path)
    cut.start = 0.4
    assert isclose(cut.end, 0.5)

    memory_cut = cut.move_to_memory()

    np.testing.assert_equal(memory_cut.load_audio(), cut.load_audio())
Code Example #13
def test_cut_move_to_memory_audio_serialization():
    path = "test/fixtures/mono_c0.wav"
    cut = dummy_cut(0, duration=0.5).drop_recording()
    cut.recording = Recording.from_file(path)

    cut_with_audio = cut.move_to_memory()

    assert cut.custom is None  # original cut is unmodified

    data = cut_with_audio.to_dict()
    cut_deserialized = MonoCut.from_dict(data)

    np.testing.assert_equal(cut_deserialized.load_audio(),
                            cut_with_audio.load_audio())
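Continuing the test above as a sketch: the in-memory cut also survives a round trip through a manifest file (CutSet.from_cuts / to_file / from_file are standard lhotse APIs; the path is hypothetical).

from lhotse import CutSet

cuts = CutSet.from_cuts([cut_with_audio])
cuts.to_file("cuts.jsonl.gz")
restored = CutSet.from_file("cuts.jsonl.gz")
np.testing.assert_equal(next(iter(restored)).load_audio(),
                        cut_with_audio.load_audio())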
Code Example #14
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)

            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception as e:
            print(f"Error when processing {audio_path} - skipping... ({e})")
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
Code Example #15
def test_augmentation_chain_randomized(
        target_sampling_rate: int,
        sp_factor: float,
        resample_first: bool,
        cut_duration: Seconds
):
    recording = Recording.from_file('test/fixtures/libri/libri-1088-134315-0000.wav')

    if resample_first:
        recording_aug = recording.resample(target_sampling_rate).perturb_speed(sp_factor)
    else:
        recording_aug = recording.perturb_speed(sp_factor).resample(target_sampling_rate)

    audio_aug = recording_aug.load_audio()
    assert audio_aug.shape[1] == recording_aug.num_samples

    cut_aug = MonoCut(id='dummy', start=0.5125, duration=cut_duration, channel=0, recording=recording_aug)
    assert cut_aug.load_audio().shape[1] == cut_aug.num_samples
Code Example #16
def prepare_separate_phone_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid
    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSeparateIVR').rglob('**/*.wav'),
            desc='Creating manifests for SeparateIVR'
    ):
        try:
            recording_id = f'{audio_path.parent.name}_{audio_path.stem}'
            recording = Recording.from_file(audio_path)

            tg = TextGrid(part3_path / f'ScriptsSeparate/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(round(segment.xmax - segment.xmin, ndigits=8), recording.duration - segment.xmin),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            supervisions.extend(segments)
            recordings.append(recording)
        except Exception as e:
            print(f'Error when processing {audio_path} - skipping... ({e})')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
Code Example #17
def prepare_callhome_english_sre(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English portion.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the transcripts directory. If not provided,
        the transcripts will be downloaded.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests.
        The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    rttm_path = Path(rttm_dir) / "fullref.rttm"
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p,
                            relative_path_depth=None if absolute_paths else 4)
        for p in tqdm(audio_paths))

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")
    return {"recordings": recordings, "supervisions": supervisions}
Code Example #18
def test_cut_load_custom_recording_truncate():
    sampling_rate = 16000
    duration = 52.4
    audio = np.random.randn(1, compute_num_samples(
        duration, sampling_rate)).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute
        #       or a "load_my_favorite_song()" method.
        #       We are dynamically extending it.
        cut = dummy_cut(0, duration=duration)
        cut.my_favorite_song = recording

        cut_trunc = cut.truncate(duration=5.0)

        restored_audio = cut_trunc.load_my_favorite_song()
        assert restored_audio.shape == (1, 80000)

        np.testing.assert_almost_equal(audio[:, :80000], restored_audio)
Code Example #19
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid
    recordings = []
    supervisions = []
    for audio_path in tqdm((part3_path / 'AudioSameCloseMic').glob('*.wav'),
                           desc='Creating manifests for SameCloseMic'):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)

            tg = TextGrid(part3_path / f'ScriptsSame/{recording_id}.TextGrid',
                          coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        duration=round(segment.xmax - segment.xmin, ndigits=8),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    ) for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                ) if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception as e:
            print(f'Error when processing {audio_path} - skipping... ({e})')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
Code Example #20
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            postprocessed_lines = list()
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                    if duration <= 0:
                        continue
                    postprocessed_lines.append(line)
                except (InvalidOperation, ValueError):
                    # A continuation of the previous line: append the text.
                    postprocessed_lines[-1] = postprocessed_lines[-1] + " " + line

            for line in postprocessed_lines:
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
Code Example #21
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}')
        for wav in corpus_dir.rglob('*.wav'))
    supervisions = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = 'suitcase_corpus' in path.parts

        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            speaker = path.stem  # <root>/suitcase_corpus/transcript/aba.txt -> aba

        seg_id = f'suitcase_corpus-{speaker}' if is_suitcase_corpus else f'{speaker}-{path.stem}'
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']}))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        'read': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' not in r.id),
            'supervisions':
            supervisions.filter(
                lambda s: 'suitcase_corpus' not in s.recording_id)
        },
        'suitcase': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' in r.id),
            'supervisions':
            supervisions.filter(lambda s: 'suitcase_corpus' in s.recording_id)
        }
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir /
                                            f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir /
                                              f'supervisions-{key}.json')

    return splits
Code Example #22
def scan_recordings(corpus_dir: Path) -> RecordingSet:
    return RecordingSet.from_recordings(
        Recording.from_file(file)
        for file in corpus_dir.rglob('*.wav')
    )
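For large corpora, the same scan can be parallelized with RecordingSet.from_dir (the API used in Code Example #30 below); a sketch:

def scan_recordings_parallel(corpus_dir: Path, num_jobs: int = 4) -> RecordingSet:
    # Same result as scan_recordings above, but file discovery and
    # header reading are distributed across num_jobs workers.
    return RecordingSet.from_dir(corpus_dir, pattern="*.wav", num_jobs=num_jobs)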
Code Example #23
def prepare_rir_noise(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ("point_noise", "iso_noise", "real_rir", "sim_rir"),
) -> Dict[str, Dict[str, Union[RecordingSet, CutSet]]]:
    """
    Prepare the RIR Noise corpus.

    :param corpus_dir: Pathlike, the path of the dir to store the dataset.
    :param output_dir: Pathlike, the path of the dir to write the manifests.
    :param parts: Sequence[str], the parts of the dataset to prepare.

    The corpus contains 4 things: point-source noises (point_noise), isotropic noises (iso_noise),
    real RIRs (real_rir), and simulated RIRs (sim_rir). We will prepare these parts
    in the corresponding dict keys.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if not parts:
        raise ValueError("No parts specified for manifest preparation.")
    if isinstance(parts, str):
        parts = [parts]

    manifests = defaultdict(dict)
    for part in parts:
        logging.info(f"Preparing {part}...")
        audio_dir = corpus_dir / PARTS[part]
        assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
        if part == "sim_rir":
            # The "small", "medium", and "large" rooms have the same file names, so
            # we have to handle them separately to avoid duplicating manifests.
            recordings = []
            for room_type in ("small", "medium", "large"):
                room_dir = audio_dir / f"{room_type}room"
                recordings += [
                    Recording.from_file(
                        file, recording_id=f"{room_type}-{file.stem}")
                    for file in room_dir.rglob("*.wav")
                ]
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                recordings)
        elif part == "point_noise":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav"))
        elif part == "iso_noise":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav")
                if "noise" in file.stem)
        elif part == "real_rir":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav")
                if "rir" in file.stem)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in manifests:
            for key, manifest in manifests[part].items():
                manifest.to_file(output_dir / f"{key}_{part}.json")

    return manifests
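One way the prepared RIRs might be used downstream (a sketch, assuming the reverb_rir method available on Recording objects in recent lhotse versions; speech.wav is a hypothetical input):

from lhotse import Recording

manifests = prepare_rir_noise("RIRS_NOISES", parts=("real_rir",))  # hypothetical path
rir = next(iter(manifests["real_rir"]["recordings"]))
speech = Recording.from_file("speech.wav")
reverberant = speech.reverb_rir(rir)  # assumed API: convolves the speech with the RIR
audio = reverberant.load_audio()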
Code Example #24
def recording():
    return Recording.from_file("test/fixtures/libri/libri-1088-134315-0000.wav")
Code Example #25
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome Egyptian Arabic Corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" /
            split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
Code Example #26
def rir():
    return Recording.from_file("test/fixtures/rir/sim_1ch.wav")
Code Example #27
File: babel.py Project: glynpu/lhotse
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: A dict with keys "dev", "eval", and "training" containing the manifests.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split(
                "_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            for (timestamp,
                 text), (next_timestamp,
                         _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=
                            f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    raise ValueError(
                        f"Error while parsing segments (file: '{p}'). "
                        f"Please check your data.") from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir /
                               f"recordings_{language}_{save_split}.json")
            supervisions.to_file(output_dir /
                                 f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
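Usage sketch (hypothetical LDC package path): preparing a single BABEL language while suppressing the warning for a missing eval split.

manifests = prepare_single_babel_language(
    corpus_dir="IARPA_BABEL_BP_101",  # hypothetical package directory
    output_dir="data/manifests",
    no_eval_ok=True,
)
print(sorted(manifests))  # e.g. ['dev', 'training'] when eval transcripts are absent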
Code Example #28
def multi_channel_rir():
    return Recording.from_file("test/fixtures/rir/real_8ch.wav")
Code Example #29
File: cmu_indic.py Project: AmirHussein96/lhotse
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except Exception:
            age = None
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Code Example #30
File: bvcc.py Project: oplatek/lhotse
def prepare_bvcc(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    corpus_dir = Path(corpus_dir)

    phase1_main = (corpus_dir / "phase1-main").resolve()
    assert phase1_main.exists(), f"Main track dir is missing {phase1_main}"

    main1_sets = phase1_main / "DATA" / "sets"
    main1_wav = phase1_main / "DATA" / "wav"
    assert (main1_sets.exists() and main1_wav.exists()
            ), f"Have you run data preparation in {phase1_main}?"
    main1_devp = main1_sets / "DEVSET"
    assert main1_devp.exists(), main1_devp
    main1_trainp = main1_sets / "TRAINSET"
    assert main1_trainp.exists(), main1_trainp

    phase1_ood = (corpus_dir / "phase1-ood").resolve()
    assert phase1_ood.exists(
    ), f"Out of domain track dir is missing {phase1_ood}"
    ood1_sets = phase1_ood / "DATA" / "sets"
    ood1_wav = phase1_ood / "DATA" / "wav"
    assert (ood1_sets.exists() and ood1_wav.exists()
            ), f"Have you run data preparation in {phase1_ood}?"
    ood1_unlabeled = ood1_sets / "unlabeled_mos_list.txt"
    assert ood1_unlabeled.exists(), ood1_unlabeled
    ood1_devp = ood1_sets / "DEVSET"
    assert ood1_devp.exists(), ood1_devp
    ood1_trainp = ood1_sets / "TRAINSET"
    assert ood1_trainp.exists(), ood1_trainp

    manifests = {}

    # ### Main track sets
    main1_recs = RecordingSet.from_dir(main1_wav,
                                       pattern="*.wav",
                                       num_jobs=num_jobs)

    logging.info("Preparing main1_dev")
    main1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(main1_devp).readlines()),
            main1_recs,
            parse_main_line,
        ))
    main1_dev_recs = main1_recs.filter(lambda rec: rec.id in main1_dev_sup)
    manifests["main1_dev"] = {
        "recordings": main1_dev_recs,
        "supervisions": main1_dev_sup,
    }

    logging.info("Preparing main1_train")
    main1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(main1_trainp).readlines()),
            main1_recs,
            parse_main_line,
        ))
    main1_train_recs = main1_recs.filter(lambda rec: rec.id in main1_train_sup)
    manifests["main1_train"] = {
        "recordings": main1_train_recs,
        "supervisions": main1_train_sup,
    }

    # ### Out of Domain (OOD) track sets
    unlabeled_wavpaths = [
        ood1_wav / name.strip() for name in open(ood1_unlabeled).readlines()
    ]
    manifests["ood1_unlabeled"] = {
        "recordings":
        RecordingSet.from_recordings(
            Recording.from_file(p) for p in unlabeled_wavpaths)
    }

    ood1_recs = RecordingSet.from_dir(ood1_wav,
                                      pattern="*.wav",
                                      num_jobs=num_jobs)

    logging.info("Preparing ood1_dev")
    ood1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(ood1_devp).readlines()),
            ood1_recs,
            parse_ood_line,
        ))
    ood1_dev_recs = ood1_recs.filter(lambda rec: rec.id in ood1_dev_sup)
    manifests["ood1_dev"] = {
        "recordings": ood1_dev_recs,
        "supervisions": ood1_dev_sup,
    }

    logging.info("Preparing ood1_train")
    ood1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(ood1_trainp).readlines()),
            ood1_recs,
            parse_ood_line,
        ))
    ood1_train_recs = ood1_recs.filter(lambda rec: rec.id in ood1_train_sup)
    manifests["ood1_train"] = {
        "recordings": ood1_train_recs,
        "supervisions": ood1_train_sup,
    }

    # Optionally serialize to disk
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, d in manifests.items():
            d["recordings"].to_file(output_dir / f"recordings_{part}.jsonl.gz")
            if "supervisions" in d:
                d["supervisions"].to_file(output_dir /
                                          f"supervisions_{part}.jsonl.gz")

    return manifests
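Finally, a usage sketch for prepare_bvcc (paths hypothetical); num_jobs parallelizes the wav-directory scans via RecordingSet.from_dir.

manifests = prepare_bvcc("bvcc_corpus", output_dir="data/manifests", num_jobs=4)
for part, d in manifests.items():
    print(part, len(d["recordings"]))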