Example #1
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={
                    "text_punct": meta["text_normalized"],
                    "split": clean_or_other
                },
            ))
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)
    return recordings, supervisions
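# Hedged usage sketch (not part of the original recipe): assumes the recipe's
# `load_jsonl`, `ID2SPEAKER`, and `ID2GENDER` helpers are in scope; the paths
# and speaker id below are hypothetical placeholders.
recordings, supervisions = prepare_single_partition(
    raw_manifest_path=Path("manifests/raw_train_clean.jsonl"),
    corpus_dir=Path("/data/corpus"),
    speaker_id="92",
    clean_or_other="clean",
)
recordings.to_file("recordings_train_clean.jsonl.gz")
supervisions.to_file("supervisions_train_clean.jsonl.gz")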
Example #2
def prepare_tedlium(
    tedlium_root: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    if output_dir is not None:
        output_dir.mkdir(parents=True, exist_ok=True)
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph"))
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM...")
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        ))
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_file(output_dir /
                               f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir /
                                 f"tedlium_supervisions_{split}.jsonl.gz")

    return corpus
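# Hedged usage sketch (hypothetical corpus path): prepare all three TED-LIUM
# splits and report per-split statistics from the returned manifests.
tedlium = prepare_tedlium(Path("/data/TEDLIUM_release-3"), output_dir=Path("manifests"))
for split, manifests in tedlium.items():
    hours = sum(s.duration for s in manifests["supervisions"]) / 3600
    print(f"{split}: {len(manifests['recordings'])} talks, {hours:.1f}h of speech")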
Example #3
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "spontaneous".
        Each hold another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav)
        for wav in (corpus_dir / "wav48").rglob("*.wav"))
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #4
def prepare_tedlium(
        tedlium_root: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    if output_dir is not None:
        output_dir.mkdir(parents=True, exist_ok=True)
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), f'Mismatch: found {len(recordings)} ' \
                                             f'sphere files and {len(stms)} STM files. ' \
                                             f'You might be missing some parts of TEDLIUM...'
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')

    return corpus
Example #5
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    corpus_dir = Path(corpus_dir)
    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(Recording.from_sphere(p) for p in audio_dir.glob('*.sph'))
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = 'train' if split == 'training' else split
            manifests[split]['recordings'].to_json(output_dir / f'recordings_{language}_{save_split}.json')
            manifests[split]['supervisions'].to_json(output_dir / f'supervisions_{language}_{save_split}.json')

    return manifests
Example #6
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks,
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #7
def prepare_norm_cn(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 15,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the NormCN speech corpus. The manifests are created in a dict
    with three splits: dev, test, and train.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    dataset_parts = ["dev", "test", "train"]
    for part in dataset_parts:
        transcript_path = corpus_dir / f"{part}/text.txt"
        transcript_dict = {}
        with open(transcript_path, "r", encoding="utf-8") as f:
            for line in f:
                idx_transcript = line.split()
                if len(idx_transcript) < 2:
                    logging.warning(f"Skipping malformed transcript line: {line}")
                    continue
                transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])
        file_path = corpus_dir / f"{part}/wav.scp"
        with open(file_path, "r", encoding="utf-8") as f:
            file_paths = [line.strip() for line in f]

        recordings = []
        supervisions = []
        with ThreadPoolExecutor(num_jobs) as ex:
            for recording, supervision in tqdm(
                ex.map(
                    process_file,
                    file_paths,
                    repeat(transcript_dict),
                ),
                desc="Processing NormcnSpeech JSON entries",
                leave=False,
            ):
                if recording is not None:
                    recordings.append(recording)
                    supervisions.append(supervision)

        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set = RecordingSet.from_recordings(recordings)
        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")
        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}
    return manifests
Example #8
def test_known_issue_with_overlap():
    r = dummy_recording(0)
    rec = RecordingSet.from_recordings([r])

    # Make two segments. The first segment is 1s long. The second segment
    # is 0.5 seconds long and lies entirely within the first. Both have the
    # same recording_id as the single entry in rec.
    sup = SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id="utt1",
                recording_id=r.id,
                start=0.0,
                duration=1.0,
                channel=0,
                text="Hello",
            ),
            SupervisionSegment(
                id="utt2",
                recording_id=r.id,
                start=0.2,
                duration=0.5,
                channel=0,
                text="World",
            ),
        ]
    )

    cuts = CutSet.from_manifests(recordings=rec, supervisions=sup)
    assert len(cuts) == 1

    cuts_trim = cuts.trim_to_supervisions(keep_overlapping=False)
    assert len(cuts_trim) == 2

    cut = cuts_trim[0]
    assert cut.start == 0
    assert cut.duration == 1
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 1
    assert sup.text == "Hello"

    cut = cuts_trim[1]
    assert cut.start == 0.2
    assert cut.duration == 0.5
    assert len(cut.supervisions) == 1
    sup = cut.supervisions[0]
    assert sup.start == 0
    assert sup.duration == 0.5
    assert sup.text == "World"
Example #9
def recording_set():
    return RecordingSet.from_recordings([
        Recording(id='x',
                  sources=[
                      AudioSource(type='file',
                                  channels=[0],
                                  source='test/fixtures/mono_c0.wav'),
                      AudioSource(type='command',
                                  channels=[1],
                                  source='cat test/fixtures/mono_c1.wav')
                  ],
                  sampling_rate=8000,
                  num_samples=4000,
                  duration=0.5)
    ])
Example #10
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)

            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            print(f"Error when processing {audio_path} - skipping...")
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
Example #11
def _prepare_voxceleb_v1(
    corpus_path: Pathlike,
    num_jobs: int,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb1 corpus. The manifests are created in a dict with
    2 splits: train ("dev") and test.
    """
    speaker_metadata = {}
    with open(corpus_path / "vox1_meta.csv", "r") as f:
        next(f)
        for line in f:
            spkid, name, gender, nationality, split = line.strip().split("\t")
            speaker_metadata[spkid] = SpeakerMetadata(
                id=spkid, name=name, gender=gender, nationality=nationality, split=split
            )
    with ProcessPoolExecutor(num_jobs) as ex:
        recordings = []
        supervisions = []
        futures = []
        for p in (corpus_path / "wav").rglob("*.wav"):
            futures.append(ex.submit(_process_file, p, speaker_metadata))
        for future in tqdm(
            as_completed(futures),
            total=len(futures),
            desc="Processing VoxCeleb1",
            leave=False,
        ):
            recording, supervision = future.result()
            recordings.append(recording)
            supervisions.append(supervision)
        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
    manifests = defaultdict(dict)
    # Split into dev and test sets based on the split of the speakers.
    for split in ("dev", "test"):
        manifests[split]["supervisions"] = supervision_set.filter(
            lambda s: s.custom["split"] == split
        )
        split_ids = {s.recording_id for s in manifests[split]["supervisions"]}
        manifests[split]["recordings"] = recording_set.filter(
            lambda r: r.id in split_ids
        )
    manifests["train"] = manifests.pop("dev")
    return manifests
Example #12
def recording_set():
    return RecordingSet.from_recordings([
        Recording(
            id="x",
            sources=[
                AudioSource(type="file",
                            channels=[0],
                            source="text/fixtures/mono_c0.wav"),
                AudioSource(
                    type="command",
                    channels=[1],
                    source="cat text/fixtures/mono_c1.wav",
                ),
            ],
            sampling_rate=8000,
            num_samples=4000,
            duration=0.5,
        )
    ])
Example #13
def _prepare_voxceleb_v2(
    corpus_path: Pathlike,
    num_jobs: int,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb2 corpus. The manifests are created the same dict
    without any splits since the whole data is used in the final "train" split.
    """
    # Read the speaker metadata.
    speaker_metadata = {}
    with open(corpus_path / "vox2_meta.csv", "r") as f:
        next(f)
        for line in f:
            spkid, _, gender, split = map(str.strip, line.split(","))
            speaker_metadata[spkid] = SpeakerMetadata(id=spkid,
                                                      name="",
                                                      gender=gender,
                                                      nationality="",
                                                      split=split)
    # Read the wav files and prepare manifests. The whole corpus goes into the
    # final "train" split, so we scan the full "wav" directory (an assumption
    # about the data layout; the original glob referenced an undefined `split`).
    with ProcessPoolExecutor(num_jobs) as ex:
        recordings = []
        supervisions = []
        futures = []
        for p in (corpus_path / "wav").rglob("*.wav"):
            futures.append(
                ex.submit(_process_file, p, speaker_metadata, type="command"))
        for future in tqdm(
                futures,
                total=len(futures),
                desc="Processing VoxCeleb2...",
                leave=False,
        ):
            recording, supervision = future.result()
            recordings.append(recording)
            supervisions.append(supervision)
    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    manifests = {
        "recordings": recording_set,
        "supervisions": supervision_set,
    }
    return manifests
Example #14
def prepare_callhome_english(
        audio_dir: Pathlike,
        rttm_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    rttm_path = rttm_dir / 'fullref.rttm'
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, '*.sph')
    recordings = RecordingSet.from_recordings(
        make_recording_callhome(p, sph2pipe_path=sph2pipe_path) for p in tqdm(audio_paths)
    )

    recordings, supervisions = remove_missing_recordings_and_supervisions(recordings, supervisions)
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
Example #15
def prepare_callhome_english_sre(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the transcripts directory. If not provided,
        the transcripts will be downloaded.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests.
        The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    rttm_path = rttm_dir / "fullref.rttm"
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p,
                            relative_path_depth=None if absolute_paths else 4)
        for p in tqdm(audio_paths))

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")
    return {"recordings": recordings, "supervisions": supervisions}
Example #16
def prepare_separate_phone_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid
    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSeparateIVR').rglob('*.wav'),
            desc='Creating manifests for SeparateIVR'
    ):
        try:
            recording_id = f'{audio_path.parent.name}_{audio_path.stem}'
            recording = Recording.from_file(audio_path)

            tg = TextGrid(part3_path / f'ScriptsSeparate/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(round(segment.xmax - segment.xmin, ndigits=8), recording.duration - segment.xmin),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            supervisions.extend(segments)
            recordings.append(recording)
        except Exception:
            print(f'Error when processing {audio_path} - skipping...')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
Example #17
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid
    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSameCloseMic').glob('*.wav'),
            desc='Creating manifests for SameCloseMic'
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)

            tg = TextGrid(part3_path / f'ScriptsSame/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        duration=round(segment.xmax - segment.xmin, ndigits=8),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]

            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            print(f'Error when processing {audio_path} - skipping...')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
Example #18
def prepare_bvcc(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    corpus_dir = Path(corpus_dir)

    phase1_main = (corpus_dir / "phase1-main").resolve()
    assert phase1_main.exists(), f"Main track dir is missing {phase1_main}"

    main1_sets = phase1_main / "DATA" / "sets"
    main1_wav = phase1_main / "DATA" / "wav"
    assert (main1_sets.exists() and main1_wav.exists()
            ), f"Have you run data preparation in {phase1_main}?"
    main1_devp = main1_sets / "DEVSET"
    assert main1_devp.exists(), main1_devp
    main1_trainp = main1_sets / "TRAINSET"
    assert main1_trainp.exists(), main1_trainp

    phase1_ood = (corpus_dir / "phase1-ood").resolve()
    assert phase1_ood.exists(
    ), f"Out of domain track dir is missing {phase1_ood}"
    ood1_sets = phase1_ood / "DATA" / "sets"
    ood1_wav = phase1_ood / "DATA" / "wav"
    assert (ood1_sets.exists() and ood1_wav.exists()
            ), f"Have you run data preparation in {phase1_ood}?"
    ood1_unlabeled = ood1_sets / "unlabeled_mos_list.txt"
    assert ood1_unlabeled.exists(), ood1_unlabeled
    ood1_devp = ood1_sets / "DEVSET"
    assert ood1_devp.exists(), ood1_devp
    ood1_trainp = ood1_sets / "TRAINSET"
    assert ood1_trainp.exists(), ood1_trainp

    manifests = {}

    # ### Main track sets
    main1_recs = RecordingSet.from_dir(main1_wav,
                                       pattern="*.wav",
                                       num_jobs=num_jobs)

    logging.info("Preparing main1_dev")
    main1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(main1_devp).readlines()),
            main1_recs,
            parse_main_line,
        ))
    main1_dev_recs = main1_recs.filter(lambda rec: rec.id in main1_dev_sup)
    manifests["main1_dev"] = {
        "recordings": main1_dev_recs,
        "supervisions": main1_dev_sup,
    }

    logging.info("Preparing main1_train")
    main1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(main1_trainp).readlines()),
            main1_recs,
            parse_main_line,
        ))
    main1_train_recs = main1_recs.filter(lambda rec: rec.id in main1_train_sup)
    manifests["main1_train"] = {
        "recordings": main1_train_recs,
        "supervisions": main1_train_sup,
    }

    # ### Out of Domain (OOD) track sets
    unlabeled_wavpaths = [
        ood1_wav / name.strip() for name in open(ood1_unlabeled).readlines()
    ]
    manifests["ood1_unlabeled"] = {
        "recordings":
        RecordingSet.from_recordings(
            Recording.from_file(p) for p in unlabeled_wavpaths)
    }

    ood1_recs = RecordingSet.from_dir(ood1_wav,
                                      pattern="*.wav",
                                      num_jobs=num_jobs)

    logging.info("Preparing ood1_dev")
    ood1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(ood1_devp).readlines()),
            ood1_recs,
            parse_ood_line,
        ))
    ood1_dev_recs = ood1_recs.filter(lambda rec: rec.id in ood1_dev_sup)
    manifests["ood1_dev"] = {
        "recordings": ood1_dev_recs,
        "supervisions": ood1_dev_sup,
    }

    logging.info("Preparing ood1_train")
    ood1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(ood1_trainp).readlines()),
            ood1_recs,
            parse_ood_line,
        ))
    ood1_train_recs = ood1_recs.filter(lambda rec: rec.id in ood1_train_sup)
    manifests["ood1_train"] = {
        "recordings": ood1_train_recs,
        "supervisions": ood1_train_sup,
    }

    # Optionally serialize to disk
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, d in manifests.items():
            d["recordings"].to_file(output_dir / f"recordings_{part}.jsonl.gz")
            if "supervisions" in d:
                d["supervisions"].to_file(output_dir /
                                          f"supervisions_{part}.jsonl.gz")

    return manifests
Example #19
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}')
        for wav in corpus_dir.rglob('*.wav'))
    supervisions = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = 'suitcase_corpus' in path.parts

        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = path.stem

        seg_id = f'suitcase_corpus-{speaker}' if is_suitcase_corpus else f'{speaker}-{path.stem}'
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']}))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        'read': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' not in r.id),
            'supervisions':
            supervisions.filter(
                lambda s: 'suitcase_corpus' not in s.recording_id)
        },
        'suitcase': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' in r.id),
            'supervisions':
            supervisions.filter(lambda s: 'suitcase_corpus' in s.recording_id)
        }
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir /
                                            f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir /
                                              f'supervisions-{key}.json')

    return splits
Example #20
def scan_recordings(corpus_dir: Path) -> RecordingSet:
    return RecordingSet.from_recordings(
        Recording.from_file(file)
        for file in corpus_dir.rglob('*.wav')
    )
Example #21
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    type: str = "mdm",
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    NOTE: The recordings contain all 7 channels. If you want to use only one channel, you can
    use either ``recording.load_audio(channels=0)`` or ``MonoCut(id=..., recording=recording, channel=0)``
    while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param type: str, the type of data to prepare ('mdm', 'ihm-mix', or 'ihm'). These settings
        are similar to the ones in the AMI and ICSI recipes.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    assert type in ["mdm", "ihm-mix", "ihm"]


    corpus_dir = Path(corpus_dir)
    corpus_dir = (corpus_dir / "for_release"
                  if corpus_dir.stem != "for_release" else corpus_dir)

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            _, _, _, _, _, name, actual_ov = session.name.split("_")
            actual_ov = float(actual_ov.split("actual")[1])
            recording_id = f"{ov}_{name}"
            if type == "ihm-mix":
                audio_path = session / "clean" / "mix.wav"
            elif type == "ihm":
                audio_path = session / "clean" / "each_spk.wav"
            else:  # "mdm"
                audio_path = session / "record" / "raw_recording.wav"
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id))
            for idx, seg in enumerate(
                    parse_transcript(session / "transcription" /
                                     "meeting_info.txt")):
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm" else 0,
                    ))

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_jsonl(output_dir / "recordings.jsonl")
        supervisions.to_jsonl(output_dir / "supervisions.jsonl")

    return {"recordings": recordings, "supervisions": supervisions}
Example #22
def prepare_rir_noise(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ("point_noise", "iso_noise", "real_rir", "sim_rir"),
) -> Dict[str, Dict[str, Union[RecordingSet, CutSet]]]:
    """
    Prepare the RIR Noise corpus.

    :param corpus_dir: Pathlike, the path of the dir to store the dataset.
    :param output_dir: Pathlike, the path of the dir to write the manifests.
    :param parts: Sequence[str], the parts of the dataset to prepare.

    The corpus contains 4 things: point-source noises (point_noise), isotropic noises (iso_noise),
    real RIRs (real_rir), and simulated RIRs (sim_rir). We will prepare these parts
    in the corresponding dict keys.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if not parts:
        raise ValueError("No parts specified for manifest preparation.")
    if isinstance(parts, str):
        parts = [parts]

    manifests = defaultdict(dict)
    for part in parts:
        logging.info(f"Preparing {part}...")
        audio_dir = corpus_dir / PARTS[part]
        assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
        if part == "sim_rir":
            # The "small", "medium", and "large" rooms have the same file names, so
            # we have to handle them separately to avoid duplicating manifests.
            recordings = []
            for room_type in ("small", "medium", "large"):
                room_dir = audio_dir / f"{room_type}room"
                recordings += [
                    Recording.from_file(
                        file, recording_id=f"{room_type}-{file.stem}")
                    for file in room_dir.rglob("*.wav")
                ]
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                recordings)
        elif part == "point_noise":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav"))
        elif part == "iso_noise":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav")
                if "noise" in file.stem)
        elif part == "real_rir":
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(file) for file in audio_dir.rglob("*.wav")
                if "rir" in file.stem)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in manifests:
            for key, manifest in manifests[part].items():
                manifest.to_file(output_dir / f"{key}_{part}.json")

    return manifests
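# Hedged usage sketch (hypothetical path): prepare only the simulated RIRs and
# check how many impulse responses were found.
rirs = prepare_rir_noise(
    Path("/data/RIRS_NOISES"),
    output_dir=Path("manifests/rir_noise"),
    parts=("sim_rir",),
)
print(len(rirs["sim_rir"]["recordings"]))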
Example #23
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic Corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" /
            split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
Example #24
def prepare_single_mtedx_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    language: str = "language",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single MTEDx language.

    This function works as follows:

        - First, it looks for the audio directory in data/{train,valid,test}/wav,
            where the .flac files are stored.
        - Then, it looks for the vtt directory in data/{train,valid,test}/vtt,
            which contains the segmentation and transcripts for the audio.
        - The transcripts undergo some basic text normalization.

    :param corpus_dir: Path to the root of the MTEDx download
    :param output_dir: Path where the manifests are stored as .json files
    :param language: The two-letter language code.
    :param num_jobs: Number of threads to use when preparing data.
    :return: A dict with keys 'train', 'valid', and 'test', each holding a dict of
        {'recordings': ..., 'supervisions': ...}.
    """
    if isinstance(corpus_dir, str):
        corpus_dir = Path(corpus_dir)
    manifests = defaultdict(dict)

    with ThreadPoolExecutor(num_jobs) as ex:
        for split in ("train", "valid", "test"):
            audio_dir = corpus_dir / f"data/{split}/wav"
            recordings = RecordingSet.from_recordings(
                Recording.from_file(p) for p in audio_dir.glob("*.flac")
            )
            if len(recordings) == 0:
                logging.warning(f"No .flac files found in {audio_dir}")

            supervisions = []
            text_dir = corpus_dir / f"data/{split}/vtt"
            futures = []
            for p in text_dir.glob("*"):
                futures.append(ex.submit(_filename_to_supervisions, p, language))

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                for sup in result:
                    supervisions.append(sup)

            if len(supervisions) == 0:
                logging.warning(f"No supervisions found in {text_dir}")
            supervisions = SupervisionSet.from_segments(supervisions)

            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            manifests[split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            if output_dir is not None:
                if isinstance(output_dir, str):
                    output_dir = Path(output_dir)
                output_dir.mkdir(parents=True, exist_ok=True)
                save_split = "dev" if split == "valid" else split
                recordings.to_file(
                    output_dir / f"recordings_{language}_{save_split}.json"
                )
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{save_split}.json"
                )

    return dict(manifests)
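# Hedged usage sketch (hypothetical path and language code): prepare Spanish
# mTEDx with several worker threads; the 'valid' split is saved as 'dev' on disk.
mtedx_es = prepare_single_mtedx_language(
    Path("/data/mtedx/es-es"),
    output_dir=Path("manifests/mtedx"),
    language="es",
    num_jobs=4,
)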
Example #25
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored as .json files
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: A dict with keys 'dev', 'eval', and 'training', each holding a dict of
        {'recordings': ..., 'supervisions': ...}.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split(
                "_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            for (timestamp,
                 text), (next_timestamp,
                         _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=
                            f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    raise ValueError(
                        f"Error while parsing segments (file: '{p}'). "
                        f"Please check your data.") from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir /
                               f"recordings_{language}_{save_split}.json")
            supervisions.to_file(output_dir /
                                 f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
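# A standalone sketch of the timestamp/transcript pairing used above, with a
# made-up transcript that contains one empty segment (two consecutive
# timestamp lines). It assumes a toolz-style ``sliding_window``.
from toolz import sliding_window

lines = ["[0.0]", "hello", "[1.5]", "[2.0]", "world", "[3.1]"] + [""]
# Drop the first of two consecutive timestamp lines (an empty segment).
lines = [
    prev for prev, cur in sliding_window(2, lines)
    if not (prev.startswith("[") and cur.startswith("["))
]
# Pad with None so the final timestamp is only consumed as "next_timestamp".
lines += [None]
for (ts, text), (next_ts, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
    print(float(ts[1:-1]), float(next_ts[1:-1]), text)
# prints: 0.0 2.0 hello
#         2.0 3.1 world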
Example #26
0
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with one manifest pair per split (``train``, ``devtest``,
        ``evaltest``); each value holds the keys ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            postprocessed_lines = []
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                    if duration <= 0:
                        continue
                    postprocessed_lines.append(line)
                except (InvalidOperation, ValueError):
                    # A line that does not start with two timestamps is a
                    # continuation of the previous utterance: merge them.
                    postprocessed_lines[-1] += " " + line

            for line in postprocessed_lines:
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
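# Two easy-to-miss details in the recipe above, sketched with made-up values:
from decimal import Decimal

# Durations use Decimal so that e.g. "21.18" - "19.33" is exactly 1.85 ...
print(float(Decimal("21.18") - Decimal("19.33")))  # 1.85
# ... whereas plain float subtraction leaves an artifact:
print(21.18 - 19.33)  # 1.8500000000000014 (or similar)
# Speaker labels "A"/"B" map to channel indices 0/1:
print(ord("B") - ord("A"))  # 1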
Example #27
0
def prepare_adept(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            path=path,
            # converts:
            #   path/to/ADEPT/wav_44khz/propositional_attitude/surprise/ad01_0204.wav
            # to:
            #   propositional_attitude_surprise_ad01_0204
            recording_id=str(path.relative_to(path.parent.parent.parent))
            [:-4].replace("/", "_"),
        ) for path in (corpus_dir / "wav_44khz").rglob("*.wav"))

    supervisions = []

    with open(corpus_dir / "adept_prompts.json") as f:
        interpretation_map = json.load(f)

    for path in (corpus_dir / "txt").rglob("*.txt"):
        annotation_type, label, prompt_id = str(
            path.relative_to(path.parent.parent.parent))[:-4].split("/")
        speaker_id = "ADEPT_" + prompt_id.split("_")[0]
        recording_id = "_".join((annotation_type, label, prompt_id))
        interpretation_group = interpretation_map.get(annotation_type)
        interpretation = (interpretation_group[prompt_id][label]
                          if interpretation_group else None)
        recording = recordings[recording_id]
        custom = {
            "type": annotation_type,
            "label": label,
            "prompt_id": prompt_id
        }
        if interpretation:
            # Labels such as "interpretation_1", "interpretation_2", "middle",
            # "end", etc. get their meaning from their textual realisation, e.g.
            #  {..., "middle": "Galleries are WHAT on Thursdays?", "end": "Galleries are free WHEN?"}
            custom["text"] = interpretation
        supervisions.append(
            SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=path.read_text(),
                language="English",
                speaker=speaker_id,
                custom=custom,
            ))

    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        supervisions.to_file(output_dir / "adept_supervisions.json")
        recordings.to_file(output_dir / "adept_recordings.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #28
0
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "spontaneous".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f"{wav.parent.parent.name.lower()}-{wav.stem}")
        for wav in corpus_dir.rglob("*.wav"))
    supervisions = []
    for path in corpus_dir.rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = "suitcase_corpus" in path.parts

        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            speaker = path.stem  # <root>/suitcase_corpus/transcript/aba.txt -> aba

        seg_id = (f"suitcase_corpus-{speaker}"
                  if is_suitcase_corpus else f"{speaker}-{path.stem}")
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=speaker_meta[speaker]["gender"],
                custom={"accent": speaker_meta[speaker]["native_lang"]},
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        "read": {
            "recordings":
            recordings.filter(lambda r: "suitcase_corpus" not in r.id),
            "supervisions":
            supervisions.filter(
                lambda s: "suitcase_corpus" not in s.recording_id),
        },
        "suitcase": {
            "recordings":
            recordings.filter(lambda r: "suitcase_corpus" in r.id),
            "supervisions":
            supervisions.filter(lambda s: "suitcase_corpus" in s.recording_id),
        },
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests["recordings"].to_file(
                output_dir / f"l2-arctic_recordings_{key}.jsonl.gz")
            manifests["supervisions"].to_file(
                output_dir / f"l2-arctic_supervisions_{key}.jsonl.gz")

    return splits
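# A quick check of the L2-ARCTIC ID scheme used above (paths are illustrative):
from pathlib import PurePosixPath

p = PurePosixPath("L2-ARCTIC/ABA/transcript/arctic_a0051.txt")
speaker = p.parent.parent.name.lower()
print(speaker, f"{speaker}-{p.stem}")  # aba aba-arctic_a0051

p = PurePosixPath("L2-ARCTIC/suitcase_corpus/transcript/aba.txt")
print(p.stem, f"suitcase_corpus-{p.stem}")  # aba suitcase_corpus-aba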
Example #29
0
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except Exception:
            # voice.feats may be missing or lack a parsable "age" line.
            age = None
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example #30
0
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ['train', 'devtest', 'evaltest']:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / 'callhome/arabic' /
            split.replace('evaltest', 'evltest'),
            '*.sph')
        recordings = RecordingSet.from_recordings(
            make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f'callhome_arabic_trans_970711/transcrp/{split}/roman', '*.txt')

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(id=f'{recording_id}_{idx}',
                                       recording_id=recording_id,
                                       start=start,
                                       duration=duration,
                                       speaker=f'{recording_id}_{spk}',
                                       text=text))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = remove_missing_recordings_and_supervisions(
            recordings, supervisions)
        supervisions = trim_supervisions_to_recordings(recordings,
                                                       supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f'recordings_{split}.json')
            supervisions.to_json(output_dir / f'supervisions_{split}.json')

        manifests[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return manifests
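# Hypothetical usage sketch; the directory paths are placeholders for the
# unpacked LDC audio and transcript packages:
manifests = prepare_callhome_egyptian(
    audio_dir="/data/LDC/callhome_egyptian/speech",
    transcript_dir="/data/LDC/callhome_egyptian/transcripts",
    output_dir="manifests",
)
print(len(manifests["train"]["recordings"]))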