Beispiel #1
0
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    Every ``*.wav`` file found (recursively) under ``corpus_dir`` becomes a recording,
    and every non-blank line of each ``txt.done.data`` file becomes one supervision
    that spans the whole recording with the same ID.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (created if it does not exist). When ``None``, nothing is written.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    :raises AssertionError: if ``corpus_dir`` is not an existing directory.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except (OSError, IndexError, ValueError):
            # The age is optional metadata: a missing/unreadable file, a file that
            # is too short, or a non-numeric value all mean "age unknown".
            age = None
        for line in lines:
            if not line.strip():
                # Tolerate blank lines instead of failing on the unpacking below.
                continue
            line = line[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = line.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            # Prompts whose ID contains "arctic" are treated as English; the
            # speaker's language is then recorded as an accent instead.
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks,
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the target directory exists before writing into it.
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Beispiel #2
0
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Pathlike = None,
    type: str = "mdm",
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    NOTE: For ``type='mdm'`` the recording is built from ``record/raw_recording.wav``.
    If you want to use only one channel, you can use either
    ``recording.load_audio(channel=0)`` or ``MonoCut(id=...,recording=recording,channel=0)``
    while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus. Either the
        ``for_release`` directory itself or its parent.
    :param output_dir: Pathlike, the path where to write the manifests
        (created if needed). When ``None``, nothing is written.
    :param type: str, the type of data to prepare ('mdm', 'ihm-mix', or 'ihm'). These settings
        are similar to the ones in AMI and ICSI recipes. NOTE: 'sdm' is not accepted by
        this function.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    :raises AssertionError: if ``type`` is not one of the supported values.
    """
    assert type in ["mdm", "ihm-mix", "ihm"], f"Unsupported type: {type!r}"

    corpus_dir = Path(corpus_dir)
    if corpus_dir.stem != "for_release":
        corpus_dir = corpus_dir / "for_release"

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            # The session directory name must have exactly 7 "_"-separated fields;
            # the last one carries the actual overlap ratio after the word "actual".
            _, _, _, _, _, name, actual_ov = session.name.split("_")
            # Parsed for validation of the naming scheme; not used further below.
            actual_ov = float(actual_ov.split("actual")[1])
            recording_id = f"{ov}_{name}"

            # Pick the audio file matching the requested microphone setting.
            if type == "ihm-mix":
                audio_path = session / "clean" / "mix.wav"
            elif type == "ihm":
                audio_path = session / "clean" / "each_spk.wav"
            else:  # "mdm"
                audio_path = session / "record" / "raw_recording.wav"
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id))

            transcript = session / "transcription" / "meeting_info.txt"
            for idx, seg in enumerate(parse_transcript(transcript)):
                # Fields used here: seg[0] = start, seg[1] = end,
                # seg[2] = speaker, seg[4] = text.
                speaker = seg[2]
                if type == "ihm":
                    # Per-speaker audio: each speaker has a dedicated channel.
                    channel = SPK_TO_CHANNEL_MAP[session.name][speaker]
                else:
                    channel = 0
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=speaker,
                        channel=channel,
                    ))

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_jsonl(output_dir / "recordings.jsonl")
        supervisions.to_jsonl(output_dir / "supervisions.jsonl")

    return {"recordings": recordings, "supervisions": supervisions}
Beispiel #3
0
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the L2 Arctic Recording and Supervision manifests.

    All ``*.wav`` files below ``corpus_dir`` become recordings, and every ``*.txt``
    file is read as the transcript of a single utterance that covers the whole
    recording with the same ID.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests;
        nothing is written when it is ``None``.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    # Example recording ID: zhaa-arctic_b0126
    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            wav_path,
            recording_id=f'{wav_path.parent.parent.name.lower()}-{wav_path.stem}',
        )
        for wav_path in corpus_dir.rglob('*.wav')
    )

    segments = []
    for txt_path in corpus_dir.rglob('*.txt'):
        # Each transcript file holds exactly one utterance.
        transcript = txt_path.read_text().strip()

        if 'suitcase_corpus' in txt_path.parts:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = txt_path.stem
            seg_id = f'suitcase_corpus-{speaker}'
        else:
            # <root>/ABA/transcript/arctic_a0051.txt -> aba
            speaker = txt_path.parent.parent.name.lower()
            seg_id = f'{speaker}-{txt_path.stem}'

        # Look-ups happen in this order: recording first, then speaker metadata.
        duration = recordings[seg_id].duration
        gender = speaker_meta[speaker]['gender']
        accent = speaker_meta[speaker]['native_lang']
        segments.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=duration,
                text=transcript,
                language='English',
                speaker=speaker,
                gender=gender,
                custom={'accent': accent},
            )
        )
    supervisions = SupervisionSet.from_segments(segments)

    validate_recordings_and_supervisions(recordings, supervisions)

    # Partition both manifests by whether they belong to the suitcase corpus.
    read_recordings = recordings.filter(
        lambda r: 'suitcase_corpus' not in r.id)
    read_supervisions = supervisions.filter(
        lambda s: 'suitcase_corpus' not in s.recording_id)
    suitcase_recordings = recordings.filter(
        lambda r: 'suitcase_corpus' in r.id)
    suitcase_supervisions = supervisions.filter(
        lambda s: 'suitcase_corpus' in s.recording_id)
    splits = {
        'read': {
            'recordings': read_recordings,
            'supervisions': read_supervisions,
        },
        'suitcase': {
            'recordings': suitcase_recordings,
            'supervisions': suitcase_supervisions,
        },
    }

    if output_dir is None:
        return splits

    output_dir = Path(output_dir)
    makedirs(output_dir, exist_ok=True)
    for split_name, split_manifests in splits.items():
        split_manifests['recordings'].to_json(
            output_dir / f'recordings-{split_name}.json')
        split_manifests['supervisions'].to_json(
            output_dir / f'supervisions-{split_name}.json')

    return splits
Beispiel #4
0
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build (or load from cache) the LibriTTS Recording and Supervision manifests.

    When ``output_dir`` is given and ``read_manifests_if_cached`` finds the manifests
    there, they are returned right away without scanning the corpus.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        The default ``'auto'`` selects every part listed in ``LIBRITTS``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and the value is a Dict with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    if dataset_parts == 'auto':
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Re-use previously written manifests when they are available.
        cached = read_manifests_if_cached(dataset_parts=dataset_parts,
                                          output_dir=output_dir,
                                          prefix='libritts')
        if cached is not None:
            return cached

    # SPEAKERS.txt layout (lines starting with ';' are comments):
    #   ;ID  |SEX| SUBSET           |MINUTES| NAME
    #   14   | F | train-clean-360  | 25.03 | ...
    #   16   | F | train-clean-360  | 25.11 | ...
    spk2gender = {}
    for speaker_line in (corpus_dir / 'SPEAKERS.txt').read_text().splitlines():
        if speaker_line.startswith(';'):
            continue
        speaker_field, gender_field, *_ = speaker_line.split('|')
        spk2gender[speaker_field.strip()] = gender_field.strip()

    manifests = {}
    for part in tqdm(dataset_parts, desc='Preparing LibriTTS parts'):
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path,
                                           '*.wav',
                                           num_jobs=num_jobs)
        segments = []
        transcript_files = tqdm(
            part_path.rglob('*.trans.tsv'),
            desc='Scanning transcript files (progbar per speaker)',
            leave=False)
        for trans_path in transcript_files:
            # Example path to a transcript file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            # Each line is "<utterance id>\t<original text>\t<normalized text>".

            # The sibling *.book.tsv carries extra metadata; the first field of a
            # line is the utterance ID and the last one is read as its SNR.
            book_path = trans_path.parent / trans_path.name.replace(
                '.trans.tsv', '.book.tsv')
            utt2snr = {}
            for fields in map(str.split, book_path.read_text().splitlines()):
                utt_id, *_, snr = fields
                utt2snr[utt_id] = float(snr)

            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split('\t')
                spk_id = rec_id.split('_')[0]
                duration = recordings[rec_id].duration
                gender = spk2gender[spk_id]
                custom = {'orig_text': orig_text, 'snr': utt2snr[rec_id]}
                segments.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=duration,
                        channel=0,
                        text=norm_text,
                        language='English',
                        speaker=spk_id,
                        gender=gender,
                        custom=custom,
                    ))

        supervisions = SupervisionSet.from_segments(segments)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_json(output_dir /
                                 f'libritts_supervisions_{part}.json')
            recordings.to_json(output_dir / f'libritts_recordings_{part}.json')

        manifests[part] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return manifests
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic Corpus
    For each of the splits ``train``, ``devtest`` and ``evaltest`` we create two
    manifests: one with recordings, and the other one with text supervisions.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    if output_dir is not None:
        # Resolve and create the output location once, not once per split.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = {}

    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" /
            split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            # The transcript file name (without extension) is the recording ID.
            recording_id = p.stem
            # Segment counter, restarted for every transcript file.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Subtract as Decimal to avoid float rounding artifacts.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    # Skip empty or inverted segments.
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the CallHome American English corpus.
    For each of the splits ``evaltest``, ``train`` and ``devtest`` we create two
    manifests: one with recordings, and the other one with text supervisions.

    Transcript lines that cannot be parsed as ``<start> <end> <speaker> <text>``
    are treated as a continuation of the previously kept line and appended to
    its text.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    import logging

    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    if output_dir is not None:
        # Resolve and create the output location once, not once per split.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            recording_id = p.stem
            idx = 0

            # First pass: keep well-formed lines with a positive duration and glue
            # unparsable lines onto the last kept line.
            postprocessed_lines = []
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                try:
                    start, end, _, _ = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                except (InvalidOperation, ValueError):
                    # Either fewer than four fields or non-numeric timestamps.
                    if postprocessed_lines:
                        postprocessed_lines[-1] = (
                            postprocessed_lines[-1] + " " + line)
                    else:
                        # Nothing to attach this line to (it precedes any valid
                        # segment); previously this raised an IndexError.
                        logging.warning(
                            f"Skipping unparsable transcript line in '{p}': {line!r}"
                        )
                    continue
                # NOTE(review): a continuation that follows a segment dropped here
                # is still appended to the last *kept* line - confirm intended.
                if duration <= 0:
                    continue
                postprocessed_lines.append(line)

            # Second pass: every line now starts with valid timestamps and has a
            # positive duration, so it can be converted directly.
            for line in postprocessed_lines:
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        # Speaker letter mapped to a channel index: A -> 0, B -> 1, ...
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            recordings.to_file(output_dir /
                               f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
Beispiel #7
0
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE/WAV audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored (created if needed);
        nothing is written when it is ``None``.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: A dict keyed by split name (``dev``, ``eval``, ``training``); each value
        is a dict with the keys ``'recordings'`` and ``'supervisions'``.
    :raises ValueError: if no ``conversational`` directory is found, or if a
        transcript segment cannot be parsed.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and not the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    if output_dir is not None:
        # Resolve and create the output location once, not once per split.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Language code of the most recently parsed transcript. It is intentionally
    # kept across splits so that a split without transcripts (typically "eval")
    # can still be saved under the language seen in an earlier split.
    lang_code = None

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _, _, lang_code, speaker, date, hour, channel, *_ = p.stem.split(
                "_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            for (timestamp,
                 text), (next_timestamp,
                         _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                try:
                    # Timestamps are wrapped in a single character on each side
                    # (e.g. "[12.345]"), hence the [1:-1] slice.
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=
                            f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    # The first parsing error is fatal; keep the original
                    # exception attached as the cause for easier debugging.
                    raise ValueError(
                        f"Error while parsing segments (file: '{p}'). "
                        f"Please check your data.") from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        if output_dir is not None:
            if lang_code is None:
                # No transcript has been seen so far, so the language used in
                # the output file names is unknown (this used to be a NameError).
                logging.warning(
                    f"Could not determine the language code for split '{split}' "
                    f"(no transcripts found in {text_dir}) - not writing its manifests."
                )
            else:
                language = BABELCODE2LANG[lang_code]
                save_split = "train" if split == "training" else split
                recordings.to_file(output_dir /
                                   f"recordings_{language}_{save_split}.json")
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
Beispiel #8
0
def prepare_mgb2(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    text_cleaning: bool = True,
    buck_walter: bool = False,
    num_jobs: int = 1,
    mer_thresh: int = 80,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    Dataset parts whose manifests are already available in the ``output_dir`` are
    not prepared again.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests (created if needed).
    :param text_cleaning: Bool, if True, basic text cleaning is performed (similar to ESPNet recipe).
        It is applied to the ``train`` part only.
    :param buck_walter: Bool, use BuckWalter transliteration. When False, the ``dev`` and
        ``test`` transcripts are converted with ``from_buck_walter``.
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :param mer_thresh: int, filter out segments based on mer (Match Error Rate)
    :return: a Dict whose key is the dataset part, and the value is a Dict with the keys
        'recordings' and 'supervisions'.
    :raises AssertionError: if ``corpus_dir`` is not a directory, or if the number of
        supervisions of a part differs from the expected count.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `text_cleaning` option removes all punctuation and diacritics.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    # output_dir is mandatory (see the note in the docstring): resolve it once.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    dataset_parts = ["dev", "train", "test"]

    # Maybe the manifests already exist: we can read them and save a bit of preparation time.
    # NOTE(review): guarding against a None return in case nothing is cached - confirm
    # against read_manifests_if_cached.
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        prefix="mgb2",
        suffix="jsonl.gz",
        lazy=True,
    ) or {}

    for part in dataset_parts:
        info(f"Processing MGB2 subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="mgb2", suffix="jsonl.gz"
        ):
            info(f"MGB2 subset: {part} already prepared - skipping.")
            continue

        if part in ("test", "dev"):
            # dev/test are read through a Kaldi-style data directory that is
            # assembled under output_dir/<part>.
            (output_dir / part).mkdir(parents=True, exist_ok=True)
            copy(
                corpus_dir / part / "text.non_overlap_speech",
                output_dir / part / "text",
            )
            copy(
                corpus_dir / part / "segments.non_overlap_speech",
                output_dir / part / "segments",
            )
            with open(corpus_dir / part / "wav.scp", "r") as f_in, open(
                output_dir / part / "wav.scp", "w"
            ) as f_out:
                for line in f_in:
                    # Point the "wav/" paths at the corpus location.
                    fixed = line.replace("wav/", f"{corpus_dir}/{part}/wav/")
                    # Lines read from a file already end with a newline (except
                    # possibly the last one); add one only when it is missing so
                    # that no blank lines end up in the copied wav.scp.
                    f_out.write(fixed if fixed.endswith("\n") else fixed + "\n")

            recordings, supervisions, _ = load_kaldi_data_dir(
                (output_dir / part), 16000
            )
            if not buck_walter:
                supervisions = supervisions.transform_text(from_buck_walter)
            if part == "test":
                assert (
                    len(supervisions) == 5365
                ), f"Expected 5365 supervisions for test, found {len(supervisions)}"
            elif part == "dev":
                assert (
                    len(supervisions) == 5002
                ), f"Expected 5002 supervisions for dev, found {len(supervisions)}"
        elif part == "train":
            recordings = RecordingSet.from_dir(
                (corpus_dir / part / "wav"), pattern="*.wav", num_jobs=num_jobs
            )

            xml_paths = check_and_rglob(
                path.join(corpus_dir, part, "xml/utf8"), "*.xml"
            )
            # Read supervisions from the XML transcripts
            with recursion_limit(5000):
                supervisions_list = list(
                    chain.from_iterable(
                        [make_supervisions(p, mer_thresh) for p in xml_paths]
                    )
                )

            supervisions = SupervisionSet.from_segments(supervisions_list)

            assert (
                len(supervisions) == 375103
            ), f"Expected 375103 supervisions for train, found {len(supervisions)}"

            if text_cleaning:
                supervisions = supervisions.transform_text(cleaning)
            recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        # saving recordings and supervisions
        recordings.to_file((output_dir / f"mgb2_recordings_{part}.jsonl.gz"))
        supervisions.to_file((output_dir / f"mgb2_supervisions_{part}.jsonl.gz"))

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
    return manifests
Beispiel #9
0
def prepare_bvcc(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares the manifests of the main and out-of-domain (OOD) tracks found in ``corpus_dir``.

    The following layout is required (it is checked with assertions)::

        corpus_dir/phase1-main/DATA/sets/{DEVSET,TRAINSET}
        corpus_dir/phase1-main/DATA/wav/
        corpus_dir/phase1-ood/DATA/sets/{DEVSET,TRAINSET,unlabeled_mos_list.txt}
        corpus_dir/phase1-ood/DATA/wav/

    :param corpus_dir: Pathlike, the directory that contains ``phase1-main`` and ``phase1-ood``.
    :param output_dir: Pathlike, the path where to write the manifests; nothing is written when ``None``.
    :param num_jobs: the number of parallel workers passed to ``RecordingSet.from_dir``.
    :return: a dict with the keys ``main1_dev``, ``main1_train``, ``ood1_unlabeled``, ``ood1_dev``
        and ``ood1_train``. Each value is a dict of {'recordings': ..., 'supervisions': ...},
        except for ``ood1_unlabeled`` which only holds 'recordings'.
    """
    corpus_dir = Path(corpus_dir)

    def _sorted_lines(list_path):
        # Read through a context manager so the file handle is closed right away.
        # Line endings are kept, exactly as ``readlines()`` would return them.
        with open(list_path) as f:
            return sorted(f)

    def _labeled_subset(name, list_path, all_recs, parse_line):
        # Builds the supervisions listed in ``list_path`` and keeps only the matching recordings.
        logging.info(f"Preparing {name}")
        sups = SupervisionSet.from_segments(
            gen_supervision_per_utt(_sorted_lines(list_path), all_recs, parse_line)
        )
        recs = all_recs.filter(lambda rec: rec.id in sups)
        return {"recordings": recs, "supervisions": sups}

    phase1_main = (corpus_dir / "phase1-main").resolve()
    assert phase1_main.exists(), f"Main track dir is missing {phase1_main}"

    main1_sets = phase1_main / "DATA" / "sets"
    main1_wav = phase1_main / "DATA" / "wav"
    assert (
        main1_sets.exists() and main1_wav.exists()
    ), f"Have you run data preparation in {phase1_main}?"
    main1_devp = main1_sets / "DEVSET"
    assert main1_devp.exists(), main1_devp
    main1_trainp = main1_sets / "TRAINSET"
    assert main1_trainp.exists(), main1_trainp

    phase1_ood = (corpus_dir / "phase1-ood").resolve()
    assert phase1_ood.exists(), f"Out of domain track dir is missing {phase1_ood}"
    ood1_sets = phase1_ood / "DATA" / "sets"
    ood1_wav = phase1_ood / "DATA" / "wav"
    assert (
        ood1_sets.exists() and ood1_wav.exists()
    ), f"Have you run data preparation in {phase1_ood}?"
    ood1_unlabeled = ood1_sets / "unlabeled_mos_list.txt"
    assert ood1_unlabeled.exists(), ood1_unlabeled
    ood1_devp = ood1_sets / "DEVSET"
    assert ood1_devp.exists(), ood1_devp
    ood1_trainp = ood1_sets / "TRAINSET"
    # Report the path that is actually missing (the train list, not the dev list).
    assert ood1_trainp.exists(), ood1_trainp

    manifests = {}

    # ### Main track sets
    main1_recs = RecordingSet.from_dir(main1_wav, pattern="*.wav", num_jobs=num_jobs)
    manifests["main1_dev"] = _labeled_subset(
        "main1_dev", main1_devp, main1_recs, parse_main_line
    )
    manifests["main1_train"] = _labeled_subset(
        "main1_train", main1_trainp, main1_recs, parse_main_line
    )

    # ### Out of Domain (OOD) track sets
    with open(ood1_unlabeled) as f:
        # A blank line would otherwise resolve to the wav directory itself.
        unlabeled_wavpaths = [ood1_wav / name.strip() for name in f if name.strip()]
    manifests["ood1_unlabeled"] = {
        "recordings": RecordingSet.from_recordings(
            Recording.from_file(p) for p in unlabeled_wavpaths
        )
    }

    ood1_recs = RecordingSet.from_dir(ood1_wav, pattern="*.wav", num_jobs=num_jobs)
    manifests["ood1_dev"] = _labeled_subset(
        "ood1_dev", ood1_devp, ood1_recs, parse_ood_line
    )
    manifests["ood1_train"] = _labeled_subset(
        "ood1_train", ood1_trainp, ood1_recs, parse_ood_line
    )

    # Optionally serializing to disc
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, d in manifests.items():
            d["recordings"].to_file(output_dir / f"recordings_{part}.jsonl.gz")
            # The unlabeled subset has no supervisions.
            if "supervisions" in d:
                d["supervisions"].to_file(
                    output_dir / f"supervisions_{part}.jsonl.gz"
                )

    return manifests
Beispiel #10
0
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
    link_previous_utt: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    Parts whose manifests are already available in the ``output_dir`` are read from there
    instead of being prepared again.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        The default ``"auto"`` selects every part listed in ``LIBRITTS``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :param link_previous_utt: If true adds previous utterance id to supervisions.
        Useful for reconstructing chains of utterances as they were read.
        If previous utterance was skipped from LibriTTS datasets previous_utt label is None.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys
        'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir, prefix="libritts"
        )

    # Contents of the file
    #   ;ID  |SEX| SUBSET           |MINUTES| NAME
    #   14   | F | train-clean-360  | 25.03 | ...
    #   16   | F | train-clean-360  | 25.11 | ...
    #   17   | M | train-clean-360  | 25.04 | ...
    spk2gender = {}
    for line in (corpus_dir / "SPEAKERS.txt").read_text().splitlines():
        # Skip header/comment lines and blank lines (the latter cannot be unpacked).
        if not line.strip() or line.startswith(";"):
            continue
        spk_id, gender, *_ = line.split("|")
        spk2gender[spk_id.strip()] = gender.strip()

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir, prefix="libritts"):
            logging.info(f"LibriTTS subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path, "*.wav", num_jobs=num_jobs)
        supervisions = []
        for trans_path in tqdm(
            part_path.rglob("*.trans.tsv"),
            desc="Scanning transcript files (progbar per speaker)",
            leave=False,
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief.    Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata; the first field is the utterance id
            # and the last one is parsed as the SNR.
            book_path = trans_path.parent / trans_path.name.replace(
                ".trans.tsv", ".book.tsv"
            )
            utt2snr = [
                (rec_id, float(snr))
                for rec_id, *_, snr in map(
                    str.split, book_path.read_text().splitlines()
                )
            ]
            # keeps the order of uttids as they appear in book.tsv
            uttids = [r for r, _ in utt2snr]
            utt2snr = dict(utt2snr)

            if link_previous_utt:
                # Maps every utterance id to the one listed right before it in book.tsv
                # The keys has structure speaker_book_x_y e.g. 1089_134691_000004_000001
                utt2prevutt = dict(zip(uttids + [None], [None] + uttids))

            prev_rec_id = None
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split("\t")
                spk_id = rec_id.split("_")[0]
                # all recordings ids should be in the book.tsv
                # but they are some missing e.g. 446_123502_000030_000003,
                # so the SNR is None for those instead of raising a KeyError.
                customd = {"orig_text": orig_text, "snr": utt2snr.get(rec_id)}
                if link_previous_utt:
                    prev_utt = utt2prevutt.get(rec_id, None)
                    # previous utterance has to be present in trans.tsv - otherwise it was skipped
                    prev_utt = prev_utt if prev_utt == prev_rec_id else None
                    customd["prev_utt"] = prev_utt
                    prev_rec_id = rec_id
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language="English",
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom=customd,
                    )
                )

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_file(
                output_dir / f"libritts_supervisions_{part}.jsonl.gz"
            )
            recordings.to_file(output_dir / f"libritts_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recordings, "supervisions": supervisions}

    return manifests
Beispiel #11
0
def prepare_single_mtedx_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    language: str = "language",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single MTEDx language.

    This function works as follows:

        - First it looks for the .flac files in data/{train,valid,test}/wav.
        - Then, it submits every file found in data/{train,valid,test}/vtt
            to ``_filename_to_supervisions`` to obtain the supervisions.
        - Recordings and supervisions without a counterpart are removed and
            the supervisions are trimmed to the recordings.

    :param corpus_dir: Path to the root of the MTEDx download
    :param output_dir: Path where the manifests are stored as .json files.
        The "valid" split is written under the name "dev".
    :param language: The language code, used when parsing the transcripts
        and in the manifest file names.
    :param num_jobs: Number of threads to use when preparing data.
    :return: a dict with the keys "train", "valid" and "test"; each value is
        a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    if output_dir is not None:
        # Loop-invariant: convert and create the output directory only once.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    with ThreadPoolExecutor(num_jobs) as ex:
        for split in ("train", "valid", "test"):
            audio_dir = corpus_dir / f"data/{split}/wav"
            recordings = RecordingSet.from_recordings(
                Recording.from_file(p) for p in audio_dir.glob("*.flac")
            )
            if len(recordings) == 0:
                logging.warning(f"No .flac files found in {audio_dir}")

            text_dir = corpus_dir / f"data/{split}/vtt"
            futures = [
                ex.submit(_filename_to_supervisions, p, language)
                for p in text_dir.glob("*")
            ]

            supervisions = []
            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                # A file may yield no supervisions at all.
                if result is None:
                    continue
                supervisions.extend(result)

            if len(supervisions) == 0:
                logging.warning(f"No supervisions found in {text_dir}")
            supervisions = SupervisionSet.from_segments(supervisions)

            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            manifests[split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            if output_dir is not None:
                # The returned dict keeps the "valid" key; only the file names use "dev".
                save_split = "dev" if split == "valid" else split
                recordings.to_file(
                    output_dir / f"recordings_{language}_{save_split}.json"
                )
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{save_split}.json"
                )

    return dict(manifests)
Beispiel #12
0
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests which consist of Recordings and Supervisions.

    The audio is read from ``corpus_dir/wav48`` and the transcripts from ``corpus_dir/txt``.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav) for wav in (corpus_dir / "wav48").rglob("*.wav")
    )
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            # Warn about the gap, then fall back to a mapping that yields None
            # for every metadata field so that the supervision is still created.
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            )
        )
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "vctk_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "vctk_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Beispiel #13
0
def prepare_adept(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    The audio is read from ``corpus_dir/wav_44khz``, the transcripts from ``corpus_dir/txt``
    and the interpretations from ``corpus_dir/adept_prompts.json``.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    def _id_parts(path):
        # The last three path components without the file extension, e.g.
        #   path/to/ADEPT/wav_44khz/propositional_attitude/surprise/ad01_0204.wav
        # gives:
        #   ("propositional_attitude", "surprise", "ad01_0204")
        # Working on path components keeps this independent of the OS path separator.
        return path.relative_to(path.parent.parent.parent).with_suffix("").parts

    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            path=path,
            # e.g. propositional_attitude_surprise_ad01_0204
            recording_id="_".join(_id_parts(path)),
        )
        for path in (corpus_dir / "wav_44khz").rglob("*.wav")
    )

    supervisions = []

    with open(corpus_dir / "adept_prompts.json") as f:
        interpretation_map = json.load(f)

    for path in (corpus_dir / "txt").rglob("*.txt"):
        annotation_type, label, prompt_id = _id_parts(path)
        speaker_id = "ADEPT_" + prompt_id.split("_")[0]
        recording_id = "_".join((annotation_type, label, prompt_id))
        # Not every annotation type has an entry in the prompts file.
        interpretation_group = interpretation_map.get(annotation_type)
        interpretation = (
            interpretation_group[prompt_id][label] if interpretation_group else None
        )
        recording = recordings[recording_id]
        custom = {"type": annotation_type, "label": label, "prompt_id": prompt_id}
        if interpretation:
            # label is "interpretation_1", "interpretation_2", ..., "middle", "end", etc
            # Interpretations' labels meaning is defined by their textual realisation:
            #  {..., "middle": "Galleries are WHAT on Thursdays?", "end": "Galleries are free WHEN?"}
            custom["text"] = interpretation
        supervisions.append(
            SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=path.read_text(),
                language="English",
                speaker=speaker_id,
                custom=custom,
            )
        )

    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create the target directory first, so that writing does not fail when it is missing.
        output_dir.mkdir(parents=True, exist_ok=True)
        supervisions.to_file(output_dir / "adept_supervisions.json")
        recordings.to_file(output_dir / "adept_recordings.json")

    return {"recordings": recordings, "supervisions": supervisions}
Beispiel #14
0
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Builds the L2 Arctic Recording and Supervision manifests and splits them
    into the regular utterances and the "suitcase corpus" ones.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each hold another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description()

    def _recording_id(wav_path):
        # Example ID: zhaa-arctic_b0126
        return f"{wav_path.parent.parent.name.lower()}-{wav_path.stem}"

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav_path, recording_id=_recording_id(wav_path))
        for wav_path in corpus_dir.rglob("*.wav")
    )

    segments = []
    for txt_path in corpus_dir.rglob("*.txt"):
        # Each transcript file holds a single utterance.
        transcript = txt_path.read_text().strip()

        if "suitcase_corpus" in txt_path.parts:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = txt_path.stem
            seg_id = f"suitcase_corpus-{speaker}"
        else:
            # <root>/ABA/transcript/arctic_a0051.txt -> aba
            speaker = txt_path.parent.parent.name.lower()
            seg_id = f"{speaker}-{txt_path.stem}"

        duration = recordings[seg_id].duration
        speaker_info = speaker_meta[speaker]
        segment = SupervisionSegment(
            id=seg_id,
            recording_id=seg_id,
            start=0,
            duration=duration,
            text=transcript,
            language="English",
            speaker=speaker,
            gender=speaker_info["gender"],
            custom={"accent": speaker_info["native_lang"]},
        )
        segments.append(segment)
    supervisions = SupervisionSet.from_segments(segments)

    validate_recordings_and_supervisions(recordings, supervisions)

    # The suitcase part is recognised by the marker embedded in the IDs.
    marker = "suitcase_corpus"
    read_part = {
        "recordings": recordings.filter(lambda r: marker not in r.id),
        "supervisions": supervisions.filter(lambda s: marker not in s.recording_id),
    }
    suitcase_part = {
        "recordings": recordings.filter(lambda r: marker in r.id),
        "supervisions": supervisions.filter(lambda s: marker in s.recording_id),
    }
    splits = {"read": read_part, "suitcase": suitcase_part}

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for split_name, split_manifests in splits.items():
            rec_path = output_dir / f"l2-arctic_recordings_{split_name}.jsonl.gz"
            sup_path = output_dir / f"l2-arctic_supervisions_{split_name}.jsonl.gz"
            split_manifests["recordings"].to_file(rec_path)
            split_manifests["supervisions"].to_file(sup_path)

    return splits
Beispiel #15
0
def prepare_single_babel_language(corpus_dir: Pathlike,
                                  output_dir: Optional[Pathlike] = None):
    """
    Prepares the manifests of the conversational part of a single BABEL language.

    For each of the 'dev', 'eval' and 'training' splits, the SPHERE files are read from
    ``conversational/<split>/audio`` and the transcripts from ``conversational/<split>/transcription``.

    :param corpus_dir: Pathlike, the root directory of the language pack.
    :param output_dir: Pathlike, the directory where the manifests are written as JSON;
        nothing is written when ``None``. The 'training' split is saved under the name 'train'.
    :return: a dict whose keys are 'dev', 'eval' and 'training', and the values are
        dicts of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    # Taken from the transcript file names; stays None until a transcript has been seen.
    language = None
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(
            Recording.from_sphere(p) for p in audio_dir.glob('*.sph'))
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _, _, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            language = BABELCODE2LANG[lang_code]
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(
                    2, zip(lines[::2], lines[1::2])):
                # The first and the last character of a timestamp line are dropped before parsing.
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=
                        f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=language,
                        speaker=speaker,
                    ))
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)
        manifests[split]['supervisions'] = supervisions

        if output_dir is not None:
            if language is None:
                # Without any transcript the language part of the file names is unknown.
                logging.warning(
                    f"Cannot determine the language for split {split}; manifests not saved."
                )
                continue
            # Use a separate name for the file suffix: re-binding ``split`` and indexing
            # ``manifests`` with it would look up a key that was never filled.
            save_split = 'train' if split == 'training' else split
            recordings.to_json(
                output_dir / f'recordings_{language}_{save_split}.json')
            supervisions.to_json(
                output_dir / f'supervisions_{language}_{save_split}.json')

    return manifests
Beispiel #16
0
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic corpus.
    For each of the 'train', 'devtest' and 'evaltest' splits we create two manifests:
    one with recordings, and the other one with text supervisions.

    :param audio_dir: Path to the audio package; the SPHERE files are searched
        in ``callhome/arabic/<split>`` inside it.
    :param transcript_dir: Path to the transcripts package; the ``*.txt`` files are searched
        in ``callhome_arabic_trans_970711/transcrp/<split>/roman`` inside it.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: Optional path to sph2pipe, forwarded to ``make_recording_callhome``.
    :return: A dict whose keys are the split names and the values are dicts with manifests.
        The keys of the inner dicts are: ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    if output_dir is not None:
        # Loop-invariant: convert and create the output directory only once.
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = {}

    for split in ['train', 'devtest', 'evaltest']:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / 'callhome/arabic' /
            split.replace('evaltest', 'evltest'),
            '*.sph')
        recordings = RecordingSet.from_recordings(
            make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f'callhome_arabic_trans_970711/transcrp/{split}/roman', '*.txt')

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            recording_id = p.stem
            # Counts only the segments that are kept, so the IDs have no gaps.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Subtract as Decimals to avoid floating point error in the duration.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(id=f'{recording_id}_{idx}',
                                       recording_id=recording_id,
                                       start=start,
                                       duration=duration,
                                       speaker=f'{recording_id}_{spk}',
                                       text=text))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = remove_missing_recordings_and_supervisions(
            recordings, supervisions)
        supervisions = trim_supervisions_to_recordings(recordings,
                                                       supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            recordings.to_json(output_dir / f'recordings_{split}.json')
            supervisions.to_json(output_dir / f'supervisions_{split}.json')

        manifests[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return manifests