Example #1
0
def prepare_callhome_english(
        audio_dir: Pathlike,
        rttm_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome English corpus.
    We create two manifests: one with recordings (built from the ``*.sph`` files
    found under ``audio_dir``), and the other one with supervisions read from
    the ``fullref.rttm`` file.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the directory that contains ``fullref.rttm``.
        If not provided, the metadata will be fetched with ``download_callhome_metadata()``.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    # ``Pathlike`` allows plain strings, which do not support the ``/`` operator.
    rttm_path = Path(rttm_dir) / 'fullref.rttm'
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, '*.sph')
    recordings = RecordingSet.from_recordings(
        make_recording_callhome(p, sph2pipe_path=sph2pipe_path) for p in tqdm(audio_paths)
    )

    # Keep only the recordings/supervisions that have a counterpart in the other
    # manifest, then clip supervisions to the recording boundaries.
    recordings, supervisions = remove_missing_recordings_and_supervisions(recordings, supervisions)
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
Example #2
0
def prepare_gale_mandarin(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    segment_words: Optional[bool] = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for GALE Mandarin Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True).
        When false, ``relative_path_depth=3`` is passed to ``Recording.from_file``.
    :param segment_words: Forwarded to ``parse_transcripts``; use `jieba` package to perform
        word segmentation (default = False)
    :return: A dict with manifests for each partition. The top-level keys are ``{'train', 'dev'}``
        and each value is a dict with the keys ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we key them by file stem to avoid
    # adding them twice (the last occurrence wins).
    audio_paths = {
        p.stem: p
        for audio_dir in audio_dirs
        for ext in ("*.wav", "*.flac")
        for p in check_and_rglob(audio_dir, ext, strict=False)
    }
    transcript_paths = chain.from_iterable(
        [check_and_rglob(transcript_dir, "*.tdf") for transcript_dir in transcript_dirs]
    )

    logging.info("Preparing recordings manifest")

    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths, segment_words=segment_words)
    ).filter(lambda s: s.recording_id in audio_paths)

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Recording ids of the held-out partition, one id per line in each remote file.
    # A set gives O(1) membership tests in the filters below, and the ``with``
    # block makes sure every HTTP response is closed.
    held_out_ids = set()
    for url in TEST_FILE_URLS:
        with urlopen(url) as response:
            held_out_ids.update(line.decode("utf-8").strip() for line in response)

    manifests = defaultdict(dict)
    manifests["dev"] = {
        "recordings": recordings.filter(lambda r: r.id in held_out_ids),
        "supervisions": supervisions.filter(
            lambda s: s.recording_id in held_out_ids
        ),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in held_out_ids),
        "supervisions": supervisions.filter(
            lambda s: s.recording_id not in held_out_ids
        ),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "dev"]:
            manifests[part]["recordings"].to_file(
                output_dir / f"gale-mandarin_recordings_{part}.jsonl.gz"
            )
            manifests[part]["supervisions"].to_file(
                output_dir / f"gale-mandarin_supervisions_{part}.jsonl.gz"
            )

    return manifests
Example #3
0
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will look for the `dev`, `eval`, and `training` splits inside
            (a warning is emitted when no audio is found for a split; an empty
            `eval` split is skipped silently when `no_eval_ok` is set)
        - finally, it scans the selected location for SPHERE/WAV audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored as .json files.
        Can be omitted to avoid writing.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: A dict keyed by split name (``'dev'``, ``'eval'``, ``'training'``); each value
        is a dict with the keys ``{'recordings', 'supervisions'}``.
    :raises ValueError: when no `conversational` directory is found, or when a transcript
        segment cannot be parsed.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and not the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    # The language code is only known once a transcript file name has been parsed.
    # It is kept across splits so that a split without transcripts (typically "eval")
    # can still be saved under the language detected in an earlier split.
    lang_code = None

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split(
                "_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            # Even lines are "[timestamp]" markers and odd lines are the transcripts;
            # each segment spans from its own timestamp to the next one.
            for (timestamp,
                 text), (next_timestamp,
                         _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=
                            f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    # Chain the original exception so the root cause stays in the traceback.
                    raise ValueError(
                        f"Too many errors while parsing segments (file: '{p}'). "
                        f"Please check your data or increase the threshold."
                    ) from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        if output_dir is not None:
            if lang_code is None:
                # No transcript file has been seen so far, hence the language
                # (part of the output file names) cannot be determined.
                logging.warning(
                    f"Could not determine the language code for split '{split}' "
                    f"(no transcripts found in {text_dir}); skipping writing its manifests."
                )
            else:
                output_dir = Path(output_dir)
                output_dir.mkdir(parents=True, exist_ok=True)
                language = BABELCODE2LANG[lang_code]
                save_split = "train" if split == "training" else split
                recordings.to_file(output_dir /
                                   f"recordings_{language}_{save_split}.json")
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
Example #4
0
def prepare_gale_arabic(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build recording and supervision manifests for the GALE Arabic Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora (searched recursively for wav/flac files).
    :param transcript_dirs: List of paths to transcript corpora (searched recursively for tdf files).
        Must have the same length as ``audio_dirs``.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: When false, ``relative_path_depth=3`` is passed to ``Recording.from_file``.
    :return: A dict with manifests for each partition. The top-level keys are ``{'test', 'train'}``
        and each value is a dict with the keys ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(transcript_dirs), (
        "Paths to the same speech and transcript corpora must be provided"
    )

    logging.info("Reading audio and transcript paths from provided dirs")
    # Audio comes as either wav or flac, and the same recording can show up in
    # more than one corpus; keying by file stem keeps a single entry per recording.
    audio_by_stem = {}
    for audio_root in audio_dirs:
        for pattern in ['*.wav', '*.flac']:
            for audio_file in check_and_rglob(audio_root, pattern, strict=False):
                audio_by_stem[audio_file.stem] = audio_file

    transcript_paths = []
    for transcript_root in transcript_dirs:
        transcript_paths.extend(check_and_rglob(transcript_root, '*.tdf'))

    logging.info("Preparing recordings manifest")
    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(audio_file, relative_path_depth=path_depth)
        for audio_file in audio_by_stem.values()
    )

    logging.info("Preparing supervisions manifest")
    segments = parse_transcripts(transcript_paths)
    supervisions = SupervisionSet.from_segments(segments)

    # Clip the supervisions that run past the end of their recording.
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Partition both manifests by membership of the recording id in TEST.
    manifests = defaultdict(dict)
    test_recordings = recordings.filter(lambda r: r.id in TEST)
    test_supervisions = supervisions.filter(lambda s: s.recording_id in TEST)
    manifests['test'] = {
        'recordings': test_recordings,
        'supervisions': test_supervisions,
    }
    train_recordings = recordings.filter(lambda r: r.id not in TEST)
    train_supervisions = supervisions.filter(
        lambda s: s.recording_id not in TEST)
    manifests['train'] = {
        'recordings': train_recordings,
        'supervisions': train_supervisions,
    }

    if output_dir is None:
        return manifests

    logging.info("Writing manifests to JSON files")
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    for part in ("train", "test"):
        part_manifests = manifests[part]
        part_manifests["recordings"].to_json(
            output_dir / f'recordings_{part}.json')
        part_manifests["supervisions"].to_json(
            output_dir / f'supervisions_{part}.json')

    return manifests
Example #5
0
def prepare_single_mtedx_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    language: str = "language",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single MTEDx language.

    This function works as follows:

        - For each of the ``train``, ``valid`` and ``test`` splits it looks for
            the .flac files in ``data/{split}/wav``.
        - Then, it looks in ``data/{split}/vtt`` for the files with the
            segmentation and transcripts for the audio; every file found there
            is converted with ``_filename_to_supervisions``.
        - Recordings and supervisions without a counterpart are dropped and
            supervisions are trimmed to the recording boundaries.

    :param corpus_dir: Path to the root of the MTEDx download
    :param output_dir: Path where the manifests are stored as .json files.
        The ``valid`` split is saved under the name ``dev``. Can be omitted to avoid writing.
    :param language: The two-letter language code.
    :param num_jobs: Number of threads to use when preparing data.
    :return: A dict keyed by split name (``'train'``, ``'valid'``, ``'test'``); each value
        is a dict with the keys ``{'recordings', 'supervisions'}``.
    """
    # ``Pathlike`` values other than ``str`` are handled as well by converting unconditionally.
    corpus_dir = Path(corpus_dir)
    if output_dir is not None:
        output_dir = Path(output_dir)
    manifests = defaultdict(dict)

    with ThreadPoolExecutor(num_jobs) as ex:
        for split in ("train", "valid", "test"):
            audio_dir = corpus_dir / f"data/{split}/wav"
            recordings = RecordingSet.from_recordings(
                Recording.from_file(p) for p in audio_dir.glob("*.flac")
            )
            if len(recordings) == 0:
                logging.warning(f"No .flac files found in {audio_dir}")

            text_dir = corpus_dir / f"data/{split}/vtt"
            # Submit all files first so that they are processed concurrently,
            # then collect the results in submission order.
            futures = [
                ex.submit(_filename_to_supervisions, p, language)
                for p in text_dir.glob("*")
            ]

            supervisions = []
            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                supervisions.extend(result)

            if len(supervisions) == 0:
                logging.warning(f"No supervisions found in {text_dir}")
            supervisions = SupervisionSet.from_segments(supervisions)

            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            manifests[split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            if output_dir is not None:
                output_dir.mkdir(parents=True, exist_ok=True)
                # The "valid" split is stored on disk as "dev".
                save_split = "dev" if split == "valid" else split
                recordings.to_file(
                    output_dir / f"recordings_{language}_{save_split}.json"
                )
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{save_split}.json"
                )

    return dict(manifests)
Example #6
0
def prepare_fisher_english(
    corpus_path: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_path.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_path: Path to Fisher corpus
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises ValueError: when any of the audio/transcript dirs is missing from ``corpus_path``.
    """

    corpus_path = Path(corpus_path)

    # Unpacking (instead of ``+``) also accepts a mix of lists and tuples.
    for workdir in [*audio_dirs, *transcript_dirs]:
        workdir_path = corpus_path / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_path}'."
            )

    # NOTE: ``Path.iterdir()`` yields paths that already include the parent
    # directory, so they must not be joined onto the parent again (doing so
    # duplicates the prefix whenever ``corpus_path`` is relative).
    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        for audio_partition_dir in (corpus_path / audio_dir).iterdir():
            audio_subdir_paths += list((audio_partition_dir / "audio").iterdir())

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_path / transcript_dir / "data" / "trans"
        transcript_subdir_paths += list(transcript_dir_path.iterdir())

    audio_paths = walk_dirs_parallel(audio_subdir_paths, "*.sph",
                                     "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(transcript_subdir_paths, "*.txt",
                                          "Parsing transcript sub-dirs")

    # Map each call id (column 0 of the calldata table) to the values found in
    # columns 5 and 10, stored under the "A" and "B" keys respectively.
    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_path / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            # The first line is a header.
            tmp_sessions = [
                l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
            ][1:]
            sessions.update(
                {l[0]: {
                    "A": l[5],
                    "B": l[10]
                }
                 for l in tmp_sessions})

    assert len(transcript_paths) == len(sessions) == len(audio_paths), (
        f"Mismatched number of transcripts ({len(transcript_paths)}), "
        f"sessions ({len(sessions)}) and audio files ({len(audio_paths)})."
    )

    # ``os.cpu_count()`` may return None when the count cannot be determined.
    num_workers = (os.cpu_count() or 1) * 4

    create_recordings_input = [(p, None if absolute_paths else 5)
                               for p in audio_paths]
    with ThreadPoolExecutor(num_workers) as executor:
        # ``executor.map`` preserves the input order.
        recordings = list(
            tqdm(executor.map(create_recording, create_recordings_input),
                 total=len(create_recordings_input),
                 desc="Collect recordings"))

    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    with ThreadPoolExecutor(num_workers) as executor:
        supervisions = list(
            tqdm(executor.map(create_supervision, create_supervisions_input),
                 total=len(create_supervisions_input),
                 desc="Create supervisions"))
    # Each transcript file yields an iterable of supervisions; flatten them.
    supervisions = list(it.chain.from_iterable(supervisions))
    supervisions = SupervisionSet.from_segments(supervisions)

    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "recordings.jsonl.gz")
        supervisions.to_file(output_dir / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example #7
0
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome Egyptian Arabic corpus.
    For each of the ``train``, ``devtest`` and ``evaltest`` splits we create two manifests:
    one with recordings, and the other one with text supervisions.

    :param audio_dir: Path to the audio package; the ``*.sph`` files are looked up
        under ``callhome/arabic/<split>`` inside it.
    :param transcript_dir: Path to the transcripts package; the ``*.txt`` files are looked up
        under ``callhome_arabic_trans_970711/transcrp/<split>/roman`` inside it.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict keyed by split name (``'train'``, ``'devtest'``, ``'evaltest'``); each value
        is a dict with the keys ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    if output_dir is not None:
        output_dir = Path(output_dir)

    manifests = {}

    for split in ['train', 'devtest', 'evaltest']:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / 'callhome/arabic' /
            split.replace('evaltest', 'evltest'),
            '*.sph')
        recordings = RecordingSet.from_recordings(
            make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f'callhome_arabic_trans_970711/transcrp/{split}/roman', '*.txt')

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            recording_id = p.stem
            # Counts only the segments that are kept, so ids stay consecutive.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Subtract as Decimals to avoid float rounding artifacts in the duration.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(id=f'{recording_id}_{idx}',
                                       recording_id=recording_id,
                                       start=start,
                                       duration=duration,
                                       speaker=f'{recording_id}_{spk}',
                                       text=text))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = remove_missing_recordings_and_supervisions(
            recordings, supervisions)
        supervisions = trim_supervisions_to_recordings(recordings,
                                                       supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f'recordings_{split}.json')
            supervisions.to_json(output_dir / f'supervisions_{split}.json')

        manifests[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return manifests