Example #1
def prepare_callhome_english_sre(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English portion of the corpus.
    We create two manifests: one with recordings, and the other one with
    speaker segment supervisions parsed from the RTTM file.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the transcripts directory. If not provided,
        the transcripts will be downloaded.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests.
        The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    rttm_path = rttm_dir / "fullref.rttm"
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p,
                            relative_path_depth=None if absolute_paths else 4)
        for p in tqdm(audio_paths))

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")
    return {"recordings": recordings, "supervisions": supervisions}
Example #2
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome Egyptian Arabic corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content.
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" /
            split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir /
            f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
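A usage sketch with hypothetical paths to the two LDC packages; unlike the previous recipe, this one returns a dict of manifests per split:

# Hypothetical paths to the unpacked LDC97S45 (audio) and LDC97T19 (transcripts).
manifests = prepare_callhome_egyptian(
    audio_dir="/data/LDC97S45",
    transcript_dir="/data/LDC97T19",
    output_dir="data/manifests/callhome-egyptian",
)
train_recordings = manifests["train"]["recordings"]
train_supervisions = manifests["train"]["supervisions"]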
Example #3
def prepare_fisher_english(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    absolute_paths: bool = False,
    num_jobs: int = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_dir.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_dir: Path to the Fisher corpus.
    :param output_dir: Directory where the manifests should be written (also used to cache intermediate manifests).
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :param num_jobs: Number of parallel workers used to build the recording manifest.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """

    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for workdir in audio_dirs + transcript_dirs:
        workdir_path = corpus_dir / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_dir}'.")

    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        audio_dir_path = corpus_dir / audio_dir
        # iterdir() already yields paths joined with their parent directory,
        # so re-joining them with the parent would be redundant (and wrong
        # for relative corpus paths).
        for audio_partition_dir_path in audio_dir_path.iterdir():
            audio_subdir_paths += list(
                (audio_partition_dir_path / "audio").iterdir())

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_dir / transcript_dir / "data" / "trans"
        transcript_subdir_paths += list(transcript_dir_path.iterdir())

    audio_paths = walk_dirs_parallel(audio_subdir_paths, "*.sph",
                                     "Parsing audio sub-dirs")
    transcript_paths = walk_dirs_parallel(transcript_subdir_paths, "*.txt",
                                          "Parsing transcript sub-dirs")

    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_dir / transcript_dir / "doc", "*_calldata.tbl")[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            tmp_sessions = [
                l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
            ][1:]
            sessions.update(
                {l[0]: {
                    "A": l[5],
                    "B": l[10]
                }
                 for l in tmp_sessions})

    assert len(transcript_paths) == len(audio_paths), (
        f"Mismatch: found {len(transcript_paths)} transcript files "
        f"and {len(audio_paths)} audio files.")
    if len(transcript_paths) != len(sessions):
        warnings.warn(
            f"Fisher's *_calldata.tbl files indicate there should be {len(sessions)} sessions, "
            f"but our scanning of audio and transcript files found {len(transcript_paths)}."
        )

    recs_path = output_dir / "recordings_notfixed.jsonl.gz"
    if recs_path.is_file():
        logging.info(f"Using existing recording manifest at {recs_path}")
        recordings = RecordingSet.from_jsonl_lazy(recs_path)
    else:
        logging.info(f"Building fresh recording manifest")
        create_recordings_input = [(p, None if absolute_paths else 5)
                                   for p in audio_paths]
        err_recos = 0
        with ProcessPoolExecutor(
                num_jobs) as executor, RecordingSet.open_writer(
                    recs_path) as writer:
            with tqdm(total=len(create_recordings_input),
                      desc="Collect recordings") as pbar:
                for reco in executor.map(create_recording,
                                         create_recordings_input):
                    if reco is not None:
                        writer.write(reco, flush=True)
                    else:
                        err_recos += 1
                    pbar.update()
        if err_recos:
            warnings.warn(f"Out of {len(create_recordings_input)} recordings, "
                          f"{err_recos} had errors and were omitted.")
        recordings = writer.open_manifest()

    sups_path = output_dir / "supervisions_notfixed.jsonl.gz"
    if sups_path.is_file():
        logging.info(f"Using existing supervision manifest at {recs_path}")
        supervisions = SupervisionSet.from_jsonl_lazy(sups_path)
    else:
        logging.info(f"Building fresh supervision manifest")
        create_supervisions_input = [(sessions, p) for p in transcript_paths]
        err_sups = 0
        with ThreadPoolExecutor(os.cpu_count() *
                                4) as executor, SupervisionSet.open_writer(
                                    sups_path) as writer:
            with tqdm(total=len(create_supervisions_input),
                      desc="Create supervisions") as pbar:
                for tmp_supervisions in executor.map(
                        create_supervision, create_supervisions_input):
                    if not tmp_supervisions:
                        err_sups += 1
                    for s in tmp_supervisions:
                        writer.write(s)
                    pbar.update()
        supervisions = writer.open_manifest()
        if err_sups:
            warnings.warn(
                f"Out of {len(create_supervisions_input)} transcript files, "
                f"{err_sups} had errors and were omitted.")

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Write the fixed and validated version to files with standard names.
    recordings.to_file(recs_path.parent / "recordings.jsonl.gz")
    supervisions.to_file(sups_path.parent / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
Example #4
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to the ``LDC97S42`` content.
    :param transcript_dir: Path to the ``LDC97T14`` content.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4)
            for p in tqdm(audio_paths))

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            idx = 0
            postprocessed_lines = []
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                if line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                    if duration <= 0:
                        continue
                    postprocessed_lines.append(line)
                except (InvalidOperation, ValueError):
                    # A line that fails to parse is a continuation of the
                    # previous utterance, so glue it onto that line.
                    postprocessed_lines[
                        -1] = postprocessed_lines[-1] + " " + line

            for line in postprocessed_lines:
                recording_id = p.stem
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    ))
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(output_dir /
                               f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
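To make the transcript-parsing step concrete, here is a small sketch of how the example line quoted in the comments above is decomposed; the resulting values are worked out by hand:

from decimal import Decimal

line = "19.33 21.18 B: %ah Tayyib"
start, end, spk, text = line.split(maxsplit=3)
# start == "19.33", end == "21.18", spk == "B:", text == "%ah Tayyib"
spk = spk.replace(":", "")                       # "B"
duration = float(Decimal(end) - Decimal(start))  # 1.85
channel = ord(spk[0]) - ord("A")                 # 1, i.e. channel B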
Example #5
def prepare_ami(
    data_dir: Pathlike,
    annotations_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    partition: Optional[str] = "full-corpus",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param data_dir: Pathlike, the path of the data dir.
    :param annotations_dir: Pathlike, the path of the annotations dir or zip file.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param partition: str {'full-corpus','full-corpus-asr','scenario-only'}, AMI official data split.
    :param normalize_text: str {'none', 'upper', 'kaldi'}, normalization of text.
    :return: a Dict whose keys are ('train', 'dev', 'test'), and the values are dicts of manifests under keys
        'recordings' and 'supervisions'.

    Example usage:
    1. Prepare IHM-Mix data for ASR:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='ihm-mix', partition='full-corpus-asr')
    2. Prepare SDM data:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='sdm', partition='full-corpus')
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f"No such directory: {data_dir}"
    assert mic in MICS, f"Mic {mic} not supported"
    assert partition in PARTITIONS, f"Partition {partition} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing AMI annotations")
    if not annotations_dir:
        if (data_dir / "ami_public_manual_1.6.2").is_dir():
            annotations_dir = data_dir / "ami_public_manual_1.6.2"
        elif (data_dir / "ami_public_manual_1.6.2.zip").is_file():
            annotations_dir = data_dir / "ami_public_manual_1.6.2.zip"
        else:
            raise ValueError(
                f"No annotations directory specified and no zip file found in {data_dir}"
            )
    # Parse the annotations into a list of segment-level transcriptions.
    annotations = parse_ami_annotations(annotations_dir,
                                        normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")
    wav_dir = data_dir

    if mic in ["ihm", "mdm"]:
        audio_paths = (wav_dir.rglob("*Headset-?.wav")
                       if mic == "ihm" else wav_dir.rglob("*Array?-0?.wav"))
        audio = prepare_audio_grouped(list(audio_paths))
    elif mic in ["ihm-mix", "sdm"]:
        audio_paths = (wav_dir.rglob("*Mix-Headset.wav") if mic == "ihm-mix"
                       else wav_dir.rglob("*Array1-01.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (prepare_supervision_ihm(audio, annotations) if mic == "ihm"
                   else prepare_supervision_other(audio, annotations))

    manifests = defaultdict(dict)

    dataset_parts = PARTITIONS[partition]
    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in dataset_parts[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in dataset_parts[part])

        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Write the fixed and validated manifests if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir / f"recordings_{part}.jsonl")
            supervision_part.to_file(output_dir / f"supervisions_{part}.jsonl")

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)
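The docstring above already shows how to invoke the recipe; here is a short sketch of consuming the nested return value (the corpus path is hypothetical, and len() is assumed to work on eagerly-loaded manifest sets):

manifests = prepare_ami("/path/to/ami-corpus", mic="ihm", partition="full-corpus")
for part in ("train", "dev", "test"):
    recs = manifests[part]["recordings"]
    sups = manifests[part]["supervisions"]
    print(part, len(recs), "recordings,", len(sups), "supervision segments")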
Example #6
def prepare_fisher_spanish(
    audio_dir_path: Pathlike,
    transcript_dir_path: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher Spanish.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param audio_dir_path: Path to audio directory (usually LDC2010S01).
    :param transcript_dir_path: Path to transcript directory (usually LDC2010T04).
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """

    audio_dir_path, transcript_dir_path = Path(audio_dir_path), Path(
        transcript_dir_path
    )

    audio_paths = check_and_rglob(audio_dir_path, "*.sph")
    transcript_paths = check_and_rglob(transcript_dir_path, "*.tdf")

    sessions_data_path = check_and_rglob(transcript_dir_path, "*_call.tbl")[0]
    with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
        session_lines = [
            l.rstrip("\n").split(",") for l in sessions_data_f.readlines()
        ][1:]
        sessions = {l[0]: {0: l[2], 1: l[8]} for l in session_lines}

    assert len(transcript_paths) == len(sessions) == len(audio_paths)

    create_recordings_input = [(p, None if absolute_paths else 4) for p in audio_paths]
    recordings = [None] * len(audio_paths)
    with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
        with tqdm(total=len(audio_paths), desc="Collect recordings") as pbar:
            for i, reco in enumerate(
                executor.map(create_recording, create_recordings_input)
            ):
                recordings[i] = reco
                pbar.update()
    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    supervisions = [None] * len(create_supervisions_input)
    with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
        with tqdm(
            total=len(create_supervisions_input), desc="Create supervisions"
        ) as pbar:
            for i, tmp_supervisions in enumerate(
                executor.map(create_supervision, create_supervisions_input)
            ):
                supervisions[i] = tmp_supervisions
                pbar.update()
    supervisions = list(it.chain.from_iterable(supervisions))
    supervisions = SupervisionSet.from_segments(supervisions).filter(
        lambda s: s.duration > 0.0
    )

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
Example #7
def prepare_icsi(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param audio_dir: Pathlike, the path which holds the audio data.
    :param transcripts_dir: Pathlike, the path which holds the transcripts data.
    :param output_dir: Pathlike, the path where to write the manifests; `None` means manifests aren't stored on disk.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param normalize_text: str {'none', 'upper', 'kaldi'}, normalization of text.
    :return: a Dict whose keys are ('train', 'dev', 'test'), and the values are dicts of manifests under keys
        'recordings' and 'supervisions'.
    """
    audio_dir = Path(audio_dir)
    transcripts_dir = Path(transcripts_dir)

    assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
    assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}"
    assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing ICSI transcripts")
    annotations, channel_to_idx_map = parse_icsi_annotations(
        transcripts_dir, normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")

    channels = "".join(MIC_TO_CHANNELS[mic])
    if mic == "ihm" or mic == "mdm":
        audio_paths = audio_dir.rglob(f"chan[{channels}].sph")
        audio = prepare_audio_grouped(
            list(audio_paths), channel_to_idx_map if mic == "ihm" else None)
    elif mic == "sdm" or mic == "ihm-mix":
        audio_paths = (audio_dir.rglob(f"chan[{channels}].sph")
                       if len(channels) else audio_dir.rglob("*.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (prepare_supervision_ihm(
        audio, annotations, channel_to_idx_map) if mic == "ihm" else
                   prepare_supervision_other(audio, annotations))

    manifests = defaultdict(dict)

    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in PARTITIONS[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in PARTITIONS[part])

        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Write the fixed and validated manifests if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir / f"recordings_{part}.jsonl")
            supervision_part.to_file(output_dir / f"supervisions_{part}.jsonl")

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)
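Finally, a usage sketch for the ICSI recipe; the directory layout is hypothetical (audio and transcripts are obtained separately), and ``mic="sdm"`` selects the single-distant-microphone audio:

# Hypothetical paths to separately downloaded audio and transcripts.
manifests = prepare_icsi(
    audio_dir="/data/icsi/audio",
    transcripts_dir="/data/icsi/transcripts",
    output_dir="data/manifests/icsi",
    mic="sdm",
)
dev_recordings = manifests["dev"]["recordings"]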