Esempio n. 1
0
def test_combine(manifest_type):
    """
    Check that ``combine`` merges three consecutive manifest parts into one.

    The parts cover the id ranges [0, 68), [68, 136) and [136, 200); the result
    is compared against a single manifest spanning [0, 200). Both calling
    conventions are exercised: parts passed as separate positional arguments,
    and parts passed as one list.
    """
    whole = DummyManifest(manifest_type, begin_id=0, end_id=200)
    boundaries = ((0, 68), (68, 136), (136, 200))

    def make_parts():
        # Build fresh part manifests for every call so the two checks
        # below do not share any objects.
        return [
            DummyManifest(manifest_type, begin_id=first, end_id=last)
            for first, last in boundaries
        ]

    # Variant 1: parts given as positional arguments.
    from_varargs = combine(*make_parts())
    assert from_varargs == whole

    # Variant 2: parts given as a single iterable.
    from_list = combine(make_parts())
    assert from_list == whole
Esempio n. 2
0
def test_combine_lazy(manifest_type):
    """
    Check that ``combine`` works on manifest parts wrapped with ``as_lazy``.

    The combined result is only compared item-by-item (via ``list``) against a
    single manifest spanning the ids [0, 200), not with ``==`` on the manifest
    objects themselves.
    """
    whole = DummyManifest(manifest_type, begin_id=0, end_id=200)
    with as_lazy(DummyManifest(manifest_type, begin_id=0, end_id=68)) as head:
        with as_lazy(
            DummyManifest(manifest_type, begin_id=68, end_id=136)
        ) as middle:
            with as_lazy(
                DummyManifest(manifest_type, begin_id=136, end_id=200)
            ) as tail:
                merged = combine(head, middle, tail)
                # Equivalent under iteration
                assert list(merged) == list(whole)
Esempio n. 3
0
def mix_by_recording_id(cut_manifests: List[Pathlike],
                        output_cut_manifest: Pathlike):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST by matching the Cuts from CUT_MANIFESTS by their recording IDs
    and mixing them together.
    """
    # Load every input manifest and merge them into one collection of cuts.
    cut_sets = [CutSet.from_json(manifest_path) for manifest_path in cut_manifests]
    merged_cuts = combine(*cut_sets)

    # Bucket the cuts by the recording they refer to.
    cuts_by_recording = groupby(lambda cut: cut.recording_id, merged_cuts)

    # Each bucket is mixed into a single cut; only the grouped cuts are needed,
    # not the recording id itself.
    mixed = CutSet.from_cuts(
        mix_cuts(same_recording_cuts)
        for same_recording_cuts in cuts_by_recording.values())
    mixed.to_json(output_cut_manifest)
Esempio n. 4
0
def prepare_voxceleb(
    voxceleb1_root: Optional[Pathlike] = None,
    voxceleb2_root: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb v1 and v2 corpora.

    The manifests are created in a dict with up to two splits: "train" and "test".
    Each split contains a RecordingSet and SupervisionSet in a dict under keys
    'recordings' and 'supervisions'.

    :param voxceleb1_root: Path to the VoxCeleb v1 dataset.
    :param voxceleb2_root: Path to the VoxCeleb v2 dataset.
    :param output_dir: Path to the output directory. It is created if it does not exist.
        When None, nothing is written to disk.
    :param num_jobs: Number of parallel jobs to run.
    :return: A dict with standard corpus splits ("train" and "test") containing the manifests.
    :raises ValueError: If neither ``voxceleb1_root`` nor ``voxceleb2_root`` is provided.

    NOTE: We prepare the data using the Kaldi style split, i.e., the whole VoxCeleb2
    ("dev" and "test") and the training portion ("dev") of VoxCeleb1 are put into the
    "train" split. The "test" split contains the "test" portion of VoxCeleb1. So if
    VoxCeleb1 is not provided, no "test" split is created in the output manifests.

    Example usage:

    .. code-block:: python

        >>> from lhotse.recipes.voxceleb import prepare_voxceleb
        >>> manifests = prepare_voxceleb(voxceleb1_root='/path/to/voxceleb1',
        ...                               voxceleb2_root='/path/to/voxceleb2',
        ...                               output_dir='/path/to/output',
        ...                               num_jobs=4)

    NOTE: If VoxCeleb1 is provided, we also prepare the trials file using the list provided
    in http://www.openslr.org/resources/49/voxceleb1_test_v2.txt. This file is used in the
    Kaldi recipes for VoxCeleb speaker verification. This is prepared as 2 tuples of the form
    (CutSet, CutSet) with identical id's, one for each of positive pairs and negative pairs.
    These are stored in the dict under keys 'pos_trials' and 'neg_trials', respectively.
    For evaluation purpose, the :class:`lhotse.dataset.sampling.CutPairsSampler`
    can be used to sample from this tuple.
    """
    voxceleb1_root = Path(voxceleb1_root) if voxceleb1_root else None
    voxceleb2_root = Path(voxceleb2_root) if voxceleb2_root else None
    if not (voxceleb1_root or voxceleb2_root):
        raise ValueError("Either VoxCeleb1 or VoxCeleb2 path must be provided.")

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the destination exists before any manifest is written.
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    if voxceleb1_root:
        logging.info("Preparing VoxCeleb1...")
        manifests.update(_prepare_voxceleb_v1(voxceleb1_root, num_jobs))
        # The trials are built from the VoxCeleb1 test split.
        manifests.update(_prepare_voxceleb_trials(manifests["test"]))
    else:
        logging.info(
            "VoxCeleb1 not provided, no test split or trials file will be created..."
        )
    if voxceleb2_root:
        logging.info("Preparing VoxCeleb2...")
        v2_manifests = _prepare_voxceleb_v2(voxceleb2_root, num_jobs)
        if "train" in manifests:
            # VoxCeleb1 already contributed a train split: extend it with VoxCeleb2.
            manifests["train"]["recordings"] = combine(
                manifests["train"]["recordings"], v2_manifests["recordings"]
            )
            manifests["train"]["supervisions"] = combine(
                manifests["train"]["supervisions"], v2_manifests["supervisions"]
            )
        else:
            manifests["train"] = v2_manifests

    for split in ("train", "test"):
        # ``.get`` does not trigger the defaultdict factory, so an absent split
        # (e.g. "test" when only VoxCeleb2 was given) is skipped instead of
        # failing with a KeyError on the empty placeholder dict.
        split_manifests = manifests.get(split)
        if not split_manifests:
            continue
        recordings = split_manifests["recordings"]
        supervisions = split_manifests["supervisions"]
        validate_recordings_and_supervisions(recordings, supervisions)
        if output_dir is not None:
            recordings.to_file(output_dir / f"recordings_voxceleb_{split}.jsonl.gz")
            supervisions.to_file(output_dir / f"supervisions_voxceleb_{split}.jsonl.gz")

    # Write the trials cut sets to the output directory
    if output_dir is not None:
        if "pos_trials" in manifests:
            for i, cuts in enumerate(manifests["pos_trials"]):
                cuts.to_file(output_dir / f"pos_trials_voxceleb_utt{i+1}.jsonl.gz")
        if "neg_trials" in manifests:
            for i, cuts in enumerate(manifests["neg_trials"]):
                cuts.to_file(output_dir / f"neg_trials_voxceleb_utt{i+1}.jsonl.gz")

    return manifests
Esempio n. 5
0
def _guess_babel_lang_code(audio_dir: Path) -> Optional[str]:
    """
    Return the language code found in the first suitable audio filename in ``audio_dir``.

    Presumably the audio files use the same ``BABEL_BP_<language-code>_...`` naming
    as the transcripts (the supervisions use the transcript stem as recording id).
    Only codes present in ``BABELCODE2LANG`` are accepted; None is returned otherwise.
    """
    for pattern in ("*.sph", "*.wav"):
        for audio_path in audio_dir.glob(pattern):
            parts = audio_path.stem.split("_")
            if len(parts) > 2 and parts[2] in BABELCODE2LANG:
                return parts[2]
    return None


def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE/WAV audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored as JSON files
        (created if missing). When None, nothing is written to disk.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: A dict keyed by split name ("dev", "eval", "training"); each value is a dict
        with the keys "recordings" and "supervisions". Note that the files written to
        ``output_dir`` use "train" instead of "training" in their names.
    :raises ValueError: If no `conversational` directory is found, or if a transcript
        segment cannot be parsed.
    """
    manifests = defaultdict(dict)
    if output_dir is not None:
        output_dir = Path(output_dir)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    candidates = [
        d for d in Path(corpus_dir).rglob("conversational") if d.is_dir()
    ]
    if not candidates:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(candidates) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({candidates[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and not the root dir for all BABEL languages."
        )
    corpus_dir = candidates[0].parent

    # Language code of the package. It is read from the transcript filenames and
    # deliberately kept across splits, so that a split without transcripts
    # (typically "eval") can still be saved under the right language name.
    lang_code = None

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _, _, lang_code, speaker, date, hour, channel, *_ = p.stem.split(
                "_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_line for prev_line, line in sliding_window(2, lines)
                if not (prev_line.startswith("[") and line.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            # Even positions hold "[timestamp]" lines, odd positions hold the text;
            # consecutive (timestamp, text) pairs delimit one segment.
            for (timestamp, text), (next_timestamp, _) in sliding_window(
                    2, zip(lines[::2], lines[1::2])):
                try:
                    # Strip the surrounding brackets from "[12.345]".
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=
                            f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    # Chain the original exception so the root cause is not lost.
                    raise ValueError(
                        f"Too many errors while parsing segments (file: '{p}'). "
                        f"Please check your data or increase the threshold."
                    ) from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

        if output_dir is not None:
            if lang_code is None:
                # No transcript filename has been seen so far (e.g. the very first
                # split has no transcripts): fall back to the audio filenames.
                lang_code = _guess_babel_lang_code(audio_dir)
            if lang_code is None:
                logging.warning(
                    f"Could not determine the BABEL language code for split '{split}' "
                    f"- its manifests will not be saved in '{output_dir}'.")
                continue
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir /
                               f"recordings_{language}_{save_split}.json")
            supervisions.to_file(output_dir /
                                 f"supervisions_{language}_{save_split}.json")

    return dict(manifests)