Example #1
0
def combine(manifests: Pathlike, output_manifest: Pathlike):
    """Load MANIFESTS, combine them into a single one, and write it to OUTPUT_MANIFEST."""
    from lhotse.serialization import load_manifest_lazy_or_eager
    from lhotse.manipulation import combine as combine_manifests

    # Load every input manifest (lazily where the format allows it),
    # merge them into one set, and persist the result.
    loaded = (load_manifest_lazy_or_eager(path) for path in manifests)
    combined = combine_manifests(*loaded)
    combined.to_file(output_manifest)
Example #2
0
def describe(cutset: Pathlike):
    """
    Describe some statistics of CUTSET, such as the total speech and audio duration.
    """
    # Only CutSet manifests expose .describe(); reject anything else early.
    manifest = load_manifest_lazy_or_eager(cutset)
    assert isinstance(manifest, CutSet), (
        f"Only CutSet can be described (got: {type(manifest)} from '{cutset}')"
    )
    manifest.describe()
Example #3
0
def split_lazy(manifest: Pathlike, output_dir: Pathlike, chunk_size: int):
    """
    Load MANIFEST (lazily if in JSONL format) and split it into parts,
    each with CHUNK_SIZE items.
    The parts are saved to separate files with pattern "{output_dir}/{chunk_idx}.jsonl.gz".

    Prefer this to "lhotse split" when your manifests are very large.
    """
    from lhotse.serialization import load_manifest_lazy_or_eager

    # Splitting is delegated to the manifest type; chunking happens lazily
    # so very large manifests never have to fit in memory at once.
    loaded = load_manifest_lazy_or_eager(Path(manifest))
    loaded.split_lazy(output_dir=Path(output_dir), chunk_size=chunk_size)
Example #4
0
def trim_to_supervisions(
    cuts: Pathlike,
    output_cuts: Pathlike,
    keep_overlapping: bool,
    min_duration: Optional[float],
    context_direction: str,
):
    """
    Splits each input cut into as many cuts as there are supervisions.
    These cuts have identical start times and durations as the supervisions.
    When there are overlapping supervisions, they can be kept or discarded with options.

    \b
    For example, the following cut:
                Cut
        |-----------------|
         Sup1
        |----|  Sup2
           |-----------|

    \b
    is transformed into two cuts:
         Cut1
        |----|
         Sup1
        |----|
           Sup2
           |-|
                Cut2
           |-----------|
           Sup1
           |-|
                Sup2
           |-----------|
    """
    from lhotse.serialization import load_manifest_lazy_or_eager

    source_cuts = load_manifest_lazy_or_eager(cuts)

    # Stream the trimmed sub-cuts straight into the output writer so the
    # whole result never needs to be materialized in memory.
    with CutSet.open_writer(output_cuts) as writer:
        for cut in source_cuts:
            for trimmed in cut.trim_to_supervisions(
                keep_overlapping=keep_overlapping,
                min_duration=min_duration,
                context_direction=context_direction,
            ):
                writer.write(trimmed)
Example #5
0
def decompose(cutset: Pathlike, output: Pathlike):
    """
    \b
    Decompose CUTSET into:
        * recording set (recordings.jsonl.gz)
        * feature set (features.jsonl.gz)
        * supervision set (supervisions.jsonl.gz)

    If any of these are not present in any of the cuts,
    the corresponding file for them will be empty.
    """
    # Decomposition is only defined for CutSet manifests.
    manifest = load_manifest_lazy_or_eager(cutset)
    assert isinstance(manifest, CutSet), (
        f"Only CutSet can be decomposed (got: {type(manifest)} from '{cutset}')"
    )
    manifest.decompose(output_dir=Path(output), verbose=True)
Example #6
0
def export_to_webdataset(
    cutset: Pathlike,
    wspecifier: str,
    shard_size: Optional[int],
    audio_format: str,
    audio: bool,
    features: bool,
    custom: bool,
    fault_tolerant: bool,
):
    """
    Export CUTS into a WebDataset tarfile, or a collection of tarfile shards, as specified by
    WSPECIFIER.

    \b
    WSPECIFIER can be:
    - a regular path (e.g., "data/cuts.tar"),
    - a path template for sharding (e.g., "data/shard-06%d.tar"), or
    - a "pipe:" expression (e.g., "pipe:gzip -c > data/shard-06%d.tar.gz").

    The resulting CutSet contains audio/feature data in addition to metadata, and can be read in
    Python using 'CutSet.from_webdataset' API.

    This function is useful for I/O intensive applications where random reads are too slow, and
    a one-time lengthy export step that enables fast sequential reading is preferable.

    See the WebDataset project for more information: https://github.com/webdataset/webdataset
    """
    # NOTE: this local import shadows the command's own name with the actual
    # implementation from lhotse.dataset.webdataset.
    from lhotse.dataset.webdataset import export_to_webdataset

    manifest = load_manifest_lazy_or_eager(cutset)
    assert isinstance(manifest, CutSet), (
        f"Only CutSet can be exported to WebDataset format (got: {type(manifest)} from '{cutset}')"
    )

    export_to_webdataset(
        cuts=manifest,
        output_path=wspecifier,
        shard_size=shard_size,
        audio_format=audio_format,
        load_audio=audio,
        load_features=features,
        load_custom=custom,
        fault_tolerant=fault_tolerant,
    )
Example #7
0
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike,
          shuffle: bool):
    """
    Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR.

    When your manifests are very large, prefer to use "lhotse split-lazy" instead.
    """
    from lhotse.serialization import load_manifest_lazy_or_eager

    out_dir = Path(output_dir)
    manifest_path = Path(manifest)
    # Preserve the full (possibly compound) extension, e.g. ".jsonl.gz".
    full_suffix = "".join(manifest_path.suffixes)
    loaded = load_manifest_lazy_or_eager(manifest_path)
    parts = loaded.split(num_splits=num_splits, shuffle=shuffle)
    out_dir.mkdir(parents=True, exist_ok=True)
    # Zero-pad part numbers so filenames sort lexicographically.
    width = len(str(num_splits))
    for part_no, part in enumerate(parts, start=1):
        label = str(part_no).zfill(width)
        part.to_file(
            (out_dir / manifest_path.stem).with_suffix(f".{label}{full_suffix}"))
Example #8
0
def copy_feats(
    input_manifest: Pathlike,
    output_manifest: Pathlike,
    storage_path: str,
    storage_type: str,
    max_jobs: int,
) -> None:
    """
    Load INPUT_MANIFEST of type :class:`lhotse.FeatureSet` or `lhotse.CutSet`,
    read every feature matrix using ``features.load()`` or ``cut.load_features()``,
    save them in STORAGE_PATH and save the updated manifest to OUTPUT_MANIFEST.
    """
    from lhotse.serialization import load_manifest_lazy_or_eager
    from lhotse.manipulation import combine as combine_manifests

    manifests = load_manifest_lazy_or_eager(input_manifest)

    if isinstance(manifests, FeatureSet):
        with get_writer(storage_type)(storage_path) as w:
            # FeatureSet is copied in-memory and written (TODO: make it incremental if needed)
            manifests = manifests.copy_feats(writer=w)
            manifests.to_file(output_manifest)

    elif isinstance(manifests, CutSet):
        # Group cuts by their underlying feature files.
        # NOTE: itertools.groupby only groups *consecutive* equal keys, so the
        # sort by storage_path below is required for correct grouping.
        manifests = sorted(manifests,
                           key=lambda cut: cut.features.storage_path)
        subsets = groupby(manifests, lambda cut: cut.features.storage_path)
        # Transpose (key, group) pairs into two parallel tuples; each group is
        # materialized into its own CutSet here.
        unique_storage_paths, subsets = zip(*[(k, CutSet.from_cuts(grp))
                                              for k, grp in subsets])

        # Create paths for new feature files and subset cutsets.
        # One output feature file and one partial cut manifest per distinct
        # source feature file (indices align with `subsets`).
        tot_items = len(unique_storage_paths)
        new_storage_paths = [
            f"{storage_path}/feats-{i}" for i in range(tot_items)
        ]
        partial_manifest_paths = [
            f"{storage_path}/cuts-{i}.jsonl.gz" for i in range(tot_items)
        ]

        # One worker per partition, optionally capped by max_jobs
        # (max_jobs <= 0 means "no cap").
        num_jobs = len(unique_storage_paths)
        if max_jobs > 0:
            num_jobs = min(num_jobs, max_jobs)

        # Create directory if needed (storage_path might be an URL)
        # NOTE(review): if the parent directory does not exist either, no
        # directory is created at all — presumably the URL case; confirm.
        if Path(storage_path).parent.is_dir():
            Path(storage_path).mkdir(exist_ok=True)

        # Copy each partition in parallel and combine lazily opened manifests.
        with ProcessPoolExecutor(num_jobs) as ex:
            futures = []
            for cs, nsp, pmp in zip(subsets, new_storage_paths,
                                    partial_manifest_paths):
                futures.append(
                    ex.submit(copy_feats_worker, cs, nsp, storage_type, pmp))

            # Results are consumed in completion order, not submission order;
            # the final manifest's cut order may therefore differ across runs.
            all_cuts = combine_manifests(
                (f.result() for f in as_completed(futures)))

        # Combine and save subset cutsets into the final file.
        with CutSet.open_writer(output_manifest) as w:
            for c in all_cuts:
                w.write(c)
    else:
        raise ValueError(
            f"Unsupported manifest type ({type(manifests)}) at: {input_manifest}"
        )