Example #1
0
def subset(manifest: Pathlike, output_manifest: Pathlike, first: Optional[int],
           last: Optional[int]):
    """Read MANIFEST, keep only its FIRST or LAST number of items, and write them to OUTPUT_MANIFEST."""
    destination = Path(output_manifest)
    source = Path(manifest)
    # The selection is delegated to the loaded manifest's own ``subset``
    # method; both options are forwarded exactly as they were given.
    selection = load_manifest(source).subset(first=first, last=last)
    selection.to_json(destination)
Example #2
0
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike):
    """Read MANIFEST, divide it into NUM_SPLITS equal parts, and write each part as its own manifest in OUTPUT_DIR."""
    output_dir = Path(output_dir)
    manifest = Path(manifest)
    parts = split_manifest(
        manifest=load_manifest(manifest),
        num_splits=num_splits,
    )
    # The directory is created only after the split succeeded.
    output_dir.mkdir(parents=True, exist_ok=True)
    for idx, part in enumerate(parts):
        # Part files are numbered starting from 1 and always get a ".json" extension.
        part_path = output_dir / f'{manifest.stem}.{idx + 1}.json'
        part.to_json(part_path)
Example #3
0
def filter(predicate: str, manifest: Pathlike, output_manifest: Pathlike):
    """
    Filter a MANIFEST according to the rule specified in PREDICATE, and save the result to OUTPUT_MANIFEST.
    It is intended to work generically with most manifest types - it supports RecordingSet, SupervisionSet and CutSet.

    \b
    The PREDICATE specifies which attribute is used for item selection. Some examples:
    lhotse filter 'duration>4.5' supervision.json output.json
    lhotse filter 'num_frames<600' cuts.json output.json
    lhotse filter 'start=0' cuts.json output.json
    lhotse filter 'channel!=0' audio.json output.json

    It currently only supports comparison of numerical manifest item attributes, such as:
    start, duration, end, channel, num_frames, num_features, etc.

    The whole PREDICATE has to consist of an attribute name, one of the operators
    =, ==, !=, >, <, >=, <= and a non-negative number, written together without spaces.
    """
    # Imported locally so the block does not depend on the ``exit`` helper that
    # the ``site`` module only injects into builtins in some environments.
    import sys

    # Validate PREDICATE before touching the disk: a typo should fail
    # immediately instead of after a (possibly large) manifest has been read.
    # ``fullmatch`` makes sure that trailing characters are rejected rather
    # than silently ignored (e.g. 'duration>4.5abc' must not mean '>4.5').
    predicate_pattern = re.compile(
        r'(?P<key>\w+)(?P<op>==|!=|>=|<=|=|>|<)(?P<value>[0-9.]+)')
    match = predicate_pattern.fullmatch(predicate)
    if match is None:
        raise ValueError(
            "Invalid predicate! Run with --help option to learn what predicates are allowed."
        )
    key, op, raw_value = match.group('key', 'op', 'value')

    # Prefer an int when the text allows it, otherwise fall back to a float.
    # The value pattern also admits strings such as '1.2.3' or '.', which are
    # reported as an invalid predicate instead of a bare conversion error.
    try:
        value = int(raw_value)
    except ValueError:
        try:
            value = float(raw_value)
        except ValueError:
            raise ValueError(
                f'Invalid predicate! "{raw_value}" is not a valid number.'
            ) from None

    # Equality is checked with ``isclose`` instead of ``operator.eq``,
    # presumably to tolerate floating-point rounding in the attributes.
    compare = {
        '<': operator.lt,
        '>': operator.gt,
        '>=': operator.ge,
        '<=': operator.le,
        '=': isclose,
        '==': isclose,
        '!=': complement(isclose)
    }[op]

    data_set = load_manifest(manifest)

    retained_items = []
    for item in data_set:
        # Only the attribute lookup is guarded, so that an unrelated
        # AttributeError is never misreported as a bad predicate.
        try:
            attr = getattr(item, key)
        except AttributeError:
            click.echo(
                f'Invalid predicate! Items in "{manifest}" do not have the attribute "{key}"',
                err=True)
            sys.exit(1)
        if compare(attr, value):
            retained_items.append(item)

    # ``to_manifest`` yields None when there is nothing to wrap; in that case
    # no output file is written and the command still exits successfully.
    filtered_data_set = to_manifest(retained_items)
    if filtered_data_set is None:
        click.echo('No items satisfying the predicate.', err=True)
        sys.exit(0)
    filtered_data_set.to_json(output_manifest)
Example #4
0
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike,
          shuffle: bool):
    """
    Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR.

    Each part keeps the file name and extension(s) of MANIFEST, with the part number (starting from 1)
    inserted in between, e.g. "cuts.json" is saved as "cuts.1.json", "cuts.2.json", etc.
    """
    output_dir = Path(output_dir)
    manifest = Path(manifest)
    # All extensions taken together ('.json', '.json.gz', ...). Each entry of
    # ``suffixes`` already starts with a dot, so no extra dot is added below.
    suffix = ''.join(manifest.suffixes)
    # Bare file name: no directories (so the parts always land directly in
    # OUTPUT_DIR, even when MANIFEST is an absolute or nested path) and no
    # extensions (so multi-part extensions are preserved as a whole).
    base_name = manifest.name[:len(manifest.name) - len(suffix)]
    any_set = load_manifest(manifest)
    parts = any_set.split(num_splits=num_splits, shuffle=shuffle)
    output_dir.mkdir(parents=True, exist_ok=True)
    for idx, part in enumerate(parts, start=1):
        part.to_json(output_dir / f'{base_name}.{idx}{suffix}')
Example #5
0
def simple(
        output_cut_manifest: Pathlike,
        recording_manifest: Optional[Pathlike],
        feature_manifest: Optional[Pathlike],
        supervision_manifest: Optional[Pathlike],
):
    """
    Build a CutSet and store it in OUTPUT_CUT_MANIFEST. The CutSet may be created from any combination
    of recording, feature and supervision manifests, depending on which options were provided.
    At least one of RECORDING_MANIFEST and FEATURE_MANIFEST has to be provided.
    If SUPERVISION_MANIFEST is provided, the time span of the cuts will correspond to the supervision segments.
    Otherwise, the time span is taken from the features when they are available, and from the recordings if not.
    """

    def _load_if_given(path):
        # An option that was not provided simply stays None.
        return None if path is None else load_manifest(path)

    # Manifests are read in this order: supervisions, features, recordings.
    supervisions = _load_if_given(supervision_manifest)
    features = _load_if_given(feature_manifest)
    recordings = _load_if_given(recording_manifest)

    cuts = CutSet.from_manifests(
        recordings=recordings,
        supervisions=supervisions,
        features=features,
    )
    cuts.to_json(output_cut_manifest)
Example #6
0
def read_manifests_if_cached(
        dataset_parts: Optional[Sequence[str]],
        output_dir: Optional[Pathlike]
) -> Optional[Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """Load manifests from the disk if all of them exist in the specified paths.

    The manifests are searched for using the pattern `output_dir / f'{manifest}_{part}.json'`,
    where `manifest` is one of `["recordings", "supervisions"]` and `part` is specified in `dataset_parts`.
    This function is intended to speed up data preparation if it has already been done before.

    :param dataset_parts: names of the dataset parts to look for; ``None`` means there is nothing to look up.
    :param output_dir: directory with the manifests (a ``str`` or a ``Path``), or ``None`` when there is no cache.
    :return: a dict of the form ``{part: {'recordings': ..., 'supervisions': ...}}``, or ``None`` when
        ``output_dir`` or ``dataset_parts`` is ``None``, or when at least one expected file is missing.
    """
    # Local import keeps this function self-contained.
    from pathlib import Path

    if output_dir is None or dataset_parts is None:
        return None
    # The annotation allows plain strings, which do not support the ``/`` operator.
    output_dir = Path(output_dir)
    paths = {
        part: {
            manifest: output_dir / f'{manifest}_{part}.json'
            for manifest in ('recordings', 'supervisions')
        }
        for part in dataset_parts
    }
    # If one of the manifests is not available, assume we need to read and prepare everything
    # to simplify the rest of the code. All files are checked before anything is loaded,
    # so that no manifest is read only to be thrown away.
    if not all(path.is_file() for by_kind in paths.values() for path in by_kind.values()):
        return None
    return {
        part: {manifest: load_manifest(path) for manifest, path in by_kind.items()}
        for part, by_kind in paths.items()
    }
Example #7
0
def test_load_any_lhotse_manifest(path, exception_expectation):
    """Call ``load_manifest(path)`` inside the ``exception_expectation`` context manager.

    NOTE(review): both arguments are presumably supplied by a pytest parametrization
    that is not visible here (e.g. ``pytest.raises(...)`` for invalid paths and a
    no-op context for valid ones) - confirm against the decorator.
    """
    # The context manager alone decides whether an exception raised by the call
    # is expected; the loaded manifest itself is not inspected.
    with exception_expectation:
        load_manifest(path)
Example #8
0
def combine(manifests: Pathlike, output_manifest: Pathlike):
    """Read every manifest in MANIFESTS, merge them all into a single one, and save it as OUTPUT_MANIFEST."""
    # NOTE(review): MANIFESTS is annotated as a single path but is iterated
    # as a collection of paths - confirm the intended annotation.
    # Every input is loaded before the merge is attempted.
    loaded = [load_manifest(path) for path in manifests]
    merged = combine_manifests(*loaded)
    merged.to_json(output_manifest)