def trim_to_supervisions(
    cuts: Pathlike,
    output_cuts: Pathlike,
    keep_overlapping: bool,
    min_duration: Optional[float],
    context_direction: str,
):
    """
    Splits each input cut into as many cuts as there are supervisions.
    These cuts have identical start times and durations as the supervisions.
    When there are overlapping supervisions, they can be kept or discarded with options.

    \b
    For example, the following cut:
                Cut
    |-----------------|
     Sup1
    |----|  Sup2
       |-----------|

    \b
    is transformed into two cuts:
     Cut1
    |----|
     Sup1
    |----|
       Sup2
       |-|
            Cut2
       |-----------|
       Sup1
       |-|
            Sup2
       |-----------|
    """
    from lhotse.serialization import load_manifest_lazy_or_eager

    # Load the input manifest (lazily when possible) and stream each cut's
    # per-supervision sub-cuts straight into the output writer.
    source = load_manifest_lazy_or_eager(cuts)
    with CutSet.open_writer(output_cuts) as out:
        for cut in source:
            for trimmed in cut.trim_to_supervisions(
                keep_overlapping=keep_overlapping,
                min_duration=min_duration,
                context_direction=context_direction,
            ):
                out.write(trimmed)
def copy_feats(
    input_manifest: Pathlike,
    output_manifest: Pathlike,
    storage_path: str,
    storage_type: str,
    max_jobs: int,
) -> None:
    """
    Load INPUT_MANIFEST of type :class:`lhotse.FeatureSet` or `lhotse.CutSet`,
    read every feature matrix using ``features.load()`` or ``cut.load_features()``,
    save them in STORAGE_PATH and save the updated manifest to OUTPUT_MANIFEST.
    """
    from lhotse.serialization import load_manifest_lazy_or_eager
    from lhotse.manipulation import combine as combine_manifests

    manifests = load_manifest_lazy_or_eager(input_manifest)
    if isinstance(manifests, FeatureSet):
        # FeatureSet path: single writer, sequential copy.
        with get_writer(storage_type)(storage_path) as w:
            # FeatureSet is copied in-memory and written (TODO: make it incremental if needed)
            manifests = manifests.copy_feats(writer=w)
            manifests.to_file(output_manifest)
    elif isinstance(manifests, CutSet):
        # Group cuts by their underlying feature files.
        # Sorting by storage_path is required here: itertools.groupby only
        # groups *consecutive* items with equal keys.
        manifests = sorted(manifests, key=lambda cut: cut.features.storage_path)
        subsets = groupby(manifests, lambda cut: cut.features.storage_path)
        # NOTE(review): if the CutSet is empty, zip(*[]) raises
        # "not enough values to unpack" — confirm empty inputs can't occur here.
        unique_storage_paths, subsets = zip(
            *[(k, CutSet.from_cuts(grp)) for k, grp in subsets]
        )
        # Create paths for new feature files and subset cutsets.
        tot_items = len(unique_storage_paths)
        new_storage_paths = [f"{storage_path}/feats-{i}" for i in range(tot_items)]
        partial_manifest_paths = [
            f"{storage_path}/cuts-{i}.jsonl.gz" for i in range(tot_items)
        ]
        # One job per source feature file, optionally capped by max_jobs
        # (max_jobs <= 0 means "no cap").
        num_jobs = len(unique_storage_paths)
        if max_jobs > 0:
            num_jobs = min(num_jobs, max_jobs)
        # Create directory if needed (storage_path might be an URL)
        # — the parent.is_dir() guard skips mkdir for non-filesystem paths.
        if Path(storage_path).parent.is_dir():
            Path(storage_path).mkdir(exist_ok=True)
        # Copy each partition in parallel and combine lazily opened manifests.
        # copy_feats_worker is presumably defined elsewhere in this module —
        # it must be picklable (top-level) for ProcessPoolExecutor.submit.
        with ProcessPoolExecutor(num_jobs) as ex:
            futures = []
            for cs, nsp, pmp in zip(
                subsets, new_storage_paths, partial_manifest_paths
            ):
                futures.append(
                    ex.submit(copy_feats_worker, cs, nsp, storage_type, pmp)
                )
            # as_completed yields in completion order, so the order of cuts in
            # the combined output may vary between runs.
            all_cuts = combine_manifests((f.result() for f in as_completed(futures)))
        # Combine and save subset cutsets into the final file.
        with CutSet.open_writer(output_manifest) as w:
            for c in all_cuts:
                w.write(c)
    else:
        raise ValueError(
            f"Unsupported manifest type ({type(manifests)}) at: {input_manifest}"
        )