Esempio n. 1
0
 def _collect_batch(self) -> CutSet:
     """
     Return a sub-CutSet that represents a full batch.
     This is quick, as it does not perform any I/O in the process.

     Cuts are drawn in the order of ``self.cut_ids``, starting at ``self.current_idx``
     (which is advanced by one for every cut placed in the batch), for as long as the
     batch stays within ``self.max_frames`` and - when it is not None - ``self.max_cuts``.
     A first cut that violates the constraints on its own is returned as a single-cut
     batch, with a warning. When ``self.concat_cuts`` is set, the collected cuts are
     passed through ``concat_cuts`` before being returned.

     :return: a CutSet with the cuts selected for this batch.
     :raises StopIteration: when ``self.partition_end`` has been reached and no cut
         could be collected.
     """
     # Keep iterating the underlying CutSet as long as we do not exceed the constraints
     # provided by user (the max number of frames or max number of cuts).
     # Note: no actual data is loaded into memory yet because the manifests contain all the metadata
     # required to do this operation.
     num_frames = 0
     cuts = []
     while True:
         # Check that we have not reached the end of the dataset.
         if self.current_idx < self.partition_end:
             # We didn't - grab the next cut
             next_cut_id = self.cut_ids[self.current_idx]
         else:
             if cuts:
                 # We did and we have a partial batch - return it.
                 # NOTE: this path returns the cuts as collected, i.e. without
                 # the optional concatenation applied at the end of this method.
                 return CutSet.from_cuts(cuts)
             else:
                 # We did and there is nothing more to return - signal the iteration code to stop.
                 raise StopIteration()
         next_cut = self.cuts[next_cut_id]
         next_num_frames = num_frames + next_cut.num_frames
         next_num_cuts = len(cuts) + 1
         # Would adding this cut keep us within the max_frames and max_cuts constraints?
         if next_num_frames <= self.max_frames and (
                 self.max_cuts is None or next_num_cuts <= self.max_cuts):
             # Yes - add the next cut to the batch, and keep trying.
             num_frames = next_num_frames
             cuts.append(next_cut)
             self.current_idx += 1
         else:
             # No. Do we have at least one cut in the batch?
             if cuts:
                 # Yes. Return it.
                 break
             else:
                 # No. We'll warn the user that the constraints might be too tight,
                 # and return the cut anyway.
                 warnings.warn(
                     "The first cut drawn in batch collection violates the max_frames or max_cuts "
                     "constraints - we'll return it anyway. Consider increasing max_frames/max_cuts."
                 )
                 cuts.append(next_cut)
                 self.current_idx += 1
                 # The batch is already over budget: close it now. Without this, the loop
                 # would continue with num_frames still at 0 and keep packing further cuts
                 # on top of the oversized one.
                 break
     if self.concat_cuts:
         cuts = concat_cuts(cuts,
                            gap=self.concat_cuts_gap,
                            max_duration=self.concat_cuts_duration_factor *
                            cuts[0].duration)
     return CutSet.from_cuts(cuts)
Esempio n. 2
0
def test_cut_set_decompose():
    """
    Decompose a CutSet holding a single cut with two supervisions and check
    the returned recording, supervision and feature manifests.
    """
    supervisions = [
        dummy_supervision(0, start=0.0),
        dummy_supervision(1, start=6.5),
    ]
    cut = dummy_cut(0, start=5.0, duration=10.0, supervisions=supervisions)
    assert cut.start == 5.0
    assert cut.end == 15.0

    recs, sups, feats = CutSet.from_cuts([cut]).decompose()

    # Exactly one recording is expected for the single cut.
    assert isinstance(recs, RecordingSet)
    assert len(recs) == 1
    assert recs[0].id == "dummy-recording-0000"

    # The expected supervision times are offset by the cut's start (5.0).
    expected_supervisions = [
        ("dummy-segment-0000", 5.0, 6.0),
        ("dummy-segment-0001", 11.5, 12.5),
    ]
    assert isinstance(sups, SupervisionSet)
    assert len(sups) == 2
    for idx, (sup_id, sup_start, sup_end) in enumerate(expected_supervisions):
        segment = sups[idx]
        assert segment.id == sup_id
        assert segment.start == sup_start
        assert segment.end == sup_end

    assert isinstance(feats, FeatureSet)
    assert len(feats) == 1
Esempio n. 3
0
def test_mix_same_recording_channels():
    """
    Two cuts covering channels 0 and 1 of the same recording are expected
    to be merged into a single MixedCut with one track per input cut.
    """
    channel_sources = [
        AudioSource('file', channels=[0], source='irrelevant1.wav'),
        AudioSource('file', channels=[1], source='irrelevant2.wav'),
    ]
    recording = Recording(
        'rec',
        sampling_rate=8000,
        num_samples=30 * 8000,
        duration=30,
        sources=channel_sources,
    )
    first = Cut('cut1', start=0, duration=30, channel=0, recording=recording)
    second = Cut('cut2', start=0, duration=30, channel=1, recording=recording)
    cut_set = CutSet.from_cuts([first, second])

    mixed = cut_set.mix_same_recording_channels()
    assert len(mixed) == 1

    mixed_cut = mixed[0]
    assert isinstance(mixed_cut, MixedCut)
    assert len(mixed_cut.tracks) == 2
    # Track order must follow the order of the input cuts.
    for idx, track in enumerate(mixed_cut.tracks):
        assert track.cut == cut_set[idx]
Esempio n. 4
0
def random_mixed(supervision_manifest: Pathlike, feature_manifest: Pathlike,
                 output_cut_manifest: Pathlike, snr_range: Tuple[float, float],
                 offset_range: Tuple[float, float]):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST that contains supervision regions from SUPERVISION_MANIFEST
    and features supplied by FEATURE_MANIFEST. It first creates a trivial CutSet, splits it into two equal, randomized
    parts and mixes their features.
    The parameters of the mix are controlled via SNR_RANGE and OFFSET_RANGE.
    """
    source_cut_set = CutSet.from_manifests(
        supervisions=SupervisionSet.from_json(supervision_manifest),
        features=FeatureSet.from_json(feature_manifest),
    )
    left_cuts, right_cuts = source_cut_set.split(num_splits=2, shuffle=True)

    # One SNR and one relative offset are drawn per cut of the left half;
    # the SNRs are sampled first, then the offsets.
    num_draws = len(left_cuts)
    snrs = np.random.uniform(*snr_range, size=num_draws).tolist()
    relative_offsets = np.random.uniform(*offset_range, size=num_draws).tolist()

    # Pair up the halves position-wise; the offset of the right cut is expressed
    # as a fraction of the left cut's duration.
    mixed_cuts = []
    for left_cut, right_cut, snr, relative_offset in zip(
            left_cuts, right_cuts, snrs, relative_offsets):
        offset = left_cut.duration * relative_offset
        mixed_cuts.append(left_cut.mix(right_cut, offset_other_by=offset, snr=snr))

    CutSet.from_cuts(mixed_cuts).to_json(output_cut_manifest)
def k2_cut_set(libri_cut_set):
    # Build a cut set with 4 cuts derived from the first cut of libri_cut_set:
    # the cut itself, two copies with new IDs, and the cut appended to itself
    # (the one with two supervisions).
    base_cut = libri_cut_set[0]
    members = [
        base_cut,
        base_cut.with_id('copy-1'),
        base_cut.with_id('copy-2'),
        base_cut.append(base_cut),
    ]
    return CutSet.from_cuts(members)
Esempio n. 6
0
def to_manifest(items: Iterable[ManifestItem]) -> Optional[Manifest]:
    """
    Take an iterable of data types in Lhotse such as Recording, SupervisionSegment or Cut, and create the manifest of
    the corresponding type. When the iterable is empty, returns None.

    The manifest type is chosen by inspecting the first item only.

    :raises ValueError: when the first item is a Features object (a FeatureSet cannot be
        built generically), or when its type is not recognized.
    """
    iterator = iter(items)
    # Peek at the first element; a private sentinel tells "empty" apart from any real item.
    nothing = object()
    head = next(iterator, nothing)
    if head is nothing:
        return None
    # Put the peeked element back in front of the remaining ones.
    all_items = chain([head], iterator)

    if isinstance(head, Recording):
        return RecordingSet.from_recordings(all_items)
    elif isinstance(head, SupervisionSegment):
        return SupervisionSet.from_segments(all_items)
    elif isinstance(head, (Cut, MixedCut)):
        return CutSet.from_cuts(all_items)
    elif isinstance(head, Features):
        raise ValueError(
            "FeatureSet generic construction from iterable is not possible, as the config information "
            "would have been lost. Call FeatureSet.from_features() directly instead."
        )
    else:
        raise ValueError(f"Unknown type of manifest item: {head}")
Esempio n. 7
0
def mixed_overlapping_cut_set():
    """
    Build a one-element CutSet whose only item is a MixedCut made by mixing
    two MonoCuts, the second one offset by 15 seconds.

    Input mixed cut::
        |---------------mixedcut--------------------|
        |--------rec1 0-30s--------|
                     |-------rec2 15-45s--------|
         |---sup1--|         |-----sup3-----|
                 |sup2|
    """
    first_cut = MonoCut(
        'cut1', start=0, duration=30, channel=0,
        recording=Recording(
            id='rec1', sources=[], sampling_rate=16000, num_samples=160000, duration=60.0
        ),
        supervisions=[
            SupervisionSegment('sup1', 'rec1', start=1.5, duration=10.5),
            SupervisionSegment('sup2', 'rec1', start=10, duration=6),
        ]
    )
    second_cut = MonoCut(
        'cut2', start=15, duration=30, channel=0,
        recording=Recording(
            id='rec2', sources=[], sampling_rate=16000, num_samples=160000, duration=60.0
        ),
        supervisions=[
            SupervisionSegment('sup3', 'rec2', start=8, duration=18),
        ]
    )
    mixed = first_cut.mix(second_cut, offset_other_by=15.0)
    cut_set = CutSet.from_cuts([mixed])
    # Sanity check: mixing must have produced a MixedCut.
    assert isinstance(cut_set[0], MixedCut)
    return cut_set
Esempio n. 8
0
def test_trim_to_unsupervised_segments():
    """
    Trimming to unsupervised segments should return only the regions of the
    cuts that are not covered by any supervision.
    """
    # Yields 3 unsupervised cuts - before first supervision,
    # between sup2 and sup3, and after sup3.
    partially_supervised = Cut('cut1', start=0, duration=30, channel=0, supervisions=[
        SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
        SupervisionSegment('sup2', 'rec1', start=10, duration=5),
        SupervisionSegment('sup3', 'rec1', start=20, duration=8),
    ])
    # Does not yield any "unsupervised" cut.
    fully_supervised = Cut('cut2', start=0, duration=30, channel=0, supervisions=[
        SupervisionSegment('sup4', 'rec1', start=0, duration=30),
    ])
    cut_set = CutSet.from_cuts([partially_supervised, fully_supervised])

    unsupervised_cuts = cut_set.trim_to_unsupervised_segments()

    # (start, duration) of each expected gap, in order.
    expected_gaps = [(0, 1.5), (15, 5), (28, 2)]
    assert len(unsupervised_cuts) == 3
    for idx, (gap_start, gap_duration) in enumerate(expected_gaps):
        gap = unsupervised_cuts[idx]
        assert gap.start == gap_start
        assert gap.duration == gap_duration
        assert gap.supervisions == []
Esempio n. 9
0
def test_mixed_cut_set_prefix(cut_with_relative_paths):
    """
    Path prefixes applied to a CutSet of mixed cuts are expected to reach the
    recording sources and feature storage paths of every track.
    """
    mixed = cut_with_relative_paths.mix(cut_with_relative_paths)
    cut_set = CutSet.from_cuts([mixed])

    with_recording_prefix = cut_set.with_recording_path_prefix('/data')
    for mixed_cut in with_recording_prefix:
        for track in mixed_cut.tracks:
            assert track.cut.recording.sources[0].source == '/data/audio.wav'

    with_features_prefix = cut_set.with_features_path_prefix('/data')
    for mixed_cut in with_features_prefix:
        for track in mixed_cut.tracks:
            assert track.cut.features.storage_path == '/data/storage_dir'
Esempio n. 10
0
def test_trim_to_supervisions_mixed_cuts():
    """
    Trim a MixedCut (two cuts appended to each other) to its supervisions and
    check that the result consists of four simple cuts, one per supervision.
    """
    first_cut = Cut(
        'cut1',
        start=0,
        duration=30,
        channel=0,
        recording=Recording(id='rec1',
                            sources=[],
                            sampling_rate=16000,
                            num_samples=160000,
                            duration=10.0),
        supervisions=[
            SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
            SupervisionSegment('sup2', 'rec1', start=10, duration=5),
            SupervisionSegment('sup3', 'rec1', start=20, duration=8),
        ])
    second_cut = Cut(
        'cut2',
        start=0,
        duration=30,
        channel=0,
        recording=Recording(id='rec1',
                            sources=[],
                            sampling_rate=16000,
                            num_samples=160000,
                            duration=10.0),
        supervisions=[
            SupervisionSegment('sup4', 'rec1', start=0, duration=30),
        ])
    cut_set = CutSet.from_cuts([first_cut.append(second_cut)])
    assert isinstance(cut_set[0], MixedCut)

    cuts = cut_set.trim_to_supervisions()
    assert len(cuts) == 4

    # After "trimming", the MixedCut "decayed" into simple, unmixed cuts, as they did not overlap
    for cut in cuts:
        assert isinstance(cut, Cut)
    for cut in cuts:
        assert len(cut.supervisions) == 1
    for cut in cuts:
        assert cut.supervisions[0].start == 0

    # Check that the cuts preserved their start/duration/supervisions after trimming:
    # (start, duration, supervision id) per resulting cut, in order.
    expected = [
        (1.5, 8.5, 'sup1'),
        (10, 5, 'sup2'),
        (20, 8, 'sup3'),
        (0, 30, 'sup4'),
    ]
    for idx, (start, duration, sup_id) in enumerate(expected):
        cut = cuts[idx]
        assert cut.start == start
        assert cut.duration == duration
        assert cut.supervisions[0].id == sup_id
Esempio n. 11
0
def mix_sequential(cut_manifests: List[Pathlike], output_cut_manifest: Pathlike):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST by iterating jointly over CUT_MANIFESTS and mixing the Cuts
    on the same positions. E.g. the first output cut is created from the first cuts in each input manifest.
    The mix is performed by summing the features from all Cuts.
    If the CUT_MANIFESTS have different number of Cuts, the mixing ends when the shorter manifest is depleted.
    """
    cut_sets = [CutSet.from_json(manifest_path) for manifest_path in cut_manifests]
    # zip() stops at the shortest CutSet, which gives the truncation described above.
    cuts_at_same_position = zip(*cut_sets)
    mixed_cuts = (mix_cuts(position_cuts) for position_cuts in cuts_at_same_position)
    CutSet.from_cuts(mixed_cuts).to_file(output_cut_manifest)
Esempio n. 12
0
def mix_by_recording_id(cut_manifests: List[Pathlike],
                        output_cut_manifest: Pathlike):
    """
    Create a CutSet stored in OUTPUT_CUT_MANIFEST by matching the Cuts from CUT_MANIFESTS by their recording IDs
    and mixing them together.
    """
    loaded_cut_sets = [CutSet.from_json(manifest_path) for manifest_path in cut_manifests]
    all_cuts = combine(*loaded_cut_sets)
    # Mapping: recording ID -> cuts that refer to that recording.
    recording_id_to_cuts = groupby(lambda cut: cut.recording_id, all_cuts)
    # Only the grouped cuts are needed from here on; the IDs themselves are not used.
    mixed_cuts = (mix_cuts(same_recording_cuts)
                  for same_recording_cuts in recording_id_to_cuts.values())
    CutSet.from_cuts(mixed_cuts).to_json(output_cut_manifest)
Esempio n. 13
0
def append(
    cut_manifests: List[Pathlike],
    output_cut_manifest: Pathlike,
):
    """
    Create a new CutSet by appending the cuts in CUT_MANIFESTS. CUT_MANIFESTS are iterated position-wise (the
    cuts on i'th position in each manifest are appended to each other).
    The cuts are appended in the order in which they appear in the
    input argument list.
    If CUT_MANIFESTS have different lengths, the script stops once the shortest CutSet is depleted.
    """
    loaded_cut_sets = [CutSet.from_file(manifest_path) for manifest_path in cut_manifests]
    # zip() yields one tuple of cuts per position and ends with the shortest CutSet.
    cuts_at_same_position = zip(*loaded_cut_sets)
    appended_cuts = (append_cuts(position_cuts) for position_cuts in cuts_at_same_position)
    CutSet.from_cuts(appended_cuts).to_file(output_cut_manifest)
Esempio n. 14
0
def DummyManifest(type_: Type, *, begin_id: int, end_id: int) -> Manifest:
    """
    Create a manifest of the requested type, filled with dummy items built for
    every index in ``range(begin_id, end_id)``.

    :param type_: one of RecordingSet, SupervisionSet, FeatureSet or CutSet.
    :param begin_id: first index passed to the dummy item factory (inclusive).
    :param end_id: end of the index range (exclusive).
    :return: the populated manifest; for any other ``type_`` nothing is built
        and None is returned.
    """
    indices = range(begin_id, end_id)
    if type_ == RecordingSet:
        return RecordingSet.from_recordings(dummy_recording(idx) for idx in indices)
    elif type_ == SupervisionSet:
        return SupervisionSet.from_segments(dummy_supervision(idx) for idx in indices)
    elif type_ == FeatureSet:
        # noinspection PyTypeChecker
        return FeatureSet.from_features(dummy_features(idx) for idx in indices)
    elif type_ == CutSet:
        # noinspection PyTypeChecker
        return CutSet.from_cuts(dummy_cut(idx) for idx in indices)
    return None
Esempio n. 15
0
def test_cut_set_decompose_doesnt_duplicate_recording():
    """
    Two cuts sharing one recording ID should decompose into a single recording,
    no supervisions, and one features entry per cut.
    """
    first = dummy_cut(0)
    second = dummy_cut(0)
    # override cut ID, retain identical recording ID as `first`
    second.id = "dummy-cut-0001"

    recs, sups, feats = CutSet.from_cuts([first, second]).decompose()

    # deduplicated recording
    assert isinstance(recs, RecordingSet)
    assert len(recs) == 1
    assert recs[0].id == "dummy-recording-0000"

    assert sups is None

    # not deduplicated features
    assert isinstance(feats, FeatureSet)
    assert len(feats) == 2
Esempio n. 16
0
def test_cut_set_decompose_output_dir_doesnt_duplicate_recording():
    """
    Decompose two cuts sharing one recording ID into a directory and check
    that the stored recordings manifest holds that recording only once.
    """
    first = dummy_cut(0)
    second = dummy_cut(0)
    # override cut ID, retain identical recording ID as `first`
    second.id = "dummy-cut-0001"
    cuts = CutSet.from_cuts([first, second])

    with TemporaryDirectory() as tmp_dir:
        out_dir = Path(tmp_dir)
        cuts.decompose(output_dir=out_dir)
        recordings_path = out_dir / "recordings.jsonl.gz"

        # Dump the raw JSONL content of the stored manifest.
        text = load_jsonl(recordings_path)
        print(list(text))

        recs = load_manifest(recordings_path)
        # deduplicated recording
        assert isinstance(recs, RecordingSet)
        assert len(recs) == 1
        assert recs[0].id == "dummy-recording-0000"
Esempio n. 17
0
def test_trim_to_unsupervised_segments():
    """
    Trimming to unsupervised segments should return only the regions of the
    cuts that are not covered by any supervision.
    """
    # Yields 3 unsupervised cuts - before first supervision,
    # between sup2 and sup3, and after sup3.
    partially_supervised = MonoCut(
        "cut1",
        start=0,
        duration=30,
        channel=0,
        supervisions=[
            SupervisionSegment("sup1", "rec1", start=1.5, duration=8.5),
            SupervisionSegment("sup2", "rec1", start=10, duration=5),
            SupervisionSegment("sup3", "rec1", start=20, duration=8),
        ],
        recording=dummy_recording(1, duration=30),
    )
    # Does not yield any "unsupervised" cut.
    fully_supervised = MonoCut(
        "cut2",
        start=0,
        duration=30,
        channel=0,
        supervisions=[
            SupervisionSegment("sup4", "rec1", start=0, duration=30),
        ],
        recording=dummy_recording(2, duration=30),
    )
    cut_set = CutSet.from_cuts([partially_supervised, fully_supervised])

    unsupervised_cuts = cut_set.trim_to_unsupervised_segments()

    # (start, duration) of each expected gap, in order.
    expected_gaps = [(0, 1.5), (15, 5), (28, 2)]
    assert len(unsupervised_cuts) == 3
    for idx, (gap_start, gap_duration) in enumerate(expected_gaps):
        gap = unsupervised_cuts[idx]
        assert gap.start == gap_start
        assert gap.duration == gap_duration
        assert gap.supervisions == []
Esempio n. 18
0
 def __getitem__(self, cuts: CutSet) -> Dict[str, Any]:
     """
     Load the audio for a batch of cuts.

     When ``self.collate`` is truthy, the audio of all cuts is collated with
     ``collate_audio`` and returned under "audio" together with "audio_lens".
     Otherwise the audio is loaded cut by cut and returned as a list; a cut is
     kept only once its ``load_audio()`` call has returned, so - presumably -
     cuts whose loading error is suppressed by ``suppress_audio_loading_errors``
     are left out of both the returned "cuts" and "audio".

     :param cuts: the cuts that make up the batch.
     :return: a dict with "cuts" and "audio" (plus "audio_lens" when collating).
     """
     if not self.collate:
         loaded_cuts = []
         loaded_audios = []
         for cut in cuts:
             with suppress_audio_loading_errors():
                 # The cut is recorded only after its audio was loaded, which
                 # keeps both lists aligned.
                 loaded_audios.append(cut.load_audio())
                 loaded_cuts.append(cut)
         return {
             "cuts": CutSet.from_cuts(loaded_cuts),
             "audio": loaded_audios
         }

     audio, audio_lens = collate_audio(cuts)
     return {
         "cuts": cuts,
         "audio": audio,
         "audio_lens": audio_lens,
     }
Esempio n. 19
0
def test_cut_set_decompose_output_dir():
    """
    Decompose a CutSet into a directory and check that each returned manifest
    matches the corresponding manifest file stored in that directory.
    """
    supervisions = [
        dummy_supervision(0, start=0.0),
        dummy_supervision(1, start=6.5),
    ]
    cut = dummy_cut(0, start=5.0, duration=10.0, supervisions=supervisions)
    assert cut.start == 5.0
    assert cut.end == 15.0
    cuts = CutSet.from_cuts([cut])

    with TemporaryDirectory() as tmp_dir:
        out_dir = Path(tmp_dir)
        recs, sups, feats = cuts.decompose(output_dir=out_dir)
        # Each in-memory manifest is compared with the file it should have been saved to.
        stored_as = [
            (recs, "recordings.jsonl.gz"),
            (sups, "supervisions.jsonl.gz"),
            (feats, "features.jsonl.gz"),
        ]
        for manifest, filename in stored_as:
            assert list(manifest) == list(load_manifest(out_dir / filename))
Esempio n. 20
0
 def __getitem__(self, cuts: CutSet) -> Dict[str, Any]:
     """
     Load the audio for a batch of cuts.

     When ``self.collate`` is truthy, the audio of all cuts is collated with
     ``collate_audio`` and returned under "audio" together with "audio_lens".
     Otherwise the audio is loaded cut by cut and returned as a list; a cut is
     kept only once its ``load_audio()`` call has returned, so - presumably -
     cuts that hit one of the errors handed to ``suppress_and_warn`` are left
     out of both the returned "cuts" and "audio".

     :param cuts: the cuts that make up the batch.
     :return: a dict with "cuts" and "audio" (plus "audio_lens" when collating).
     """
     if not self.collate:
         loaded_cuts = []
         loaded_audios = []
         for cut in cuts:
             with suppress_and_warn(AudioLoadingError,
                                    DurationMismatchError,
                                    NonPositiveEnergyError):
                 # The cut is recorded only after its audio was loaded, which
                 # keeps both lists aligned.
                 loaded_audios.append(cut.load_audio())
                 loaded_cuts.append(cut)
         return {
             "cuts": CutSet.from_cuts(loaded_cuts),
             "audio": loaded_audios
         }

     audio, audio_lens = collate_audio(cuts)
     return {
         "cuts": cuts,
         "audio": audio,
         "audio_lens": audio_lens,
     }
Esempio n. 21
0
def test_trim_to_supervisions_mixed_cuts():
    """
    Trim a cut built by appending two cuts to its supervisions and check that
    four MixedCuts come out, one per supervision, each starting at 0.
    """
    first_cut = Cut(
        'cut1',
        start=0,
        duration=30,
        channel=0,
        supervisions=[
            SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
            SupervisionSegment('sup2', 'rec1', start=10, duration=5),
            SupervisionSegment('sup3', 'rec1', start=20, duration=8),
        ])
    second_cut = Cut(
        'cut2',
        start=0,
        duration=30,
        channel=0,
        supervisions=[
            SupervisionSegment('sup4', 'rec1', start=0, duration=30),
        ])
    cut_set = CutSet.from_cuts([first_cut.append(second_cut)])

    cuts = cut_set.trim_to_supervisions()
    assert len(cuts) == 4

    # Properties shared by every trimmed cut.
    for cut in cuts:
        assert isinstance(cut, MixedCut)
    for cut in cuts:
        assert cut.start == 0
    for cut in cuts:
        assert len(cut.supervisions) == 1
    for cut in cuts:
        assert cut.supervisions[0].start == 0

    # (duration, supervision id) per trimmed cut, in order.
    expected = [(8.5, 'sup1'), (5, 'sup2'), (8, 'sup3'), (30, 'sup4')]
    for idx, (duration, sup_id) in enumerate(expected):
        cut = cuts[idx]
        assert cut.duration == duration
        assert cut.supervisions[0].id == sup_id
Esempio n. 22
0
def test_mix_same_recording_channels():
    """
    Two cuts covering channels 0 and 1 of the same recording are expected
    to be merged into a single MixedCut with one track per input cut.
    """
    channel_sources = [
        AudioSource("file", channels=[0], source="irrelevant1.wav"),
        AudioSource("file", channels=[1], source="irrelevant2.wav"),
    ]
    recording = Recording(
        "rec",
        sampling_rate=8000,
        num_samples=30 * 8000,
        duration=30,
        sources=channel_sources,
    )
    first = MonoCut("cut1", start=0, duration=30, channel=0, recording=recording)
    second = MonoCut("cut2", start=0, duration=30, channel=1, recording=recording)
    cut_set = CutSet.from_cuts([first, second])

    mixed = cut_set.mix_same_recording_channels()
    assert len(mixed) == 1

    mixed_cut = mixed[0]
    assert isinstance(mixed_cut, MixedCut)
    assert len(mixed_cut.tracks) == 2
    # Track order must follow the order of the input cuts.
    for idx, track in enumerate(mixed_cut.tracks):
        assert track.cut == cut_set[idx]
Esempio n. 23
0
def test_trim_to_supervisions_simple_cuts(keep_overlapping, num_jobs):
    """
    Trim two simple cuts to their supervisions and verify the four resulting cuts.

    ``keep_overlapping`` and ``num_jobs`` are forwarded to ``trim_to_supervisions``
    (presumably supplied through test parametrization that is not visible here).
    sup1 and sup2 overlap, so the first two resulting cuts are expected to carry
    both supervisions when ``keep_overlapping`` is set, and one otherwise.
    """

    def check_cut(cut, start, duration, expected_sups):
        # expected_sups holds (id, start, duration) for each supervision, in order.
        assert cut.start == start
        assert cut.duration == duration
        assert len(cut.supervisions) == len(expected_sups)
        for sup, (sup_id, sup_start, sup_duration) in zip(cut.supervisions, expected_sups):
            assert sup.id == sup_id
            assert sup.start == sup_start
            assert sup.duration == sup_duration

    first_cut = MonoCut(
        "cut1",
        start=0,
        duration=30,
        channel=0,
        supervisions=[
            SupervisionSegment("sup1", "rec1", start=1.5, duration=10.5),
            SupervisionSegment("sup2", "rec1", start=10, duration=5),
            SupervisionSegment("sup3", "rec1", start=20, duration=8),
        ],
        recording=dummy_recording(1, duration=30),
    )
    second_cut = MonoCut(
        "cut2",
        start=0,
        duration=30,
        channel=0,
        supervisions=[
            SupervisionSegment("sup4", "rec1", start=0, duration=30),
        ],
        recording=dummy_recording(2, duration=30),
    )
    cut_set = CutSet.from_cuts([first_cut, second_cut])

    cuts = cut_set.trim_to_supervisions(keep_overlapping=keep_overlapping,
                                        num_jobs=num_jobs)
    assert len(cuts) == 4

    # Note: expected results diverge here depending on the value of keep_overlapping flag
    if keep_overlapping:
        sups_of_first = [("sup1", 0, 10.5), ("sup2", 8.5, 5)]
        sups_of_second = [("sup1", -8.5, 10.5), ("sup2", 0, 5)]
    else:
        sups_of_first = [("sup1", 0, 10.5)]
        sups_of_second = [("sup2", 0, 5)]
    check_cut(cuts[0], start=1.5, duration=10.5, expected_sups=sups_of_first)
    check_cut(cuts[1], start=10, duration=5, expected_sups=sups_of_second)

    # Note: both test cases have same results
    # (index, start, duration, supervision id) of the remaining cuts.
    for idx, start, duration, sup_id in [(2, 20, 8, "sup3"), (3, 0, 30, "sup4")]:
        cut = cuts[idx]
        assert len(cut.supervisions) == 1
        assert cut.start == start
        assert cut.duration == duration
        assert cut.supervisions[0].id == sup_id
def k2_noise_cut_set(libri_cut_set):
    # Create a cut set with 2 "noise" cuts, both derived from the first cut of
    # libri_cut_set: each gets a new ID and is truncated to a different duration.
    base_cut = libri_cut_set[0]
    noise_short = base_cut.with_id("noise-1").truncate(duration=3.5)
    noise_long = base_cut.with_id("noise-2").truncate(duration=7.3)
    return CutSet.from_cuts([noise_short, noise_long])
Esempio n. 25
0
def copy_feats(
    input_manifest: Pathlike,
    output_manifest: Pathlike,
    storage_path: str,
    storage_type: str,
    max_jobs: int,
) -> None:
    """
    Load INPUT_MANIFEST of type :class:`lhotse.FeatureSet` or `lhotse.CutSet`,
    read every feature matrix using ``features.load()`` or ``cut.load_features()``,
    save them in STORAGE_PATH and save the updated manifest to OUTPUT_MANIFEST.
    An empty CutSet results in an empty OUTPUT_MANIFEST.
    """
    # storage_type selects the feature writer (looked up with get_writer);
    # max_jobs caps the number of worker processes when it is greater than 0.
    # A ValueError is raised when the manifest is neither a FeatureSet nor a CutSet.
    from lhotse.serialization import load_manifest_lazy_or_eager
    from lhotse.manipulation import combine as combine_manifests

    manifests = load_manifest_lazy_or_eager(input_manifest)

    if isinstance(manifests, FeatureSet):
        with get_writer(storage_type)(storage_path) as w:
            # FeatureSet is copied in-memory and written (TODO: make it incremental if needed)
            manifests = manifests.copy_feats(writer=w)
            manifests.to_file(output_manifest)

    elif isinstance(manifests, CutSet):
        # Group cuts by their underlying feature files.
        # Sorting first is required: groupby only merges adjacent items with equal keys.
        manifests = sorted(manifests,
                           key=lambda cut: cut.features.storage_path)

        if not manifests:
            # Nothing to copy. Without this guard the zip(*...) unpacking below fails
            # with an opaque "not enough values to unpack" ValueError, and the process
            # pool would be asked for zero workers. Write an empty manifest instead.
            with CutSet.open_writer(output_manifest):
                pass
            return

        subsets = groupby(manifests, lambda cut: cut.features.storage_path)
        unique_storage_paths, subsets = zip(*[(k, CutSet.from_cuts(grp))
                                              for k, grp in subsets])

        # Create paths for new feature files and subset cutsets.
        tot_items = len(unique_storage_paths)
        new_storage_paths = [
            f"{storage_path}/feats-{i}" for i in range(tot_items)
        ]
        partial_manifest_paths = [
            f"{storage_path}/cuts-{i}.jsonl.gz" for i in range(tot_items)
        ]

        # One job per original storage file, optionally capped by max_jobs.
        num_jobs = len(unique_storage_paths)
        if max_jobs > 0:
            num_jobs = min(num_jobs, max_jobs)

        # Create directory if needed (storage_path might be an URL)
        if Path(storage_path).parent.is_dir():
            Path(storage_path).mkdir(exist_ok=True)

        # Copy each partition in parallel and combine lazily opened manifests.
        with ProcessPoolExecutor(num_jobs) as ex:
            futures = []
            for cs, nsp, pmp in zip(subsets, new_storage_paths,
                                    partial_manifest_paths):
                futures.append(
                    ex.submit(copy_feats_worker, cs, nsp, storage_type, pmp))

            all_cuts = combine_manifests(
                (f.result() for f in as_completed(futures)))

        # Combine and save subset cutsets into the final file.
        with CutSet.open_writer(output_manifest) as w:
            for c in all_cuts:
                w.write(c)
    else:
        raise ValueError(
            f"Unsupported manifest type ({type(manifests)}) at: {input_manifest}"
        )
Esempio n. 26
0
def test_cut_set_prefix(cut_with_relative_paths):
    """
    Path prefixes applied to a CutSet are expected to show up in the recording
    source path and in the features storage path of its cuts.
    """
    cut_set = CutSet.from_cuts([cut_with_relative_paths])

    with_recording_prefix = cut_set.with_recording_path_prefix('/data')
    for cut in with_recording_prefix:
        assert cut.recording.sources[0].source == '/data/audio.wav'

    with_features_prefix = cut_set.with_features_path_prefix('/data')
    for cut in with_features_prefix:
        assert cut.features.storage_path == '/data/storage_dir'
Esempio n. 27
0
def cut_set(cut1, cut2):
    """Return a CutSet made of ``cut1`` and ``cut2``, in that order."""
    members = [cut1, cut2]
    return CutSet.from_cuts(members)
Esempio n. 28
0
def cut_set_with_mixed_cut(cut1, cut2, mixed_cut):
    """Return a CutSet made of ``cut1``, ``cut2`` and ``mixed_cut``, in that order."""
    members = [cut1, cut2, mixed_cut]
    return CutSet.from_cuts(members)