def test_mixed_cut_fill_supervision_raises_on_two_supervisions():
    """``fill_supervision()`` must raise on a padded cut with two supervisions.

    A dummy cut carrying two supervisions is padded to a duration of 10 and
    the call is expected to fail with ``AssertionError``.
    """
    cut = dummy_cut(0, supervisions=[dummy_supervision(0), dummy_supervision(1)])
    cut = cut.pad(duration=10)
    # Only the raised exception matters here, so the (never produced) return
    # value is intentionally not bound to a name.
    with pytest.raises(AssertionError):
        cut.fill_supervision()
def test_cut_set_decompose():
    """Decompose a one-cut ``CutSet`` and inspect the three returned manifests.

    The cut is created with ``start=5.0`` and supervisions created with
    ``start`` 0.0 and 6.5; after ``decompose()`` those supervisions are
    expected to report ``start`` 5.0 and 11.5 respectively.
    """
    segments = [dummy_supervision(0, start=0.0), dummy_supervision(1, start=6.5)]
    cut = dummy_cut(0, start=5.0, duration=10.0, supervisions=segments)
    assert cut.start == 5.0
    assert cut.end == 15.0

    recordings, supervisions, features = CutSet.from_cuts([cut]).decompose()

    # Recordings: exactly one, coming from the dummy cut.
    assert isinstance(recordings, RecordingSet)
    assert len(recordings) == 1
    assert recordings[0].id == "dummy-recording-0000"

    # Supervisions: both present, checked in order as (id, start, end).
    assert isinstance(supervisions, SupervisionSet)
    assert len(supervisions) == 2
    expected_segments = [
        ("dummy-segment-0000", 5.0, 6.0),
        ("dummy-segment-0001", 11.5, 12.5),
    ]
    for position, (seg_id, seg_start, seg_end) in enumerate(expected_segments):
        assert supervisions[position].id == seg_id
        assert supervisions[position].start == seg_start
        assert supervisions[position].end == seg_end

    # Features: exactly one entry.
    assert isinstance(features, FeatureSet)
    assert len(features) == 1
def cutset():
    """Return a ``CutSet`` built from three cuts: a dummy cut, a ``PaddingCut`` and a mix."""
    # MonoCut
    mono = dummy_cut(0, supervisions=[dummy_supervision(0)])

    # PaddingCut
    padding = PaddingCut(
        'pad',
        duration=1.0,
        sampling_rate=16000,
        feat_value=-100,
        num_frames=100,
        frame_shift=0.01,
        num_features=80,
        num_samples=16000,
    )

    # MixedCut
    base = dummy_cut(0, supervisions=[dummy_supervision(0)])
    mixed = base.mix(
        dummy_cut(1, supervisions=[dummy_supervision(1)]),
        offset_other_by=0.5,
        snr=10,
    )

    return CutSet.from_cuts([mono, padding, mixed])
def test_token_collater(add_bos, add_eos):
    """Round-trip three sentences through ``TokenCollater`` and its ``inverse``.

    Checks the tensor types, the batch shape, the reported lengths (each
    sentence length plus one per enabled ``add_bos`` / ``add_eos`` flag) and
    that ``inverse`` gives back the input sentences.
    """
    sentences = [
        "Testing the first sentence.",
        "Let's add some more punctuation, shall we?",
        "How about number 42!",
    ]
    cuts = CutSet.from_cuts(
        dummy_cut(i, i, supervisions=[dummy_supervision(i, i, text=text)])
        for i, text in enumerate(sentences)
    )

    collater = TokenCollater(cuts, add_bos=add_bos, add_eos=add_eos)
    batch, lengths = collater(cuts)

    assert isinstance(batch, torch.LongTensor)
    assert isinstance(lengths, torch.IntTensor)

    # One extra position for each of the two optional flags that is enabled.
    num_extra = int(add_bos) + int(add_eos)
    longest = max(map(len, sentences)) + num_extra
    assert batch.shape == (len(sentences), longest)

    expected_lengths = torch.IntTensor([len(text) + num_extra for text in sentences])
    assert torch.all(lengths == expected_lengths)

    assert collater.inverse(batch, lengths) == sentences
def dummy_cut_with_supervisions():
    """Return dummy cut 0 holding six supervisions whose id and duration run from 1 to 6."""
    supervisions = []
    for n in range(1, 7):
        supervisions.append(dummy_supervision(unique_id=n, duration=n))
    return dummy_cut(unique_id=0, supervisions=supervisions)
def test_mixed_cut_fill_supervision_expand():
    """After padding to 7.51, ``fill_supervision()`` stretches the supervision in its result only."""
    padded = dummy_cut(0, supervisions=[dummy_supervision(0)]).pad(duration=7.51)
    filled = padded.fill_supervision()

    # Original is not modified
    untouched = padded.supervisions[0]
    assert untouched.start == 0
    assert untouched.duration == 1

    # Result is modified
    stretched = filled.supervisions[0]
    assert stretched.start == 0
    assert stretched.duration == 7.51
def test_mono_cut_fill_supervision_shrink():
    """With ``shrink_ok=True`` the returned cut's supervision is shortened to the cut's 0.5 duration."""
    source = dummy_cut(0, supervisions=[dummy_supervision(0)])
    # Make the cut shorter than its supervision by overwriting its duration.
    source.duration = 0.5
    shrunk = source.fill_supervision(shrink_ok=True)

    # Original is not modified
    untouched = source.supervisions[0]
    assert untouched.start == 0
    assert untouched.duration == 1

    # Result is modified
    shortened = shrunk.supervisions[0]
    assert shortened.start == 0
    assert shortened.duration == 0.5
def test_cut_set_decompose_output_dir():
    """Manifests written by ``decompose(output_dir=...)`` reload equal to the returned ones."""
    segments = [dummy_supervision(0, start=0.0), dummy_supervision(1, start=6.5)]
    cut = dummy_cut(0, start=5.0, duration=10.0, supervisions=segments)
    assert cut.start == 5.0
    assert cut.end == 15.0

    cuts = CutSet.from_cuts([cut])
    with TemporaryDirectory() as tmp:
        out_dir = Path(tmp)
        recordings, supervisions, features = cuts.decompose(output_dir=out_dir)

        # Each returned manifest is compared with the file read back from disk.
        written = (
            (recordings, "recordings.jsonl.gz"),
            (supervisions, "supervisions.jsonl.gz"),
            (features, "features.jsonl.gz"),
        )
        for manifest, filename in written:
            assert list(manifest) == list(load_manifest(out_dir / filename))
def test_mixed_cut_fill_supervision_shrink():
    """With ``shrink_ok=True`` a mixed-then-truncated cut gets its supervision shortened to 0.5."""
    base = dummy_cut(0, supervisions=[dummy_supervision(0)])
    mixed = base.mix(dummy_cut(1))  # cuts are 100% overlapping
    truncated = mixed.truncate(duration=0.5)
    shrunk = truncated.fill_supervision(shrink_ok=True)

    # Original is not modified
    untouched = truncated.supervisions[0]
    assert untouched.start == 0
    assert untouched.duration == 1

    # Result is modified
    shortened = shrunk.supervisions[0]
    assert shortened.start == 0
    assert shortened.duration == 0.5
import pytest

from lhotse import CutSet
from lhotse.cut import PaddingCut
from lhotse.testing.dummies import dummy_cut, dummy_supervision

# Example cuts, built once at import time and handed to every test decorated
# with ``parametrize_on_cut_types``.
_CUT_EXAMPLES = [
    # MonoCut
    dummy_cut(0, supervisions=[dummy_supervision(0)]),
    # PaddingCut
    PaddingCut(
        'pad',
        duration=1.0,
        sampling_rate=16000,
        feat_value=-100,
        num_frames=100,
        frame_shift=0.01,
        num_features=80,
        num_samples=16000,
    ),
    # MixedCut
    dummy_cut(0, supervisions=[dummy_supervision(0)]).mix(
        dummy_cut(1, supervisions=[dummy_supervision(1)]),
        offset_other_by=0.5,
        snr=10,
    ),
]

parametrize_on_cut_types = pytest.mark.parametrize('cut', _CUT_EXAMPLES)


@parametrize_on_cut_types
def test_drop_features(cut):
    """``drop_features()`` returns a cut without features and leaves the input cut's features in place."""
    assert cut.has_features
    featureless = cut.drop_features()
    # The input cut must still report features after the call.
    assert cut.has_features
    assert not featureless.has_features
def test_mono_cut_fill_supervision_identity():
    """``fill_supervision()`` on an unmodified dummy cut returns a cut equal to the input."""
    source = dummy_cut(0, supervisions=[dummy_supervision(0)])
    filled = source.fill_supervision()
    assert source == filled
def test_mixed_cut_fill_supervision_identity():
    """``fill_supervision()`` on a mix of two dummy cuts returns a cut equal to the input."""
    base = dummy_cut(0, supervisions=[dummy_supervision(0)])
    mixed = base.mix(dummy_cut(1))  # cuts are 100% overlapping
    filled = mixed.fill_supervision()
    assert mixed == filled
def test_mono_cut_fill_supervision_raises_on_two_supervisions():
    """``fill_supervision()`` must raise ``AssertionError`` on a cut with two supervisions."""
    cut = dummy_cut(0, supervisions=[dummy_supervision(0), dummy_supervision(1)])
    # Only the raised exception matters here, so the (never produced) return
    # value is intentionally not bound to a name.
    with pytest.raises(AssertionError):
        cut.fill_supervision()
def test_mono_cut_fill_supervision_shrink_raises_default():
    """Without ``shrink_ok``, ``fill_supervision()`` must raise ``ValueError`` when the cut is shortened to 0.5."""
    cut = dummy_cut(0, supervisions=[dummy_supervision(0)])
    # Overwrite the duration so that filling would require shrinking.
    cut.duration = 0.5
    # Only the raised exception matters here, so the (never produced) return
    # value is intentionally not bound to a name.
    with pytest.raises(ValueError):
        cut.fill_supervision()
def test_mixed_cut_fill_supervision_shrink_raises_default():
    """Without ``shrink_ok``, ``fill_supervision()`` must raise ``ValueError`` on a mix truncated to 0.5."""
    cut = dummy_cut(0, supervisions=[dummy_supervision(0)])
    cut = cut.mix(dummy_cut(1)).truncate(duration=0.5)  # cuts are 100% overlapping
    # Only the raised exception matters here, so the (never produced) return
    # value is intentionally not bound to a name.
    with pytest.raises(ValueError):
        cut.fill_supervision()