def test_cut_set_subset_cut_ids_preserves_order():
    """Check that ``subset(cut_ids=...)`` yields cuts in the order the IDs were requested."""
    # The requested IDs are deliberately not in ascending order.
    requested_ids = ["dummy-cut-0010", "dummy-cut-0171", "dummy-cut-0009"]
    all_cuts = DummyManifest(CutSet, begin_id=0, end_id=1000)

    selected = all_cuts.subset(cut_ids=requested_ids)

    # Unpacking into exactly three names fails unless exactly three cuts are yielded.
    first, second, third = selected
    assert (first.id, second.id, third.id) == (
        "dummy-cut-0010",
        "dummy-cut-0171",
        "dummy-cut-0009",
    )
def test_cut_set_subset_cut_ids_preserves_order_with_lazy_manifest():
    """Check ID-order preservation of ``subset(cut_ids=...)`` on a manifest
    that was written to disk and re-opened via ``from_jsonl_lazy``.
    """
    # The requested IDs are deliberately not in ascending order.
    requested_ids = ["dummy-cut-0010", "dummy-cut-0171", "dummy-cut-0009"]
    eager_cuts = DummyManifest(CutSet, begin_id=0, end_id=1000)

    with NamedTemporaryFile(suffix=".jsonl.gz") as tmp:
        eager_cuts.to_file(tmp.name)
        lazy_cuts = eager_cuts.from_jsonl_lazy(tmp.name)

        # Everything below stays inside the ``with`` block so the temporary file
        # still exists in case the lazy manifest reads it during iteration.
        selected = lazy_cuts.subset(cut_ids=requested_ids)

        # Unpacking into exactly three names fails unless exactly three cuts are yielded.
        first, second, third = selected
        assert (first.id, second.id, third.id) == (
            "dummy-cut-0010",
            "dummy-cut-0171",
            "dummy-cut-0009",
        )
def test_subset_raises(manifest_type, first, last):
    """Check that ``subset`` raises ``AssertionError`` for the given ``first``/``last`` pair.

    The arguments are presumably supplied by pytest parametrization that is
    not visible in this part of the file.
    """
    manifest = DummyManifest(manifest_type, begin_id=0, end_id=200)
    with pytest.raises(AssertionError):
        # Only the raised exception matters, so the result is not kept.
        manifest.subset(first=first, last=last)
def test_subset_last(manifest_type):
    """Check that ``subset(last=10)`` on a manifest built with ids 0..200 equals
    a manifest built directly with ids 190..200.
    """
    full_manifest = DummyManifest(manifest_type, begin_id=0, end_id=200)

    tail = full_manifest.subset(last=10)

    expected_tail = DummyManifest(manifest_type, begin_id=190, end_id=200)
    assert tail == expected_tail
BucketingSampler(CUTS, CUTS, max_source_duration=10.0, shuffle=True, drop_last=True, num_buckets=2, sampler_type=CutPairsSampler), BucketingSampler(CUTS, CUTS, num_buckets=2, sampler_type=CutPairsSampler), ), lambda: ( DynamicBucketingSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True, num_buckets=2), DynamicBucketingSampler(CUTS, max_duration=10.0, num_buckets=2), ), lambda: ( DynamicCutSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True), DynamicCutSampler(CUTS, max_duration=10.0), ), # Differently initialized RoundRobinSampler with the same CUTS lambda: ( RoundRobinSampler( SingleCutSampler(CUTS.subset(first=50), max_duration=10.0, shuffle=True, drop_last=True), SingleCutSampler(CUTS_MOD.subset(first=50), max_duration=10.0, shuffle=True, drop_last=True), ), RoundRobinSampler( SingleCutSampler(CUTS.subset(first=50)), SingleCutSampler(CUTS_MOD.subset(first=50)), ), ), ] # fmt: on @pytest.mark.parametrize("create_samplers", SAMPLERS_TO_TEST) def test_restore_sampler_state(create_samplers): sampler, restored_sampler = create_samplers() # Iterate a full epoch through the sampler first to accumulate some sampling diagnostics.
DynamicBucketingSampler(CUTS, max_duration=10.0, shuffle=True, drop_last=True, num_buckets=2), DynamicBucketingSampler(CUTS, max_duration=10.0, num_buckets=2), ), lambda: ( DynamicCutSampler( CUTS, max_duration=10.0, shuffle=True, drop_last=True), DynamicCutSampler(CUTS, max_duration=10.0), ), # Differently initialized RoundRobinSampler with the same CUTS lambda: ( RoundRobinSampler( SingleCutSampler(CUTS.subset(first=50), max_duration=10.0, shuffle=True, drop_last=True), SingleCutSampler(CUTS_MOD.subset(first=50), max_duration=10.0, shuffle=True, drop_last=True), ), RoundRobinSampler( SingleCutSampler(CUTS.subset(first=50)), SingleCutSampler(CUTS_MOD.subset(first=50)), ), ), ] # fmt: on