Beispiel #1
0
def test_cut_pairs_sampler_lazy_shuffle(sampler_cls):
    """Check that a cut-pairs sampler shuffles lazily opened source/target cuts.

    A dummy manifest is written to a temporary JSONL file, re-opened lazily and
    passed as both the source and the target of ``sampler_cls``. One full pass
    over the sampler must keep sources and targets aligned, yield every cut
    exactly once, and do so in an order different from the manifest's.
    """
    # Each dummy cut lasts one second.
    reference_cuts = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as tmp:
        reference_cuts.to_jsonl(tmp.name)
        lazy_cuts = CutSet.from_jsonl_lazy(tmp.name)

        # 1s per cut == 100 frames, so a 1000-frame budget yields batches of
        # 10 cuts; the epoch therefore spans multiple batches.
        sampler = sampler_cls(
            lazy_cuts,
            lazy_cuts,
            shuffle=True,
            max_source_frames=1000,
        )

        src_ids = []
        tgt_total = 0
        for src_batch, tgt_batch in sampler:
            # Invariant 0: source and target cut IDs come in the same order
            # within every batch.
            assert list(src_batch.ids) == list(tgt_batch.ids)
            src_ids.extend(cut.id for cut in src_batch)
            tgt_total += sum(1 for _ in tgt_batch)

        epoch_size = len(reference_cuts)
        # Invariant 1: an epoch yields as many items as there were in the CutSet.
        assert len(src_ids) == epoch_size
        assert tgt_total == epoch_size
        # Invariant 2: no item is yielded twice.
        assert len(set(src_ids)) == len(src_ids)
        # Invariant 3: the sampled order differs from the manifest order.
        assert src_ids != [cut.id for cut in lazy_cuts]
Beispiel #2
0
def test_single_cut_sampler_lazy_shuffle(sampler_cls):
    """Check that a single-CutSet sampler shuffles a lazily opened manifest.

    A dummy manifest is written to a temporary JSONL file, re-opened lazily and
    handed to ``sampler_cls``. One full pass over the sampler must yield every
    cut exactly once, in an order different from the manifest's.
    """
    # Each dummy cut lasts one second.
    reference_cuts = DummyManifest(CutSet, begin_id=0, end_id=100)
    with NamedTemporaryFile(suffix=".jsonl") as tmp:
        reference_cuts.to_jsonl(tmp.name)
        lazy_cuts = CutSet.from_jsonl_lazy(tmp.name)

        # With 1s cuts, a 10s duration budget yields batches of 10 cuts, so
        # the epoch spans multiple batches.
        sampler = sampler_cls(
            lazy_cuts,
            shuffle=True,
            max_duration=10.0,
        )

        sampled_ids = []
        for batch in sampler:
            sampled_ids.extend(cut.id for cut in batch)

        # Invariant 1: an epoch yields as many items as there were in the CutSet.
        assert len(sampled_ids) == len(reference_cuts)
        # Invariant 2: no item is yielded twice.
        assert len(set(sampled_ids)) == len(sampled_ids)
        # Invariant 3: the sampled order differs from the manifest order.
        assert sampled_ids != [cut.id for cut in lazy_cuts]
Beispiel #3
0
def test_bucketing_sampler_raises_value_error_on_lazy_cuts_input():
    """Constructing ``BucketingSampler`` from a lazily opened CutSet must raise ``ValueError``."""
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=2)
    with NamedTemporaryFile(suffix=".jsonl") as f:
        cut_set.to_jsonl(f.name)
        lazy_cuts = CutSet.from_jsonl_lazy(f.name)
        # The constructor call itself is expected to raise, so its result is
        # deliberately not bound to a name (avoids an unused local).
        with pytest.raises(ValueError):
            BucketingSampler(
                lazy_cuts,
                max_duration=10.0,
            )