# Shared imports for the snippets below. Hedged note: the module paths follow
# Lhotse's public API from the era these tests were written; `SingleCutSampler`
# was later renamed `SimpleCutSampler`, and `k2_cut_set` / `k2_noise_cut_set`
# are pytest fixtures defined elsewhere in the suite.
import pytest
import torch
from hypothesis import strategies as st  # used by the collation test below
from torch import tensor
from torch.utils.data import DataLoader, Dataset

from lhotse import CutSet
from lhotse.dataset import (
    CutConcatenate,
    CutMix,
    K2SpeechRecognitionDataset,
    SingleCutSampler,
)
from lhotse.testing.dummies import DummyManifest

# Newer Lhotse renamed this sampler; alias so both spellings used below resolve.
SimpleCutSampler = SingleCutSampler


def test_k2_speech_recognition_iterable_dataset_shuffling():
    # The dummy cuts have a duration of 1 second each
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)

    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[
            CutConcatenate(),
        ],
    )
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset,
                         batch_size=None,
                         sampler=sampler,
                         num_workers=2)
    dloader_cut_ids = []
    batches = []
    for batch in dloader:
        batches.append(batch)
        dloader_cut_ids.extend(c.id for c in batch["supervisions"]["cut"])

    # Invariant 1: we receive the same number of items in a dataloader epoch as there were in the CutSet
    assert len(dloader_cut_ids) == len(cut_set)
    # Invariant 2: the items are not duplicated
    assert len(set(dloader_cut_ids)) == len(dloader_cut_ids)
    # Invariant 3: the items are shuffled, i.e. their order differs from the order in the CutSet
    assert dloader_cut_ids != [c.id for c in cut_set]
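

# A quick sanity sketch (not in the original test) of the arithmetic behind
# max_frames=1000 above, assuming the dummy cuts' 10 ms frame shift
# (1 s cut == 100 frames): 1000 / 100 = 10 cuts per batch, and
# 100 cuts / 10 cuts-per-batch = 10 batches per epoch.
def _expected_num_batches(num_cuts=100, frames_per_cut=100, max_frames=1000):
    cuts_per_batch = max_frames // frames_per_cut  # 10 for the values above
    return num_cuts // cuts_per_batch  # 10 batches per epoch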


# num_workers is injected via parametrization; the exact values are an
# assumption here, not taken from the original suite.
@pytest.mark.parametrize("num_workers", [0, 1, 2])
def test_k2_speech_recognition_iterable_dataset_multiple_workers(
        k2_cut_set, num_workers):
    k2_cut_set = k2_cut_set.pad()
    dataset = K2SpeechRecognitionDataset(cut_transforms=[CutConcatenate()])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset,
                         batch_size=None,
                         sampler=sampler,
                         num_workers=num_workers)

    # We expect a variable number of batches for each parametrized num_workers
    # value, because the dataset is small (just 4 cuts) and the cuts are
    # partitioned across the workers.
    batches = list(dloader)

    features = torch.cat([b["inputs"] for b in batches])
    assert features.shape == (4, 2000, 40)
    text = [t for b in batches for t in b["supervisions"]["text"]]
    assert text == ["EXAMPLE OF TEXT"] * 5  # a list, not tensor
    start_frame = torch.cat(
        [b["supervisions"]["start_frame"] for b in batches]).tolist()
    # The multi-worker dataloader might not preserve order, because the workers
    # might finish processing in different order. To compare ground truth
    # start times with actual start times, we need to sort.
    start_frame = sorted(start_frame)
    assert start_frame == [0] * 4 + [1000]
    num_frames = torch.cat([b["supervisions"]["num_frames"]
                            for b in batches]).tolist()
    assert num_frames == [1000] * 5
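

# The sorting above generalizes: with num_workers > 0 the batch order is not
# guaranteed, so order-insensitive assertions should compare multisets.
# A small helper sketch (an addition, not part of the original suite):
from collections import Counter


def _assert_same_items(actual, expected):
    """Assert two iterables hold the same items, ignoring order."""
    assert Counter(actual) == Counter(expected)
# e.g.: _assert_same_items(start_frame, [0] * 4 + [1000])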


def test_k2_speech_recognition_augmentation(k2_cut_set, k2_noise_cut_set):
    dataset = K2SpeechRecognitionDataset(
        cut_transforms=[CutConcatenate(),
                        CutMix(k2_noise_cut_set)])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)
    # Check that it does not crash by just running all dataloader iterations
    batches = list(dloader)
    assert len(batches) > 0
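    # A slightly stronger variant (an addition beyond the original check):
    # also verify the batch structure that the other tests rely on.
    for b in batches:
        assert "inputs" in b and "supervisions" in b
        assert b["inputs"].ndim == 3  # (batch, time, feature)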
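

# The next test is a method of a Hypothesis-driven test class in the original
# suite: `self.with_cut` and `self.cleanup` are helpers defined on that class,
# and `sampling_rate` / `data` are injected by decorators roughly like this
# sketch (class name and strategy values are illustrative assumptions):
#
#     from hypothesis import given, settings
#
#     class TestCollation:
#         @settings(deadline=None)
#         @given(sampling_rate=st.sampled_from([8000, 16000, 44100]),
#                data=st.data())
#         def test_no_off_by_one_errors_in_dataset_batch_collation(self, ...):
#             ...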
def test_no_off_by_one_errors_in_dataset_batch_collation(
        self, sampling_rate: int, data):
    ### Test data preparation ###
    # Generate 10 - 20 cut durations in numbers of samples
    nums_samples = data.draw(
        st.lists(
            st.integers(round(sampling_rate * 0.1),
                        round(sampling_rate * 5.0)),
            min_size=10,
            max_size=20,
        ),
        label="Cuts numbers of samples",
    )
    # Generate random cuts
    cuts = [
        self.with_cut(sampling_rate=sampling_rate,
                      num_samples=num_samples,
                      supervision=True) for num_samples in nums_samples
    ]
    # Mix them with random offsets
    mixed_cuts = CutSet.from_cuts(
        lhs.mix(
            rhs,
            # Sample the offset in terms of number of samples, and then
            # divide by the sampling rate to obtain "realistic" offsets
            offset_other_by=data.draw(
                st.integers(
                    min_value=int(0.1 * sampling_rate),
                    max_value=int(lhs.duration * sampling_rate),
                ),
                label=f"Offset for pair {idx + 1}",
            ) / sampling_rate,
        ) for idx, (lhs, rhs) in enumerate(zip(cuts, cuts[1:])))
    # Create an ASR dataset
    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[CutConcatenate(duration_factor=3.0)],
    )
    sampler = SimpleCutSampler(
        mixed_cuts,
        shuffle=False,
    )
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler)
    ### End of test data preparation ###
    # Test the invariants
    for batch in dloader:
        sups = batch["supervisions"]
        cuts = sups["cut"]
        for idx, cut in enumerate(cuts):
            assert (sups["start_frame"][idx] + sups["num_frames"][idx] <=
                    cut.num_frames), f"Error at index {idx}"
            # assert sups['start_sample'][idx] + sups['num_samples'][
            #     idx] <= cut.num_samples, f"Error at index {idx}"
    # Need to call cleanup manually to free the file handles,
    # otherwise the test may crash
    self.cleanup()
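

# In miniature, the hazard the test above guards against: frame spans are
# derived from times by rounding, and mixing rounding rules (e.g. flooring the
# cut's total frame count but rounding the supervision span up) can produce
# start_frame + num_frames == cut.num_frames + 1. A sketch assuming a 10 ms
# frame shift; this is illustrative, not Lhotse's actual implementation:
def _frames(seconds: float, frame_shift: float = 0.01) -> int:
    # One consistent rounding rule everywhere avoids the off-by-one.
    return round(seconds / frame_shift)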


# As above, num_workers is driven by parametrization (values assumed here).
@pytest.mark.parametrize("num_workers", [0, 1, 2])
def test_k2_speech_recognition_iterable_dataset(k2_cut_set, num_workers):
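    # Note: this snippet retains an older Lhotse API in which the dataset
    # itself received the CutSet (the sampler then yielded cut IDs); the
    # snippets above construct the dataset without cuts.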
    dataset = K2SpeechRecognitionDataset(
        k2_cut_set,
        cut_transforms=[CutConcatenate()]
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    # Note: "batch_size=None" disables the automatic batching mechanism,
    #       which is required when Dataset takes care of the collation itself.
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=num_workers)
    batch = next(iter(dloader))
    assert batch['inputs'].shape == (4, 2000, 40)
    # Each list has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision each
    assert (batch['supervisions']['sequence_idx'] == tensor([0, 0, 1, 2, 3])).all()
    assert batch['supervisions']['text'] == ['EXAMPLE OF TEXT'] * 5  # a list, not tensor
    assert (batch['supervisions']['start_frame'] == tensor([0, 1000, 0, 0, 0])).all()
    assert (batch['supervisions']['num_frames'] == tensor([1000] * 5)).all()
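

# The "batch_size=None" pattern above is plain PyTorch: it disables automatic
# batching, so each item the sampler yields is passed straight to
# Dataset.__getitem__, which collates the batch itself. A minimal, Lhotse-free
# sketch of that contract (class and test names here are illustrative):
class _CollatingDataset(Dataset):
    # __getitem__ receives whatever the sampler yielded -- here a list of
    # indices -- and returns an already-collated batch.
    def __getitem__(self, indices):
        return torch.tensor(indices, dtype=torch.float32)


class _BatchOfIndicesSampler:
    # Each yielded list becomes one __getitem__ call, i.e. one batch.
    def __iter__(self):
        yield from ([0, 1, 2], [3, 4])

    def __len__(self):
        return 2


def test_batch_size_none_contract():
    loader = DataLoader(_CollatingDataset(),
                        sampler=_BatchOfIndicesSampler(),
                        batch_size=None)
    assert [b.tolist() for b in loader] == [[0.0, 1.0, 2.0], [3.0, 4.0]]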