def test_k2_speech_recognition_iterable_dataset_shuffling():
    # The dummy cuts have a duration of 1 second each
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[
            CutConcatenate(),
        ],
    )
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # Set an effective batch size of 10 cuts, as all have 1s duration == 100 frames.
        # This way we're testing that it works okay when returning multiple batches in
        # a full epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=2)
    dloader_cut_ids = []
    batches = []
    for batch in dloader:
        batches.append(batch)
        dloader_cut_ids.extend(c.id for c in batch["supervisions"]["cut"])

    # Invariant 1: we receive the same amount of items in a dataloader epoch as there were in the CutSet.
    assert len(dloader_cut_ids) == len(cut_set)
    # Invariant 2: the items are not duplicated.
    assert len(set(dloader_cut_ids)) == len(dloader_cut_ids)
    # Invariant 3: the items are shuffled, i.e. the order is different than that in the CutSet.
    assert dloader_cut_ids != [c.id for c in cut_set]

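# Hedged illustration (not part of the original test suite): the shuffling test above
# relies on DummyManifest producing a deterministic CutSet of 100 one-second cuts.
# The helper below merely spells out those two assumptions as assertions; it has no
# `test_` prefix, so pytest never collects it, and it exists purely for documentation.
def _sketch_dummy_cut_set_properties():
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    # end_id - begin_id dummy cuts are generated...
    assert len(cut_set) == 100
    # ...and each dummy cut is 1 second long (== 100 frames at a 10ms frame shift,
    # which is what the max_frames=1000 comment above refers to).
    assert all(cut.duration == 1.0 for cut in cut_set)
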
def test_k2_speech_recognition_iterable_dataset_multiple_workers(k2_cut_set, num_workers):
    k2_cut_set = k2_cut_set.pad()
    dataset = K2SpeechRecognitionDataset(cut_transforms=[CutConcatenate()])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(
        dataset, batch_size=None, sampler=sampler, num_workers=num_workers
    )

    # We expect a variable number of batches for each parametrized num_workers value,
    # because the dataset is small, with 4 cuts that are partitioned across the workers.
    batches = [item for item in dloader]

    features = torch.cat([b["inputs"] for b in batches])
    assert features.shape == (4, 2000, 40)
    text = [t for b in batches for t in b["supervisions"]["text"]]
    assert text == ["EXAMPLE OF TEXT"] * 5  # a list, not tensor
    start_frame = torch.cat(
        [b["supervisions"]["start_frame"] for b in batches]
    ).tolist()
    # The multi-worker dataloader might not preserve order, because the workers
    # might finish processing in a different order. To compare ground truth
    # start times with actual start times, we need to sort.
    start_frame = sorted(start_frame)
    assert start_frame == [0] * 4 + [1000]
    num_frames = torch.cat(
        [b["supervisions"]["num_frames"] for b in batches]
    ).tolist()
    assert num_frames == [1000] * 5

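# The multi-worker test above (and test_k2_speech_recognition_iterable_dataset below)
# receives `k2_cut_set` and `num_workers` from definitions that live outside this
# excerpt: a pytest fixture and a pytest parametrization. Judging from the assertions,
# the fixture must provide 4 cuts carrying 5 supervisions in total (one cut has two
# supervisions), with 40-dim features, 1000-frame supervisions, and the transcript
# "EXAMPLE OF TEXT" everywhere. A plausible shape for those definitions is sketched
# below as a comment; the manifest path and the num_workers values are assumptions,
# not the repository's actual code.
#
#   @pytest.fixture
#   def k2_cut_set() -> CutSet:
#       return CutSet.from_json('test/fixtures/.../cuts.json')  # hypothetical path
#
#   @pytest.mark.parametrize('num_workers', [0, 2])  # hypothetical values
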
def test_k2_speech_recognition_augmentation(k2_cut_set, k2_noise_cut_set):
    dataset = K2SpeechRecognitionDataset(
        cut_transforms=[CutConcatenate(), CutMix(k2_noise_cut_set)]
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)
    # Check that it does not crash by just running all dataloader iterations.
    batches = [item for item in dloader]
    assert len(batches) > 0

def test_no_off_by_one_errors_in_dataset_batch_collation(
    self, sampling_rate: int, data
):
    ### Test data preparation ###
    # Generate 10 - 20 cut durations in numbers of samples
    nums_samples = data.draw(
        st.lists(
            st.integers(round(sampling_rate * 0.1), round(sampling_rate * 5.0)),
            min_size=10,
            max_size=20,
        ),
        label="Cuts numbers of samples",
    )
    # Generate random cuts
    cuts = [
        self.with_cut(
            sampling_rate=sampling_rate, num_samples=num_samples, supervision=True
        )
        for num_samples in nums_samples
    ]
    # Mix them with random offsets
    mixed_cuts = CutSet.from_cuts(
        lhs.mix(
            rhs,
            # Sample the offset in terms of number of samples, and then divide by the
            # sampling rate to obtain "realistic" offsets
            offset_other_by=data.draw(
                st.integers(
                    min_value=int(0.1 * sampling_rate),
                    max_value=int(lhs.duration * sampling_rate),
                ),
                label=f"Offset for pair {idx + 1}",
            )
            / sampling_rate,
        )
        for idx, (lhs, rhs) in enumerate(zip(cuts, cuts[1:]))
    )
    # Create an ASR dataset
    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[CutConcatenate(duration_factor=3.0)],
    )
    sampler = SimpleCutSampler(
        mixed_cuts,
        shuffle=False,
    )
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler)
    ### End of test data preparation ###

    # Test the invariants
    for batch in dloader:
        sups = batch["supervisions"]
        cuts = sups["cut"]
        for idx, cut in enumerate(cuts):
            assert (
                sups["start_frame"][idx] + sups["num_frames"][idx] <= cut.num_frames
            ), f"Error at index {idx}"
            # assert sups['start_sample'][idx] + sups['num_samples'][idx] <= cut.num_samples, f"Error at index {idx}"

    # Need to call cleanup manually to free the file handles, otherwise the test may crash.
    self.cleanup()

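# The collation test above is a method of a test class (note `self`, `self.with_cut`,
# `self.cleanup`) and is property-based: `data` and `sampling_rate` are supplied by
# hypothesis machinery defined outside this excerpt (imports along the lines of
# `from hypothesis import given, settings, strategies as st`). A hedged sketch of the
# decorators such a test needs is given below as a comment; the concrete sampling
# rates and settings are assumptions, not the repository's actual code.
#
#   @settings(deadline=None)
#   @given(
#       sampling_rate=st.sampled_from([8000, 16000, 22050, 44100]),
#       data=st.data(),
#   )
#   def test_no_off_by_one_errors_in_dataset_batch_collation(self, sampling_rate, data):
#       ...
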
def test_k2_speech_recognition_iterable_dataset(k2_cut_set, num_workers):
    dataset = K2SpeechRecognitionDataset(cut_transforms=[CutConcatenate()])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    # Note: "batch_size=None" disables the automatic batching mechanism,
    # which is required when Dataset takes care of the collation itself.
    dloader = DataLoader(
        dataset, batch_size=None, sampler=sampler, num_workers=num_workers
    )
    batch = next(iter(dloader))
    assert batch['inputs'].shape == (4, 2000, 40)
    # Each list has 5 items, to account for:
    # one cut with two supervisions + three cuts with one supervision
    assert (batch['supervisions']['sequence_idx'] == tensor([0, 0, 1, 2, 3])).all()
    assert batch['supervisions']['text'] == ['EXAMPLE OF TEXT'] * 5  # a list, not tensor
    assert (batch['supervisions']['start_frame'] == tensor([0, 1000, 0, 0, 0])).all()
    assert (batch['supervisions']['num_frames'] == tensor([1000] * 5)).all()