def test_single_cut_sampler_order_is_deterministic_given_epoch():
    """Two passes over the sampler without an epoch change yield identical batches."""
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # All cuts are 1s long (== 100 frames), so max_frames=1000 gives an
        # effective batch size of 10 cuts and multiple batches per epoch.
        max_frames=1000,
    )
    sampler.set_epoch(42)
    # Without an epoch update, repeated full iterations must produce
    # exactly the same ordering.
    first_pass = list(sampler)
    second_pass = list(sampler)
    assert first_pass == second_pass
def test_single_cut_sampler_len():
    """``len(sampler)`` must agree with the actual batch count in every epoch."""
    # Total duration is 55 seconds; each second corresponds to 100 frames.
    cuts = CutSet.from_cuts(
        dummy_cut(i, duration=float(i)) for i in range(1, 11)
    )
    sampler = SingleCutSampler(
        cuts, shuffle=True, max_frames=10 * 100, max_cuts=6
    )
    for epoch in range(5):
        # NOTE: len(sampler) is intentionally evaluated before iterating.
        assert len(sampler) == len(list(sampler))
        sampler.set_epoch(epoch)
def test_single_cut_sampler_time_constraints(max_duration, max_frames,
                                             max_samples, exception_expectation):
    """Check sampling invariants under duration/frame/sample constraints."""
    # Every dummy cut lasts exactly 1 second.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    if max_frames is None:
        # Without features, frame-based constraints cannot apply.
        cut_set = cut_set.drop_features()
    with exception_expectation:
        sampler = SingleCutSampler(
            cut_set,
            shuffle=True,
            # With 1s cuts (== 100 frames each) these limits yield an
            # effective batch size of 10 cuts, i.e. several batches per epoch.
            max_frames=max_frames,
            max_samples=max_samples,
            max_duration=max_duration,
        )
        sampled = []
        for batch in sampler:
            sampled.extend(batch)
        # Invariant 1: a full epoch returns exactly as many items as the CutSet holds.
        assert len(sampled) == len(cut_set)
        # Invariant 2: no item appears more than once.
        assert len({c.id for c in sampled}) == len(sampled)
        # Invariant 3: shuffling changed the order relative to the CutSet.
        assert [c.id for c in sampled] != [c.id for c in cut_set]
def test_k2_speech_recognition_iterable_dataset_shuffling():
    """Shuffled sampling through a multi-worker DataLoader keeps all invariants."""
    # Every dummy cut lasts exactly 1 second.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[CutConcatenate()],
    )
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # 1s cuts == 100 frames each, so this produces batches of 10 cuts and
        # exercises the multi-batch path within a single epoch.
        max_frames=1000,
    )
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler, num_workers=2)
    seen_ids = []
    batches = []
    for batch in dloader:
        batches.append(batch)
        seen_ids.extend(c.id for c in batch["supervisions"]["cut"])
    # Invariant 1: one dataloader epoch yields as many items as there were in the CutSet.
    assert len(seen_ids) == len(cut_set)
    # Invariant 2: no duplicates.
    assert len(set(seen_ids)) == len(seen_ids)
    # Invariant 3: the order differs from the CutSet's, i.e. shuffling happened.
    assert seen_ids != [c.id for c in cut_set]
def test_k2_speech_recognition_audio_inputs_with_workers_in_input_strategy(
        k2_cut_set):
    """AudioSamples with its own worker pool still collates a batch correctly."""
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=AudioSamples(num_workers=2),
    )
    # A huge max_duration puts every cut into a single batch.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_duration=100000.0)
    dloader = DataLoader(
        on_the_fly_dataset,
        batch_size=None,
        sampler=sampler,
        # Must stay 0: DataLoader workers can't spawn subprocesses themselves.
        num_workers=0,
    )
    batch = next(iter(dloader))
    assert batch["inputs"].shape == (4, 320000)
    # Five supervision entries total: one cut carries two supervisions,
    # the remaining three cuts carry one supervision each.
    assert (batch["supervisions"]["sequence_idx"] == tensor([0, 0, 1, 2, 3])).all()
    # "text" is a plain list, not a tensor.
    assert batch["supervisions"]["text"] == ["EXAMPLE OF TEXT"] * 5
    assert (batch["supervisions"]["start_sample"]
            == tensor([0, 160000, 0, 0, 0])).all()
    assert (batch["supervisions"]["num_samples"] == tensor([160000] * 5)).all()
def test_k2_speech_recognition_iterable_dataset_multiple_workers(
        k2_cut_set, num_workers):
    """Collation stays correct regardless of how many DataLoader workers run."""
    k2_cut_set = k2_cut_set.pad()
    dataset = K2SpeechRecognitionDataset(cut_transforms=[CutConcatenate()])
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(
        dataset, batch_size=None, sampler=sampler, num_workers=num_workers
    )
    # The batch count varies with num_workers, because the small 4-cut
    # dataset gets partitioned across the workers.
    batches = list(dloader)
    features = torch.cat([b["inputs"] for b in batches])
    assert features.shape == (4, 2000, 40)
    text = [t for b in batches for t in b["supervisions"]["text"]]
    assert text == ["EXAMPLE OF TEXT"] * 5  # a list, not tensor
    start_frame = torch.cat(
        [b["supervisions"]["start_frame"] for b in batches]
    ).tolist()
    # Workers may finish out of order, so batches can arrive reordered;
    # sort before comparing against the ground-truth start times.
    assert sorted(start_frame) == [0] * 4 + [1000]
    num_frames = torch.cat(
        [b["supervisions"]["num_frames"] for b in batches]
    ).tolist()
    assert num_frames == [1000] * 5
def test_k2_speech_recognition_on_the_fly_feature_extraction(
        k2_cut_set, use_batch_extract, fault_tolerant):
    """On-the-fly fbank features must closely match the precomputed ones."""
    precomputed_dataset = K2SpeechRecognitionDataset()
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(
            Fbank(FbankConfig(num_mel_bins=40)),
            use_batch_extract=use_batch_extract,
            fault_tolerant=fault_tolerant,
        )
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]
        # The two feature sets differ slightly because mixing happens in
        # different domains (time vs. fbank) and because of lilcom
        # compression — require the relative difference to stay below 1%.
        norm_pc = torch.linalg.norm(batch_pc["inputs"])
        norm_diff = torch.linalg.norm(batch_pc["inputs"] - batch_otf["inputs"])
        assert norm_diff < 0.01 * norm_pc
        # Supervision boundaries must match exactly.
        assert (batch_pc["supervisions"]["start_frame"]
                == batch_otf["supervisions"]["start_frame"]).all()
        assert (batch_pc["supervisions"]["num_frames"]
                == batch_otf["supervisions"]["num_frames"]).all()
def test_single_cut_sampler_order_differs_between_epochs():
    """Advancing the epoch must reshuffle the sampling order."""
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    sampler = SingleCutSampler(
        cut_set,
        shuffle=True,
        # 1s cuts == 100 frames each -> batches of 10 cuts, several per epoch.
        max_frames=1000,
    )
    previous_order = list(sampler)
    for epoch in range(1, 6):
        sampler.set_epoch(epoch)
        current_order = list(sampler)
        # Every epoch's order must differ from the previous one.
        assert current_order != previous_order
        previous_order = current_order
def test_k2_speech_recognition_iterable_dataset_low_max_frames(k2_cut_set):
    """A frame budget below any single cut's size must not crash the pipeline."""
    dataset = K2SpeechRecognitionDataset()
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_frames=2)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)
    for batch in dloader:
        # Each batch degenerates to a single item: even one cut exceeds the limit.
        assert batch["inputs"].shape[0] == 1
def test_zip_sampler_merge_batches_true():
    """ZipSampler merges per-sampler batches into one combined batch."""
    cuts1 = DummyManifest(CutSet, begin_id=0, end_id=100)
    cuts2 = DummyManifest(CutSet, begin_id=1000, end_id=1100)
    sampler = ZipSampler(
        # Note: each cut is 1s duration in this test.
        SingleCutSampler(cuts1, max_duration=10),
        SingleCutSampler(cuts2, max_duration=2),
    )
    batches = list(sampler)
    assert len(batches) == 10
    for batch in batches:
        # Each merged batch holds twelve 1s cuts: ten from cuts1, two from cuts2.
        assert len(batch) == 12
        from_first = [c for c in batch if 0 <= int(c.id.split("-")[-1]) <= 100]
        assert len(from_first) == 10
        from_second = [
            c for c in batch if 1000 <= int(c.id.split("-")[-1]) <= 1100
        ]
        assert len(from_second) == 2
def test_k2_speech_recognition_augmentation(k2_cut_set, k2_noise_cut_set):
    """Smoke test: concatenation + noise mixing must survive a full epoch."""
    dataset = K2SpeechRecognitionDataset(
        cut_transforms=[CutConcatenate(), CutMix(k2_noise_cut_set)]
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    dloader = DataLoader(dataset, sampler=sampler, batch_size=None)
    # A full pass over the dataloader must not crash and must yield something.
    batches = list(dloader)
    assert len(batches) > 0
def test_no_off_by_one_errors_in_dataset_batch_collation(
        self, sampling_rate: int, data):
    """Property test: no supervision may extend past the end of its cut."""
    ### Test data preparation ###
    # Draw 10-20 cut lengths, expressed in numbers of samples.
    nums_samples = data.draw(
        st.lists(
            st.integers(round(sampling_rate * 0.1), round(sampling_rate * 5.0)),
            min_size=10,
            max_size=20,
        ),
        label="Cuts numbers of samples",
    )
    # Materialize a random cut for every drawn length.
    cuts = [
        self.with_cut(
            sampling_rate=sampling_rate,
            num_samples=num_samples,
            supervision=True,
        )
        for num_samples in nums_samples
    ]
    # Mix neighbouring cuts with random offsets. The offset is drawn as a
    # sample count and divided by the sampling rate to obtain a "realistic"
    # (not grid-aligned) time offset.
    mixed_cuts = CutSet.from_cuts(
        lhs.mix(
            rhs,
            offset_other_by=data.draw(
                st.integers(
                    min_value=int(0.1 * sampling_rate),
                    max_value=int(lhs.duration * sampling_rate),
                ),
                label=f"Offset for pair {idx + 1}",
            ) / sampling_rate,
        )
        for idx, (lhs, rhs) in enumerate(zip(cuts, cuts[1:]))
    )
    # Create an ASR dataset over the mixed cuts.
    dataset = K2SpeechRecognitionDataset(
        return_cuts=True,
        cut_transforms=[CutConcatenate(duration_factor=3.0)],
    )
    sampler = SingleCutSampler(
        mixed_cuts,
        shuffle=False,
    )
    dloader = DataLoader(dataset, batch_size=None, sampler=sampler)
    ### End of test data preparation ###

    # Test the invariants on every collated batch.
    for batch in dloader:
        sups = batch["supervisions"]
        for idx, cut in enumerate(sups["cut"]):
            assert (sups["start_frame"][idx] + sups["num_frames"][idx]
                    <= cut.num_frames), f"Error at index {idx}"
            # assert sups['start_sample'][idx] + sups['num_samples'][
            #     idx] <= cut.num_samples, f"Error at index {idx}"
    # Free the file handles explicitly, otherwise the test may crash.
    self.cleanup()
def test_k2_speech_recognition_on_the_fly_feature_extraction_with_randomized_smoothing(
    k2_cut_set,
):
    """Randomized smoothing (additive noise) should raise the feature energy."""
    plain_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(extractor=Fbank())
    )
    rs_dataset = K2SpeechRecognitionDataset(
        input_strategy=OnTheFlyFeatures(
            extractor=Fbank(),
            # p=1.0 guarantees the smoothing transform fires on every item.
            wave_transforms=[RandomizedSmoothing(sigma=0.5, p=1.0)],
        )
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        plain_batch = plain_dataset[cut_ids]
        smoothed_batch = rs_dataset[cut_ids]
        # Additive noise should cause the total energy to go up.
        assert (smoothed_batch["inputs"] - plain_batch["inputs"]).sum() > 0
def test_k2_speech_recognition_audio_inputs(k2_cut_set):
    """Raw-audio input strategy must collate samples and supervisions correctly."""
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        k2_cut_set,
        input_strategy=AudioSamples(),
    )
    # A huge max_frames limit puts all cuts into a single batch.
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_frames=10000000)
    cut_ids = next(iter(sampler))
    batch = on_the_fly_dataset[cut_ids]
    assert batch['inputs'].shape == (4, 320000)
    # Five supervision entries total: one cut carries two supervisions,
    # the remaining three cuts carry one supervision each.
    assert (batch['supervisions']['sequence_idx'] == tensor([0, 0, 1, 2, 3])).all()
    assert batch['supervisions']['text'] == ['EXAMPLE OF TEXT'] * 5  # a list, not tensor
    assert (batch['supervisions']['start_sample']
            == tensor([0, 160000, 0, 0, 0])).all()
    assert (batch['supervisions']['num_samples'] == tensor([160000] * 5)).all()
def test_k2_speech_recognition_iterable_dataset(k2_cut_set, num_workers):
    """Basic end-to-end batch shape and supervision checks via a DataLoader."""
    dataset = K2SpeechRecognitionDataset(
        k2_cut_set, cut_transforms=[CutConcatenate()]
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False)
    # batch_size=None disables PyTorch's automatic batching, which is required
    # because the Dataset performs its own collation.
    dloader = DataLoader(
        dataset, batch_size=None, sampler=sampler, num_workers=num_workers
    )
    batch = next(iter(dloader))
    assert batch['inputs'].shape == (4, 2000, 40)
    # Five supervision entries total: one cut carries two supervisions,
    # the remaining three cuts carry one supervision each.
    assert (batch['supervisions']['sequence_idx'] == tensor([0, 0, 1, 2, 3])).all()
    assert batch['supervisions']['text'] == ['EXAMPLE OF TEXT'] * 5  # a list, not tensor
    assert (batch['supervisions']['start_frame']
            == tensor([0, 1000, 0, 0, 0])).all()
    assert (batch['supervisions']['num_frames'] == tensor([1000] * 5)).all()
def test_single_cut_sampler_drop_last():
    """With drop_last=True, only full-sized batches are emitted."""
    # Every dummy cut lasts exactly 1 second.
    cut_set = DummyManifest(CutSet, begin_id=0, end_id=100)
    sampler = SingleCutSampler(
        cut_set,
        # 1s cuts == 100 frames each -> an effective batch size of 15 cuts,
        # so a full epoch spans several batches.
        max_frames=1500,
        drop_last=True,
    )
    batches = []
    for batch in sampler:
        # Every emitted batch must be completely full.
        assert len(batch) == 15
        batches.append(batch)
    # 100 cuts / 15 per batch -> 6 full batches; the 10-cut remainder is dropped.
    assert len(batches) == 6
def test_k2_speech_recognition_on_the_fly_feature_extraction(k2_cut_set):
    """On-the-fly fbank features must closely match the precomputed ones.

    NOTE(review): another test with this exact name appears earlier in this
    file (the parametrized use_batch_extract/fault_tolerant variant); only the
    later definition is collected by pytest — consider renaming one of them.
    """
    precomputed_dataset = K2SpeechRecognitionDataset(k2_cut_set)
    on_the_fly_dataset = K2SpeechRecognitionDataset(
        k2_cut_set.drop_features(),
        input_strategy=OnTheFlyFeatures(Fbank()),
    )
    sampler = SingleCutSampler(k2_cut_set, shuffle=False, max_cuts=1)
    for cut_ids in sampler:
        batch_pc = precomputed_dataset[cut_ids]
        batch_otf = on_the_fly_dataset[cut_ids]
        # Mixing happens in different domains (time vs. fbank) and lilcom
        # compression adds error, so the features differ slightly; require
        # the relative difference to stay below 1%.
        norm_pc = torch.linalg.norm(batch_pc['inputs'])
        norm_diff = torch.linalg.norm(batch_pc['inputs'] - batch_otf['inputs'])
        assert norm_diff < 0.01 * norm_pc
        # Supervision boundaries must be identical.
        assert (batch_pc['supervisions']['start_frame']
                == batch_otf['supervisions']['start_frame']).all()
        assert (batch_pc['supervisions']['num_frames']
                == batch_otf['supervisions']['num_frames']).all()
def test_report_padding_ratio_estimate():
    """Smoke test: the padding-ratio report should run without errors."""
    sampler = SingleCutSampler(DummyManifest(CutSet, begin_id=0, end_id=1000))
    report_padding_ratio_estimate(sampler)
def test_single_cut_sampler_low_max_frames(libri_cut_set):
    """A frame budget below any single cut's size must not crash the sampler."""
    sampler = SingleCutSampler(libri_cut_set, shuffle=False, max_frames=2)
    for batch in sampler:
        # Each batch degenerates to a single item: even one cut exceeds the limit.
        assert len(batch) == 1
# There will be one more batch with a single 3s cut. expected_num_batches = 17 expected_num_cuts = 50 expected_discarded_cuts = 0 num_sampled_cuts = sum(len(b) for b in batches) num_discarded_cuts = len(cut_set) - num_sampled_cuts assert len(batches) == expected_num_batches assert num_sampled_cuts == expected_num_cuts assert num_discarded_cuts == expected_discarded_cuts @pytest.mark.parametrize( "sampler", [ SingleCutSampler(DummyManifest(CutSet, begin_id=0, end_id=10)), CutPairsSampler( DummyManifest(CutSet, begin_id=0, end_id=10), DummyManifest(CutSet, begin_id=0, end_id=10), ), BucketingSampler(DummyManifest(CutSet, begin_id=0, end_id=10)), ZipSampler( SingleCutSampler(DummyManifest(CutSet, begin_id=0, end_id=10)), SingleCutSampler(DummyManifest(CutSet, begin_id=10, end_id=20)), ), ], ) def test_sampler_get_report(sampler): _ = [b for b in sampler] print(sampler.get_report()) # It runs - voila!