def test_augmentation_chain_randomized(
    recording: Recording,
    rir: Recording,
    target_sampling_rate: int,
    sp_factor: float,
    vp_factor: float,
    reverb: bool,
    resample_first: bool,
    cut_duration: Seconds,
):
    """Chain resample / speed / volume (and optionally RIR reverb) on a recording.

    The loaded audio must have exactly ``num_samples`` samples, both for the
    augmented recording itself and for a cut created on top of it.
    """
    # Resampling and speed perturbation are applied in either order;
    # volume perturbation always comes after both of them.
    if resample_first:
        augmented = recording.resample(target_sampling_rate)
        augmented = augmented.perturb_speed(sp_factor)
    else:
        augmented = recording.perturb_speed(sp_factor)
        augmented = augmented.resample(target_sampling_rate)
    augmented = augmented.perturb_volume(vp_factor)

    if reverb:
        augmented = augmented.reverb_rir(rir)

    samples = augmented.load_audio()
    assert samples.shape[1] == augmented.num_samples

    cut_aug = MonoCut(
        id="dummy",
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=augmented,
    )
    assert cut_aug.load_audio().shape[1] == cut_aug.num_samples
def test_cut_load_custom_recording_pad_left():
    """Check that a custom Recording attribute is left-padded together with the cut."""
    sampling_rate = 16000
    duration = 52.4
    num_samples = compute_num_samples(duration, sampling_rate)
    audio = np.random.randn(1, num_samples).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]

    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        # Flush and fsync before reading the file back by name.
        f.flush()
        os.fsync(f)
        recording = Recording.from_file(f.name)

        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute,
        #       nor a "load_my_favorite_song()" method.
        #       We are dynamically extending it.
        cut.my_favorite_song = recording

        cut_pad = cut.pad(duration=60.0, direction="left")
        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60

        # Everything before the last ``orig_len`` samples is expected to be
        # zeros; the tail is expected to match the saved audio.
        orig_len = audio.shape[1]
        np.testing.assert_almost_equal(0, restored_audio[:, :-orig_len])
        np.testing.assert_almost_equal(audio, restored_audio[:, -orig_len:])
def test_cut_speakers_audio_mask(self, supervisions, alignment):
    """Check rows 0 and 1 of ``speakers_audio_mask()`` against expected sample spans.

    The expected spans differ depending on whether ``alignment == "word"``.
    """
    cut = MonoCut(
        "cut",
        start=0,
        duration=2,
        channel=0,
        recording=Mock(sampling_rate=16000),
        supervisions=supervisions,
    )
    mask = cut.speakers_audio_mask(use_alignment_if_exists=alignment)

    def sample_index(*spans):
        # Concatenate half-open (begin, end) spans into a single fancy index.
        return np.index_exp[list(chain(*(range(*span) for span in spans)))]

    if alignment == "word":
        ones = [
            sample_index((0, 1600), (3200, 6400)),
            sample_index((9600, 12800)),
        ]
        zeros = [
            sample_index((1600, 3200), (6400, 32000)),
            sample_index((0, 9600), (12800, 32000)),
        ]
    else:
        ones = [
            sample_index((0, 8000)),
            sample_index((9600, 12800)),
        ]
        zeros = [
            sample_index((8000, 32000)),
            sample_index((0, 9600), (12800, 32000)),
        ]

    for row, active in enumerate(ones):
        assert (mask[row, active] == 1).all()
    for row, silent in enumerate(zeros):
        assert (mask[row, silent] == 0).all()
def test_mixed_cut_audio_mask(self):
    """A cut without supervisions, appended to itself, has an all-zero audio mask."""
    mock_recording = Mock(sampling_rate=16000)
    cut = MonoCut("cut", start=0, duration=2, channel=0, recording=mock_recording)
    mask = cut.append(cut).supervisions_audio_mask()
    assert mask.sum() == 0
def with_cut(self,
             sampling_rate: int,
             num_samples: int,
             features: bool = True,
             supervision: bool = False,
             alignment: bool = False,
             frame_shift: Seconds = 0.01) -> MonoCut:
    """Create a MonoCut with a random id that spans a freshly created recording.

    :param sampling_rate: passed to ``with_recording``; also used to derive the duration.
    :param num_samples: passed to ``with_recording``; also used to derive the duration.
    :param features: when True, the cut is passed through ``_with_features``.
    :param supervision: when True, one supervision covering the whole cut is appended.
    :param alignment: when True (and ``supervision`` is set), the supervision
        receives the result of ``_with_alignment``.
    :param frame_shift: forwarded to ``_with_features``.
    :return: the constructed cut.
    """
    duration = num_samples / sampling_rate
    recording = self.with_recording(sampling_rate=sampling_rate, num_samples=num_samples)
    cut = MonoCut(id=str(uuid4()), start=0, duration=duration, channel=0, recording=recording)

    if features:
        cut = self._with_features(cut, frame_shift=frame_shift)

    if not supervision:
        return cut

    ali = self._with_alignment(cut, 'irrelevant') if alignment else None
    segment = SupervisionSegment(
        id=f'sup-{cut.id}',
        recording_id=cut.recording_id,
        start=0,
        duration=cut.duration,
        text='irrelevant',
        alignment=ali,
    )
    cut.supervisions.append(segment)
    return cut
def test_cut_speakers_features_mask(self, supervisions, alignment):
    """Check rows 0 and 1 of ``speakers_feature_mask()`` against expected frame spans.

    The expected spans differ depending on whether ``alignment == "word"``.
    """
    mock_features = Mock(sampling_rate=16000, frame_shift=0.01, num_frames=2000)
    cut = MonoCut(
        "cut",
        start=0,
        duration=2,
        channel=0,
        features=mock_features,
        supervisions=supervisions,
    )
    mask = cut.speakers_feature_mask(use_alignment_if_exists=alignment)

    def frame_index(*spans):
        # Concatenate half-open (begin, end) spans into a single fancy index.
        return np.index_exp[list(chain(*(range(*span) for span in spans)))]

    if alignment == "word":
        ones = [
            frame_index((0, 10), (20, 40)),
            frame_index((60, 80)),
        ]
        zeros = [
            frame_index((10, 20), (40, 200)),
            frame_index((0, 60), (80, 200)),
        ]
    else:
        ones = [
            frame_index((0, 50)),
            frame_index((60, 80)),
        ]
        zeros = [
            frame_index((50, 200)),
            frame_index((0, 60), (80, 200)),
        ]

    for row, active in enumerate(ones):
        assert (mask[row, active] == 1).all()
    for row, silent in enumerate(zeros):
        assert (mask[row, silent] == 0).all()
def with_cut(
    self,
    sampling_rate: int,
    num_samples: int,
    features: bool = True,
    supervision: bool = False,
    alignment: bool = False,
    custom_field: bool = False,
    frame_shift: Seconds = 0.01,
) -> MonoCut:
    """Create a MonoCut with a random id that spans a freshly created recording.

    :param sampling_rate: passed to ``with_recording``; also used to derive the duration.
    :param num_samples: passed to ``with_recording``; also used to derive the duration.
    :param features: when True, the cut is passed through ``_with_features``.
    :param supervision: when True, one supervision covering the whole cut is appended.
    :param alignment: when True (and ``supervision`` is set), the supervision
        receives the result of ``_with_alignment``.
    :param custom_field: when True, ``_with_custom_temporal_array`` is called on the cut.
    :param frame_shift: forwarded to ``_with_features`` and ``_with_custom_temporal_array``.
    :return: the constructed cut.
    """
    duration = num_samples / sampling_rate
    recording = self.with_recording(sampling_rate=sampling_rate, num_samples=num_samples)
    cut = MonoCut(
        id=str(uuid4()),
        start=0,
        duration=duration,
        channel=0,
        recording=recording,
    )

    if features:
        cut = self._with_features(cut, frame_shift=frame_shift)

    if supervision:
        ali = self._with_alignment(cut, "irrelevant") if alignment else None
        segment = SupervisionSegment(
            id=f"sup-{cut.id}",
            recording_id=cut.recording_id,
            start=0,
            duration=cut.duration,
            text="irrelevant",
            alignment=ali,
        )
        cut.supervisions.append(segment)

    if custom_field:
        # The return value is not used here; the helper is called for its effect on ``cut``.
        self._with_custom_temporal_array(cut=cut, frame_shift=frame_shift)

    return cut
def test_mixed_cut_audio_mask(self, supervisions):
    """Check the audio mask of a cut appended to itself against expected sample spans."""
    cut = MonoCut(
        "cut",
        start=0,
        duration=2,
        channel=0,
        recording=Mock(sampling_rate=16000),
        supervisions=supervisions,
    )
    mask = cut.append(cut).supervisions_audio_mask()

    # Half-open (begin, end) sample spans; the second half repeats the first
    # one shifted by 32000 samples.
    active_spans = [(0, 8000), (9600, 12800), (32000, 40000), (41600, 44800)]
    silent_spans = [(8000, 9600), (12800, 32000), (40000, 41600), (44800, 64000)]

    ones = np.index_exp[[i for begin, end in active_spans for i in range(begin, end)]]
    zeros = np.index_exp[[i for begin, end in silent_spans for i in range(begin, end)]]

    assert (mask[ones] == 1).all()
    assert (mask[zeros] == 0).all()
def test_cut_audio_mask(self):
    """A cut without supervisions has an all-zero audio mask."""
    mock_recording = Mock(sampling_rate=16000)
    cut = MonoCut('cut', start=0, duration=2, channel=0, recording=mock_recording)
    assert cut.supervisions_audio_mask().sum() == 0
def test_mixed_cut_features_mask(self):
    """A cut without supervisions, appended to itself, has an all-zero feature mask."""
    mock_features = Mock(sampling_rate=16000, frame_shift=0.01)
    cut = MonoCut('cut', start=0, duration=2, channel=0, features=mock_features)
    mask = cut.append(cut).supervisions_feature_mask()
    assert mask.sum() == 0
def test_cut_custom_nonarray_attr_serialization():
    """Check that arbitrary custom fields work with Cuts upon (de)serialization."""
    custom_fields = {"SNR": 7.3}
    cut = MonoCut(id="x", start=10, duration=8, channel=0, custom=custom_fields)

    restored_cut = deserialize_item(cut.to_dict())

    assert cut == restored_cut
    # The entry placed in "custom" is read back as a regular attribute.
    assert restored_cut.SNR == 7.3
def test_cut_features_mask(self):
    """A cut without supervisions has an all-zero feature mask."""
    mock_features = Mock(sampling_rate=16000, frame_shift=0.01, num_frames=2000)
    cut = MonoCut("cut", start=0, duration=2, channel=0, features=mock_features)
    assert cut.supervisions_feature_mask().sum() == 0
def cut_with_supervision(recording):
    """Return a cut over [0.0, 0.5) of ``recording`` with one supervision of the same span."""
    supervision = SupervisionSegment(
        id="sup", recording_id="rec", start=0.0, duration=0.5
    )
    return MonoCut(
        id="cut",
        start=0.0,
        duration=0.5,
        channel=0,
        supervisions=[supervision],
        recording=recording,
    )
def cut_with_supervision_start01(recording):
    """Return a cut with start=0.1, duration=0.4 holding one supervision (start=0.1, duration=0.3)."""
    supervision = SupervisionSegment(
        id="sup", recording_id="rec", start=0.1, duration=0.3
    )
    return MonoCut(
        id="cut_start01",
        start=0.1,
        duration=0.4,
        channel=0,
        supervisions=[supervision],
        recording=recording,
    )
def random_cut_set(n_cuts=100) -> CutSet:
    """Build a CutSet of ``n_cuts`` cuts with random start times and durations.

    Every cut gets its own source-less Recording manifest
    (sampling_rate=16000, num_samples=1600000, duration=100.0).
    The start is drawn uniformly from [0, 5] and the duration from [3, 10],
    both rounded to 8 digits.

    Cut and recording ids are ``str(uuid4())``, i.e. plain strings rather than
    ``uuid.UUID`` objects, matching how other helpers create their ids.

    :param n_cuts: number of cuts to generate.
    :return: a CutSet built with ``CutSet.from_cuts``.
    """

    def make_cut() -> MonoCut:
        return MonoCut(
            id=str(uuid4()),
            start=round(random.uniform(0, 5), ndigits=8),
            duration=round(random.uniform(3, 10), ndigits=8),
            channel=0,
            recording=Recording(
                id=str(uuid4()),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        )

    return CutSet.from_cuts(make_cut() for _ in range(n_cuts))
def test_cut_load_array_truncate():
    """Check that loading a custom Array works after truncation."""
    expected = np.arange(20).astype(np.float32)
    with NamedTemporaryFile(suffix=".h5") as f:
        with LilcomHdf5Writer(f.name) as writer:
            cut = MonoCut(id="x", start=0, duration=5, channel=0)
            cut.ivector = writer.store_array(key="utt1", value=expected)

            truncated = cut.truncate(duration=3)

            # The test expects the whole array back from the truncated cut.
            np.testing.assert_equal(expected, truncated.load_ivector())
def test_cut_trim_to_supervisions_extend_handles_end_of_recording(mono_cut):
    """
    Scenario::

        |----------Recording---------|
        |---Sup1----|       |--Sup2--|
        |------------Cut-------------|

    Into::

        |----------Recording---------|
        |---Cut1----|     |---Cut2---|
        |---Sup1----|       |--Sup2--|
    """
    recording = Recording(
        id="X", sources=[], sampling_rate=8000, num_samples=80000, duration=10.0
    )
    first_sup = SupervisionSegment(id="X", recording_id="X", start=0.0, duration=4.0)
    second_sup = SupervisionSegment(id="X", recording_id="X", start=7.0, duration=3.0)
    cut = MonoCut(
        id="X",
        start=0.0,
        duration=10.0,
        channel=0,
        supervisions=[first_sup, second_sup],
        recording=recording,
    )

    cuts = cut.trim_to_supervisions(min_duration=4.0)

    assert len(cuts) == 2
    c1, c2 = cuts

    # First cut: expected to coincide with the first supervision.
    assert c1.start == 0
    assert c1.duration == 4.0
    assert len(c1.supervisions) == 1
    [c1_s1] = c1.supervisions
    assert c1_s1.start == 0.0
    assert c1_s1.duration == 4.0

    # Second cut: expected at [6.5, 10.0), with its supervision 0.5 into the cut.
    assert c2.start == 6.5
    assert c2.duration == 3.5
    assert len(c2.supervisions) == 1
    [c2_s1] = c2.supervisions
    assert c2_s1.start == 0.5
    assert c2_s1.duration == 3.0
def cut_with_supervision(recording):
    """Return a cut over [0.0, 0.5) of ``recording`` with one supervision of the same span."""
    supervision = SupervisionSegment(id='sup', recording_id='rec', start=0.0, duration=0.5)
    return MonoCut(
        id='cut',
        start=0.0,
        duration=0.5,
        channel=0,
        supervisions=[supervision],
        recording=recording,
    )
def test_cut_load_array():
    """Check that a custom Array attribute is successfully recognized."""
    expected = np.arange(20).astype(np.float32)
    with NamedTemporaryFile(suffix=".h5") as f:
        with LilcomHdf5Writer(f.name) as writer:
            manifest = writer.store_array(key="utt1", value=expected)
            cut = MonoCut(id="x", start=0, duration=5, channel=0)
            # Note: "ivector" and "load_ivector()" are not regular MonoCut members;
            #       assigning the manifest below extends the cut dynamically.
            cut.ivector = manifest
            np.testing.assert_equal(expected, cut.load_ivector())
def cut_with_supervision_start01(recording):
    """Return a cut with start=0.1, duration=0.4 holding one supervision (start=0.1, duration=0.3)."""
    supervision = SupervisionSegment(id='sup', recording_id='rec', start=0.1, duration=0.3)
    return MonoCut(
        id='cut_start01',
        start=0.1,
        duration=0.4,
        channel=0,
        supervisions=[supervision],
        recording=recording,
    )
def test_cut_custom_attr_serialization():
    """Check that a custom Array attribute is successfully serialized + deserialized."""
    expected = np.arange(20).astype(np.float32)
    with NamedTemporaryFile(suffix=".h5") as f:
        with LilcomHdf5Writer(f.name) as writer:
            cut = MonoCut(id="x", start=0, duration=5, channel=0)
            cut.ivector = writer.store_array(key="utt1", value=expected)

            # Round-trip through the dict representation.
            restored_cut = deserialize_item(cut.to_dict())
            assert cut == restored_cut

            np.testing.assert_equal(expected, restored_cut.load_ivector())
def test_padding_issue_478():
    """
    https://github.com/lhotse-speech/lhotse/issues/478
    """
    # (cut id, cut duration, argument for dummy_recording) for the two cuts.
    cut_specs = [("c1", 4.9, 1), ("c2", 4.895, 2)]

    with NamedTemporaryFile(suffix=".h5") as f:
        with NumpyHdf5Writer(f.name) as writer:
            prepared_cuts = []
            expected_alignments = []
            for cut_id, duration, rec_arg in cut_specs:
                # Prepare data for this cut: 121 random labels at frame_shift=0.04.
                cut = MonoCut(
                    cut_id,
                    start=0,
                    duration=duration,
                    channel=0,
                    recording=dummy_recording(rec_arg),
                )
                ali = np.random.randint(500, size=(121,))
                cut.label_alignment = writer.store_array(
                    cut_id, ali, frame_shift=0.04, temporal_dim=0
                )
                prepared_cuts.append(cut)
                expected_alignments.append(ali)

            # Test collation behavior on this cutset.
            cuts = CutSet.from_cuts(prepared_cuts)
            label_alignments, _label_alignment_lens = collate_custom_field(
                cuts, "label_alignment"
            )

            for row, expected in enumerate(expected_alignments):
                np.testing.assert_equal(label_alignments[row].numpy(), expected)
def cut(recording):
    """Return a cut with start=0, duration=1.0 on ``recording`` and one supervision (start=0, duration=0.5)."""
    supervision = SupervisionSegment(
        id="sup", recording_id=recording.id, start=0, duration=0.5
    )
    return MonoCut(
        id="cut",
        start=0,
        duration=1.0,
        channel=0,
        recording=recording,
        supervisions=[supervision],
    )
def test_cut_load_temporal_array_truncate():
    """Check the array loaded via TemporalArray is truncated along with the cut."""
    with NamedTemporaryFile(suffix=".h5") as f:
        with NumpyHdf5Writer(f.name) as writer:
            expected_duration = 52.4  # 131 frames x 0.4s frame shift == 52.4s
            cut = MonoCut(id="x", start=0, duration=expected_duration, channel=0)

            labels = np.random.randint(500, size=131)
            cut.alignment = writer.store_array(
                key="utt1", value=labels, frame_shift=0.4, temporal_dim=0
            )

            truncated = cut.truncate(duration=5.0)
            piece = truncated.load_alignment()

            num_expected_frames = 13  # 5.0 / 0.4 == 12.5 ~= 13
            assert piece.shape == (num_expected_frames,)
            np.testing.assert_equal(labels[:num_expected_frames], piece)
def test_cut_load_temporal_array():
    """Check that we can read a TemporalArray from a cut when their durations match."""
    labels = np.random.randint(500, size=131)
    with NamedTemporaryFile(suffix=".h5") as f:
        with NumpyHdf5Writer(f.name) as writer:
            manifest = writer.store_array(
                key="utt1", value=labels, frame_shift=0.4, temporal_dim=0
            )
            expected_duration = 52.4  # 131 frames x 0.4s frame shift == 52.4s
            cut = MonoCut(id="x", start=0, duration=expected_duration, channel=0)
            # Note: "alignment" and "load_alignment()" are not regular MonoCut members;
            #       assigning the manifest below extends the cut dynamically.
            cut.alignment = manifest
            np.testing.assert_equal(labels, cut.load_alignment())
def cut_set():
    """Return a CutSet with one fully populated cut and seven variants of it.

    The variants are copies lacking supervisions, recording or features,
    three paddings to 30.0 (left / right / both), and a mix of the cut with itself.
    """
    features = Features(
        type="fbank",
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type="lilcom",
        storage_path="irrelevant",
        storage_key="irrelevant",
    )
    recording = Recording(
        id="rec-1",
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
    )
    supervisions = [
        SupervisionSegment(
            id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0
        ),
        SupervisionSegment(
            id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0
        ),
    ]
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=features,
        recording=recording,
        supervisions=supervisions,
    )

    variants = [cut]
    variants.append(fastcopy(cut, id="cut-nosup", supervisions=[]))
    variants.append(fastcopy(cut, id="cut-norec", recording=None))
    variants.append(fastcopy(cut, id="cut-nofeat", features=None))
    variants.append(cut.pad(duration=30.0, direction="left"))
    variants.append(cut.pad(duration=30.0, direction="right"))
    variants.append(cut.pad(duration=30.0, direction="both"))
    variants.append(cut.mix(cut, offset_other_by=5.0, snr=8))
    return CutSet.from_cuts(variants)
def libri_cut_with_supervision(libri_recording_orig):
    """Return a cut spanning the whole recording, with one supervision of the same span."""
    full_duration = libri_recording_orig.duration
    supervision = SupervisionSegment(
        id="sup",
        recording_id="rec",
        start=0,
        duration=full_duration,
    )
    return MonoCut(
        id="libri_cut_1",
        start=0,
        duration=full_duration,
        channel=0,
        supervisions=[supervision],
        recording=libri_recording_orig,
    )
def test_mixed_cut_features_mask(self, supervisions):
    """Check the feature mask of a cut appended to itself against expected frame spans."""
    mock_features = Mock(sampling_rate=16000, frame_shift=0.01)
    cut = MonoCut('cut', start=0, duration=2, channel=0,
                  features=mock_features, supervisions=supervisions)
    mask = cut.append(cut).supervisions_feature_mask()

    # Half-open (begin, end) frame spans; the second half repeats the first
    # one shifted by 200 frames.
    active_spans = [(0, 50), (60, 80), (200, 250), (260, 280)]
    silent_spans = [(50, 60), (80, 200), (250, 260), (280, 400)]

    ones = np.index_exp[[i for begin, end in active_spans for i in range(begin, end)]]
    zeros = np.index_exp[[i for begin, end in silent_spans for i in range(begin, end)]]

    assert (mask[ones] == 1).all()
    assert (mask[zeros] == 0).all()
def cut_set():
    """Return a CutSet with one fully populated cut and seven variants of it.

    The variants are copies lacking supervisions, recording or features,
    three paddings to 30.0 (left / right / both), and a mix of the cut with itself.
    """
    features = Features(
        type='fbank',
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type='lilcom',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    recording = Recording(
        id='rec-1',
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[AudioSource(type='file', channels=[0], source='irrelevant')],
    )
    supervisions = [
        SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
        SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
    ]
    cut = MonoCut(
        id='cut-1',
        start=0.0,
        duration=10.0,
        channel=0,
        features=features,
        recording=recording,
        supervisions=supervisions,
    )

    variants = [cut]
    variants.append(fastcopy(cut, id='cut-nosup', supervisions=[]))
    variants.append(fastcopy(cut, id='cut-norec', recording=None))
    variants.append(fastcopy(cut, id='cut-nofeat', features=None))
    variants.append(cut.pad(duration=30.0, direction='left'))
    variants.append(cut.pad(duration=30.0, direction='right'))
    variants.append(cut.pad(duration=30.0, direction='both'))
    variants.append(cut.mix(cut, offset_other_by=5.0, snr=8))
    return CutSet.from_cuts(variants)
def test_augmentation_chain_randomized(
        target_sampling_rate: int,
        sp_factor: float,
        resample_first: bool,
        cut_duration: Seconds
):
    """Chain resampling and speed perturbation (in either order) on a fixture recording.

    The loaded audio must have exactly ``num_samples`` samples, both for the
    augmented recording itself and for a cut created on top of it.
    """
    recording = Recording.from_file('test/fixtures/libri/libri-1088-134315-0000.wav')

    if resample_first:
        augmented = recording.resample(target_sampling_rate)
        augmented = augmented.perturb_speed(sp_factor)
    else:
        augmented = recording.perturb_speed(sp_factor)
        augmented = augmented.resample(target_sampling_rate)

    samples = augmented.load_audio()
    assert samples.shape[1] == augmented.num_samples

    cut_aug = MonoCut(
        id='dummy',
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=augmented,
    )
    assert cut_aug.load_audio().shape[1] == cut_aug.num_samples