def mixed_overlapping_cut_set():
    """
    Fixture: a CutSet holding a single MixedCut built from two overlapping tracks.

    Layout of the mix::

        |---------------mixedcut--------------------|
        |--------rec1 0-30s--------|
                       |-------rec2 15-45s--------|
          |---sup1--|
                  |sup2|
                               |-----sup3-----|
    """
    first = MonoCut(
        'cut1',
        start=0,
        duration=30,
        channel=0,
        recording=Recording(
            id='rec1',
            sources=[],
            sampling_rate=16000,
            num_samples=160000,
            duration=60.0,
        ),
        supervisions=[
            SupervisionSegment('sup1', 'rec1', start=1.5, duration=10.5),
            SupervisionSegment('sup2', 'rec1', start=10, duration=6),
        ],
    )
    second = MonoCut(
        'cut2',
        start=15,
        duration=30,
        channel=0,
        recording=Recording(
            id='rec2',
            sources=[],
            sampling_rate=16000,
            num_samples=160000,
            duration=60.0,
        ),
        supervisions=[
            SupervisionSegment('sup3', 'rec2', start=8, duration=18),
        ],
    )
    # Offsetting the second track by 15 s creates the partial overlap shown above.
    cut_set = CutSet.from_cuts([first.mix(second, offset_other_by=15.0)])
    assert isinstance(cut_set[0], MixedCut)
    return cut_set
def test_trim_to_supervisions_mixed_cuts():
    """trim_to_supervisions() on a MixedCut whose supervisions don't overlap
    should yield one simple (unmixed) cut per supervision, each starting at
    the supervision boundary."""
    rec = Recording(
        id='rec1',
        sources=[],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
    )
    left = Cut(
        'cut1',
        start=0,
        duration=30,
        channel=0,
        recording=rec,
        supervisions=[
            SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
            SupervisionSegment('sup2', 'rec1', start=10, duration=5),
            SupervisionSegment('sup3', 'rec1', start=20, duration=8),
        ],
    )
    right = Cut(
        'cut2',
        start=0,
        duration=30,
        channel=0,
        recording=rec,
        supervisions=[
            SupervisionSegment('sup4', 'rec1', start=0, duration=30),
        ],
    )
    cut_set = CutSet.from_cuts([left.append(right)])
    assert isinstance(cut_set[0], MixedCut)

    cuts = cut_set.trim_to_supervisions()

    assert len(cuts) == 4
    # After "trimming", the MixedCut "decayed" into simple, unmixed cuts,
    # as they did not overlap.
    assert all(isinstance(cut, Cut) for cut in cuts)
    assert all(len(cut.supervisions) == 1 for cut in cuts)
    assert all(cut.supervisions[0].start == 0 for cut in cuts)

    # Each trimmed cut keeps its supervision's original start/duration.
    expected = [
        ('sup1', 1.5, 8.5),
        ('sup2', 10, 5),
        ('sup3', 20, 8),
        ('sup4', 0, 30),
    ]
    for cut, (sup_id, start, duration) in zip(cuts, expected):
        assert cut.start == start
        assert cut.duration == duration
        assert cut.supervisions[0].id == sup_id
def test_mix_same_recording_channels():
    """Two cuts over different channels of one recording should collapse
    into a single two-track MixedCut."""
    recording = Recording(
        'rec',
        sampling_rate=8000,
        num_samples=30 * 8000,
        duration=30,
        sources=[
            AudioSource('file', channels=[0], source='irrelevant1.wav'),
            AudioSource('file', channels=[1], source='irrelevant2.wav'),
        ],
    )
    cut_set = CutSet.from_cuts(
        Cut(cut_id, start=0, duration=30, channel=channel, recording=recording)
        for cut_id, channel in (('cut1', 0), ('cut2', 1))
    )

    mixed = cut_set.mix_same_recording_channels()

    assert len(mixed) == 1
    cut = mixed[0]
    assert isinstance(cut, MixedCut)
    assert len(cut.tracks) == 2
    assert cut.tracks[0].cut == cut_set[0]
    assert cut.tracks[1].cut == cut_set[1]
def recording():
    """Fixture: a 1-second, 8 kHz stereo recording backed by a fixture WAV file."""
    stereo_source = AudioSource(
        type='file',
        channels=[0, 1],
        source='test/fixtures/stereo.wav',
    )
    return Recording(
        id='rec',
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
        sources=[stereo_source],
    )
def recording(file_source):
    """Fixture: a half-second, 8 kHz recording wrapping the given audio source."""
    return Recording(
        id="rec",
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
        sources=[file_source],
    )
def cut_with_relative_paths():
    """Fixture: a Cut whose features and recording use relative storage paths."""
    feats = Features(
        type='fbank',
        num_frames=1000,
        num_features=40,
        sampling_rate=8000,
        storage_type='lilcom_files',
        storage_path='storage_dir',
        storage_key='feats.llc',
        start=0,
        duration=10,
    )
    rec = Recording(
        'rec',
        [AudioSource('file', [0], 'audio.wav')],
        8000,
        80000,
        10.0,
    )
    return Cut('cut', 0, 10, 0, features=feats, recording=rec)
def random_cut_set(n_cuts=100) -> CutSet:
    """
    Fixture: a CutSet of ``n_cuts`` randomly-positioned cuts.

    Each cut starts within [0, 5] s and lasts between 3 and 10 s (both rounded
    to 8 decimal places), over a dummy 100 s, 16 kHz recording with no sources.

    :param n_cuts: number of random cuts to generate.
    :return: a ``CutSet`` of ``n_cuts`` random ``MonoCut`` entries.
    """
    return CutSet.from_cuts(
        MonoCut(
            # uuid4() returns a UUID object, but manifest ids are declared as
            # ``str`` — convert explicitly so the ids serialize cleanly.
            id=str(uuid4()),
            start=round(random.uniform(0, 5), ndigits=8),
            duration=round(random.uniform(3, 10), ndigits=8),
            channel=0,
            recording=Recording(
                id=str(uuid4()),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        )
        for _ in range(n_cuts)
    )
def test_cut_trim_to_supervisions_extend_handles_end_of_recording(mono_cut):
    """
    Extending trimmed cuts must not run past the end of the recording.

    Scenario::

        |----------Recording---------|
        |---Sup1----|       |--Sup2--|
        |------------Cut-------------|

    Into::

        |----------Recording---------|
        |---Cut1----|     |---Cut2---|
        |---Sup1----|       |--Sup2--|
    """
    cut = MonoCut(
        id="X",
        start=0.0,
        duration=10.0,
        channel=0,
        supervisions=[
            SupervisionSegment(id="X", recording_id="X", start=0.0, duration=4.0),
            SupervisionSegment(id="X", recording_id="X", start=7.0, duration=3.0),
        ],
        recording=Recording(
            id="X",
            sources=[],
            sampling_rate=8000,
            num_samples=80000,
            duration=10.0,
        ),
    )

    cuts = cut.trim_to_supervisions(min_duration=4.0)
    assert len(cuts) == 2
    first, second = cuts

    # The first supervision already satisfies min_duration — no extension.
    assert first.start == 0
    assert first.duration == 4.0
    assert len(first.supervisions) == 1
    (sup,) = first.supervisions
    assert sup.start == 0.0
    assert sup.duration == 4.0

    # The second cut is extended, but clipped at the recording's end.
    assert second.start == 6.5
    assert second.duration == 3.5
    assert len(second.supervisions) == 1
    (sup,) = second.supervisions
    assert sup.start == 0.5
    assert sup.duration == 3.0
def recording():
    """Fixture: a 1-second, 8 kHz stereo recording pointing at a fixture WAV file."""
    source = AudioSource(
        type="file",
        channels=[0, 1],
        source="test/fixtures/stereo.wav",
    )
    return Recording(
        id="rec",
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
        sources=[source],
    )
def with_recording(self, sampling_rate: int, num_samples: int) -> Recording:
    """Write random samples into a temporary WAV file and return a Recording
    pointing at it. The open handle is kept in ``self.files`` so the temp
    file survives until cleanup."""
    tmp = NamedTemporaryFile('wb', suffix='.wav')
    self.files.append(tmp)
    samples = np.random.rand(num_samples)
    soundfile.write(tmp.name, samples, samplerate=sampling_rate)
    return Recording(
        id=str(uuid4()),
        sources=[AudioSource(type='file', channels=[0], source=tmp.name)],
        sampling_rate=sampling_rate,
        num_samples=num_samples,
        duration=num_samples / sampling_rate,
    )
def make_recording(sampling_rate: int, num_samples: int) -> Recording:
    """Yield a Recording backed by a temporary sine-wave WAV file.

    The temp file lives for the whole duration of the consuming test:
    cleanup only happens when the generator resumes after the yield.
    """
    with NamedTemporaryFile('wb', suffix='.wav') as f:
        duration = num_samples / sampling_rate
        # A 1 Hz sine sampled at ``sampling_rate``.
        samples: np.ndarray = np.sin(
            2 * np.pi * np.arange(0, num_samples) / sampling_rate
        )
        soundfile.write(f, samples, samplerate=sampling_rate)
        yield Recording(
            id=f'recording-{sampling_rate}-{duration}',
            sources=[AudioSource(type='file', channels=[0], source=f.name)],
            sampling_rate=sampling_rate,
            num_samples=num_samples,
            duration=duration,
        )
def recording_set():
    """Fixture: a RecordingSet with one two-channel recording whose channels
    come from a file source and a command source respectively."""
    sources = [
        AudioSource(type='file', channels=[0], source='text/fixtures/mono_c0.wav'),
        AudioSource(type='command', channels=[1], source='cat text/fixtures/mono_c1.wav'),
    ]
    rec = Recording(
        id='x',
        sources=sources,
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
    )
    return RecordingSet.from_recordings([rec])
def cut_set():
    """Fixture: a CutSet covering the main cut variants — a fully-populated
    MonoCut, copies missing supervisions/recording/features, three padding
    directions, and a self-mix."""
    features = Features(
        type="fbank",
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type="lilcom",
        storage_path="irrelevant",
        storage_key="irrelevant",
    )
    recording = Recording(
        id="rec-1",
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
    )
    supervisions = [
        SupervisionSegment(id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0),
        SupervisionSegment(id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0),
    ]
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=features,
        recording=recording,
        supervisions=supervisions,
    )
    variants = [
        cut,
        fastcopy(cut, id="cut-nosup", supervisions=[]),
        fastcopy(cut, id="cut-norec", recording=None),
        fastcopy(cut, id="cut-nofeat", features=None),
        cut.pad(duration=30.0, direction="left"),
        cut.pad(duration=30.0, direction="right"),
        cut.pad(duration=30.0, direction="both"),
        cut.mix(cut, offset_other_by=5.0, snr=8),
    ]
    return CutSet.from_cuts(variants)
def cut_set():
    """Fixture: a CutSet exercising the cut variants — a complete Cut, copies
    without supervisions/recording/features, padded versions, and a self-mix."""
    feats = Features(
        type='fbank',
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type='lilcom',
        storage_path='irrelevant',
        storage_key='irrelevant',
    )
    rec = Recording(
        id='rec-1',
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[AudioSource(type='file', channels=[0], source='irrelevant')],
    )
    sups = [
        SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
        SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
    ]
    cut = Cut(
        id='cut-1',
        start=0.0,
        duration=10.0,
        channel=0,
        features=feats,
        recording=rec,
        supervisions=sups,
    )
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id='cut-nosup', supervisions=[]),
        fastcopy(cut, id='cut-norec', recording=None),
        fastcopy(cut, id='cut-nofeat', features=None),
        cut.pad(duration=30.0, direction='left'),
        cut.pad(duration=30.0, direction='right'),
        cut.pad(duration=30.0, direction='both'),
        cut.mix(cut, offset_other_by=5.0, snr=8),
    ])
def random_cut_set(n_cuts=100) -> CutSet:
    """
    Fixture: a CutSet of ``n_cuts`` cuts with random, sample-aligned offsets.

    Starts are drawn from [0, 5] s and durations from [3, 10] s, quantized to
    exact sample boundaries at 16 kHz; the backing recording is a dummy 100 s
    manifest with no audio sources.

    :param n_cuts: number of random cuts to generate.
    :return: a ``CutSet`` of ``n_cuts`` random ``MonoCut`` entries.
    """
    sr = 16000
    return CutSet.from_cuts(
        MonoCut(
            # uuid4() yields a UUID object, but manifest ids are declared as
            # ``str`` — convert explicitly so the ids serialize cleanly.
            id=str(uuid4()),
            start=random.randint(0, 5 * sr) / sr,
            duration=random.randint(3 * sr, 10 * sr) / sr,
            channel=0,
            recording=Recording(
                id=str(uuid4()),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        )
        for _ in range(n_cuts)
    )
def recording_set():
    """Fixture: a RecordingSet holding one two-channel recording — channel 0
    from a file source, channel 1 from a command source."""
    rec = Recording(
        id="x",
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
        sources=[
            AudioSource(type="file", channels=[0], source="text/fixtures/mono_c0.wav"),
            AudioSource(
                type="command",
                channels=[1],
                source="cat text/fixtures/mono_c1.wav",
            ),
        ],
    )
    return RecordingSet.from_recordings([rec])
def cut_with_relative_paths():
    """Fixture: a MonoCut whose features and recording use relative storage paths."""
    feats = Features(
        type="fbank",
        num_frames=1000,
        num_features=40,
        sampling_rate=8000,
        storage_type="lilcom_files",
        storage_path="storage_dir",
        storage_key="feats.llc",
        start=0,
        duration=10,
        frame_shift=0.01,
    )
    rec = Recording(
        "rec",
        [AudioSource("file", [0], "audio.wav")],
        8000,
        80000,
        10.0,
    )
    return MonoCut("cut", 0, 10, 0, features=feats, recording=rec)
def mono_cut():
    """
    Fixture: a 10 s MonoCut spanning a whole recording with three supervisions.

    Scenario::

        |-----------------Recording-----------------|
        "Hey, Matt!"     "Yes?"
        |--------------|  |-----|
                           "Oh, nothing"
                           |------------------|
        |-------------------Cut1--------------------|
    """
    # NOTE: sources is a literal Ellipsis placeholder — the audio is never
    # actually loaded by the tests using this fixture.
    rec = Recording(
        id="rec1",
        duration=10.0,
        sampling_rate=8000,
        num_samples=80000,
        sources=[...],
    )
    return MonoCut(
        id="rec1-cut1",
        start=0.0,
        duration=10.0,
        channel=0,
        recording=rec,
        supervisions=[
            SupervisionSegment(
                id="sup1", recording_id="rec1", start=0.0, duration=3.37, text="Hey, Matt!"
            ),
            SupervisionSegment(
                id="sup2", recording_id="rec1", start=4.5, duration=0.9, text="Yes?"
            ),
            SupervisionSegment(
                id="sup3", recording_id="rec1", start=4.9, duration=4.3, text="Oh, nothing"
            ),
        ],
    )
def with_recording(
    self, sampling_rate: int, num_samples: int, use_zeros: bool = False
) -> Recording:
    """Save a temporary WAV file (random noise, or silence when ``use_zeros``
    is set) and return a Recording pointing at it. The open handle is kept in
    ``self.files`` so the temp file outlives this call."""
    import torchaudio  # torchaudio does not have issues on M1 macs unlike soundfile

    tmp = NamedTemporaryFile("wb", suffix=".wav")
    self.files.append(tmp)
    samples = torch.zeros((1, num_samples)) if use_zeros else torch.rand((1, num_samples))
    torchaudio.save(tmp.name, samples, sample_rate=sampling_rate)
    # Make sure the audio data actually hits the disk before it is re-opened.
    tmp.flush()
    os.fsync(tmp)
    return Recording(
        id=str(uuid4()),
        sources=[AudioSource(type="file", channels=[0], source=tmp.name)],
        sampling_rate=sampling_rate,
        num_samples=num_samples,
        duration=num_samples / sampling_rate,
    )
def make_recording_callhome(
    sph_path: Pathlike,
    recording_id: Optional[str] = None,
    relative_path_depth: Optional[int] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Recording:
    """
    This function creates manifests for CallHome recordings that are compressed
    with shorten, a rare and mostly unsupported codec. You will need to install
    sph2pipe (e.g. using Kaldi) in order to read these files.

    :param sph_path: path to the ``.sph`` file.
    :param recording_id: optional explicit id; defaults to the file's stem.
    :param relative_path_depth: when given and positive, only that many trailing
        path components are stored in the source command (keeps manifests relocatable).
    :param sph2pipe_path: optional path to the sph2pipe binary; defaults to
        resolving ``sph2pipe`` via PATH.
    :return: a ``Recording`` whose audio source is a sph2pipe command.
    """
    try:
        from sphfile import SPHFile
    except ImportError:
        raise ImportError(
            "Please install sphfile (pip install sphfile) instead and "
            "try preparing CallHome English again."
        )
    sph2pipe_path = (
        'sph2pipe' if sph2pipe_path is None else str(sph2pipe_path).strip()
    )
    sph_path = Path(sph_path)
    # Only the SPH header is needed — sph2pipe does the actual decoding later.
    header = SPHFile(sph_path).format
    if relative_path_depth is not None and relative_path_depth > 0:
        audio_path = '/'.join(sph_path.parts[-relative_path_depth:])
    else:
        audio_path = str(sph_path)
    return Recording(
        id=recording_id if recording_id is not None else sph_path.stem,
        sampling_rate=header['sample_rate'],
        num_samples=header['sample_count'],
        duration=header['sample_count'] / header['sample_rate'],
        sources=[
            AudioSource(
                type='command',
                channels=list(range(header['channel_count'])),
                source=f'{sph2pipe_path} -f wav -p ' + audio_path,
            )
        ],
    )
def test_mix_same_recording_channels():
    """Cuts over separate channels of the same recording should be merged
    into one two-track MixedCut."""
    recording = Recording(
        "rec",
        sampling_rate=8000,
        num_samples=30 * 8000,
        duration=30,
        sources=[
            AudioSource("file", channels=[0], source="irrelevant1.wav"),
            AudioSource("file", channels=[1], source="irrelevant2.wav"),
        ],
    )
    cut_set = CutSet.from_cuts(
        MonoCut(cut_id, start=0, duration=30, channel=channel, recording=recording)
        for cut_id, channel in (("cut1", 0), ("cut2", 1))
    )

    mixed = cut_set.mix_same_recording_channels()

    assert len(mixed) == 1
    cut = mixed[0]
    assert isinstance(cut, MixedCut)
    assert len(cut.tracks) == 2
    assert cut.tracks[0].cut == cut_set[0]
    assert cut.tracks[1].cut == cut_set[1]