def parse_utterance(
    audio: Any, root_path: Path, subsets: Sequence
) -> Tuple[Recording, Dict[str, List[SupervisionSegment]]]:
    """Convert one corpus manifest entry into a :class:`Recording` plus
    per-subset lists of :class:`SupervisionSegment`.

    :param audio: manifest dict with keys ``aid``, ``path``, ``duration`` and
        ``segments``; each segment carries ``sid``, ``begin_time``,
        ``end_time``, ``text`` and ``subsets``.
    :param root_path: directory that ``audio["path"]`` is relative to.
    :param subsets: subset names to collect; segments belonging only to other
        subsets are dropped.
    :return: the recording and a dict mapping each requested subset name to
        its (possibly empty) list of supervision segments.
    """
    # The corpus audio is distributed at a fixed 16 kHz sampling rate.
    sampling_rate = 16000
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=[0],
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(
            duration=audio["duration"], sampling_rate=sampling_rate
        ),
        sampling_rate=sampling_rate,
        duration=audio["duration"],
    )
    # Fix: the original used ``defaultdict(dict)`` whose ``dict`` default
    # factory was never exercised — every requested subset was immediately
    # pre-assigned a list in a separate loop.  A dict comprehension states
    # the intent directly and matches the declared return type.
    segments: Dict[str, List[SupervisionSegment]] = {sub: [] for sub in subsets}
    for seg in audio["segments"]:
        segment = SupervisionSegment(
            id=seg["sid"],
            recording_id=audio["aid"],
            start=seg["begin_time"],
            duration=add_durations(
                seg["end_time"], -seg["begin_time"], sampling_rate
            ),
            language="Chinese",
            text=seg["text"].strip(),
        )
        # A segment may belong to several subsets; file it under each
        # requested one.
        for sub in seg["subsets"]:
            if sub in subsets:
                segments[sub].append(segment)
    return recording, segments
def test_cut_load_custom_recording_pad_left():
    """A custom recording attached to a cut survives left-padding: the pad
    region loads as silence and the original signal sits at the right edge."""
    sr = 16000
    dur = 52.4
    # Synthesize a random mono waveform and normalize it to [-1, 1].
    samples = np.random.randn(1, compute_num_samples(dur, sr)).astype(np.float32)
    samples /= np.abs(samples).max()
    with NamedTemporaryFile(suffix=".wav") as wav:
        torchaudio.save(wav.name, torch.from_numpy(samples), sr)
        wav.flush()
        os.fsync(wav)
        song = Recording.from_file(wav.name)
        # MonoCut has no built-in "my_favorite_song" attribute; assigning a
        # Recording dynamically makes Lhotse expose a matching
        # ``load_my_favorite_song()`` accessor.
        cut = MonoCut(
            id="x",
            start=0,
            duration=dur,
            channel=0,
            recording=dummy_recording(0, duration=dur),
        )
        cut.my_favorite_song = song
        padded = cut.pad(duration=60.0, direction="left")
        loaded = padded.load_my_favorite_song()
        # 60 s at 16 kHz -> 960000 samples after padding.
        assert loaded.shape == (1, 960000)
        n = samples.shape[1]
        # The left pad must be silence and the tail must be the original audio.
        np.testing.assert_almost_equal(0, loaded[:, :-n])
        np.testing.assert_almost_equal(samples, loaded[:, -n:])
def parse_utterance(
    audio: Any, root_path: Path
) -> Optional[Tuple[Recording, List[SupervisionSegment]]]:
    """Build a :class:`Recording` and its :class:`SupervisionSegment` list
    from one corpus manifest entry.

    :param audio: manifest dict with ``aid``, ``path``, ``sample_rate``,
        ``channels``, ``duration`` and ``segments``; each segment carries
        ``sid``, ``begin_time``, ``end_time``, ``speaker`` and ``text_tn``.
    :param root_path: directory that ``audio["path"]`` is relative to.
    :return: the recording together with all of its supervision segments.
    """
    sr = int(audio["sample_rate"])
    dur = Seconds(audio["duration"])
    recording = Recording(
        id=audio["aid"],
        sources=[
            AudioSource(
                type="file",
                channels=list(range(int(audio["channels"]))),
                source=str(root_path / audio["path"]),
            )
        ],
        num_samples=compute_num_samples(duration=dur, sampling_rate=sr),
        sampling_rate=sr,
        duration=dur,
    )
    # One supervision per manifest segment; durations are rounded to avoid
    # floating-point noise in end-begin arithmetic.
    segments = [
        SupervisionSegment(
            id=seg["sid"],
            recording_id=audio["aid"],
            start=Seconds(seg["begin_time"]),
            duration=round(
                Seconds(seg["end_time"] - seg["begin_time"]), ndigits=8
            ),
            channel=0,
            language="English",
            speaker=seg["speaker"],
            text=seg["text_tn"],
        )
        for seg in audio["segments"]
    ]
    return recording, segments
def test_cut_load_custom_recording_truncate():
    """Truncating a cut truncates a dynamically-attached custom recording to
    the same duration when loaded back."""
    sr = 16000
    dur = 52.4
    # Synthesize a random mono waveform and normalize it to [-1, 1].
    samples = np.random.randn(1, compute_num_samples(dur, sr)).astype(np.float32)
    samples /= np.abs(samples).max()
    with NamedTemporaryFile(suffix=".wav") as wav:
        torchaudio.save(wav.name, torch.from_numpy(samples), sr)
        wav.flush()
        os.fsync(wav)
        song = Recording.from_file(wav.name)
        # Cuts have no built-in "my_favorite_song" attribute; assigning a
        # Recording dynamically makes Lhotse expose a matching
        # ``load_my_favorite_song()`` accessor.
        cut = dummy_cut(0, duration=dur)
        cut.my_favorite_song = song
        shortened = cut.truncate(duration=5.0)
        loaded = shortened.load_my_favorite_song()
        # 5 s at 16 kHz -> 80000 samples, matching the signal's prefix.
        assert loaded.shape == (1, 80000)
        np.testing.assert_almost_equal(samples[:, :80000], loaded)