def cut(recording):
    """Fixture: a 1-second MonoCut over ``recording`` carrying one 0.5s supervision."""
    supervision = SupervisionSegment(
        id="sup", recording_id=recording.id, start=0, duration=0.5
    )
    return MonoCut(
        id="cut",
        start=0,
        duration=1.0,
        channel=0,
        recording=recording,
        supervisions=[supervision],
    )
def test_cut_move_to_memory_audio_serialization():
    """A cut whose audio was moved to memory must survive a dict round-trip."""
    wav_path = "test/fixtures/mono_c0.wav"
    base_cut = dummy_cut(0, duration=0.5).drop_recording()
    base_cut.recording = Recording.from_file(wav_path)
    memory_cut = base_cut.move_to_memory()
    # move_to_memory() must leave the original cut unmodified.
    assert base_cut.custom is None
    serialized = memory_cut.to_dict()
    restored = MonoCut.from_dict(serialized)
    np.testing.assert_equal(restored.load_audio(), memory_cut.load_audio())
def test_cut_load_custom_recording_pad_both():
    """Padding a cut on both sides must zero-pad a dynamically attached
    custom recording accordingly (silence left, audio in the middle, silence right)."""
    sampling_rate = 16000
    duration = 52.4
    audio = np.random.randn(
        1, compute_num_samples(duration, sampling_rate)
    ).astype(np.float32)
    audio /= np.abs(audio).max()  # normalize to [-1, 1]
    with NamedTemporaryFile(suffix=".wav") as f:
        torchaudio.save(f.name, torch.from_numpy(audio), sampling_rate)
        f.flush()
        # os.fsync() is documented for file descriptors; pass fileno()
        # explicitly instead of relying on the implicit fileno() lookup.
        os.fsync(f.fileno())
        recording = Recording.from_file(f.name)
        # Note: MonoCut doesn't normally have a "my_favorite_song" attribute
        # and a "load_my_favorite_song()" method.
        # We are dynamically extending it.
        cut = MonoCut(
            id="x",
            start=0,
            duration=duration,
            channel=0,
            recording=dummy_recording(0, duration=duration),
        )
        cut.my_favorite_song = recording
        cut_pad = cut.pad(duration=duration + 1, direction="left").pad(
            duration=60.0, direction="right"
        )
        restored_audio = cut_pad.load_my_favorite_song()
        assert restored_audio.shape == (1, 960000)  # 16000 * 60
        # Left padding: the first second is silence.
        np.testing.assert_almost_equal(0, restored_audio[:, :sampling_rate])
        # Middle: the original audio, shifted right by one second.
        np.testing.assert_almost_equal(
            audio, restored_audio[:, sampling_rate : audio.shape[1] + sampling_rate]
        )
        # Right padding: silence after the audio ends.
        np.testing.assert_almost_equal(
            0, restored_audio[:, sampling_rate + audio.shape[1] :]
        )
def test_cut_load_temporal_array_pad(pad_value):
    """Check the array loaded via TemporalArray is padded along with the cut."""
    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(f.name) as writer:
        cut = MonoCut(
            id="x",
            start=0,
            # 131 frames x 0.4s frame shift == 52.4s
            duration=52.4,
            channel=0,
            recording=dummy_recording(1),
        )
        ali = np.random.randint(500, size=131)
        cut.alignment = writer.store_array(
            key="utt1", value=ali, frame_shift=0.4, temporal_dim=0
        )
        padded = cut.pad(duration=60.0, pad_value_dict={"alignment": pad_value})
        padded_ali = padded.load_alignment()
        # 60.0 / 0.4 == 150 frames after padding.
        assert padded_ali.shape == (150,)
        np.testing.assert_equal(padded_ali[:131], ali)
        np.testing.assert_equal(padded_ali[131:], pad_value)
def random_cut_set(n_cuts=100) -> CutSet:
    """Return a CutSet of ``n_cuts`` cuts with random start/duration over a
    dummy 100-second recording.

    The manifest ``id`` fields are declared as ``str``, so the UUIDs are
    converted explicitly (consistent with the other helpers in this file
    that use ``str(uuid4())``).
    """
    return CutSet.from_cuts(
        MonoCut(
            id=str(uuid4()),
            start=round(random.uniform(0, 5), ndigits=8),
            duration=round(random.uniform(3, 10), ndigits=8),
            channel=0,
            recording=Recording(
                id=str(uuid4()),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        )
        for _ in range(n_cuts)
    )
def test_augmentation_chain_randomized(
    recording: Recording,
    rir: Recording,
    target_sampling_rate: int,
    sp_factor: float,
    vp_factor: float,
    reverb: bool,
    resample_first: bool,
    cut_duration: Seconds,
):
    """Chain resample/speed/volume (and optionally reverb) in either order and
    verify the declared num_samples agree with the loaded audio, both for the
    augmented recording and for a cut placed inside it."""
    if resample_first:
        augmented = recording.resample(target_sampling_rate)
        augmented = augmented.perturb_speed(sp_factor)
    else:
        augmented = recording.perturb_speed(sp_factor)
        augmented = augmented.resample(target_sampling_rate)
    augmented = augmented.perturb_volume(vp_factor)
    if reverb:
        augmented = augmented.reverb_rir(rir)

    samples = augmented.load_audio()
    assert samples.shape[1] == augmented.num_samples

    cut = MonoCut(
        id="dummy",
        start=0.5125,
        duration=cut_duration,
        channel=0,
        recording=augmented,
    )
    assert cut.load_audio().shape[1] == cut.num_samples
def libri_cut_with_supervision(libri_recording_orig):
    """Fixture: a cut spanning the full LibriSpeech recording with one supervision."""
    full_duration = libri_recording_orig.duration
    sup = SupervisionSegment(
        id="sup",
        recording_id="rec",
        start=0,
        duration=full_duration,
    )
    return MonoCut(
        id="libri_cut_1",
        start=0,
        duration=full_duration,
        channel=0,
        supervisions=[sup],
        recording=libri_recording_orig,
    )
def random_cut_set(n_cuts=100) -> CutSet:
    """Return a CutSet of ``n_cuts`` cuts whose start/duration are aligned to
    the 16 kHz sample grid, over a dummy 100-second recording.

    The manifest ``id`` fields are declared as ``str``, so the UUIDs are
    converted explicitly (consistent with the other helpers in this file
    that use ``str(uuid4())``).
    """
    sr = 16000
    return CutSet.from_cuts(
        MonoCut(
            id=str(uuid4()),
            # Sample-aligned random start in [0, 5]s and duration in [3, 10]s.
            start=random.randint(0, 5 * sr) / sr,
            duration=random.randint(3 * sr, 10 * sr) / sr,
            channel=0,
            recording=Recording(
                id=str(uuid4()),
                sources=[],
                sampling_rate=16000,
                num_samples=1600000,
                duration=100.0,
            ),
        )
        for _ in range(n_cuts)
    )
def with_cut(
    self,
    sampling_rate: int,
    num_samples: int,
    features: bool = True,
    supervision: bool = False,
    alignment: bool = False,
    custom_field: bool = False,
    frame_shift: Seconds = 0.01,
    use_zeroes: bool = False,
) -> MonoCut:
    """Build a MonoCut backed by a synthetic recording, optionally attaching
    features, a supervision (with optional alignment), and a custom temporal array."""
    cut = MonoCut(
        id=str(uuid4()),
        start=0,
        duration=num_samples / sampling_rate,
        channel=0,
        recording=self.with_recording(
            sampling_rate=sampling_rate,
            num_samples=num_samples,
            use_zeros=use_zeroes,
        ),
    )
    if features:
        cut = self._with_features(
            cut, frame_shift=frame_shift, sampling_rate=sampling_rate
        )
    if supervision:
        ali = self._with_alignment(cut, "irrelevant") if alignment else None
        cut.supervisions.append(
            SupervisionSegment(
                id=f"sup-{cut.id}",
                recording_id=cut.recording_id,
                start=0,
                duration=cut.duration,
                text="irrelevant",
                alignment=ali,
            )
        )
    if custom_field:
        self._with_custom_temporal_array(cut=cut, frame_shift=frame_shift)
    return cut
def write(self, manifest: MonoCut) -> bool:
    """
    Converts a Cut to a dict, pickles it, and then stores into a tarfile.

    :param manifest: the manifest to be written.
    :return: bool indicating whether the writing was successful.
    """
    with suppress_and_warn(Exception, enabled=self.fault_tolerant):
        in_memory_cut = manifest.move_to_memory(
            audio_format=self.audio_format,
            load_audio=self.load_audio,
            load_features=self.load_features,
            load_custom=self.load_custom,
        )
        payload = pickle.dumps(in_memory_cut.to_dict())
        self.writer.write({"__key__": in_memory_cut.id, "data": payload})
        return True
    # Reached only when suppress_and_warn swallowed an exception above.
    return False
def cut_set():
    """Fixture: a CutSet with a fully-populated base cut plus variants missing
    supervisions / recording / features, and padded and mixed derivatives."""
    feats = Features(
        type="fbank",
        num_frames=100,
        num_features=40,
        frame_shift=0.01,
        sampling_rate=16000,
        start=0.0,
        duration=10.0,
        storage_type="lilcom",
        storage_path="irrelevant",
        storage_key="irrelevant",
    )
    rec = Recording(
        id="rec-1",
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0,
        sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
    )
    sups = [
        SupervisionSegment(
            id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0
        ),
        SupervisionSegment(
            id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0
        ),
    ]
    base = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=feats,
        recording=rec,
        supervisions=sups,
    )
    variants = [
        base,
        fastcopy(base, id="cut-nosup", supervisions=[]),
        fastcopy(base, id="cut-norec", recording=None),
        fastcopy(base, id="cut-nofeat", features=None),
        base.pad(duration=30.0, direction="left"),
        base.pad(duration=30.0, direction="right"),
        base.pad(duration=30.0, direction="both"),
        base.mix(base, offset_other_by=5.0, snr=8),
    ]
    return CutSet.from_cuts(variants)
def mono_cut():
    """Fixture: a 10s cut over an entire recording carrying three supervisions.

    Scenario::

        |-----------------Recording-----------------|
        "Hey, Matt!"         "Yes?"
        |--------------|     |-----|
                              "Oh, nothing"
                              |------------------|
        |-------------------Cut1--------------------|
    """
    rec = Recording(
        id="rec1",
        duration=10.0,
        sampling_rate=8000,
        num_samples=80000,
        sources=[...],
    )
    sups = [
        SupervisionSegment(
            id="sup1", recording_id="rec1", start=0.0, duration=3.37, text="Hey, Matt!"
        ),
        SupervisionSegment(
            id="sup2", recording_id="rec1", start=4.5, duration=0.9, text="Yes?"
        ),
        SupervisionSegment(
            id="sup3", recording_id="rec1", start=4.9, duration=4.3, text="Oh, nothing"
        ),
    ]
    return MonoCut(
        id="rec1-cut1",
        start=0.0,
        duration=10.0,
        channel=0,
        recording=rec,
        supervisions=sups,
    )
def cut_set():
    """Fixture: a CutSet with a fully-populated cut and derived variants
    (missing supervisions/recording/features, padded, and mixed)."""
    base = MonoCut(
        id='cut-1',
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type='fbank',
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type='lilcom',
            storage_path='irrelevant',
            storage_key='irrelevant',
        ),
        recording=Recording(
            id='rec-1',
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[
                AudioSource(type='file', channels=[0], source='irrelevant')
            ],
        ),
        supervisions=[
            SupervisionSegment(
                id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0
            ),
            SupervisionSegment(
                id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0
            ),
        ],
    )
    derived = [
        fastcopy(base, id='cut-nosup', supervisions=[]),
        fastcopy(base, id='cut-norec', recording=None),
        fastcopy(base, id='cut-nofeat', features=None),
        base.pad(duration=30.0, direction='left'),
        base.pad(duration=30.0, direction='right'),
        base.pad(duration=30.0, direction='both'),
        base.mix(base, offset_other_by=5.0, snr=8),
    ]
    return CutSet.from_cuts([base] + derived)
def test_padding_issue_478():
    """
    https://github.com/lhotse-speech/lhotse/issues/478
    """
    with NamedTemporaryFile(suffix=".h5") as f, NumpyHdf5Writer(f.name) as writer:
        # Two cuts whose durations differ by 5 ms yet share the same frame count.
        first = MonoCut(
            "c1", start=0, duration=4.9, channel=0, recording=dummy_recording(1)
        )
        first_ali = np.random.randint(500, size=(121,))
        first.label_alignment = writer.store_array(
            "c1", first_ali, frame_shift=0.04, temporal_dim=0
        )
        second = MonoCut(
            "c2", start=0, duration=4.895, channel=0, recording=dummy_recording(2)
        )
        second_ali = np.random.randint(500, size=(121,))
        second.label_alignment = writer.store_array(
            "c2", second_ali, frame_shift=0.04, temporal_dim=0
        )
        # Collation must succeed and preserve both arrays unchanged.
        cuts = CutSet.from_cuts([first, second])
        label_alignments, label_alignment_lens = collate_custom_field(
            cuts, "label_alignment"
        )
        np.testing.assert_equal(label_alignments[0].numpy(), first_ali)
        np.testing.assert_equal(label_alignments[1].numpy(), second_ali)
def _with_features(self, cut: MonoCut, frame_shift: Seconds) -> MonoCut:
    """Compute and attach Fbank features for ``cut``, stored in a temporary
    directory whose lifetime is tied to this factory (tracked in ``self.dirs``)."""
    tmp_dir = TemporaryDirectory()
    # Keep a reference so the directory isn't cleaned up while cuts still use it.
    self.dirs.append(tmp_dir)
    fbank = Fbank(config=FbankConfig(frame_shift=frame_shift))
    with LilcomHdf5Writer(tmp_dir.name) as storage:
        return cut.compute_and_store_features(fbank, storage=storage)
def cut(recording):
    """Fixture: a bare 1-second MonoCut over ``recording`` (no supervisions)."""
    return MonoCut(
        id="cut",
        start=0,
        duration=1.0,
        channel=0,
        recording=recording,
    )
def _trial_cut(idx: int, utt: str, recordings, supervisions) -> MonoCut:
    """Build a full-length MonoCut for one utterance of trial ``idx``."""
    return MonoCut(
        id=f"trial-{idx}",
        recording=recordings[utt],
        start=0,
        duration=recordings[utt].duration,
        supervisions=supervisions[utt],
        channel=0,
    )


def _prepare_voxceleb_trials(
    manifests: Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]
) -> Dict[str, Tuple[CutSet, CutSet]]:
    """
    Prepare the trials file for the VoxCeleb1 corpus.

    Downloads the official trials list and returns a dict with "pos_trials"
    and "neg_trials"; each value is a pair of CutSets (first and second
    utterance of every trial, aligned index-by-index). Trials referencing
    unknown recordings are skipped with a warning.
    """
    recordings = manifests["recordings"]
    supervisions = manifests["supervisions"]
    cuts_utt1_pos, cuts_utt2_pos = [], []
    cuts_utt1_neg, cuts_utt2_neg = [], []
    urlretrieve_progress(VOXCELEB1_TRIALS_URL, filename="voxceleb_trials.txt")
    with open("voxceleb_trials.txt", "r") as f:
        for idx, line in enumerate(f):
            target, utt1, utt2 = line.strip().split(" ")
            # id10270/x6uYqmx31kE/00001.wav -> id10270-x6uYqmx31kE-00001
            utt1 = "-".join(utt1.split(".")[0].split("/"))
            utt2 = "-".join(utt2.split(".")[0].split("/"))
            if utt1 not in recordings or utt2 not in recordings:
                logging.warning(
                    f"Trial {idx} contains unknown recording: {utt1} or {utt2}"
                )
                continue
            if target == "1":
                cuts_utt1_pos.append(_trial_cut(idx, utt1, recordings, supervisions))
                cuts_utt2_pos.append(_trial_cut(idx, utt2, recordings, supervisions))
            else:
                cuts_utt1_neg.append(_trial_cut(idx, utt1, recordings, supervisions))
                cuts_utt2_neg.append(_trial_cut(idx, utt2, recordings, supervisions))
    return {
        "pos_trials": (
            CutSet.from_cuts(cuts_utt1_pos),
            CutSet.from_cuts(cuts_utt2_pos),
        ),
        "neg_trials": (
            CutSet.from_cuts(cuts_utt1_neg),
            CutSet.from_cuts(cuts_utt2_neg),
        ),
    }
def test_cut_audio_mask(self):
    """A cut with no supervisions must yield an all-zero audio mask."""
    empty_cut = MonoCut(
        "cut",
        start=0,
        duration=2,
        channel=0,
        recording=Mock(sampling_rate=16000),
    )
    mask = empty_cut.supervisions_audio_mask()
    assert mask.sum() == 0