def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    recordings = []
    supervisions = []
    for meta in load_jsonl(raw_manifest_path):
        recording = Recording.from_file(corpus_dir / meta["audio_filepath"])
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=recording.id,
                recording_id=recording.id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=meta["text"],
                speaker=ID2SPEAKER[speaker_id],
                gender=ID2GENDER[speaker_id],
                custom={
                    "text_punct": meta["text_normalized"],
                    "split": clean_or_other,
                },
            )
        )
    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)
    return recordings, supervisions
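# A minimal usage sketch for the partition-preparation recipe above. The paths
# and speaker id are hypothetical; `load_jsonl`, `ID2SPEAKER` and `ID2GENDER`
# are assumed to be defined in the surrounding recipe module.
def example_prepare_single_partition():
    from pathlib import Path

    recordings, supervisions = prepare_single_partition(
        raw_manifest_path=Path("manifests/raw_clean.jsonl"),  # hypothetical path
        corpus_dir=Path("/data/corpus"),  # hypothetical path
        speaker_id="92",  # hypothetical speaker id
        clean_or_other="clean",
    )
    # Persist the validated manifests in Lhotse's compressed JSONL format.
    recordings.to_file("recordings.jsonl.gz")
    supervisions.to_file("supervisions.jsonl.gz")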
def with_cut(
    self,
    sampling_rate: int,
    num_samples: int,
    features: bool = True,
    supervision: bool = False,
) -> Cut:
    duration = num_samples / sampling_rate
    cut = Cut(
        id=str(uuid4()),
        start=0,
        duration=duration,
        channel=0,
        recording=self.with_recording(
            sampling_rate=sampling_rate, num_samples=num_samples
        ),
    )
    if features:
        cut = self._with_features(cut)
    if supervision:
        cut.supervisions.append(
            SupervisionSegment(
                id=f'sup-{cut.id}',
                recording_id=cut.recording_id,
                start=0,
                duration=cut.duration,
                text='irrelevant',
            )
        )
    return cut
def test_extend_by_cut_with_supervision(
    cut_start,
    cut_duration,
    extend_duration,
    extend_direction,
    supervision_start,
    supervision_duration,
    expected_start,
    expected_end,
):
    recording = dummy_recording(int(uuid4()), duration=1.0)
    supervisions = SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id=int(uuid4()),
                recording_id=recording.id,
                start=supervision_start,
                duration=supervision_duration,
            )
        ]
    )
    cut = dummy_cut(
        int(uuid4()),
        start=cut_start,
        duration=cut_duration,
        supervisions=supervisions,
    )
    extended_cut = cut.extend_by(duration=extend_duration, direction=extend_direction)
    assert isclose(extended_cut.supervisions[0].start, expected_start)
    assert isclose(extended_cut.supervisions[0].end, expected_end)
def _filename_to_supervisions(filename: Path, language: str):
    lines = filename.read_text()
    recoid = filename.stem.split(".")[0]
    supervisions = []
    filterfun = partial(_filter_word)
    for start, end, line in _parse_vtt(lines, "<noise>"):
        line_list = []
        for w in line.split():
            w_ = w.strip()
            if re.match(r"^(\([^)]*\) *)+$", w_):
                line_list.append(w_)
            elif filterfun(w):
                line_list.append(w_)
            else:
                line_list.append("<unk>")
        line_ = " ".join(line_list)
        if re.match(r"^\w+ *(<[^>]*> *)+$", line_, re.UNICODE):
            line_new = line_.strip()
        elif "<" in line_ or ">" in line_:
            continue
        else:
            line_new = line_.strip()
        supervisions.append(
            SupervisionSegment(
                id=_format_uttid(recoid, start),
                recording_id=recoid,
                start=start,
                duration=round(end - start, ndigits=8),
                channel=0,
                text=line_new,
                language=language,
                speaker=recoid,
            )
        )
    return supervisions
def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import MonoCut, Features, Recording, SupervisionSegment
    from lhotse.array import deserialize_array
    from lhotse.cut import MixedCut

    if "shape" in data or "array" in data:
        return deserialize_array(data)
    if "sources" in data:
        return Recording.from_dict(data)
    if "num_features" in data:
        return Features.from_dict(data)
    if "type" not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop("type")
    if cut_type == "MonoCut":
        return MonoCut.from_dict(data)
    if cut_type == "Cut":
        warnings.warn(
            "Your manifest was created with Lhotse version earlier than v0.8, when MonoCut was called Cut. "
            "Please re-generate it with Lhotse v0.8 as it might stop working in a future version "
            "(using manifest.from_file() and then manifest.to_file() should be sufficient)."
        )
        return MonoCut.from_dict(data)
    if cut_type == "MixedCut":
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")
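# A minimal sketch of the dispatch heuristic above: a dict with none of the
# "shape"/"array", "sources", "num_features" or "type" keys is decoded as a
# SupervisionSegment. The field values are made up for illustration.
def example_deserialize_item():
    item = deserialize_item(
        {
            "id": "sup-1",
            "recording_id": "rec-1",
            "start": 0.0,
            "duration": 1.0,
        }
    )
    # No "type" key was present, so the heuristic picked SupervisionSegment.
    assert type(item).__name__ == "SupervisionSegment"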
def _process_file(
    file_path: Pathlike,
    speaker_metadata: Dict[str, SpeakerMetadata],
) -> Tuple[Recording, SupervisionSegment]:
    """
    Process a single wav file and return a Recording and a SupervisionSegment.
    """
    speaker_id = file_path.parent.parent.stem
    session_id = file_path.parent.stem
    uttid = file_path.stem
    recording_id = f"{speaker_id}-{session_id}-{uttid}"
    recording = Recording.from_file(file_path, recording_id=recording_id)
    supervision = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        speaker=speaker_id,
        gender=speaker_metadata[speaker_id].gender,
        start=0.0,
        duration=recording.duration,
        custom={
            "speaker_name": speaker_metadata[speaker_id].name,
            "nationality": speaker_metadata[speaker_id].nationality,
            "split": speaker_metadata[speaker_id].split,
        },
    )
    return recording, supervision
def supervision_set():
    return SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id="segment-1",
                recording_id="recording-1",
                channel=0,
                start=0.1,
                duration=0.3,
                text="transcript of the first segment",
                language="english",
                speaker="Norman Dyhrentfurth",
                gender="male",
                alignment={
                    "word": [
                        AlignmentItem(symbol="transcript", start=0.1, duration=0.08),
                        AlignmentItem(symbol="of", start=0.18, duration=0.02),
                        AlignmentItem(symbol="the", start=0.2, duration=0.03),
                        AlignmentItem(symbol="first", start=0.23, duration=0.07),
                        AlignmentItem(symbol="segment", start=0.3, duration=0.1),
                    ]
                },
            )
        ]
    )
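# Sketch: the word-level alignment in the fixture above can be used to recover
# the transcript and per-word timing. Assumes the `supervision_set` fixture
# defined directly above.
def example_word_alignment():
    from math import isclose

    seg = supervision_set()["segment-1"]
    words = seg.alignment["word"]
    assert " ".join(item.symbol for item in words) == seg.text
    # Each AlignmentItem carries start/duration; the last word ends where
    # the supervision ends (0.3 + 0.1 == 0.1 + 0.3 == 0.4).
    assert isclose(words[-1].start + words[-1].duration, seg.end)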
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests, which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    speaker_meta = _parse_speaker_description(corpus_dir)
    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav) for wav in (corpus_dir / "wav48").rglob("*.wav")
    )
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            )
        )
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    # I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys
    'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    if output_dir is not None:
        output_dir.mkdir(parents=True, exist_ok=True)
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph")
        )
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM..."
        )
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
        validate_recordings_and_supervisions(**corpus[split])
        if output_dir is not None:
            recordings.to_file(output_dir / f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir / f"tedlium_supervisions_{split}.jsonl.gz")
    return corpus
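# Usage sketch for the TED-LIUM recipe above; the corpus path is hypothetical.
def example_prepare_tedlium():
    corpus = prepare_tedlium("/data/TEDLIUM_release-3", output_dir="manifests")
    for split in ("train", "dev", "test"):
        part = corpus[split]
        print(split, len(part["recordings"]), len(part["supervisions"]))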
def cut_set():
    # The contents of 'test/fixtures/libri/cuts.json'
    cs = CutSet.from_dicts(
        [
            {
                "channel": 0,
                "duration": 10.0,
                "features": {
                    "channels": 0,
                    "duration": 16.04,
                    "num_features": 23,
                    "num_frames": 1604,
                    "recording_id": "recording-1",
                    "sampling_rate": 16000,
                    "start": 0.0,
                    "storage_path": "test/fixtures/libri/storage",
                    "storage_key": "30c2440c-93cb-4e83-b382-f2a59b3859b4.llc",
                    "storage_type": "lilcom_files",
                    "type": "fbank",
                },
                "id": "e3e70682-c209-4cac-629f-6fbed82c07cd",
                "recording": {
                    "duration": 16.04,
                    "id": "recording-1",
                    "num_samples": 256640,
                    "sampling_rate": 16000,
                    "sources": [
                        {
                            "channels": [0],
                            "source": "test/fixtures/libri/libri-1088-134315-0000.wav",
                            "type": "file",
                        }
                    ],
                },
                "start": 0.0,
                "supervisions": [],
                "type": "Cut",
            }
        ]
    )
    # These supervisions are artificially overwritten in a 10 seconds long LibriSpeech cut
    # to test the speaker activity matrix in the DiarizationDataset.
    cs[0].supervisions = [
        SupervisionSegment('s1', 'recording-1', 0, 3, speaker='spk1'),
        SupervisionSegment('s2', 'recording-1', 2, 4, speaker='spk2'),
        SupervisionSegment('s3', 'recording-1', 5, 2, speaker='spk3'),
        SupervisionSegment('s4', 'recording-1', 7.5, 2.5, speaker='spk4'),
    ]
    return cs
def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys
    'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    output_dir = Path(output_dir) if output_dir is not None else None
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), (
            f'Mismatch: found {len(recordings)} '
            f'sphere files and {len(stms)} STM files. '
            f'You might be missing some parts of TEDLIUM...'
        )
        segments = []
        for p in stms:
            with p.open() as f:
                for idx, l in enumerate(f):
                    rec_id, _, _, start, end, _, *words = l.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }
        validate_recordings_and_supervisions(**corpus[split])
        if output_dir is not None:
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')
    return corpus
def cut_set():
    cut = Cut(
        id='cut-1',
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type='fbank',
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type='lilcom',
            storage_path='irrelevant',
            storage_key='irrelevant',
        ),
        recording=Recording(
            id='rec-1',
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[
                AudioSource(type='file', channels=[0], source='irrelevant')
            ],
        ),
        supervisions=[
            SupervisionSegment(id='sup-1', recording_id='irrelevant', start=0.5, duration=6.0),
            SupervisionSegment(id='sup-2', recording_id='irrelevant', start=7.0, duration=2.0),
        ],
    )
    return CutSet.from_cuts([
        cut,
        fastcopy(cut, id='cut-nosup', supervisions=[]),
        fastcopy(cut, id='cut-norec', recording=None),
        fastcopy(cut, id='cut-nofeat', features=None),
        cut.pad(duration=30.0, direction='left'),
        cut.pad(duration=30.0, direction='right'),
        cut.pad(duration=30.0, direction='both'),
        cut.mix(cut, offset_other_by=5.0, snr=8),
    ])
def test_trim_to_supervisions_mixed_cuts():
    cut_set = CutSet.from_cuts([
        Cut('cut1', start=0, duration=30, channel=0, supervisions=[
            SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
            SupervisionSegment('sup2', 'rec1', start=10, duration=5),
            SupervisionSegment('sup3', 'rec1', start=20, duration=8),
        ]).append(
            Cut('cut2', start=0, duration=30, channel=0, supervisions=[
                SupervisionSegment('sup4', 'rec1', start=0, duration=30),
            ])
        )
    ])
    assert isinstance(cut_set[0], MixedCut)

    cuts = cut_set.trim_to_supervisions()

    assert len(cuts) == 4
    # After "trimming", the MixedCut "decayed" into simple, unmixed cuts,
    # as they did not overlap.
    assert all(isinstance(cut, Cut) for cut in cuts)
    assert all(len(cut.supervisions) == 1 for cut in cuts)
    assert all(cut.supervisions[0].start == 0 for cut in cuts)

    # Check that the cuts preserved their start/duration/supervisions after trimming.
    cut = cuts[0]
    assert cut.start == 1.5
    assert cut.duration == 8.5
    assert cut.supervisions[0].id == 'sup1'
    cut = cuts[1]
    assert cut.start == 10
    assert cut.duration == 5
    assert cut.supervisions[0].id == 'sup2'
    cut = cuts[2]
    assert cut.start == 20
    assert cut.duration == 8
    assert cut.supervisions[0].id == 'sup3'
    cut = cuts[3]
    assert cut.start == 0
    assert cut.duration == 30
    assert cut.supervisions[0].id == 'sup4'
def cut_set():
    cut = MonoCut(
        id="cut-1",
        start=0.0,
        duration=10.0,
        channel=0,
        features=Features(
            type="fbank",
            num_frames=100,
            num_features=40,
            frame_shift=0.01,
            sampling_rate=16000,
            start=0.0,
            duration=10.0,
            storage_type="lilcom",
            storage_path="irrelevant",
            storage_key="irrelevant",
        ),
        recording=Recording(
            id="rec-1",
            sampling_rate=16000,
            num_samples=160000,
            duration=10.0,
            sources=[AudioSource(type="file", channels=[0], source="irrelevant")],
        ),
        supervisions=[
            SupervisionSegment(
                id="sup-1", recording_id="irrelevant", start=0.5, duration=6.0
            ),
            SupervisionSegment(
                id="sup-2", recording_id="irrelevant", start=7.0, duration=2.0
            ),
        ],
    )
    return CutSet.from_cuts(
        [
            cut,
            fastcopy(cut, id="cut-nosup", supervisions=[]),
            fastcopy(cut, id="cut-norec", recording=None),
            fastcopy(cut, id="cut-nofeat", features=None),
            cut.pad(duration=30.0, direction="left"),
            cut.pad(duration=30.0, direction="right"),
            cut.pad(duration=30.0, direction="both"),
            cut.mix(cut, offset_other_by=5.0, snr=8),
        ]
    )
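# Sketch of the padding/mixing semantics used by the fixture above. Assumes
# the `cut_set` fixture defined directly above.
def example_padded_and_mixed_cuts():
    cs = cut_set()
    # Padding extends the cut to the requested total duration.
    padded = cs["cut-1"].pad(duration=30.0, direction="both")
    assert padded.duration == 30.0
    # Mixing a 10 s cut with itself at a 5 s offset spans 15 s in total.
    mixed = cs["cut-1"].mix(cs["cut-1"], offset_other_by=5.0, snr=8)
    assert mixed.duration == 15.0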
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    corpus_dir = Path(corpus_dir)
    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(
            Recording.from_sphere(p) for p in audio_dir.glob('*.sph')
        )
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as
            # "next_timestamp" and ends the iteration (otherwise we'd lose the
            # last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions'],
        )

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            # Use 'train' instead of 'training' in the output file names only;
            # reassigning `split` itself would break the `manifests` lookup.
            out_split = 'train' if split == 'training' else split
            manifests[split]['recordings'].to_json(output_dir / f'recordings_{language}_{out_split}.json')
            manifests[split]['supervisions'].to_json(output_dir / f'supervisions_{language}_{out_split}.json')
    return manifests
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests, which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
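# Usage sketch for the CMU Arctic recipe above; the corpus path is hypothetical.
def example_prepare_cmu_arctic():
    manifests = prepare_cmu_arctic("/data/cmu_arctic", output_dir="manifests")
    sup = next(iter(manifests["supervisions"]))
    print(sup.speaker, sup.gender, sup.text)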
def test_trim_to_unsupervised_segments():
    cut_set = CutSet.from_cuts([
        # Yields 3 unsupervised cuts - before first supervision,
        # between sup2 and sup3, and after sup3.
        MonoCut(
            "cut1",
            start=0,
            duration=30,
            channel=0,
            supervisions=[
                SupervisionSegment("sup1", "rec1", start=1.5, duration=8.5),
                SupervisionSegment("sup2", "rec1", start=10, duration=5),
                SupervisionSegment("sup3", "rec1", start=20, duration=8),
            ],
            recording=dummy_recording(1, duration=30),
        ),
        # Does not yield any "unsupervised" cut.
        MonoCut(
            "cut2",
            start=0,
            duration=30,
            channel=0,
            supervisions=[
                SupervisionSegment("sup4", "rec1", start=0, duration=30),
            ],
            recording=dummy_recording(2, duration=30),
        ),
    ])

    unsupervised_cuts = cut_set.trim_to_unsupervised_segments()

    assert len(unsupervised_cuts) == 3

    assert unsupervised_cuts[0].start == 0
    assert unsupervised_cuts[0].duration == 1.5
    assert unsupervised_cuts[0].supervisions == []

    assert unsupervised_cuts[1].start == 15
    assert unsupervised_cuts[1].duration == 5
    assert unsupervised_cuts[1].supervisions == []

    assert unsupervised_cuts[2].start == 28
    assert unsupervised_cuts[2].duration == 2
    assert unsupervised_cuts[2].supervisions == []
def cut_with_supervision_start01(recording):
    return MonoCut(
        id="cut_start01",
        start=0.1,
        duration=0.4,
        channel=0,
        supervisions=[
            SupervisionSegment(id="sup", recording_id="rec", start=0.1, duration=0.3)
        ],
        recording=recording,
    )
def cut_with_supervision(recording):
    return MonoCut(
        id="cut",
        start=0.0,
        duration=0.5,
        channel=0,
        supervisions=[
            SupervisionSegment(id="sup", recording_id="rec", start=0.0, duration=0.5)
        ],
        recording=recording,
    )
def cut_with_supervision(recording):
    return MonoCut(
        id='cut',
        start=0.0,
        duration=0.5,
        channel=0,
        supervisions=[
            SupervisionSegment(id='sup', recording_id='rec', start=0.0, duration=0.5)
        ],
        recording=recording,
    )
def supervision_set():
    return SupervisionSet.from_segments([
        SupervisionSegment(
            id='segment-1',
            recording_id='recording-1',
            channel=0,
            start=0.1,
            duration=0.3,
            text='transcript of the first segment',
            language='english',
            speaker='Norman Dyhrentfurth',
            gender='male',
        )
    ])
def cut_with_supervision_start01(recording):
    return MonoCut(
        id='cut_start01',
        start=0.1,
        duration=0.4,
        channel=0,
        supervisions=[
            SupervisionSegment(id='sup', recording_id='rec', start=0.1, duration=0.3)
        ],
        recording=recording,
    )
def mono_cut():
    """
    Scenario::

        |-----------------Recording-----------------|
        "Hey, Matt!"       "Yes?"
        |--------------|   |-----|
                              "Oh, nothing"
                             |------------------|
        |-------------------Cut1--------------------|
    """
    rec = Recording(
        id="rec1", duration=10.0, sampling_rate=8000, num_samples=80000, sources=[...]
    )
    sups = [
        SupervisionSegment(
            id="sup1", recording_id="rec1", start=0.0, duration=3.37, text="Hey, Matt!"
        ),
        SupervisionSegment(
            id="sup2", recording_id="rec1", start=4.5, duration=0.9, text="Yes?"
        ),
        SupervisionSegment(
            id="sup3", recording_id="rec1", start=4.9, duration=4.3, text="Oh, nothing"
        ),
    ]
    return MonoCut(
        id="rec1-cut1",
        start=0.0,
        duration=10.0,
        channel=0,
        recording=rec,
        supervisions=sups,
    )
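# Sketch: trimming the cut above to its supervisions yields one cut per
# segment, re-referenced so each supervision starts at its own cut's 0.0.
# Assumes the `mono_cut` fixture defined directly above.
def example_trim_mono_cut():
    cuts = CutSet.from_cuts([mono_cut()]).trim_to_supervisions()
    assert len(cuts) == 3
    assert [c.duration for c in cuts] == [3.37, 0.9, 4.3]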
def test_trim_to_supervisions_mixed_cuts():
    cut_set = CutSet.from_cuts([
        Cut('cut1', start=0, duration=30, channel=0, supervisions=[
            SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
            SupervisionSegment('sup2', 'rec1', start=10, duration=5),
            SupervisionSegment('sup3', 'rec1', start=20, duration=8),
        ]).append(
            Cut('cut2', start=0, duration=30, channel=0, supervisions=[
                SupervisionSegment('sup4', 'rec1', start=0, duration=30),
            ])
        )
    ])

    cuts = cut_set.trim_to_supervisions()

    assert len(cuts) == 4
    assert all(isinstance(cut, MixedCut) for cut in cuts)
    assert all(cut.start == 0 for cut in cuts)
    assert all(len(cut.supervisions) == 1 for cut in cuts)
    assert all(cut.supervisions[0].start == 0 for cut in cuts)

    cut = cuts[0]
    assert cut.duration == 8.5
    assert cut.supervisions[0].id == 'sup1'
    cut = cuts[1]
    assert cut.duration == 5
    assert cut.supervisions[0].id == 'sup2'
    cut = cuts[2]
    assert cut.duration == 8
    assert cut.supervisions[0].id == 'sup3'
    cut = cuts[3]
    assert cut.duration == 30
    assert cut.supervisions[0].id == 'sup4'
def cut(recording):
    return MonoCut(
        id="cut",
        start=0,
        duration=1.0,
        channel=0,
        recording=recording,
        supervisions=[
            SupervisionSegment(
                id="sup", recording_id=recording.id, start=0, duration=0.5
            )
        ],
    )
def test_trim_to_unsupervised_segments():
    cut_set = CutSet.from_cuts([
        # Yields 3 unsupervised cuts - before first supervision,
        # between sup2 and sup3, and after sup3.
        Cut('cut1', start=0, duration=30, channel=0, supervisions=[
            SupervisionSegment('sup1', 'rec1', start=1.5, duration=8.5),
            SupervisionSegment('sup2', 'rec1', start=10, duration=5),
            SupervisionSegment('sup3', 'rec1', start=20, duration=8),
        ]),
        # Does not yield any "unsupervised" cut.
        Cut('cut2', start=0, duration=30, channel=0, supervisions=[
            SupervisionSegment('sup4', 'rec1', start=0, duration=30),
        ]),
    ])

    unsupervised_cuts = cut_set.trim_to_unsupervised_segments()

    assert len(unsupervised_cuts) == 3

    assert unsupervised_cuts[0].start == 0
    assert unsupervised_cuts[0].duration == 1.5
    assert unsupervised_cuts[0].supervisions == []

    assert unsupervised_cuts[1].start == 15
    assert unsupervised_cuts[1].duration == 5
    assert unsupervised_cuts[1].supervisions == []

    assert unsupervised_cuts[2].start == 28
    assert unsupervised_cuts[2].duration == 2
    assert unsupervised_cuts[2].supervisions == []
def prepare_same_close_mic(part3_path):
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)
            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds
                        # the actual duration of the recording. This is safe because
                        # if we end up with a zero/negative duration, the validation
                        # will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            recordings.append(recording)
            supervisions.extend(segments)
        except Exception:
            # Avoid a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
            print(f"Error when processing {audio_path} - skipping...")
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
def libri_cut_with_supervision(libri_recording_orig):
    return MonoCut(
        id="libri_cut_1",
        start=0,
        duration=libri_recording_orig.duration,
        channel=0,
        supervisions=[
            SupervisionSegment(
                id="sup",
                recording_id="rec",
                start=0,
                duration=libri_recording_orig.duration,
            )
        ],
        recording=libri_recording_orig,
    )
def deserialize_item(data: dict) -> Any:
    # Figures out what type of manifest is being decoded with some heuristics
    # and returns a Lhotse manifest object rather than a raw dict.
    from lhotse import Cut, Features, Recording, SupervisionSegment
    from lhotse.cut import MixedCut

    data = arr2list_recursive(data)
    if 'sources' in data:
        return Recording.from_dict(data)
    if 'num_features' in data:
        return Features.from_dict(data)
    if 'type' not in data:
        return SupervisionSegment.from_dict(data)
    cut_type = data.pop('type')
    if cut_type == 'Cut':
        return Cut.from_dict(data)
    if cut_type == 'MixedCut':
        return MixedCut.from_dict(data)
    raise ValueError(f"Unexpected cut type during deserialization: '{cut_type}'")
def with_cut(
    self,
    sampling_rate: int,
    num_samples: int,
    features: bool = True,
    supervision: bool = False,
    alignment: bool = False,
    custom_field: bool = False,
    frame_shift: Seconds = 0.01,
    use_zeroes: bool = False,
) -> MonoCut:
    duration = num_samples / sampling_rate
    cut = MonoCut(
        id=str(uuid4()),
        start=0,
        duration=duration,
        channel=0,
        recording=self.with_recording(
            sampling_rate=sampling_rate,
            num_samples=num_samples,
            use_zeros=use_zeroes,
        ),
    )
    if features:
        cut = self._with_features(
            cut, frame_shift=frame_shift, sampling_rate=sampling_rate
        )
    if supervision:
        cut.supervisions.append(
            SupervisionSegment(
                id=f"sup-{cut.id}",
                recording_id=cut.recording_id,
                start=0,
                duration=cut.duration,
                text="irrelevant",
                alignment=self._with_alignment(cut, "irrelevant")
                if alignment
                else None,
            )
        )
    if custom_field:
        self._with_custom_temporal_array(cut=cut, frame_shift=frame_shift)
    return cut