def test_extend_by_cut_with_supervision(
    cut_start,
    cut_duration,
    extend_duration,
    extend_direction,
    supervision_start,
    supervision_duration,
    expected_start,
    expected_end,
):
    """
    Check the first supervision of a cut after ``extend_by``.

    A dummy cut holding a single supervision is extended by ``extend_duration``
    in ``extend_direction``; the supervision's ``start`` and ``end`` are then
    compared (with ``isclose``) against ``expected_start`` / ``expected_end``.
    """
    rec = dummy_recording(int(uuid4()), duration=1.0)
    segment = SupervisionSegment(
        id=int(uuid4()),
        recording_id=rec.id,
        start=supervision_start,
        duration=supervision_duration,
    )
    cut = dummy_cut(
        int(uuid4()),
        start=cut_start,
        duration=cut_duration,
        supervisions=SupervisionSet.from_segments([segment]),
    )

    extended = cut.extend_by(duration=extend_duration, direction=extend_direction)

    first_supervision = extended.supervisions[0]
    assert isclose(first_supervision.start, expected_start)
    assert isclose(first_supervision.end, expected_end)
def supervision_set():
    """Build a SupervisionSet holding one segment with a word-level alignment."""
    # (symbol, start, duration) for every aligned word of the transcript.
    word_timings = [
        ("transcript", 0.1, 0.08),
        ("of", 0.18, 0.02),
        ("the", 0.2, 0.03),
        ("first", 0.23, 0.07),
        ("segment", 0.3, 0.1),
    ]
    word_alignment = [
        AlignmentItem(symbol=symbol, start=start, duration=duration)
        for symbol, start, duration in word_timings
    ]
    segment = SupervisionSegment(
        id="segment-1",
        recording_id="recording-1",
        channel=0,
        start=0.1,
        duration=0.3,
        text="transcript of the first segment",
        language="english",
        speaker="Norman Dyhrentfurth",
        gender="male",
        alignment={"word": word_alignment},
    )
    return SupervisionSet.from_segments([segment])
def supervision_set():
    """Return a single-segment SupervisionSet that carries a 'word' alignment."""
    words = [
        AlignmentItem(symbol='transcript', start=0.1, duration=0.08),
        AlignmentItem(symbol='of', start=0.18, duration=0.02),
        AlignmentItem(symbol='the', start=0.2, duration=0.03),
        AlignmentItem(symbol='first', start=0.23, duration=0.07),
        AlignmentItem(symbol='segment', start=0.3, duration=0.1),
    ]
    segment_fields = dict(
        id='segment-1',
        recording_id='recording-1',
        channel=0,
        start=0.1,
        duration=0.3,
        text='transcript of the first segment',
        language='english',
        speaker='Norman Dyhrentfurth',
        gender='male',
        alignment={'word': words},
    )
    return SupervisionSet.from_segments([SupervisionSegment(**segment_fields)])
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests which consist of Recordings and Supervisions.

    Audio is collected from ``<corpus_dir>/wav48`` and transcripts (one utterance
    per file) from ``<corpus_dir>/txt``.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav) for wav in (corpus_dir / "wav48").rglob("*.wav")
    )

    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            # Fall back to a mapping that yields None for every metadata field,
            # so the supervision is still created without speaker attributes.
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            )
        )
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Optional directory where the manifests of every split are
        written; it is created when it does not exist yet.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the target directory exists before any manifest is written.
        output_dir.mkdir(parents=True, exist_ok=True)

    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph")
        )
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM..."
        )
        segments = []
        for p in stms:
            with p.open() as f:
                # The line index is part of the segment ID, so skipped lines
                # still consume an index.
                for idx, line in enumerate(f):
                    rec_id, _, _, start, end, _, *words = line.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {"recordings": recordings, "supervisions": supervisions}

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_file(output_dir / f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir / f"tedlium_supervisions_{split}.jsonl.gz")

    return corpus
def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    """
    Build recording and supervision manifests for one partition.

    Every entry of the JSONL file at ``raw_manifest_path`` yields one recording
    (loaded from ``corpus_dir / entry["audio_filepath"]``) and one supervision
    spanning the whole recording.

    :param raw_manifest_path: path of the JSONL manifest to read.
    :param corpus_dir: directory that the manifest's audio paths are relative to.
    :param speaker_id: key used to look up ``ID2SPEAKER`` and ``ID2GENDER``.
    :param clean_or_other: value stored under ``custom["split"]``.
    :return: a tuple ``(RecordingSet, SupervisionSet)``; both are validated together.
    """
    recording_list = []
    segment_list = []
    for entry in load_jsonl(raw_manifest_path):
        rec = Recording.from_file(corpus_dir / entry["audio_filepath"])
        recording_list.append(rec)
        segment = SupervisionSegment(
            id=rec.id,
            recording_id=rec.id,
            start=0,
            duration=rec.duration,
            channel=0,
            text=entry["text"],
            speaker=ID2SPEAKER[speaker_id],
            gender=ID2GENDER[speaker_id],
            custom={
                "text_punct": entry["text_normalized"],
                "split": clean_or_other,
            },
        )
        segment_list.append(segment)

    recording_set = RecordingSet.from_recordings(recording_list)
    supervision_set = SupervisionSet.from_segments(segment_list)
    validate_recordings_and_supervisions(recording_set, supervision_set)
    return recording_set, supervision_set
def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys 'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Optional directory where ``{split}_recordings.json`` and
        ``{split}_supervisions.json`` are written; created when missing.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the target directory exists before any manifest is written.
        output_dir.mkdir(parents=True, exist_ok=True)

    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), f'Mismatch: found {len(recordings)} ' \
                                             f'sphere files and {len(stms)} STM files. ' \
                                             f'You might be missing some parts of TEDLIUM...'
        segments = []
        for p in stms:
            with p.open() as f:
                # The line index is part of the segment ID, so skipped lines
                # still consume an index.
                for idx, line in enumerate(f):
                    rec_id, _, _, start, end, _, *words = line.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')

    return corpus
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    """
    Prepare recording and supervision manifests for one BABEL language pack.

    For each of the 'dev', 'eval' and 'training' splits, SPHERE files are read
    from ``conversational/<split>/audio`` and transcripts from
    ``conversational/<split>/transcription``.

    :param corpus_dir: Pathlike, root directory of the language pack.
    :param output_dir: Pathlike, optional directory where the manifests are written
        as ``recordings_<language>_<split>.json`` / ``supervisions_<language>_<split>.json``
        (the 'training' split is saved under the name 'train').
    :return: a dict keyed by split name ('dev', 'eval', 'training'), each holding
        a dict of {'recordings': ..., 'supervisions': ...}
    """
    from pathlib import Path

    corpus_dir = Path(corpus_dir)
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(Recording.from_sphere(p) for p in audio_dir.glob('*.sph'))
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _, _, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                # Timestamps are wrapped in one character on each side, stripped here.
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is not None:
            # NOTE: ``lang_code`` is whatever the last parsed transcript file name
            # provided; it is undefined if no transcript file has been seen yet.
            language = BABELCODE2LANG[lang_code]
            # Only the file name uses 'train'; the manifests dict keeps the
            # original split key so the lookup below cannot miss.
            save_split = 'train' if split == 'training' else split
            manifests[split]['recordings'].to_json(output_dir / f'recordings_{language}_{save_split}.json')
            manifests[split]['supervisions'].to_json(output_dir / f'supervisions_{language}_{save_split}.json')

    return manifests
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (created when it does not exist).
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for line in lines:
            line = line[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = line.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Writing fails if the directory is missing, so create it first.
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_norm_cn(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    num_jobs: int = 15,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the "dev", "test" and "train" parts of the corpus.

    For every part, ``<part>/text.txt`` provides the transcripts (an utterance
    key followed by the transcript words on each line) and ``<part>/wav.scp``
    lists the entries that are handed, one by one, to ``process_file``.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests;
        nothing is written when it is None.
    :param num_jobs: number of worker threads used to process the entries.
    :return: a dict keyed by part name, each value being a dict of
        {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["dev", "test", "train"]
    for part in dataset_parts:
        transcript_path = corpus_dir / f"{part}/text.txt"
        transcript_dict = {}
        with open(transcript_path, "r", encoding="utf-8") as f:
            for line in f:
                idx_transcript = line.split()
                # A usable line needs a key plus at least one transcript token.
                if len(idx_transcript) < 2:
                    logging.info(f"get transcript err: {line}")
                    continue
                transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])

        file_path = corpus_dir / f"{part}/wav.scp"
        with open(file_path, "r", encoding="utf-8") as f:
            file_paths = [line.strip() for line in f]

        recordings = []
        supervisions = []
        with ThreadPoolExecutor(num_jobs) as ex:
            for recording, supervision in tqdm(
                ex.map(
                    process_file,
                    file_paths,
                    repeat(transcript_dict),
                ),
                desc="Processing NormcnSpeech JSON entries",
                leave=False,
            ):
                # Entries for which no recording was produced are dropped
                # together with their supervision.
                if recording is not None:
                    recordings.append(recording)
                    supervisions.append(supervision)

        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set = RecordingSet.from_recordings(recordings)
        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")
        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests
def supervision_set():
    """Return a SupervisionSet containing exactly one fully described segment."""
    only_segment = SupervisionSegment(
        id='segment-1',
        recording_id='recording-1',
        channel=0,
        start=0.1,
        duration=0.3,
        text='transcript of the first segment',
        language='english',
        speaker='Norman Dyhrentfurth',
        gender='male',
    )
    return SupervisionSet.from_segments([only_segment])
def prepare_same_close_mic(part3_path):
    """
    Create manifests from ``AudioSameCloseMic`` audio and ``ScriptsSame`` TextGrids.

    Files that fail to process are reported on stdout and skipped.

    :param part3_path: Path-like object supporting ``/``; directory that contains
        the ``AudioSameCloseMic`` and ``ScriptsSame`` sub-directories.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)
            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            recordings.append(recording)
            supervisions.extend(segments)
        except Exception as e:
            # Best-effort processing: skip broken files, but let KeyboardInterrupt /
            # SystemExit propagate and tell the user what actually went wrong.
            print(
                f"Error when processing {audio_path} - skipping... "
                f"({type(e).__name__}: {e})"
            )
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
def _prepare_voxceleb_v1(
    corpus_path: Pathlike,
    num_jobs: int,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb1 corpus. The manifests are created in a dict with
    2 splits: train ("dev") and test.

    :param corpus_path: Pathlike, directory containing ``vox1_meta.csv`` and ``wav``.
    :param num_jobs: number of worker processes used to process the wav files.
    :return: a dict with keys "train" and "test", each holding a dict of
        {'recordings': ..., 'supervisions': ...}
    """
    from pathlib import Path

    corpus_path = Path(corpus_path)

    # Speaker metadata is tab-separated; the first line is a header.
    speaker_metadata = {}
    with open(corpus_path / "vox1_meta.csv", "r") as f:
        next(f)
        for line in f:
            spkid, name, gender, nationality, split = line.strip().split("\t")
            speaker_metadata[spkid] = SpeakerMetadata(
                id=spkid, name=name, gender=gender, nationality=nationality, split=split
            )

    with ProcessPoolExecutor(num_jobs) as ex:
        recordings = []
        supervisions = []
        futures = []
        for p in (corpus_path / "wav").rglob("*.wav"):
            futures.append(ex.submit(_process_file, p, speaker_metadata))
        for future in tqdm(
            as_completed(futures),
            total=len(futures),
            desc="Processing VoxCeleb1",
            leave=False,
        ):
            recording, supervision = future.result()
            recordings.append(recording)
            supervisions.append(supervision)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)

    manifests = defaultdict(dict)
    # Split into dev and test sets based on the split of the speakers.
    for split in ("dev", "test"):
        # Bind the loop values as lambda defaults: a predicate that is evaluated
        # after the loop has advanced would otherwise see the last ``split``.
        split_supervisions = supervision_set.filter(
            lambda s, split=split: s.custom["split"] == split
        )
        # A set gives O(1) membership checks for the recording filter.
        split_ids = {s.recording_id for s in split_supervisions}
        manifests[split]["supervisions"] = split_supervisions
        manifests[split]["recordings"] = recording_set.filter(
            lambda r, split_ids=split_ids: r.id in split_ids
        )
    manifests["train"] = manifests.pop("dev")
    return manifests
def _prepare_voxceleb_v2(
    corpus_path: Pathlike,
    num_jobs: int,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb2 corpus.

    No per-split dict is built: a single pair of manifests is returned, since
    the whole data is used in the final "train" split.

    :param corpus_path: directory containing ``vox2_meta.csv`` and the audio.
    :param num_jobs: number of worker processes used to process the wav files.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    # Load the comma-separated speaker metadata, skipping the header row.
    speaker_metadata = {}
    with open(corpus_path / "vox2_meta.csv", "r") as meta_file:
        next(meta_file)
        for row in meta_file:
            spkid, _, gender, split = (field.strip() for field in row.split(","))
            speaker_metadata[spkid] = SpeakerMetadata(
                id=spkid, name="", gender=gender, nationality="", split=split
            )

    # NOTE(review): ``split`` below is the value left over from the LAST metadata
    # row (and is undefined when the file has no data rows) — confirm which
    # directory is actually meant to be scanned.
    recordings = []
    supervisions = []
    with ProcessPoolExecutor(num_jobs) as ex:
        futures = [
            ex.submit(_process_file, wav_path, speaker_metadata, type="command")
            for wav_path in (corpus_path / split).glob("*.wav")
        ]
        progress = tqdm(
            futures,
            total=len(futures),
            desc=f"Processing VoxCeleb2 {split} split...",
            leave=False,
        )
        for done in progress:
            recording, supervision = done.result()
            recordings.append(recording)
            supervisions.append(supervision)

    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
def prepare_music(
    corpus_dir: Path, use_vocals: bool = True
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for the ``music`` sub-directory of ``corpus_dir``.

    One supervision spanning the whole recording is created for every row of
    every ``ANNOTATIONS`` file found under the music directory.

    :param corpus_dir: Path, root directory that contains ``music``.
    :param use_vocals: when False, only supervisions whose ``custom["vocals"]``
        is False are kept.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    music_dir = corpus_dir / "music"
    recordings = scan_recordings(music_dir)

    segments = []
    for annotation_file in music_dir.rglob("ANNOTATIONS"):
        for utt, genres, vocals, musician in read_annotations(
            annotation_file, max_fields=4
        ):
            segments.append(
                SupervisionSegment(
                    id=utt,
                    recording_id=utt,
                    start=0,
                    duration=recordings.duration(utt),
                    speaker=musician,
                    custom={"genres": genres.split(","), "vocals": vocals == "Y"},
                )
            )
    supervisions = SupervisionSet.from_segments(segments)

    if use_vocals:
        return {"recordings": recordings, "supervisions": supervisions}

    instrumental_only = supervisions.filter(lambda s: s.custom["vocals"] is False)
    return {"recordings": recordings, "supervisions": instrumental_only}
def prepare_separate_phone_mic(part3_path):
    """
    Create manifests from ``AudioSeparateIVR`` audio and ``ScriptsSeparate`` TextGrids.

    The recording ID is ``<parent directory name>_<file stem>``. Files that fail
    to process are reported on stdout and skipped.

    :param part3_path: Path-like object supporting ``/``; directory that contains
        the ``AudioSeparateIVR`` and ``ScriptsSeparate`` sub-directories.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSeparateIVR').rglob('**/*.wav'),
            desc='Creating manifests for SeparateIVR'
    ):
        try:
            recording_id = f'{audio_path.parent.name}_{audio_path.stem}'
            recording = Recording.from_file(audio_path)
            tg = TextGrid(part3_path / f'ScriptsSeparate/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        # We're trimming the last segment's duration as it exceeds the actual duration of the recording.
                        # This is safe because if we end up with a zero/negative duration, the validation will catch it.
                        duration=min(round(segment.xmax - segment.xmin, ndigits=8), recording.duration - segment.xmin),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            supervisions.extend(segments)
            recordings.append(recording)
        except Exception as e:
            # Best-effort processing: skip broken files, but let KeyboardInterrupt /
            # SystemExit propagate and tell the user what actually went wrong.
            print(f'Error when processing {audio_path} - skipping... ({type(e).__name__}: {e})')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
def read_rttm(path: Pathlike) -> SupervisionSet:
    """
    Read an RTTM file into a SupervisionSet.

    Each non-blank line must consist of exactly 10 whitespace-separated fields;
    the recording ID, channel, start, duration and speaker are taken from
    fields 2, 3, 4, 5 and 8. Zero-duration entries are dropped. Segment IDs are
    ``<recording_id>_<n>`` where ``n`` counts the kept segments of that
    recording, and speakers are prefixed with the recording ID.

    :param path: Pathlike, the RTTM file to read.
    :return: a SupervisionSet with one segment per kept line.
    :raises ValueError: when a non-blank line does not have 10 fields or its
        numeric fields cannot be parsed.
    """
    lines = Path(path).read_text().splitlines()
    sups = []
    rec_cntr = Counter()
    for line in lines:
        # Blank / whitespace-only lines carry no segment; skip them instead of
        # failing on the 10-field unpacking below.
        if not line.strip():
            continue
        _, recording_id, channel, start, duration, _, _, speaker, _, _ = line.split()
        start, duration, channel = float(start), float(duration), int(channel)
        if duration == 0.0:
            continue
        rec_cntr[recording_id] += 1
        sups.append(
            SupervisionSegment(
                id=f"{recording_id}_{rec_cntr[recording_id]}",
                recording_id=recording_id,
                start=start,
                duration=duration,
                channel=channel,
                speaker=f"{recording_id}_{speaker}",
                language="English",
            )
        )
    return SupervisionSet.from_segments(sups)
def prepare_music(
    corpus_dir: Path, use_vocals: bool = True
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Create recording and supervision manifests for ``<corpus_dir>/music``.

    Every row of every ``ANNOTATIONS`` file below the music directory becomes a
    supervision that covers its whole recording.

    :param corpus_dir: Path, root directory that contains ``music``.
    :param use_vocals: when False, supervisions are restricted to those whose
        ``custom['vocals']`` is False.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    music_dir = corpus_dir / 'music'
    recordings = scan_recordings(music_dir)

    def _iter_segments():
        # Lazily yield one supervision per annotation row.
        for annotation_file in music_dir.rglob('ANNOTATIONS'):
            rows = read_annotations(annotation_file, max_fields=4)
            for utt, genres, vocals, musician in rows:
                yield SupervisionSegment(
                    id=utt,
                    recording_id=utt,
                    start=0,
                    duration=recordings.duration(utt),
                    speaker=musician,
                    custom={
                        'genres': genres.split(','),
                        'vocals': vocals == 'Y'
                    },
                )

    supervisions = SupervisionSet.from_segments(_iter_segments())
    if not use_vocals:
        supervisions = supervisions.filter(
            lambda s: s.custom['vocals'] is False
        )
    return {'recordings': recordings, 'supervisions': supervisions}
def prepare_same_close_mic(part3_path):
    """
    Create manifests from ``AudioSameCloseMic`` audio and ``ScriptsSame`` TextGrids.

    Files that fail to process are reported on stdout and skipped.

    :param part3_path: Path-like object supporting ``/``; directory that contains
        the ``AudioSameCloseMic`` and ``ScriptsSame`` sub-directories.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSameCloseMic').glob('*.wav'),
            desc='Creating manifests for SameCloseMic'
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_wav(audio_path)
            tg = TextGrid(part3_path / f'ScriptsSame/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        duration=round(segment.xmax - segment.xmin, ndigits=8),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            recordings.append(recording)
            supervisions.extend(segments)
        except Exception as e:
            # Best-effort processing: skip broken files, but let KeyboardInterrupt /
            # SystemExit propagate and tell the user what actually went wrong.
            print(f'Error when processing {audio_path} - skipping... ({type(e).__name__}: {e})')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (created when it does not exist).
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except (OSError, IndexError, ValueError):
            # Missing/unreadable file, fewer than two lines, or a non-integer
            # age: the age is optional metadata, so carry on without it.
            age = None
        for line in lines:
            line = line[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = line.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Writing fails if the directory is missing, so create it first.
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_single_mtedx_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    language: str = "language",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single MTEDx language.

    This function works as follows:

        - For every split in ("train", "valid", "test") it collects the .flac
            files stored in ``data/{split}/wav``.
        - Then, it passes every file found in ``data/{split}/vtt`` to
            ``_filename_to_supervisions`` to obtain the supervisions.
        - Recordings and supervisions without a counterpart are removed,
            supervisions are trimmed to their recordings, and the result is
            validated.

    :param corpus_dir: Path to the root of the MTEDx download
    :param output_dir: Path where the manifests are stored as .json files.
        The "valid" split is saved under the name "dev".
    :param language: The two-letter language code.
    :param num_jobs: Number of threads to use when preparing data.
    :return: A dict keyed by split name ("train", "valid", "test"), each value
        being a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)

    with ThreadPoolExecutor(num_jobs) as ex:
        for split in ("train", "valid", "test"):
            audio_dir = corpus_dir / f"data/{split}/wav"
            recordings = RecordingSet.from_recordings(
                Recording.from_file(p) for p in audio_dir.glob("*.flac")
            )
            if len(recordings) == 0:
                logging.warning(f"No .flac files found in {audio_dir}")

            supervisions = []
            text_dir = corpus_dir / f"data/{split}/vtt"
            futures = [
                ex.submit(_filename_to_supervisions, p, language)
                for p in text_dir.glob("*")
            ]
            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                supervisions.extend(result)

            if len(supervisions) == 0:
                logging.warning(f"No supervisions found in {text_dir}")
            supervisions = SupervisionSet.from_segments(supervisions)

            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            manifests[split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            if output_dir is not None:
                # The "valid" split is stored on disk as "dev"; the returned
                # dict keeps the original split key.
                save_split = "dev" if split == "valid" else split
                recordings.to_file(
                    output_dir / f"recordings_{language}_{save_split}.json"
                )
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{save_split}.json"
                )

    return dict(manifests)
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each hold another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    def _recording_from(wav):
        # Example ID: zhaa-arctic_b0126
        return Recording.from_file(
            wav, recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}'
        )

    recordings = RecordingSet.from_recordings(
        _recording_from(wav) for wav in corpus_dir.rglob('*.wav')
    )

    segments = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()
        if 'suitcase_corpus' in path.parts:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = path.stem
            seg_id = f'suitcase_corpus-{speaker}'
        else:
            # <root>/ABA/transcript/arctic_a0051.txt -> aba
            speaker = path.parent.parent.name.lower()
            seg_id = f'{speaker}-{path.stem}'
        segments.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']},
            )
        )
    supervisions = SupervisionSet.from_segments(segments)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Anything whose recording ID mentions 'suitcase_corpus' goes to the
    # 'suitcase' split; everything else is 'read'.
    read_split = {
        'recordings': recordings.filter(lambda r: 'suitcase_corpus' not in r.id),
        'supervisions': supervisions.filter(lambda s: 'suitcase_corpus' not in s.recording_id),
    }
    suitcase_split = {
        'recordings': recordings.filter(lambda r: 'suitcase_corpus' in r.id),
        'supervisions': supervisions.filter(lambda s: 'suitcase_corpus' in s.recording_id),
    }
    splits = {'read': read_split, 'suitcase': suitcase_split}

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir / f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir / f'supervisions-{key}.json')

    return splits
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    type: str = "mdm",
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    The manifests are always rebuilt from ``corpus_dir``; existing files in
    ``output_dir`` are overwritten, not reused.

    NOTE: According to the original recipe notes, the recordings contain all
    7 channels. If you want to use only one channel, you can use either
    ``recording.load_audio(channel=0)`` or
    ``MonoCut(id=...,recording=recording,channel=0)`` while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus. The
        ``for_release`` sub-directory is appended automatically unless the
        path already points at it.
    :param output_dir: Pathlike, the path where to write the manifests
        (nothing is written when it is ``None``).
    :param type: str, the type of data to prepare: 'mdm' (reads
        ``record/raw_recording.wav``), 'ihm-mix' (reads ``clean/mix.wav``) or
        'ihm' (reads ``clean/each_spk.wav``). Any other value, including
        'sdm', fails the assertion below.
    :return: a Dict with the keys 'recordings' and 'supervisions' covering
        all sessions of all overlap ratios.
    """
    assert type in ["mdm", "ihm-mix", "ihm"], f"Unsupported type: {type}"

    corpus_dir = Path(corpus_dir)
    if corpus_dir.stem != "for_release":
        corpus_dir = corpus_dir / "for_release"

    recordings = []
    segments = []
    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            # The session directory name must consist of exactly seven
            # "_"-separated fields; only the sixth one (the name) is used.
            _, _, _, _, _, name, _ = session.name.split("_")
            recording_id = f"{ov}_{name}"

            if type == "ihm-mix":
                audio_path = session / "clean" / "mix.wav"
            elif type == "ihm":
                audio_path = session / "clean" / "each_spk.wav"
            else:
                audio_path = session / "record" / "raw_recording.wav"
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id))

            transcript = parse_transcript(
                session / "transcription" / "meeting_info.txt")
            for idx, seg in enumerate(transcript):
                # seg[0] / seg[1] are used as start / end, seg[2] as the
                # speaker and seg[4] as the text.
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        # Only 'ihm' maps speakers to channels.
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm" else 0,
                    ))

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_jsonl(output_dir / "recordings.jsonl")
        supervisions.to_jsonl(output_dir / "supervisions.jsonl")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic Corpus.

    For each of the splits "train", "devtest" and "evaltest" two manifests
    are created: one with recordings, and one with text supervisions.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    path_depth = None if absolute_paths else 4

    manifests = {}
    for split in ["train", "devtest", "evaltest"]:
        # The LDC distribution has a typo.
        sph_paths = check_and_rglob(
            audio_dir / "callhome/arabic" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(sph, relative_path_depth=path_depth)
            for sph in tqdm(sph_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        segments = []
        for txt_path in transcript_paths:
            recording_id = txt_path.stem
            seg_idx = 0  # counts only the segments that are kept
            for raw_line in txt_path.read_text().splitlines():
                line = raw_line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{seg_idx}",
                        recording_id=recording_id,
                        start=float(start),
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    )
                )
                seg_idx += 1

        supervisions = SupervisionSet.from_segments(segments)
        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
def prepare_libritts(
        corpus_dir: Pathlike,
        dataset_parts: Union[str, Sequence[str]] = 'auto',
        output_dir: Optional[Pathlike] = None,
        num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When ``output_dir`` is given and ``read_manifests_if_cached`` finds the
    manifests there, they are returned without re-scanning the corpus.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g.
        'train-clean-100', 'train-clean-5', 'dev-clean'.
        The default 'auto' selects every part listed in ``LIBRITTS``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys
        'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    if dataset_parts == 'auto':
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        cached = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix='libritts',
        )
        if cached is not None:
            return cached

    # Contents of the file
    #   ;ID   |SEX| SUBSET           |MINUTES| NAME
    #   14    | F | train-clean-360  | 25.03 | ...
    #   16    | F | train-clean-360  | 25.11 | ...
    #   17    | M | train-clean-360  | 25.04 | ...
    spk2gender = {}
    for speaker_line in (corpus_dir / 'SPEAKERS.txt').read_text().splitlines():
        if speaker_line.startswith(';'):
            continue
        spk_field, gender_field, *_ = speaker_line.split('|')
        spk2gender[spk_field.strip()] = gender_field.strip()

    manifests = {}
    for part in tqdm(dataset_parts, desc='Preparing LibriTTS parts'):
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path, '*.wav', num_jobs=num_jobs)

        segments = []
        transcript_files = tqdm(
            part_path.rglob('*.trans.tsv'),
            desc='Scanning transcript files (progbar per speaker)',
            leave=False,
        )
        for trans_path in transcript_files:
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed ...

            # book.tsv contains additional metadata
            book_path = trans_path.parent / trans_path.name.replace(
                '.trans.tsv', '.book.tsv')
            utt2snr = {}
            for utt_id, *_, snr in map(str.split, book_path.read_text().splitlines()):
                utt2snr[utt_id] = float(snr)

            for trans_line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = trans_line.split('\t')
                spk_id = rec_id.split('_')[0]
                segments.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language='English',
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom={
                            'orig_text': orig_text,
                            'snr': utt2snr[rec_id],
                        },
                    )
                )

        supervisions = SupervisionSet.from_segments(segments)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_json(output_dir / f'libritts_supervisions_{part}.json')
            recordings.to_json(output_dir / f'libritts_recordings_{part}.json')

        manifests[part] = {
            'recordings': recordings,
            'supervisions': supervisions,
        }

    return manifests
def prepare_mgb2(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    text_cleaning: bool = True,
    buck_walter: bool = False,
    num_jobs: int = 1,
    mer_thresh: int = 80,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    The parts "dev", "train" and "test" are processed in that order. A part
    for which ``manifests_exist`` reports existing manifests in ``output_dir``
    is skipped; manifests found by ``read_manifests_if_cached`` are used as
    the initial content of the returned dict.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param text_cleaning: Bool, if True, basic text cleaning is performed (similar to ESPNet recipe).
    :param buck_walter: Bool, use BuckWalter transliteration. When False, the
        dev/test supervision texts are passed through ``from_buck_walter``.
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :param mer_thresh: int, filter out segments based on mer (Match Error Rate);
        forwarded to ``make_supervisions`` for the train part.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys
        'recordings' and 'supervisions'.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `text_cleaning` option removes all punctuation and diacritics.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["dev", "train", "test"]
    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix="mgb2",
            suffix="jsonl.gz",
            lazy=True,
        )

    for part in dataset_parts:
        info(f"Processing MGB2 subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="mgb2", suffix="jsonl.gz"
        ):
            info(f"MGB2 subset: {part} already prepared - skipping.")
            continue

        # Read the recordings and write them into manifest. We additionally store the
        # duration of the recordings in a dict which will be used later to create the
        # supervisions.

        output_dir = Path(output_dir)
        corpus_dir = Path(corpus_dir)
        if part == "test" or part == "dev":
            # dev/test ship as a Kaldi-style data dir: copy the non-overlapping
            # text/segments files into the output dir under the names that
            # load_kaldi_data_dir() is given below.
            (output_dir / part).mkdir(parents=True, exist_ok=True)
            copy(
                corpus_dir / part / "text.non_overlap_speech",
                output_dir / part / "text",
            )
            copy(
                corpus_dir / part / "segments.non_overlap_speech",
                output_dir / part / "segments",
            )
            # Rewrite wav.scp so that "wav/" points into the corpus directory.
            # An extra "\n" is written after every rewritten line.
            with open(corpus_dir / part / "wav.scp", "r") as f_in, open(
                output_dir / part / "wav.scp", "w"
            ) as f_out:
                for line in f_in:
                    f_out.write(line.replace("wav/", f"{corpus_dir}/{part}/wav/"))
                    f_out.write("\n")

            # NOTE(review): 16000 is presumably the sampling rate - confirm
            # against load_kaldi_data_dir's signature.
            recordings, supervisions, _ = load_kaldi_data_dir(
                (output_dir / part), 16000
            )
            if buck_walter is False:
                supervisions = supervisions.transform_text(from_buck_walter)
            # Sanity-check the expected number of supervisions per part.
            if part == "test":
                assert (
                    len(supervisions) == 5365
                ), f"Expected 5365 supervisions for test, found {len(supervisions)}"
            elif part == "dev":
                assert (
                    len(supervisions) == 5002
                ), f"Expected 5002 supervisions for dev, found {len(supervisions)}"
        elif part == "train":
            recordings = RecordingSet.from_dir(
                (corpus_dir / part / "wav"), pattern="*.wav", num_jobs=num_jobs
            )

            xml_paths = check_and_rglob(
                path.join(corpus_dir, part, "xml/utf8"), "*.xml"
            )
            # Read supervisions and write them to manifest
            with recursion_limit(5000):
                supervisions_list = list(
                    chain.from_iterable(
                        [make_supervisions(p, mer_thresh) for p in xml_paths]
                    )
                )

            supervisions = SupervisionSet.from_segments(supervisions_list)

            assert (
                len(supervisions) == 375103
            ), f"Expected 375103 supervisions for train, found {len(supervisions)}"

            # NOTE(review): the nesting of the cleaning / fix_manifests steps
            # under the train branch is assumed - confirm against the
            # upstream file's indentation.
            if text_cleaning is True:
                supervisions = supervisions.transform_text(cleaning)
            recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        # saving recordings and supervisions
        recordings.to_file((output_dir / f"mgb2_recordings_{part}.jsonl.gz"))
        supervisions.to_file((output_dir / f"mgb2_supervisions_{part}.jsonl.gz"))

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
    return manifests
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will try to find `dev`, `eval`, and `training` splits inside
            (if any of them is not present, it will skip it with a warning)
        - finally, it scans the selected location for SPHERE/WAV audio files and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored (nothing is written
        when it is ``None``).
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: A dict keyed by split name ("dev", "eval", "training"); each value
        is a dict with the keys ``{'recordings', 'supervisions'}``.
    :raises ValueError: when no `conversational` directory is found, or when a
        transcript segment cannot be parsed (the first failure aborts).
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    candidates = [
        d for d in Path(corpus_dir).rglob("conversational") if d.is_dir()
    ]
    if not candidates:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path.")
    if len(candidates) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({candidates[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and not the root dir for all BABEL languages."
        )
    corpus_dir = candidates[0].parent

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph"))
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav"))
        recordings = combine(sph_recordings, wav_recordings)

        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _, _, lang_code, speaker, date, hour, channel, *_ = p.stem.split("_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")

            # Fix problematic segments that have two consecutive timestamp lines
            # with no transcript in between: of two adjacent "[...]" lines only
            # the second one is kept.
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines += [None]
            timestamp_text_pairs = zip(lines[::2], lines[1::2])
            for (timestamp, text), (next_timestamp, _) in sliding_window(
                    2, timestamp_text_pairs):
                try:
                    # Timestamps look like "[12.345]" - strip the brackets.
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        ))
                except Exception as e:
                    logging.warning(
                        f"Error while parsing segment. Message: {str(e)}")
                    # Keep the original failure attached as the cause so the
                    # traceback shows what actually went wrong.
                    raise ValueError(
                        f"Too many errors while parsing segments (file: '{p}'). "
                        f"Please check your data or increase the threshold."
                    ) from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions)
            supervisions = trim_supervisions_to_recordings(
                recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            # NOTE(review): lang_code comes from the last transcript file name
            # seen so far; it is unbound if no transcript has been scanned yet.
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(
                output_dir / f"recordings_{language}_{save_split}.json")
            supervisions.to_file(
                output_dir / f"supervisions_{language}_{save_split}.json")

    return dict(manifests)
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome American English corpus.

    For each of the splits "evaltest", "train" and "devtest" two manifests
    are created: one with recordings, and one with text supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    path_depth = None if absolute_paths else 4

    manifests = {}
    for split in ["evaltest", "train", "devtest"]:
        # The LDC distribution has a typo.
        sph_paths = check_and_rglob(
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(sph, relative_path_depth=path_depth)
            for sph in tqdm(sph_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        segments = []
        for txt_path in transcript_paths:
            # First pass: keep well-formed segment lines and glue every line
            # that does not parse as "<start> <end> <spk> <text>" onto the
            # previously kept line.
            merged_lines = []
            for raw_line in txt_path.read_text().splitlines():
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    if float(Decimal(end) - Decimal(start)) <= 0:
                        continue
                except (InvalidOperation, ValueError):
                    merged_lines[-1] = merged_lines[-1] + " " + line
                else:
                    merged_lines.append(line)

            # Second pass: turn the merged lines into supervisions.
            recording_id = txt_path.stem
            seg_idx = 0
            for line in merged_lines:
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                speaker = f"{recording_id}_{spk:0>2s}"
                segments.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=float(start),
                        duration=duration,
                        # Speaker letter "A" -> channel 0, "B" -> channel 1, ...
                        channel=ord(spk[0]) - ord("A"),
                        speaker=speaker,
                        id=f"{speaker}_{seg_idx:0>5d}",
                        text=text,
                    )
                )
                seg_idx += 1

        supervisions = SupervisionSet.from_segments(segments)
        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(
                output_dir / f"callhome-english_recordings_{split}.jsonl.gz")
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
def prepare_bvcc(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare the manifests for the main and out-of-domain (OOD) tracks found in
    ``corpus_dir / "phase1-main"`` and ``corpus_dir / "phase1-ood"``.

    :param corpus_dir: Pathlike, directory that contains the ``phase1-main``
        and ``phase1-ood`` sub-directories, each with ``DATA/sets`` and
        ``DATA/wav`` inside.
    :param output_dir: Pathlike, the path where to write the manifests
        (nothing is written when it is ``None``).
    :param num_jobs: the number of parallel workers used to scan the wav dirs.
    :return: a Dict with the keys "main1_dev", "main1_train", "ood1_unlabeled",
        "ood1_dev" and "ood1_train". Each value is a dict with the keys
        'recordings' and 'supervisions', except "ood1_unlabeled" which holds
        'recordings' only.
    :raises AssertionError: when one of the expected directories or list files
        is missing.
    """

    def _read_lines(list_path):
        # Read all lines (newlines kept) and close the file right away.
        with open(list_path) as f:
            return f.readlines()

    corpus_dir = Path(corpus_dir)

    phase1_main = (corpus_dir / "phase1-main").resolve()
    assert phase1_main.exists(), f"Main track dir is missing {phase1_main}"

    main1_sets = phase1_main / "DATA" / "sets"
    main1_wav = phase1_main / "DATA" / "wav"
    assert (main1_sets.exists() and main1_wav.exists()
            ), f"Have you run data preparation in {phase1_main}?"

    main1_devp = main1_sets / "DEVSET"
    assert main1_devp.exists(), main1_devp
    main1_trainp = main1_sets / "TRAINSET"
    assert main1_trainp.exists(), main1_trainp

    phase1_ood = (corpus_dir / "phase1-ood").resolve()
    assert phase1_ood.exists(
    ), f"Out of domain track dir is missing {phase1_ood}"

    ood1_sets = phase1_ood / "DATA" / "sets"
    ood1_wav = phase1_ood / "DATA" / "wav"
    assert (ood1_sets.exists() and ood1_wav.exists()
            ), f"Have you run data preparation in {phase1_ood}?"

    ood1_unlabeled = ood1_sets / "unlabeled_mos_list.txt"
    assert ood1_unlabeled.exists(), ood1_unlabeled
    ood1_devp = ood1_sets / "DEVSET"
    assert ood1_devp.exists(), ood1_devp
    ood1_trainp = ood1_sets / "TRAINSET"
    # Report the path that is actually being checked.
    assert ood1_trainp.exists(), ood1_trainp

    manifests = {}

    # ### Main track sets
    main1_recs = RecordingSet.from_dir(main1_wav,
                                       pattern="*.wav",
                                       num_jobs=num_jobs)

    logging.info("Preparing main1_dev")
    main1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(_read_lines(main1_devp)),
            main1_recs,
            parse_main_line,
        ))
    main1_dev_recs = main1_recs.filter(lambda rec: rec.id in main1_dev_sup)
    manifests["main1_dev"] = {
        "recordings": main1_dev_recs,
        "supervisions": main1_dev_sup,
    }

    logging.info("Preparing main1_train")
    main1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(_read_lines(main1_trainp)),
            main1_recs,
            parse_main_line,
        ))
    main1_train_recs = main1_recs.filter(lambda rec: rec.id in main1_train_sup)
    manifests["main1_train"] = {
        "recordings": main1_train_recs,
        "supervisions": main1_train_sup,
    }

    # ### Out of Domain (OOD) track sets
    unlabeled_wavpaths = [
        ood1_wav / name.strip() for name in _read_lines(ood1_unlabeled)
    ]
    manifests["ood1_unlabeled"] = {
        "recordings":
        RecordingSet.from_recordings(
            Recording.from_file(p) for p in unlabeled_wavpaths)
    }

    ood1_recs = RecordingSet.from_dir(ood1_wav,
                                      pattern="*.wav",
                                      num_jobs=num_jobs)

    logging.info("Preparing ood1_dev")
    ood1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(_read_lines(ood1_devp)),
            ood1_recs,
            parse_ood_line,
        ))
    ood1_dev_recs = ood1_recs.filter(lambda rec: rec.id in ood1_dev_sup)
    manifests["ood1_dev"] = {
        "recordings": ood1_dev_recs,
        "supervisions": ood1_dev_sup,
    }

    logging.info("Preparing ood1_train")
    ood1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(_read_lines(ood1_trainp)),
            ood1_recs,
            parse_ood_line,
        ))
    ood1_train_recs = ood1_recs.filter(lambda rec: rec.id in ood1_train_sup)
    manifests["ood1_train"] = {
        "recordings": ood1_train_recs,
        "supervisions": ood1_train_sup,
    }

    # Optionally serializing to disc
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, d in manifests.items():
            d["recordings"].to_file(output_dir /
                                    f"recordings_{part}.jsonl.gz")
            # "ood1_unlabeled" has no supervisions.
            if "supervisions" in d:
                d["supervisions"].to_file(output_dir /
                                          f"supervisions_{part}.jsonl.gz")

    return manifests
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
    link_previous_utt: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    Parts for which ``manifests_exist`` reports existing manifests are skipped;
    whatever ``read_manifests_if_cached`` returns for ``output_dir`` is used
    as the initial content of the result.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g.
        'train-clean-100', 'train-clean-5', 'dev-clean'.
        The default "auto" selects every part listed in ``LIBRITTS``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :param link_previous_utt: If true adds previous utterance id to supervisions
        (``custom["prev_utt"]``). Useful for reconstructing chains of utterances
        as they were read. If previous utterance was skipped from LibriTTS
        datasets, the previous_utt label is None.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys
        'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix="libritts",
        )

    # Contents of the file
    #   ;ID   |SEX| SUBSET           |MINUTES| NAME
    #   14    | F | train-clean-360  | 25.03 | ...
    #   16    | F | train-clean-360  | 25.11 | ...
    #   17    | M | train-clean-360  | 25.04 | ...
    spk2gender = {}
    for speaker_line in (corpus_dir / "SPEAKERS.txt").read_text().splitlines():
        if speaker_line.startswith(";"):
            continue
        spk_field, gender_field, *_ = speaker_line.split("|")
        spk2gender[spk_field.strip()] = gender_field.strip()

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir, prefix="libritts"):
            logging.info(
                f"LibriTTS subset: {part} already prepared - skipping.")
            continue

        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path, "*.wav", num_jobs=num_jobs)

        segments = []
        transcript_files = tqdm(
            part_path.rglob("*.trans.tsv"),
            desc="Scanning transcript files (progbar per speaker)",
            leave=False,
        )
        for trans_path in transcript_files:
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed ...

            # book.tsv contains additional metadata
            book_path = trans_path.parent / trans_path.name.replace(
                ".trans.tsv", ".book.tsv")
            uttids = []  # keeps the order of uttids as they appear in book.tsv
            utt2snr = {}
            for utt_id, *_, snr in map(str.split, book_path.read_text().splitlines()):
                uttids.append(utt_id)
                utt2snr[utt_id] = float(snr)

            if link_previous_utt:
                # Using the property of sorted keys to find previous utterance
                # The keys have the structure speaker_book_x_y e.g. 1089_134691_000004_000001
                utt2prevutt = dict(zip([*uttids, None], [None, *uttids]))

            last_seen_id = None
            for trans_line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = trans_line.split("\t")
                spk_id = rec_id.split("_")[0]
                custom = {"orig_text": orig_text, "snr": utt2snr[rec_id]}
                if link_previous_utt:
                    # All recording ids should be in book.tsv, but some are
                    # missing, e.g. 446_123502_000030_000003
                    candidate = utt2prevutt.get(rec_id)
                    # The previous utterance has to be present in trans.tsv -
                    # otherwise it was skipped.
                    custom["prev_utt"] = (
                        candidate if candidate == last_seen_id else None)
                    last_seen_id = rec_id
                segments.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language="English",
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom=custom,
                    )
                )

        supervisions = SupervisionSet.from_segments(segments)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_file(
                output_dir / f"libritts_supervisions_{part}.jsonl.gz")
            recordings.to_file(
                output_dir / f"libritts_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests