# NOTE: These recipe functions are excerpted from several Lhotse corpus
# preparation modules. The imports below cover the shared dependencies.
# Module-level constants (e.g. GENDER_MAP, ACCENT_MAP, LANGUAGE_MAP,
# BABELCODE2LANG, MIC_TO_CHANNELS, PARTITIONS, folds, and the Heroico dataset
# filenames) and helpers (e.g. _get_speaker, _parse_speaker_description,
# IcsiSegmentAnnotation, UttInfo, HeroicoMetaData, deduplicate_supervisions,
# prepare_audio_grouped, prepare_audio_single, prepare_supervision_ihm,
# prepare_supervision_other) are assumed to be defined in the enclosing
# recipe modules. The ICSI and BABEL recipes each define their own
# normalize_text helper.
import logging
import re
import xml.etree.ElementTree as ET
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import soundfile
from cytoolz import sliding_window
from tqdm.auto import tqdm

from lhotse import (
    Recording,
    RecordingSet,
    SupervisionSegment,
    SupervisionSet,
    combine,
)
from lhotse.audio import AudioSource
from lhotse.qa import (
    fix_manifests,
    remove_missing_recordings_and_supervisions,
    trim_supervisions_to_recordings,
    validate_recordings_and_supervisions,
)
from lhotse.utils import Pathlike


def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespace on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}

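# Hedged usage sketch for prepare_cmu_arctic: the corpus path below is a
# hypothetical placeholder. `CutSet.from_manifests` is the standard Lhotse
# way to turn the returned manifests into cuts for downstream processing.
def _example_cmu_arctic_usage() -> None:
    from lhotse import CutSet

    manifests = prepare_cmu_arctic(
        corpus_dir="/data/cmu_arctic",  # hypothetical download location
        output_dir="manifests",
    )
    cuts = CutSet.from_manifests(
        recordings=manifests["recordings"],
        supervisions=manifests["supervisions"],
    )
    print(f"Prepared {len(cuts)} cuts.")
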
def parse_icsi_annotations(
    transcripts_dir: Pathlike, normalize: str = "upper"
) -> Tuple[Dict[str, List[SupervisionSegment]], Dict[str, Dict[str, int]]]:
    annotations = defaultdict(list)
    # In Lhotse, channels are integers, so we map channel ids to integers
    # for each session.
    channel_to_idx_map = defaultdict(dict)
    spk_to_channel_map = defaultdict(dict)

    # First we get global speaker ids and channels.
    for meeting_file in tqdm(
        transcripts_dir.rglob("*.mrt"), desc="Parsing ICSI mrt files"
    ):
        if meeting_file.stem == "preambles":
            continue
        with open(meeting_file) as f:
            meeting_id = meeting_file.stem
            root = ET.parse(f).getroot()  # <Meeting>
            for child in root:
                if child.tag == "Preamble":
                    for grandchild in child:
                        if grandchild.tag == "Channels":
                            channel_to_idx_map[meeting_id] = {
                                channel.attrib["Name"]: idx
                                for idx, channel in enumerate(grandchild)
                            }
                        elif grandchild.tag == "Participants":
                            for speaker in grandchild:
                                # Some speakers may not have an associated channel
                                # in some meetings, so we assign them the SDM
                                # channel.
                                spk_to_channel_map[meeting_id][
                                    speaker.attrib["Name"]
                                ] = (
                                    speaker.attrib["Channel"]
                                    if "Channel" in speaker.attrib
                                    else "chan6"
                                )
                elif child.tag == "Transcript":
                    for segment in child:
                        if len(list(segment)) == 0 and "Participant" in segment.attrib:
                            start_time = float(segment.attrib["StartTime"])
                            end_time = float(segment.attrib["EndTime"])
                            speaker = segment.attrib["Participant"]
                            channel = spk_to_channel_map[meeting_id][speaker]
                            text = normalize_text(
                                segment.text.strip(), normalize=normalize
                            )
                            annotations[(meeting_id, speaker, channel)].append(
                                IcsiSegmentAnnotation(
                                    text,
                                    speaker,
                                    channel,
                                    speaker[0],  # first letter of the id encodes gender
                                    start_time,
                                    end_time,
                                )
                            )
    return annotations, channel_to_idx_map

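# A minimal sketch of consuming parse_icsi_annotations' output: total speech
# time per speaker across all meetings. It assumes IcsiSegmentAnnotation
# exposes `start_time` and `end_time` attributes matching the positional
# arguments used above.
def _example_icsi_speaker_durations(transcripts_dir: Path) -> Dict[str, float]:
    annotations, _ = parse_icsi_annotations(transcripts_dir, normalize="upper")
    totals: Dict[str, float] = defaultdict(float)
    # Keys are (meeting_id, speaker, channel) tuples.
    for (_, speaker, _), segments in annotations.items():
        for seg in segments:
            totals[speaker] += seg.end_time - seg.start_time
    return dict(totals)
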
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except Exception:
            age = None
        for l in lines:
            l = l[2:-2]  # get rid of parentheses and whitespace on the edges
            seg_id, text = l.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id
            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}

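# Hedged sketch: selecting a single language from the CMU Indic manifests.
# "/data/cmu_indic" is a hypothetical path, and the language labels come from
# the module-level LANGUAGE_MAP, so "Hindi" is only illustrative.
def _example_cmu_indic_single_language() -> SupervisionSet:
    manifests = prepare_cmu_indic(corpus_dir="/data/cmu_indic")
    return manifests["supervisions"].filter(lambda s: s.language == "Hindi")
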
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys
        'recordings' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f"No such directory: {speech_dir}"
    assert transcript_dir.is_dir(), f"No such directory: {transcript_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # Set some patterns to match fields in transcript files and filenames.
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile("Answers_Spanish")
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile("Recordings_Spanish")
    heroico_recitations_train_path_pattern = re.compile("Recordings_Spanish")
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+"
    )
    usma_native_path_pattern = re.compile("usma/native")
    usma_native_prompt_id_pattern = re.compile(r"s\d+")
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r"nonnative.+\.wav")

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    transcripts = defaultdict(dict)

    # Store answers transcripts.
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding="iso-8859-1") as f:
        for line in f:
            line = line.rstrip()
            # Some recordings do not have a transcript; skip them here.
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id.
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split("/")
            utt_id = "-".join(["answers", spk_id, prompt_id])
            transcripts[utt_id] = text

    # Store heroico recitations transcripts.
    heroico_recitations_trans_path = Path(transcript_dir, heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding="iso-8859-1") as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = "-".join(["heroico-recitations", idx])
            transcripts[utt_id] = text

    # Store usma transcripts.
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding="iso-8859-1") as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = "-".join(["usma", idx])
            transcripts[utt_id] = text

    # Store utterance info.
    audio_paths = speech_dir.rglob("*.wav")
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # Store utterance info for Heroico Answers.
            spk = wav_path.parts[-2]
            utt_id = "-".join(["answers", spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold="train",
                speaker=spk,
                prompt_id=pid,
                subcorpus="answers",
                utterance_id=utt_id,
                transcript=transcripts[utt_id],
            )
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # Store utterance info for usma native data.
            spk = wav_path.parts[-2]
            utt_id = "-".join(["usma", spk, pid])
            trans_id = "-".join(["usma", pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold="test",
                speaker=spk,
                prompt_id=pid,
                subcorpus="usma",
                utterance_id=utt_id,
                transcript=transcripts[trans_id],
            )
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # Store utterance data for usma nonnative data.
            spk = wav_path.parts[-2]
            utt_id = "-".join(["usma", spk, pid])
            trans_id = "-".join(["usma", pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold="test",
                speaker=spk,
                prompt_id=pid,
                subcorpus="usma",
                utterance_id=utt_id,
                transcript=transcripts[trans_id],
            )
        elif int(pid) <= 354 or int(pid) >= 562:
            # Store utterance info for heroico recitations (train fold).
            spk = wav_path.parts[-2]
            utt_id = "-".join(["heroico-recitations", spk, pid])
            trans_id = "-".join(["heroico-recitations", pid])
            uttdata[str(wav_file)] = UttInfo(
                fold="train",
                speaker=spk,
                prompt_id=pid,
                subcorpus="heroico-recitations",
                utterance_id=utt_id,
                transcript=transcripts[trans_id],
            )
        elif int(pid) > 354 and int(pid) < 562:
            # Store utterance info for heroico recitation repeats (devtest fold).
            spk = wav_path.parts[-2]
            utt_id = "-".join(["heroico-recitations-repeats", spk, pid])
            trans_id = "-".join(["heroico-recitations-repeats", pid])
            uttdata[str(wav_file)] = UttInfo(
                fold="devtest",
                speaker=spk,
                prompt_id=pid,
                subcorpus="heroico-recitations-repeats",
                utterance_id=utt_id,
                transcript=transcripts[trans_id],
            )
        else:
            logging.warning(f"Skipping unrecognized file: {wav_file}")

    audio_files = list(speech_dir.rglob("*.wav"))

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # Skip files with no record.
            if not uttdata[str(wav_file)]:
                continue
            # Only process the current fold.
            if uttdata[str(wav_file)].fold != fld:
                continue
            prompt_id = wav_path.stem
            # Audio metadata: channel count, sample rate, frame count,
            # duration, and encoding.
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = "-".join([uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript,
            )

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type="file",
                        channels=[0],
                        source=str(metadata[idx].audio_path),
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.samplerate),
                num_samples=metadata[idx].audio_info.frames,
                duration=metadata[idx].audio_info.duration,
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language="Spanish",
                speaker=idx.split("-")[-2],
                text=metadata[idx].text,
            )
            for idx in audio.recordings
        )

        validate_recordings_and_supervisions(audio, supervision)

        if output_dir is not None:
            supervision.to_json(output_dir / f"supervisions_{fld}.json")
            audio.to_json(output_dir / f"recordings_{fld}.json")

        manifests[fld] = {"recordings": audio, "supervisions": supervision}

    return manifests

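# Hedged usage sketch for prepare_heroico: the LDC package (LDC2006S37) ships
# with separate speech and transcript roots. The paths below are hypothetical
# placeholders for wherever the package was extracted.
def _example_heroico_usage() -> None:
    manifests = prepare_heroico(
        speech_dir="/data/LDC2006S37/data/speech",
        transcript_dir="/data/LDC2006S37/data/transcripts",
        output_dir="manifests",
    )
    # One entry per fold (e.g. train / devtest / test).
    for fold, m in manifests.items():
        print(fold, len(m["recordings"]), len(m["supervisions"]))
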
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan ``corpus_dir`` for a directory named
          ``conversational``; if there is more than one, it picks the first one
          (and emits a warning);
        - then, it will try to find ``dev``, ``eval``, and ``training`` splits
          inside (if any of them is not present, it will skip it with a warning);
        - finally, it scans the selected location for SPHERE audio files and
          transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored.
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: a dict with keys 'dev', 'eval', and 'training', whose values are
        dicts of {'recordings': ..., 'supervisions': ...}.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory.
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path."
        )
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and not the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph")
        )
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav")
        )
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            # 0 -> BABEL
            # 1 -> BP
            # 2 -> <language-code> (101)
            # 3 -> <speaker-id> (10033)
            # 4 -> <date> (20111024)
            # 5 -> <hour> (205740)
            # 6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _, _, lang_code, speaker, date, hour, channel, *_ = p.stem.split("_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines
            # with no transcript in between.
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l
                for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as
            # "next_timestamp" and ends the iteration (otherwise we'd lose the
            # last segment).
            lines += [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(
                2, zip(lines[::2], lines[1::2])
            ):
                try:
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        )
                    )
                except Exception as e:
                    logging.warning(f"Error while parsing segment. Message: {str(e)}")
                    raise ValueError(
                        f"Failed to parse segments in file '{p}' - "
                        f"see the warning above for details."
                    )

        supervisions = deduplicate_supervisions(supervisions)
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {"recordings": recordings, "supervisions": supervisions}

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir / f"recordings_{language}_{save_split}.json")
            supervisions.to_file(
                output_dir / f"supervisions_{language}_{save_split}.json"
            )

    return dict(manifests)

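# A toy illustration of the timestamp-pairing trick used above: BABEL
# transcripts alternate "[time]" lines and text lines, so zipping the even and
# odd lines yields (timestamp, text) pairs, and a sliding window over those
# pairs supplies each segment's end time from the next timestamp. The trailing
# None is the same sentinel the recipe appends.
def _example_babel_segment_pairing() -> None:
    lines = ["[0.0]", "hello", "[1.5]", "world", "[2.75]", None]
    for (timestamp, text), (next_timestamp, _) in sliding_window(
        2, zip(lines[::2], lines[1::2])
    ):
        start, end = float(timestamp[1:-1]), float(next_timestamp[1:-1])
        print(f"{start:.2f}-{end:.2f}: {text}")
    # Prints:
    #   0.00-1.50: hello
    #   1.50-2.75: world
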
def prepare_icsi(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: str = "ihm",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param audio_dir: Pathlike, the path which holds the audio data
    :param transcripts_dir: Pathlike, the path which holds the transcripts data
    :param output_dir: Pathlike, the path where to write the manifests -
        `None` means manifests aren't stored on disk.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :return: a Dict whose keys are 'train', 'dev', and 'test', and whose values
        are dicts of manifests under keys 'recordings' and 'supervisions'.
    """
    audio_dir = Path(audio_dir)
    transcripts_dir = Path(transcripts_dir)

    assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
    assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}"
    assert mic in MIC_TO_CHANNELS, f"Mic {mic} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing ICSI transcripts")
    annotations, channel_to_idx_map = parse_icsi_annotations(
        transcripts_dir, normalize=normalize_text
    )

    # Audio
    logging.info("Preparing recording manifests")
    channels = "".join(MIC_TO_CHANNELS[mic])
    if mic == "ihm" or mic == "mdm":
        audio_paths = audio_dir.rglob(f"chan[{channels}].sph")
        audio = prepare_audio_grouped(
            list(audio_paths), channel_to_idx_map if mic == "ihm" else None
        )
    elif mic == "sdm" or mic == "ihm-mix":
        audio_paths = (
            audio_dir.rglob(f"chan[{channels}].sph")
            if len(channels)
            else audio_dir.rglob("*.wav")
        )
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (
        prepare_supervision_ihm(audio, annotations, channel_to_idx_map)
        if mic == "ihm"
        else prepare_supervision_other(audio, annotations)
    )

    manifests = defaultdict(dict)

    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in PARTITIONS[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in PARTITIONS[part]
        )

        # Fix and validate the manifests before writing them to disk.
        audio_part, supervision_part = fix_manifests(audio_part, supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir / f"recordings_{part}.jsonl")
            supervision_part.to_file(output_dir / f"supervisions_{part}.jsonl")

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part,
        }

    return dict(manifests)

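# Hedged usage sketch for prepare_icsi with the single distant microphone
# setting. The audio/transcript paths are hypothetical placeholders.
def _example_icsi_sdm() -> None:
    manifests = prepare_icsi(
        audio_dir="/data/icsi/audio",
        transcripts_dir="/data/icsi/transcripts",
        output_dir="manifests",
        mic="sdm",
        normalize_text="kaldi",
    )
    dev = manifests["dev"]
    print(len(dev["recordings"]), len(dev["supervisions"]))
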
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f"{wav.parent.parent.name.lower()}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        is_suitcase_corpus = "suitcase_corpus" in path.parts
        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = path.stem
        seg_id = (
            f"suitcase_corpus-{speaker}"
            if is_suitcase_corpus
            else f"{speaker}-{path.stem}"
        )
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=speaker_meta[speaker]["gender"],
                custom={"accent": speaker_meta[speaker]["native_lang"]},
            )
        )
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        "read": {
            "recordings": recordings.filter(lambda r: "suitcase_corpus" not in r.id),
            "supervisions": supervisions.filter(
                lambda s: "suitcase_corpus" not in s.recording_id
            ),
        },
        "suitcase": {
            "recordings": recordings.filter(lambda r: "suitcase_corpus" in r.id),
            "supervisions": supervisions.filter(
                lambda s: "suitcase_corpus" in s.recording_id
            ),
        },
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for key, manifests in splits.items():
            manifests["recordings"].to_file(
                output_dir / f"l2-arctic_recordings_{key}.jsonl.gz"
            )
            manifests["supervisions"].to_file(
                output_dir / f"l2-arctic_supervisions_{key}.jsonl.gz"
            )

    return splits

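# Hedged sketch: the two returned splits separate read speech from the
# spontaneous suitcase-corpus recordings. "/data/l2arctic" is a placeholder.
def _example_l2_arctic_splits() -> None:
    splits = prepare_l2_arctic(corpus_dir="/data/l2arctic")
    read_supervisions = splits["read"]["supervisions"]
    suitcase_supervisions = splits["suitcase"]["supervisions"]
    # Accent metadata lives in the `custom` field of each supervision.
    accents = {s.custom["accent"] for s in read_supervisions}
    print(sorted(accents), len(suitcase_supervisions))
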