def prepare_single_partition(
    raw_manifest_path: Path,
    corpus_dir: Path,
    speaker_id: str,
    clean_or_other: str,
):
    """
    Build the recording and supervision manifests for one partition.

    Every entry of the JSONL file at ``raw_manifest_path`` yields one recording
    (read from ``corpus_dir / entry["audio_filepath"]``) and one supervision that
    spans the whole recording.

    :param raw_manifest_path: path of the JSONL manifest to read.
    :param corpus_dir: directory that the ``audio_filepath`` entries are relative to.
    :param speaker_id: key used to look up the speaker name and gender
        in ``ID2SPEAKER`` and ``ID2GENDER``.
    :param clean_or_other: value stored under ``custom["split"]`` of each supervision.
    :return: a ``(RecordingSet, SupervisionSet)`` tuple, validated against each other.
    """
    recording_list = []
    segment_list = []
    for entry in load_jsonl(raw_manifest_path):
        rec = Recording.from_file(corpus_dir / entry["audio_filepath"])
        recording_list.append(rec)
        # The supervision reuses the recording ID and covers the full audio.
        segment = SupervisionSegment(
            id=rec.id,
            recording_id=rec.id,
            start=0,
            duration=rec.duration,
            channel=0,
            text=entry["text"],
            speaker=ID2SPEAKER[speaker_id],
            gender=ID2GENDER[speaker_id],
            custom={
                "text_punct": entry["text_normalized"],
                "split": clean_or_other,
            },
        )
        segment_list.append(segment)

    recording_set = RecordingSet.from_recordings(recording_list)
    supervision_set = SupervisionSet.from_segments(segment_list)
    validate_recordings_and_supervisions(recording_set, supervision_set)
    return recording_set, supervision_set
def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys
    'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Directory where the manifests are written; it is created
        when missing. When ``None``, nothing is written to disk.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create the target directory up front so that writing the manifests
        # does not fail on a path that does not exist yet.
        output_dir.mkdir(parents=True, exist_ok=True)
    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / "sph").glob("*.sph"))
        stms = list((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM...")
        segments = []
        for stm_path in stms:
            with stm_path.open() as f:
                # ``idx`` counts every STM line, including the skipped ones,
                # so segment IDs follow the line position in the file.
                for idx, line in enumerate(f):
                    rec_id, _, _, start, end, _, *words = line.split()
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        ))
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
        validate_recordings_and_supervisions(**corpus[split])
        if output_dir is not None:
            recordings.to_file(output_dir /
                               f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir /
                                 f"tedlium_supervisions_{split}.jsonl.gz")
    return corpus
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests which consist of Recordings and Supervisions.

    Recordings are collected from ``<corpus_dir>/wav48`` and transcripts from
    ``<corpus_dir>/txt``; each transcript file yields one supervision that spans
    the whole recording with the same ID.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    speaker_meta = _parse_speaker_description(corpus_dir)
    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav)
        for wav in (corpus_dir / "wav48").rglob("*.wav"))
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            # Warn, then fall back to a mapping that answers None for every
            # field so that the supervision can still be created.
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    #                 I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_tedlium(
        tedlium_root: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys
    'recordings' and 'supervisions'.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Directory where the JSON manifests are written; it is
        created when missing. When ``None``, nothing is written to disk.
    :return: A dict with standard corpus splits containing the manifests.
    """
    tedlium_root = Path(tedlium_root)
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the destination exists before any manifest is written.
        output_dir.mkdir(parents=True, exist_ok=True)
    corpus = {}
    for split in ('train', 'dev', 'test'):
        root = tedlium_root / 'legacy' / split
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in (root / 'sph').glob('*.sph')
        )
        stms = list((root / 'stm').glob('*.stm'))
        assert len(stms) == len(recordings), f'Mismatch: found {len(recordings)} ' \
                                             f'sphere files and {len(stms)} STM files. ' \
                                             f'You might be missing some parts of TEDLIUM...'
        segments = []
        for stm_path in stms:
            with stm_path.open() as f:
                # ``idx`` counts every STM line, including the skipped ones,
                # so segment IDs follow the line position in the file.
                for idx, line in enumerate(f):
                    rec_id, _, _, start, end, _, *words = line.split()
                    start, end = float(start), float(end)
                    text = ' '.join(words).replace('{NOISE}', '[NOISE]')
                    if text == 'ignore_time_segment_in_scoring':
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f'{rec_id}-{idx}',
                            recording_id=rec_id,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language='English',
                            speaker=rec_id,
                        )
                    )
        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {
            'recordings': recordings,
            'supervisions': supervisions
        }
        validate_recordings_and_supervisions(**corpus[split])
        if output_dir is not None:
            recordings.to_json(output_dir / f'{split}_recordings.json')
            supervisions.to_json(output_dir / f'{split}_supervisions.json')
    return corpus
def prepare_single_babel_language(corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None):
    """
    Prepare recording and supervision manifests for a single BABEL language pack.

    For each of the 'dev', 'eval' and 'training' splits, SPHERE files are read from
    ``conversational/<split>/audio`` and transcripts from
    ``conversational/<split>/transcription``. Transcript files are expected to
    alternate between a bracketed timestamp line and a text line; each text line
    becomes a supervision lasting until the next timestamp.

    :param corpus_dir: Pathlike, the root directory of the language pack.
    :param output_dir: Pathlike, the directory where the JSON manifests are written
        (created when missing). When ``None``, nothing is written.
    :return: a dict keyed by split name ('dev', 'eval', 'training'); each value is a
        dict of {'recordings': ..., 'supervisions': ...}.
    """
    corpus_dir = Path(corpus_dir)
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    for split in ('dev', 'eval', 'training'):
        audio_dir = corpus_dir / f'conversational/{split}/audio'
        recordings = RecordingSet.from_recordings(Recording.from_sphere(p) for p in audio_dir.glob('*.sph'))
        if len(recordings) == 0:
            logging.warning(f"No SPHERE files found in {audio_dir}")
        manifests[split]['recordings'] = recordings

        supervisions = []
        # Reset per split so a language code from a previous split is never reused.
        lang_code = None
        text_dir = corpus_dir / f'conversational/{split}/transcription'
        for p in text_dir.glob('*'):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            _, _, lang_code, speaker, date, hour, channel, *_ = p.stem.split('_')
            channel = {'inLine': 'A', 'outLine': 'B'}.get(channel, 'A')
            # Add a None at the end so that the last timestamp is only used as "next_timestamp"
            # and ends the iteration (otherwise we'd lose the last segment).
            lines = p.read_text().splitlines() + [None]
            for (timestamp, text), (next_timestamp, _) in sliding_window(2, zip(lines[::2], lines[1::2])):
                # Timestamps are wrapped in one character on each side, e.g. "[12.34]".
                start = float(timestamp[1:-1])
                end = float(next_timestamp[1:-1])
                supervisions.append(
                    SupervisionSegment(
                        id=f'{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}',
                        recording_id=p.stem,
                        start=start,
                        duration=round(end - start, ndigits=8),
                        channel=0,
                        text=normalize_text(text),
                        language=BABELCODE2LANG[lang_code],
                        speaker=speaker,
                    )
                )
        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        manifests[split]['supervisions'] = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(
            manifests[split]['recordings'],
            manifests[split]['supervisions']
        )

        if output_dir is None:
            continue
        if lang_code is None:
            # Without any transcript file there is no language code to name the files with.
            logging.warning(f"Cannot determine the language of split '{split}'; manifests were not written.")
            continue
        language = BABELCODE2LANG[lang_code]
        # Only the file name uses 'train'; the returned dict keeps the 'training' key.
        file_split = 'train' if split == 'training' else split
        manifests[split]['recordings'].to_json(output_dir / f'recordings_{language}_{file_split}.json')
        manifests[split]['supervisions'].to_json(output_dir / f'supervisions_{language}_{file_split}.json')

    return manifests
def prepare_cmu_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (created when missing).
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_us_sup_arctic-arctic_a0001
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        for line in lines:
            line = line[2:-2]  # get rid of parentheses and whitespaces on the edges
            seg_id, text = line.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English",
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom={"accent": ACCENT_MAP.get(speaker)},
                )
            )
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create the destination first so that writing does not fail on a missing directory.
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "cmu_arctic_recordings.json")
        supervisions.to_json(output_dir / "cmu_arctic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_norm_cn(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    num_jobs: int = 15,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for a corpus organised into "dev", "test" and "train" parts.

    Each part directory must contain ``text.txt`` (one ``<id> <transcript...>`` entry
    per line) and ``wav.scp`` (one entry per line, handed as-is to ``process_file``
    together with the transcript lookup table).

    :param corpus_dir: Pathlike, the root directory of the corpus.
    :param output_dir: Pathlike, the directory where the JSON manifests are written
        (created when missing). Passing ``None`` skips writing.
    :param num_jobs: number of worker threads used to process the ``wav.scp`` entries.
    :return: a dict keyed by part name; each value is a dict of
        {'recordings': ..., 'supervisions': ...}.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["dev", "test", "train"]
    for part in dataset_parts:
        # Map utterance ID -> transcript (tokens re-joined with single spaces).
        transcript_path = corpus_dir / f"{part}/text.txt"
        transcript_dict = {}
        with open(transcript_path, "r", encoding="utf-8") as f:
            for line in f:
                idx_transcript = line.split()
                if len(idx_transcript) < 2:
                    # A line without both an ID and some text cannot be used.
                    logging.info(f"get transcript err: {line}")
                    continue
                transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])

        file_path = corpus_dir / f"{part}/wav.scp"
        with open(file_path, "r", encoding="utf-8") as f:
            file_paths = [line.strip() for line in f]

        recordings = []
        supervisions = []
        with ThreadPoolExecutor(num_jobs) as ex:
            for recording, supervision in tqdm(
                ex.map(
                    process_file,
                    file_paths,
                    repeat(transcript_dict),
                ),
                desc="Processing NormcnSpeech JSON entries",
                leave=False,
            ):
                # ``process_file`` signals an unusable entry with a None recording.
                if recording is not None:
                    recordings.append(recording)
                    supervisions.append(supervision)

        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set = RecordingSet.from_recordings(recordings)
        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")
        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}

    return manifests
def test_known_issue_with_overlap():
    """Trimming to supervisions keeps only the owning supervision when one lies inside another."""
    r = dummy_recording(0)
    rec = RecordingSet.from_recordings([r])

    # Make two segments. The first segment is 1s long. The second segment
    # is 0.5 seconds long and lies entirely within the first (0.2s - 0.7s).
    # Both have the same recording_id as the single entry in rec.
    sups = SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id="utt1",
                recording_id=r.id,
                start=0.0,
                duration=1.0,
                channel=0,
                text="Hello",
            ),
            SupervisionSegment(
                id="utt2",
                recording_id=r.id,
                start=0.2,
                duration=0.5,
                channel=0,
                text="World",
            ),
        ]
    )

    cuts = CutSet.from_manifests(recordings=rec, supervisions=sups)
    assert len(cuts) == 1

    cuts_trim = cuts.trim_to_supervisions(keep_overlapping=False)
    assert len(cuts_trim) == 2

    # The first cut spans the outer supervision and must not carry the inner one.
    cut = cuts_trim[0]
    assert cut.start == 0
    assert cut.duration == 1
    assert len(cut.supervisions) == 1
    first_sup = cut.supervisions[0]
    assert first_sup.start == 0
    assert first_sup.duration == 1
    assert first_sup.text == "Hello"

    # The second cut spans the inner supervision; its start is relative to the cut.
    cut = cuts_trim[1]
    assert cut.start == 0.2
    assert cut.duration == 0.5
    assert len(cut.supervisions) == 1
    second_sup = cut.supervisions[0]
    assert second_sup.start == 0
    assert second_sup.duration == 0.5
    assert second_sup.text == "World"
def recording_set():
    """Return a RecordingSet holding one two-channel recording with ID 'x'.

    Channel 0 comes from a file source and channel 1 from a command source.
    """
    file_source = AudioSource(
        type='file',
        channels=[0],
        source='text/fixtures/mono_c0.wav',
    )
    command_source = AudioSource(
        type='command',
        channels=[1],
        source='cat text/fixtures/mono_c1.wav',
    )
    recording = Recording(
        id='x',
        sources=[file_source, command_source],
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
    )
    return RecordingSet.from_recordings([recording])
def prepare_same_close_mic(part3_path):
    """
    Create manifests for the wav files found directly under ``AudioSameCloseMic``.

    Each recording is paired with the TextGrid transcript
    ``ScriptsSame/<recording_id>.TextGrid``. A file that fails to process is
    reported and skipped, so one bad file does not abort the whole preparation.

    :param part3_path: path object of the directory containing ``AudioSameCloseMic``
        and ``ScriptsSame``.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
        (part3_path / "AudioSameCloseMic").glob("*.wav"),
        desc="Creating manifests for SameCloseMic",
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_file(audio_path)
            tg = TextGrid(
                part3_path / f"ScriptsSame/{recording_id}.TextGrid", coding="utf-16"
            )
            segments = [
                s
                for s in (
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=segment.xmin,
                        # The duration is capped at the end of the recording, because the
                        # last segment may exceed the actual duration of the audio.
                        # A zero/negative result is dropped by the filter below.
                        duration=min(
                            round(segment.xmax - segment.xmin, ndigits=8),
                            recording.duration - segment.xmin,
                        ),
                        text=segment.text,
                        language="Singaporean English",
                        speaker=recording_id,
                    )
                    for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ("<S>", "<Z>")  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            # Only register the recording once its transcript was parsed successfully.
            recordings.append(recording)
            supervisions.extend(segments)
        except Exception as e:
            # Best-effort processing: report why the file is skipped, but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except`` would not).
            print(
                f"Error when processing {audio_path} - skipping... "
                f"({type(e).__name__}: {e})"
            )
    return {
        "recordings": RecordingSet.from_recordings(recordings),
        "supervisions": SupervisionSet.from_segments(supervisions),
    }
def _prepare_voxceleb_v1(
    corpus_path: Pathlike,
    num_jobs: int,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the VoxCeleb1 corpus. The manifests are created in a dict with
    2 splits: train ("dev") and test.

    :param corpus_path: path object of the corpus root, which holds ``vox1_meta.csv``
        and the ``wav`` directory.
    :param num_jobs: number of worker processes used to process the wav files.
    :return: a dict with keys "train" and "test"; each value is a dict of
        {'recordings': ..., 'supervisions': ...}.
    """
    speaker_metadata = {}
    with open(corpus_path / "vox1_meta.csv", "r") as f:
        next(f)  # skip the header row
        for line in f:
            spkid, name, gender, nationality, split = line.strip().split("\t")
            speaker_metadata[spkid] = SpeakerMetadata(
                id=spkid, name=name, gender=gender, nationality=nationality, split=split
            )
    with ProcessPoolExecutor(num_jobs) as ex:
        recordings = []
        supervisions = []
        futures = []
        for p in (corpus_path / "wav").rglob("*.wav"):
            futures.append(ex.submit(_process_file, p, speaker_metadata))
        for future in tqdm(
            as_completed(futures),
            total=len(futures),
            desc="Processing VoxCeleb1",
            leave=False,
        ):
            recording, supervision = future.result()
            recordings.append(recording)
            supervisions.append(supervision)
    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    manifests = defaultdict(dict)
    # Split into dev and test sets based on the split of the speakers.
    for split in ("dev", "test"):
        # Bind the loop variable as a default argument so the predicate keeps the
        # right split even if the filter is evaluated after the loop has moved on.
        manifests[split]["supervisions"] = supervision_set.filter(
            lambda s, split=split: s.custom["split"] == split
        )
        # A set makes the per-recording membership test O(1) instead of O(n).
        split_ids = {s.recording_id for s in manifests[split]["supervisions"]}
        manifests[split]["recordings"] = recording_set.filter(
            lambda r, split_ids=split_ids: r.id in split_ids
        )
    # The corpus calls its training portion "dev"; expose it as "train".
    manifests["train"] = manifests.pop("dev")
    return manifests
def recording_set():
    """Return a RecordingSet with a single recording "x" built from two audio sources.

    The first source is a file providing channel 0, the second one is a command
    providing channel 1.
    """
    source_specs = (
        ("file", 0, "text/fixtures/mono_c0.wav"),
        ("command", 1, "cat text/fixtures/mono_c1.wav"),
    )
    sources = [
        AudioSource(type=kind, channels=[channel], source=location)
        for kind, channel, location in source_specs
    ]
    recording = Recording(
        id="x",
        sources=sources,
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5,
    )
    return RecordingSet.from_recordings([recording])
def _prepare_voxceleb_v2(
    corpus_path: Pathlike,
    num_jobs: int,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the VoxCeleb2 corpus. The manifests are created in the same dict
    without any splits since the whole data is used in the final "train" split.

    The wav files are collected from one sub-directory of ``corpus_path`` per split
    name listed in ``vox2_meta.csv``.

    :param corpus_path: path object of the corpus root, which holds ``vox2_meta.csv``.
    :param num_jobs: number of worker processes used to process the wav files.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    # Read the speaker metadata.
    speaker_metadata = {}
    # Split names in order of first appearance. Previously only the value left
    # over from the last CSV row was used to locate the audio.
    splits = []
    with open(corpus_path / "vox2_meta.csv", "r") as f:
        next(f)  # skip the header row
        for line in f:
            spkid, _, gender, split = map(str.strip, line.split(","))
            speaker_metadata[spkid] = SpeakerMetadata(id=spkid,
                                                      name="",
                                                      gender=gender,
                                                      nationality="",
                                                      split=split)
            if split not in splits:
                splits.append(split)
    # Read the wav files and prepare manifests
    recordings = []
    supervisions = []
    with ProcessPoolExecutor(num_jobs) as ex:
        for split in splits:
            futures = [
                ex.submit(_process_file, p, speaker_metadata, type="command")
                for p in (corpus_path / split).glob("*.wav")
            ]
            for future in tqdm(
                    futures,
                    total=len(futures),
                    desc=f"Processing VoxCeleb2 {split} split...",
                    leave=False,
            ):
                recording, supervision = future.result()
                recordings.append(recording)
                supervisions.append(supervision)
    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    manifests = {
        "recordings": recording_set,
        "supervisions": supervision_set,
    }
    return manifests
def prepare_callhome_english(
        audio_dir: Pathlike,
        rttm_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome English corpus.
    We create two manifests: one with recordings, and the other one with
    supervisions read from the ``fullref.rttm`` file.

    Recordings and supervisions without a counterpart are removed, and the
    supervisions are trimmed to their recordings before validation.

    :param audio_dir: Path to ``LDC2001S97`` package; it is searched recursively for ``*.sph`` files.
    :param rttm_dir: Path to the directory containing ``fullref.rttm``.
        If not provided, the metadata will be downloaded.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    # ``rttm_dir`` may be a plain string; convert before joining path components.
    rttm_path = Path(rttm_dir) / 'fullref.rttm'
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, '*.sph')
    recordings = RecordingSet.from_recordings(
        make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
        for p in tqdm(audio_paths)
    )

    recordings, supervisions = remove_missing_recordings_and_supervisions(recordings, supervisions)
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
def prepare_callhome_english_sre(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Callhome American English portion.
    We create two manifests: one with recordings, and the other one with
    supervisions read from the ``fullref.rttm`` file.

    :param audio_dir: Path to ``LDC2001S97`` package; it is searched recursively for ``*.sph`` files.
    :param rttm_dir: Path to the directory containing ``fullref.rttm``.
        If not provided, the metadata will be downloaded.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    # ``rttm_dir`` may be a plain string; convert before joining path components.
    rttm_path = Path(rttm_dir) / "fullref.rttm"
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            p, relative_path_depth=None if absolute_paths else 4)
        for p in tqdm(audio_paths))

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")
    return {"recordings": recordings, "supervisions": supervisions}
def prepare_separate_phone_mic(part3_path):
    """
    Create manifests for the wav files found recursively under ``AudioSeparateIVR``.

    The recording ID is ``<parent directory name>_<file stem>``; each recording is paired
    with the TextGrid transcript ``ScriptsSeparate/<recording_id>.TextGrid``.
    A file that fails to process is reported and skipped, so one bad file does not
    abort the whole preparation.

    :param part3_path: path object of the directory containing ``AudioSeparateIVR``
        and ``ScriptsSeparate``.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSeparateIVR').rglob('**/*.wav'),
            desc='Creating manifests for SeparateIVR'
    ):
        try:
            recording_id = f'{audio_path.parent.name}_{audio_path.stem}'
            recording = Recording.from_file(audio_path)
            tg = TextGrid(part3_path / f'ScriptsSeparate/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        # The duration is capped at the end of the recording, because the
                        # last segment may exceed the actual duration of the audio.
                        # A zero/negative result is dropped by the filter below.
                        duration=min(round(segment.xmax - segment.xmin, ndigits=8),
                                     recording.duration - segment.xmin),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    ) for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            supervisions.extend(segments)
            recordings.append(recording)
        except Exception as e:
            # Best-effort processing: report why the file is skipped, but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except`` would not).
            print(f'Error when processing {audio_path} - skipping... ({type(e).__name__}: {e})')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
def prepare_same_close_mic(part3_path):
    """
    Create manifests for the wav files found directly under ``AudioSameCloseMic``.

    Each recording is paired with the TextGrid transcript
    ``ScriptsSame/<recording_id>.TextGrid``. A file that fails to process is
    reported and skipped, so one bad file does not abort the whole preparation.

    :param part3_path: path object of the directory containing ``AudioSameCloseMic``
        and ``ScriptsSame``.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    check_dependencies()
    from textgrids import TextGrid

    recordings = []
    supervisions = []
    for audio_path in tqdm(
            (part3_path / 'AudioSameCloseMic').glob('*.wav'),
            desc='Creating manifests for SameCloseMic'
    ):
        try:
            recording_id = audio_path.stem
            recording = Recording.from_wav(audio_path)
            tg = TextGrid(part3_path / f'ScriptsSame/{recording_id}.TextGrid', coding='utf-16')
            segments = [
                s for s in (
                    SupervisionSegment(
                        id=f'{recording_id}-{idx}',
                        recording_id=recording_id,
                        start=segment.xmin,
                        duration=round(segment.xmax - segment.xmin, ndigits=8),
                        text=segment.text,
                        language='Singaporean English',
                        speaker=recording_id,
                    ) for idx, segment in enumerate(tg[recording_id])
                    if segment.text not in ('<S>', '<Z>')  # skip silences
                )
                if s.duration > 0  # NSC has some bad segments
            ]
            # Only register the recording once its transcript was parsed successfully.
            recordings.append(recording)
            supervisions.extend(segments)
        except Exception as e:
            # Best-effort processing: report why the file is skipped, but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except`` would not).
            print(f'Error when processing {audio_path} - skipping... ({type(e).__name__}: {e})')
    return {
        'recordings': RecordingSet.from_recordings(recordings),
        'supervisions': SupervisionSet.from_segments(supervisions)
    }
def _bvcc_read_lines(path):
    """Return the lines of ``path`` (line endings kept), closing the file afterwards."""
    with open(path) as f:
        return f.readlines()


def _bvcc_labeled_part(list_path, all_recordings, parse_line):
    """Build the {'recordings', 'supervisions'} manifests for one labeled set list."""
    supervisions = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(_bvcc_read_lines(list_path)),
            all_recordings,
            parse_line,
        ))
    # Keep only the recordings whose ID is present in the supervision set.
    recordings = all_recordings.filter(lambda rec: rec.id in supervisions)
    return {"recordings": recordings, "supervisions": supervisions}


def prepare_bvcc(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare the manifests of the main and out-of-domain (OOD) tracks.

    ``corpus_dir`` must contain ``phase1-main`` and ``phase1-ood``, each with
    ``DATA/sets`` (the ``DEVSET`` and ``TRAINSET`` lists, plus
    ``unlabeled_mos_list.txt`` for the OOD track) and ``DATA/wav``.

    :param corpus_dir: Pathlike, the directory holding both track directories.
    :param output_dir: Pathlike, the directory where the manifests are written
        (created when missing). When ``None``, nothing is written.
    :param num_jobs: number of jobs used when scanning the wav directories.
    :return: a dict with keys "main1_dev", "main1_train", "ood1_unlabeled",
        "ood1_dev" and "ood1_train". Each value is a dict with "recordings" and
        "supervisions", except "ood1_unlabeled" which only has "recordings".
    """
    corpus_dir = Path(corpus_dir)

    phase1_main = (corpus_dir / "phase1-main").resolve()
    assert phase1_main.exists(), f"Main track dir is missing {phase1_main}"

    main1_sets = phase1_main / "DATA" / "sets"
    main1_wav = phase1_main / "DATA" / "wav"
    assert (main1_sets.exists() and main1_wav.exists()
            ), f"Have you run data preparation in {phase1_main}?"

    main1_devp = main1_sets / "DEVSET"
    assert main1_devp.exists(), main1_devp
    main1_trainp = main1_sets / "TRAINSET"
    assert main1_trainp.exists(), main1_trainp

    phase1_ood = (corpus_dir / "phase1-ood").resolve()
    assert phase1_ood.exists(
    ), f"Out of domain track dir is missing {phase1_ood}"

    ood1_sets = phase1_ood / "DATA" / "sets"
    ood1_wav = phase1_ood / "DATA" / "wav"
    assert (ood1_sets.exists() and ood1_wav.exists()
            ), f"Have you run data preparation in {phase1_ood}?"

    ood1_unlabeled = ood1_sets / "unlabeled_mos_list.txt"
    assert ood1_unlabeled.exists(), ood1_unlabeled
    ood1_devp = ood1_sets / "DEVSET"
    assert ood1_devp.exists(), ood1_devp
    ood1_trainp = ood1_sets / "TRAINSET"
    # Report the path that was actually checked.
    assert ood1_trainp.exists(), ood1_trainp

    manifests = {}

    # ### Main track sets
    main1_recs = RecordingSet.from_dir(main1_wav,
                                       pattern="*.wav",
                                       num_jobs=num_jobs)

    logging.info("Preparing main1_dev")
    manifests["main1_dev"] = _bvcc_labeled_part(main1_devp, main1_recs, parse_main_line)

    logging.info("Preparing main1_train")
    manifests["main1_train"] = _bvcc_labeled_part(main1_trainp, main1_recs, parse_main_line)

    # ### Out of Domain (OOD) track sets
    unlabeled_wavpaths = [
        ood1_wav / name.strip()
        for name in _bvcc_read_lines(ood1_unlabeled)
    ]
    manifests["ood1_unlabeled"] = {
        "recordings":
        RecordingSet.from_recordings(
            Recording.from_file(p) for p in unlabeled_wavpaths)
    }

    ood1_recs = RecordingSet.from_dir(ood1_wav,
                                      pattern="*.wav",
                                      num_jobs=num_jobs)

    logging.info("Preparing ood1_dev")
    manifests["ood1_dev"] = _bvcc_labeled_part(ood1_devp, ood1_recs, parse_ood_line)

    logging.info("Preparing ood1_train")
    manifests["ood1_train"] = _bvcc_labeled_part(ood1_trainp, ood1_recs, parse_ood_line)

    # Optionally serializing to disc
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, d in manifests.items():
            d["recordings"].to_file(output_dir /
                                    f"recordings_{part}.jsonl.gz")
            # The unlabeled part has no supervisions.
            if "supervisions" in d:
                d["supervisions"].to_file(output_dir /
                                          f"supervisions_{part}.jsonl.gz")

    return manifests
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    Every ``*.wav`` file under ``corpus_dir`` becomes a recording and every ``*.txt``
    file becomes one supervision spanning the whole recording with the same ID.
    Items whose ID contains ``suitcase_corpus`` form the "suitcase" split; all the
    others form the "read" split.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (created when missing).
    :return: a dict with keys "read" and "suitcase".
        Each hold another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav,
            recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}')
        for wav in corpus_dir.rglob('*.wav'))
    supervisions = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()

        is_suitcase_corpus = 'suitcase_corpus' in path.parts

        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = path.stem

        seg_id = f'suitcase_corpus-{speaker}' if is_suitcase_corpus else f'{speaker}-{path.stem}'
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']}))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        'read': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' not in r.id),
            'supervisions':
            supervisions.filter(
                lambda s: 'suitcase_corpus' not in s.recording_id)
        },
        'suitcase': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' in r.id),
            'supervisions':
            supervisions.filter(lambda s: 'suitcase_corpus' in s.recording_id)
        }
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir /
                                            f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir /
                                              f'supervisions-{key}.json')

    return splits
def scan_recordings(corpus_dir: Path) -> RecordingSet:
    """Build a RecordingSet from every ``*.wav`` file found recursively under ``corpus_dir``."""
    wav_paths = corpus_dir.rglob('*.wav')
    return RecordingSet.from_recordings(map(Recording.from_file, wav_paths))
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Pathlike = None,
    type: str = "mdm",
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    NOTE: The audio file used for each session depends on ``type``:
    ``record/raw_recording.wav`` for 'mdm', ``clean/mix.wav`` for 'ihm-mix' and
    ``clean/each_spk.wav`` for 'ihm'. For 'ihm', the supervision channel is looked up
    per session and speaker in ``SPK_TO_CHANNEL_MAP``; otherwise it is always 0.

    :param corpus_dir: Pathlike, the path to the extracted corpus. The ``for_release``
        sub-directory is appended automatically when ``corpus_dir`` does not already
        point at it.
    :param output_dir: Pathlike, the path where to write the manifests
        (created when missing). When ``None``, nothing is written.
    :param type: str, the type of data to prepare ('mdm', 'ihm-mix', or 'ihm').
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    assert type in ["mdm", "ihm-mix", "ihm"]

    corpus_dir = Path(corpus_dir)
    corpus_dir = (corpus_dir / "for_release"
                  if corpus_dir.stem != "for_release" else corpus_dir)

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            # The session directory name has seven "_"-separated fields; the last two
            # are the session name and "actual<overlap ratio>".
            _, _, _, _, _, name, actual_ov = session.name.split("_")
            actual_ov = float(actual_ov.split("actual")[1])
            recording_id = f"{ov}_{name}"
            if type == "ihm-mix":
                audio_path = session / "clean" / "mix.wav"
            elif type == "ihm":
                audio_path = session / "clean" / "each_spk.wav"
            else:
                audio_path = session / "record" / "raw_recording.wav"
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id))
            for idx, seg in enumerate(
                    parse_transcript(session / "transcription" /
                                     "meeting_info.txt")):
                # seg[0] / seg[1] are used as start / end, seg[2] as the speaker
                # and seg[4] as the text.
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm" else 0,
                    ))

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_jsonl(output_dir / "recordings.jsonl")
        supervisions.to_jsonl(output_dir / "supervisions.jsonl")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_rir_noise(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    parts: Sequence[str] = ("point_noise", "iso_noise", "real_rir", "sim_rir"),
) -> Dict[str, Dict[str, Union[RecordingSet, CutSet]]]:
    """
    Create recording manifests for the requested parts of the RIR Noise corpus.

    :param corpus_dir: Pathlike, the directory holding the dataset; it must exist.
    :param output_dir: Pathlike, the directory where the manifests are written
        (created if missing). Nothing is written when it is ``None``.
    :param parts: Sequence[str] (or a single str), the parts to prepare. The code
        handles point-source noises (``point_noise``), isotropic noises
        (``iso_noise``), real RIRs (``real_rir``) and simulated RIRs (``sim_rir``).
        Each part name is looked up in ``PARTS`` to find its sub-directory.
    :return: a dict mapping each prepared part to ``{'recordings': RecordingSet}``.
    :raises ValueError: if ``parts`` is empty.
    :raises AssertionError: if ``corpus_dir`` or a part's directory is missing.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if not parts:
        raise ValueError("No parts specified for manifest preparation.")
    if isinstance(parts, str):
        parts = [parts]

    # Parts that are a plain scan for "*.wav", optionally restricted to files
    # whose stem contains the given keyword (None means "take every file").
    keyword_by_part = {"point_noise": None, "iso_noise": "noise", "real_rir": "rir"}

    manifests = defaultdict(dict)
    for part in parts:
        logging.info(f"Preparing {part}...")
        audio_dir = corpus_dir / PARTS[part]
        assert audio_dir.is_dir(), f"No such directory: {audio_dir}"

        if part == "sim_rir":
            # The "small", "medium", and "large" rooms have the same file names, so
            # the room type is prefixed to the recording ID to keep them distinct.
            room_recordings = []
            for room_type in ("small", "medium", "large"):
                room_dir = audio_dir / f"{room_type}room"
                for wav_path in room_dir.rglob("*.wav"):
                    room_recordings.append(
                        Recording.from_file(
                            wav_path, recording_id=f"{room_type}-{wav_path.stem}"
                        )
                    )
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                room_recordings
            )
        elif part in keyword_by_part:
            keyword = keyword_by_part[part]
            wav_paths = [
                wav_path
                for wav_path in audio_dir.rglob("*.wav")
                if keyword is None or keyword in wav_path.stem
            ]
            manifests[part]["recordings"] = RecordingSet.from_recordings(
                Recording.from_file(wav_path) for wav_path in wav_paths
            )

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, part_manifests in manifests.items():
            for key, manifest in part_manifests.items():
                manifest.to_file(output_dir / f"{key}_{part}.json")

    return manifests
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic Corpus.

    For each of the splits ``train``, ``devtest`` and ``evaltest`` we create two
    manifests: one with recordings, and the other one with text supervisions.

    NOTE(review): this file contains a second, later definition that is also named
    ``prepare_callhome_egyptian`` (taking ``sph2pipe_path``); when the module is
    imported the later definition replaces this one.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written
        (created if missing). Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4
            )
            for p in tqdm(audio_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            # The transcript file name (without extension) is the recording ID.
            recording_id = p.stem
            # Counts only the segments that are kept, so IDs stay contiguous.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Decimal arithmetic avoids float rounding noise in the duration.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    )
                )
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {"recordings": recordings, "supervisions": supervisions}

    return manifests
def prepare_single_mtedx_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    language: str = "language",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single MTEDx language.

    This function works as follows:

        - For each split (train, valid, test) it collects the .flac files found
            in ``data/{split}/wav``.
        - Then, it processes every file in ``data/{split}/vtt`` with
            ``_filename_to_supervisions`` (run in a thread pool) to obtain the
            supervisions for the audio.
        - Supervisions/recordings without a counterpart are removed and the
            supervisions are trimmed to the recordings before validation.

    :param corpus_dir: Path to the root of the MTEDx download
    :param output_dir: Path where the manifests are stored as .json files
        (created if missing). The "valid" split is saved under the name "dev".
    :param language: The two-letter language code.
    :param num_jobs: Number of threads to use when preparing data.
    :return: A dict keyed by split name ("train", "valid", "test"); each value is
        a dict with the keys "recordings" and "supervisions".
    """
    # Accept any path-like object, not only ``str``.
    corpus_dir = Path(corpus_dir)
    if output_dir is not None:
        output_dir = Path(output_dir)

    manifests = defaultdict(dict)

    with ThreadPoolExecutor(num_jobs) as ex:
        for split in ("train", "valid", "test"):
            audio_dir = corpus_dir / f"data/{split}/wav"
            recordings = RecordingSet.from_recordings(
                Recording.from_file(p) for p in audio_dir.glob("*.flac")
            )
            if len(recordings) == 0:
                logging.warning(f"No .flac files found in {audio_dir}")

            text_dir = corpus_dir / f"data/{split}/vtt"
            futures = [
                ex.submit(_filename_to_supervisions, p, language)
                for p in text_dir.glob("*")
            ]

            supervisions = []
            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                # A ``None`` result means the file produced no supervisions.
                if result is None:
                    continue
                supervisions.extend(result)

            if len(supervisions) == 0:
                logging.warning(f"No supervisions found in {text_dir}")
            supervisions = SupervisionSet.from_segments(supervisions)

            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            manifests[split] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

            if output_dir is not None:
                output_dir.mkdir(parents=True, exist_ok=True)
                # Files on disk use the conventional "dev" name for the "valid"
                # split; the returned dict keeps the original split name.
                save_split = "dev" if split == "valid" else split
                recordings.to_file(
                    output_dir / f"recordings_{language}_{save_split}.json"
                )
                supervisions.to_file(
                    output_dir / f"supervisions_{language}_{save_split}.json"
                )

    return dict(manifests)
def prepare_single_babel_language(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    no_eval_ok: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares manifests using a single BABEL LDC package.

    This function works like the following:

        - first, it will scan `corpus_dir` for a directory named `conversational`;
            if there is more than one, it picks the first one (and emits a warning)
        - then, it will look for the `dev`, `eval`, and `training` splits inside
            (a warning is emitted for a split without audio; an `eval` split
            without audio is skipped silently when ``no_eval_ok`` is set)
        - finally, it scans the selected location for SPHERE/WAV audio files
            and transcripts.

    :param corpus_dir: Path to the root of the LDC package with a BABEL language.
    :param output_dir: Path where the manifests are stored as .json files
        (created if missing). The "training" split is saved under the name "train".
    :param no_eval_ok: When set to True, this function won't emit a warning
        that the eval set was not found.
    :return: A dict keyed by split name; each value is a dict with the keys
        "recordings" and "supervisions".
    :raises ValueError: if no `conversational` directory is found, or if a
        transcript segment cannot be parsed.
    """
    manifests = defaultdict(dict)

    # Auto-detect the location of the "conversational" directory
    orig_corpus_dir = corpus_dir
    corpus_dir = Path(corpus_dir)
    corpus_dir = [d for d in corpus_dir.rglob("conversational") if d.is_dir()]
    if not corpus_dir:
        raise ValueError(
            f"Could not find 'conversational' directory anywhere inside '{orig_corpus_dir}' "
            f"- please check your path."
        )
    if len(corpus_dir) > 1:
        # People have very messy data distributions, the best we can do is warn them.
        logging.warning(
            f"It seems there are multiple 'conversational' directories in '{orig_corpus_dir}' - "
            f"we are selecting the first one only ({corpus_dir[0]}). Please ensure that you provided "
            f"the path to a single language's dir, and not the root dir for all BABEL languages."
        )
    corpus_dir = corpus_dir[0].parent

    # The language code is read from transcript file names. It is initialised here
    # so that a split without transcripts can reuse the code seen in an earlier
    # split, and so that it is never unbound when manifests are written.
    lang_code = None

    for split in ("dev", "eval", "training"):
        audio_dir = corpus_dir / f"conversational/{split}/audio"
        sph_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.sph")
        )
        wav_recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in audio_dir.glob("*.wav")
        )
        recordings = combine(sph_recordings, wav_recordings)
        if len(recordings) == 0:
            if split == "eval" and no_eval_ok:
                continue
            logging.warning(f"No SPHERE or WAV files found in {audio_dir}")

        supervisions = []
        text_dir = corpus_dir / f"conversational/{split}/transcription"
        for p in tqdm.tqdm(text_dir.glob("*")):
            # p.stem -> BABEL_BP_101_10033_20111024_205740_inLine
            # parts:
            #   0 -> BABEL
            #   1 -> BP
            #   2 -> <language-code> (101)
            #   3 -> <speaker-id> (10033)
            #   4 -> <date> (20111024)
            #   5 -> <hour> (205740)
            #   6 -> channel (inLine) ; inLine <=> A ; outLine <=> B ; "scripted" <=> A
            p0, p1, lang_code, speaker, date, hour, channel, *_ = p.stem.split("_")
            channel = {"inLine": "A", "outLine": "B"}.get(channel, "A")
            # Fix problematic segments that have two consecutive timestamp lines
            # with no transcript in between
            lines = p.read_text().splitlines() + [""]
            lines = [
                prev_l
                for prev_l, l in sliding_window(2, lines)
                if not (prev_l.startswith("[") and l.startswith("["))
            ]
            # Add a None at the end so that the last timestamp is only used as
            # "next_timestamp" and ends the iteration (otherwise we'd lose the
            # last segment).
            lines += [None]
            # Even-indexed lines are timestamps, odd-indexed lines are texts; each
            # segment ends where the next timestamp begins.
            for (timestamp, text), (next_timestamp, _) in sliding_window(
                2, zip(lines[::2], lines[1::2])
            ):
                try:
                    # Timestamps are wrapped in brackets, e.g. "[12.34]".
                    start = float(timestamp[1:-1])
                    end = float(next_timestamp[1:-1])
                    # Create supervision
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{lang_code}_{speaker}_{channel}_{date}_{hour}_{int(100 * start):06}",
                            recording_id=p.stem,
                            start=start,
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=normalize_text(text),
                            language=BABELCODE2LANG[lang_code],
                            speaker=f"{lang_code}_{speaker}_{channel}",
                        )
                    )
                except Exception as e:
                    logging.warning(f"Error while parsing segment. Message: {str(e)}")
                    # Chain the original error so the root cause stays visible.
                    raise ValueError(
                        f"Too many errors while parsing segments (file: '{p}'). "
                        f"Please check your data or increase the threshold."
                    ) from e
        supervisions = deduplicate_supervisions(supervisions)

        if len(supervisions) == 0:
            logging.warning(f"No supervisions found in {text_dir}")
        supervisions = SupervisionSet.from_segments(supervisions)

        # Fixing and validation of manifests
        if split == "eval" and len(supervisions) == 0:
            # We won't remove missing recordings for the "eval" split in cases where
            # the user does not have its corresponding transcripts (very likely).
            pass
        else:
            recordings, supervisions = remove_missing_recordings_and_supervisions(
                recordings, supervisions
            )
            supervisions = trim_supervisions_to_recordings(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        manifests[split] = {"recordings": recordings, "supervisions": supervisions}

        if output_dir is not None:
            if lang_code is None:
                # No transcript file has been seen so far, hence the language
                # (part of the output file name) is unknown.
                logging.warning(
                    f"Could not determine the language for split '{split}' "
                    f"(no transcripts found in {text_dir}); "
                    f"its manifests are not written to disk."
                )
                continue
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            language = BABELCODE2LANG[lang_code]
            save_split = "train" if split == "training" else split
            recordings.to_file(output_dir / f"recordings_{language}_{save_split}.json")
            supervisions.to_file(
                output_dir / f"supervisions_{language}_{save_split}.json"
            )

    return dict(manifests)
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the CallHome American English corpus.

    For each of the splits ``evaltest``, ``train`` and ``devtest`` we create two
    manifests: one with recordings, and the other one with text supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written
        (created if missing). Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir)
        paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["evaltest", "train", "devtest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "data" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            Recording.from_file(
                p, relative_path_depth=None if absolute_paths else 4
            )
            for p in tqdm(audio_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            recording_id = p.stem
            idx = 0

            # First pass: keep well-formed "<start> <end> <spk>: <text>" lines and
            # glue any line that does not parse onto the previously kept line
            # (it is treated as a continuation of that line's text).
            postprocessed_lines = []
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                except (InvalidOperation, ValueError):
                    # ValueError: fewer than 4 fields; InvalidOperation: the first
                    # two fields are not numbers.
                    if postprocessed_lines:
                        postprocessed_lines[-1] = postprocessed_lines[-1] + " " + line
                    else:
                        # Nothing to attach the text to (previously an IndexError).
                        logging.warning(
                            f"Skipping unparsable line at the beginning of '{p}': {line}"
                        )
                    continue
                if duration <= 0:
                    continue
                postprocessed_lines.append(line)

            # Second pass: turn the merged lines into supervision segments.
            for line in postprocessed_lines:
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        # Speaker "A..." maps to channel 0, "B..." to channel 1, etc.
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{idx:0>5d}",
                        text=text,
                    )
                )
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(
                output_dir / f"callhome-english_recordings_{split}.jsonl.gz"
            )
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz"
            )

        manifests[split] = {"recordings": recordings, "supervisions": supervisions}

    return manifests
def prepare_adept(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build the recording and supervision manifests for the ADEPT corpus.

    Recordings are collected from ``<corpus_dir>/wav_44khz`` and transcripts from
    ``<corpus_dir>/txt``; the interpretations are read from
    ``<corpus_dir>/adept_prompts.json``.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (created if missing). Nothing is written when it is ``None``.
    :return: a dict with the keys 'recordings' and 'supervisions'.
    :raises AssertionError: if ``corpus_dir`` is not a directory.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            path=path,
            # converts:
            #   path/to/ADEPT/wav_44khz/propositional_attitude/surprise/ad01_0204.wav
            # to:
            #   propositional_attitude_surprise_ad01_0204
            # (built from path components, so it does not depend on the
            # platform's path separator)
            recording_id="_".join(
                path.relative_to(path.parent.parent.parent).with_suffix("").parts
            ),
        )
        for path in (corpus_dir / "wav_44khz").rglob("*.wav")
    )

    supervisions = []

    with open(corpus_dir / "adept_prompts.json", encoding="utf-8") as f:
        interpretation_map = json.load(f)

    for path in (corpus_dir / "txt").rglob("*.txt"):
        # <annotation_type>/<label>/<prompt_id>.txt
        annotation_type, label, prompt_id = (
            path.relative_to(path.parent.parent.parent).with_suffix("").parts
        )
        # The part of the prompt ID before the first underscore identifies
        # the speaker.
        speaker_id = "ADEPT_" + prompt_id.split("_")[0]
        recording_id = "_".join((annotation_type, label, prompt_id))
        interpretation_group = interpretation_map.get(annotation_type)
        interpretation = (
            interpretation_group[prompt_id][label] if interpretation_group else None
        )
        recording = recordings[recording_id]
        custom = {"type": annotation_type, "label": label, "prompt_id": prompt_id}
        if interpretation:
            # label is "interpretation_1", "interpretation_2", ..., "middle", "end", etc
            # Interpretations' labels meaning is defined by their textual realisation:
            #  {..., "middle": "Galleries are WHAT on Thursdays?", "end": "Galleries are free WHEN?"}
            custom["text"] = interpretation
        supervisions.append(
            SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=path.read_text(),
                language="English",
                speaker=speaker_id,
                custom=custom,
            )
        )

    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the target directory exists before writing into it.
        output_dir.mkdir(parents=True, exist_ok=True)
        supervisions.to_file(output_dir / "adept_supervisions.json")
        recordings.to_file(output_dir / "adept_recordings.json")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Create the L2 Arctic recording and supervision manifests.

    Every ``*.wav`` file under ``corpus_dir`` becomes a recording and every
    ``*.txt`` file becomes one supervision spanning its whole recording.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (created if missing). Nothing is written when it is ``None``.
    :return: a dict with keys "read" and "suitcase". Each holds another dict of
        {'recordings': ..., 'supervisions': ...}; "suitcase" collects the items
        whose recording ID contains "suitcase_corpus", "read" all the others.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description()

    def _recording_for(wav_path):
        # Example ID: zhaa-arctic_b0126
        speaker_dir = wav_path.parent.parent.name.lower()
        return Recording.from_file(
            wav_path, recording_id=f"{speaker_dir}-{wav_path.stem}"
        )

    recordings = RecordingSet.from_recordings(
        _recording_for(wav_path) for wav_path in corpus_dir.rglob("*.wav")
    )

    segments = []
    for txt_path in corpus_dir.rglob("*.txt"):
        # One utterance (line) per file
        transcript = txt_path.read_text().strip()

        if "suitcase_corpus" in txt_path.parts:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = txt_path.stem
            seg_id = f"suitcase_corpus-{speaker}"
        else:
            # <root>/ABA/transcript/arctic_a0051.txt -> aba
            speaker = txt_path.parent.parent.name.lower()
            seg_id = f"{speaker}-{txt_path.stem}"

        segments.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=transcript,
                language="English",
                speaker=speaker,
                gender=speaker_meta[speaker]["gender"],
                custom={"accent": speaker_meta[speaker]["native_lang"]},
            )
        )

    supervisions = SupervisionSet.from_segments(segments)
    validate_recordings_and_supervisions(recordings, supervisions)

    def _is_suitcase(manifest_id):
        return "suitcase_corpus" in manifest_id

    read_split = {
        "recordings": recordings.filter(lambda r: not _is_suitcase(r.id)),
        "supervisions": supervisions.filter(
            lambda s: not _is_suitcase(s.recording_id)
        ),
    }
    suitcase_split = {
        "recordings": recordings.filter(lambda r: _is_suitcase(r.id)),
        "supervisions": supervisions.filter(lambda s: _is_suitcase(s.recording_id)),
    }
    splits = {"read": read_split, "suitcase": suitcase_split}

    if output_dir is not None:
        output_dir = Path(output_dir)
        makedirs(output_dir, exist_ok=True)
        for split_name, split_manifests in splits.items():
            split_manifests["recordings"].to_file(
                output_dir / f"l2-arctic_recordings_{split_name}.jsonl.gz"
            )
            split_manifests["supervisions"].to_file(
                output_dir / f"l2-arctic_supervisions_{split_name}.jsonl.gz"
            )

    return splits
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    Every ``*.wav`` file under ``corpus_dir`` becomes a recording; supervisions
    are read from the ``txt.done.data`` files. Recordings and supervisions
    without a counterpart are removed before validation.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests
        (created if missing). Nothing is written when it is ``None``.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    :raises AssertionError: if ``corpus_dir`` is not a directory.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # The ID is "<speaker>-<wav file stem>", where the speaker is derived
        # from the name of the directory two levels above the file.
        Recording.from_file(
            wav, recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}"
        )
        for wav in corpus_dir.rglob("*.wav")
    )

    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)

        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int(
                (path.parent / "voice.feats")
                .read_text()
                .splitlines()[1]
                .replace("age ", "")
                .strip()
            )
        except (OSError, IndexError, ValueError):
            # The age is optional meta-data: the file may be missing/unreadable
            # (OSError), too short (IndexError) or hold a non-integer age
            # (ValueError). Other errors, e.g. KeyboardInterrupt, propagate.
            age = None

        for raw_line in lines:
            # get rid of parentheses and whitespaces on the edges
            entry = raw_line[2:-2]
            seg_id, text = entry.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            # Segments with "arctic" in their ID are treated as English speech;
            # the speaker's language is then stored as the accent.
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                )
            )

    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions
    )
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Make sure the target directory exists before writing into it.
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "cmu-indic_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "cmu-indic_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic Corpus.

    For each of the splits ``train``, ``devtest`` and ``evaltest`` we create two
    manifests: one with recordings, and the other one with text supervisions.

    NOTE(review): an earlier definition in this file has the same name (taking
    ``absolute_paths`` instead of ``sph2pipe_path``); being the later one, this
    definition is the one that remains bound after the module is imported.

    :param audio_dir: Path to the audio package; the audio is searched for in
        its ``callhome/arabic/<split>`` sub-directories.
    :param transcript_dir: Path to the transcripts package; the transcripts are
        searched for in ``callhome_arabic_trans_970711/transcrp/<split>/roman``.
    :param output_dir: Directory where the manifests should be written
        (created if missing). Can be omitted to avoid writing.
    :param sph2pipe_path: Forwarded to ``make_recording_callhome``. When provided,
        we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ["train", "devtest", "evaltest"]:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / "callhome/arabic" / split.replace("evaltest", "evltest"),
            "*.sph",
        )
        recordings = RecordingSet.from_recordings(
            make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
            for p in tqdm(audio_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            # The transcript file name (without extension) is the recording ID.
            recording_id = p.stem
            # Counts only the segments that are kept, so IDs stay contiguous.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Decimal arithmetic avoids float rounding noise in the duration.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                start = float(start)
                supervisions.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{idx}",
                        recording_id=recording_id,
                        start=start,
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    )
                )
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = remove_missing_recordings_and_supervisions(
            recordings, supervisions
        )
        supervisions = trim_supervisions_to_recordings(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {"recordings": recordings, "supervisions": supervisions}

    return manifests