def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
    link_previous_utt: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :param link_previous_utt: If true adds previous utterance id to supervisions.
        Useful for reconstructing chains of utterances as they were read.
        If previous utterance was skipped from LibriTTS datasets previous_utt label is None.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    # Normalize `dataset_parts` to a list of part names.
    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir,
                                             prefix="libritts")

    # Build speaker-id -> gender map from the corpus-level SPEAKERS.txt.
    # Contents of the file (';'-prefixed lines are headers and are skipped):
    # ;ID |SEX| SUBSET |MINUTES| NAME
    # 14 | F | train-clean-360 | 25.03 | ...
    # 16 | F | train-clean-360 | 25.11 | ...
    # 17 | M | train-clean-360 | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (
            line.split("|")
            for line in (corpus_dir / "SPEAKERS.txt").read_text().splitlines()
            if not line.startswith(";"))
    }

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir, prefix="libritts"):
            logging.info(
                f"LibriTTS subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        # Scan every .wav under the subset directory in parallel.
        recordings = RecordingSet.from_dir(part_path, "*.wav", num_jobs=num_jobs)

        supervisions = []
        for trans_path in tqdm(
                part_path.rglob("*.trans.tsv"),
                desc="Scanning transcript files (progbar per speaker)",
                leave=False,
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian. Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief. Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata; the SNR value is the last
            # whitespace-separated field of each line.
            utt2snr = [(rec_id, float(snr)) for rec_id, *_, snr in map(
                str.split,
                (trans_path.parent /
                 trans_path.name.replace(".trans.tsv", ".book.tsv")
                 ).read_text().splitlines(),
            )]
            # keeps the order of uttids as they appear in book.tsv
            uttids = [r for r, _ in utt2snr]
            utt2snr = dict(utt2snr)

            if link_previous_utt:
                # Using the property of sorted keys to find previous utterance
                # The keys has structure speaker_book_x_y e.g. 1089_134691_000004_000001
                # Maps each uttid to the one preceding it in book.tsv order;
                # the first utterance maps to None.
                utt2prevutt = dict(zip(uttids + [None], [None] + uttids))

            prev_rec_id = None
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split("\t")
                spk_id = rec_id.split("_")[0]
                customd = {"orig_text": orig_text, "snr": utt2snr[rec_id]}
                if link_previous_utt:
                    # all recordings ids should be in the book.csv
                    # but they are some missing e.g. 446_123502_000030_000003
                    prev_utt = utt2prevutt.get(rec_id, None)
                    # previous utterance has to be present in trans.csv - otherwise it was skipped
                    prev_utt = prev_utt if prev_utt == prev_rec_id else None
                    customd["prev_utt"] = prev_utt
                    prev_rec_id = rec_id
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language="English",
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom=customd,
                    ))

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_json(output_dir /
                                 f"libritts_supervisions_{part}.json")
            recordings.to_json(output_dir /
                               f"libritts_recordings_{part}.json")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
def prepare_icsi(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param audio_dir: Pathlike, the path which holds the audio data
    :param transcripts_dir: Pathlike, the path which holds the transcripts data
    :param output_dir: Pathlike, the path where to write the manifests - `None` means manifests aren't stored on disk.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :return: a Dict whose key is ('train', 'dev', 'test'), and the values are dicts of manifests under keys 'recordings' and 'supervisions'.
    """
    audio_dir = Path(audio_dir)
    transcripts_dir = Path(transcripts_dir)

    assert audio_dir.is_dir(), f"No such directory: {audio_dir}"
    assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}"
    assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing ICSI transcripts")
    annotations, channel_to_idx_map = parse_icsi_annotations(
        transcripts_dir, normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")
    channels = "".join(MIC_TO_CHANNELS[mic])
    if mic == "ihm" or mic == "mdm":
        # Multi-channel mics: group the per-channel .sph files per meeting.
        audio_paths = audio_dir.rglob(f"chan[{channels}].sph")
        audio = prepare_audio_grouped(
            list(audio_paths), channel_to_idx_map if mic == "ihm" else None)
    elif mic == "sdm" or mic == "ihm-mix":
        # Single-channel variants; ihm-mix has no .sph channels, so fall back to .wav.
        audio_paths = (audio_dir.rglob(f"chan[{channels}].sph")
                       if len(channels) else audio_dir.rglob("*.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (prepare_supervision_ihm(
        audio, annotations, channel_to_idx_map) if mic == "ihm" else
                   prepare_supervision_other(audio, annotations))

    manifests = defaultdict(dict)

    for part in ["train", "dev", "test"]:
        # Get recordings for current data split.
        # `part=part` binds the loop variable eagerly so the predicate stays
        # correct even if `filter` evaluates lazily (after the loop advances).
        audio_part = audio.filter(lambda x, part=part: x.id in PARTITIONS[part])
        supervision_part = supervision.filter(
            lambda x, part=part: x.recording_id in PARTITIONS[part])

        # Fix and validate BEFORE writing, so the manifests stored on disk are
        # identical to the ones returned to the caller. (Previously the raw,
        # unfixed manifests were written and only then fixed/validated.)
        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir / f"recordings_{part}.jsonl")
            supervision_part.to_file(output_dir / f"supervisions_{part}.jsonl")

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)
def prepare_mobvoihotwords(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    # NOTE(review): a second `prepare_mobvoihotwords` is defined later in this
    # file with the same name; at import time that later definition shadows
    # this one, so this version is effectively dead code — confirm which of
    # the two should be kept.
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        # Each split is stored as two JSON lists: positive ('p_') and
        # negative ('n_') examples.
        for prefix in ['p_', 'n_']:
            prefixed_part = prefix + part
            json_path = corpus_dir / 'mobvoi_hotword_dataset_resources' / f'{prefixed_part}.json'
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
            for entry in json_data:
                idx = entry['utt_id']
                # Fall back to the utterance id when no speaker id is given.
                speaker = idx if entry['speaker_id'] is None else entry[
                    'speaker_id']
                audio_path = corpus_dir / 'mobvoi_hotword_dataset' / f'{idx}.wav'
                # keyword_id 0/1 are the two wake words; -1 marks free text.
                text = 'FREETEXT'
                if entry['keyword_id'] == 0:
                    text = 'HiXiaowen'
                elif entry['keyword_id'] == 1:
                    text = 'NihaoWenwen'
                else:
                    assert entry['keyword_id'] == -1
                # Best-effort: skip (and log) entries whose audio is missing.
                if not audio_path.is_file():
                    logging.warning(f'No such file: {audio_path}')
                    continue
                recording = Recording.from_file(audio_path)
                recordings.append(recording)
                segment = SupervisionSegment(id=idx,
                                             recording_id=idx,
                                             start=0.0,
                                             duration=recording.duration,
                                             channel=0,
                                             language='Chinese',
                                             speaker=speaker,
                                             text=text.strip())
                supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set
        }

    return manifests
def prepare_mobvoihotwords(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    dataset_parts = ["train", "dev", "test"]
    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        # FIX: pass prefix="mobvoi" — the manifests below are saved as
        # "mobvoi_*_{part}.jsonl.gz", so without the prefix this cache lookup
        # (and the manifests_exist check below) could never find the files
        # this very function writes. The libritts recipe in this file passes
        # its prefix consistently in the same way.
        # NOTE(review): the files are written with a ".jsonl.gz" suffix; if
        # these helpers default to a different suffix, a suffix argument may
        # be needed as well — confirm against their signatures.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir,
                                             prefix="mobvoi")

    for part in dataset_parts:
        logging.info(f"Preparing MobvoiHotwords subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir, prefix="mobvoi"):
            logging.info(
                f"MobvoiHotwords subset: {part} already prepared - skipping.")
            continue
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        recordings = []
        supervisions = []
        # Each split is stored as two JSON lists: positive ("p_") and
        # negative ("n_") examples.
        for prefix in ["p_", "n_"]:
            prefixed_part = prefix + part
            json_path = (corpus_dir / "mobvoi_hotword_dataset_resources" /
                         f"{prefixed_part}.json")
            with open(json_path, "r", encoding="utf-8") as f:
                json_data = json.load(f)
            for entry in json_data:
                idx = entry["utt_id"]
                # Fall back to the utterance id when no speaker id is given.
                speaker = (idx if entry["speaker_id"] is None else
                           entry["speaker_id"])
                audio_path = corpus_dir / "mobvoi_hotword_dataset" / f"{idx}.wav"
                # keyword_id 0/1 are the two wake words; -1 marks free text.
                text = "FREETEXT"
                if entry["keyword_id"] == 0:
                    text = "HiXiaowen"
                elif entry["keyword_id"] == 1:
                    text = "NihaoWenwen"
                else:
                    assert entry["keyword_id"] == -1
                # Best-effort: skip (and log) entries whose audio is missing.
                if not audio_path.is_file():
                    logging.warning(f"No such file: {audio_path}")
                    continue
                recording = Recording.from_file(audio_path)
                recordings.append(recording)
                segment = SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="Chinese",
                    speaker=speaker,
                    text=text.strip(),
                )
                supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir /
                                    f"mobvoi_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir /
                                  f"mobvoi_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames.
    # FIX: raw strings — the originals were plain strings whose "\d", "\-",
    # "\w", "\S" are invalid escape sequences (SyntaxWarning on Python >= 3.12).
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile(r'Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    # NOTE(review): the two patterns below are defined but never used in this
    # function — kept for now, confirm whether they can be dropped.
    heroico_recitations_devtest_path_pattern = re.compile(r'Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile(r'Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+")
    usma_native_path_pattern = re.compile(r'usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+")
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    transcripts = defaultdict(dict)
    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir,
                                          heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            # Answers without a transcript are marked None and skipped later.
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='answers',
                                             utterance_id=utt_id,
                                             transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            # NOTE(review): no `continue` after the demo-pattern check below,
            # so a demo mismatch alone still falls through and overwrites the
            # None entry — looks unintended, but preserved here; confirm.
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='heroico-recitations',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            # heroico recitation repeats (devtest fold).
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            # FIX: look transcripts up under the 'heroico-recitations' prefix.
            # The transcripts dict only ever stores keys prefixed 'answers',
            # 'heroico-recitations' or 'usma' (see above), so the previous
            # 'heroico-recitations-repeats-<pid>' key was never populated and
            # the defaultdict silently returned {} as the "transcript".
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(f'No such file: {wav_file}')

    audio_paths = speech_dir.rglob('*.wav')
    audio_files = [w for w in audio_paths]

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            prompt_id = wav_path.stem
            # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
            # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
            # NOTE(review): this is the legacy torchaudio tuple-returning API;
            # newer torchaudio returns a single metadata object — confirm the
            # pinned torchaudio version.
            info = torchaudio.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join(
                [uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info[0],
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=(metadata[idx].audio_info.length /
                          metadata[idx].audio_info.rate)
            ) for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='Spanish',
                speaker=idx.split('-')[-2],
                text=metadata[idx].text
            ) for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
def prepare_adept(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
):
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            path=path,
            # converts:
            #   path/to/ADEPT/wav_44khz/propositional_attitude/surprise/ad01_0204.wav
            # to:
            #   propositional_attitude_surprise_ad01_0204
            # FIX: join Path.parts instead of str.replace("/", "_") so the id
            # is OS-separator independent (str() yields "\\" on Windows).
            recording_id="_".join(
                path.relative_to(
                    path.parent.parent.parent).with_suffix("").parts),
        ) for path in (corpus_dir / "wav_44khz").rglob("*.wav"))

    supervisions = []
    with open(corpus_dir / "adept_prompts.json") as f:
        interpretation_map = json.load(f)

    for path in (corpus_dir / "txt").rglob("*.txt"):
        # Same three trailing components: <annotation_type>/<label>/<prompt_id>.txt
        # (FIX: Path.parts instead of split("/") — see recording_id above.)
        annotation_type, label, prompt_id = path.relative_to(
            path.parent.parent.parent).with_suffix("").parts
        speaker_id = "ADEPT_" + prompt_id.split("_")[0]
        recording_id = "_".join((annotation_type, label, prompt_id))
        interpretation_group = interpretation_map.get(annotation_type)
        interpretation = (interpretation_group[prompt_id][label]
                          if interpretation_group else None)
        recording = recordings[recording_id]
        custom = {
            "type": annotation_type,
            "label": label,
            "prompt_id": prompt_id
        }

        if interpretation:
            # label is "interpretation_1", "interpretation_2", ..., "middle", "end", etc
            # Interpretations' labels meaning is defined by their textual realisation:
            # {..., "middle": "Galleries are WHAT on Thursdays?", "end": "Galleries are free WHEN?"}
            custom["text"] = interpretation

        supervisions.append(
            SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=path.read_text(),
                language="English",
                speaker=speaker_id,
                custom=custom,
            ))

    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # FIX: create the output directory first — every other recipe in this
        # file does so, and to_file fails on a nonexistent directory.
        output_dir.mkdir(parents=True, exist_ok=True)
        supervisions.to_file(output_dir / "adept_supervisions.json")
        recordings.to_file(output_dir / "adept_recordings.json")

    return {"recordings": recordings, "supervisions": supervisions}