def prepare_aishell4( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'. """ if not is_module_available("textgrid"): raise ValueError( "To prepare AISHELL-4 data, please 'pip install textgrid' first.") import textgrid corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" manifests = defaultdict(dict) if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) global_spk_id = {} for part in ["train_L", "train_M", "train_S", "test"]: recordings = [] supervisions = [] wav_path = corpus_dir / part / "wav" for audio_path in wav_path.rglob("*.flac"): idx = audio_path.stem try: tg = textgrid.TextGrid.fromFile( f"{corpus_dir}/{part}/TextGrid/{idx}.TextGrid") except ValueError: logging.warning( f"{idx} has annotation issues. Skipping this recording.") continue recording = Recording.from_file(audio_path) recordings.append(recording) for tier in tg.tiers: local_spk_id = tier.name key = (idx, local_spk_id) if key not in global_spk_id: global_spk_id[key] = f"SPK{len(global_spk_id)+1:04d}" spk_id = global_spk_id[key] for j, interval in enumerate(tier.intervals): if interval.mark != "": start = interval.minTime end = interval.maxTime text = interval.mark segment = SupervisionSegment( id=f"{idx}-{spk_id}-{j}", recording_id=idx, start=start, duration=round(end - start, 4), channel=0, language="Chinese", speaker=spk_id, text=text.strip(), ) supervisions.append(segment) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_file(output_dir / f"aishell4_supervisions_{part}.jsonl.gz") recording_set.to_file(output_dir / f"aishell4_recordings_{part}.jsonl.gz") manifests[part] = { "recordings": recording_set, "supervisions": supervision_set } return manifests
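# Usage sketch for prepare_aishell4 (the corpus path below is hypothetical;
# `pip install textgrid` is required, and `corpus_dir` must contain the
# train_L/, train_M/, train_S/ and test/ folders):
#
#     manifests = prepare_aishell4("/data/aishell4", output_dir="manifests")
#     test_recs = manifests["test"]["recordings"]
#     test_sups = manifests["test"]["supervisions"]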
def prepare_icsi( audio_dir: Pathlike, transcripts_dir: Pathlike, output_dir: Optional[Pathlike] = None, mic: Optional[str] = "ihm", normalize_text: str = "kaldi", ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param audio_dir: Pathlike, the path which holds the audio data :param transcripts_dir: Pathlike, the path which holds the transcripts data :param output_dir: Pathlike, the path where to write the manifests - `None` means manifests aren't stored on disk. :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use. :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text :return: a Dict whose key is ('train', 'dev', 'test'), and the values are dicts of manifests under keys 'recordings' and 'supervisions'. """ audio_dir = Path(audio_dir) transcripts_dir = Path(transcripts_dir) assert audio_dir.is_dir(), f"No such directory: {audio_dir}" assert transcripts_dir.is_dir(), f"No such directory: {transcripts_dir}" assert mic in MIC_TO_CHANNELS.keys(), f"Mic {mic} not supported" if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) logging.info("Parsing ICSI transcripts") annotations, channel_to_idx_map = parse_icsi_annotations( transcripts_dir, normalize=normalize_text) # Audio logging.info("Preparing recording manifests") channels = "".join(MIC_TO_CHANNELS[mic]) if mic == "ihm" or mic == "mdm": audio_paths = audio_dir.rglob(f"chan[{channels}].sph") audio = prepare_audio_grouped( list(audio_paths), channel_to_idx_map if mic == "ihm" else None) elif mic == "sdm" or mic == "ihm-mix": audio_paths = (audio_dir.rglob(f"chan[{channels}].sph") if len(channels) else audio_dir.rglob("*.wav")) audio = prepare_audio_single(list(audio_paths)) # Supervisions logging.info("Preparing supervision manifests") supervision = (prepare_supervision_ihm( audio, annotations, channel_to_idx_map) if mic == "ihm" else prepare_supervision_other(audio, annotations)) manifests = defaultdict(dict) for part in ["train", "dev", "test"]: # Get recordings for current data split audio_part = audio.filter(lambda x: x.id in PARTITIONS[part]) supervision_part = supervision.filter( lambda x: x.recording_id in PARTITIONS[part]) # Write to output directory if a path is provided if output_dir is not None: audio_part.to_file(output_dir / f"icsi-{mic}_recordings_{part}.jsonl.gz") supervision_part.to_file( output_dir / f"icsi-{mic}_supervisions_{part}.jsonl.gz") audio_part, supervision_part = fix_manifests(audio_part, supervision_part) validate_recordings_and_supervisions(audio_part, supervision_part) # Combine all manifests into one dictionary manifests[part] = { "recordings": audio_part, "supervisions": supervision_part } return dict(manifests)
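# Usage sketch for prepare_icsi (paths are hypothetical). Note that the audio
# and the transcripts live in two separate directories, and that `mic`
# selects between "ihm", "ihm-mix", "sdm" and "mdm" audio:
#
#     manifests = prepare_icsi(
#         audio_dir="/data/icsi/audio",
#         transcripts_dir="/data/icsi/transcripts",
#         output_dir="manifests",
#         mic="sdm",
#     )
#     train = manifests["train"]  # {'recordings': ..., 'supervisions': ...}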
def prepare_ami(
    data_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is ('train', 'dev', 'eval'), and the value is Dicts with keys 'recordings' and 'supervisions'.
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    annotation_lists = parse_ami_annotations(data_dir / 'annotations.gzip')
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {
        (filename.split('.')[0], int(filename[-5])): annotations
        for filename, annotations in annotation_lists.items()
    }
    wav_dir = data_dir / 'wav_db'
    audio_paths = wav_dir.rglob('*.wav')
    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby
    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    manifests = defaultdict(dict)

    for part in dataset_parts:
        # Audio
        recordings = []
        for session_name, channel_paths in channel_wavs.items():
            if session_name not in dataset_parts[part]:
                continue
            audio_info = torchaudio.info(str(channel_paths[0]))[0]
            recordings.append(
                Recording(
                    id=session_name,
                    sources=[
                        AudioSource(type='file',
                                    channels=[idx],
                                    source=str(audio_path))
                        for idx, audio_path in enumerate(sorted(channel_paths))
                    ],
                    sampling_rate=int(audio_info.rate),
                    num_samples=audio_info.length,
                    duration=audio_info.length / audio_info.rate,
                ))
        audio = RecordingSet.from_recordings(recordings)

        # Supervisions
        segments_by_pause = []
        for recording in audio:
            for source in recording.sources:
                # In AMI "source.channels" will always be a one-element list
                channel, = source.channels
                annotation = annotation_by_id_and_channel.get(
                    (recording.id, channel))
                if annotation is None:
                    logging.warning(
                        f'No annotation found for recording "{recording.id}" channel {channel} '
                        f'(file {source.source})')
                    continue
                for seg_idx, seg_info in enumerate(annotation):
                    for subseg_idx, subseg_info in enumerate(seg_info):
                        duration = subseg_info.end_time - subseg_info.begin_time
                        if duration > 0:
                            segments_by_pause.append(
                                SupervisionSegment(
                                    id=f'{recording.id}-{seg_idx}-{subseg_idx}',
                                    recording_id=recording.id,
                                    start=subseg_info.begin_time,
                                    duration=duration,
                                    channel=channel,
                                    language='English',
                                    speaker=subseg_info.speaker,
                                    gender=subseg_info.gender,
                                    text=subseg_info.text))
        supervision = SupervisionSet.from_segments(segments_by_pause)
        if output_dir is not None:
            audio.to_json(output_dir / f'recordings_{part}.json')
            supervision.to_json(output_dir / f'supervisions_{part}.json')

        manifests[part] = {'recordings': audio, 'supervisions': supervision}

    return manifests
def prepare_libritts( corpus_dir: Pathlike, dataset_parts: Union[str, Sequence[str]] = 'auto', output_dir: Optional[Pathlike] = None, num_jobs: int = 1 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions. When all the manifests are available in the ``output_dir``, it will simply read and return them. :param corpus_dir: Pathlike, the path of the data dir. :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'. By default we will infer which parts are available in ``corpus_dir``. :param output_dir: Pathlike, the path where to write the manifests. :param num_jobs: the number of parallel workers parsing the data. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}' if dataset_parts == 'auto': dataset_parts = LIBRITTS elif isinstance(dataset_parts, str): assert dataset_parts in LIBRITTS dataset_parts = [dataset_parts] manifests = {} if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) # Maybe the manifests already exist: we can read them and save a bit of preparation time. manifests = read_manifests_if_cached(dataset_parts=dataset_parts, output_dir=output_dir, prefix='libritts') # Contents of the file # ;ID |SEX| SUBSET |MINUTES| NAME # 14 | F | train-clean-360 | 25.03 | ... # 16 | F | train-clean-360 | 25.11 | ... # 17 | M | train-clean-360 | 25.04 | ... spk2gender = { spk_id.strip(): gender.strip() for spk_id, gender, *_ in (line.split('|') for line in ( corpus_dir / 'SPEAKERS.txt').read_text().splitlines() if not line.startswith(';')) } for part in tqdm(dataset_parts, desc='Preparing LibriTTS parts'): if manifests_exist(part=part, output_dir=output_dir, prefix='libritts'): logging.info( f'LibriTTS subset: {part} already prepared - skipping.') continue part_path = corpus_dir / part recordings = RecordingSet.from_dir(part_path, '*.wav', num_jobs=num_jobs) supervisions = [] for trans_path in tqdm( part_path.rglob('*.trans.tsv'), desc='Scanning transcript files (progbar per speaker)', leave=False): # The trans.tsv files contain only the recordings that were kept for LibriTTS. # Example path to a file: # /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv # # Example content: # 84_121123_000007_000001 Maximilian. Maximilian. # 84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief. Villefort rose, half ashamed of being surprised in such a paroxysm of grief. 
# book.tsv contains additional metadata utt2snr = { rec_id: float(snr) for rec_id, *_, snr in map(str.split, ( trans_path.parent / trans_path.name.replace('.trans.tsv', '.book.tsv') ).read_text().splitlines()) } for line in trans_path.read_text().splitlines(): rec_id, orig_text, norm_text = line.split('\t') spk_id = rec_id.split('_')[0] supervisions.append( SupervisionSegment(id=rec_id, recording_id=rec_id, start=0.0, duration=recordings[rec_id].duration, channel=0, text=norm_text, language='English', speaker=spk_id, gender=spk2gender[spk_id], custom={ 'orig_text': orig_text, 'snr': utt2snr[rec_id] })) supervisions = SupervisionSet.from_segments(supervisions) validate_recordings_and_supervisions(recordings, supervisions) if output_dir is not None: supervisions.to_json(output_dir / f'libritts_supervisions_{part}.json') recordings.to_json(output_dir / f'libritts_recordings_{part}.json') manifests[part] = { 'recordings': recordings, 'supervisions': supervisions } return manifests
def prepare_ami(
    data_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = 'ihm',
    partition: Optional[str] = 'full-corpus',
    max_pause: Optional[float] = 0.0
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions
    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param partition: str {'full-corpus','full-corpus-asr','scenario-only'}, AMI official data split
    :param max_pause: float (default = 0.0), max pause allowed between word segments to combine segments
    :return: a Dict whose key is ('train', 'dev', 'test'), and the values are dicts of manifests under keys 'recordings' and 'supervisions'.

    The `partition` and `max_pause` must be chosen depending on the task. For example:
    - Speaker diarization: set `partition="full-corpus"` and `max_pause=0`
    - ASR: set `partition="full-corpus-asr"` and `max_pause=0.3` (or some value in the range 0.2-0.5)
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    assert mic in MICS, f'Mic {mic} not supported'
    assert partition in PARTITIONS, f'Partition {partition} not supported'

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info('Parsing AMI annotations')
    annotations = parse_ami_annotations(data_dir / 'annotations.zip',
                                        max_pause=max_pause)

    # Audio
    logging.info('Preparing recording manifests')
    wav_dir = data_dir / 'wav_db'
    if mic in ['ihm', 'mdm']:
        audio_paths = wav_dir.rglob('*Headset-?.wav') if mic == 'ihm' else \
            wav_dir.rglob('*Array?-0?.wav')
        audio = prepare_audio_grouped(list(audio_paths))
    elif mic in ['ihm-mix', 'sdm']:
        audio_paths = wav_dir.rglob('*Mix-Headset.wav') if mic == 'ihm-mix' else \
            wav_dir.rglob('*Array1-01.wav')
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info('Preparing supervision manifests')
    supervision = prepare_supervision_ihm(audio, annotations) if mic == 'ihm' \
        else prepare_supervision_other(audio, annotations)

    manifests = defaultdict(dict)

    dataset_parts = PARTITIONS[partition]
    for part in ['train', 'dev', 'test']:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in dataset_parts[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in dataset_parts[part])

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_json(output_dir / f'recordings_{part}.json')
            supervision_part.to_json(output_dir / f'supervisions_{part}.json')

        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Combine all manifests into one dictionary
        manifests[part] = {
            'recordings': audio_part,
            'supervisions': supervision_part
        }

    return dict(manifests)
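# Usage sketches for the prepare_ami variant above (hypothetical path),
# following the task guidance given in its docstring:
#
#     # Speaker diarization: official full-corpus split, no segment merging.
#     diar = prepare_ami('/data/ami', mic='sdm',
#                        partition='full-corpus', max_pause=0.0)
#     # ASR: the ASR split, merging segments separated by short pauses.
#     asr = prepare_ami('/data/ami', mic='ihm',
#                       partition='full-corpus-asr', max_pause=0.3)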
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
    link_previous_utt: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :param link_previous_utt: If true, adds the previous utterance id to supervisions.
        Useful for reconstructing chains of utterances as they were read.
        If the previous utterance was skipped when LibriTTS was created, the previous_utt label is None.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(dataset_parts=dataset_parts,
                                             output_dir=output_dir,
                                             prefix="libritts")

    # Contents of the file
    #   ;ID  |SEX| SUBSET           |MINUTES| NAME
    #   14   | F | train-clean-360  | 25.03 | ...
    #   16   | F | train-clean-360  | 25.11 | ...
    #   17   | M | train-clean-360  | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (line.split("|") for line in (
            corpus_dir / "SPEAKERS.txt").read_text().splitlines()
                                   if not line.startswith(";"))
    }

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir,
                           prefix="libritts"):
            logging.info(
                f"LibriTTS subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path,
                                           "*.wav",
                                           num_jobs=num_jobs)

        supervisions = []
        for trans_path in tqdm(
                part_path.rglob("*.trans.tsv"),
                desc="Scanning transcript files (progbar per speaker)",
                leave=False,
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief.        Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata
            utt2snr = [(rec_id, float(snr)) for rec_id, *_, snr in map(
                str.split,
                (trans_path.parent /
                 trans_path.name.replace(".trans.tsv", ".book.tsv")
                 ).read_text().splitlines(),
            )]
            # keeps the order of uttids as they appear in book.tsv
            uttids = [r for r, _ in utt2snr]
            utt2snr = dict(utt2snr)

            if link_previous_utt:
                # Using the property of sorted keys to find the previous utterance.
                # The keys have the structure speaker_book_x_y, e.g. 1089_134691_000004_000001
                utt2prevutt = dict(zip(uttids + [None], [None] + uttids))

            prev_rec_id = None
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split("\t")
                spk_id = rec_id.split("_")[0]
                customd = {"orig_text": orig_text, "snr": utt2snr[rec_id]}
                if link_previous_utt:
                    # All recording ids should be in the book.tsv,
                    # but some are missing, e.g. 446_123502_000030_000003
                    prev_utt = utt2prevutt.get(rec_id, None)
                    # The previous utterance has to be present in trans.tsv - otherwise it was skipped
                    prev_utt = prev_utt if prev_utt == prev_rec_id else None
                    customd["prev_utt"] = prev_utt
                    prev_rec_id = rec_id
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language="English",
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom=customd,
                    ))

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir /
                                 f"libritts_supervisions_{part}.jsonl.gz")
            recordings.to_file(output_dir /
                               f"libritts_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions
        }

    return manifests
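# Usage sketch for the prepare_libritts variant above (hypothetical path).
# With link_previous_utt=True, each supervision's `custom` dict carries the
# id of the utterance read immediately before it (or None if it was skipped):
#
#     manifests = prepare_libritts("/data/LibriTTS", dataset_parts="dev-clean",
#                                  num_jobs=4, link_previous_utt=True)
#     for sup in manifests["dev-clean"]["supervisions"]:
#         print(sup.id, sup.custom["prev_utt"], sup.custom["snr"])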
def prepare_aspire( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, mic: str = "single" ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the corpus dir (LDC2017S21). :param output_dir: Pathlike, the path where to write the manifests. :param mic: str, the microphone type, either "single" or "multi". :return: a Dict whose key is the dataset part ('dev' and 'dev_test'), and the value is Dicts with the keys 'recordings' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" assert mic in [ "single", "multi", ], f"mic must be either 'single' or 'multi', got {mic}" corpus_dir = corpus_dir / "IARPA-ASpIRE-Dev-Sets-v2.0" / "data" audio_dir = corpus_dir / "dev_and_dev_test_audio" stm_dir = corpus_dir / "dev_and_dev_test_STM_files" if mic == "single": audio_paths = { "dev": audio_dir / "ASpIRE_single_dev", "dev_test": audio_dir / "ASpIRE_single_dev_test", } stm_file = { "dev": stm_dir / "dev.stm", "dev_test": stm_dir / "dev_test.stm", } else: audio_paths = { "dev": audio_dir / "ASpIRE_multi_dev", "dev_test": audio_dir / "ASpIRE_multi_dev_test", } stm_file = { "dev": stm_dir / "multi_dev.stm", "dev_test": stm_dir / "multi_dev_test.stm", } manifests = defaultdict(dict) if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) for part in ["dev", "dev_test"]: recordings = [] supervisions = [] # Prepare the recordings if mic == "single": recording_set = RecordingSet.from_dir(audio_paths[part], "*.wav") else: import soundfile as sf audio_groups = { k: list(v) for k, v in itertools.groupby( sorted(audio_paths[part].glob("*.wav")), key=lambda x: "_".join(x.stem.split("_")[:-1]), ) } # group audios so that each entry is a session containing all channels for session_name, audios in audio_groups.items(): audio_sf = sf.SoundFile(str(audios[0])) recordings.append( Recording( id=session_name, sources=[ AudioSource( type="file", channels=[int(audio.stem[-2:]) - 1], source=str(audio), ) for audio in sorted(audios) ], sampling_rate=audio_sf.samplerate, num_samples=audio_sf.frames, duration=audio_sf.frames / audio_sf.samplerate, )) recording_set = RecordingSet.from_recordings(recordings) # Read STM file and prepare segments segments = [] with open(stm_file[part]) as f: for line in f: session, _, speaker, start, end, text = line.strip().split( maxsplit=5) segments.append( AspireSegmentAnnotation(session, speaker, float(start), float(end), text)) # Group the segments by session and speaker segments_grouped = defaultdict(list) for segment in segments: segments_grouped[(segment.session, segment.speaker)].append(segment) # Create the supervisions supervisions = [] for k, segs in segments_grouped.items(): session, speaker = k supervisions += [ SupervisionSegment( id=f"{session}-{speaker}-{i:03d}", recording_id=session, start=seg.start, duration=round(seg.end - seg.start, 4), speaker=speaker, text=seg.text, language="English", ) for i, seg in enumerate(segs) ] supervision_set = SupervisionSet.from_segments(supervisions) recording_set, supervision_set = fix_manifests(recording_set, supervision_set) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_file(output_dir / f"aspire_supervisions_{part}.jsonl.gz") recording_set.to_file(output_dir / f"aspire_recordings_{part}.jsonl.gz") manifests[part] = { "recordings": 
recording_set, "supervisions": supervision_set } return manifests
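# Usage sketch for prepare_aspire (hypothetical path pointing at the root of
# the LDC2017S21 release). With mic="multi", each session becomes a single
# multi-channel Recording:
#
#     manifests = prepare_aspire("/data/LDC2017S21", output_dir="manifests",
#                                mic="multi")
#     dev_recs = manifests["dev"]["recordings"]
#     dev_sups = manifests["dev"]["supervisions"]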
def prepare_timit(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_phones: int = 48,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write and save the manifests.
    :param num_phones: int, the number of phones (60, 48 or 39) used for modeling; 48 is the default.
    :param num_jobs: int, the number of parallel workers.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["TRAIN", "DEV", "TEST"]

    phones_dict = {}

    if num_phones in [60, 48, 39]:
        phones_dict = get_phonemes(num_phones)
    else:
        raise ValueError("The value of num_phones must be in [60, 48, 39].")

    dev_spks, test_spks = get_speakers()

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in dataset_parts:
            wav_files = []
            if part == "TRAIN":
                wav_files = glob.glob(str(corpus_dir) + "/TRAIN/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
            elif part == "DEV":
                wav_files = glob.glob(str(corpus_dir) + "/TEST/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
                wav_files = list(
                    filter(lambda x: x.split("/")[-2].lower() in dev_spks,
                           wav_files))
            else:
                wav_files = glob.glob(str(corpus_dir) + "/TEST/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
                wav_files = list(
                    filter(lambda x: x.split("/")[-2].lower() in test_spks,
                           wav_files))

            logging.debug(f"{part} dataset manifest generation.")
            recordings = []
            supervisions = []

            for wav_file in tqdm(wav_files):
                items = str(wav_file).strip().split("/")
                idx = items[-2] + "-" + items[-1][:-4]
                speaker = items[-2]
                transcript_file = Path(wav_file).with_suffix(".PHN")
                if not Path(wav_file).is_file():
                    logging.warning(f"No such file: {wav_file}")
                    continue
                if not Path(transcript_file).is_file():
                    logging.warning(f"No transcript: {transcript_file}")
                    continue
                text = []
                with open(transcript_file, "r") as f:
                    lines = f.readlines()
                    for line in lines:
                        phone = line.rstrip("\n").split(" ")[-1]
                        if num_phones != 60:
                            phone = phones_dict[str(phone)]
                        text.append(phone)
                    text = " ".join(text).replace("h#", "sil")
                recording = Recording.from_file(path=wav_file,
                                                recording_id=idx)
                recordings.append(recording)
                segment = SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="English",
                    speaker=speaker,
                    text=text.strip(),
                )
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)
            validate_recordings_and_supervisions(recording_set,
                                                 supervision_set)

            if output_dir is not None:
                supervision_set.to_json(output_dir /
                                        f"supervisions_{part}.json")
                recording_set.to_json(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
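# Usage sketch for prepare_timit (hypothetical path). num_phones selects the
# 60-, 48- or 39-phone mapping applied to the PHN transcripts:
#
#     manifests = prepare_timit("/data/TIMIT", output_dir="manifests",
#                               num_phones=39)
#     sup = next(iter(manifests["TRAIN"]["supervisions"]))
#     print(sup.text)  # a space-joined phone string, e.g. "sil sh ix ..."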
def prepare_earnings22(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    normalize_text: bool = False,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the github repository, notably
        the mp3 files will be searched for in [corpus_dir]/media and transcriptions
        in the directory [corpus_dir]/transcripts/nlp_references
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    media_dir = corpus_dir / "media"
    audio_files = list(media_dir.glob("*.mp3"))
    assert len(audio_files) == 125

    audio_files.sort()
    recording_set = RecordingSet.from_recordings(
        Recording.from_file(p) for p in audio_files)

    nlp_dir = corpus_dir / "transcripts" / "nlp_references"
    nlp_files = list(nlp_dir.glob("*.nlp"))
    assert len(nlp_files) == 125

    metadata = read_metadata(corpus_dir / "metadata.csv")

    nlp_files.sort()
    supervision_segments = list()
    for nlp_file in nlp_files:
        id = nlp_file.stem
        text = " ".join(parse_nlp_file(nlp_file))
        if normalize_text:
            text = normalize(text)

        s = SupervisionSegment(
            id=id,
            recording_id=id,
            start=0.0,
            duration=recording_set[id].duration,
            channel=0,
            language=f"English-{metadata[id][4]}",
            text=text,
        )
        supervision_segments.append(s)
    supervision_set = SupervisionSet.from_segments(supervision_segments)

    validate_recordings_and_supervisions(recording_set, supervision_set)
    if output_dir is not None:
        supervision_set.to_file(output_dir /
                                "earnings22_supervisions_all.jsonl.gz")
        recording_set.to_file(output_dir /
                              "earnings22_recordings_all.jsonl.gz")

    return recording_set, supervision_set
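# Usage sketch for prepare_earnings22 (hypothetical path). Unlike most
# recipes here, this one returns a (RecordingSet, SupervisionSet) pair rather
# than a per-part dict:
#
#     recordings, supervisions = prepare_earnings22(
#         "/data/earnings22", output_dir="manifests", normalize_text=False)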
def prepare_mobvoihotwords( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" dataset_parts = ["train", "dev", "test"] manifests = {} if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) # Maybe the manifests already exist: we can read them and save a bit of preparation time. manifests = read_manifests_if_cached(dataset_parts=dataset_parts, output_dir=output_dir) for part in dataset_parts: logging.info(f"Preparing MobvoiHotwords subset: {part}") if manifests_exist(part=part, output_dir=output_dir): logging.info( f"MobvoiHotwords subset: {part} already prepared - skipping.") continue # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text) recordings = [] supervisions = [] for prefix in ["p_", "n_"]: prefixed_part = prefix + part json_path = (corpus_dir / "mobvoi_hotword_dataset_resources" / f"{prefixed_part}.json") with open(json_path, "r", encoding="utf-8") as f: json_data = json.load(f) for entry in json_data: idx = entry["utt_id"] speaker = (idx if entry["speaker_id"] is None else entry["speaker_id"]) audio_path = corpus_dir / "mobvoi_hotword_dataset" / f"{idx}.wav" text = "FREETEXT" if entry["keyword_id"] == 0: text = "HiXiaowen" elif entry["keyword_id"] == 1: text = "NihaoWenwen" else: assert entry["keyword_id"] == -1 if not audio_path.is_file(): logging.warning(f"No such file: {audio_path}") continue recording = Recording.from_file(audio_path) recordings.append(recording) segment = SupervisionSegment( id=idx, recording_id=idx, start=0.0, duration=recording.duration, channel=0, language="Chinese", speaker=speaker, text=text.strip(), ) supervisions.append(segment) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_json(output_dir / f"supervisions_{part}.json") recording_set.to_json(output_dir / f"recordings_{part}.json") manifests[part] = { "recordings": recording_set, "supervisions": supervision_set } return manifests
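# Usage sketch for prepare_mobvoihotwords (hypothetical path). The text field
# is one of the keywords "HiXiaowen" / "NihaoWenwen", or "FREETEXT" for the
# negative (non-keyword) recordings:
#
#     manifests = prepare_mobvoihotwords("/data/MobvoiHotwords",
#                                        output_dir="manifests")
#     dev = manifests["dev"]  # {'recordings': ..., 'supervisions': ...}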
def prepare_ami(
    data_dir: Pathlike,
    annotations_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "ihm",
    partition: Optional[str] = "full-corpus",
    normalize_text: str = "kaldi",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param data_dir: Pathlike, the path of the data dir.
    :param annotations_dir: Pathlike, the path of the annotations dir or zip file.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str {'ihm','ihm-mix','sdm','mdm'}, type of mic to use.
    :param partition: str {'full-corpus','full-corpus-asr','scenario-only'}, AMI official data split
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :return: a Dict whose key is ('train', 'dev', 'test'), and the values are dicts of manifests under keys 'recordings' and 'supervisions'.

    Example usage:
    1. Prepare IHM-Mix data for ASR:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='ihm-mix', partition='full-corpus-asr')
    2. Prepare SDM data:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='sdm', partition='full-corpus')
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f"No such directory: {data_dir}"
    assert mic in MICS, f"Mic {mic} not supported"
    assert partition in PARTITIONS, f"Partition {partition} not supported"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    logging.info("Parsing AMI annotations")
    if not annotations_dir:
        if (data_dir / "ami_public_manual_1.6.2").is_dir():
            annotations_dir = data_dir / "ami_public_manual_1.6.2"
        elif (data_dir / "ami_public_manual_1.6.2.zip").is_file():
            annotations_dir = data_dir / "ami_public_manual_1.6.2.zip"
        else:
            raise ValueError(
                f"No annotations directory specified and no zip file found in {data_dir}"
            )
    # Prepare annotations which is a list of segment-level transcriptions
    annotations = parse_ami_annotations(annotations_dir,
                                        normalize=normalize_text)

    # Audio
    logging.info("Preparing recording manifests")
    wav_dir = data_dir

    if mic in ["ihm", "mdm"]:
        audio_paths = (wav_dir.rglob("*Headset-?.wav")
                       if mic == "ihm" else wav_dir.rglob("*Array?-0?.wav"))
        audio = prepare_audio_grouped(list(audio_paths))
    elif mic in ["ihm-mix", "sdm"]:
        audio_paths = (wav_dir.rglob("*Mix-Headset.wav") if mic == "ihm-mix"
                       else wav_dir.rglob("*Array1-01.wav"))
        audio = prepare_audio_single(list(audio_paths))

    # Supervisions
    logging.info("Preparing supervision manifests")
    supervision = (prepare_supervision_ihm(audio, annotations) if mic == "ihm"
                   else prepare_supervision_other(audio, annotations))

    manifests = defaultdict(dict)

    dataset_parts = PARTITIONS[partition]
    for part in ["train", "dev", "test"]:
        # Get recordings for current data split
        audio_part = audio.filter(lambda x: x.id in dataset_parts[part])
        supervision_part = supervision.filter(
            lambda x: x.recording_id in dataset_parts[part])

        # Write to output directory if a path is provided
        if output_dir is not None:
            audio_part.to_file(output_dir /
                               f"ami-{mic}_recordings_{part}.jsonl.gz")
            supervision_part.to_file(output_dir /
                                     f"ami-{mic}_supervisions_{part}.jsonl.gz")

        audio_part, supervision_part = fix_manifests(audio_part,
                                                     supervision_part)
        validate_recordings_and_supervisions(audio_part, supervision_part)

        # Combine all manifests into one dictionary
        manifests[part] = {
            "recordings": audio_part,
            "supervisions": supervision_part
        }

    return dict(manifests)
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" and "suitcase".
        Each holds another dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav, recording_id=f'{wav.parent.parent.name.lower()}-{wav.stem}')
        for wav in corpus_dir.rglob('*.wav'))
    supervisions = []
    for path in corpus_dir.rglob('*.txt'):
        # One utterance (line) per file
        text = path.read_text().strip()
        is_suitcase_corpus = 'suitcase_corpus' in path.parts
        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = path.stem
        seg_id = f'suitcase_corpus-{speaker}' if is_suitcase_corpus else f'{speaker}-{path.stem}'
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language='English',
                speaker=speaker,
                gender=speaker_meta[speaker]['gender'],
                custom={'accent': speaker_meta[speaker]['native_lang']}))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        'read': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' not in r.id),
            'supervisions':
            supervisions.filter(
                lambda s: 'suitcase_corpus' not in s.recording_id)
        },
        'suitcase': {
            'recordings':
            recordings.filter(lambda r: 'suitcase_corpus' in r.id),
            'supervisions':
            supervisions.filter(lambda s: 'suitcase_corpus' in s.recording_id)
        }
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for key, manifests in splits.items():
            manifests['recordings'].to_json(output_dir /
                                            f'recordings-{key}.json')
            manifests['supervisions'].to_json(output_dir /
                                              f'supervisions-{key}.json')

    return splits
def prepare_ali_meeting( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, mic: Optional[str] = "far", ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :param mic: str, "near" or "far", specifies whether to prepare the near-field or far-field data. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'. """ if not is_module_available("textgrid"): raise ValueError( "To prepare AliMeeting data, please 'pip install textgrid' first." ) import textgrid corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" manifests = defaultdict(dict) if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) for part in ["Train", "Eval", "Test"]: recordings = [] supervisions = [] # Eval and Test may further be inside another folder (since the "far" and "near" are grouped together) corpus_dir_split = corpus_dir if part == "Eval" or part == "Test": corpus_dir_split = ( corpus_dir / f"{part}_Ali" if (corpus_dir / f"{part}_Ali").is_dir() else corpus_dir ) wav_paths = corpus_dir_split / f"{part}_Ali_{mic}" / "audio_dir" text_paths = corpus_dir_split / f"{part}_Ali_{mic}" / "textgrid_dir" # For 'near' setting: # - wav files have names like R0003_M0046_F_SPK0093.wav # - textgrid files have names like R0003_M0046_F_SPK0093.TextGrid # Speaker ID information is present in the file name itself # For 'far' setting: # - wav files have names like R0015_M0151_MS002.wav # - textgrid files have names like R0015_M015.TextGrid # Speaker ID information is present inside the TextGrid file for text_path in tqdm( list(text_paths.rglob("*.TextGrid")), desc=f"Preparing {part}" ): session_id = text_path.stem if mic == "near": _, _, gender, spk_id = session_id.split("_") spk_id = spk_id[3:] # SPK1953 -> 1953 try: tg = textgrid.TextGrid.fromFile(str(text_path)) except ValueError: logging.warning( f"{session_id} has annotation issues. Skipping this recording." ) continue wav_path = list(wav_paths.rglob(f"{session_id}*.wav"))[0] recording = Recording.from_file(wav_path, recording_id=session_id) recordings.append(recording) for tier in tg.tiers: if mic == "far": parts = tier.name.split("_") if len(parts) == 4: _, _, gender, spk_id = parts elif len(parts) == 2: gender, spk_id = parts spk_id = spk_id[3:] # SPK1953 -> 1953 for i, interval in enumerate(tier.intervals): if interval.mark != "": start = interval.minTime end = interval.maxTime text = interval.mark segment = SupervisionSegment( id=f"{session_id}-{spk_id}-{i}", recording_id=recording.id, start=start, duration=round(end - start, 4), channel=0, language="Chinese", speaker=spk_id, gender=gender, text=text.strip(), ) supervisions.append(segment) recording_set, supervision_set = fix_manifests( RecordingSet.from_recordings(recordings), SupervisionSet.from_segments(supervisions), ) # Fix manifests validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_file( output_dir / f"alimeeting_supervisions_{part.lower()}.jsonl.gz" ) recording_set.to_file( output_dir / f"alimeeting_recordings_{part.lower()}.jsonl.gz" ) manifests[part.lower()] = { "recordings": recording_set, "supervisions": supervision_set, } return manifests
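# Usage sketch for prepare_ali_meeting (hypothetical path). `mic` selects the
# far-field or near-field recordings, and the returned keys are lower-cased
# ("train", "eval", "test"):
#
#     manifests = prepare_ali_meeting("/data/alimeeting",
#                                     output_dir="manifests", mic="near")
#     eval_sups = manifests["eval"]["supervisions"]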
def prepare_vctk(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the VCTK manifests which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description(corpus_dir)

    recordings = RecordingSet.from_recordings(
        Recording.from_file(wav)
        for wav in (corpus_dir / "wav48").rglob("*.wav"))
    supervisions = []
    for path in (corpus_dir / "txt").rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        speaker = path.name.split("_")[0]  # p226_001.txt -> p226
        seg_id = path.stem
        meta = speaker_meta.get(speaker)
        if meta is None:
            logging.warning(f"Cannot find metadata for speaker {speaker}.")
            meta = defaultdict(lambda: None)
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=meta["gender"],
                custom={
                    "accent": meta["accent"],
                    "age": meta["age"],
                    "region": meta["region"],
                },
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # note(pzelasko): There were 172 recordings without supervisions when I ran it.
    # I am just removing them.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "vctk_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "vctk_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
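# Usage sketch for prepare_vctk (hypothetical path). VCTK is not split into
# parts, so the result is a single flat dict:
#
#     manifests = prepare_vctk("/data/VCTK-Corpus", output_dir="manifests")
#     recordings = manifests["recordings"]
#     supervisions = manifests["supervisions"]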
def prepare_wenet_speech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "all",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: Which parts of the dataset to prepare; "all" prepares all the parts.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: Number of workers to extract manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    subsets = WETNET_SPEECH_PARTS if "all" in dataset_parts else dataset_parts

    manifests = defaultdict(dict)
    for sub in subsets:
        if sub not in WETNET_SPEECH_PARTS:
            raise ValueError(f"No such part of dataset in WenetSpeech : {sub}")
        manifests[sub] = {"recordings": [], "supervisions": []}

    raw_manifests_path = corpus_dir / "WenetSpeech.json"
    assert raw_manifests_path.is_file(), f"No such file: {raw_manifests_path}"

    logging.info(f"Loading raw manifests from : {raw_manifests_path}")
    raw_manifests = json.load(open(raw_manifests_path, "r", encoding="utf8"))

    with ProcessPoolExecutor(num_jobs) as ex:
        for recording, segments in tqdm(
                ex.map(
                    parse_utterance,
                    raw_manifests["audios"],
                    repeat(corpus_dir),
                    repeat(subsets),
                ),
                desc="Processing WenetSpeech JSON entries",
        ):
            for part in segments:
                manifests[part]["recordings"].append(recording)
                manifests[part]["supervisions"].extend(segments[part])

    for sub in subsets:
        recordings, supervisions = fix_manifests(
            recordings=RecordingSet.from_recordings(
                manifests[sub]["recordings"]),
            supervisions=SupervisionSet.from_segments(
                manifests[sub]["supervisions"]),
        )
        validate_recordings_and_supervisions(recordings=recordings,
                                             supervisions=supervisions)
        if output_dir is not None:
            supervisions.to_file(output_dir /
                                 f"wenetspeech_supervisions_{sub}.jsonl.gz")
            recordings.to_file(output_dir /
                               f"wenetspeech_recordings_{sub}.jsonl.gz")
        manifests[sub] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
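# Usage sketch for prepare_wenet_speech (hypothetical path). Parsing the big
# WenetSpeech.json benefits from multiple workers; dataset_parts accepts
# "all" or any subset of the names defined in WETNET_SPEECH_PARTS:
#
#     manifests = prepare_wenet_speech("/data/wenetspeech",
#                                      dataset_parts="all",
#                                      output_dir="manifests", num_jobs=8)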
def prepare_heroico(
    speech_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile('Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile('Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile('Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+")
    usma_native_path_pattern = re.compile('usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    transcripts = defaultdict(dict)
    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir,
                                          heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='answers',
                                             utterance_id=utt_id,
                                             transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(fold='test',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='usma',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(fold='train',
                                             speaker=spk,
                                             prompt_id=pid,
                                             subcorpus='heroico-recitations',
                                             utterance_id=utt_id,
                                             transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            trans_id = '-'.join(['heroico-recitations-repeats', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(f'No such file: {wav_file}')

    audio_paths = speech_dir.rglob('*.wav')
    audio_files = [w for w in audio_paths]

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # read audio info (sample rate, frame count, duration) with soundfile
            info = soundfile.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join(
                [uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info,
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.samplerate),
                      num_samples=metadata[idx].audio_info.frames,
                      duration=metadata[idx].audio_info.duration)
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='Spanish',
                               speaker=idx.split('-')[-2],
                               text=metadata[idx].text)
            for idx in audio.recordings)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
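# Usage sketch for prepare_heroico (hypothetical paths). The speech and the
# transcripts are passed as two separate directories, and the result is
# keyed by fold (e.g. "train", "devtest", "test"):
#
#     manifests = prepare_heroico(speech_dir="/data/heroico/speech",
#                                 transcript_dir="/data/heroico/transcripts",
#                                 output_dir="manifests")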
def prepare_ljspeech( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}' if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) # Generate a mapping: utt_id -> (audio_path, audio_info, text) metadata_csv_path = corpus_dir / 'metadata.csv' assert metadata_csv_path.is_file(), f'No such file: {metadata_csv_path}' metadata = {} with open(metadata_csv_path) as f: for line in f: idx, text, _ = line.split('|') audio_path = corpus_dir / 'wavs' / f'{idx}.wav' if audio_path.is_file(): # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... ) # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...) info = torchaudio.info(str(audio_path)) metadata[idx] = LJSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text) else: logging.warning(f'No such file: {audio_path}') # Audio audio = RecordingSet.from_recordings( Recording(id=idx, sources=[ AudioSource(type='file', channels=[0], source=str(metadata[idx].audio_path)) ], sampling_rate=int(metadata[idx].audio_info.rate), num_samples=metadata[idx].audio_info.length, duration=(metadata[idx].audio_info.length / metadata[idx].audio_info.rate)) for idx in metadata) # Supervision supervision = SupervisionSet.from_segments( SupervisionSegment(id=idx, recording_id=idx, start=0.0, duration=audio.recordings[idx].duration, channel=0, language='English', gender='female', text=metadata[idx].text) for idx in audio.recordings) if output_dir is not None: supervision.to_json(output_dir / 'supervisions.json') audio.to_json(output_dir / 'audio.json') return {'audio': audio, 'supervisions': supervision}
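# Usage sketch for prepare_ljspeech (hypothetical path). Note that this
# recipe keys the returned dict with 'audio' rather than 'recordings':
#
#     manifests = prepare_ljspeech("/data/LJSpeech-1.1",
#                                  output_dir="manifests")
#     audio, supervisions = manifests["audio"], manifests["supervisions"]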
def prepare_rir_noise( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, parts: Sequence[str] = ("point_noise", "iso_noise", "real_rir", "sim_rir"), ) -> Dict[str, Dict[str, Union[RecordingSet, CutSet]]]: """ Prepare the RIR Noise corpus. :param corpus_dir: Pathlike, the path of the dir to store the dataset. :param output_dir: Pathlike, the path of the dir to write the manifests. :param parts: Sequence[str], the parts of the dataset to prepare. The corpus contains 4 things: point-source noises (point_noise), isotropic noises (iso_noise), real RIRs (real_rir), and simulated RIRs (sim_rir). We will prepare these parts in the corresponding dict keys. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" if not parts: raise ValueError("No parts specified for manifest preparation.") if isinstance(parts, str): parts = [parts] manifests = defaultdict(dict) for part in parts: logging.info(f"Preparing {part}...") audio_dir = corpus_dir / PARTS[part] assert audio_dir.is_dir(), f"No such directory: {audio_dir}" if part == "sim_rir": # The "small", "medium", and "large" rooms have the same file names, so # we have to handle them separately to avoid duplicating manifests. recordings = [] for room_type in ("small", "medium", "large"): room_dir = audio_dir / f"{room_type}room" recordings += [ Recording.from_file( file, recording_id=f"{room_type}-{file.stem}") for file in room_dir.rglob("*.wav") ] manifests[part]["recordings"] = RecordingSet.from_recordings( recordings) elif part == "point_noise": manifests[part]["recordings"] = RecordingSet.from_recordings( Recording.from_file(file) for file in audio_dir.rglob("*.wav")) elif part == "iso_noise": manifests[part]["recordings"] = RecordingSet.from_recordings( Recording.from_file(file) for file in audio_dir.rglob("*.wav") if "noise" in file.stem) elif part == "real_rir": manifests[part]["recordings"] = RecordingSet.from_recordings( Recording.from_file(file) for file in audio_dir.rglob("*.wav") if "rir" in file.stem) if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) for part in manifests: for key, manifest in manifests[part].items(): manifest.to_file(output_dir / f"{part.replace('_','-')}_{key}_all.jsonl.gz") return manifests
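# Usage sketch for prepare_rir_noise (hypothetical path). Only the requested
# parts are prepared; each entry holds a 'recordings' manifest:
#
#     manifests = prepare_rir_noise("/data/RIRS_NOISES",
#                                   output_dir="manifests",
#                                   parts=("real_rir", "point_noise"))
#     real_rirs = manifests["real_rir"]["recordings"]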
def prepare_l2_arctic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepares and returns the L2 Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "read" (scripted Arctic prompts) and "suitcase"
        (spontaneous speech). Each holds another dict of
        {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    speaker_meta = _parse_speaker_description()

    recordings = RecordingSet.from_recordings(
        # Example ID: zhaa-arctic_b0126
        Recording.from_file(
            wav,
            recording_id=f"{wav.parent.parent.name.lower()}-{wav.stem}")
        for wav in corpus_dir.rglob("*.wav"))
    supervisions = []
    for path in corpus_dir.rglob("*.txt"):
        # One utterance (line) per file
        text = path.read_text().strip()
        is_suitcase_corpus = "suitcase_corpus" in path.parts
        # <root>/ABA/transcript/arctic_a0051.txt -> aba
        speaker = path.parent.parent.name.lower()
        if is_suitcase_corpus:
            # <root>/suitcase_corpus/transcript/aba.txt -> aba
            speaker = path.stem
        seg_id = (f"suitcase_corpus-{speaker}"
                  if is_suitcase_corpus else f"{speaker}-{path.stem}")
        supervisions.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=seg_id,
                start=0,
                duration=recordings[seg_id].duration,
                text=text,
                language="English",
                speaker=speaker,
                gender=speaker_meta[speaker]["gender"],
                custom={"accent": speaker_meta[speaker]["native_lang"]},
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    splits = {
        "read": {
            "recordings":
            recordings.filter(lambda r: "suitcase_corpus" not in r.id),
            "supervisions":
            supervisions.filter(
                lambda s: "suitcase_corpus" not in s.recording_id),
        },
        "suitcase": {
            "recordings":
            recordings.filter(lambda r: "suitcase_corpus" in r.id),
            "supervisions":
            supervisions.filter(lambda s: "suitcase_corpus" in s.recording_id),
        },
    }

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for key, manifests in splits.items():
            manifests["recordings"].to_file(
                output_dir / f"l2-arctic_recordings_{key}.jsonl.gz")
            manifests["supervisions"].to_file(
                output_dir / f"l2-arctic_supervisions_{key}.jsonl.gz")

    return splits
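# Usage sketch: inspect the accent meta-data that the recipe attaches to each
# segment under SupervisionSegment.custom. The corpus path is hypothetical.
def _demo_prepare_l2_arctic() -> None:
    splits = prepare_l2_arctic(corpus_dir="/data/l2arctic_release")
    for sup in splits["read"]["supervisions"]:
        print(sup.speaker, sup.custom["accent"])
        break  # show just the first segment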
def prepare_mini_librispeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # The two splits of Mini LibriSpeech (OpenSLR resource 31).
    dataset_parts = ['dev-clean-2', 'train-clean-5']

    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace('-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, number of frames)
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(audio_path=audio_path,
                                                            audio_info=info[0],
                                                            text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=(metadata[idx].audio_info.length / metadata[idx].audio_info.rate)
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='English',
                speaker=re.sub(r'-.*', r'', idx),
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
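# Usage sketch (hypothetical path): both Mini LibriSpeech splits are prepared
# in a single call and returned keyed by split name.
def _demo_prepare_mini_librispeech() -> None:
    manifests = prepare_mini_librispeech(
        corpus_dir="/data/LibriSpeech", output_dir="manifests/mini_librispeech"
    )
    for part, m in manifests.items():
        print(part, len(m["recordings"]), len(m["supervisions"]))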
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int,
    frame_shift: Optional[Seconds] = None,
    map_string_to_underscores: Optional[str] = None,
    num_jobs: int = 1,
) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. Features from feats.scp are imported only when the
    ``kaldi_native_io`` package is available and ``frame_shift`` is provided.

    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisionSegment IDs to underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    """
    path = Path(path)
    assert path.is_dir()

    def fix_id(t: str) -> str:
        if map_string_to_underscores is None:
            return t
        return t.replace(map_string_to_underscores, "_")

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True)

    with ProcessPoolExecutor(num_jobs) as ex:
        dur_vals = ex.map(get_duration, recordings.values())
    durations = dict(zip(recordings.keys(), dur_vals))

    recording_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type="command" if path_or_cmd.endswith("|") else "file",
                    channels=[0],
                    source=path_or_cmd[:-1]
                    if path_or_cmd.endswith("|") else path_or_cmd,
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=compute_num_samples(durations[recording_id],
                                            sampling_rate),
            duration=durations[recording_id],
        ) for recording_id, path_or_cmd in recordings.items())

    supervision_set = None
    segments = path / "segments"
    if segments.is_file():
        with segments.open() as f:
            supervision_segments = [
                sup_string.strip().split() for sup_string in f
            ]

        texts = load_kaldi_text_mapping(path / "text")
        speakers = load_kaldi_text_mapping(path / "utt2spk")
        genders = load_kaldi_text_mapping(path / "spk2gender")
        languages = load_kaldi_text_mapping(path / "utt2lang")

        supervision_set = SupervisionSet.from_segments(
            SupervisionSegment(
                id=fix_id(segment_id),
                recording_id=recording_id,
                start=float(start),
                duration=add_durations(
                    float(end), -float(start), sampling_rate=sampling_rate),
                channel=0,
                text=texts[segment_id],
                language=languages[segment_id],
                speaker=fix_id(speakers[segment_id]),
                gender=genders[speakers[segment_id]],
            ) for segment_id, recording_id, start, end in supervision_segments)

    feature_set = None
    feats_scp = path / "feats.scp"
    if feats_scp.exists() and is_module_available("kaldi_native_io"):
        if frame_shift is not None:
            import kaldi_native_io
            from lhotse.features.io import KaldiReader

            feature_set = FeatureSet.from_features(
                Features(
                    type="kaldi_native_io",
                    num_frames=mat.shape[0],
                    num_features=mat.shape[1],
                    frame_shift=frame_shift,
                    sampling_rate=sampling_rate,
                    start=0,
                    duration=mat.shape[0] * frame_shift,
                    storage_type=KaldiReader.name,
                    storage_path=str(feats_scp),
                    storage_key=utt_id,
                    recording_id=supervision_set[utt_id].recording_id
                    if supervision_set is not None else utt_id,
                    channels=0,
                ) for utt_id, mat in
                kaldi_native_io.SequentialFloatMatrixReader(f"scp:{feats_scp}"))
        else:
            warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: "
                          "frame_shift must be not None. "
                          "Feature import omitted.")

    return recording_set, supervision_set, feature_set
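# Usage sketch: import a Kaldi data dir whose IDs use a separator that should
# become "_" in Lhotse. The path, sampling rate, and separator are hypothetical
# and must match the actual data.
def _demo_load_kaldi_data_dir() -> None:
    recordings, supervisions, features = load_kaldi_data_dir(
        path="/data/kaldi/train",  # hypothetical dir containing wav.scp
        sampling_rate=16000,
        frame_shift=0.01,  # required only when importing feats.scp
        map_string_to_underscores="-",
        num_jobs=4,
    )
    print(recordings)
    if supervisions is not None:
        print(supervisions)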
def prepare_adept(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    recordings = RecordingSet.from_recordings(
        Recording.from_file(
            path=path,
            # converts:
            #   path/to/ADEPT/wav_44khz/propositional_attitude/surprise/ad01_0204.wav
            # to:
            #   propositional_attitude_surprise_ad01_0204
            recording_id=str(path.relative_to(path.parent.parent.parent))
            [:-4].replace("/", "_"),
        ) for path in (corpus_dir / "wav_44khz").rglob("*.wav"))
    supervisions = []
    with open(corpus_dir / "adept_prompts.json") as f:
        interpretation_map = json.load(f)
    for path in (corpus_dir / "txt").rglob("*.txt"):
        annotation_type, label, prompt_id = str(
            path.relative_to(path.parent.parent.parent))[:-4].split("/")
        speaker_id = "ADEPT_" + prompt_id.split("_")[0]
        recording_id = "_".join((annotation_type, label, prompt_id))
        interpretation_group = interpretation_map.get(annotation_type)
        interpretation = (interpretation_group[prompt_id][label]
                          if interpretation_group else None)
        recording = recordings[recording_id]
        custom = {
            "type": annotation_type,
            "label": label,
            "prompt_id": prompt_id
        }
        if interpretation:
            # Labels such as "interpretation_1", "interpretation_2", ..., "middle", "end"
            # get their meaning from the textual realisation, e.g.:
            # {..., "middle": "Galleries are WHAT on Thursdays?", "end": "Galleries are free WHEN?"}
            custom["text"] = interpretation
        supervisions.append(
            SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0,
                duration=recording.duration,
                channel=0,
                text=path.read_text(),
                language="English",
                speaker=speaker_id,
                custom=custom,
            ))
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        supervisions.to_file(output_dir / "adept_supervisions.json")
        recordings.to_file(output_dir / "adept_recordings.json")

    return {"recordings": recordings, "supervisions": supervisions}
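# Usage sketch: group ADEPT supervisions by the annotation type that the
# recipe stores under SupervisionSegment.custom. The corpus path is hypothetical.
def _demo_prepare_adept() -> None:
    manifests = prepare_adept(corpus_dir="/data/ADEPT")
    counts = defaultdict(int)
    for sup in manifests["supervisions"]:
        counts[sup.custom["type"]] += 1
    print(dict(counts))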
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, the number of parallel workers used to parse the utterances.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH).intersection(
            path.name for path in corpus_dir.glob("*")
        )
    elif dataset_parts == "auto":
        dataset_parts = (
            set(LIBRISPEECH)
            .union(MINI_LIBRISPEECH)
            .intersection(path.name for path in corpus_dir.glob("*"))
        )
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir
        )

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(f"LibriSpeech subset: {part} already prepared - skipping.")
                continue
            recordings = []
            supervisions = []
            part_path = corpus_dir / part
            futures = []
            for trans_path in tqdm(
                part_path.rglob("*.trans.txt"), desc="Distributing tasks", leave=False
            ):
                alignments = {}
                ali_path = trans_path.parent / (
                    trans_path.stem.split(".")[0] + ".alignment.txt"
                )
                if ali_path.exists():
                    alignments = parse_alignments(ali_path)
                # The "trans_path" file contains lines like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # We will create a separate Recording and SupervisionSegment for those.
                with open(trans_path) as f:
                    for line in f:
                        futures.append(
                            ex.submit(parse_utterance, part_path, line, alignments)
                        )

            for future in tqdm(futures, desc="Processing", leave=False):
                result = future.result()
                if result is None:
                    continue
                recording, segment = result
                recordings.append(recording)
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir / f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
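# Usage sketch: prepare two explicit LibriSpeech parts with parallel parsing.
# The corpus path is hypothetical.
def _demo_prepare_librispeech() -> None:
    manifests = prepare_librispeech(
        corpus_dir="/data/LibriSpeech",
        dataset_parts=["dev-clean", "test-clean"],
        output_dir="manifests/librispeech",
        num_jobs=4,
    )
    print(sorted(manifests))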
def prepare_aishell( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) transcript_path = corpus_dir / "data_aishell/transcript/aishell_transcript_v0.8.txt" transcript_dict = {} with open(transcript_path, "r", encoding="utf-8") as f: for line in f.readlines(): idx_transcript = line.split() transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:]) manifests = defaultdict(dict) dataset_parts = ["train", "dev", "test"] for part in dataset_parts: # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text) recordings = [] supervisions = [] wav_path = corpus_dir / "data_aishell" / "wav" / f"{part}" for audio_path in wav_path.rglob("**/*.wav"): idx = audio_path.stem speaker = audio_path.parts[-2] if idx not in transcript_dict: logging.warning(f"No transcript: {idx}") continue text = transcript_dict[idx] if not audio_path.is_file(): logging.warning(f"No such file: {audio_path}") continue recording = Recording.from_file(audio_path) recordings.append(recording) segment = SupervisionSegment( id=idx, recording_id=idx, start=0.0, duration=recording.duration, channel=0, language="Chinese", speaker=speaker, text=text.strip(), ) supervisions.append(segment) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_file(output_dir / f"aishell_supervisions_{part}.jsonl.gz") recording_set.to_file(output_dir / f"aishell_recordings_{part}.jsonl.gz") manifests[part] = { "recordings": recording_set, "supervisions": supervision_set } return manifests
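# Usage sketch (hypothetical path): the AISHELL recipe expects the directory
# that contains "data_aishell".
def _demo_prepare_aishell() -> None:
    manifests = prepare_aishell(
        corpus_dir="/data/aishell", output_dir="manifests/aishell"
    )
    print(len(manifests["train"]["supervisions"]))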
def prepare_mls(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    opus: bool = True,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]]:
    """
    Prepare Multilingual LibriSpeech corpus.

    Returns a dict structured like the following:

    .. code-block:: python

        {
            'english': {
                'train': {'recordings': RecordingSet(...), 'supervisions': SupervisionSet(...)},
                'dev': ...,
                'test': ...
            },
            'polish': { ... },
            ...
        }

    :param corpus_dir: Path to the corpus root (directories with specific languages should be inside).
    :param output_dir: Optional path where the manifests should be stored.
    :param opus: Should we scan for OPUS files (otherwise we'll look for FLAC files).
    :param num_jobs: How many jobs should be used for creating recording manifests.
    :return: A dict with structure: ``d[language][split] = {recordings, supervisions}``.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir) if output_dir is not None else None
    assert corpus_dir.is_dir()

    languages = {
        d.name.split('_')[1]: d
        for d in corpus_dir.glob('mls_*')
        if d.is_dir() and '_lm_' not in d.name and (opus or not d.name.endswith('opus'))
    }
    logging.info(f'Found MLS languages: {list(languages)}')

    manifests = defaultdict(dict)
    for lang, lang_dir in tqdm(languages.items(), desc='Languages', total=len(languages)):
        logging.info(f'Processing language: {lang}')

        # Read the speaker to gender mapping.
        spk2gender = {}
        for line in (lang_dir / 'metainfo.txt').read_text().splitlines():
            spk, gender, *_ = line.split('|')
            spk2gender[spk.strip()] = gender.strip()

        for split in tqdm(['test', 'dev', 'train'], desc='Splits'):

            # If everything is ready, read it and skip it.
            recordings_path = None if output_dir is None else output_dir / f'recordings_{lang}_{split}.jsonl.gz'
            supervisions_path = None if output_dir is None else output_dir / f'supervisions_{lang}_{split}.jsonl.gz'
            if (
                recordings_path is not None and recordings_path.is_file()
                and supervisions_path is not None and supervisions_path.is_file()
            ):
                logging.info(f'Skipping - {lang}/{split} - already exists!')
                recordings = RecordingSet.from_file(recordings_path)
                supervisions = SupervisionSet.from_file(supervisions_path)
                manifests[lang][split] = {
                    'recordings': recordings,
                    'supervisions': supervisions
                }
                continue

            # Create recordings manifest.
            split_dir = lang_dir / split
            recordings = RecordingSet.from_dir(
                path=split_dir,
                pattern='*.opus' if opus else '*.flac',
                num_jobs=num_jobs
            )

            # Create supervisions manifest.
            supervisions = []
            for line in (split_dir / 'transcripts.txt').read_text().splitlines():
                recording_id, text = line.split('\t')
                speaker = recording_id.split('_')[0]
                supervisions.append(SupervisionSegment(
                    id=recording_id,
                    recording_id=recording_id,
                    text=text,
                    speaker=speaker,
                    gender=spk2gender[speaker],
                    start=0.0,
                    duration=recordings.duration(recording_id),
                    language=lang
                ))
            supervisions = SupervisionSet.from_segments(supervisions)

            # Fix any missing recordings/supervisions.
            recordings, supervisions = fix_manifests(recordings, supervisions)
            validate_recordings_and_supervisions(recordings, supervisions)

            # Save for return.
            manifests[lang][split] = {
                'recordings': recordings,
                'supervisions': supervisions
            }

            # Optional storage on disk.
            if output_dir is not None:
                output_dir.mkdir(exist_ok=True, parents=True)
                recordings.to_file(recordings_path)
                supervisions.to_file(supervisions_path)

    return dict(manifests)
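# Usage sketch: prepare the FLAC release of MLS and pick out one language.
# The corpus path is hypothetical.
def _demo_prepare_mls() -> None:
    manifests = prepare_mls(
        corpus_dir="/data/mls",
        output_dir="manifests/mls",
        opus=False,  # scan for *.flac instead of *.opus
        num_jobs=4,
    )
    english_train = manifests["english"]["train"]
    print(len(english_train["recordings"]), len(english_train["supervisions"]))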
def prepare_cmu_indic(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    recordings = RecordingSet.from_recordings(
        # Example ID: cmu_indic_ben_rm_bn_00001
        Recording.from_file(
            wav,
            recording_id=f"{_get_speaker(wav.parent.parent.name)}-{wav.stem}")
        for wav in corpus_dir.rglob("*.wav"))
    supervisions = []
    for path in corpus_dir.rglob("txt.done.data"):
        lines = path.read_text().splitlines()
        speaker = _get_speaker(path.parent.parent.name)
        lang_code = speaker.split("_")[0]  # example: 'ben_rm' -> 'ben' (Bengali)
        try:
            # Example contents of voice.feats file:
            #   variant guj
            #   age 28
            #   gender male
            #   description Built with build_cg_rfs_voice, 3 rf and 3 dur
            #   gujarati_data h2r_prompts
            #   prompt_dur 59.27min
            age = int((path.parent /
                       "voice.feats").read_text().splitlines()[1].replace(
                           "age ", "").strip())
        except Exception:
            age = None
        for line in lines:
            line = line[2:-2]  # get rid of parentheses and whitespace on the edges
            seg_id, text = line.split(maxsplit=1)
            seg_id = f"{speaker}-{seg_id}"
            language = LANGUAGE_MAP[lang_code]
            is_english = "arctic" in seg_id

            # Determine available custom meta-data to attach.
            custom = None
            if is_english or age is not None:
                custom = {}
                if is_english:
                    custom["accent"] = language
                if age is not None:
                    custom["age"] = age

            supervisions.append(
                SupervisionSegment(
                    id=seg_id,
                    recording_id=seg_id,
                    start=0,
                    duration=recordings[seg_id].duration,
                    text=text.replace('"', ""),  # get rid of quotation marks
                    language="English" if is_english else language,
                    speaker=speaker,
                    gender=GENDER_MAP.get(speaker),
                    custom=custom,
                ))
    supervisions = SupervisionSet.from_segments(supervisions)

    # There seem to be 20 recordings missing; remove them before validation.
    recordings, supervisions = remove_missing_recordings_and_supervisions(
        recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "cmu_indic_recordings.json")
        supervisions.to_json(output_dir / "cmu_indic_supervisions.json")

    return {"recordings": recordings, "supervisions": supervisions}
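# Usage sketch: count CMU Indic supervisions per language (the recipe marks
# the Arctic prompts as English). The corpus path is hypothetical.
def _demo_prepare_cmu_indic() -> None:
    manifests = prepare_cmu_indic(corpus_dir="/data/cmu_indic")
    per_language = defaultdict(int)
    for sup in manifests["supervisions"]:
        per_language[sup.language] += 1
    print(dict(per_language))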