def prepare_mobvoihotwords( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}' dataset_parts = ['train', 'dev', 'test'] manifests = {} if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) # Maybe the manifests already exist: we can read them and save a bit of preparation time. manifests = read_manifests_if_cached(dataset_parts=dataset_parts, output_dir=output_dir) for part in dataset_parts: logging.info(f'Preparing MobvoiHotwords subset: {part}') if manifests_exist(part=part, output_dir=output_dir): logging.info( f'MobvoiHotwords subset: {part} already prepared - skipping.') continue # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text) recordings = [] supervisions = [] for prefix in ['p_', 'n_']: prefixed_part = prefix + part json_path = corpus_dir / 'mobvoi_hotword_dataset_resources' / f'{prefixed_part}.json' with open(json_path, 'r', encoding='utf-8') as f: json_data = json.load(f) for entry in json_data: idx = entry['utt_id'] speaker = idx if entry['speaker_id'] is None else entry[ 'speaker_id'] audio_path = corpus_dir / 'mobvoi_hotword_dataset' / f'{idx}.wav' text = 'FREETEXT' if entry['keyword_id'] == 0: text = 'HiXiaowen' elif entry['keyword_id'] == 1: text = 'NihaoWenwen' else: assert entry['keyword_id'] == -1 if not audio_path.is_file(): logging.warning(f'No such file: {audio_path}') continue recording = Recording.from_file(audio_path) recordings.append(recording) segment = SupervisionSegment(id=idx, recording_id=idx, start=0.0, duration=recording.duration, channel=0, language='Chinese', speaker=speaker, text=text.strip()) supervisions.append(segment) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_json(output_dir / f'supervisions_{part}.json') recording_set.to_json(output_dir / f'recordings_{part}.json') manifests[part] = { 'recordings': recording_set, 'supervisions': supervision_set } return manifests
def prepare_ali_meeting(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "far",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, "near" or "far", specifies whether to prepare the near-field or far-field data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AliMeeting data, please 'pip install textgrid' first.")
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["Train", "Eval", "Test"]:
        recordings = []
        supervisions = []
        # Eval and Test may further be inside another folder (since the "far" and "near" are grouped together).
        # Use a per-part directory so the adjustment does not leak into later iterations of the loop.
        part_dir = corpus_dir
        if part == "Eval" or part == "Test":
            part_dir = (part_dir / f"{part}_Ali"
                        if (part_dir / f"{part}_Ali").is_dir() else part_dir)

        wav_paths = part_dir / f"{part}_Ali_{mic}" / "audio_dir"
        text_paths = part_dir / f"{part}_Ali_{mic}" / "textgrid_dir"

        # For the 'near' setting:
        #  - wav files have names like R0003_M0046_F_SPK0093.wav
        #  - textgrid files have names like R0003_M0046_F_SPK0093.TextGrid
        #  - speaker ID information is present in the file name itself
        # For the 'far' setting:
        #  - wav files have names like R0015_M0151_MS002.wav
        #  - textgrid files have names like R0015_M015.TextGrid
        #  - speaker ID information is present inside the TextGrid file

        for text_path in tqdm(list(text_paths.rglob("*.TextGrid")),
                              desc=f"Preparing {part}"):
            session_id = text_path.stem

            if mic == "near":
                _, _, gender, spk_id = session_id.split("_")
                spk_id = spk_id[3:]  # SPK1953 -> 1953

            try:
                tg = textgrid.TextGrid.fromFile(str(text_path))
            except ValueError:
                logging.warning(
                    f"{session_id} has annotation issues. Skipping this recording."
                )
                continue

            wav_path = list(wav_paths.rglob(f"{session_id}*.wav"))[0]

            recording = Recording.from_file(wav_path, recording_id=session_id)
            recordings.append(recording)

            for tier in tg.tiers:
                if mic == "far":
                    parts = tier.name.split("_")
                    if len(parts) == 4:
                        _, _, gender, spk_id = parts
                    elif len(parts) == 2:
                        gender, spk_id = parts
                    spk_id = spk_id[3:]  # SPK1953 -> 1953

                for i, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{session_id}-{spk_id}-{i}",
                            recording_id=recording.id,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            gender=gender,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        # Fix and validate the manifests (e.g. remove supervisions without recordings).
        recording_set, supervision_set = fix_manifests(
            RecordingSet.from_recordings(recordings),
            SupervisionSet.from_segments(supervisions),
        )
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part.lower()}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part.lower()}.jsonl")

        manifests[part.lower()] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
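# A minimal usage sketch for the AliMeeting recipe above. The corpus path below is a
# placeholder; CutSet is assumed to come from the top-level lhotse package, as in upstream Lhotse.
from lhotse import CutSet

ali_manifests = prepare_ali_meeting(
    corpus_dir="/data/AliMeeting",   # placeholder corpus location
    output_dir="data/manifests",
    mic="far",                       # or "near" for the headset recordings
)
# Build one cut per supervision segment for the training portion.
train_cuts = CutSet.from_manifests(
    recordings=ali_manifests["train"]["recordings"],
    supervisions=ali_manifests["train"]["supervisions"],
).trim_to_supervisions()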
def prepare_librimix( librimix_csv: Pathlike, output_dir: Optional[Pathlike] = None, with_precomputed_mixtures: bool = False, sampling_rate: int = 16000, min_segment_seconds: Seconds = 3.0 ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: import pandas as pd assert Path(librimix_csv).is_file(), f'No such file: {librimix_csv}' df = pd.read_csv(librimix_csv) if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) manifests = defaultdict(dict) # First, create the audio manifest that specifies the pairs of source recordings # to be mixed together. audio_sources = RecordingSet.from_recordings( Recording( id=row['mixture_ID'], sources=[ AudioSource( type='file', channels=[0], source=row['source_1_path']), AudioSource( type='file', channels=[1], source=row['source_2_path']) ], sampling_rate=sampling_rate, num_samples=int(row['length']), duration=row['length'] / sampling_rate) for idx, row in df.iterrows() if row['length'] / sampling_rate > min_segment_seconds) supervision_sources = make_corresponding_supervisions(audio_sources) validate_recordings_and_supervisions(audio_sources, supervision_sources) if output_dir is not None: audio_sources.to_json(output_dir / 'recordings_sources.json') supervision_sources.to_json(output_dir / 'supervisions_sources.json') manifests['sources'] = { 'recordings': audio_sources, 'supervisions': supervision_sources } # When requested, create an audio manifest for the pre-computed mixtures. # A different way of performing the mix would be using Lhotse's on-the-fly # overlaying of audio Cuts. if with_precomputed_mixtures: audio_mix = RecordingSet.from_recordings( Recording(id=row['mixture_ID'], sources=[ AudioSource(type='file', channels=[0], source=row['mixture_path']), ], sampling_rate=sampling_rate, num_samples=int(row['length']), duration=row['length'] / sampling_rate) for idx, row in df.iterrows() if row['length'] / sampling_rate > min_segment_seconds) supervision_mix = make_corresponding_supervisions(audio_mix) validate_recordings_and_supervisions(audio_mix, supervision_mix) if output_dir is not None: audio_mix.to_json(output_dir / 'recordings_mix.json') supervision_mix.to_json(output_dir / 'supervisions_mix.json') manifests['premixed'] = { 'recordings': audio_mix, 'supervisions': supervision_mix } # When the LibriMix CSV specifies noises, we create a separate RecordingSet for them, # so that we can extract their features and overlay them as Cuts later. if 'noise_path' in df: audio_noise = RecordingSet.from_recordings( Recording(id=row['mixture_ID'], sources=[ AudioSource(type='file', channels=[0], source=row['noise_path']), ], sampling_rate=sampling_rate, num_samples=int(row['length']), duration=row['length'] / sampling_rate) for idx, row in df.iterrows() if row['length'] / sampling_rate > min_segment_seconds) supervision_noise = make_corresponding_supervisions(audio_noise) validate_recordings_and_supervisions(audio_noise, supervision_noise) if output_dir is not None: audio_noise.to_json(output_dir / 'recordings_noise.json') supervision_noise.to_json(output_dir / 'supervisions_noise.json') manifests['noise'] = { 'recordings': audio_noise, 'supervisions': supervision_noise } return manifests
def prepare_gale_arabic(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for GALE Arabic Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths into the recording sources.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(transcript_dirs), \
        "The same number of audio and transcript corpus dirs must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we make a dict to avoid adding them twice.
    audio_paths = defaultdict(
        Path,
        {
            p.stem: p
            for p in chain.from_iterable([
                check_and_rglob(dir, ext, strict=False)
                for dir in audio_dirs
                for ext in ["*.wav", "*.flac"]
            ])
        },
    )
    transcript_paths = chain.from_iterable(
        [check_and_rglob(dir, "*.tdf") for dir in transcript_dirs])
    transcript_paths = list(transcript_paths)

    logging.info("Preparing recordings manifest")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values())

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(parse_transcripts(transcript_paths))

    # Some supervisions exceed recording boundaries, so here we trim them.
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = defaultdict(dict)
    manifests["test"] = {
        "recordings": recordings.filter(lambda r: r.id in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id in TEST),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in TEST),
        "supervisions": supervisions.filter(lambda s: s.recording_id not in TEST),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "test"]:
            manifests[part]["recordings"].to_json(output_dir / f"recordings_{part}.json")
            manifests[part]["supervisions"].to_json(output_dir / f"supervisions_{part}.json")

    return manifests
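# A brief usage sketch for the GALE Arabic recipe above. The LDC package paths below are
# illustrative placeholders; pass whichever audio/transcript releases you actually hold,
# keeping the two lists aligned (same number of entries, matching corpora).
gale_manifests = prepare_gale_arabic(
    audio_dirs=["/data/LDC2013S02", "/data/LDC2013S07"],
    transcript_dirs=["/data/LDC2013T17", "/data/LDC2013T04"],
    output_dir="data/manifests",
    absolute_paths=True,
)
train_recordings = gale_manifests["train"]["recordings"]
train_supervisions = gale_manifests["train"]["supervisions"]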
def prepare_heroico(
        speech_dir: Pathlike,
        transcript_dir: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcript_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    speech_dir = Path(speech_dir)
    transcript_dir = Path(transcript_dir)
    assert speech_dir.is_dir(), f'No such directory: {speech_dir}'
    assert transcript_dir.is_dir(), f'No such directory: {transcript_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)

    # set some patterns to match fields in transcript files and filenames
    answers_line_pattern = re.compile(r"\d+/\d+\t.+")
    answers_path_pattern = re.compile('Answers_Spanish')
    heroico_recitations_line_pattern = re.compile(r"\d+\t.+")
    heroico_recitations_devtest_path_pattern = re.compile('Recordings_Spanish')
    heroico_recitations_train_path_pattern = re.compile('Recordings_Spanish')
    usma_line_pattern = re.compile(r"s\d+\t.+")
    usma_native_demo_pattern = re.compile(
        r"usma/native-[fm]-\w+-\S+-\S+-\S+-\S+-\w+\d+")
    usma_native_path_pattern = re.compile('usma/native')
    usma_native_prompt_id_pattern = re.compile(r's\d+')
    usma_nonnative_demo_pattern = re.compile(
        r"nonnative-[fm]-[a-zA-Z]+\d*-[a-zA-Z]+-[a-zA-Z]+-[a-zA-Z]+-[a-zA-Z]+-[a-zA-Z]+\d+"
    )
    usma_nonnative_path_pattern = re.compile(r'nonnative.+\.wav')

    # Generate a mapping: utt_id -> transcript
    transcripts = defaultdict(dict)

    # store answers transcripts
    answers_trans_path = Path(transcript_dir, heroico_dataset_answers)
    with open(answers_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            # some recordings do not have a transcript, skip them here
            if not answers_line_pattern.match(line):
                continue
            # IDs have the form speaker/prompt_id
            spk_utt, text = line.split(maxsplit=1)
            spk_id, prompt_id = spk_utt.split('/')
            utt_id = '-'.join(['answers', spk_id, prompt_id])
            transcripts[utt_id] = text

    # store heroico recitations transcripts
    heroico_recitations_trans_path = Path(transcript_dir, heroico_dataset_recordings)
    with open(heroico_recitations_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not heroico_recitations_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['heroico-recitations', idx])
            transcripts[utt_id] = text

    # store usma transcripts
    usma_trans_path = Path(transcript_dir, usma_dataset)
    with open(usma_trans_path, encoding='iso-8859-1') as f:
        for line in f:
            line = line.rstrip()
            if not usma_line_pattern.match(line):
                continue
            idx, text = line.split(maxsplit=1)
            utt_id = '-'.join(['usma', idx])
            transcripts[utt_id] = text

    # store utterance info
    audio_paths = speech_dir.rglob('*.wav')
    uttdata = {}
    for wav_file in audio_paths:
        wav_path = Path(wav_file)
        path_components = wav_path.parts
        pid = wav_path.stem
        if re.findall(answers_path_pattern, str(wav_file)):
            # store utterance info for Heroico Answers
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['answers', spk, pid])
            if utt_id not in transcripts:
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='train',
                speaker=spk,
                prompt_id=pid,
                subcorpus='answers',
                utterance_id=utt_id,
                transcript=transcripts[utt_id])
        elif re.findall(usma_native_path_pattern, str(wav_file)):
            # store utterance info for usma native data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_native_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
            if not usma_native_prompt_id_pattern.match(pid):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='test',
                speaker=spk,
                prompt_id=pid,
                subcorpus='usma',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif re.findall(usma_nonnative_path_pattern, str(wav_file)):
            # store utterance data for usma nonnative data
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['usma', spk, pid])
            trans_id = '-'.join(['usma', pid])
            if not usma_nonnative_demo_pattern.match(spk):
                uttdata[str(wav_file)] = None
                continue
            uttdata[str(wav_file)] = UttInfo(
                fold='test',
                speaker=spk,
                prompt_id=pid,
                subcorpus='usma',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif int(pid) <= 354 or int(pid) >= 562:
            # store utterance info for heroico recitations for train dataset
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations', spk, pid])
            trans_id = '-'.join(['heroico-recitations', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='train',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        elif int(pid) > 354 and int(pid) < 562:
            spk = wav_path.parts[-2]
            utt_id = '-'.join(['heroico-recitations-repeats', spk, pid])
            trans_id = '-'.join(['heroico-recitations-repeats', pid])
            uttdata[str(wav_file)] = UttInfo(
                fold='devtest',
                speaker=spk,
                prompt_id=pid,
                subcorpus='heroico-recitations-repeats',
                utterance_id=utt_id,
                transcript=transcripts[trans_id])
        else:
            logging.warning(f'No such file: {wav_file}')

    audio_paths = speech_dir.rglob('*.wav')
    audio_files = [w for w in audio_paths]

    for fld in folds:
        metadata = {}
        for wav_file in audio_files:
            wav_path = Path(wav_file)
            # skip files with no record
            if not uttdata[str(wav_file)]:
                continue
            # only process the current fold
            if uttdata[str(wav_file)].fold != fld:
                continue
            path_components = wav_path.parts
            prompt_id = wav_path.stem
            # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ...)
            # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
            info = torchaudio.info(str(wav_file))
            spk = wav_path.parts[-2]
            utt_id = '-'.join([uttdata[str(wav_file)].subcorpus, spk, prompt_id])
            metadata[utt_id] = HeroicoMetaData(
                audio_path=wav_file,
                audio_info=info[0],
                text=uttdata[str(wav_file)].transcript)

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.rate),
                      num_samples=metadata[idx].audio_info.length,
                      duration=(metadata[idx].audio_info.length /
                                metadata[idx].audio_info.rate))
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='Spanish',
                               speaker=idx.split('-')[-2],
                               text=metadata[idx].text)
            for idx in audio.recordings)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{fld}.json')
            audio.to_json(output_dir / f'recordings_{fld}.json')

        manifests[fld] = {'recordings': audio, 'supervisions': supervision}

    return manifests
def prepare_switchboard( audio_dir: Pathlike, transcripts_dir: Optional[Pathlike] = None, sentiment_dir: Optional[Pathlike] = None, output_dir: Optional[Pathlike] = None, omit_silence: bool = True, absolute_paths: bool = False ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Prepare manifests for the Switchboard corpus. We create two manifests: one with recordings, and the other one with text supervisions. When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations. :param audio_dir: Path to ``LDC97S62`` package. :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions"). If not provided, the transcripts will be downloaded. :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations for SWBD segments. :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing. :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept. :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``. """ if transcripts_dir is None: transcripts_dir = download_and_untar() audio_paths = check_and_rglob(audio_dir, '*.sph') text_paths = check_and_rglob(transcripts_dir, '*trans.text') groups = [] name_to_text = {p.stem.split('-')[0]: p for p in text_paths} for ap in audio_paths: name = ap.stem.replace('sw0', 'sw') groups.append({ 'audio': ap, 'text-0': name_to_text[f'{name}A'], 'text-1': name_to_text[f'{name}B'] }) recordings = RecordingSet.from_recordings( Recording.from_sphere( group['audio'], relative_path_depth=None if absolute_paths else 3) for group in groups) supervisions = SupervisionSet.from_segments( chain.from_iterable( make_segments(transcript_path=group[f'text-{channel}'], recording=recording, channel=channel, omit_silence=omit_silence) for group, recording in zip(groups, recordings) for channel in [0, 1])) validate_recordings_and_supervisions(recordings, supervisions) if sentiment_dir is not None: parse_and_add_sentiment_labels(sentiment_dir, supervisions) if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) recordings.to_json(output_dir / 'recordings.json') supervisions.to_json(output_dir / 'supervisions.json') return {'recordings': recordings, 'supervisions': supervisions}
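# A minimal usage sketch for the Switchboard recipe above, assuming the LDC97S62 audio has been
# obtained; the paths are placeholders. When transcripts_dir is omitted, the recipe downloads
# the "swb_ms98_transcriptions" for you (see the docstring above).
swbd = prepare_switchboard(
    audio_dir="/data/LDC97S62",
    transcripts_dir=None,        # auto-download the MS98 transcripts
    output_dir="data/manifests",
    omit_silence=True,
)
print(swbd["recordings"])
print(swbd["supervisions"])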
def prepare_aishell( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}' if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt' transcript_dict = {} with open(transcript_path, 'r', encoding='utf-8') as f: for line in f.readlines(): idx_transcript = line.split() transcript_dict[idx_transcript[0]] = ' '.join(idx_transcript[1:]) manifests = defaultdict(dict) dataset_parts = ['train', 'dev', 'test'] for part in dataset_parts: # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text) recordings = [] supervisions = [] wav_path = corpus_dir / 'data_aishell' / 'wav' / f'{part}' for audio_path in wav_path.rglob('**/*.wav'): idx = audio_path.stem speaker = audio_path.parts[-2] if idx not in transcript_dict: logging.warning(f'No transcript: {idx}') continue text = transcript_dict[idx] if not audio_path.is_file(): logging.warning(f'No such file: {audio_path}') continue recording = Recording.from_file(audio_path) recordings.append(recording) segment = SupervisionSegment(id=idx, recording_id=idx, start=0.0, duration=recording.duration, channel=0, language='Chinese', speaker=speaker, text=text.strip()) supervisions.append(segment) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) if output_dir is not None: supervision_set.to_json(output_dir / f'supervisions_{part}.json') recording_set.to_json(output_dir / f'recordings_{part}.json') manifests[part] = { 'recordings': recording_set, 'supervisions': supervision_set } return manifests
def prepare_ami(
        data_dir: Pathlike,
        output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is ('train', 'dev', 'eval'), and the value is Dicts with keys 'audio' and 'supervisions'.
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    anotation_lists = parse_ami_annotations(data_dir / 'annotations.gzip')
    wav_dir = data_dir / 'wav_db'
    audio_paths = list(wav_dir.rglob('*.wav'))

    manifests = defaultdict(dict)

    for part in dataset_parts:
        # Audio
        recordings = []
        for audio_path in audio_paths:
            audio_idx = audio_path.name
            if re.sub(r'\..*', '', audio_idx) not in dataset_parts[part]:
                continue
            if audio_idx not in anotation_lists:
                logging.warning(f'No annotation found for {audio_idx}')
                continue
            audio_info = torchaudio.info(str(audio_path))[0]
            recordings.append(
                Recording(
                    id=audio_idx,
                    sources=[
                        AudioSource(type='file',
                                    channels=[0],
                                    source=str(audio_path))
                    ],
                    sampling_rate=int(audio_info.rate),
                    num_samples=audio_info.length,
                    # Keep the exact duration in seconds (do not truncate to int),
                    # otherwise supervisions may exceed the recording length.
                    duration=audio_info.length / audio_info.rate,
                ))
        if len(recordings) == 0:
            continue
        audio = RecordingSet.from_recordings(recordings)

        # Supervisions
        segments_by_pause = []
        for idx in audio.recordings:
            anotation = anotation_lists[idx]
            for seg_idx, seg_info in enumerate(anotation):
                for subseg_idx, subseg_info in enumerate(seg_info):
                    duration = subseg_info.end_time - subseg_info.begin_time
                    if duration > 0:
                        segments_by_pause.append(
                            SupervisionSegment(
                                id=f'{idx}-{seg_idx}-{subseg_idx}',
                                recording_id=idx,
                                start=subseg_info.begin_time,
                                duration=duration,
                                channel=0,
                                language='English',
                                speaker=re.sub(r'-.*', r'', idx),
                                text=subseg_info.text))
        supervision = SupervisionSet.from_segments(segments_by_pause)

        if output_dir is not None:
            audio.to_json(output_dir / f'audio_{part}.json')
            supervision.to_json(output_dir / f'supervisions_{part}.json')

        manifests[part] = {'audio': audio, 'supervisions': supervision}

    return manifests
def prepare_aishell4( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'. """ if not is_module_available("textgrid"): raise ValueError( "To prepare AISHELL-4 data, please 'pip install textgrid' first.") import textgrid corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" manifests = defaultdict(dict) if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) global_spk_id = {} for part in ["train_L", "train_M", "train_S", "test"]: recordings = [] supervisions = [] wav_path = corpus_dir / part / "wav" for audio_path in wav_path.rglob("*.flac"): idx = audio_path.stem try: tg = textgrid.TextGrid.fromFile( f"{corpus_dir}/{part}/TextGrid/{idx}.TextGrid") except ValueError: logging.warning( f"{idx} has annotation issues. Skipping this recording.") continue recording = Recording.from_file(audio_path) recordings.append(recording) for tier in tg.tiers: local_spk_id = tier.name key = (idx, local_spk_id) if key not in global_spk_id: global_spk_id[key] = f"SPK{len(global_spk_id)+1:04d}" spk_id = global_spk_id[key] for j, interval in enumerate(tier.intervals): if interval.mark != "": start = interval.minTime end = interval.maxTime text = interval.mark segment = SupervisionSegment( id=f"{idx}-{spk_id}-{j}", recording_id=idx, start=start, duration=round(end - start, 4), channel=0, language="Chinese", speaker=spk_id, text=text.strip(), ) supervisions.append(segment) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_file(output_dir / f"aishell4_supervisions_{part}.jsonl.gz") recording_set.to_file(output_dir / f"aishell4_recordings_{part}.jsonl.gz") manifests[part] = { "recordings": recording_set, "supervisions": supervision_set } return manifests
def prepare_aspire(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: str = "single"
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the corpus dir (LDC2017S21).
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, the microphone type, either "single" or "multi".
    :return: a Dict whose key is the dataset part ('dev' and 'dev_test'), and the value is Dicts with the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert mic in [
        "single",
        "multi",
    ], f"mic must be either 'single' or 'multi', got {mic}"
    corpus_dir = corpus_dir / "IARPA-ASpIRE-Dev-Sets-v2.0" / "data"
    audio_dir = corpus_dir / "dev_and_dev_test_audio"
    stm_dir = corpus_dir / "dev_and_dev_test_STM_files"

    if mic == "single":
        audio_paths = {
            "dev": audio_dir / "ASpIRE_single_dev",
            "dev_test": audio_dir / "ASpIRE_single_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "dev.stm",
            "dev_test": stm_dir / "dev_test.stm",
        }
    else:
        audio_paths = {
            "dev": audio_dir / "ASpIRE_multi_dev",
            "dev_test": audio_dir / "ASpIRE_multi_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "multi_dev.stm",
            "dev_test": stm_dir / "multi_dev_test.stm",
        }
    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["dev", "dev_test"]:
        recordings = []
        supervisions = []

        # Prepare the recordings
        if mic == "single":
            recording_set = RecordingSet.from_dir(audio_paths[part], "*.wav")
        else:
            import soundfile as sf
            # group audios so that each entry is a session containing all channels
            audio_groups = {
                k: list(v)
                for k, v in itertools.groupby(
                    sorted(audio_paths[part].glob("*.wav")),
                    key=lambda x: "_".join(x.stem.split("_")[:-1]),
                )
            }
            for session_name, audios in audio_groups.items():
                audio_sf = sf.SoundFile(str(audios[0]))
                recordings.append(
                    Recording(
                        id=session_name,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[int(audio.stem[-2:]) - 1],
                                source=str(audio),
                            ) for audio in sorted(audios)
                        ],
                        sampling_rate=audio_sf.samplerate,
                        num_samples=audio_sf.frames,
                        duration=audio_sf.frames / audio_sf.samplerate,
                    ))
            recording_set = RecordingSet.from_recordings(recordings)

        # Read STM file and prepare segments
        segments = []
        with open(stm_file[part]) as f:
            for line in f:
                session, _, speaker, start, end, text = line.strip().split(maxsplit=5)
                segments.append(
                    AspireSegmentAnnotation(session, speaker, float(start),
                                            float(end), text))

        # Group the segments by session and speaker
        segments_grouped = defaultdict(list)
        for segment in segments:
            segments_grouped[(segment.session, segment.speaker)].append(segment)

        # Create the supervisions
        supervisions = []
        for k, segs in segments_grouped.items():
            session, speaker = k
            supervisions += [
                SupervisionSegment(
                    id=f"{session}-{speaker}-{i:03d}",
                    recording_id=session,
                    start=seg.start,
                    duration=round(seg.end - seg.start, 4),
                    speaker=speaker,
                    text=seg.text,
                    language="English",
                ) for i, seg in enumerate(segs)
            ]
        supervision_set = SupervisionSet.from_segments(supervisions)

        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"aspire_supervisions_{part}.jsonl.gz")
            recording_set.to_file(output_dir / f"aspire_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set
        }

    return manifests
def prepare_peoples_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare :class:`~lhotse.RecordingSet` and :class:`~lhotse.SupervisionSet` manifests
    for The People's Speech.

    The metadata is read lazily and written to manifests in a stream to
    minimize the CPU RAM usage. If you want to convert this data to a
    :class:`~lhotse.CutSet` without using excessive memory, we suggest to call it like::

        >>> peoples_speech = prepare_peoples_speech(corpus_dir=..., output_dir=...)
        >>> cuts = CutSet.from_manifests(
        ...     recordings=peoples_speech["recordings"],
        ...     supervisions=peoples_speech["supervisions"],
        ...     output_path=...,
        ...     lazy=True,
        ... )

    :param corpus_dir: Pathlike, the path of the main data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict with keys "recordings" and "supervisions" with lazily opened manifests.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    recs_path = output_dir / "peoples-speech_recordings.jsonl.gz"
    sups_path = output_dir / "peoples-speech_supervisions.jsonl.gz"

    if recs_path.is_file() and sups_path.is_file():
        # Nothing to do: just open the manifests in lazy mode.
        return {
            "recordings": RecordingSet.from_jsonl_lazy(recs_path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sups_path),
        }

    exist = 0
    tot = 0
    err = 0
    with RecordingSet.open_writer(recs_path) as rec_writer, \
            SupervisionSet.open_writer(sups_path) as sup_writer:
        for item in tqdm(
                # Note: People's Speech manifest.json is really a JSONL.
                load_jsonl(corpus_dir / "manifest.json"),
                desc="Converting People's Speech manifest.json to Lhotse manifests",
        ):
            for duration_ms, text, audio_path in zip(*item["training_data"].values()):
                full_path = corpus_dir / audio_path
                tot += 1
                if not full_path.exists():
                    # If we can't find some data, we'll just skip it here and report
                    # how many items were missing at the end.
                    continue
                exist += 1
                try:
                    audio_info = info(full_path)
                    duration = duration_ms / 1000
                    r = Recording(
                        id=full_path.stem,
                        sampling_rate=audio_info.samplerate,
                        num_samples=compute_num_samples(duration, audio_info.samplerate),
                        duration=duration,
                        sources=[
                            AudioSource(
                                type="file",
                                channels=[0],
                                source=str(full_path),
                            )
                        ],
                    )
                    s = SupervisionSegment(
                        id=r.id,
                        recording_id=r.id,
                        start=0,
                        duration=r.duration,
                        channel=0,
                        text=text,
                        language="English",
                        custom={"session_id": item["identifier"]},
                    )
                    validate_recordings_and_supervisions(recordings=r, supervisions=s)
                    rec_writer.write(r)
                    sup_writer.write(s)
                except Exception as e:
                    # If some files are missing (e.g. somebody is working on a subset
                    # of 30.000 hours), we won't interrupt processing; we will only
                    # do so for violated assertions.
                    if isinstance(e, AssertionError):
                        raise
                    err += 1
                    continue

    if exist < tot or err > 0:
        warnings.warn(
            f"We finished preparing The People's Speech Lhotse manifests. "
            f"Out of {tot} entries in the original manifest, {exist} audio files "
            f"were found, out of which {err} had errors during processing."
        )

    return {
        "recordings": rec_writer.open_manifest(),
        "supervisions": sup_writer.open_manifest(),
    }
def prepare_ljspeech( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None ) -> Dict[str, Union[RecordingSet, SupervisionSet]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}' if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) # Generate a mapping: utt_id -> (audio_path, audio_info, text) metadata_csv_path = corpus_dir / 'metadata.csv' assert metadata_csv_path.is_file(), f'No such file: {metadata_csv_path}' metadata = {} with open(metadata_csv_path) as f: for line in f: idx, text, _ = line.split('|') audio_path = corpus_dir / 'wavs' / f'{idx}.wav' if audio_path.is_file(): # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... ) # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...) info = torchaudio.info(str(audio_path)) metadata[idx] = LJSpeechMetaData(audio_path=audio_path, audio_info=info[0], text=text) else: logging.warning(f'No such file: {audio_path}') # Audio audio = RecordingSet.from_recordings( Recording(id=idx, sources=[ AudioSource(type='file', channels=[0], source=str(metadata[idx].audio_path)) ], sampling_rate=int(metadata[idx].audio_info.rate), num_samples=metadata[idx].audio_info.length, duration=(metadata[idx].audio_info.length / metadata[idx].audio_info.rate)) for idx in metadata) # Supervision supervision = SupervisionSet.from_segments( SupervisionSegment(id=idx, recording_id=idx, start=0.0, duration=audio.recordings[idx].duration, channel=0, language='English', gender='female', text=metadata[idx].text) for idx in audio.recordings) if output_dir is not None: supervision.to_json(output_dir / 'supervisions.json') audio.to_json(output_dir / 'audio.json') return {'audio': audio, 'supervisions': supervision}
def prepare_earnings22(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    normalize_text: bool = False,
) -> Tuple[RecordingSet, SupervisionSet]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the GitHub repository, notably
        the mp3 files will be searched for in [corpus_dir]/media and transcriptions
        in the directory [corpus_dir]/transcripts/nlp_references
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    media_dir = corpus_dir / "media"
    audio_files = list(media_dir.glob("*.mp3"))
    assert len(audio_files) == 125
    audio_files.sort()

    recording_set = RecordingSet.from_recordings(
        Recording.from_file(p) for p in audio_files)

    nlp_dir = corpus_dir / "transcripts" / "nlp_references"
    nlp_files = list(nlp_dir.glob("*.nlp"))
    assert len(nlp_files) == 125

    metadata = read_metadata(corpus_dir / "metadata.csv")

    nlp_files.sort()
    supervision_segments = list()
    for nlp_file in nlp_files:
        id = nlp_file.stem
        text = " ".join(parse_nlp_file(nlp_file))
        if normalize_text:
            text = normalize(text)

        s = SupervisionSegment(
            id=id,
            recording_id=id,
            start=0.0,
            duration=recording_set[id].duration,
            channel=0,
            language=f"English-{metadata[id][4]}",
            text=text,
        )
        supervision_segments.append(s)
    supervision_set = SupervisionSet.from_segments(supervision_segments)

    validate_recordings_and_supervisions(recording_set, supervision_set)
    if output_dir is not None:
        supervision_set.to_file(output_dir / "earnings22_supervisions_all.jsonl.gz")
        recording_set.to_file(output_dir / "earnings22_recordings_all.jsonl.gz")

    return recording_set, supervision_set
def create_recording(
        audio_path_and_rel_path_depth: Tuple[Pathlike, Union[int, None]]) -> Recording:
    audio_path, rel_path_depth = audio_path_and_rel_path_depth
    return Recording.from_file(audio_path, relative_path_depth=rel_path_depth)
def recording(file_source):
    return Recording(
        id='rec',
        sources=[file_source] * 2,
        sampling_rate=8000,
        num_samples=4000,
        duration=0.5)
def prepare_aishell( corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param corpus_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'. """ corpus_dir = Path(corpus_dir) assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}" if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) transcript_path = corpus_dir / "data_aishell/transcript/aishell_transcript_v0.8.txt" transcript_dict = {} with open(transcript_path, "r", encoding="utf-8") as f: for line in f.readlines(): idx_transcript = line.split() transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:]) manifests = defaultdict(dict) dataset_parts = ["train", "dev", "test"] for part in dataset_parts: # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text) recordings = [] supervisions = [] wav_path = corpus_dir / "data_aishell" / "wav" / f"{part}" for audio_path in wav_path.rglob("**/*.wav"): idx = audio_path.stem speaker = audio_path.parts[-2] if idx not in transcript_dict: logging.warning(f"No transcript: {idx}") continue text = transcript_dict[idx] if not audio_path.is_file(): logging.warning(f"No such file: {audio_path}") continue recording = Recording.from_file(audio_path) recordings.append(recording) segment = SupervisionSegment( id=idx, recording_id=idx, start=0.0, duration=recording.duration, channel=0, language="Chinese", speaker=speaker, text=text.strip(), ) supervisions.append(segment) recording_set = RecordingSet.from_recordings(recordings) supervision_set = SupervisionSet.from_segments(supervisions) validate_recordings_and_supervisions(recording_set, supervision_set) if output_dir is not None: supervision_set.to_json(output_dir / f"supervisions_{part}.json") recording_set.to_json(output_dir / f"recordings_{part}.json") manifests[part] = { "recordings": recording_set, "supervisions": supervision_set } return manifests
def dummy_recording():
    return Recording(
        id='irrelevant',
        sources=[AudioSource(type='file', channels=[0], source='irrelevant')],
        sampling_rate=16000,
        num_samples=160000,
        duration=10.0)
def load_kaldi_data_dir( path: Pathlike, sampling_rate: int, frame_shift: Optional[Seconds] = None, map_string_to_underscores: Optional[str] = None, num_jobs: int = 1, ) -> Tuple[RecordingSet, Optional[SupervisionSet], Optional[FeatureSet]]: """ Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests. For this to work, at least the wav.scp file must exist. SupervisionSet is created only when a segments file exists. All the other files (text, utt2spk, etc.) are optional, and some of them might not be handled yet. In particular, feats.scp files are ignored. :param map_string_to_underscores: optional string, when specified, we will replace all instances of this string in SupervisonSegment IDs to underscores. This is to help with handling underscores in Kaldi (see :func:`.export_to_kaldi`). This is also done for speaker IDs. """ path = Path(path) assert path.is_dir() def fix_id(t: str) -> str: if map_string_to_underscores is None: return t return t.replace(map_string_to_underscores, "_") # must exist for RecordingSet recordings = load_kaldi_text_mapping(path / "wav.scp", must_exist=True) with ProcessPoolExecutor(num_jobs) as ex: dur_vals = ex.map(get_duration, recordings.values()) durations = dict(zip(recordings.keys(), dur_vals)) recording_set = RecordingSet.from_recordings( Recording( id=recording_id, sources=[ AudioSource( type="command" if path_or_cmd.endswith("|") else "file", channels=[0], source=path_or_cmd[:-1] if path_or_cmd. endswith("|") else path_or_cmd, ) ], sampling_rate=sampling_rate, num_samples=compute_num_samples(durations[recording_id], sampling_rate), duration=durations[recording_id], ) for recording_id, path_or_cmd in recordings.items()) supervision_set = None segments = path / "segments" if segments.is_file(): with segments.open() as f: supervision_segments = [ sup_string.strip().split() for sup_string in f ] texts = load_kaldi_text_mapping(path / "text") speakers = load_kaldi_text_mapping(path / "utt2spk") genders = load_kaldi_text_mapping(path / "spk2gender") languages = load_kaldi_text_mapping(path / "utt2lang") supervision_set = SupervisionSet.from_segments( SupervisionSegment( id=fix_id(segment_id), recording_id=recording_id, start=float(start), duration=add_durations( float(end), -float(start), sampling_rate=sampling_rate), channel=0, text=texts[segment_id], language=languages[segment_id], speaker=fix_id(speakers[segment_id]), gender=genders[speakers[segment_id]], ) for segment_id, recording_id, start, end in supervision_segments) feature_set = None feats_scp = path / "feats.scp" if feats_scp.exists() and is_module_available("kaldiio"): if frame_shift is not None: import kaldiio from lhotse.features.io import KaldiReader feature_set = FeatureSet.from_features( Features( type="kaldiio", num_frames=mat.shape[0], num_features=mat.shape[1], frame_shift=frame_shift, sampling_rate=sampling_rate, start=0, duration=mat.shape[0] * frame_shift, storage_type=KaldiReader.name, storage_path=str(feats_scp), storage_key=utt_id, recording_id=supervision_set[utt_id]. recording_id if supervision_set is not None else utt_id, channels=0, ) for utt_id, mat in kaldiio.load_scp_sequential(str(feats_scp))) else: warnings.warn("Failed to import Kaldi 'feats.scp' to Lhotse: " "frame_shift must be not None. " "Feature import omitted.") return recording_set, supervision_set, feature_set
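# A brief sketch of importing a Kaldi data dir with the helper above. The sampling rate must be
# given explicitly because wav.scp alone does not carry it; frame_shift is only needed when you
# also want to import feats.scp (which requires 'pip install kaldiio'). Paths are placeholders.
recording_set, supervision_set, feature_set = load_kaldi_data_dir(
    path="data/train",          # a Kaldi dir containing at least wav.scp
    sampling_rate=16000,
    frame_shift=0.01,           # 10 ms frames, matching the Kaldi feature config
    map_string_to_underscores=None,
    num_jobs=4,
)
if supervision_set is not None:
    print(f"Loaded {len(supervision_set)} supervision segments")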
def recording():
    return Recording.from_file(
        "test/fixtures/libri/libri-1088-134315-0000.wav")
def prepare_ami( data_dir: Pathlike, output_dir: Optional[Pathlike] = None, ) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]: """ Returns the manifests which consist of the Recordings and Supervisions :param data_dir: Pathlike, the path of the data dir. :param output_dir: Pathlike, the path where to write the manifests. :return: a Dict whose key is ('train', 'dev', 'eval'), and the value is Dicts with keys 'audio' and 'supervisions'. """ data_dir = Path(data_dir) assert data_dir.is_dir(), f'No such directory: {data_dir}' if output_dir is not None: output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) anotation_lists = parse_ami_annotations(data_dir / 'annotations.gzip') # Create a mapping from a tuple of (session_id, channel) to the list of annotations. # This way we can map the supervisions to the right channels in a multi-channel recording. annotation_by_id_and_channel = { (filename.split('.')[0], int(filename[-5])): annotations for filename, annotations in anotation_lists.items() } wav_dir = data_dir / 'wav_db' audio_paths = wav_dir.rglob('*.wav') # Group together multiple channels from the same session. # We will use that to create a Recording with multiple sources (channels). from cytoolz import groupby channel_wavs = groupby(lambda p: p.parts[-3], audio_paths) manifests = defaultdict(dict) for part in dataset_parts: # Audio recordings = [] for session_name, channel_paths in channel_wavs.items(): if session_name not in dataset_parts[part]: continue audio_info = torchaudio.info(str(channel_paths[0]))[0] recordings.append(Recording( id=session_name, sources=[ AudioSource( type='file', channels=[idx], source=str(audio_path) ) for idx, audio_path in enumerate(sorted(channel_paths)) ], sampling_rate=int(audio_info.rate), num_samples=audio_info.length, duration=audio_info.length / audio_info.rate, )) audio = RecordingSet.from_recordings(recordings) # Supervisions segments_by_pause = [] for recording in audio: for source in recording.sources: # In AMI "source.channels" will always be a one-element list channel, = source.channels anotation = annotation_by_id_and_channel.get((recording.id, channel)) if anotation is None: logging.warning(f'No annotation found for recording "{recording.id}" channel {channel} ' f'(file {source.source})') continue for seg_idx, seg_info in enumerate(anotation): for subseg_idx, subseg_info in enumerate(seg_info): duration = subseg_info.end_time - subseg_info.begin_time if duration > 0: segments_by_pause.append(SupervisionSegment( id=f'{recording.id}-{seg_idx}-{subseg_idx}', recording_id=recording.id, start=subseg_info.begin_time, duration=duration, channel=channel, language='English', speaker=subseg_info.speaker, gender=subseg_info.gender, text=subseg_info.text )) supervision = SupervisionSet.from_segments(segments_by_pause) if output_dir is not None: audio.to_json(output_dir / f'recordings_{part}.json') supervision.to_json(output_dir / f'supervisions_{part}.json') manifests[part] = { 'recordings': audio, 'supervisions': supervision } return manifests
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    type: str = "mdm",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    NOTE: The recordings contain all 7 channels. If you want to use only one channel, you can
    use either ``recording.load_audio(channels=0)`` or ``MonoCut(id=...,recording=recording,channel=0)``
    while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param type: str, the type of data to prepare ('mdm', 'sdm', 'ihm-mix', or 'ihm'). These settings
        are similar to the ones in AMI and ICSI recipes.
    :return: a Dict with the keys 'recordings' and 'supervisions'.
    """
    assert type in ["mdm", "sdm", "ihm-mix", "ihm"]

    manifests = {}

    corpus_dir = Path(corpus_dir)
    corpus_dir = (
        corpus_dir / "for_release" if corpus_dir.stem != "for_release" else corpus_dir
    )

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        for session in (corpus_dir / ov).iterdir():
            _, _, _, _, _, name, actual_ov = session.name.split("_")
            actual_ov = float(actual_ov.split("actual")[1])
            recording_id = f"{ov}_{name}"
            audio_path = (
                session / "clean" / "mix.wav"
                if type == "ihm-mix"
                else session / "clean" / "each_spk.wav"
                if type == "ihm"
                else session / "record" / "raw_recording.wav"
            )
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id)
            )
            for idx, seg in enumerate(
                parse_transcript(session / "transcription" / "meeting_info.txt")
            ):
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm"
                        else 0,
                    )
                )

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_file(output_dir / "libricss_recordings_all.jsonl.gz")
        supervisions.to_file(output_dir / "libricss_supervisions_all.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
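# A short sketch of using the LibriCSS manifests above with a single channel, as suggested in
# the NOTE of the docstring. The corpus path is a placeholder; CutSet is assumed to come from
# the top-level lhotse package, as in upstream Lhotse.
from lhotse import CutSet

libricss = prepare_libricss(corpus_dir="/data/libri_css", type="mdm")
cuts = CutSet.from_manifests(
    recordings=libricss["recordings"],
    supervisions=libricss["supervisions"],
)
# Each recording has 7 channels; pick the first channel when loading audio.
first_cut = next(iter(cuts))
mono_audio = first_cut.recording.load_audio(channels=0)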
def prepare_cslu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    normalize_text: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1 and 4)
    for scripted utterances (see https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt
    or top documentation in this script for more information).

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous speech transcripts (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir, str) else corpus_dir

    # Get list of all recordings
    audio_paths = check_and_rglob(corpus_dir, '*.wav')

    # Read verification labels
    verification = {}
    for file in check_and_rglob(corpus_dir, "*-verified.txt"):
        with open(file, 'r') as f:
            for line in f:
                path, label = line.strip().split()
                utt = Path(path).stem
                verification[utt] = int(label)

    # Read prompted transcriptions
    prompts = {}
    with open(corpus_dir / 'docs' / 'all.map', 'r') as f:
        for line in f:
            if line.strip() != "":
                prompt, text = line.strip().split(maxsplit=1)
                prompts[prompt] = text[1:-1]  # remove " " around the text

    recordings = []
    supervisions = []
    for p in tqdm(audio_paths, desc="Preparing manifests"):
        # /data/corpora/LDC2007S18/speech/scripted/00/0/ks001/ks001000.wav
        uttid = p.stem                              # ks001000
        spk = p.parent.stem                         # ks001
        cat = p.parent.parent.stem                  # 0
        prompt = p.parent.parent.parent.stem        # 00
        type = p.parent.parent.parent.parent.stem   # scripted
        recording = Recording.from_file(
            p, relative_path_depth=None if absolute_paths else 3)
        recordings.append(recording)
        if type == "scripted":
            text = prompts[prompt]
            verification_label = verification[uttid] if uttid in verification else None
            custom = {'type': type, 'verification_label': verification_label}
        elif type == "spontaneous":
            text = read_text(
                corpus_dir / 'trans' / type / prompt / cat / spk / f'{uttid}.txt',
                normalize=normalize_text,
            )
            custom = {'type': type}
        supervisions.append(
            SupervisionSegment(
                id=uttid,
                recording_id=uttid,
                start=0,
                duration=recording.duration,
                speaker=spk,
                language='English',
                text=text,
                custom=custom,
            ))

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / 'recordings.json')
        manifests["supervisions"].to_json(output_dir / 'supervisions.json')

    return manifests
def prepare_timit(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_phones: int = 48,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write and save the manifests.
    :param num_phones: int, the number of phones (60, 48 or 39) used for modeling; 48 is the default.
    :param num_jobs: int, the number of jobs used for parallel processing.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["TRAIN", "DEV", "TEST"]

    phones_dict = {}

    if num_phones in [60, 48, 39]:
        phones_dict = get_phonemes(num_phones)
    else:
        raise ValueError("The value of num_phones must be in [60, 48, 39].")

    dev_spks, test_spks = get_speakers()

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in dataset_parts:
            wav_files = []
            if part == "TRAIN":
                wav_files = glob.glob(str(corpus_dir) + "/TRAIN/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
            elif part == "DEV":
                wav_files = glob.glob(str(corpus_dir) + "/TEST/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
                wav_files = list(
                    filter(lambda x: x.split("/")[-2].lower() in dev_spks, wav_files))
            else:
                wav_files = glob.glob(str(corpus_dir) + "/TEST/*/*/*.WAV")
                # filter the SA (dialect sentences)
                wav_files = list(
                    filter(lambda x: x.split("/")[-1][:2] != "SA", wav_files))
                wav_files = list(
                    filter(lambda x: x.split("/")[-2].lower() in test_spks, wav_files))

            logging.debug(f"{part} dataset manifest generation.")
            recordings = []
            supervisions = []

            for wav_file in tqdm(wav_files):
                items = str(wav_file).strip().split("/")
                idx = items[-2] + "-" + items[-1][:-4]
                speaker = items[-2]
                transcript_file = Path(wav_file).with_suffix(".PHN")
                if not Path(wav_file).is_file():
                    logging.warning(f"No such file: {wav_file}")
                    continue
                if not Path(transcript_file).is_file():
                    logging.warning(f"No transcript: {transcript_file}")
                    continue
                text = []
                with open(transcript_file, "r") as f:
                    lines = f.readlines()
                    for line in lines:
                        phone = line.rstrip("\n").split(" ")[-1]
                        if num_phones != 60:
                            phone = phones_dict[str(phone)]
                        text.append(phone)
                    text = " ".join(text).replace("h#", "sil")
                recording = Recording.from_file(path=wav_file, recording_id=idx)
                recordings.append(recording)
                segment = SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="English",
                    speaker=speaker,
                    text=text.strip(),
                )
                supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)
            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_json(output_dir / f"supervisions_{part}.json")
                recording_set.to_json(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Optional[Tuple[str]] = dataset_parts_mini,
    output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace('-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(
                            audio_path=audio_path, audio_info=info[0], text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path))
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=(metadata[idx].audio_info.length / metadata[idx].audio_info.rate))
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='English',
                speaker=re.sub(r'-.*', r'', idx),
                text=metadata[idx].text.strip())
            for idx in audio.recordings)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {'recordings': audio, 'supervisions': supervision}

    return manifests
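# Usage sketch (illustrative): preparing only the 'dev-clean' split of LibriSpeech.
# The corpus path is hypothetical; it should contain a 'dev-clean' subdirectory.
def _example_prepare_librispeech() -> None:
    ls_manifests = prepare_librispeech(
        corpus_dir="/data/corpora/LibriSpeech",
        dataset_parts=("dev-clean",),
        output_dir="data/manifests/librispeech",
    )
    dev_clean = ls_manifests["dev-clean"]
    print(len(dev_clean["recordings"]), len(dev_clean["supervisions"]))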
def dummy_recording(unique_id: int) -> Recording:
    return Recording(
        id=f'dummy-recording-{unique_id:04d}',
        sources=[],
        sampling_rate=16000,
        num_samples=16000,
        duration=1.0,
    )
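# Illustrative sketch: dummy_recording is convenient in tests for building a small
# in-memory RecordingSet without touching any audio files on disk.
def _example_dummy_recording_set() -> RecordingSet:
    return RecordingSet.from_recordings(dummy_recording(i) for i in range(3))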
def load_kaldi_data_dir(
    path: Pathlike,
    sampling_rate: int
) -> Tuple[RecordingSet, Optional[SupervisionSet]]:
    """
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and SupervisionSet manifests.
    For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists.
    All the other files (text, utt2spk, etc.) are optional, and some of them might not be handled yet.
    In particular, feats.scp files are ignored.
    """
    path = Path(path)
    assert path.is_dir()

    # must exist for RecordingSet
    recordings = load_kaldi_text_mapping(path / 'wav.scp', must_exist=True)

    durations = defaultdict(float)
    reco2dur = path / 'reco2dur'
    if not reco2dur.is_file():
        raise ValueError(
            f"No such file: '{reco2dur}' -- fix it by running: utils/data/get_reco2dur.sh <data-dir>"
        )
    with reco2dur.open() as f:
        for line in f:
            recording_id, dur = line.strip().split()
            durations[recording_id] = float(dur)

    audio_set = RecordingSet.from_recordings(
        Recording(
            id=recording_id,
            sources=[
                AudioSource(
                    type='command' if path_or_cmd.endswith('|') else 'file',
                    channels=[0],
                    source=path_or_cmd[:-1] if path_or_cmd.endswith('|') else path_or_cmd
                )
            ],
            sampling_rate=sampling_rate,
            num_samples=int(durations[recording_id] * sampling_rate),
            duration=durations[recording_id]
        )
        for recording_id, path_or_cmd in recordings.items()
    )

    # must exist for SupervisionSet
    segments = path / 'segments'
    if not segments.is_file():
        return audio_set, None

    with segments.open() as f:
        supervision_segments = [l.strip().split() for l in f]

    texts = load_kaldi_text_mapping(path / 'text')
    speakers = load_kaldi_text_mapping(path / 'utt2spk')
    genders = load_kaldi_text_mapping(path / 'spk2gender')
    languages = load_kaldi_text_mapping(path / 'utt2lang')

    supervision_set = SupervisionSet.from_segments(
        SupervisionSegment(
            id=segment_id,
            recording_id=recording_id,
            start=float(start),
            duration=float(end) - float(start),
            channel=0,
            text=texts[segment_id],
            language=languages[segment_id],
            speaker=speakers[segment_id],
            gender=genders[speakers[segment_id]]
        )
        for segment_id, recording_id, start, end in supervision_segments
    )

    return audio_set, supervision_set
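# Usage sketch (illustrative): importing an existing Kaldi data directory. The path
# below is hypothetical; the directory must contain at least wav.scp and reco2dur.
def _example_load_kaldi_data_dir() -> None:
    recording_set, supervision_set = load_kaldi_data_dir(
        path="/data/kaldi/data/train", sampling_rate=16000
    )
    print(f"Loaded {len(recording_set)} recordings.")
    if supervision_set is not None:
        print(f"Loaded {len(supervision_set)} supervision segments.")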