def parse_utterance(
    row: Any, lang_path: Path, language: str
) -> Tuple[Recording, SupervisionSegment]:
    """
    Build the ``Recording`` and ``SupervisionSegment`` for one metadata row.

    :param row: metadata record; its ``path``, ``client_id``, ``sentence``,
        ``gender``, ``age`` and ``accent`` attributes are read.
    :param lang_path: directory that contains the ``clips`` sub-directory.
    :param language: language code; translated through ``COMMONVOICE_CODE2LANG``
        and used verbatim when the code has no entry there.
    :return: a ``(recording, segment)`` tuple; the segment spans the whole recording.
    :raises ValueError: when the audio clip does not exist on disk.
    """

    def _none_if_nan(value):
        # The code treats the literal string "nan" as "field not provided".
        return None if value == "nan" else value

    clip_path = lang_path / "clips" / row.path
    if not clip_path.is_file():
        raise ValueError(f"No such file: {clip_path}")

    # The clip file name without its extension doubles as recording and segment ID.
    utt_id = Path(row.path).stem
    recording = Recording.from_file(clip_path, recording_id=utt_id)

    custom_fields = {
        "age": _none_if_nan(row.age),
        "accent": _none_if_nan(row.accent),
    }
    segment = SupervisionSegment(
        id=utt_id,
        recording_id=utt_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language=COMMONVOICE_CODE2LANG.get(language, language),
        speaker=row.client_id,
        text=row.sentence.strip(),
        gender=_none_if_nan(row.gender),
        custom=custom_fields,
    )
    return recording, segment
def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """
    Turn one transcript line into a ``(Recording, SupervisionSegment)`` pair.

    :param dataset_split_path: root directory of the dataset split.
    :param line: transcript line of the form ``"<recording_id> <text>"``.
    :param alignments: mapping from recording ID to its word alignment items;
        recordings absent from the mapping get ``alignment=None``.
    :return: the pair, or ``None`` (after logging a warning) when the audio
        file is missing.
    """
    utt_id, transcript = line.strip().split(maxsplit=1)

    # An ID "a-b-c" is stored as "<split>/a/b/a-b-c.flac".
    flac_dir = dataset_split_path / Path(utt_id.replace("-", "/")).parent
    flac_path = flac_dir / f"{utt_id}.flac"
    if not flac_path.is_file():
        logging.warning(f"No such file: {flac_path}")
        return None

    recording = Recording.from_file(flac_path, recording_id=utt_id)

    word_alignment = None
    if utt_id in alignments:
        word_alignment = {"word": alignments[utt_id]}

    # The speaker is whatever precedes the first dash of the recording ID.
    speaker_id = re.sub(r"-.*", r"", recording.id)
    segment = SupervisionSegment(
        id=utt_id,
        recording_id=utt_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        speaker=speaker_id,
        text=transcript.strip(),
        alignment=word_alignment,
    )
    return recording, segment
def test_recording_from_bytes():
    """A recording built from raw bytes must decode to the same samples as the file-backed one."""
    path = "test/fixtures/mono_c0.wav"
    recording = Recording.from_file(path)
    # Read through a context manager so the file handle is closed right away
    # instead of being left for the garbage collector (ResourceWarning).
    with open(path, "rb") as f:
        audio_bytes = f.read()
    memory_recording = Recording.from_bytes(
        data=audio_bytes,
        recording_id=recording.id,
    )
    np.testing.assert_equal(memory_recording.load_audio(), recording.load_audio())
def test_opus_stereo_recording_from_file_force_sampling_rate():
    """Forcing 8 kHz on the stereo OPUS fixture must be reflected in metadata and decoded audio."""
    recording = Recording.from_file(
        "test/fixtures/stereo.opus", force_opus_sampling_rate=8000
    )
    assert recording.sampling_rate == 8000
    assert isclose(recording.duration, 1.0055)

    audio = recording.load_audio()
    n_channels, n_samples = audio.shape
    # The decoded array has to agree with what the manifest advertises.
    assert n_channels == recording.num_channels
    assert n_samples == recording.num_samples
    assert n_samples == 8044
def test_save_audio(libri_cut, ext):
    """Audio saved from a cut must round-trip: same samples and same duration when read back."""
    with NamedTemporaryFile(suffix=ext) as tmp:
        stored_cut = libri_cut.save_audio(tmp.name)
        expected_samples = libri_cut.load_audio()

        # Re-open the written file as an independent recording.
        reloaded = Recording.from_file(tmp.name)
        actual_samples = reloaded.load_audio()

        assert np.array_equal(expected_samples, actual_samples)
        assert reloaded.duration == libri_cut.duration
        assert reloaded.duration == stored_cut.duration
        assert libri_cut.duration == stored_cut.duration
def test_store_audio(libri_cut):
    """``compute_and_store_recording`` must write audio that reads back with identical samples and duration."""
    with NamedTemporaryFile() as tmp:
        stored_cut = libri_cut.compute_and_store_recording(tmp.name)
        expected_samples = libri_cut.load_audio()

        # Re-open the written file as an independent recording.
        reloaded = Recording.from_file(tmp.name)
        actual_samples = reloaded.load_audio()

        assert np.array_equal(expected_samples, actual_samples)
        assert reloaded.duration == libri_cut.duration
        assert reloaded.duration == stored_cut.duration
        assert libri_cut.duration == stored_cut.duration
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for 1997 English Broadcast News corpus.
    Three manifests are created: recordings, segment supervisions and section
    supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: When ``False``, recordings are created with
        ``relative_path_depth=3``; when ``True``, with ``relative_path_depth=None``.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    sph_files = check_and_rglob(audio_dir, "*.sph")
    sgml_files = check_and_rglob(transcripts_dir, "*.sgml")

    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(sph, relative_path_depth=path_depth) for sph in sph_files
    )

    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive
    # implementation; on some systems the recursion limit needs to be raised for this to work.
    with recursion_limit(5000):
        # NOTE(review): transcripts and recordings are paired purely by position.
        per_file_supervisions = []
        for sgml, rec in zip(sgml_files, recordings):
            per_file_supervisions.append(make_supervisions(sgml, rec))

    section_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(entry["sections"] for entry in per_file_supervisions)
    )
    segment_supervisions = SupervisionSet.from_segments(
        chain.from_iterable(entry["segments"] for entry in per_file_supervisions)
    )
    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "broadcast-news_recordings_all.jsonl.gz")
        section_supervisions.to_file(
            output_dir / "broadcast-news_sections_all.jsonl.gz"
        )
        segment_supervisions.to_file(
            output_dir / "broadcast-news_segments_all.jsonl.gz"
        )

    return {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }
def create_recording(
    audio_path_and_rel_path_depth: Tuple[Pathlike, Union[int, None]]
) -> Optional[Recording]:
    """
    Create a ``Recording`` from a packed ``(audio_path, relative_path_depth)`` argument.

    :param audio_path_and_rel_path_depth: two-element tuple holding the audio
        path and the ``relative_path_depth`` forwarded to ``Recording.from_file``.
    :return: the recording, or ``None`` when ``Recording.from_file`` raises
        ``CalledProcessError``.
    """
    path, depth = audio_path_and_rel_path_depth
    try:
        rec = Recording.from_file(path, relative_path_depth=depth)
    except CalledProcessError:
        # Best effort: an unreadable file is reported to the caller as None.
        return None
    return rec
def test_opus_recording_from_file():
    """Check metadata and decoded shape of the mono OPUS fixture read with default settings."""
    recording = Recording.from_file("test/fixtures/mono_c0.opus")
    # OPUS always overrides the sampling rate to 48000
    assert recording.sampling_rate == 48000
    # OPUS may create extra audio frames / samples...
    assert isclose(recording.duration, 0.5054166666666666)

    audio = recording.load_audio()
    n_channels, n_samples = audio.shape
    # The decoded array has to agree with what the manifest advertises.
    assert n_channels == recording.num_channels
    assert n_samples == recording.num_samples
    assert n_samples == 24260
def prepare_ljspeech(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    Each line of ``<corpus_dir>/metadata.csv`` is expected to hold three
    ``|``-separated fields; the first is the recording ID and the second is
    used as the transcript. Audio is read from ``<corpus_dir>/wavs/<id>.wav``;
    entries whose audio file is missing are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
        Can be omitted to avoid writing.
    :return: A dict with the keys ``'recordings'`` (RecordingSet) and
        ``'supervisions'`` (SupervisionSet).
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    metadata_csv_path = corpus_dir / "metadata.csv"
    assert metadata_csv_path.is_file(), f"No such file: {metadata_csv_path}"

    recordings = []
    supervisions = []
    # Decode explicitly as UTF-8: relying on the platform default encoding makes
    # transcripts with non-ASCII characters fail (or get garbled) on some systems.
    with open(metadata_csv_path, encoding="utf-8") as f:
        for line in f:
            recording_id, text, _ = line.split("|")
            audio_path = corpus_dir / "wavs" / f"{recording_id}.wav"
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            # One supervision spanning the whole recording.
            segment = SupervisionSegment(
                id=recording_id,
                recording_id=recording_id,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="English",
                gender="female",
                text=text,
            )
            recordings.append(recording)
            supervisions.append(segment)

    recording_set = RecordingSet.from_recordings(recordings)
    supervision_set = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_json(output_dir / "supervisions.json")
        recording_set.to_json(output_dir / "recordings.json")

    return {"recordings": recording_set, "supervisions": supervision_set}
def test_opus_stereo_recording_from_file_force_sampling_rate_read_chunk():
    """Reading a sub-span of a resampled OPUS file must match the same span of the full decode."""
    recording = Recording.from_file(
        "test/fixtures/stereo.opus", force_opus_sampling_rate=8000
    )
    assert recording.sampling_rate == 8000
    assert isclose(recording.duration, 1.0055)

    full_audio = recording.load_audio()
    chunk = recording.load_audio(offset=0.5, duration=0.25)

    n_channels, n_samples = chunk.shape
    assert n_channels == recording.num_channels
    # 0.25 s at 8 kHz -> 2000 samples, starting at sample 0.5 s * 8 kHz = 4000.
    assert n_samples == 2000
    np.testing.assert_almost_equal(chunk, full_audio[:, 4000:6000], decimal=5)
def test_recording_from_sphere(relative_path_depth, expected_source_path):
    """The stereo SPHERE fixture must yield the expected manifest, including the stored source path."""
    expected = Recording(
        id="stereo",
        sampling_rate=8000,
        num_samples=8000,
        duration=1.0,
        sources=[
            AudioSource(type="file", channels=[0, 1], source=expected_source_path)
        ],
    )
    actual = Recording.from_file(
        "test/fixtures/stereo.sph", relative_path_depth=relative_path_depth
    )
    assert actual == expected
def parse_utterance(
    dataset_split_path: Path,
    line: str,
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    """
    Turn one transcript line into a ``(Recording, SupervisionSegment)`` pair.

    :param dataset_split_path: root directory of the dataset split.
    :param line: transcript line of the form ``"<recording_id> <text>"``.
    :return: the pair, or ``None`` (after logging a warning) when the audio
        file is missing.
    """
    utt_id, transcript = line.strip().split(maxsplit=1)

    # An ID "a-b-c" is stored as "<split>/a/b/a-b-c.flac".
    flac_dir = dataset_split_path / Path(utt_id.replace('-', '/')).parent
    flac_path = flac_dir / f'{utt_id}.flac'
    if not flac_path.is_file():
        logging.warning(f'No such file: {flac_path}')
        return None

    recording = Recording.from_file(flac_path, recording_id=utt_id)

    # The speaker is whatever precedes the first dash of the recording ID.
    speaker_id = re.sub(r'-.*', r'', recording.id)
    segment = SupervisionSegment(
        id=utt_id,
        recording_id=utt_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language='English',
        speaker=speaker_id,
        text=transcript.strip(),
    )
    return recording, segment
def _prepare_dataset( dataset: List[Pathlike], ) -> Tuple[List[Recording], List[SupervisionSegment]]: """Build a list of Recording and SupervisionSegment from a list of sound filenames. :param dataset: List[Pathlike], a list of sound filenames :return: a tuple containing a list of Recording and a list of SupervisionSegment """ word_map = {"0": "NO", "1": "YES"} recordings = [] supervisions = [] for audio_path in dataset: words = audio_path.stem.split("_") assert len(words) == 8 assert set(words).union({"0", "1"}) == {"0", "1"}, f"words is: {words}" words = [word_map[w] for w in words] text = " ".join(words) recording = Recording.from_file(audio_path) recordings.append(recording) segment = SupervisionSegment( id=audio_path.stem, recording_id=audio_path.stem, start=0.0, duration=recording.duration, channel=0, language="Hebrew", text=text, ) supervisions.append(segment) return recordings, supervisions
def prepare_gale_mandarin(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    segment_words: Optional[bool] = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for GALE Mandarin Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora; must have the
        same length as ``audio_dirs``.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param segment_words: Use `jieba` package to perform word segmentation (default = False)
    :return: A dict with the keys ``'train'`` and ``'dev'``; each value is a dict
        with the keys ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we key them by stem to avoid adding
    # them twice (a later path overwrites an earlier one with the same stem).
    audio_paths = {
        p.stem: p
        for p in chain.from_iterable(
            [
                check_and_rglob(audio_dir, ext, strict=False)
                for audio_dir in audio_dirs
                for ext in ["*.wav", "*.flac"]
            ]
        )
    }
    # The inner list is built eagerly so every transcript dir is checked up front.
    transcript_paths = chain.from_iterable(
        [check_and_rglob(transcript_dir, "*.tdf") for transcript_dir in transcript_dirs]
    )

    logging.info("Preparing recordings manifest")
    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=path_depth)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths, segment_words=segment_words)
    ).filter(lambda s: s.recording_id in audio_paths)

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # IDs of the held-out recordings. Each HTTP response is closed as soon as it
    # has been read, and a set gives O(1) membership tests in the filters below.
    test_ids = set()
    for url in TEST_FILE_URLS:
        with urlopen(url) as response:
            test_ids.update(line.decode("utf-8").strip() for line in response)

    manifests = defaultdict(dict)
    manifests["dev"] = {
        "recordings": recordings.filter(lambda r: r.id in test_ids),
        "supervisions": supervisions.filter(lambda s: s.recording_id in test_ids),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in test_ids),
        "supervisions": supervisions.filter(lambda s: s.recording_id not in test_ids),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "dev"]:
            manifests[part]["recordings"].to_file(
                output_dir / f"gale-mandarin_recordings_{part}.jsonl.gz"
            )
            manifests[part]["supervisions"].to_file(
                output_dir / f"gale-mandarin_supervisions_{part}.jsonl.gz"
            )

    return manifests
def prepare_cslu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    normalize_text: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1 and 4)
    for scripted utterances (see https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt
    or top documentation in this script for more information).

    Audio files whose utterance-type directory is neither ``scripted`` nor
    ``spontaneous`` are skipped with a warning.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous speech transcripts (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir) if isinstance(corpus_dir, str) else corpus_dir

    # Get list of all recordings
    audio_paths = check_and_rglob(corpus_dir, "*.wav")

    # Read verification labels: utterance ID -> integer rating
    verification = {}
    for file in check_and_rglob(corpus_dir, "*-verified.txt"):
        with open(file, "r") as f:
            for line in f:
                path, label = line.strip().split()
                utt = Path(path).stem
                verification[utt] = int(label)

    # Read prompted transcriptions: prompt ID -> text
    prompts = {}
    with open(corpus_dir / "docs" / "all.map", "r") as f:
        for line in f:
            if line.strip() != "":
                prompt, text = line.strip().split(maxsplit=1)
                prompts[prompt] = text[1:-1]  # remove " " around the text

    recordings = []
    supervisions = []
    for p in tqdm(audio_paths, desc="Preparing manifests"):
        # /data/corpora/LDC2007S18/speech/scripted/00/0/ks001/ks001000.wav
        uttid = p.stem  # ks001000
        spk = p.parent.stem  # ks001
        cat = p.parent.parent.stem  # 0
        prompt = p.parent.parent.parent.stem  # 00
        utt_type = p.parent.parent.parent.parent.stem  # scripted

        if utt_type == "scripted":
            text = prompts[prompt]
            # Unverified utterances carry an explicit None label.
            custom = {
                "type": utt_type,
                "verification_label": verification.get(uttid),
            }
        elif utt_type == "spontaneous":
            text = read_text(
                corpus_dir / "trans" / utt_type / prompt / cat / spk / f"{uttid}.txt",
                normalize=normalize_text,
            )
            custom = {"type": utt_type}
        else:
            # Previously this case left `text`/`custom` unbound (NameError) or,
            # worse, silently reused the values of the previous utterance.
            logging.warning(f"Skipping {p}: unexpected utterance type '{utt_type}'")
            continue

        recording = Recording.from_file(
            p, relative_path_depth=None if absolute_paths else 3
        )
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=uttid,
                recording_id=uttid,
                start=0,
                duration=recording.duration,
                speaker=spk,
                language="English",
                text=text,
                custom=custom,
            )
        )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        "recordings": recordings,
        "supervisions": supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / "recordings.json")
        manifests["supervisions"].to_json(output_dir / "supervisions.json")

    return manifests
def prepare_aishell4(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    For every part (``train_L``, ``train_M``, ``train_S``, ``test``) the FLAC files
    under ``<part>/wav`` are paired with ``<part>/TextGrid/<stem>.TextGrid``. Each
    non-empty interval of each tier becomes one supervision; each
    ``(recording, tier name)`` pair is assigned a running ``SPKxxxx`` speaker ID.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    :raises ValueError: when the ``textgrid`` package is not installed.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AISHELL-4 data, please 'pip install textgrid' first."
        )
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # (recording stem, tier name) -> corpus-wide speaker ID
    global_spk_id = {}
    for part in ["train_L", "train_M", "train_S", "test"]:
        recordings = []
        supervisions = []
        flac_root = corpus_dir / part / "wav"
        for flac_path in flac_root.rglob("*.flac"):
            rec_id = flac_path.stem

            try:
                grid = textgrid.TextGrid.fromFile(
                    f"{corpus_dir}/{part}/TextGrid/{rec_id}.TextGrid"
                )
            except ValueError:
                logging.warning(
                    f"{rec_id} has annotation issues. Skipping this recording."
                )
                continue

            recordings.append(Recording.from_file(flac_path))

            for tier in grid.tiers:
                # IDs are handed out in order of first appearance, starting at SPK0001.
                speaker = global_spk_id.setdefault(
                    (rec_id, tier.name), f"SPK{len(global_spk_id)+1:04d}"
                )
                for j, interval in enumerate(tier.intervals):
                    if interval.mark == "":
                        continue
                    begin, finish = interval.minTime, interval.maxTime
                    supervisions.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{speaker}-{j}",
                            recording_id=rec_id,
                            start=begin,
                            duration=round(finish - begin, 4),
                            channel=0,
                            language="Chinese",
                            speaker=speaker,
                            text=interval.mark.strip(),
                        )
                    )

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part}.jsonl")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
def test_recording_from_file_using_audioread():
    """Smoke test: the mono OPUS fixture can be opened and decoded without raising."""
    opus_recording = Recording.from_file('test/fixtures/mono_c0.opus')
    opus_recording.load_audio()
def recording():
    """Return a ``Recording`` created from the ``libri`` fixture WAV file."""
    fixture_path = "test/fixtures/libri/libri-1088-134315-0000.wav"
    return Recording.from_file(fixture_path)
def prepare_aidatatang_200zh(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    Transcripts are read from
    ``<corpus_dir>/aidatatang_200zh/transcript/aidatatang_200_zh_transcript.txt``
    and audio from ``<corpus_dir>/aidatatang_200zh/corpus/<part>``. Audio files
    without a transcript are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    d = corpus_dir / "aidatatang_200zh"
    # Report the directory that is actually missing, not its parent.
    assert d.is_dir(), f"No such directory: {d}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    transcript_path = d / "transcript/aidatatang_200_zh_transcript.txt"
    assert transcript_path.is_file(), f"No such file: {transcript_path}"

    # utterance ID -> transcript with whitespace runs collapsed to single spaces
    transcript_dict = {}
    with open(transcript_path, "r", encoding="utf-8") as f:
        for line in f:
            idx_transcript = line.split()
            if not idx_transcript:
                # Blank lines carry no utterance ID; skip them instead of crashing.
                continue
            transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])

    manifests = defaultdict(dict)
    dataset_parts = ["dev", "test", "train"]
    for part in dataset_parts:
        logging.info(f"Processing {part}")
        recordings = []
        supervisions = []
        wav_path = d / "corpus" / part
        for audio_path in wav_path.rglob("**/*.wav"):
            idx = audio_path.stem
            # The speaker is the name of the directory holding the file.
            speaker = audio_path.parts[-2]
            if idx not in transcript_dict:
                logging.warning(f"No transcript: {idx}")
                continue
            text = transcript_dict[idx]
            if not audio_path.is_file():
                logging.warning(f"No such file: {audio_path}")
                continue
            recording = Recording.from_file(audio_path)
            recordings.append(recording)
            # One supervision spanning the whole recording.
            segment = SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=recording.duration,
                channel=0,
                language="Chinese",
                speaker=speaker,
                text=text.strip(),
            )
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
def prepare_ali_meeting(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    mic: Optional[str] = "far",
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, "near" or "far", specifies whether to prepare the near-field or far-field data.
    :return: a Dict whose key is the lower-cased dataset part (``'train'``, ``'eval'``,
        ``'test'``), and the value is Dicts with the keys 'recordings' and 'supervisions'.
    :raises ValueError: when the ``textgrid`` package is not installed.
    """
    if not is_module_available("textgrid"):
        raise ValueError(
            "To prepare AliMeeting data, please 'pip install textgrid' first."
        )
    import textgrid

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["Train", "Eval", "Test"]:
        recordings = []
        supervisions = []

        # Eval and Test may further be inside another folder (since the "far" and
        # "near" are grouped together). Resolve this into a per-part variable:
        # overwriting `corpus_dir` here would make the "Test" lookup start from
        # the "Eval_Ali" folder found in the previous iteration.
        part_dir = corpus_dir
        if part in ("Eval", "Test") and (corpus_dir / f"{part}_Ali").is_dir():
            part_dir = corpus_dir / f"{part}_Ali"

        wav_paths = part_dir / f"{part}_Ali_{mic}" / "audio_dir"
        text_paths = part_dir / f"{part}_Ali_{mic}" / "textgrid_dir"

        # For 'near' setting:
        #  - wav files have names like R0003_M0046_F_SPK0093.wav
        #  - textgrid files have names like R0003_M0046_F_SPK0093.TextGrid
        # Speaker ID information is present in the file name itself

        # For 'far' setting:
        #  - wav files have names like R0015_M0151_MS002.wav
        #  - textgrid files have names like R0015_M015.TextGrid
        # Speaker ID information is present inside the TextGrid file

        for text_path in tqdm(
            list(text_paths.rglob("*.TextGrid")), desc=f"Preparing {part}"
        ):
            session_id = text_path.stem

            if mic == "near":
                _, _, gender, spk_id = session_id.split("_")
                spk_id = spk_id[3:]  # SPK1953 -> 1953

            try:
                tg = textgrid.TextGrid.fromFile(str(text_path))
            except ValueError:
                logging.warning(
                    f"{session_id} has annotation issues. Skipping this recording."
                )
                continue

            # The first audio file whose name starts with the session ID is used.
            wav_path = list(wav_paths.rglob(f"{session_id}*.wav"))[0]

            recording = Recording.from_file(wav_path, recording_id=session_id)
            recordings.append(recording)

            for tier in tg.tiers:
                if mic == "far":
                    # NOTE(review): tier names with neither 2 nor 4 "_"-separated
                    # parts fall through and reuse the previous gender/spk_id.
                    parts = tier.name.split("_")
                    if len(parts) == 4:
                        _, _, gender, spk_id = parts
                    elif len(parts) == 2:
                        gender, spk_id = parts
                    spk_id = spk_id[3:]  # SPK1953 -> 1953

                for i, interval in enumerate(tier.intervals):
                    if interval.mark != "":
                        start = interval.minTime
                        end = interval.maxTime
                        text = interval.mark
                        segment = SupervisionSegment(
                            id=f"{session_id}-{spk_id}-{i}",
                            recording_id=recording.id,
                            start=start,
                            duration=round(end - start, 4),
                            channel=0,
                            language="Chinese",
                            speaker=spk_id,
                            gender=gender,
                            text=text.strip(),
                        )
                        supervisions.append(segment)

        # Fix manifests
        recording_set, supervision_set = fix_manifests(
            RecordingSet.from_recordings(recordings),
            SupervisionSet.from_segments(supervisions),
        )
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(output_dir / f"supervisions_{part.lower()}.jsonl")
            recording_set.to_file(output_dir / f"recordings_{part.lower()}.jsonl")

        manifests[part.lower()] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
def prepare_aishell(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    Transcripts are read from
    ``<corpus_dir>/data_aishell/transcript/aishell_transcript_v0.8.txt`` and audio
    from ``<corpus_dir>/data_aishell/wav/<part>``. Audio files without a transcript
    are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # utterance ID -> transcript with whitespace runs collapsed to single spaces
    transcript_path = corpus_dir / 'data_aishell/transcript/aishell_transcript_v0.8.txt'
    transcript_dict = {}
    with open(transcript_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            transcript_dict[fields[0]] = ' '.join(fields[1:])

    manifests = defaultdict(dict)
    for part in ['train', 'dev', 'test']:
        recordings = []
        supervisions = []
        part_wav_dir = corpus_dir / 'data_aishell' / 'wav' / f'{part}'
        for wav_file in part_wav_dir.rglob('**/*.wav'):
            utt_id = wav_file.stem
            # The speaker is the name of the directory holding the file.
            speaker = wav_file.parts[-2]

            if utt_id not in transcript_dict:
                logging.warning(f'No transcript: {utt_id}')
                continue
            transcript = transcript_dict[utt_id]

            if not wav_file.is_file():
                logging.warning(f'No such file: {wav_file}')
                continue

            recording = Recording.from_file(wav_file)
            recordings.append(recording)
            # One supervision spanning the whole recording.
            supervisions.append(
                SupervisionSegment(
                    id=utt_id,
                    recording_id=utt_id,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language='Chinese',
                    speaker=speaker,
                    text=transcript.strip(),
                )
            )

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f'supervisions_{part}.json')
            recording_set.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': recording_set,
            'supervisions': supervision_set,
        }

    return manifests
def create_recording(
    audio_path_and_rel_path_depth: Tuple[Pathlike, Union[int, None]]
) -> Recording:
    """
    Create a ``Recording`` from a packed ``(audio_path, relative_path_depth)`` argument.

    :param audio_path_and_rel_path_depth: two-element tuple holding the audio
        path and the ``relative_path_depth`` forwarded to ``Recording.from_file``.
    :return: the recording returned by ``Recording.from_file``.
    """
    path, depth = audio_path_and_rel_path_depth
    rec = Recording.from_file(path, relative_path_depth=depth)
    return rec
def prepare_switchboard(
    audio_dir: Pathlike,
    transcripts_dir: Optional[Pathlike] = None,
    sentiment_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    omit_silence: bool = True,
    absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    Two manifests are created: one with recordings, and one with text supervisions.
    When ``sentiment_dir`` is provided, ``parse_and_add_sentiment_labels`` is
    additionally invoked on the supervisions.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: When ``False``, recordings are created with
        ``relative_path_depth=3``; when ``True``, with ``relative_path_depth=None``.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    # Transcript files are keyed by the part of their stem before the first dash.
    name_to_text = {p.stem.split('-')[0]: p for p in text_paths}

    def _group_for(sph_path):
        # Audio stems use "sw0NNNN" while transcript keys use "swNNNN" plus side A/B.
        conv = sph_path.stem.replace('sw0', 'sw')
        return {
            'audio': sph_path,
            'text-0': name_to_text[f'{conv}A'],
            'text-1': name_to_text[f'{conv}B'],
        }

    groups = [_group_for(sph_path) for sph_path in audio_paths]

    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(group['audio'], relative_path_depth=path_depth)
        for group in groups
    )

    def _all_segments():
        # One transcript per channel: channel 0 -> 'text-0', channel 1 -> 'text-1'.
        for group, rec in zip(groups, recordings):
            for channel in [0, 1]:
                yield from make_segments(
                    transcript_path=group[f'text-{channel}'],
                    recording=rec,
                    channel=channel,
                    omit_silence=omit_silence,
                )

    supervisions = SupervisionSet.from_segments(_all_segments())

    validate_recordings_and_supervisions(recordings, supervisions)

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')

    return {'recordings': recordings, 'supervisions': supervisions}
def audio_read_worker(p: Path) -> Recording:
    """
    Create a ``Recording`` for ``p`` and record its duration in ``durations``.

    The recording ID is ``"<parent directory stem>_<file stem>"``.

    :param p: path to the audio file.
    :return: the created recording.
    """
    rec_id = f"{p.parent.stem}_{p.stem}"
    rec = Recording.from_file(p, recording_id=rec_id)
    # `durations` is a mapping defined outside this function.
    durations[rec.id] = rec.duration
    return rec
def prepare_earnings21(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    normalize_text: bool = False,
) -> Union[RecordingSet, SupervisionSet]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    Exactly 44 ``*.mp3`` files are expected in ``<corpus_dir>/media`` and exactly
    44 ``*.nlp`` files in ``<corpus_dir>/transcripts/nlp_references``. Each
    transcript becomes one supervision spanning the whole recording that has
    the same stem.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the github repository, notably
        the mp3 files will be searched for in [corpus_dir]/media and transcriptions
        in the directory [corpus_dir]/transcripts/nlp_references
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    audio_files = sorted((corpus_dir / "media").glob("*.mp3"))
    assert len(audio_files) == 44

    recording_set = RecordingSet.from_recordings(
        Recording.from_file(mp3) for mp3 in audio_files
    )

    nlp_files = list((corpus_dir / "transcripts" / "nlp_references").glob("*.nlp"))
    assert len(nlp_files) == 44

    supervision_segments = []
    for nlp_file in nlp_files:
        # The transcript stem doubles as recording and supervision ID.
        rec_id = nlp_file.stem
        transcript = " ".join(parse_nlp_file(nlp_file))
        if normalize_text:
            transcript = normalize(transcript)

        supervision_segments.append(
            SupervisionSegment(
                id=rec_id,
                recording_id=rec_id,
                start=0.0,
                duration=recording_set[rec_id].duration,
                channel=0,
                language="English",
                text=transcript,
            )
        )

    supervision_set = SupervisionSet.from_segments(supervision_segments)
    validate_recordings_and_supervisions(recording_set, supervision_set)

    if output_dir is not None:
        supervision_set.to_file(output_dir / "earnings21_supervisions_all.jsonl.gz")
        recording_set.to_file(output_dir / "earnings21_recordings_all.jsonl.gz")

    return recording_set, supervision_set
def prepare_libricss(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    type: str = "mdm",
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build the recording and supervision manifests for the corpus in ``corpus_dir``.

    For every overlap ratio in ``OVERLAP_RATIOS`` the session directories under
    ``[corpus_dir]/for_release/<ratio>`` are visited (in sorted order, so the
    resulting manifests are reproducible). Each session contributes one Recording
    and one SupervisionSegment per entry returned by ``parse_transcript`` for its
    ``transcription/meeting_info.txt``.

    NOTE: Per the original notes, the recordings contain all 7 channels. If you
    want to use only one channel, you can use either
    ``recording.load_audio(channel=0)`` or
    ``MonoCut(id=...,recording=recording,channel=0)`` while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus. It may point
        either at the ``for_release`` directory itself or at its parent.
    :param output_dir: Pathlike, the path where to write the manifests
        (``recordings.jsonl`` and ``supervisions.jsonl``). Nothing is written
        when it is ``None``.
    :param type: str, the type of data to prepare: ``'mdm'`` (reads
        ``record/raw_recording.wav``), ``'ihm-mix'`` (reads ``clean/mix.wav``)
        or ``'ihm'`` (reads ``clean/each_spk.wav``). Any other value, including
        ``'sdm'``, fails the assertion below.
    :return: a Dict with the keys ``'recordings'`` and ``'supervisions'``.
    """
    assert type in ["mdm", "ihm-mix", "ihm"], f"Unsupported type: {type}"

    corpus_dir = Path(corpus_dir)
    if corpus_dir.stem != "for_release":
        corpus_dir = corpus_dir / "for_release"

    recordings = []
    segments = []

    for ov in OVERLAP_RATIOS:
        # Sorted so that the manifest order does not depend on the filesystem.
        for session in sorted((corpus_dir / ov).iterdir()):
            if not session.is_dir():
                # Stray files (e.g. OS metadata) are not sessions.
                continue

            # Session names have exactly seven "_"-separated fields; only the
            # sixth (the session name) is needed here.
            _, _, _, _, _, name, _ = session.name.split("_")
            recording_id = f"{ov}_{name}"

            if type == "ihm-mix":
                audio_path = session / "clean" / "mix.wav"
            elif type == "ihm":
                audio_path = session / "clean" / "each_spk.wav"
            else:
                audio_path = session / "record" / "raw_recording.wav"
            recordings.append(
                Recording.from_file(audio_path, recording_id=recording_id)
            )

            transcript_path = session / "transcription" / "meeting_info.txt"
            for idx, seg in enumerate(parse_transcript(transcript_path)):
                # seg[0]/seg[1] are used as start/end, seg[2] as the speaker
                # and seg[4] as the text.
                segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}-{idx}",
                        recording_id=recording_id,
                        start=seg[0],
                        duration=seg[1] - seg[0],
                        text=seg[4],
                        language="English",
                        speaker=seg[2],
                        # Only the per-speaker "ihm" audio maps a speaker to a
                        # dedicated channel; everything else uses channel 0.
                        channel=SPK_TO_CHANNEL_MAP[session.name][seg[2]]
                        if type == "ihm"
                        else 0,
                    )
                )

    supervisions = SupervisionSet.from_segments(segments)
    recordings = RecordingSet.from_recordings(recordings)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        recordings.to_jsonl(output_dir / "recordings.jsonl")
        supervisions.to_jsonl(output_dir / "supervisions.jsonl")

    return {"recordings": recordings, "supervisions": supervisions}
def _timit_wav_files(corpus_dir: Path, subdir: str, speakers=None) -> List[Path]:
    """List the ``*.WAV`` files three levels below ``corpus_dir/subdir``, skipping "SA" utterances.

    When ``speakers`` is given, only files whose parent directory name
    (lower-cased) is contained in it are kept. The result is sorted.
    """
    pattern = str(corpus_dir / subdir / "*" / "*" / "*.WAV")
    selected = []
    for wav_path in map(Path, glob.glob(pattern)):
        # filter the SA (dialect sentences)
        if wav_path.name[:2] == "SA":
            continue
        if speakers is not None and wav_path.parent.name.lower() not in speakers:
            continue
        selected.append(wav_path)
    return sorted(selected)


def _timit_phone_text(transcript_file: Path, num_phones: int, phones_dict) -> str:
    """Read a ``.PHN`` file and return its phone labels joined by single spaces."""
    phones = []
    with open(transcript_file, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of producing an empty label.
                continue
            phone = fields[-1]
            if num_phones != 60:
                phone = phones_dict[str(phone)]
            phones.append(phone)
    return " ".join(phones).replace("h#", "sil").strip()


def prepare_timit(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_phones: int = 48,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    Three parts are produced: ``TRAIN`` (from ``[corpus_dir]/TRAIN``) and
    ``DEV`` / ``TEST`` (both from ``[corpus_dir]/TEST``, restricted to the
    speaker lists returned by ``get_speakers()``). Utterances whose file name
    starts with ``SA`` are always skipped. Each remaining ``.WAV`` file with a
    sibling ``.PHN`` transcript yields one Recording and one SupervisionSegment
    whose text is the space-joined phone sequence.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write and save the manifests.
        Nothing is written when it is ``None``.
    :param num_phones: int=48, the number of phones (60, 48 or 39) for modeling
        and 48 is regarded as the default value. For 48 and 39 every label is
        mapped through the dictionary returned by ``get_phonemes``.
    :param num_jobs: int, accepted for backward compatibility; the work is done
        sequentially and this value is not used.
    :return: a Dict whose key is the dataset part, and the value is Dicts with
        the keys 'recordings' and 'supervisions'.
    :raises ValueError: if ``num_phones`` is not one of 60, 48 or 39.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    # Validate before creating any directory so bad arguments have no side effects.
    if num_phones not in (60, 48, 39):
        raise ValueError("The value of num_phones must be in [60, 48, 39].")
    phones_dict = get_phonemes(num_phones)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    dev_spks, test_spks = get_speakers()

    # part -> (sub-directory that holds its audio, speaker filter or None)
    part_sources = {
        "TRAIN": ("TRAIN", None),
        "DEV": ("TEST", dev_spks),
        "TEST": ("TEST", test_spks),
    }

    manifests = defaultdict(dict)
    for part, (subdir, speakers) in part_sources.items():
        wav_files = _timit_wav_files(corpus_dir, subdir, speakers)

        logging.debug(f"{part} dataset manifest generation.")
        recordings = []
        supervisions = []

        for wav_file in tqdm(wav_files):
            speaker = wav_file.parent.name
            idx = f"{speaker}-{wav_file.stem}"
            transcript_file = wav_file.with_suffix(".PHN")

            if not wav_file.is_file():
                logging.warning(f"No such file: {wav_file}")
                continue
            if not transcript_file.is_file():
                logging.warning(f"No transcript: {transcript_file}")
                continue

            text = _timit_phone_text(transcript_file, num_phones, phones_dict)

            recording = Recording.from_file(path=wav_file, recording_id=idx)
            recordings.append(recording)
            supervisions.append(
                SupervisionSegment(
                    id=idx,
                    recording_id=idx,
                    start=0.0,
                    duration=recording.duration,
                    channel=0,
                    language="English",
                    speaker=speaker,
                    text=text,
                )
            )

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"timit_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"timit_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
def prepare_cmu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CMU Kids corpus. The prepared supervisions contain the
    prompt text as the `text`. Additionally, in the `custom` tag, we provide the
    following data: speaker grade/age, population where the speaker came from
    (the sample rows of ``speaker.tbl`` show e.g. ``SUM95``), spoken transcript,
    and transcription bin (1/2).

    Here, bin `1` means utterances where the speaker followed the prompt and no
    noise/mispronunciation is present, and `2` refers to noisy utterances.

    The tag `spoken_transcript` is the transcription that was actually spoken. It
    contains noise tags and phone transcription in case the pronunciation differed
    from that in CMU Dict.

    :param corpus_dir: Path to downloaded LDC corpus. It may point either at the
        ``cmu_kids`` directory itself or at its parent.
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources
        (default = True). When falsy, ``relative_path_depth=3`` is passed to
        ``Recording.from_file``.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir)
    if corpus_dir.stem == "cmu_kids":
        corpus_dir = corpus_dir.parent
    tables_dir = corpus_dir / 'cmu_kids' / 'tables'

    # Get transcripts for all utterances
    utterances = {}
    with open(tables_dir / 'sentence.tbl', 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank (e.g. trailing) lines.
                continue
            utt, _count, text = line.split('\t')
            utterances[utt] = text

    # Get speaker metadata
    speaker_info = {}
    with open(tables_dir / 'speaker.tbl', 'r') as f:
        # The first two lines are skipped (header).
        for _ in range(2):
            next(f)
        for line in f:
            # ID    LOC     GR/AGE  TOT     BIN2
            # fabm    SUM95   3/9     100     62
            # facs    SUM95   2/8     90      55
            line = line.strip()
            if not line:
                continue
            spk, pop, gr_age, _, _ = line.split('\t')
            grade, age = gr_age.split('/')
            speaker_info[spk] = (pop, grade, age)

    recordings = []
    supervisions = []

    # Iterate through all transcriptions and add to supervisions
    with open(tables_dir / 'transcrp.tbl', 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            trn_id, transcript = line.split(maxsplit=1)

            # The transcription id is sliced as: 4 chars of speaker id,
            # 3 chars of utterance id, then a single-digit bin.
            spk = trn_id[0:4]
            utt = trn_id[4:7]
            trn_bin = int(trn_id[7])
            pop, grade, age = speaker_info[spk]

            audio_path = (
                corpus_dir / 'cmu_kids' / 'kids' / spk / 'signal' / f'{trn_id}.sph'
            )
            recording = Recording.from_file(
                audio_path, relative_path_depth=None if absolute_paths else 3
            )
            recordings.append(recording)

            supervisions.append(
                SupervisionSegment(
                    id=trn_id,
                    recording_id=trn_id,
                    start=0,
                    duration=recording.duration,
                    speaker=spk,
                    # Speaker ids starting with 'm' are labelled male.
                    gender="Male" if spk[0] == 'm' else "Female",
                    language='English',
                    text=utterances[utt],
                    custom={
                        'speaker_grade': grade if grade != "NA" else None,
                        'speaker_age': int(age) if age != "NA" else None,
                        'speaker_population': pop,
                        'bin': trn_bin,
                        'spoken_transcript': transcript,
                    },
                )
            )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        'recordings': recordings,
        'supervisions': supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / 'recordings.json')
        manifests["supervisions"].to_json(output_dir / 'supervisions.json')

    return manifests
def prepare_gale_arabic(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for GALE Arabic Broadcast speech corpus.

    Audio files (``*.wav`` and ``*.flac``) are collected recursively from
    ``audio_dirs`` and ``*.tdf`` transcripts from ``transcript_dirs``. The
    supervisions are trimmed to the recording boundaries, and the result is
    split into a test part (recording ids contained in ``TEST``) and a train
    part (all other recordings).

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora. Must have the
        same length as ``audio_dirs``.
    :param output_dir: Directory where the manifests should be written. Can be omitted
        to avoid writing.
    :param absolute_paths: When falsy, ``relative_path_depth=3`` is passed to
        ``Recording.from_file``; otherwise ``None`` is passed.
    :return: A dict with the keys ``'test'`` and ``'train'``; each value is a
        dict with the keys ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")

    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora, so paths are keyed by file stem and a
    # later occurrence replaces an earlier one.
    audio_globs = [
        check_and_rglob(audio_dir, pattern, strict=False)
        for audio_dir in audio_dirs
        for pattern in ['*.wav', '*.flac']
    ]
    stem_to_audio = {}
    for found_paths in audio_globs:
        for audio_path in found_paths:
            stem_to_audio[audio_path.stem] = audio_path

    transcript_globs = [
        check_and_rglob(transcript_dir, '*.tdf')
        for transcript_dir in transcript_dirs
    ]
    transcript_paths = list(chain.from_iterable(transcript_globs))

    logging.info("Preparing recordings manifest")
    path_depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(audio_path, relative_path_depth=path_depth)
        for audio_path in stem_to_audio.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths)
    )

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Split by membership of the recording id in TEST.
    test_recordings = recordings.filter(lambda rec: rec.id in TEST)
    test_supervisions = supervisions.filter(lambda sup: sup.recording_id in TEST)
    train_recordings = recordings.filter(lambda rec: rec.id not in TEST)
    train_supervisions = supervisions.filter(
        lambda sup: sup.recording_id not in TEST
    )

    manifests = defaultdict(dict)
    manifests['test'] = {
        'recordings': test_recordings,
        'supervisions': test_supervisions,
    }
    manifests['train'] = {
        'recordings': train_recordings,
        'supervisions': train_supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "test"]:
            part_manifests = manifests[part]
            part_manifests["recordings"].to_json(
                output_dir / f'recordings_{part}.json'
            )
            part_manifests["supervisions"].to_json(
                output_dir / f'supervisions_{part}.json'
            )

    return manifests