def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build recording and supervision manifests for the Switchboard corpus.

    Every ``*.sph`` file found under ``audio_dir`` is paired with two transcript files
    (side ``A`` for channel 0 and side ``B`` for channel 1); the supervisions of both
    channels are collected into a single ``SupervisionSet``.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments. When given, it is handed together with the supervisions to
        ``parse_and_add_sentiment_labels`` (whose return value is not used here).
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: When ``True``, ``relative_path_depth=None`` is passed to
        ``Recording.from_sphere``; otherwise a depth of 3 is used.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()

    sph_files = check_and_rglob(audio_dir, '*.sph')
    transcript_files = check_and_rglob(transcripts_dir, '*trans.text')

    # Transcripts are looked up by the part of their file stem that precedes the first '-'.
    transcript_by_side = {}
    for transcript in transcript_files:
        transcript_by_side[transcript.stem.split('-')[0]] = transcript

    groups = []
    for sph in sph_files:
        # Audio stems use an 'sw0' prefix where the transcript keys use 'sw'.
        conversation = sph.stem.replace('sw0', 'sw')
        side_a = transcript_by_side[f'{conversation}A']
        side_b = transcript_by_side[f'{conversation}B']
        groups.append({'audio': sph, 'text-0': side_a, 'text-1': side_b})

    depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(group['audio'], relative_path_depth=depth)
        for group in groups
    )

    def iter_segments():
        # Channel 0 reads the 'A' transcript, channel 1 the 'B' transcript.
        for group, recording in zip(groups, recordings):
            for channel in [0, 1]:
                yield from make_segments(
                    transcript_path=group[f'text-{channel}'],
                    recording=recording,
                    channel=channel,
                    omit_silence=omit_silence
                )

    supervisions = SupervisionSet.from_segments(iter_segments())

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    manifests = {'recordings': recordings, 'supervisions': supervisions}
    if output_dir is None:
        return manifests

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    recordings.to_json(output_dir / 'recordings.json')
    supervisions.to_json(output_dir / 'supervisions.json')
    return manifests
def prepare_broadcast_news(
    audio_dir: Pathlike,
    transcripts_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for the 1997 English Broadcast News corpus.

    Three manifests are produced: the recordings, the segment-level supervisions and the
    section-level supervisions (the latter can be used e.g. for topic segmentation).
    Audio files and SGML transcripts are paired positionally, in the order returned by
    ``check_and_rglob``.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: When ``True``, ``relative_path_depth=None`` is passed to
        ``Recording.from_file``; otherwise a depth of 3 is used.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    """
    sph_files = check_and_rglob(audio_dir, "*.sph")
    sgml_files = check_and_rglob(transcripts_dir, "*.sgml")

    depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(sph, relative_path_depth=depth) for sph in sph_files
    )

    # BeautifulSoup has quite inefficient tag-to-string rendering that uses a recursive implementation;
    # on some systems the recursion limit needs to be raised for this to work.
    per_file_supervisions = []
    with recursion_limit(5000):
        for sgml, recording in zip(sgml_files, recordings):
            per_file_supervisions.append(make_supervisions(sgml, recording))

    section_supervisions = SupervisionSet.from_segments(
        section
        for sups in per_file_supervisions
        for section in sups["sections"]
    )
    segment_supervisions = SupervisionSet.from_segments(
        segment
        for sups in per_file_supervisions
        for segment in sups["segments"]
    )

    # Only the segment-level supervisions are validated against the recordings.
    validate_recordings_and_supervisions(recordings, segment_supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        targets = (
            (recordings, "broadcast-news_recordings_all.jsonl.gz"),
            (section_supervisions, "broadcast-news_sections_all.jsonl.gz"),
            (segment_supervisions, "broadcast-news_segments_all.jsonl.gz"),
        )
        for manifest, filename in targets:
            manifest.to_file(output_dir / filename)

    return {
        "recordings": recordings,
        "sections": section_supervisions,
        "segments": segment_supervisions,
    }
def prepare_callhome_english(
        audio_dir: Pathlike,
        rttm_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the CallHome English data.

    We create two manifests: one with recordings (built from the ``*.sph`` files found under
    ``audio_dir``), and one with supervisions read from the ``fullref.rttm`` file.
    Recordings and supervisions without a counterpart are removed, and the supervisions are
    trimmed to the recordings before validation.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Directory that contains ``fullref.rttm``.
        If not provided, the metadata will be downloaded.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    # ``Pathlike`` also admits plain strings, for which the ``/`` operator is undefined,
    # so normalise to ``Path`` before joining.
    rttm_path = Path(rttm_dir) / 'fullref.rttm'
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, '*.sph')
    recordings = RecordingSet.from_recordings(
        make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
        for p in tqdm(audio_paths)
    )

    recordings, supervisions = remove_missing_recordings_and_supervisions(recordings, supervisions)
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }
def prepare_callhome_english_sre(
    audio_dir: Pathlike,
    rttm_dir: Optional[Pathlike] = None,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Callhome American English portion.

    We create two manifests: one with recordings (built from the ``*.sph`` files found under
    ``audio_dir``), and one with supervisions read from the ``fullref.rttm`` file.
    Both are passed through ``fix_manifests`` and then validated.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Directory that contains ``fullref.rttm``.
        If not provided, the metadata will be downloaded.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if rttm_dir is None:
        rttm_dir = download_callhome_metadata()
    # ``Pathlike`` also admits plain strings, for which the ``/`` operator is undefined,
    # so normalise to ``Path`` before joining.
    rttm_path = Path(rttm_dir) / "fullref.rttm"
    supervisions = read_rttm(rttm_path)

    audio_paths = check_and_rglob(audio_dir, "*.sph")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 4)
        for p in tqdm(audio_paths)
    )

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / "recordings.json")
        supervisions.to_json(output_dir / "supervisions.json")
    return {"recordings": recordings, "supervisions": supervisions}
def prepare_dihard3(
    dev_audio_dir: Pathlike,
    eval_audio_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    uem_manifest: Optional[bool] = True,
    num_jobs: Optional[int] = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the DIHARD III corpus.
    For each available part ("dev", "eval") we create a manifest with recordings and one with
    supervisions built from the RTTM files; optionally a third one built from the UEM files.

    :param dev_audio_dir: Path to downloaded DIHARD III dev corpus (LDC2020E12), e.g.
        /data/corpora/LDC/LDC2020E12
    :param eval_audio_dir: Path to downloaded DIHARD III eval corpus (LDC2021E02), e.g.
        /data/corpora/LDC/LDC2021E02
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param uem_manifest: If True, also return a SupervisionSet describing the UEM segments
        (see use in dataset.DiarizationDataset)
    :param num_jobs: int (default = 1), number of jobs to scan corpus directory for recordings
    :return: A dict keyed by part name (``'dev'``, ``'eval'``); a part whose directory is ``None``
        or does not exist is skipped with a warning. Each value is a dict with the keys
        ``{'recordings', 'supervisions'}`` plus ``'uem'`` when ``uem_manifest`` is true.
    :raises KeyError: when a recording has no RTTM (or, with ``uem_manifest``, no UEM) file
        whose stem equals the recording id.
    """
    manifests = defaultdict(dict)
    for part in tqdm(["dev", "eval"], desc="Preparing DIHARD parts"):
        audio_dir = dev_audio_dir if part == "dev" else eval_audio_dir
        if audio_dir is None or not Path(audio_dir).exists():
            logging.warning(f"Nothing to be done for {part}")
            continue

        # Index the annotation files by stem once instead of rescanning the whole list for
        # every recording. ``setdefault`` keeps the first path seen for a given stem.
        rttm_by_stem = {}
        for rttm_path in check_and_rglob(audio_dir, "*.rttm"):
            rttm_by_stem.setdefault(rttm_path.stem, rttm_path)
        uem_by_stem = {}
        for uem_path in check_and_rglob(audio_dir, "*.uem"):
            uem_by_stem.setdefault(uem_path.stem, uem_path)

        recordings = RecordingSet.from_dir(audio_dir, "*.flac", num_jobs=num_jobs)

        # Read metadata for recordings
        metadata = parse_metadata(
            list(check_and_rglob(audio_dir, "recordings.tbl"))[0]
        )

        supervisions = SupervisionSet.from_segments(
            chain.from_iterable(
                make_rttm_segments(
                    rttm_path=rttm_by_stem[recording.id],
                    recording=recording,
                    metadata=metadata[recording.id],
                )
                for recording in recordings
            )
        )
        if uem_manifest:
            uem = SupervisionSet.from_segments(
                chain.from_iterable(
                    make_uem_segments(
                        uem_path=uem_by_stem[recording.id],
                        recording=recording,
                    )
                    for recording in recordings
                )
            )

        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{part}.json")
            supervisions.to_json(output_dir / f"supervisions_{part}.json")
            if uem_manifest:
                uem.to_json(output_dir / f"uem_{part}.json")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
        if uem_manifest:
            manifests[part].update({"uem": uem})
    return manifests
def prepare_gale_mandarin(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    segment_words: Optional[bool] = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for GALE Mandarin Broadcast speech corpus.

    The recordings whose ids are listed in the files behind ``TEST_FILE_URLS`` (fetched over
    the network) form the ``dev`` part; all remaining recordings form the ``train`` part.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param segment_words: Use `jieba` package to perform word segmentation (default = False)
    :return: A dict keyed by part (``'dev'``, ``'train'``); each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we make a dict to avoid adding them
    # twice (the path found last wins).
    audio_paths = {
        p.stem: p
        for p in chain.from_iterable(
            [
                check_and_rglob(audio_dir, ext, strict=False)
                for audio_dir in audio_dirs
                for ext in ["*.wav", "*.flac"]
            ]
        )
    }
    transcript_paths = chain.from_iterable(
        [check_and_rglob(transcript_dir, "*.tdf") for transcript_dir in transcript_dirs]
    )

    logging.info("Preparing recordings manifest")
    recordings = RecordingSet.from_recordings(
        Recording.from_file(p, relative_path_depth=None if absolute_paths else 3)
        for p in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(
        parse_transcripts(transcript_paths, segment_words=segment_words)
    ).filter(lambda s: s.recording_id in audio_paths)

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Collect the held-out recording ids into a set (constant-time membership tests in the
    # filters below) and close every HTTP response once it has been read.
    test_ids = set()
    for url in TEST_FILE_URLS:
        with urlopen(url) as response:
            test_ids.update(line.decode("utf-8").strip() for line in response)

    manifests = defaultdict(dict)
    manifests["dev"] = {
        "recordings": recordings.filter(lambda r: r.id in test_ids),
        "supervisions": supervisions.filter(lambda s: s.recording_id in test_ids),
    }
    manifests["train"] = {
        "recordings": recordings.filter(lambda r: r.id not in test_ids),
        "supervisions": supervisions.filter(lambda s: s.recording_id not in test_ids),
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSONL files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part in ["train", "dev"]:
            manifests[part]["recordings"].to_file(
                output_dir / f"gale-mandarin_recordings_{part}.jsonl.gz"
            )
            manifests[part]["supervisions"].to_file(
                output_dir / f"gale-mandarin_supervisions_{part}.jsonl.gz"
            )

    return manifests
def prepare_cslu_kids(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: Optional[bool] = True,
    normalize_text: Optional[bool] = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1 and 4)
    for scripted utterances (see https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt
    or top documentation in this script for more information).

    Audio files whose directory layout does not identify them as ``scripted`` or
    ``spontaneous`` are skipped with a warning.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to write absolute paths to audio sources (default = True)
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous speech transcripts (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    corpus_dir = Path(corpus_dir)

    # Get list of all recordings
    audio_paths = check_and_rglob(corpus_dir, "*.wav")

    # Read verification labels
    verification = {}
    for file in check_and_rglob(corpus_dir, "*-verified.txt"):
        with open(file, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                path, label = line.strip().split()
                verification[Path(path).stem] = int(label)

    # Read prompted transcriptions
    prompts = {}
    with open(corpus_dir / "docs" / "all.map", "r") as f:
        for line in f:
            if line.strip() != "":
                prompt, text = line.strip().split(maxsplit=1)
                prompts[prompt] = text[1:-1]  # remove " " around the text

    recordings = []
    supervisions = []
    for p in tqdm(audio_paths, desc="Preparing manifests"):
        # /data/corpora/LDC2007S18/speech/scripted/00/0/ks001/ks001000.wav
        uttid = p.stem  # ks001000
        spk = p.parent.stem  # ks001
        cat = p.parent.parent.stem  # 0
        prompt = p.parent.parent.parent.stem  # 00
        utt_type = p.parent.parent.parent.parent.stem  # scripted

        if utt_type == "scripted":
            text = prompts[prompt]
            # Not every scripted utterance has a verification label.
            custom = {"type": utt_type, "verification_label": verification.get(uttid)}
        elif utt_type == "spontaneous":
            text = read_text(
                corpus_dir / "trans" / utt_type / prompt / cat / spk / f"{uttid}.txt",
                normalize=normalize_text,
            )
            custom = {"type": utt_type}
        else:
            # Without this guard ``text``/``custom`` would be unbound on the first such file,
            # or silently carry over the values of the previously processed utterance.
            logging.warning(
                f"Skipping {p}: cannot tell whether it is scripted or spontaneous "
                f"(found '{utt_type}')"
            )
            continue

        recording = Recording.from_file(
            p, relative_path_depth=None if absolute_paths else 3
        )
        recordings.append(recording)
        supervisions.append(
            SupervisionSegment(
                id=uttid,
                recording_id=uttid,
                start=0,
                duration=recording.duration,
                speaker=spk,
                language="English",
                text=text,
                custom=custom,
            )
        )

    recordings = RecordingSet.from_recordings(recordings)
    supervisions = SupervisionSet.from_segments(supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {
        "recordings": recordings,
        "supervisions": supervisions,
    }

    if output_dir is not None:
        logging.info("Writing manifests to JSON files")
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        manifests["recordings"].to_json(output_dir / "recordings.json")
        manifests["supervisions"].to_json(output_dir / "supervisions.json")

    return manifests
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for the Callhome Egyptian Arabic Corpus.

    For each of the splits ``train``, ``devtest`` and ``evaltest`` a recording manifest and a
    text supervision manifest are created, fixed with ``fix_manifests`` and validated.

    :param audio_dir: Path to ``LDC97S45`` package.
    :param transcript_dir: Path to the ``LDC97T19`` content
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    depth = None if absolute_paths else 4

    manifests = {}
    for split in ["train", "devtest", "evaltest"]:
        # The LDC distribution has a typo.
        audio_split_dir = audio_dir / "callhome/arabic" / split.replace("evaltest", "evltest")
        audio_paths = check_and_rglob(audio_split_dir, "*.sph")
        recordings = RecordingSet.from_recordings(
            Recording.from_file(sph, relative_path_depth=depth)
            for sph in tqdm(audio_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / f"callhome_arabic_trans_970711/transcrp/{split}/roman",
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        segments = []
        for transcript in transcript_paths:
            recording_id = transcript.stem
            # Segments of the current file only; its length doubles as the running index.
            file_segments = []
            for raw_line in transcript.read_text().splitlines():
                line = raw_line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                file_segments.append(
                    SupervisionSegment(
                        id=f"{recording_id}_{len(file_segments)}",
                        recording_id=recording_id,
                        start=float(start),
                        duration=duration,
                        speaker=f"{recording_id}_{spk}",
                        text=text,
                    )
                )
            segments.extend(file_segments)
        supervisions = SupervisionSet.from_segments(segments)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{split}.json")
            supervisions.to_json(output_dir / f"supervisions_{split}.json")

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
def get_paths(fold_path_and_pattern: Tuple[Pathlike, str]) -> List[Path]:
    """
    Forward a packed ``(directory, glob pattern)`` argument to ``check_and_rglob``.

    Taking a single tuple argument lets the function be used with ``map``-style APIs
    that pass exactly one item per call.

    :param fold_path_and_pattern: The positional arguments for ``check_and_rglob``.
    :return: Whatever ``check_and_rglob`` returns for those arguments.
    """
    positional_args = tuple(fold_path_and_pattern)
    return check_and_rglob(*positional_args)
def prepare_fisher_english(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    absolute_paths: bool = False,
    num_jobs: int = 1,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that ``audio_dirs`` and ``transcript_dirs`` are in ``corpus_dir``.
    We create two manifests: one with recordings, and the other one with text supervisions.

    Intermediate manifests are written to ``recordings_notfixed.jsonl.gz`` and
    ``supervisions_notfixed.jsonl.gz`` inside ``output_dir``; when one of these files already
    exists it is re-used instead of being rebuilt. The fixed and validated manifests are
    written to ``recordings.jsonl.gz`` and ``supervisions.jsonl.gz``.

    :param corpus_dir: Path to Fisher corpus
    :param output_dir: Directory where the manifests are written (required).
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :param num_jobs: Number of worker processes used to create the recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises ValueError: when one of ``audio_dirs`` / ``transcript_dirs`` is not a directory
        inside ``corpus_dir``.
    """
    corpus_dir = Path(corpus_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for workdir in audio_dirs + transcript_dirs:
        workdir_path = corpus_dir / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_dir}'."
            )

    # ``Path.iterdir()`` already yields paths that include the directory being listed, so the
    # entries are used as-is; re-joining them onto the parent would duplicate the prefix
    # whenever ``corpus_dir`` is a relative path.
    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        for audio_partition_dir in (corpus_dir / audio_dir).iterdir():
            audio_subdir_paths.extend((audio_partition_dir / "audio").iterdir())

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_dir_path = corpus_dir / transcript_dir / "data" / "trans"
        transcript_subdir_paths.extend(transcript_dir_path.iterdir())

    audio_paths = walk_dirs_parallel(
        audio_subdir_paths, "*.sph", "Parsing audio sub-dirs"
    )
    transcript_paths = walk_dirs_parallel(
        transcript_subdir_paths, "*.txt", "Parsing transcript sub-dirs"
    )

    # Map each session id (first column) to the values found in columns 5 and 10 of the
    # ``*_calldata.tbl`` files, stored under the keys "A" and "B"; the header row is skipped.
    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_dir / transcript_dir / "doc", "*_calldata.tbl"
        )[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            rows = [
                row.rstrip("\n").split(",") for row in sessions_data_f.readlines()
            ][1:]
            sessions.update({row[0]: {"A": row[5], "B": row[10]} for row in rows})

    assert len(transcript_paths) == len(
        audio_paths
    ), f"{len(transcript_paths)} != {len(audio_paths)}"
    if len(transcript_paths) != len(sessions):
        warnings.warn(
            f"Fisher's *_calldata.tbl files indicate there should be {len(sessions)} sessions, "
            f"but our scanning of audio and transcript files indicates there are only {len(transcript_paths)}."
        )

    recs_path = output_dir / "recordings_notfixed.jsonl.gz"
    if recs_path.is_file():
        logging.info(f"Using existing recording manifest at {recs_path}")
        recordings = RecordingSet.from_jsonl_lazy(recs_path)
    else:
        logging.info("Building fresh recording manifest")
        create_recordings_input = [
            (p, None if absolute_paths else 5) for p in audio_paths
        ]
        err_recos = 0
        with ProcessPoolExecutor(num_jobs) as executor, RecordingSet.open_writer(
            recs_path
        ) as writer:
            with tqdm(
                total=len(create_recordings_input), desc="Collect recordings"
            ) as pbar:
                for reco in executor.map(create_recording, create_recordings_input):
                    if reco is not None:
                        writer.write(reco, flush=True)
                    else:
                        err_recos += 1
                    pbar.update()
        if err_recos:
            warnings.warn(
                f"Out of {len(create_recordings_input)} recordings, "
                f"{err_recos} had errors and were omitted."
            )
        recordings = writer.open_manifest()

    sups_path = output_dir / "supervisions_notfixed.jsonl.gz"
    if sups_path.is_file():
        logging.info(f"Using existing supervision manifest at {sups_path}")
        supervisions = SupervisionSet.from_jsonl_lazy(sups_path)
    else:
        logging.info("Building fresh supervision manifest")
        create_supervisions_input = [(sessions, p) for p in transcript_paths]
        err_sups = 0
        # ``os.cpu_count()`` may return None when the count cannot be determined.
        num_threads = (os.cpu_count() or 1) * 4
        with ThreadPoolExecutor(num_threads) as executor, SupervisionSet.open_writer(
            sups_path
        ) as writer:
            with tqdm(
                total=len(create_supervisions_input), desc="Create supervisions"
            ) as pbar:
                for tmp_supervisions in executor.map(
                    create_supervision, create_supervisions_input
                ):
                    if not tmp_supervisions:
                        err_sups += 1
                    # An empty (or None) result simply contributes no segments.
                    for s in tmp_supervisions or ():
                        writer.write(s)
                    pbar.update()
        supervisions = writer.open_manifest()
        # Test the supervision error counter here: ``err_recos`` is not even defined when
        # the recording manifest was loaded from an existing file.
        if err_sups:
            warnings.warn(
                f"Out of {len(create_supervisions_input)} transcript files, "
                f"{err_sups} had errors and were omitted."
            )

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    # Write the fixed and validated version to files with standard names.
    recordings.to_file(recs_path.parent / "recordings.jsonl.gz")
    supervisions.to_file(sups_path.parent / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_callhome_english_asr(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for the CallHome American English corpus.

    For each of the splits ``evaltest``, ``train`` and ``devtest`` a recording manifest and a
    text supervision manifest are created, fixed with ``fix_manifests`` and validated.
    Transcript lines that do not parse as ``<start> <end> <speaker> <text>`` are treated as
    continuations and appended to the previously accepted line.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict keyed by split name; each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)
    depth = None if absolute_paths else 4

    manifests = {}
    for split in ["evaltest", "train", "devtest"]:
        # The LDC distribution has a typo.
        audio_split_dir = audio_dir / "data" / split.replace("evaltest", "evltest")
        audio_paths = check_and_rglob(audio_split_dir, "*.sph")
        recordings = RecordingSet.from_recordings(
            Recording.from_file(sph, relative_path_depth=depth)
            for sph in tqdm(audio_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / "transcrpt" / split,
            "*.txt",
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        segments = []
        for transcript in transcript_paths:
            recording_id = transcript.stem

            # Pass 1: drop blank/comment lines and glue continuation lines onto their predecessor.
            merged_lines = []
            for raw_line in transcript.read_text().splitlines():
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue
                try:
                    start, end, spk, text = line.split(maxsplit=3)
                    duration = float(Decimal(end) - Decimal(start))
                except (InvalidOperation, ValueError):
                    # Too few fields or non-numeric timestamps: not a new segment.
                    merged_lines[-1] = merged_lines[-1] + " " + line
                    continue
                if duration <= 0:
                    continue
                merged_lines.append(line)

            # Pass 2: turn every merged line into a supervision segment.
            # Segments of the current file only; its length doubles as the running index.
            file_segments = []
            for line in merged_lines:
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                file_segments.append(
                    SupervisionSegment(
                        recording_id=recording_id,
                        start=float(start),
                        duration=duration,
                        # The channel index is the offset of the speaker's first letter from "A".
                        channel=ord(spk[0]) - ord("A"),
                        speaker=f"{recording_id}_{spk:0>2s}",
                        id=f"{recording_id}_{spk:0>2s}_{len(file_segments):0>5d}",
                        text=text,
                    )
                )
            segments.extend(file_segments)
        supervisions = SupervisionSet.from_segments(segments)

        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_file(
                output_dir / f"callhome-english_recordings_{split}.jsonl.gz"
            )
            supervisions.to_file(
                output_dir / f"callhome-english_supervisions_{split}.jsonl.gz"
            )

        manifests[split] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
def prepare_gale_arabic(
    audio_dirs: List[Pathlike],
    transcript_dirs: List[Pathlike],
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = True,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for the GALE Arabic Broadcast speech corpus.

    Recordings whose id is contained in the module-level ``TEST`` collection form the
    ``test`` part; all remaining recordings form the ``train`` part.

    :param audio_dirs: List of paths to audio corpora.
    :param transcript_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: When ``True``, ``relative_path_depth=None`` is passed to
        ``Recording.from_file``; otherwise a depth of 3 is used.
    :return: A dict keyed by part (``'test'``, ``'train'``); each value is a dict with the keys
        ``{'recordings', 'supervisions'}``.
    """
    assert len(audio_dirs) == len(
        transcript_dirs
    ), "Paths to the same speech and transcript corpora must be provided"

    logging.info("Reading audio and transcript paths from provided dirs")
    # Some of the audio is wav while others are flac. Also, some recordings
    # may be repeated across corpora so we key them by stem to avoid adding them
    # twice (the path found last wins).
    audio_batches = [
        check_and_rglob(corpus, pattern, strict=False)
        for corpus in audio_dirs
        for pattern in ['*.wav', '*.flac']
    ]
    audio_paths = {}
    for batch in audio_batches:
        for audio in batch:
            audio_paths[audio.stem] = audio

    transcript_batches = [check_and_rglob(corpus, '*.tdf') for corpus in transcript_dirs]
    transcript_paths = list(chain.from_iterable(transcript_batches))

    logging.info("Preparing recordings manifest")
    depth = None if absolute_paths else 3
    recordings = RecordingSet.from_recordings(
        Recording.from_file(audio, relative_path_depth=depth)
        for audio in audio_paths.values()
    )

    logging.info("Preparing supervisions manifest")
    supervisions = SupervisionSet.from_segments(parse_transcripts(transcript_paths))

    # Some supervisions exceed recording boundaries, so here we trim them
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = defaultdict(dict)
    manifests['test'] = {
        'recordings': recordings.filter(lambda r: r.id in TEST),
        'supervisions': supervisions.filter(lambda s: s.recording_id in TEST),
    }
    manifests['train'] = {
        'recordings': recordings.filter(lambda r: r.id not in TEST),
        'supervisions': supervisions.filter(lambda s: s.recording_id not in TEST),
    }

    if output_dir is None:
        return manifests

    logging.info("Writing manifests to JSON files")
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    for part in ["train", "test"]:
        part_manifests = manifests[part]
        part_manifests["recordings"].to_json(output_dir / f'recordings_{part}.json')
        part_manifests["supervisions"].to_json(output_dir / f'supervisions_{part}.json')

    return manifests
def prepare_fisher_spanish(
    audio_dir_path: Pathlike,
    transcript_dir_path: Pathlike,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Build manifests for Fisher Spanish.
    Two manifests are created: one with recordings, and the other one with text supervisions
    (zero-length and negative-length supervisions are filtered out).

    :param audio_dir_path: Path to audio directory (usually LDC2010S01).
    :param transcript_dir_path: Path to transcript directory (usually LDC2010T04).
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    audio_dir_path = Path(audio_dir_path)
    transcript_dir_path = Path(transcript_dir_path)

    audio_paths = check_and_rglob(audio_dir_path, "*.sph")
    transcript_paths = check_and_rglob(transcript_dir_path, "*.tdf")

    # Map each session id (first column) to the values found in columns 2 and 8 of the
    # ``*_call.tbl`` file, stored under the keys 0 and 1; the header row is skipped.
    sessions_data_path = check_and_rglob(transcript_dir_path, "*_call.tbl")[0]
    with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
        rows = [raw.rstrip("\n").split(",") for raw in sessions_data_f.readlines()]
    sessions = {}
    for row in rows[1:]:
        sessions[row[0]] = {0: row[2], 1: row[8]}

    assert len(transcript_paths) == len(sessions) == len(audio_paths)

    depth = None if absolute_paths else 4
    create_recordings_input = [(audio, depth) for audio in audio_paths]
    collected_recordings = []
    with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
        with tqdm(total=len(audio_paths), desc="Collect recordings") as pbar:
            # ``executor.map`` yields results in input order.
            for reco in executor.map(create_recording, create_recordings_input):
                collected_recordings.append(reco)
                pbar.update()
    recordings = RecordingSet.from_recordings(collected_recordings)

    create_supervisions_input = [(sessions, transcript) for transcript in transcript_paths]
    per_file_supervisions = []
    with ThreadPoolExecutor(os.cpu_count() * 4) as executor:
        with tqdm(
            total=len(create_supervisions_input), desc="Create supervisions"
        ) as pbar:
            for file_supervisions in executor.map(
                create_supervision, create_supervisions_input
            ):
                per_file_supervisions.append(file_supervisions)
                pbar.update()

    flat_supervisions = [
        segment
        for file_supervisions in per_file_supervisions
        for segment in file_supervisions
    ]
    supervisions = SupervisionSet.from_segments(flat_supervisions).filter(
        lambda s: s.duration > 0.0
    )

    recordings, supervisions = fix_manifests(recordings, supervisions)
    validate_recordings_and_supervisions(recordings, supervisions)

    manifests = {"recordings": recordings, "supervisions": supervisions}
    if output_dir is None:
        return manifests

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    recordings.to_json(output_dir / "recordings.json")
    supervisions.to_json(output_dir / "supervisions.json")
    return manifests
def prepare_mgb2(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    text_cleaning: bool = True,
    buck_walter: bool = False,
    num_jobs: int = 1,
    mer_thresh: int = 80,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. It must contain the
        ``dev``, ``train`` and ``test`` sub-directories.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param text_cleaning: Bool, if True, basic text cleaning is performed (similar to ESPNet recipe).
    :param buck_walter: Bool, use BuckWalter transliteration
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :param mer_thresh: int, filter out segments based on mer (Match Error Rate)
    :return: a Dict whose key is the dataset part; for every part prepared by this call
        the value is a Dict with the keys 'recordings' and 'supervisions'.
    :raises AssertionError: if ``corpus_dir`` is not a directory, or if the number of
        supervisions of a part differs from the hard-coded expected count.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `text_cleaning` option removes all punctuation and diacritics.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["dev", "train", "test"]
    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix="mgb2",
            suffix="jsonl.gz",
            lazy=True,
        )

    for part in dataset_parts:
        info(f"Processing MGB2 subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="mgb2", suffix="jsonl.gz"
        ):
            info(f"MGB2 subset: {part} already prepared - skipping.")
            continue

        # Build the recordings and supervisions of this part; they are validated and
        # written to ``output_dir`` at the end of the iteration.
        # Both conversions below are no-ops when the arguments are already ``Path`` objects.
        output_dir = Path(output_dir)
        corpus_dir = Path(corpus_dir)
        if part == "test" or part == "dev":
            # Assemble a Kaldi-style data dir (text, segments, wav.scp) under
            # ``output_dir / part`` from the ``*.non_overlap_speech`` files of the corpus.
            (output_dir / part).mkdir(parents=True, exist_ok=True)
            copy(
                corpus_dir / part / "text.non_overlap_speech",
                output_dir / part / "text",
            )
            copy(
                corpus_dir / part / "segments.non_overlap_speech",
                output_dir / part / "segments",
            )
            with open(corpus_dir / part / "wav.scp", "r") as f_in, open(
                output_dir / part / "wav.scp", "w"
            ) as f_out:
                for line in f_in:
                    # Prefix every "wav/" occurrence with the corpus location of this part.
                    f_out.write(line.replace("wav/", f"{corpus_dir}/{part}/wav/"))
                    # NOTE(review): ``line`` keeps its own trailing newline (when the input
                    # has one), so this emits an extra blank line after each entry -
                    # confirm that the Kaldi loader tolerates blank lines.
                    f_out.write("\n")

            # NOTE(review): 16000 is presumably the sampling rate - confirm.
            recordings, supervisions, _ = load_kaldi_data_dir(
                (output_dir / part), 16000
            )
            # NOTE(review): presumably the dev/test text is stored in Buckwalter
            # transliteration and is converted back unless ``buck_walter`` is
            # requested - confirm against ``from_buck_walter``.
            if buck_walter is False:
                supervisions = supervisions.transform_text(from_buck_walter)
            if part == "test":
                assert (
                    len(supervisions) == 5365
                ), f"Expected 5365 supervisions for test, found {len(supervisions)}"
            elif part == "dev":
                assert (
                    len(supervisions) == 5002
                ), f"Expected 5002 supervisions for dev, found {len(supervisions)}"
        elif part == "train":
            recordings = RecordingSet.from_dir(
                (corpus_dir / part / "wav"), pattern="*.wav", num_jobs=num_jobs
            )

            xml_paths = check_and_rglob(
                path.join(corpus_dir, part, "xml/utf8"), "*.xml"
            )
            # Build the supervisions from the XML transcripts.
            with recursion_limit(5000):
                supervisions_list = list(
                    chain.from_iterable(
                        [make_supervisions(p, mer_thresh) for p in xml_paths]
                    )
                )

            supervisions = SupervisionSet.from_segments(supervisions_list)

            # NOTE(review): the expected count is hard-coded although ``mer_thresh`` is
            # configurable - presumably it only holds for the default threshold; confirm.
            assert (
                len(supervisions) == 375103
            ), f"Expected 375103 supervisions for train, found {len(supervisions)}"

            if text_cleaning is True:
                supervisions = supervisions.transform_text(cleaning)
            recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        # saving recordings and supervisions
        recordings.to_file((output_dir / f"mgb2_recordings_{part}.jsonl.gz"))
        supervisions.to_file((output_dir / f"mgb2_supervisions_{part}.jsonl.gz"))

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
    return manifests
def prepare_fisher_english(
    corpus_path: Pathlike,
    audio_dirs: List[str] = FISHER_AUDIO_DIRS,
    transcript_dirs: List[str] = FISHER_TRANSCRIPT_DIRS,
    output_dir: Optional[Pathlike] = None,
    absolute_paths: bool = False,
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_path.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_path: Path to Fisher corpus
    :param audio_dirs: List of dirs of audio corpora.
    :param transcript_dirs: List of dirs of transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    :raises ValueError: When one of ``audio_dirs`` / ``transcript_dirs`` is not a
        directory inside ``corpus_path``.
    :raises AssertionError: When the number of audio files, transcript files and
        sessions listed in the ``*_calldata.tbl`` tables do not all match.
    """
    corpus_path = Path(corpus_path)

    for workdir in audio_dirs + transcript_dirs:
        workdir_path = corpus_path / workdir
        if not workdir_path.is_dir():
            raise ValueError(
                f"Could not find '{workdir}' directory inside '{corpus_path}'."
            )

    # ``Path.iterdir()`` yields paths that already include the directory being
    # listed, so they are used as-is. Re-joining them onto the parent only works
    # for absolute paths and duplicates the prefix when ``corpus_path`` is relative.
    audio_subdir_paths = []
    for audio_dir in audio_dirs:
        for audio_partition_dir in (corpus_path / audio_dir).iterdir():
            audio_subdir_paths.extend((audio_partition_dir / "audio").iterdir())

    transcript_subdir_paths = []
    for transcript_dir in transcript_dirs:
        transcript_subdir_paths.extend(
            (corpus_path / transcript_dir / "data" / "trans").iterdir()
        )

    audio_paths = walk_dirs_parallel(
        audio_subdir_paths, "*.sph", "Parsing audio sub-dirs"
    )
    transcript_paths = walk_dirs_parallel(
        transcript_subdir_paths, "*.txt", "Parsing transcript sub-dirs"
    )

    # Every transcript package ships a comma-separated ``*_calldata.tbl`` table with
    # one header line. Column 0 is used as the session key; columns 5 and 10 are
    # stored under keys "A" and "B".
    # NOTE(review): presumably these are the speaker IDs of both call sides - confirm.
    sessions = {}
    for transcript_dir in transcript_dirs:
        sessions_data_path = check_and_rglob(
            corpus_path / transcript_dir / "doc", "*_calldata.tbl"
        )[0]
        with codecs.open(sessions_data_path, "r", "utf8") as sessions_data_f:
            session_rows = [
                row.rstrip("\n").split(",") for row in sessions_data_f.readlines()[1:]
            ]
        sessions.update({row[0]: {"A": row[5], "B": row[10]} for row in session_rows})

    assert len(transcript_paths) == len(sessions) == len(audio_paths), (
        f"Inconsistent corpus contents: found {len(audio_paths)} audio files, "
        f"{len(transcript_paths)} transcripts and {len(sessions)} sessions."
    )

    # ``os.cpu_count()`` returns None when the CPU count cannot be determined;
    # fall back to 1 so that the multiplication below cannot raise TypeError.
    num_threads = (os.cpu_count() or 1) * 4

    # The second tuple element is None for absolute paths and 5 otherwise.
    # NOTE(review): presumably the relative path depth used by ``create_recording`` - confirm.
    create_recordings_input = [(p, None if absolute_paths else 5) for p in audio_paths]
    with ThreadPoolExecutor(num_threads) as executor:
        # ``executor.map`` yields results in input order.
        recordings = list(
            tqdm(
                executor.map(create_recording, create_recordings_input),
                total=len(create_recordings_input),
                desc="Collect recordings",
            )
        )
    recordings = RecordingSet.from_recordings(recordings)

    create_supervisions_input = [(sessions, p) for p in transcript_paths]
    with ThreadPoolExecutor(num_threads) as executor:
        supervision_groups = list(
            tqdm(
                executor.map(create_supervision, create_supervisions_input),
                total=len(create_supervisions_input),
                desc="Create supervisions",
            )
        )
    # Each transcript yields a collection of segments: flatten them into one set.
    supervisions = SupervisionSet.from_segments(
        list(it.chain.from_iterable(supervision_groups))
    )
    supervisions = trim_supervisions_to_recordings(recordings, supervisions)

    validate_recordings_and_supervisions(recordings, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_file(output_dir / "recordings.jsonl.gz")
        supervisions.to_file(output_dir / "supervisions.jsonl.gz")

    return {"recordings": recordings, "supervisions": supervisions}
def prepare_callhome_egyptian(
    audio_dir: Pathlike,
    transcript_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    sph2pipe_path: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the Callhome Egyptian Arabic corpus.
    For each of the splits ``train``, ``devtest`` and ``evaltest`` we create two manifests:
    one with recordings, and the other one with text supervisions.

    :param audio_dir: Path to the audio package. It must contain the
        ``callhome/arabic/{train,devtest,evltest}`` directories with ``*.sph`` files.
    :param transcript_dir: Path to the transcripts package. It must contain the
        ``callhome_arabic_trans_970711/transcrp/<split>/roman`` directories with ``*.txt`` files.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param sph2pipe_path: When provided, we will "hard-wire" the sph2pipe path into the recording manifests.
    :return: A dict keyed by the split name (``'train'``, ``'devtest'``, ``'evaltest'``);
        each value is a dict with manifests under the keys ``{'recordings', 'supervisions'}``.
    :raises ValueError: When a non-empty transcript line has fewer than four
        whitespace-separated fields.
    """
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    manifests = {}

    for split in ['train', 'devtest', 'evaltest']:
        audio_paths = check_and_rglob(
            # The LDC distribution has a typo.
            audio_dir / 'callhome/arabic' / split.replace('evaltest', 'evltest'),
            '*.sph',
        )
        recordings = RecordingSet.from_recordings(
            make_recording_callhome(p, sph2pipe_path=sph2pipe_path)
            for p in tqdm(audio_paths)
        )

        transcript_paths = check_and_rglob(
            transcript_dir / f'callhome_arabic_trans_970711/transcrp/{split}/roman',
            '*.txt',
        )

        # TODO: Add text normalization like in Kaldi recipe.
        #       Not doing this right now as it's not needed for VAD/diarization...
        supervisions = []
        for p in transcript_paths:
            # The transcript file name (without extension) is the recording ID.
            recording_id = p.stem
            # Counts only the segments that are kept, so segment IDs are contiguous.
            idx = 0
            for line in p.read_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                # example line:
                # 19.33 21.18 B: %ah Tayyib
                start, end, spk, text = line.split(maxsplit=3)
                spk = spk.replace(":", "")
                # Subtract as Decimal so that the duration is not affected by
                # binary floating-point rounding of the two timestamps.
                duration = float(Decimal(end) - Decimal(start))
                if duration <= 0:
                    continue
                supervisions.append(
                    SupervisionSegment(
                        id=f'{recording_id}_{idx}',
                        recording_id=recording_id,
                        start=float(start),
                        duration=duration,
                        speaker=f'{recording_id}_{spk}',
                        text=text,
                    )
                )
                idx += 1
        supervisions = SupervisionSet.from_segments(supervisions)

        recordings, supervisions = remove_missing_recordings_and_supervisions(
            recordings, supervisions
        )
        supervisions = trim_supervisions_to_recordings(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f'recordings_{split}.json')
            supervisions.to_json(output_dir / f'supervisions_{split}.json')

        manifests[split] = {
            'recordings': recordings,
            'supervisions': supervisions,
        }

    return manifests