def _index_paths_by_stem(paths) -> Dict[str, Path]:
    """
    Build a ``stem -> path`` lookup table from an iterable of paths.

    When several paths share the same stem, the first one encountered is kept.

    :param paths: iterable of path objects exposing a ``stem`` attribute.
    :return: a dict mapping each file stem to its path.
    """
    index: Dict[str, Path] = {}
    for path in paths:
        index.setdefault(path.stem, path)
    return index


def _annotation_path(index: Dict[str, Path], recording_id: str, kind: str) -> Path:
    """
    Look up the annotation file that belongs to ``recording_id``.

    :param index: a ``stem -> path`` table built by ``_index_paths_by_stem``.
    :param recording_id: the recording id, expected to equal the file stem.
    :param kind: a short label ("RTTM", "UEM") used only in the error message.
    :return: the matching path.
    :raises IndexError: when no annotation file matches the recording. The
        exception type is kept for backward compatibility; only the message
        is more descriptive.
    """
    try:
        return index[recording_id]
    except KeyError:
        raise IndexError(
            f"No {kind} file found for recording '{recording_id}'"
        ) from None


def prepare_dihard3(
    dev_audio_dir: Pathlike,
    eval_audio_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    uem_manifest: Optional[bool] = True,
    num_jobs: Optional[int] = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the DIHARD III corpus.

    For every available part ("dev", "eval") we create a recording manifest
    and a supervision manifest containing speaker ids and timestamps.
    Optionally a third manifest describing the UEM segments is created too.

    :param dev_audio_dir: Path to downloaded DIHARD III dev corpus (LDC2020E12),
        e.g. /data/corpora/LDC/LDC2020E12
    :param eval_audio_dir: Path to downloaded DIHARD III eval corpus (LDC2021E02),
        e.g. /data/corpora/LDC/LDC2021E02
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param uem_manifest: If True, also return a SupervisionSet describing the
        UEM segments (see use in dataset.DiarizationDataset)
    :param num_jobs: int (default = 1), number of jobs to scan corpus directory
        for recordings
    :return: A dict keyed by the corpus part (``'dev'``, ``'eval'``). Each value
        is a dict with the keys ``'recordings'`` and ``'supervisions'``, plus
        ``'uem'`` when ``uem_manifest`` is True. Parts whose directory is
        ``None`` or does not exist are skipped and absent from the result.
    :raises IndexError: when a recording has no matching RTTM (or UEM) file.
    """
    manifests = defaultdict(dict)
    for part in tqdm(["dev", "eval"], desc="Preparing DIHARD parts"):
        audio_dir = dev_audio_dir if part == "dev" else eval_audio_dir
        if audio_dir is None or not Path(audio_dir).exists():
            logging.warning(f"Nothing to be done for {part}")
            continue

        # Index the annotation files once, so the per-recording lookup below is
        # a dict access instead of a scan over every annotation path.
        rttm_by_stem = _index_paths_by_stem(check_and_rglob(audio_dir, "*.rttm"))
        uem_by_stem = _index_paths_by_stem(check_and_rglob(audio_dir, "*.uem"))

        recordings = RecordingSet.from_dir(audio_dir, "*.flac", num_jobs=num_jobs)

        # Read metadata for recordings
        metadata = parse_metadata(
            list(check_and_rglob(audio_dir, "recordings.tbl"))[0]
        )

        supervisions = SupervisionSet.from_segments(
            chain.from_iterable(
                make_rttm_segments(
                    rttm_path=_annotation_path(rttm_by_stem, recording.id, "RTTM"),
                    recording=recording,
                    metadata=metadata[recording.id],
                )
                for recording in recordings
            )
        )
        if uem_manifest:
            uem = SupervisionSet.from_segments(
                chain.from_iterable(
                    make_uem_segments(
                        uem_path=_annotation_path(uem_by_stem, recording.id, "UEM"),
                        recording=recording,
                    )
                    for recording in recordings
                )
            )

        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            recordings.to_json(output_dir / f"recordings_{part}.json")
            supervisions.to_json(output_dir / f"supervisions_{part}.json")
            if uem_manifest:
                uem.to_json(output_dir / f"uem_{part}.json")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
        if uem_manifest:
            manifests[part]["uem"] = uem
    return manifests
def prepare_aspire(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None, mic: str = "single"
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare the recording and supervision manifests for the ASpIRE dev sets.

    :param corpus_dir: Pathlike, the path of the corpus dir (LDC2017S21).
    :param output_dir: Pathlike, the path where to write the manifests.
        When omitted, nothing is written to disk.
    :param mic: str, the microphone type, either "single" or "multi".
    :return: a Dict whose key is the dataset part ('dev' and 'dev_test'), and
        the value is Dicts with the keys 'recordings' and 'supervisions'.
    :raises AssertionError: when ``corpus_dir`` is not a directory or ``mic``
        is neither "single" nor "multi".
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    assert mic in [
        "single",
        "multi",
    ], f"mic must be either 'single' or 'multi', got {mic}"

    corpus_dir = corpus_dir / "IARPA-ASpIRE-Dev-Sets-v2.0" / "data"
    audio_dir = corpus_dir / "dev_and_dev_test_audio"
    stm_dir = corpus_dir / "dev_and_dev_test_STM_files"

    if mic == "single":
        audio_paths = {
            "dev": audio_dir / "ASpIRE_single_dev",
            "dev_test": audio_dir / "ASpIRE_single_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "dev.stm",
            "dev_test": stm_dir / "dev_test.stm",
        }
    else:
        audio_paths = {
            "dev": audio_dir / "ASpIRE_multi_dev",
            "dev_test": audio_dir / "ASpIRE_multi_dev_test",
        }
        stm_file = {
            "dev": stm_dir / "multi_dev.stm",
            "dev_test": stm_dir / "multi_dev_test.stm",
        }

    manifests = defaultdict(dict)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for part in ["dev", "dev_test"]:
        # Prepare the recordings
        if mic == "single":
            recording_set = RecordingSet.from_dir(audio_paths[part], "*.wav")
        else:
            import soundfile as sf

            # Group audios so that each entry is a session containing all
            # channels. The session name is the file stem without its last
            # "_"-separated token; the sort makes equal keys adjacent, which
            # itertools.groupby requires.
            audio_groups = {
                k: list(v)
                for k, v in itertools.groupby(
                    sorted(audio_paths[part].glob("*.wav")),
                    key=lambda x: "_".join(x.stem.split("_")[:-1]),
                )
            }
            recordings = []
            for session_name, audios in audio_groups.items():
                # Read the audio properties from the first channel file, and
                # close the handle right away instead of leaking one open file
                # per session.
                with sf.SoundFile(str(audios[0])) as audio_sf:
                    sampling_rate = audio_sf.samplerate
                    num_samples = audio_sf.frames
                recordings.append(
                    Recording(
                        id=session_name,
                        sources=[
                            AudioSource(
                                type="file",
                                # The last two characters of the stem are read
                                # as a 1-based channel number.
                                channels=[int(audio.stem[-2:]) - 1],
                                source=str(audio),
                            )
                            for audio in sorted(audios)
                        ],
                        sampling_rate=sampling_rate,
                        num_samples=num_samples,
                        duration=num_samples / sampling_rate,
                    )
                )
            recording_set = RecordingSet.from_recordings(recordings)

        # Read STM file and prepare segments
        segments = []
        with open(stm_file[part]) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line carries no segment; skipping it avoids a
                    # ValueError in the tuple unpacking below.
                    continue
                session, _, speaker, start, end, text = line.split(maxsplit=5)
                segments.append(
                    AspireSegmentAnnotation(
                        session, speaker, float(start), float(end), text
                    )
                )

        # Group the segments by session and speaker
        segments_grouped = defaultdict(list)
        for segment in segments:
            segments_grouped[(segment.session, segment.speaker)].append(segment)

        # Create the supervisions; ids are numbered per (session, speaker) pair
        supervisions = []
        for (session, speaker), segs in segments_grouped.items():
            supervisions.extend(
                SupervisionSegment(
                    id=f"{session}-{speaker}-{i:03d}",
                    recording_id=session,
                    start=seg.start,
                    duration=round(seg.end - seg.start, 4),
                    speaker=speaker,
                    text=seg.text,
                    language="English",
                )
                for i, seg in enumerate(segs)
            )
        supervision_set = SupervisionSet.from_segments(supervisions)

        recording_set, supervision_set = fix_manifests(recording_set, supervision_set)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"aspire_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(output_dir / f"aspire_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests