def prepare_hifitts(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the HiFiTTS dataset.

    Every ``*.json`` file found directly in ``corpus_dir`` is treated as one
    partition. Partitions whose manifests already exist in ``output_dir`` are
    skipped; the remaining ones are prepared in a process pool.

    :param corpus_dir: Path or str, the path to the downloaded corpus main directory.
    :param output_dir: Path or str, the path where to write the manifests.
    :param num_jobs: How many concurrent workers to use for preparing each dataset partition.
    :return: a dict with manifests for all the partitions
        (example query: ``manifests['92_clean_train']['recordings']``).
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    manifests = {}

    json_manifests = list(corpus_dir.glob("*.json"))
    dataset_partitions = [to_partition_id(p) for p in json_manifests]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_partitions,
            output_dir=output_dir,
            prefix="hifitts",
        )

    with ProcessPoolExecutor(num_jobs) as ex:
        # Map each future to the partition it was submitted for. Futures are
        # consumed in *completion* order below, so pairing them positionally
        # with a list of ids built in *submission* order would mislabel results.
        future_to_partition = {}
        for raw_manifest_path in json_manifests:
            speaker_id, _, clean_or_other, part = raw_manifest_path.stem.split("_")
            partition_id = to_partition_id(raw_manifest_path)
            if manifests_exist(
                part=partition_id, output_dir=output_dir, prefix="hifitts"
            ):
                logging.info(f"HiFiTTS subset: {part} already prepared - skipping.")
                continue
            future = ex.submit(
                prepare_single_partition,
                raw_manifest_path,
                corpus_dir,
                speaker_id,
                clean_or_other,
            )
            future_to_partition[future] = partition_id

        for future in tqdm(
            as_completed(future_to_partition),
            desc="Preparing HiFiTTS parts",
            total=len(future_to_partition),
        ):
            partition_id = future_to_partition[future]
            recordings, supervisions = future.result()

            if output_dir is not None:
                supervisions.to_json(
                    output_dir / f"hifitts_supervisions_{partition_id}.json"
                )
                recordings.to_json(
                    output_dir / f"hifitts_recordings_{partition_id}.json"
                )

            manifests[partition_id] = {
                "recordings": recordings,
                "supervisions": supervisions,
            }

    return manifests
def prepare_librispeech(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the Recording and Supervision manifests for LibriSpeech.

    Parts whose manifests are already present in ``output_dir`` are read from
    there instead of being prepared again.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        ``"auto"`` (default) selects every known LibriSpeech / mini LibriSpeech part
        that is present in ``corpus_dir``; ``"mini_librispeech"`` selects only the
        mini LibriSpeech parts present there.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: size of the thread pool used to parse utterances.
    :return: a Dict whose key is the dataset part, and the value is a Dict with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "mini_librispeech":
        dataset_parts = set(MINI_LIBRISPEECH) & {
            entry.name for entry in corpus_dir.glob("*")
        }
    elif dataset_parts == "auto":
        known_parts = set(LIBRISPEECH) | set(MINI_LIBRISPEECH)
        dataset_parts = known_parts & {entry.name for entry in corpus_dir.glob("*")}
        if not dataset_parts:
            raise ValueError(
                f"Could not find any of librispeech or mini_librispeech splits in: {corpus_dir}"
            )
    elif isinstance(dataset_parts, str):
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Reuse whatever was prepared by an earlier run.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir
        )

    with ThreadPoolExecutor(num_jobs) as ex:
        for part in tqdm(dataset_parts, desc="Dataset parts"):
            logging.info(f"Processing LibriSpeech subset: {part}")
            if manifests_exist(part=part, output_dir=output_dir):
                logging.info(f"LibriSpeech subset: {part} already prepared - skipping.")
                continue

            part_path = corpus_dir / part
            pending = []
            for trans_path in tqdm(
                part_path.rglob("*.trans.txt"), desc="Distributing tasks", leave=False
            ):
                ali_path = trans_path.parent / (
                    trans_path.stem.split(".")[0] + ".alignment.txt"
                )
                alignments = parse_alignments(ali_path) if ali_path.exists() else {}
                # Each line of the transcript file looks like:
                #
                #   121-121726-0000 ALSO A POPULAR CONTRIVANCE
                #   121-121726-0001 HARANGUE THE TIRESOME PRODUCT OF A TIRELESS TONGUE
                #   121-121726-0002 ANGOR PAIN PAINFUL TO HEAR
                #
                # One parsing task is scheduled per line; each yields a
                # (Recording, SupervisionSegment) pair or None.
                with open(trans_path) as transcript:
                    pending.extend(
                        ex.submit(parse_utterance, part_path, line, alignments)
                        for line in transcript
                    )

            recordings = []
            supervisions = []
            for task in tqdm(pending, desc="Processing", leave=False):
                outcome = task.result()
                if outcome is not None:
                    recording, segment = outcome
                    recordings.append(recording)
                    supervisions.append(segment)

            recording_set = RecordingSet.from_recordings(recordings)
            supervision_set = SupervisionSet.from_segments(supervisions)

            validate_recordings_and_supervisions(recording_set, supervision_set)

            if output_dir is not None:
                supervision_set.to_file(output_dir / f"supervisions_{part}.json")
                recording_set.to_file(output_dir / f"recordings_{part}.json")

            manifests[part] = {
                "recordings": recording_set,
                "supervisions": supervision_set,
            }

    return manifests
def prepare_mgb2(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    text_cleaning: bool = True,
    buck_walter: bool = False,
    num_jobs: int = 1,
    mer_thresh: int = 80,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    The parts "dev", "train" and "test" are processed in that order. For "dev" and
    "test" a Kaldi-style data directory is assembled under ``output_dir / part`` and
    loaded from there; for "train" the recordings are scanned from ``train/wav`` and
    the supervisions are built from the XML files under ``train/xml/utf8``.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param text_cleaning: Bool, if True, basic text cleaning is performed (similar to ESPNet recipe).
        As written, it is applied to the "train" supervisions only.
    :param buck_walter: Bool, use BuckWalter transliteration. When False, the "dev"/"test"
        supervision texts are passed through ``from_buck_walter``.
    :param num_jobs: int, the number of jobs to use for parallel processing
        (forwarded to ``RecordingSet.from_dir`` for the "train" part).
    :param mer_thresh: int, filter out segments based on mer (Match Error Rate);
        forwarded to ``make_supervisions``.
    :return: a Dict whose key is the dataset part, and the value is a Dict with the keys
        'recordings' and 'supervisions'.
    :raises AssertionError: if ``corpus_dir`` is not a directory, or if a part does not
        contain the hard-coded expected number of supervisions
        (5365 for test, 5002 for dev, 375103 for train).

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `text_cleaning` option removes all punctuation and diacritics.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["dev", "train", "test"]
    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix="mgb2",
            suffix="jsonl.gz",
            lazy=True,
        )

    for part in dataset_parts:
        info(f"Processing MGB2 subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="mgb2", suffix="jsonl.gz"
        ):
            info(f"MGB2 subset: {part} already prepared - skipping.")
            continue

        # Both directories were already converted to Path above; these
        # re-conversions are no-ops kept from the original code.
        output_dir = Path(output_dir)
        corpus_dir = Path(corpus_dir)
        if part == "test" or part == "dev":
            # Stage a Kaldi-style data dir (text, segments, wav.scp) under
            # output_dir / part, using the non-overlapping-speech variants
            # of the transcript and segment files.
            (output_dir / part).mkdir(parents=True, exist_ok=True)
            copy(
                corpus_dir / part / "text.non_overlap_speech",
                output_dir / part / "text",
            )
            copy(
                corpus_dir / part / "segments.non_overlap_speech",
                output_dir / part / "segments",
            )
            with open(corpus_dir / part / "wav.scp", "r") as f_in, open(
                output_dir / part / "wav.scp", "w"
            ) as f_out:
                for line in f_in:
                    # Rewrite "wav/" so that it is prefixed with the corpus
                    # location of this part.
                    f_out.write(line.replace("wav/", f"{corpus_dir}/{part}/wav/"))
                    # NOTE(review): `line` normally still ends with its own
                    # newline, so this emits an extra empty line per entry -
                    # confirm the Kaldi loader tolerates that.
                    f_out.write("\n")

            recordings, supervisions, _ = load_kaldi_data_dir(
                (output_dir / part), 16000
            )
            if buck_walter is False:
                supervisions = supervisions.transform_text(from_buck_walter)
            # Sanity-check against the hard-coded expected segment counts.
            if part == "test":
                assert (
                    len(supervisions) == 5365
                ), f"Expected 5365 supervisions for test, found {len(supervisions)}"
            elif part == "dev":
                assert (
                    len(supervisions) == 5002
                ), f"Expected 5002 supervisions for dev, found {len(supervisions)}"
        elif part == "train":
            recordings = RecordingSet.from_dir(
                (corpus_dir / part / "wav"), pattern="*.wav", num_jobs=num_jobs
            )

            xml_paths = check_and_rglob(
                path.join(corpus_dir, part, "xml/utf8"), "*.xml"
            )
            # Build the supervisions from the XML files; the recursion limit is
            # raised to 5000 for the duration of this step.
            with recursion_limit(5000):
                supervisions_list = list(
                    chain.from_iterable(
                        [make_supervisions(p, mer_thresh) for p in xml_paths]
                    )
                )

            supervisions = SupervisionSet.from_segments(supervisions_list)

            assert (
                len(supervisions) == 375103
            ), f"Expected 375103 supervisions for train, found {len(supervisions)}"

            if text_cleaning is True:
                supervisions = supervisions.transform_text(cleaning)
            recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        # saving recordings and supervisions
        recordings.to_file((output_dir / f"mgb2_recordings_{part}.jsonl.gz"))
        supervisions.to_file((output_dir / f"mgb2_supervisions_{part}.jsonl.gz"))

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }
    return manifests
def prepare_spgispeech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    normalize_text: bool = True,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    Audio is looked up under ``corpus_dir`` itself when it contains a ``train``
    directory, otherwise under ``corpus_dir / "spgispeech"``. Transcripts are read
    from ``corpus_dir / "<part>.csv"`` ("|"-separated, first line is a header).

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text (similar to ESPNet recipe).
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :return: a Dict whose key is the dataset part ("train", "val"), and the value is a Dict
        with the keys 'recordings' and 'supervisions' (lazily opened manifests).
    :raises KeyError: if a CSV row refers to a recording that was not found on disk.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    """

    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    audio_dir = (
        corpus_dir if (corpus_dir / "train").is_dir() else corpus_dir / "spgispeech"
    )

    dataset_parts = ["train", "val"]

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe the manifests already exist: we can read them and save a bit of preparation time.
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=output_dir,
        prefix="spgispeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in dataset_parts:
        logging.info(f"Processing SPGISpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="spgispeech", suffix="jsonl.gz"
        ):
            logging.info(f"SPGISpeech subset: {part} already prepared - skipping.")
            continue

        # The worker is published as a module-level name; presumably this is
        # what lets parallel_map hand a nested function to its workers
        # (TODO confirm against parallel_map's implementation).
        global audio_read_worker

        def audio_read_worker(p: Path) -> Recording:
            return Recording.from_file(p, recording_id=f"{p.parent.stem}_{p.stem}")

        # Read the recordings and write them into manifest. The durations are
        # collected here, in the calling process, from the returned Recording
        # objects. Filling the dict from inside the worker is not reliable:
        # if the workers run in separate processes, their writes never reach
        # this dict and the supervision step below fails with KeyError.
        durations = {}
        with RecordingSet.open_writer(
            output_dir / f"spgispeech_recordings_{part}.jsonl.gz"
        ) as rec_writer:
            for recording in tqdm(
                parallel_map(
                    audio_read_worker,
                    (audio_dir / part).rglob("*.wav"),
                    num_jobs=num_jobs,
                ),
                desc="Processing SPGISpeech recordings",
            ):
                durations[recording.id] = recording.duration
                rec_writer.write(recording)

        # Read supervisions and write them to manifest
        with SupervisionSet.open_writer(
            output_dir / f"spgispeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, open(corpus_dir / f"{part}.csv", "r") as f:
            # Skip the header
            next(f)
            for line in tqdm(f, desc="Processing utterances"):
                parts = line.strip().split("|")
                # 07a785e9237c389c1354bb60abca42d5/1.wav -> 07a785e9237c389c1354bb60abca42d5_1
                recording_id = parts[0].replace("/", "_").replace(".wav", "")
                text = parts[2]
                if normalize_text:
                    text = normalize(text)
                spkid = recording_id.split("_")[0]
                segment = SupervisionSegment(
                    id=recording_id,
                    recording_id=recording_id,
                    text=text,
                    speaker=spkid,
                    start=0,
                    duration=durations[recording_id],
                    language="English",
                )
                sup_writer.write(segment)

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
        }

    return manifests
def prepare_gigaspeech(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike],
    dataset_parts: Union[str, Sequence[str]] = "auto",
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare recording, supervision and cut manifests for GigaSpeech.

    Manifests are streamed to ``output_dir`` while processing; subsets whose
    manifests already exist there are read back instead of being prepared again.

    :param corpus_dir: Pathlike, the path of the data dir (passed to
        ``speechcolab``'s ``GigaSpeech``).
    :param output_dir: Pathlike, the path where to write the manifests.
        Although annotated as Optional, it is always converted with ``Path(...)``,
        so a real path has to be given.
    :param dataset_parts: a subset name or a sequence of subset names;
        ``"auto"`` selects ("XL", "DEV", "TEST").
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :return: a Dict whose key is the subset name, and the value is a Dict with
        the keys 'recordings', 'supervisions' and 'cuts'.
    :raises ImportError: if the optional ``speechcolab`` dependency is not installed.
    """
    if is_module_available("speechcolab"):
        from speechcolab.datasets.gigaspeech import GigaSpeech
    else:
        raise ImportError(
            "To process the GigaSpeech corpus, please install optional dependency: pip install speechcolab"
        )

    subsets = ("XL", "DEV", "TEST") if dataset_parts == "auto" else dataset_parts
    if isinstance(subsets, str):
        subsets = [subsets]

    corpus_dir = Path(corpus_dir)
    gigaspeech = GigaSpeech(corpus_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Maybe some manifests already exist: we can read them and save a bit of preparation time.
    # The lookup must use the resolved subset names: `dataset_parts` may still be
    # the string "auto" (or a single bare subset name), which does not name the
    # cached files, while the loop below skips parts that already exist on disk.
    manifests = read_manifests_if_cached(
        dataset_parts=subsets,
        output_dir=output_dir,
        prefix="gigaspeech",
        suffix="jsonl.gz",
        lazy=True,
    )

    for part in subsets:
        logging.info(f"Processing GigaSpeech subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="gigaspeech", suffix="jsonl.gz"
        ):
            logging.info(f"GigaSpeech subset: {part} already prepared - skipping.")
            continue

        with RecordingSet.open_writer(
            output_dir / f"gigaspeech_recordings_{part}.jsonl.gz"
        ) as rec_writer, SupervisionSet.open_writer(
            output_dir / f"gigaspeech_supervisions_{part}.jsonl.gz"
        ) as sup_writer, CutSet.open_writer(
            output_dir / f"gigaspeech_cuts_{part}.jsonl.gz"
        ) as cut_writer:
            for recording, segments in tqdm(
                parallel_map(
                    parse_utterance,
                    gigaspeech.audios("{" + part + "}"),
                    repeat(gigaspeech.gigaspeech_dataset_dir),
                    num_jobs=num_jobs,
                ),
                desc="Processing GigaSpeech JSON entries",
            ):
                # Fix and validate the recording + supervisions
                recordings, segments = fix_manifests(
                    recordings=RecordingSet.from_recordings([recording]),
                    supervisions=SupervisionSet.from_segments(segments),
                )
                validate_recordings_and_supervisions(
                    recordings=recordings, supervisions=segments
                )
                # Create the cut since most users will need it anyway.
                # There will be exactly one cut since there's exactly one recording.
                cuts = CutSet.from_manifests(
                    recordings=recordings, supervisions=segments
                )
                # Write the manifests
                rec_writer.write(recordings[0])
                for s in segments:
                    sup_writer.write(s)
                cut_writer.write(cuts[0])

        manifests[part] = {
            "recordings": RecordingSet.from_jsonl_lazy(rec_writer.path),
            "supervisions": SupervisionSet.from_jsonl_lazy(sup_writer.path),
            "cuts": CutSet.from_jsonl_lazy(cut_writer.path),
        }

    return dict(manifests)
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the Recording and Supervision manifests for LibriTTS.

    Parts whose manifests are already present in ``output_dir`` are read from
    there instead of being prepared again.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        With ``'auto'`` (default) all parts listed in ``LIBRITTS`` are used.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and the value is a Dict with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Reuse whatever was prepared by an earlier run.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir, prefix="libritts"
        )

    # SPEAKERS.txt layout (lines starting with ';' are comments):
    # ;ID   |SEX| SUBSET           |MINUTES| NAME
    # 14    | F | train-clean-360  | 25.03 | ...
    # 16    | F | train-clean-360  | 25.11 | ...
    # 17    | M | train-clean-360  | 25.04 | ...
    spk2gender = {}
    for speaker_line in (corpus_dir / "SPEAKERS.txt").read_text().splitlines():
        if speaker_line.startswith(";"):
            continue
        speaker, sex, *_ = speaker_line.split("|")
        spk2gender[speaker.strip()] = sex.strip()

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir, prefix="libritts"):
            logging.info(f"LibriTTS subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path, "*.wav", num_jobs=num_jobs)
        segments = []
        for trans_path in tqdm(
            part_path.rglob("*.trans.tsv"),
            desc="Scanning transcript files (progbar per speaker)",
            leave=False,
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content (tab-separated: id, original text, normalized text):
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief.    Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # The sibling book.tsv carries additional metadata; the first
            # whitespace-separated field is the utterance id and the last one
            # is read as the SNR.
            book_path = trans_path.parent / trans_path.name.replace(
                ".trans.tsv", ".book.tsv"
            )
            utt2snr = {}
            for book_line in book_path.read_text().splitlines():
                utt_id, *_, snr = book_line.split()
                utt2snr[utt_id] = float(snr)

            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split("\t")
                spk_id = rec_id.split("_")[0]
                segment = SupervisionSegment(
                    id=rec_id,
                    recording_id=rec_id,
                    start=0.0,
                    duration=recordings[rec_id].duration,
                    channel=0,
                    text=norm_text,
                    language="English",
                    speaker=spk_id,
                    gender=spk2gender[spk_id],
                    custom={"orig_text": orig_text, "snr": utt2snr[rec_id]},
                )
                segments.append(segment)

        supervisions = SupervisionSet.from_segments(segments)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_json(output_dir / f"libritts_supervisions_{part}.json")
            recordings.to_json(output_dir / f"libritts_recordings_{part}.json")

        manifests[part] = {"recordings": recordings, "supervisions": supervisions}

    return manifests
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
    link_previous_utt: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default (``"auto"``) all parts listed in ``LIBRITTS`` are used.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :param link_previous_utt: If true adds previous utterance id to supervisions
        (stored as ``custom["prev_utt"]``). Useful for reconstructing chains of
        utterances as they were read. If previous utterance was skipped from
        LibriTTS datasets previous_utt label is None.
    :return: a Dict whose key is the dataset part, and the value is a Dict with
        the keys 'recordings' and 'supervisions'. Each supervision carries
        ``custom["orig_text"]`` and ``custom["snr"]``; the latter is ``None``
        for utterances that have no entry in the book.tsv file.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir, prefix="libritts"
        )

    # Contents of the file
    #   ;ID   |SEX| SUBSET           |MINUTES| NAME
    #   14    | F | train-clean-360  | 25.03 | ...
    #   16    | F | train-clean-360  | 25.11 | ...
    #   17    | M | train-clean-360  | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (
            line.split("|")
            for line in (corpus_dir / "SPEAKERS.txt").read_text().splitlines()
            if not line.startswith(";")
        )
    }

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir, prefix="libritts"):
            logging.info(f"LibriTTS subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path, "*.wav", num_jobs=num_jobs)
        supervisions = []
        for trans_path in tqdm(
            part_path.rglob("*.trans.tsv"),
            desc="Scanning transcript files (progbar per speaker)",
            leave=False,
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief.    Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata
            utt2snr = [
                (rec_id, float(snr))
                for rec_id, *_, snr in map(
                    str.split,
                    (
                        trans_path.parent
                        / trans_path.name.replace(".trans.tsv", ".book.tsv")
                    )
                    .read_text()
                    .splitlines(),
                )
            ]
            # keeps the order of uttids as they appear in book.tsv
            uttids = [r for r, _ in utt2snr]
            utt2snr = dict(utt2snr)

            if link_previous_utt:
                # Map every utterance id to the id listed just before it in
                # book.tsv (the first one maps to None).
                # The keys have the structure speaker_book_x_y e.g. 1089_134691_000004_000001
                utt2prevutt = dict(zip(uttids, [None] + uttids))

            prev_rec_id = None
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split("\t")
                spk_id = rec_id.split("_")[0]
                # All recording ids should be in book.tsv, but some are
                # missing, e.g. 446_123502_000030_000003. Look the SNR up
                # tolerantly (None when absent) instead of failing the whole
                # preparation with a KeyError.
                customd = {"orig_text": orig_text, "snr": utt2snr.get(rec_id)}
                if link_previous_utt:
                    # Same tolerance for ids that are absent from book.tsv.
                    prev_utt = utt2prevutt.get(rec_id, None)
                    # previous utterance has to be present in trans.tsv - otherwise it was skipped
                    prev_utt = prev_utt if prev_utt == prev_rec_id else None
                    customd["prev_utt"] = prev_utt
                    prev_rec_id = rec_id
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language="English",
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom=customd,
                    )
                )

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_file(
                output_dir / f"libritts_supervisions_{part}.jsonl.gz"
            )
            recordings.to_file(output_dir / f"libritts_recordings_{part}.jsonl.gz")

        manifests[part] = {"recordings": recordings, "supervisions": supervisions}

    return manifests
def prepare_mobvoihotwords(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Build the Recording and Supervision manifests for MobvoiHotwords.

    For every part ('train', 'dev', 'test') the ``p_<part>.json`` and
    ``n_<part>.json`` resource files are read; entries whose audio file is
    missing are skipped with a warning.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is a Dict with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["train", "dev", "test"]
    resources_dir = corpus_dir / "mobvoi_hotword_dataset_resources"
    audio_dir = corpus_dir / "mobvoi_hotword_dataset"

    manifests = {}
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Reuse whatever was prepared by an earlier run.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir
        )

    for part in dataset_parts:
        logging.info(f"Preparing MobvoiHotwords subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(f"MobvoiHotwords subset: {part} already prepared - skipping.")
            continue

        recordings = []
        supervisions = []
        for polarity in ("p_", "n_"):
            json_path = resources_dir / f"{polarity}{part}.json"
            with open(json_path, "r", encoding="utf-8") as handle:
                entries = json.load(handle)

            for entry in entries:
                idx = entry["utt_id"]
                # Fall back to the utterance id when no speaker is given.
                speaker = entry["speaker_id"]
                if speaker is None:
                    speaker = idx
                audio_path = audio_dir / f"{idx}.wav"

                # keyword_id: 0 / 1 select a hotword, -1 is the only other
                # accepted value.
                keyword_id = entry["keyword_id"]
                if keyword_id == 0:
                    text = "HiXiaowen"
                elif keyword_id == 1:
                    text = "NihaoWenwen"
                else:
                    assert keyword_id == -1
                    text = "FREETEXT"

                if not audio_path.is_file():
                    logging.warning(f"No such file: {audio_path}")
                    continue

                recording = Recording.from_file(audio_path)
                recordings.append(recording)
                supervisions.append(
                    SupervisionSegment(
                        id=idx,
                        recording_id=idx,
                        start=0.0,
                        duration=recording.duration,
                        channel=0,
                        language="Chinese",
                        speaker=speaker,
                        text=text.strip(),
                    )
                )

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
def prepare_mobvoihotwords(
    corpus_dir: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Create the MobvoiHotwords manifests (Recordings and Supervisions).

    The 'train', 'dev' and 'test' parts are each assembled from the positive
    (``p_``) and negative (``n_``) JSON resource lists. Parts that already have
    manifests in ``output_dir`` are not prepared again.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is a Dict with
        the keys 'recordings' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["train", "dev", "test"]

    manifests = {}
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Load manifests left behind by a previous run, if any.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir
        )

    for part in dataset_parts:
        logging.info(f"Preparing MobvoiHotwords subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(f"MobvoiHotwords subset: {part} already prepared - skipping.")
            continue

        part_recordings = []
        part_segments = []
        for prefix in ["p_", "n_"]:
            listing_name = prefix + part
            listing_path = (
                corpus_dir / "mobvoi_hotword_dataset_resources" / f"{listing_name}.json"
            )
            with open(listing_path, "r", encoding="utf-8") as f:
                for item in json.load(f):
                    utt_id = item["utt_id"]
                    if item["speaker_id"] is None:
                        # No speaker label: the utterance id stands in for it.
                        speaker = utt_id
                    else:
                        speaker = item["speaker_id"]
                    wav_path = corpus_dir / "mobvoi_hotword_dataset" / f"{utt_id}.wav"

                    # Keyword ids 0 and 1 name the two hotwords; anything
                    # else must be -1 and is labelled as free text.
                    if item["keyword_id"] == 0:
                        label = "HiXiaowen"
                    elif item["keyword_id"] == 1:
                        label = "NihaoWenwen"
                    else:
                        assert item["keyword_id"] == -1
                        label = "FREETEXT"

                    if not wav_path.is_file():
                        logging.warning(f"No such file: {wav_path}")
                        continue

                    rec = Recording.from_file(wav_path)
                    part_recordings.append(rec)
                    part_segments.append(
                        SupervisionSegment(
                            id=utt_id,
                            recording_id=utt_id,
                            start=0.0,
                            duration=rec.duration,
                            channel=0,
                            language="Chinese",
                            speaker=speaker,
                            text=label.strip(),
                        )
                    )

        recording_set = RecordingSet.from_recordings(part_recordings)
        supervision_set = SupervisionSet.from_segments(part_segments)
        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests