def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = 'auto',
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'

    if dataset_parts == 'auto':
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        maybe_manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix='libritts'
        )
        if maybe_manifests is not None:
            return maybe_manifests

    # Contents of the file
    #   ;ID |SEX| SUBSET          |MINUTES| NAME
    #   14  | F | train-clean-360 | 25.03 | ...
    #   16  | F | train-clean-360 | 25.11 | ...
    #   17  | M | train-clean-360 | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (
            line.split('|')
            for line in (corpus_dir / 'SPEAKERS.txt').read_text().splitlines()
            if not line.startswith(';')
        )
    }

    manifests = defaultdict(dict)
    for part in tqdm(dataset_parts, desc='Preparing LibriTTS parts'):
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path, '*.wav', num_jobs=num_jobs)

        supervisions = []
        for trans_path in tqdm(
                part_path.rglob('*.trans.tsv'),
                desc='Scanning transcript files (progbar per speaker)',
                leave=False
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief. Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata
            utt2snr = {
                rec_id: float(snr)
                for rec_id, *_, snr in map(
                    str.split,
                    (
                        trans_path.parent / trans_path.name.replace('.trans.tsv', '.book.tsv')
                    ).read_text().splitlines()
                )
            }
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split('\t')
                spk_id = rec_id.split('_')[0]
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language='English',
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom={'orig_text': orig_text, 'snr': utt2snr[rec_id]}
                    )
                )

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_json(output_dir / f'libritts_supervisions_{part}.json')
            recordings.to_json(output_dir / f'libritts_recordings_{part}.json')

        manifests[part] = {
            'recordings': recordings,
            'supervisions': supervisions
        }

    return dict(manifests)  # Convert to normal dict
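

# Illustration only (not part of the recipe above): a minimal sketch of how the
# returned manifests are typically consumed. The corpus and output paths are
# placeholders - substitute your own locations.
def _example_libritts_usage() -> None:
    manifests = prepare_libritts(
        corpus_dir='/data/LibriTTS',    # placeholder path
        dataset_parts='dev-clean',
        output_dir='data/manifests',    # placeholder path
        num_jobs=4,
    )
    recordings = manifests['dev-clean']['recordings']
    supervisions = manifests['dev-clean']['supervisions']
    print(f'dev-clean: {len(recordings)} recordings, {len(supervisions)} supervisions')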
def prepare_mgb2(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    text_cleaning: bool = True,
    buck_walter: bool = False,
    num_jobs: int = 1,
    mer_thresh: int = 80,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param text_cleaning: Bool, if True, basic text cleaning is performed (similar to ESPNet recipe).
    :param buck_walter: Bool, use BuckWalter transliteration
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :param mer_thresh: int, filter out segments based on mer (Match Error Rate)
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `text_cleaning` option removes all punctuation and diacritics.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    dataset_parts = ["dev", "train", "test"]
    manifests = {}

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts,
            output_dir=output_dir,
            prefix="mgb2",
            suffix="jsonl.gz",
            lazy=True,
        )

    for part in dataset_parts:
        info(f"Processing MGB2 subset: {part}")
        if manifests_exist(
            part=part, output_dir=output_dir, prefix="mgb2", suffix="jsonl.gz"
        ):
            info(f"MGB2 subset: {part} already prepared - skipping.")
            continue

        # Read the recordings and write them into manifest. We additionally store the
        # duration of the recordings in a dict which will be used later to create the
        # supervisions.
        output_dir = Path(output_dir)
        corpus_dir = Path(corpus_dir)
        if part == "test" or part == "dev":
            (output_dir / part).mkdir(parents=True, exist_ok=True)
            copy(
                corpus_dir / part / "text.non_overlap_speech",
                output_dir / part / "text",
            )
            copy(
                corpus_dir / part / "segments.non_overlap_speech",
                output_dir / part / "segments",
            )

            with open(corpus_dir / part / "wav.scp", "r") as f_in, open(
                output_dir / part / "wav.scp", "w"
            ) as f_out:
                for line in f_in:
                    f_out.write(line.replace("wav/", f"{corpus_dir}/{part}/wav/"))
                    f_out.write("\n")

            recordings, supervisions, _ = load_kaldi_data_dir(
                (output_dir / part), 16000
            )
            if buck_walter is False:
                supervisions = supervisions.transform_text(from_buck_walter)
            if part == "test":
                assert (
                    len(supervisions) == 5365
                ), f"Expected 5365 supervisions for test, found {len(supervisions)}"
            elif part == "dev":
                assert (
                    len(supervisions) == 5002
                ), f"Expected 5002 supervisions for dev, found {len(supervisions)}"
        elif part == "train":
            recordings = RecordingSet.from_dir(
                (corpus_dir / part / "wav"), pattern="*.wav", num_jobs=num_jobs
            )

            xml_paths = check_and_rglob(
                path.join(corpus_dir, part, "xml/utf8"), "*.xml"
            )
            # Read supervisions and write them to manifest
            with recursion_limit(5000):
                supervisions_list = list(
                    chain.from_iterable(
                        [make_supervisions(p, mer_thresh) for p in xml_paths]
                    )
                )

            supervisions = SupervisionSet.from_segments(supervisions_list)

            assert (
                len(supervisions) == 375103
            ), f"Expected 375103 supervisions for train, found {len(supervisions)}"

        if text_cleaning is True:
            supervisions = supervisions.transform_text(cleaning)
        recordings, supervisions = fix_manifests(recordings, supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        # saving recordings and supervisions
        recordings.to_file((output_dir / f"mgb2_recordings_{part}.jsonl.gz"))
        supervisions.to_file((output_dir / f"mgb2_supervisions_{part}.jsonl.gz"))

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
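

# Illustration only (not part of the recipe above): a minimal sketch of calling the
# MGB2 preparation. Note that output_dir is mandatory here because the manifests are
# written to disk while processing. The paths below are placeholders.
def _example_mgb2_usage() -> None:
    manifests = prepare_mgb2(
        corpus_dir="/data/mgb2",            # placeholder path
        output_dir="data/manifests/mgb2",   # placeholder path
        text_cleaning=True,
        buck_walter=False,
        num_jobs=8,
        mer_thresh=80,
    )
    # Peek at the first dev supervision (works for both eager and lazy manifests).
    first = next(iter(manifests["dev"]["supervisions"]))
    print(first.id, first.text)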
def prepare_bvcc(
    corpus_dir: Pathlike,
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir containing the prepared
        ``phase1-main`` and ``phase1-ood`` track directories.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :return: a Dict whose keys are the dataset parts ('main1_train', 'main1_dev',
        'ood1_train', 'ood1_dev', 'ood1_unlabeled'), and whose values are Dicts with
        the keys 'recordings' and (except for 'ood1_unlabeled') 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)

    phase1_main = (corpus_dir / "phase1-main").resolve()
    assert phase1_main.exists(), f"Main track dir is missing {phase1_main}"

    main1_sets = phase1_main / "DATA" / "sets"
    main1_wav = phase1_main / "DATA" / "wav"
    assert (
        main1_sets.exists() and main1_wav.exists()
    ), f"Have you run data preparation in {phase1_main}?"
    main1_devp = main1_sets / "DEVSET"
    assert main1_devp.exists(), main1_devp
    main1_trainp = main1_sets / "TRAINSET"
    assert main1_trainp.exists(), main1_trainp

    phase1_ood = (corpus_dir / "phase1-ood").resolve()
    assert phase1_ood.exists(), f"Out of domain track dir is missing {phase1_ood}"
    ood1_sets = phase1_ood / "DATA" / "sets"
    ood1_wav = phase1_ood / "DATA" / "wav"
    assert (
        ood1_sets.exists() and ood1_wav.exists()
    ), f"Have you run data preparation in {phase1_ood}?"
    ood1_unlabeled = ood1_sets / "unlabeled_mos_list.txt"
    assert ood1_unlabeled.exists(), ood1_unlabeled
    ood1_devp = ood1_sets / "DEVSET"
    assert ood1_devp.exists(), ood1_devp
    ood1_trainp = ood1_sets / "TRAINSET"
    assert ood1_trainp.exists(), ood1_trainp

    manifests = {}

    # ### Main track sets
    main1_recs = RecordingSet.from_dir(main1_wav, pattern="*.wav", num_jobs=num_jobs)

    logging.info("Preparing main1_dev")
    main1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(main1_devp).readlines()),
            main1_recs,
            parse_main_line,
        )
    )
    main1_dev_recs = main1_recs.filter(lambda rec: rec.id in main1_dev_sup)
    manifests["main1_dev"] = {
        "recordings": main1_dev_recs,
        "supervisions": main1_dev_sup,
    }

    logging.info("Preparing main1_train")
    main1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(main1_trainp).readlines()),
            main1_recs,
            parse_main_line,
        )
    )
    main1_train_recs = main1_recs.filter(lambda rec: rec.id in main1_train_sup)
    manifests["main1_train"] = {
        "recordings": main1_train_recs,
        "supervisions": main1_train_sup,
    }

    # ### Out of Domain (OOD) track sets
    unlabeled_wavpaths = [
        ood1_wav / name.strip() for name in open(ood1_unlabeled).readlines()
    ]
    manifests["ood1_unlabeled"] = {
        "recordings": RecordingSet.from_recordings(
            Recording.from_file(p) for p in unlabeled_wavpaths
        )
    }

    ood1_recs = RecordingSet.from_dir(ood1_wav, pattern="*.wav", num_jobs=num_jobs)

    logging.info("Preparing ood1_dev")
    ood1_dev_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(ood1_devp).readlines()),
            ood1_recs,
            parse_ood_line,
        )
    )
    ood1_dev_recs = ood1_recs.filter(lambda rec: rec.id in ood1_dev_sup)
    manifests["ood1_dev"] = {
        "recordings": ood1_dev_recs,
        "supervisions": ood1_dev_sup,
    }

    logging.info("Preparing ood1_train")
    ood1_train_sup = SupervisionSet.from_segments(
        gen_supervision_per_utt(
            sorted(open(ood1_trainp).readlines()),
            ood1_recs,
            parse_ood_line,
        )
    )
    ood1_train_recs = ood1_recs.filter(lambda rec: rec.id in ood1_train_sup)
    manifests["ood1_train"] = {
        "recordings": ood1_train_recs,
        "supervisions": ood1_train_sup,
    }

    # Optionally serializing to disc
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        for part, d in manifests.items():
            d["recordings"].to_file(output_dir / f"recordings_{part}.jsonl.gz")
            if "supervisions" in d:
                d["supervisions"].to_file(output_dir / f"supervisions_{part}.jsonl.gz")

    return manifests
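

# Illustration only (not part of the recipe above): a minimal sketch of preparing
# BVCC and inspecting the resulting parts. The corpus and output paths are
# placeholders; note that "ood1_unlabeled" carries recordings only (no supervisions).
def _example_bvcc_usage() -> None:
    manifests = prepare_bvcc(
        corpus_dir="/data/bvcc",            # placeholder path
        output_dir="data/manifests/bvcc",   # placeholder path
        num_jobs=4,
    )
    for part, data in manifests.items():
        n_sups = len(data["supervisions"]) if "supervisions" in data else 0
        print(f"{part}: {len(data['recordings'])} recordings, {n_sups} supervisions")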
def prepare_libritts(
    corpus_dir: Pathlike,
    dataset_parts: Union[str, Sequence[str]] = "auto",
    output_dir: Optional[Pathlike] = None,
    num_jobs: int = 1,
    link_previous_utt: bool = False,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: string or sequence of strings representing dataset part names,
        e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
        By default we will infer which parts are available in ``corpus_dir``.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: the number of parallel workers parsing the data.
    :param link_previous_utt: If true, adds the previous utterance id to the supervisions.
        Useful for reconstructing chains of utterances as they were read.
        If the previous utterance was skipped from the LibriTTS datasets, the 'prev_utt' label is None.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"

    if dataset_parts == "auto":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        assert dataset_parts in LIBRITTS
        dataset_parts = [dataset_parts]

    manifests = {}
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Maybe the manifests already exist: we can read them and save a bit of preparation time.
        manifests = read_manifests_if_cached(
            dataset_parts=dataset_parts, output_dir=output_dir, prefix="libritts"
        )

    # Contents of the file
    #   ;ID |SEX| SUBSET          |MINUTES| NAME
    #   14  | F | train-clean-360 | 25.03 | ...
    #   16  | F | train-clean-360 | 25.11 | ...
    #   17  | M | train-clean-360 | 25.04 | ...
    spk2gender = {
        spk_id.strip(): gender.strip()
        for spk_id, gender, *_ in (
            line.split("|")
            for line in (corpus_dir / "SPEAKERS.txt").read_text().splitlines()
            if not line.startswith(";")
        )
    }

    for part in tqdm(dataset_parts, desc="Preparing LibriTTS parts"):
        if manifests_exist(part=part, output_dir=output_dir, prefix="libritts"):
            logging.info(f"LibriTTS subset: {part} already prepared - skipping.")
            continue
        part_path = corpus_dir / part
        recordings = RecordingSet.from_dir(part_path, "*.wav", num_jobs=num_jobs)

        supervisions = []
        for trans_path in tqdm(
            part_path.rglob("*.trans.tsv"),
            desc="Scanning transcript files (progbar per speaker)",
            leave=False,
        ):
            # The trans.tsv files contain only the recordings that were kept for LibriTTS.
            # Example path to a file:
            #   /export/corpora5/LibriTTS/dev-clean/84/121123/84_121123.trans.tsv
            #
            # Example content:
            #   84_121123_000007_000001 Maximilian.     Maximilian.
            #   84_121123_000008_000000 Villefort rose, half ashamed of being surprised in such a paroxysm of grief. Villefort rose, half ashamed of being surprised in such a paroxysm of grief.

            # book.tsv contains additional metadata
            utt2snr = [
                (rec_id, float(snr))
                for rec_id, *_, snr in map(
                    str.split,
                    (
                        trans_path.parent
                        / trans_path.name.replace(".trans.tsv", ".book.tsv")
                    ).read_text().splitlines(),
                )
            ]
            # Keep the order of uttids as they appear in book.tsv.
            uttids = [r for r, _ in utt2snr]
            utt2snr = dict(utt2snr)

            if link_previous_utt:
                # Using the property of sorted keys to find the previous utterance.
                # The keys have the structure speaker_book_x_y, e.g. 1089_134691_000004_000001
                utt2prevutt = dict(zip(uttids + [None], [None] + uttids))

            prev_rec_id = None
            for line in trans_path.read_text().splitlines():
                rec_id, orig_text, norm_text = line.split("\t")
                spk_id = rec_id.split("_")[0]
                customd = {"orig_text": orig_text, "snr": utt2snr[rec_id]}
                if link_previous_utt:
                    # All recording ids should be in the book.tsv,
                    # but some are missing, e.g. 446_123502_000030_000003.
                    prev_utt = utt2prevutt.get(rec_id, None)
                    # The previous utterance has to be present in trans.tsv - otherwise it was skipped.
                    prev_utt = prev_utt if prev_utt == prev_rec_id else None
                    customd["prev_utt"] = prev_utt
                    prev_rec_id = rec_id
                supervisions.append(
                    SupervisionSegment(
                        id=rec_id,
                        recording_id=rec_id,
                        start=0.0,
                        duration=recordings[rec_id].duration,
                        channel=0,
                        text=norm_text,
                        language="English",
                        speaker=spk_id,
                        gender=spk2gender[spk_id],
                        custom=customd,
                    )
                )

        supervisions = SupervisionSet.from_segments(supervisions)
        validate_recordings_and_supervisions(recordings, supervisions)

        if output_dir is not None:
            supervisions.to_file(output_dir / f"libritts_supervisions_{part}.jsonl.gz")
            recordings.to_file(output_dir / f"libritts_recordings_{part}.jsonl.gz")

        manifests[part] = {
            "recordings": recordings,
            "supervisions": supervisions,
        }

    return manifests
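

# Illustration only (not part of the recipe above): a minimal sketch of using
# link_previous_utt=True and reading the "prev_utt" field stored in the custom
# dict of each supervision. The paths below are placeholders.
def _example_libritts_prev_utt_usage() -> None:
    manifests = prepare_libritts(
        corpus_dir="/data/LibriTTS",    # placeholder path
        dataset_parts="dev-clean",
        output_dir="data/manifests",    # placeholder path
        num_jobs=4,
        link_previous_utt=True,
    )
    for sup in manifests["dev-clean"]["supervisions"]:
        # prev_utt is None when the preceding utterance was not kept in LibriTTS.
        prev_utt = sup.custom.get("prev_utt")
        if prev_utt is not None:
            print(f"{sup.id} follows {prev_utt}")
            break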