def download_vctk(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = CREST_VCTK_URL
) -> None:
    """
    Fetch the VCTK archive and unpack it inside ``target_dir``.

    :param target_dir: Pathlike, directory that receives the archive and the extracted data.
    :param force_download: Bool, when True the archive is downloaded even if it is already present.
    :param url: str, location of the tarred/zipped VCTK corpus.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # The local archive is named after the last component of the URL.
    archive_name = url.split('/')[-1]
    archive_path = target_dir / archive_name
    archive_missing = not archive_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(url, filename=archive_path, desc=f'Downloading {archive_name}')

    # The extraction directory is the archive name without its extension.
    stem = archive_name.replace('.zip', '').replace('.tar.gz', '')
    part_dir = target_dir / stem
    completed_detector = part_dir / '.completed'
    if completed_detector.is_file():
        # A previous run finished unpacking; nothing left to do.
        return

    # Remove leftovers of an interrupted extraction before unpacking again.
    shutil.rmtree(part_dir, ignore_errors=True)
    if archive_name.endswith('.zip'):
        opener = zipfile.ZipFile
    else:
        opener = tarfile.open
    with opener(archive_path) as archive:
        archive.extractall(path=target_dir)
    completed_detector.touch()
def download_ali_meeting(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[
        str
    ] = "https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/",
) -> Path:
    """
    Download and untar the AliMeeting dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the base url of the storage that hosts the ``AliMeeting/openlr`` archives;
        a trailing slash is accepted.
    :return: the path to downloaded and extracted directory with data.
    """
    # The default base URL already ends with "/"; strip it so that the joined
    # URL does not contain an empty path segment ("//AliMeeting/...").
    url = f"{base_url.rstrip('/')}/AliMeeting/openlr"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_tar_names = [
        "Train_Ali_far.tar.gz",
        "Train_Ali_near.tar.gz",
        "Eval_Ali.tar.gz",
        "Test_Ali.tar.gz",
    ]
    for tar_name in dataset_tar_names:
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(
                f"{url}/{tar_name}", filename=tar_path, desc=f"Downloading {tar_name}"
            )
        # There is no completion marker for this corpus: every archive is
        # unpacked on each call.
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
    return target_dir
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        base_url: Optional[str] = 'http://www.openslr.org/resources'
) -> None:
    """
    Fetch the data and resource archives from ``{base_url}/33`` and unpack them.

    :param target_dir: Pathlike, directory that receives the archives and the extracted data.
    :param force_download: Bool, when True the archives are downloaded even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    url = f'{base_url}/33'
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Both archives are unpacked below the same corpus directory.
    corpus_dir = target_dir / 'aishell'
    for tar_name in ('data_aishell.tgz', 'resource_aishell.tgz'):
        tar_path = target_dir / tar_name
        archive_missing = not tar_path.is_file()
        if force_download or archive_missing:
            urlretrieve_progress(f'{url}/{tar_name}', filename=tar_path, desc=f'Downloading {tar_name}')

        # Archive name without the 4-character ".tgz" extension.
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / '.completed'
        if completed_detector.is_file():
            continue

        # Clear a possibly partial extraction, then unpack from scratch.
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=corpus_dir)
        completed_detector.touch()
def download_cmu_indic(
    target_dir: Pathlike = ".",
    speakers: Sequence[str] = SPEAKERS,
    force_download: Optional[bool] = False,
    base_url: Optional[str] = BASE_URL,
) -> None:
    """
    Fetch and unpack the CMU Indic archives, one per speaker.

    :param target_dir: Pathlike, directory that receives the archives and the extracted data.
    :param speakers: a list of speakers to download. By default, downloads all.
    :param force_download: Bool, when True an archive is downloaded even if it already exists.
        Speakers whose ``.completed`` marker exists are skipped regardless of this flag.
    :param base_url: str, prefix that is concatenated directly with the archive name.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for spk in tqdm(speakers, desc="Downloading/unpacking CMU Indic speakers"):
        name = f"cmu_indic_{spk}"
        part_dir = target_dir / name
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            logging.info(f"Skiping {spk} because {completed_detector} exists.")
            continue

        tar_name = f"{name}.tar.bz2"
        tar_path = target_dir / tar_name
        archive_missing = not tar_path.is_file()
        if force_download or archive_missing:
            full_url = f"{base_url}{tar_name}"
            urlretrieve_progress(full_url, filename=tar_path, desc=f"Downloading {tar_name}")

        # Clear a possibly partial extraction, then unpack from scratch.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
def download_mobvoihotwords(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> None:
    """
    Fetch the dataset and resource archives from ``{base_url}/87`` and unpack them.

    :param target_dir: Pathlike, directory that receives the archives and the extracted data.
    :param force_download: Bool, when True an archive is downloaded even if it already exists.
        Archives whose ``.completed`` marker exists are skipped regardless of this flag.
    :param base_url: str, the url of the OpenSLR resources.
    """
    url = f"{base_url}/87"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Both archives are unpacked below the same corpus directory.
    corpus_dir = target_dir / "MobvoiHotwords"
    archive_names = (
        "mobvoi_hotword_dataset.tgz",
        "mobvoi_hotword_dataset_resources.tgz",
    )
    for tar_name in archive_names:
        # Archive name without the 4-character ".tgz" extension.
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skip {tar_name} because {completed_detector} exists.")
            continue

        tar_path = target_dir / tar_name
        archive_missing = not tar_path.is_file()
        if force_download or archive_missing:
            urlretrieve_progress(f"{url}/{tar_name}", filename=tar_path, desc=f"Downloading {tar_name}")

        # Clear a possibly partial extraction, then unpack from scratch.
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=corpus_dir)
        completed_detector.touch()
def download_vctk(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = CREST_VCTK_URL,
) -> Path:
    """
    Fetch the VCTK archive and unpack it inside ``target_dir``.

    :param target_dir: Pathlike, directory that receives the archive and the extracted data.
    :param force_download: Bool, when True the archive is downloaded even if it already exists.
        If the ``.completed`` marker exists the function returns early regardless of this flag.
    :param url: str, location of the tarred/zipped VCTK corpus.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # The local archive is named after the last component of the URL, and the
    # extraction directory is that name without its extension.
    archive_name = url.split("/")[-1]
    stem = archive_name.replace(".zip", "").replace(".tar.gz", "")
    part_dir = target_dir / stem
    completed_detector = part_dir / ".completed"
    if completed_detector.is_file():
        logging.info(f"Skipping {archive_name} because {completed_detector} exists.")
        return part_dir

    archive_path = target_dir / archive_name
    archive_missing = not archive_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(
            url, filename=archive_path, desc=f"Downloading {archive_name}"
        )

    # Clear a possibly partial extraction, then unpack from scratch.
    shutil.rmtree(part_dir, ignore_errors=True)
    if archive_name.endswith(".zip"):
        opener = zipfile.ZipFile
    else:
        opener = tarfile.open
    with opener(archive_path) as archive:
        archive.extractall(path=target_dir)
    completed_detector.touch()
    return part_dir
def download_adept(
    target_dir: Pathlike = ".",
    force_download: bool = False,
) -> Path:
    """
    Fetch the ADEPT zip archive and unpack it into ``target_dir/ADEPT``.

    :param target_dir: Pathlike, directory that receives the archive and the extracted data.
    :param force_download: Bool, when True the archive is downloaded even if it already exists.
        If the ``.completed`` marker exists the function returns early regardless of this flag.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    corpus_dir = target_dir / "ADEPT"
    completed_detector = corpus_dir / ".completed"
    already_done = completed_detector.is_file()
    if already_done:
        logging.info(
            f"Skipping downloading ADEPT because {completed_detector} exists.")
        return corpus_dir

    # Download the archive unless a local copy can be reused.
    zip_name = "ADEPT.zip"
    zip_path = target_dir / zip_name
    archive_missing = not zip_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(ADEPT_URL, filename=zip_path, desc=f"Downloading {zip_name}")

    # Start from a clean directory so no partially unpacked files survive.
    shutil.rmtree(corpus_dir, ignore_errors=True)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=corpus_dir)
    completed_detector.touch()
    return corpus_dir
def download_aishell4(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Fetch the four archives found under ``{base_url}/111`` and unpack them.

    :param target_dir: Pathlike, directory that receives the archives and the extracted data.
    :param force_download: Bool, when True the archives are downloaded even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    url = f"{base_url}/111"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    archive_names = (
        "train_L.tar.gz",
        "train_M.tar.gz",
        "train_S.tar.gz",
        "test.tar.gz",
    )
    for tar_name in archive_names:
        tar_path = target_dir / tar_name
        archive_missing = not tar_path.is_file()
        if force_download or archive_missing:
            urlretrieve_progress(f"{url}/{tar_name}", filename=tar_path, desc=f"Downloading {tar_name}")
        # No completion marker is kept here: each archive is unpacked on every call.
        with tarfile.open(tar_path) as archive:
            archive.extractall(path=target_dir)
    return target_dir
def download_timit(
    target_dir: Pathlike = ".",
    force_download: bool = False,
    base_url: Optional[str] = "https://data.deepai.org/timit.zip",
) -> None:
    """
    Download and unzip the dataset TIMIT.

    The archive is unpacked into ``target_dir/timit``. After a successful
    extraction a ``.completed`` marker is created in that directory, and later
    calls return early when the marker is present.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the zip even if it already exists.
        It has no effect once the ``.completed`` marker exists.
    :param base_url: str, the URL of the TIMIT dataset to download.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_name = "timit.zip"
    zip_path = target_dir / zip_name
    # "timit.zip" -> "timit": the directory that receives the archive content.
    corpus_dir = zip_path.with_suffix("")
    completed_detector = corpus_dir / ".completed"
    if completed_detector.is_file():
        logging.info(
            f"Skipping {zip_name} because {completed_detector} exists.")
        return
    if force_download or not zip_path.is_file():
        urlretrieve_progress(base_url, filename=zip_path, desc=f"Downloading {zip_name}")
    with zipfile.ZipFile(zip_path) as zip_file:
        corpus_dir.mkdir(parents=True, exist_ok=True)
        zip_file.extractall(path=str(corpus_dir))
    # Record success; without this marker the early return above could never fire.
    completed_detector.touch()
def extract(
        recording_manifest: Pathlike,
        output_dir: Pathlike,
        feature_manifest: Optional[Pathlike],
        storage_type: str,
        lilcom_tick_power: int,
        root_dir: Optional[Pathlike],
        num_jobs: int
):
    """
    Extract features for recordings in a given AUDIO_MANIFEST.
    The features are stored in OUTPUT_DIR, with one file per recording (or segment).
    """
    recordings: RecordingSet = RecordingSet.from_json(recording_manifest)
    if root_dir is not None:
        recordings = recordings.with_path_prefix(root_dir)

    # Without an extractor config, fall back to a default-constructed Fbank.
    if feature_manifest is None:
        feature_extractor = Fbank()
    else:
        feature_extractor = FeatureExtractor.from_yaml(feature_manifest)

    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    # Storage types with "hdf5" in their name get a file path; all others get a plain "storage" path.
    if 'hdf5' in storage_type:
        storage_path = output_dir / 'feats.h5'
    else:
        storage_path = output_dir / 'storage'

    writer_type = get_writer(storage_type)
    with writer_type(storage_path, tick_power=lilcom_tick_power) as storage:
        builder = FeatureSetBuilder(
            feature_extractor=feature_extractor,
            storage=storage,
        )
        builder.process_and_store_recordings(
            recordings=recordings,
            output_manifest=output_dir / 'feature_manifest.json.gz',
            num_jobs=num_jobs
        )
def download_rir_noise(
    target_dir: Pathlike = ".",
    url: Optional[str] = RIR_NOISE_ZIP_URL,
    force_download: Optional[bool] = False,
) -> Path:
    """
    Fetch the RIR Noise zip archive and unpack it inside ``target_dir``.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url that downloads file called "rirs_noises.zip".
    :param force_download: bool, if True, download the archive even if it already exists.
    :return: the path ``target_dir/RIRS_NOISES``.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    zip_name = "rirs_noises.zip"
    zip_path = target_dir / zip_name
    if force_download or not zip_path.exists():
        urlretrieve_progress(url, zip_path, desc=f"Downloading {zip_name}")
        logging.info(f"Downloaded {zip_name}.")
    else:
        logging.info(f"Skipping {zip_name} because file exists.")

    zip_dir = target_dir / "RIRS_NOISES"
    if zip_dir.exists():
        # An existing directory is treated as already unpacked.
        return zip_dir

    logging.info(f"Unzipping {zip_name}.")
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(target_dir)
    return zip_dir
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike):
    """Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR. """
    output_dir = Path(output_dir)
    manifest = Path(manifest)
    loaded = load_manifest(manifest)
    pieces = split_manifest(manifest=loaded, num_splits=num_splits)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Output files are numbered from 1: "<stem>.1.json", "<stem>.2.json", ...
    for number, piece in enumerate(pieces, start=1):
        piece.to_json(output_dir / f'{manifest.stem}.{number}.json')
def prepare_norm_cn(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    num_jobs: int = 15,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare recording and supervision manifests for the corpus in ``corpus_dir``.

    For each of the parts "dev", "test" and "train" the function reads
    ``<part>/text.txt`` (one whitespace-separated ``<id> <transcript...>`` entry
    per line) and ``<part>/wav.scp`` (one entry per line), and passes every
    ``wav.scp`` entry together with the transcript lookup to ``process_file``.

    :param corpus_dir: Pathlike, the path of the corpus directory; must exist.
    :param output_dir: Pathlike, directory where the manifests are written as
        ``supervisions_<part>.json`` and ``recordings_<part>.json``.
        When None, nothing is written to disk.
    :param num_jobs: int, number of worker threads used to process the entries.
    :return: a dict keyed by part name, each value being a dict with the keys
        "recordings" and "supervisions".
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f"No such directory: {corpus_dir}"
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ["dev", "test", "train"]
    for part in dataset_parts:
        # Build the id -> transcript lookup for this part.
        transcript_path = corpus_dir / f"{part}/text.txt"
        transcript_dict = {}
        with open(transcript_path, "r", encoding="utf-8") as f:
            for line in f:
                idx_transcript = line.split()
                if len(idx_transcript) < 2:
                    # A line needs at least an id and one transcript token.
                    logging.info(f"get transcript err: {line}")
                    continue
                transcript_dict[idx_transcript[0]] = " ".join(idx_transcript[1:])

        file_path = corpus_dir / f"{part}/wav.scp"
        with open(file_path, "r", encoding="utf-8") as f:
            file_paths = [line.strip() for line in f]

        recordings = []
        supervisions = []
        with ThreadPoolExecutor(num_jobs) as ex:
            for recording, supervision in tqdm(
                ex.map(
                    process_file,
                    file_paths,
                    repeat(transcript_dict),
                ),
                desc="Processing NormcnSpeech JSON entries",
                leave=False,
            ):
                # process_file signals an unusable entry with a None recording.
                if recording is not None:
                    recordings.append(recording)
                    supervisions.append(supervision)

        supervision_set = SupervisionSet.from_segments(supervisions)
        recording_set = RecordingSet.from_recordings(recordings)
        if output_dir is not None:
            supervision_set.to_json(output_dir / f"supervisions_{part}.json")
            recording_set.to_json(output_dir / f"recordings_{part}.json")
        manifests[part] = {"recordings": recording_set, "supervisions": supervision_set}
    return manifests
def convert_kaldi(data_dir: Pathlike, sampling_rate: int, manifest_dir: Pathlike):
    """
    Convert a Kaldi data dir DATA_DIR into a directory MANIFEST_DIR of lhotse manifests. Ignores feats.scp.
    The SAMPLING_RATE has to be explicitly specified as it is not available to read from DATA_DIR.
    """
    loaded = load_kaldi_data_dir(path=data_dir, sampling_rate=sampling_rate)
    recording_set, maybe_supervision_set = loaded

    manifest_dir = Path(manifest_dir)
    manifest_dir.mkdir(parents=True, exist_ok=True)

    recording_set.to_json(manifest_dir / 'audio.json')
    # The supervision manifest is only written when the loader returned one.
    if maybe_supervision_set is None:
        return
    maybe_supervision_set.to_json(manifest_dir / 'supervision.json')
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike, shuffle: bool):
    """
    Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR.
    """
    output_dir = Path(output_dir)
    manifest = Path(manifest)
    # All extensions of the input file, e.g. ".json.gz"; empty when there are none.
    suffix = ''.join(manifest.suffixes)
    # File name with every extension removed, e.g. "cuts" for "cuts.json.gz".
    # Only the file NAME is used: joining output_dir with the whole manifest path
    # would write outside OUTPUT_DIR (or discard it for absolute paths).
    base_name = manifest.name[:len(manifest.name) - len(suffix)]
    any_set = load_manifest(manifest)
    parts = any_set.split(num_splits=num_splits, shuffle=shuffle)
    output_dir.mkdir(parents=True, exist_ok=True)
    for idx, part in enumerate(parts):
        # `suffix` already starts with a dot, so no extra "." is inserted before it.
        part.to_json(output_dir / f'{base_name}.{idx + 1}{suffix}')
def download_earnings21(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = _DEFAULT_URL,
) -> Path:
    """Fetch the repository archive and keep only its ``earnings21`` part.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are saved to target_dir/earnings21/
        Please note that the github repository contains other additional datasets and
        using this call, you will be downloading all of them and then throwing them out.
    :param force_download: Bool, if True, download the archive no matter whether it exists or not.
        If the completion marker exists the function returns early regardless of this flag.
    :param url: str, the url to download the dataset.
    :return: the path to downloaded and extracted directory with data.
    """
    logging.info(
        "Downloading Earnings21 from github repository is not very efficient way"
        " how to obtain the corpus. You will be downloading other data as well."
    )

    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    extracted_dir = target_dir / "earnings21"
    completed_detector = extracted_dir / ".lhotse-download.completed"
    if completed_detector.is_file():
        logging.info(f"Skipping - {completed_detector} exists.")
        return extracted_dir

    zip_path = target_dir / "speech-datasets-main.zip"
    archive_missing = not zip_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(url, filename=zip_path, desc="Getting speech-datasets-main.zip")

    # Clear a possibly partial extraction before unpacking again.
    shutil.rmtree(extracted_dir, ignore_errors=True)

    # Only archive members whose name mentions "earnings21" are unpacked.
    with zipfile.ZipFile(zip_path) as archive:
        wanted = [member for member in archive.namelist() if "earnings21" in member]
        for member in wanted:
            archive.extract(member, path=target_dir)

    # Lift the corpus out of the repository folder, then drop that folder.
    repo_dir = target_dir / "speech-datasets-main"
    shutil.move(repo_dir / "earnings21", target_dir)
    shutil.rmtree(repo_dir)
    completed_detector.touch()
    return extracted_dir
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'http://www.openslr.org/resources/39'
) -> None:
    """
    Download ``LDC2006S37.tar.gz`` from ``url`` and unpack it into ``target_dir``.

    After a successful extraction a ``.completed`` marker is created directly
    in ``target_dir``; later calls skip the extraction when the marker exists,
    unless ``force_download`` is set.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download and unpack the tar even if
        the tar and the completion marker already exist.
    :param url: str, the url of the directory that contains the tar.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'LDC2006S37.tar.gz'
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urllib.request.urlretrieve(f'{url}/{tar_name}', filename=tar_path)
    completed_detector = target_dir / '.completed'
    # The marker used to be written but never consulted, so the archive was
    # unpacked again on every call. A forced download still re-extracts.
    if force_download or not completed_detector.is_file():
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: bool = False,
        url: str = SWBD_TEXT_URL
) -> Path:
    """
    Download the Switchboard word alignments tar and unpack it into ``target_dir``.

    :param target_dir: Pathlike, the path of the dir to store the data.
    :param force_download: bool, if True, download and unpack the tar even if
        the tar or the extracted directory already exist.
    :param url: str, the url of the tar.
    :return: the path ``target_dir/swb_ms98_transcriptions``.
    """
    target_dir = Path(target_dir)
    transcript_dir = target_dir / 'swb_ms98_transcriptions'
    # Reuse an existing extraction unless a fresh download was requested;
    # previously the flag was ignored as soon as the directory existed.
    if transcript_dir.is_dir() and not force_download:
        return transcript_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'switchboard_word_alignments.tar.gz'
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urllib.request.urlretrieve(url, filename=tar_path)
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    return transcript_dir
def import_(data_dir: Pathlike, sampling_rate: int, manifest_dir: Pathlike, frame_shift: Seconds):
    """
    Convert a Kaldi data dir DATA_DIR into a directory MANIFEST_DIR of lhotse manifests. Ignores feats.scp.
    The SAMPLING_RATE has to be explicitly specified as it is not available to read from DATA_DIR.
    """
    loaded = load_kaldi_data_dir(
        path=data_dir,
        sampling_rate=sampling_rate,
        frame_shift=frame_shift
    )
    recording_set, maybe_supervision_set, maybe_feature_set = loaded

    manifest_dir = Path(manifest_dir)
    manifest_dir.mkdir(parents=True, exist_ok=True)

    recording_set.to_file(manifest_dir / 'recordings.jsonl.gz')
    # Supervisions and features are written only when the loader returned them.
    optional_outputs = (
        (maybe_supervision_set, 'supervisions.jsonl.gz'),
        (maybe_feature_set, 'features.jsonl.gz'),
    )
    for maybe_set, file_name in optional_outputs:
        if maybe_set is not None:
            maybe_set.to_file(manifest_dir / file_name)
def download_and_untar(target_dir: Pathlike = '.', force_download: Optional[bool] = False) -> None:
    """
    Download the TEDLIUM release 3 tar and unpack it into ``target_dir``.

    After a successful extraction a ``.completed`` marker is created inside
    ``target_dir/TEDLIUM_release-3``; later calls skip the extraction when the
    marker exists.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tar even if it already exists.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_path = target_dir / 'TEDLIUM_release-3.tgz'
    if force_download or not tar_path.is_file():
        urllib.request.urlretrieve(
            'http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz',
            filename=tar_path
        )
    # The extraction directory is the archive name WITHOUT ".tgz". Pointing it
    # at the archive file itself made the marker path "<archive>.tgz/.completed",
    # which can never be created and raised NotADirectoryError on touch().
    corpus_dir = target_dir / 'TEDLIUM_release-3'
    completed_detector = corpus_dir / '.completed'
    if not completed_detector.is_file():
        # Clear a possibly partial extraction, then unpack from scratch.
        shutil.rmtree(corpus_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False
) -> None:
    """
    Fetch the LJSpeech-1.1 archive and unpack it inside ``target_dir``.

    :param target_dir: Pathlike, directory that receives the archive and the extracted data.
    :param force_download: Bool, when True the archive is downloaded even if it already exists.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    dataset_name = 'LJSpeech-1.1'
    tar_path = target_dir / f'{dataset_name}.tar.bz2'
    archive_missing = not tar_path.is_file()
    if force_download or archive_missing:
        source_url = f'http://data.keithito.com/data/speech/{dataset_name}.tar.bz2'
        urllib.request.urlretrieve(source_url, filename=tar_path)

    corpus_dir = target_dir / dataset_name
    completed_detector = corpus_dir / '.completed'
    if completed_detector.is_file():
        # A previous run finished unpacking; nothing left to do.
        return

    # Clear a possibly partial extraction, then unpack from scratch.
    shutil.rmtree(corpus_dir, ignore_errors=True)
    with tarfile.open(tar_path) as archive:
        archive.extractall(path=target_dir)
    completed_detector.touch()
def download_callhome_metadata(
        target_dir: Pathlike = '.',
        force_download: bool = False,
        url: str = "http://www.openslr.org/resources/10/sre2000-key.tar.gz"
) -> Path:
    """
    Download the ``sre2000-key`` tar and unpack it into ``target_dir``.

    :param target_dir: Pathlike, the path of the dir to store the data.
    :param force_download: bool, if True, download and unpack the tar even if
        the tar or the extracted directory already exist.
    :param url: str, the url of the tar.
    :return: the path ``target_dir/sre2000-key``.
    """
    target_dir = Path(target_dir)
    sre_dir = target_dir / 'sre2000-key'
    # Reuse an existing extraction unless a fresh download was requested;
    # previously the flag was ignored as soon as the directory existed.
    if sre_dir.is_dir() and not force_download:
        return sre_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'sre2000-key.tar.gz'
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(url, filename=tar_path, desc=f'Downloading {tar_name}')
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    return sre_dir
def download_and_unzip(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'https://zenodo.org/record/3871592/files/MiniLibriMix.zip'
) -> None:
    """
    Fetch the MiniLibriMix zip archive and unpack it inside ``target_dir``.

    :param target_dir: Pathlike, directory that receives the archive and the extracted data.
    :param force_download: Bool, when True the archive is downloaded even if it already exists.
    :param url: str, location of the zip archive.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    zip_path = target_dir / 'MiniLibriMix.zip'
    archive_missing = not zip_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(url, filename=zip_path, desc='Downloading MiniLibriMix')

    unzipped_dir = target_dir / 'MiniLibriMix'
    completed_detector = unzipped_dir / '.completed'
    if completed_detector.is_file():
        # A previous run finished unpacking; nothing left to do.
        return

    # Clear a possibly partial extraction, then unpack from scratch.
    shutil.rmtree(unzipped_dir, ignore_errors=True)
    with ZipFile(zip_path) as archive:
        archive.extractall(path=target_dir)
    completed_detector.touch()
def download_and_untar(
        target_dir: Pathlike = ".",
        force_download: bool = False,
        url: str = SWBD_TEXT_URL
) -> Path:
    """
    Download the Switchboard word alignments tar and unpack it into ``target_dir``.

    :param target_dir: Pathlike, the path of the dir to store the data.
    :param force_download: bool, if True, download and unpack the tar even if
        the tar or the extracted directory already exist.
    :param url: str, the url of the tar.
    :return: the path ``target_dir/swb_ms98_transcriptions``.
    """
    target_dir = Path(target_dir)
    transcript_dir = target_dir / "swb_ms98_transcriptions"
    # Reuse an existing extraction unless a fresh download was requested;
    # previously the flag was ignored as soon as the directory existed.
    if transcript_dir.is_dir() and not force_download:
        return transcript_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = "switchboard_word_alignments.tar.gz"
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(url, filename=tar_path, desc=f"Downloading {tar_name}")
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    return transcript_dir
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'http://www.openslr.org/resources/31'
) -> None:
    """
    Fetch and unpack one ``<part>.tar.gz`` archive per entry of the module-level ``dataset_parts``.

    :param target_dir: Pathlike, directory that receives the archives and the extracted data.
    :param force_download: Bool, when True the archives are downloaded even if they already exist.
    :param url: str, the url of the directory that contains the archives.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    for part in dataset_parts:
        tar_name = f'{part}.tar.gz'
        tar_path = target_dir / tar_name
        archive_missing = not tar_path.is_file()
        if force_download or archive_missing:
            urllib.request.urlretrieve(f'{url}/{tar_name}', filename=tar_path)

        part_dir = target_dir / f'LibriSpeech/{part}'
        completed_detector = part_dir / '.completed'
        if completed_detector.is_file():
            continue

        # Clear a possibly partial extraction, then unpack from scratch.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as archive:
            archive.extractall(path=target_dir)
        completed_detector.touch()
def split(num_splits: int, manifest: Pathlike, output_dir: Pathlike, shuffle: bool):
    """
    Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR.
    """
    from lhotse import load_manifest

    output_dir = Path(output_dir)
    manifest = Path(manifest)
    # All extensions of the input file joined together, e.g. ".jsonl.gz".
    suffix = "".join(manifest.suffixes)
    loaded = load_manifest(manifest)
    pieces = loaded.split(num_splits=num_splits, shuffle=shuffle)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Zero-pad the part number so every output name has the same width.
    width = len(str(num_splits))
    output_stem = output_dir / manifest.stem
    for number, piece in enumerate(pieces, start=1):
        tag = str(number).zfill(width)
        piece.to_file(output_stem.with_suffix(f".{tag}{suffix}"))
def download_libritts(
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "all",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the LibriTTS dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param dataset_parts: "all", the name of a single part, or a list of part names to download.
        Names that are not in ``LIBRITTS`` are reported with a warning and skipped.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
        Parts whose ``.completed`` marker exists are skipped regardless of this flag.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if dataset_parts == "all":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        # A bare string would otherwise be iterated character by character.
        dataset_parts = [dataset_parts]

    url = f"{base_url}/60"
    for part in tqdm(dataset_parts, desc="Downloading LibriTTS parts"):
        if part not in LIBRITTS:
            logging.warning(f"Skipping invalid dataset part name: {part}")
            # Actually skip it: without this the invalid name was still downloaded.
            continue
        tar_name = f"{part}.tar.gz"
        tar_path = target_dir / tar_name
        part_dir = target_dir / f"LibriTTS/{part}"
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skipping {part} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}", filename=tar_path, desc=f"Downloading {tar_name}")
        # Clear a possibly partial extraction, then unpack from scratch.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
    return target_dir
def download_librispeech(
        target_dir: Pathlike = '.',
        dataset_parts: Optional[Union[str, Sequence[str]]] = "mini_librispeech",
        force_download: Optional[bool] = False,
        base_url: Optional[str] = 'http://www.openslr.org/resources'
) -> None:
    """
    Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech", the name of a single split,
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if dataset_parts == "librispeech":
        dataset_parts = LIBRISPEECH
    elif dataset_parts == "mini_librispeech":
        dataset_parts = MINI_LIBRISPEECH
    elif isinstance(dataset_parts, str):
        # A bare split name would otherwise be iterated character by character.
        dataset_parts = [dataset_parts]

    for part in tqdm(dataset_parts, desc='Downloading LibriSpeech parts'):
        # The split name decides which OpenSLR resource hosts the tar.
        if part in LIBRISPEECH:
            url = f'{base_url}/12'
        elif part in MINI_LIBRISPEECH:
            url = f'{base_url}/31'
        else:
            logging.warning(f'Invalid dataset part name: {part}')
            continue
        tar_name = f'{part}.tar.gz'
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f'{url}/{tar_name}', filename=tar_path, desc=f'Downloading {tar_name}')
        part_dir = target_dir / f'LibriSpeech/{part}'
        completed_detector = part_dir / '.completed'
        if not completed_detector.is_file():
            # Clear a possibly partial extraction, then unpack from scratch.
            shutil.rmtree(part_dir, ignore_errors=True)
            with tarfile.open(tar_path) as tar:
                tar.extractall(path=target_dir)
            completed_detector.touch()
def fix_(recordings: Pathlike, supervisions: Pathlike, output_dir: Pathlike):
    """
    Fix a pair of Lhotse RECORDINGS and SUPERVISIONS manifests.
    It removes supervisions without corresponding recordings and vice versa,
    trims the supervisions that exceed the recording, etc.
    Stores the output files in OUTPUT_DIR under the same names as the input
    files.
    """
    from lhotse import RecordingSet, SupervisionSet, fix_manifests

    output_dir = Path(output_dir)
    recordings = Path(recordings)
    supervisions = Path(supervisions)
    output_dir.mkdir(parents=True, exist_ok=True)

    fixed_recordings, fixed_supervisions = fix_manifests(
        recordings=RecordingSet.from_file(recordings),
        supervisions=SupervisionSet.from_file(supervisions),
    )

    # Each output keeps the file name of its input, placed in the output directory.
    fixed_recordings.to_file(output_dir / recordings.name)
    fixed_supervisions.to_file(output_dir / supervisions.name)
def download_and_untar_sph2pipe(
    target_dir: Pathlike,
    url: str,
    force_download: bool = False,
) -> Path:
    """
    Fetch the sph2pipe 2.5 source archive and unpack it inside ``target_dir``.

    :param target_dir: Pathlike, directory that receives the archive and the extracted sources.
    :param url: str, location of the tar archive.
    :param force_download: bool, when True the archive is downloaded and unpacked
        even if a previous extraction is present.
    :return: the path ``target_dir/sph2pipe-2.5``.
    """
    target_dir = Path(target_dir)
    sph2pipe_dir = target_dir / "sph2pipe-2.5"

    # A Makefile inside the directory is taken as proof of a previous extraction.
    already_unpacked = (sph2pipe_dir / "Makefile").is_file()
    if already_unpacked and not force_download:
        return sph2pipe_dir

    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = "sph2pipe-2.5.tar.gz"
    tar_path = target_dir / tar_name
    archive_missing = not tar_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(url, filename=tar_path, desc=f"Downloading {tar_name}")

    with tarfile.open(tar_path) as archive:
        archive.extractall(path=target_dir)
    return sph2pipe_dir