def download_vctk(target_dir: Pathlike = '.', force_download: Optional[bool] = False, url: Optional[str] = CREST_VCTK_URL) -> Path:
    """
    Download and untar/unzip the VCTK dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param url: str, the url of tarred/zipped VCTK corpus.
    :return: the path to the downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    archive_name = url.split('/')[-1]
    archive_path = target_dir / archive_name
    # Strip the archive suffix to obtain the extracted directory name.
    part_dir = target_dir / archive_name.replace('.zip', '').replace(
        '.tar.gz', '')
    completed_detector = part_dir / '.completed'
    if completed_detector.is_file():
        # Fully extracted on a previous run: skip both download and unpack
        # (previously force_download would re-download even in this case).
        return part_dir
    if force_download or not archive_path.is_file():
        urlretrieve_progress(url, filename=archive_path,
                             desc=f'Downloading {archive_name}')
    # Clear partial leftovers before unpacking.
    shutil.rmtree(part_dir, ignore_errors=True)
    # Choose the opener matching the archive type.
    opener = zipfile.ZipFile if archive_name.endswith(
        '.zip') else tarfile.open
    with opener(archive_path) as archive:
        archive.extractall(path=target_dir)
    completed_detector.touch()
    return part_dir
def download_vctk(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = CREST_VCTK_URL,
) -> Path:
    """
    Download and untar/unzip the VCTK dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param url: str, the url of tarred/zipped VCTK corpus.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    archive_name = url.split("/")[-1]
    archive_path = target_dir / archive_name
    # Strip the archive suffix to obtain the extracted directory name.
    stem = archive_name.replace(".zip", "").replace(".tar.gz", "")
    part_dir = target_dir / stem
    completed_detector = part_dir / ".completed"
    # Bail out early when a previous run finished the extraction.
    if completed_detector.is_file():
        logging.info(
            f"Skipping {archive_name} because {completed_detector} exists.")
        return part_dir
    needs_fetch = force_download or not archive_path.is_file()
    if needs_fetch:
        urlretrieve_progress(url, filename=archive_path,
                             desc=f"Downloading {archive_name}")
    # Remove partial leftovers, then unpack from scratch.
    shutil.rmtree(part_dir, ignore_errors=True)
    if archive_name.endswith(".zip"):
        opener = zipfile.ZipFile
    else:
        opener = tarfile.open
    with opener(archive_path) as archive:
        archive.extractall(path=target_dir)
    completed_detector.touch()
    return part_dir
def download_mobvoihotwords(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> None:
    """
    Download and untar the dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    url = f"{base_url}/87"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Both archives extract under the same corpus directory (loop-invariant,
    # hoisted out of the loop).
    corpus_dir = target_dir / "MobvoiHotwords"
    dataset_tar_name = "mobvoi_hotword_dataset.tgz"
    resources_tar_name = "mobvoi_hotword_dataset_resources.tgz"
    for tar_name in [dataset_tar_name, resources_tar_name]:
        tar_path = target_dir / tar_name
        # Strip the ".tgz" suffix to obtain the extracted directory name.
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skip {tar_name} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        # Remove partially extracted files from an interrupted previous run.
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=corpus_dir)
        completed_detector.touch()
def download_adept(
    target_dir: Pathlike = ".",
    force_download: bool = False,
) -> Path:
    """
    Download and untar the ADEPT dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    corpus_dir = target_dir / "ADEPT"
    completed_detector = corpus_dir / ".completed"
    # A marker file tells us the corpus was fully extracted before.
    if completed_detector.is_file():
        logging.info(
            f"Skipping downloading ADEPT because {completed_detector} exists.")
        return corpus_dir
    zip_name = "ADEPT.zip"
    zip_path = target_dir / zip_name
    need_fetch = force_download or not zip_path.is_file()
    if need_fetch:
        urlretrieve_progress(ADEPT_URL, filename=zip_path,
                             desc=f"Downloading {zip_name}")
    # Drop partially unpacked files, if any, then unpack from scratch.
    shutil.rmtree(corpus_dir, ignore_errors=True)
    with zipfile.ZipFile(zip_path) as zip_f:
        zip_f.extractall(path=corpus_dir)
    completed_detector.touch()
    return corpus_dir
def download_cmu_arctic(
    target_dir: Pathlike = ".",
    speakers: Sequence[str] = SPEAKERS,
    force_download: Optional[bool] = False,
    base_url: Optional[str] = BASE_URL,
) -> None:
    """
    Download and untar the CMU Arctic dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param speakers: a list of speakers to download. By default, downloads all.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of CMU Arctic download site.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    for spk in tqdm(speakers, desc="Downloading/unpacking CMU Arctic speakers"):
        name = f"cmu_us_{spk}_arctic"
        tar_name = f"{name}.tar.bz2"
        full_url = f"{base_url}{tar_name}"
        tar_path = target_dir / tar_name
        part_dir = target_dir / name
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            # Fixed typo in the log message ("Skiping" -> "Skipping").
            logging.info(
                f"Skipping {spk} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(
                full_url, filename=tar_path, desc=f"Downloading {tar_name}"
            )
        # Remove partially extracted data before unpacking.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
def download_hifitts(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the HiFi TTS dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    url = f"{base_url}/109"
    tar_name = "hi_fi_tts_v0.tar.gz"
    tar_path = target_dir / tar_name
    # Plain string literal: the old f-string had no placeholders.
    part_dir = target_dir / "hi_fi_tts_v0"
    completed_detector = part_dir / ".completed"
    if completed_detector.is_file():
        logging.info(
            f"Skipping HiFiTTS preparation because {completed_detector} exists."
        )
        return part_dir
    if force_download or not tar_path.is_file():
        urlretrieve_progress(f"{url}/{tar_name}",
                             filename=tar_path,
                             desc=f"Downloading {tar_name}")
    # Remove partially extracted data before unpacking.
    shutil.rmtree(part_dir, ignore_errors=True)
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    completed_detector.touch()
    return part_dir
def download_timit(
    target_dir: Pathlike = ".",
    force_download: bool = False,
    base_url: Optional[str] = "https://data.deepai.org/timit.zip",
) -> Path:
    """
    Download and unzip the dataset TIMIT.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the zips no matter if the zips exists.
    :param base_url: str, the URL of the TIMIT dataset to download.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_name = "timit.zip"
    zip_path = target_dir / zip_name
    corpus_dir = zip_path.with_suffix("")
    completed_detector = corpus_dir / ".completed"
    if completed_detector.is_file():
        logging.info(
            f"Skipping {zip_name} because {completed_detector} exists.")
        return corpus_dir
    if force_download or not zip_path.is_file():
        urlretrieve_progress(base_url,
                             filename=zip_path,
                             desc=f"Downloading {zip_name}")
    corpus_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path) as zip_file:
        for names in zip_file.namelist():
            zip_file.extract(names, str(corpus_dir))
    # BUGFIX: the marker was checked above but never created, so the skip
    # branch could never trigger and every call re-extracted the archive.
    completed_detector.touch()
    return corpus_dir
def download_ali_meeting(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[
        str
    ] = "https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/",
) -> Path:
    """
    Download and untar the AliMeeting dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the base url of the Aliyun OSS share hosting the data
        (not OpenSLR, despite the historical wording).
    :return: the path to downloaded and extracted directory with data.
    """
    url = f"{base_url}/AliMeeting/openlr/"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_tar_names = [
        "Train_Ali_far.tar.gz",
        "Train_Ali_near.tar.gz",
        "Eval_Ali.tar.gz",
        "Test_Ali.tar.gz",
    ]
    for tar_name in dataset_tar_names:
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(
                f"{url}/{tar_name}", filename=tar_path,
                desc=f"Downloading {tar_name}"
            )
        # NOTE(review): no ".completed" marker here, so archives are
        # re-extracted on every call; extraction overwrites existing files.
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
    return target_dir
def download_musan(
    target_dir: Pathlike = ".",
    url: Optional[str] = MUSAN_URL,
    force_download: Optional[bool] = False,
) -> Path:
    """
    Download and untar the MUSAN corpus.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url that downloads file called "musan.tar.gz".
    :param force_download: bool, if True, download the archive even if it already exists.
    :return: the path of the extracted "musan" directory.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = "musan.tar.gz"
    tar_path = target_dir / tar_name
    corpus_dir = target_dir / "musan"
    completed_detector = target_dir / ".musan_completed"
    # A previous run already finished everything - bail out early.
    if completed_detector.is_file():
        logging.info(
            f"Skipping {tar_name} because {completed_detector} exists.")
        return corpus_dir
    should_fetch = force_download or not tar_path.is_file()
    if should_fetch:
        urlretrieve_progress(url, filename=tar_path,
                             desc=f"Downloading {tar_name}")
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    completed_detector.touch()
    return corpus_dir
def download_yesno(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = _DEFAULT_URL,
):
    """Download and untar the dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are saved to target_dir/waves_yesno/*.wav
    :param force_download: Bool, if True, download the tar file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    :return: the path to the extracted "waves_yesno" directory.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    extracted_dir = target_dir / "waves_yesno"
    tar_path = target_dir / "waves_yesno.tar.gz"
    completed_detector = extracted_dir / ".completed"
    if completed_detector.is_file():
        logging.info(f"Skipping - {completed_detector} exists.")
        return extracted_dir
    if force_download or not tar_path.is_file():
        # "url" is already a string and the desc has no placeholders, so the
        # previous f-strings were unnecessary.
        urlretrieve_progress(
            url, filename=tar_path, desc="Downloading waves_yesno.tar.gz"
        )
    # Remove partially extracted data before unpacking.
    shutil.rmtree(extracted_dir, ignore_errors=True)
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    completed_detector.touch()
    return extracted_dir
def download_musan(
    target_dir: Pathlike = '.',
    url: Optional[str] = MUSAN_URL,
    force_download: Optional[bool] = False,
) -> None:
    """
    Download and untar the MUSAN corpus.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url that downloads file called "musan.tar.gz".
    :param force_download: bool, if True, download the archive even if it already exists.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'musan.tar.gz'
    tar_path = target_dir / tar_name
    needs_download = force_download or not tar_path.is_file()
    if needs_download:
        urlretrieve_progress(url, filename=tar_path,
                             desc=f'Downloading {tar_name}')
    completed_detector = target_dir / '.musan_completed'
    # Extraction finished in a previous run - nothing left to do.
    if completed_detector.is_file():
        return
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    completed_detector.touch()
def download_aishell4(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> None:
    """
    Download and untar the dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    url = f"{base_url}/111"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    dataset_tar_names = [
        "train_L.tar.gz",
        "train_M.tar.gz",
        "train_S.tar.gz",
        "test.tar.gz",
    ]
    for tar_name in dataset_tar_names:
        tar_path = target_dir / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        # NOTE(review): no ".completed" marker is used here, so every call
        # re-extracts all archives (extraction overwrites existing files).
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
def download_rir_noise(
    target_dir: Pathlike = ".",
    url: Optional[str] = RIR_NOISE_ZIP_URL,
    force_download: Optional[bool] = False,
) -> Path:
    """
    Download and untar the RIR Noise corpus.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url that downloads file called "rirs_noises.zip".
    :param force_download: bool, if True, download the archive even if it already exists.
    :return: the path of the extracted "RIRS_NOISES" directory.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_name = "rirs_noises.zip"
    zip_path = target_dir / zip_name
    have_archive = zip_path.exists() and not force_download
    if have_archive:
        logging.info(f"Skipping {zip_name} because file exists.")
    else:
        urlretrieve_progress(url, zip_path, desc=f"Downloading {zip_name}")
        logging.info(f"Downloaded {zip_name}.")
    zip_dir = target_dir / "RIRS_NOISES"
    # Only unpack when the output directory is absent.
    if not zip_dir.exists():
        logging.info(f"Unzipping {zip_name}.")
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(target_dir)
    return zip_dir
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        base_url: Optional[str] = 'http://www.openslr.org/resources') -> None:
    """
    Download and untar the dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    url = f'{base_url}/33'
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Both archives extract under the same corpus directory (loop-invariant,
    # hoisted out of the loop).
    corpus_dir = target_dir / 'aishell'
    dataset_tar_name = 'data_aishell.tgz'
    resources_tar_name = 'resource_aishell.tgz'
    for tar_name in [dataset_tar_name, resources_tar_name]:
        tar_path = target_dir / tar_name
        # Strip the '.tgz' suffix to obtain the extracted directory name.
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / '.completed'
        if completed_detector.is_file():
            # Already extracted: skip both the (potentially redundant)
            # download and the extraction.
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f'{url}/{tar_name}',
                                 filename=tar_path,
                                 desc=f'Downloading {tar_name}')
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=corpus_dir)
        completed_detector.touch()
def download_earnings21(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = _DEFAULT_URL,
) -> Path:
    """Download and untar the dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are saved to target_dir/earnings21/
        Please note that the github repository contains other additional
        datasets and using this call, you will be downloading all of them
        and then throwing them out.
    :param force_download: Bool, if True, download the tar file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    :return: the path to downloaded and extracted directory with data.
    """
    logging.info(
        "Downloading Earnings21 from github repository is not very efficient way"
        + " how to obtain the corpus. You will be downloading other data as well."
    )
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    extracted_dir = target_dir / "earnings21"
    zip_path = target_dir / "speech-datasets-main.zip"
    completed_detector = extracted_dir / ".lhotse-download.completed"
    if completed_detector.is_file():
        logging.info(f"Skipping - {completed_detector} exists.")
        return extracted_dir
    if force_download or not zip_path.is_file():
        urlretrieve_progress(url,
                             filename=zip_path,
                             desc="Getting speech-datasets-main.zip")
    shutil.rmtree(extracted_dir, ignore_errors=True)
    # Renamed the context variable from "zip" to "zf": it shadowed the
    # "zip" builtin.
    with zipfile.ZipFile(zip_path) as zf:
        for f in zf.namelist():
            # Extract only the earnings21 subtree; the repo has other datasets.
            if "earnings21" in f:
                zf.extract(f, path=target_dir)
    shutil.move(target_dir / "speech-datasets-main" / "earnings21", target_dir)
    shutil.rmtree(target_dir / "speech-datasets-main")
    completed_detector.touch()
    return extracted_dir
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'http://www.openslr.org/resources/39'
) -> None:
    """
    Download and untar the Heroico corpus (LDC2006S37).

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tar no matter if it exists.
    :param url: str, the url of the resource hosting the tarball.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Plain string literal: the old f-string had no placeholders.
    tar_name = 'LDC2006S37.tar.gz'
    tar_path = target_dir / tar_name
    completed_detector = target_dir / '.completed'
    # BUGFIX: the marker existed but extraction ran unconditionally on
    # every call; now a completed run short-circuits.
    if completed_detector.is_file():
        return
    if force_download or not tar_path.is_file():
        urlretrieve_progress(f'{url}/{tar_name}',
                             filename=tar_path,
                             desc='Downloading Heroico')
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    completed_detector.touch()
def download_and_unzip(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'https://zenodo.org/record/3871592/files/MiniLibriMix.zip'
) -> None:
    """
    Download and unzip the MiniLibriMix archive into ``target_dir``.

    :param target_dir: where to store the archive and extracted data.
    :param force_download: if True, fetch the zip even when it is present.
    :param url: the location of the MiniLibriMix zip file.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_path = target_dir / 'MiniLibriMix.zip'
    must_fetch = force_download or not zip_path.is_file()
    if must_fetch:
        urlretrieve_progress(url, filename=zip_path,
                             desc='Downloading MiniLibriMix')
    unzipped_dir = target_dir / 'MiniLibriMix'
    completed_detector = unzipped_dir / '.completed'
    # A previous run already extracted everything - nothing left to do.
    if completed_detector.is_file():
        return
    # Clear partial leftovers, then extract the archive from scratch.
    shutil.rmtree(unzipped_dir, ignore_errors=True)
    with ZipFile(zip_path) as zf:
        zf.extractall(path=target_dir)
    completed_detector.touch()
def download_callhome_metadata(
        target_dir: Pathlike = '.',
        force_download: bool = False,
        url: str = "http://www.openslr.org/resources/10/sre2000-key.tar.gz"
) -> Path:
    """
    Download and unpack the SRE2000 key (CallHome metadata).

    :param target_dir: directory to store the tarball and extracted metadata.
    :param force_download: when True, re-fetch the tarball even if present.
    :param url: location of the sre2000-key tarball.
    :return: the path of the extracted 'sre2000-key' directory.
    """
    root = Path(target_dir)
    sre_dir = root / 'sre2000-key'
    # Short-circuit when the metadata directory is already in place.
    if sre_dir.is_dir():
        return sre_dir
    root.mkdir(parents=True, exist_ok=True)
    archive_name = 'sre2000-key.tar.gz'
    archive_path = root / archive_name
    must_fetch = force_download or not archive_path.is_file()
    if must_fetch:
        urlretrieve_progress(url, filename=archive_path,
                             desc=f'Downloading {archive_name}')
    with tarfile.open(archive_path) as tar:
        tar.extractall(path=root)
    return sre_dir
def download_and_untar(target_dir: Pathlike = '.',
                       force_download: Optional[bool] = False) -> None:
    """
    Download and untar the TEDLIUM v3 corpus.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tar no matter if it exists.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_path = target_dir / 'TEDLIUM_release-3.tgz'
    if force_download or not tar_path.is_file():
        urlretrieve_progress(
            'http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz',
            filename=tar_path,
            desc='Downloading TEDLIUM v3')
    # BUGFIX: the extracted directory is 'TEDLIUM_release-3' (no '.tgz');
    # the old path pointed at the tarball itself, so touching the
    # '.completed' marker inside it could never succeed.
    corpus_dir = target_dir / 'TEDLIUM_release-3'
    completed_detector = corpus_dir / '.completed'
    if not completed_detector.is_file():
        # Remove partial leftovers before unpacking.
        shutil.rmtree(corpus_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
def download_and_untar(target_dir: Pathlike = '.',
                       force_download: bool = False,
                       url: str = SWBD_TEXT_URL) -> Path:
    """
    Download and unpack the Switchboard word-alignment transcripts.

    :param target_dir: directory to store the tarball and transcripts.
    :param force_download: when True, re-fetch the tarball even if present.
    :param url: location of the transcripts tarball.
    :return: the path of the 'swb_ms98_transcriptions' directory.
    """
    root = Path(target_dir)
    transcript_dir = root / 'swb_ms98_transcriptions'
    # Already unpacked? Then there is nothing left to do.
    if transcript_dir.is_dir():
        return transcript_dir
    root.mkdir(parents=True, exist_ok=True)
    archive_name = 'switchboard_word_alignments.tar.gz'
    archive_path = root / archive_name
    must_fetch = force_download or not archive_path.is_file()
    if must_fetch:
        urlretrieve_progress(url, filename=archive_path,
                             desc=f'Downloading {archive_name}')
    with tarfile.open(archive_path) as tar:
        tar.extractall(path=root)
    return transcript_dir
def download_audio(
    target_dir: Path,
    force_download: Optional[bool] = False,
    url: Optional[str] = "http://groups.inf.ed.ac.uk/ami",
    mic: Optional[str] = "ihm",
) -> None:
    """
    Download AMI meeting audio for the selected microphone setup.

    :param target_dir: Path, root dir; files go under 'wav_db/<meeting>/audio'.
    :param force_download: Bool, if True, re-download files even if they exist.
    :param url: str, base URL of the AMI corpus mirror.
    :param mic: str, microphone type: 'ihm', 'ihm-mix', 'sdm', or 'mdm'.
    """
    # Audios
    for item in tqdm(
        itertools.chain.from_iterable(MEETINGS.values()),
        desc="Downloading AMI meetings",
    ):
        # Build the per-meeting list of wav names for the mic setup; this
        # replaces four near-identical copy-pasted download branches.
        if mic == "ihm":
            # A few meetings have five headsets instead of four.
            headset_num = 5 if item in ("EN2001a", "EN2001d", "EN2001e") else 4
            wav_names = [f"{item}.Headset-{m}.wav" for m in range(headset_num)]
        elif mic == "ihm-mix":
            wav_names = [f"{item}.Mix-Headset.wav"]
        elif mic == "sdm":
            wav_names = [f"{item}.Array1-01.wav"]
        elif mic == "mdm":
            wav_names = [
                f"{item}.{array}-{channel}.wav"
                for array in MDM_ARRAYS
                for channel in MDM_CHANNELS
            ]
        else:
            # Unknown mic type: nothing to download (same as the original).
            wav_names = []
        wav_dir = target_dir / "wav_db" / item / "audio"
        for wav_name in wav_names:
            wav_url = f"{url}/AMICorpusMirror/amicorpus/{item}/audio/{wav_name}"
            wav_dir.mkdir(parents=True, exist_ok=True)
            wav_path = wav_dir / wav_name
            if force_download or not wav_path.is_file():
                urlretrieve_progress(wav_url,
                                     filename=wav_path,
                                     desc=f"Downloading {wav_name}")
def download_and_untar(target_dir: Pathlike = '.',
                       force_download: Optional[bool] = False) -> None:
    """
    Download and untar the LJSpeech-1.1 corpus into ``target_dir``.

    :param target_dir: where the tarball and extracted data are stored.
    :param force_download: when True, fetch the tarball even if present.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)
    dataset_name = 'LJSpeech-1.1'
    archive_path = root / f'{dataset_name}.tar.bz2'
    must_fetch = force_download or not archive_path.is_file()
    if must_fetch:
        urlretrieve_progress(
            f'http://data.keithito.com/data/speech/{dataset_name}.tar.bz2',
            filename=archive_path,
            desc='Downloading LJSpeech')
    corpus_dir = root / dataset_name
    completed_detector = corpus_dir / '.completed'
    # Extraction already finished in a previous run.
    if completed_detector.is_file():
        return
    # Clear partial leftovers, then unpack from scratch.
    shutil.rmtree(corpus_dir, ignore_errors=True)
    with tarfile.open(archive_path) as tar:
        tar.extractall(path=root)
    completed_detector.touch()
def download_libritts(
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "all",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the LibriTTS dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param dataset_parts: "all", or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    if dataset_parts == "all":
        dataset_parts = LIBRITTS
    for part in tqdm(dataset_parts, desc="Downloading LibriTTS parts"):
        if part not in LIBRITTS:
            logging.warning(f"Skipping invalid dataset part name: {part}")
            # BUGFIX: previously fell through and downloaded/extracted the
            # invalid part anyway.
            continue
        url = f"{base_url}/60"
        tar_name = f"{part}.tar.gz"
        tar_path = target_dir / tar_name
        part_dir = target_dir / f"LibriTTS/{part}"
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skipping {part} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        # Remove partially extracted data before unpacking.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
    return target_dir
def download_librispeech(
        target_dir: Pathlike = '.',
        dataset_parts: Optional[Union[str, Sequence[str]]] = "mini_librispeech",
        force_download: Optional[bool] = False,
        base_url: Optional[str] = 'http://www.openslr.org/resources') -> None:
    """
    Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Expand the shorthand collection names into explicit split lists.
    if dataset_parts == "librispeech":
        dataset_parts = LIBRISPEECH
    elif dataset_parts == "mini_librispeech":
        dataset_parts = MINI_LIBRISPEECH
    for part in tqdm(dataset_parts, desc='Downloading LibriSpeech parts'):
        # Each split collection lives under a different OpenSLR resource id.
        if part in LIBRISPEECH:
            resource = f'{base_url}/12'
        elif part in MINI_LIBRISPEECH:
            resource = f'{base_url}/31'
        else:
            logging.warning(f'Invalid dataset part name: {part}')
            continue
        tar_name = f'{part}.tar.gz'
        tar_path = target_dir / tar_name
        must_fetch = force_download or not tar_path.is_file()
        if must_fetch:
            urlretrieve_progress(f'{resource}/{tar_name}',
                                 filename=tar_path,
                                 desc=f'Downloading {tar_name}')
        part_dir = target_dir / f'LibriSpeech/{part}'
        completed_detector = part_dir / '.completed'
        if completed_detector.is_file():
            continue
        # Clear partial leftovers, then unpack from scratch.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
def download_and_untar_sph2pipe(
    target_dir: Pathlike,
    url: str,
    force_download: bool = False,
) -> Path:
    """
    Fetch the sph2pipe-2.5 sources and unpack them under ``target_dir``.

    :param target_dir: directory where the tarball and sources are placed.
    :param url: location of the sph2pipe tarball.
    :param force_download: when True, re-download and re-extract even if present.
    :return: path to the unpacked ``sph2pipe-2.5`` source directory.
    """
    root = Path(target_dir)
    sph2pipe_dir = root / "sph2pipe-2.5"
    # The presence of the Makefile indicates a finished extraction.
    if not force_download and (sph2pipe_dir / "Makefile").is_file():
        return sph2pipe_dir
    root.mkdir(parents=True, exist_ok=True)
    archive_name = "sph2pipe-2.5.tar.gz"
    archive_path = root / archive_name
    must_fetch = force_download or not archive_path.is_file()
    if must_fetch:
        urlretrieve_progress(url, filename=archive_path,
                             desc=f"Downloading {archive_name}")
    with tarfile.open(archive_path) as tar:
        tar.extractall(path=root)
    return sph2pipe_dir
def download_audio(
        target_dir: Path,
        force_download: Optional[bool] = False,
        url: Optional[str] = 'http://groups.inf.ed.ac.uk/ami',
        mic: Optional[str] = 'ihm',
) -> None:
    """
    Download AMI meeting audio for the selected microphone setup.

    :param target_dir: Path, root dir; files go under 'wav_db/<meeting>/audio'.
    :param force_download: Bool, if True, re-download files even if they exist.
    :param url: str, base URL of the AMI corpus mirror.
    :param mic: str, microphone type: 'ihm', 'ihm-mix', 'sdm', or 'mdm'.
    """
    # Audios
    for item in tqdm(itertools.chain.from_iterable(MEETINGS.values()),
                     desc='Downloading AMI meetings'):
        # Build the per-meeting list of wav names for the mic setup; this
        # replaces four near-identical copy-pasted download branches.
        if mic == 'ihm':
            # A few meetings have five headsets instead of four.
            headset_num = 5 if item in ('EN2001a', 'EN2001d', 'EN2001e') else 4
            wav_names = [f'{item}.Headset-{m}.wav' for m in range(headset_num)]
        elif mic == 'ihm-mix':
            wav_names = [f'{item}.Mix-Headset.wav']
        elif mic == 'sdm':
            wav_names = [f'{item}.Array1-01.wav']
        elif mic == 'mdm':
            wav_names = [
                f'{item}.{array}-{channel}.wav'
                for array in MDM_ARRAYS
                for channel in MDM_CHANNELS
            ]
        else:
            # Unknown mic type: nothing to download (same as the original).
            wav_names = []
        wav_dir = target_dir / 'wav_db' / item / 'audio'
        for wav_name in wav_names:
            wav_url = f'{url}/AMICorpusMirror/amicorpus/{item}/audio/{wav_name}'
            wav_dir.mkdir(parents=True, exist_ok=True)
            wav_path = wav_dir / wav_name
            if force_download or not wav_path.is_file():
                urlretrieve_progress(wav_url,
                                     filename=wav_path,
                                     desc=f'Downloading {wav_name}')
def download_aishell(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    url = f"{base_url}/33"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    corpus_dir = target_dir / "aishell"
    dataset_tar_name = "data_aishell.tgz"
    resources_tar_name = "resource_aishell.tgz"
    for tar_name in [dataset_tar_name, resources_tar_name]:
        tar_path = target_dir / tar_name
        # Strip the ".tgz" suffix to obtain the extracted directory name.
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / ".completed"
        if completed_detector.is_file():
            # Fixed the garbled log message ("Skipping download of because").
            logging.info(
                f"Skipping {tar_name} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=corpus_dir)
        if tar_name == dataset_tar_name:
            # The main archive contains nested per-speaker tarballs that must
            # also be unpacked in place.
            wav_dir = extracted_dir / "wav"
            for sub_tar_name in os.listdir(wav_dir):
                with tarfile.open(wav_dir / sub_tar_name) as tar:
                    tar.extractall(path=wav_dir)
        completed_detector.touch()
    return corpus_dir
def download_aidatatang_200zh(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    url = f"{base_url}/62"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = "aidatatang_200zh.tgz"
    tar_path = target_dir / tar_name
    corpus_dir = target_dir
    # Strip the ".tgz" suffix to obtain the extracted directory name.
    extracted_dir = corpus_dir / tar_name[:-4]
    completed_detector = extracted_dir / ".completed"
    if completed_detector.is_file():
        logging.info(f"Skipping because {completed_detector} exists.")
        return corpus_dir
    if force_download or not tar_path.is_file():
        urlretrieve_progress(f"{url}/{tar_name}",
                             filename=tar_path,
                             desc=f"Downloading {tar_name}")
    # Remove partially extracted data before unpacking.
    shutil.rmtree(extracted_dir, ignore_errors=True)
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=corpus_dir)
    # The corpus ships nested per-split tarballs that must also be unpacked.
    wav_dir = extracted_dir / "corpus"
    for s in ["test", "dev", "train"]:
        d = wav_dir / s
        logging.info(f"Processing {d}")
        for sub_tar_name in os.listdir(d):
            with tarfile.open(d / sub_tar_name) as tar:
                tar.extractall(path=d)
    completed_detector.touch()
    return corpus_dir
def download_mtedx(
    target_dir: Pathlike = ".",
    languages: Optional[Union[str, Sequence[str]]] = "all",
) -> Path:
    """
    Download and untar the dataset.

    :param: target_dir: Pathlike, the path of the directory where the
        mtdex_corpus directory will be created and to which data will
        be downloaded.
    :param: languages: A str or sequence of strings specifying which
        languages to download. The default 'all', downloads all available
        languages.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir) / "mtedx_corpus"
    target_dir.mkdir(parents=True, exist_ok=True)
    # Normalize the languages argument into an explicit list of iso codes.
    if isinstance(languages, str) and languages != "all":
        langs_list = [languages]
    elif isinstance(languages, (list, tuple)):
        langs_list = list(languages)
    else:
        # "all" (and, for robustness, None) means every available language.
        langs_list = list(ISOCODE2LANG.keys())
    for lang in tqdm(langs_list, "Downloading MTEDx languages"):
        tar_path = target_dir / f"{lang}-{lang}.tgz"
        completed_detector = target_dir / f".{lang}.completed"
        if completed_detector.is_file():
            logging.info(f"Skipping {lang} because {completed_detector} exists.")
            continue
        urlretrieve_progress(
            f"http://www.openslr.org/resources/100/mtedx_{lang}.tgz",
            filename=tar_path,
            desc=f"Downloading MTEDx {lang}",
        )
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
    return target_dir
def download_heroico(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = "http://www.openslr.org/resources/39",
) -> None:
    """
    Download and untar the Heroico corpus (LDC2006S37).

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tar no matter if it exists.
    :param url: str, the url of the resource hosting the tarball.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Plain string literal: the old f-string had no placeholders.
    tar_name = "LDC2006S37.tar.gz"
    tar_path = target_dir / tar_name
    completed_detector = target_dir / ".completed"
    if completed_detector.is_file():
        logging.info(
            f"Skipping {tar_name} because {completed_detector} exists.")
        return
    if force_download or not tar_path.is_file():
        urlretrieve_progress(f"{url}/{tar_name}",
                             filename=tar_path,
                             desc="Downloading Heroico")
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    completed_detector.touch()