Esempio n. 1
0
def download_vctk(target_dir: Pathlike = '.',
                  force_download: Optional[bool] = False,
                  url: Optional[str] = CREST_VCTK_URL) -> None:
    """
    Download and untar/unzip the VCTK dataset.

    The archive is stored in ``target_dir`` under the last path component of
    ``url`` and extracted into ``target_dir``. After a successful extraction a
    ``.completed`` marker file is created in the directory named after the
    archive (archive name without ``.zip`` / ``.tar.gz``).

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the archive even if it already exists.
    :param url: str, the url of tarred/zipped VCTK corpus.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    archive_name = url.split('/')[-1]
    archive_path = target_dir / archive_name
    part_dir = target_dir / archive_name.replace('.zip', '').replace(
        '.tar.gz', '')
    completed_detector = part_dir / '.completed'
    already_extracted = completed_detector.is_file()
    # When the corpus is already unpacked there is no reason to fetch the
    # archive again (e.g. after the user deleted it to reclaim disk space),
    # unless a re-download was explicitly requested.
    if already_extracted and not force_download:
        return
    if force_download or not archive_path.is_file():
        urlretrieve_progress(url,
                             filename=archive_path,
                             desc=f'Downloading {archive_name}')
    if already_extracted:
        return
    # Remove leftovers of an interrupted extraction before unpacking again.
    shutil.rmtree(part_dir, ignore_errors=True)
    opener = zipfile.ZipFile if archive_name.endswith(
        '.zip') else tarfile.open
    with opener(archive_path) as archive:
        archive.extractall(path=target_dir)
    # Mark success only after the archive has been fully extracted and closed.
    completed_detector.touch()
Esempio n. 2
0
def download_vctk(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = CREST_VCTK_URL,
) -> Path:
    """
    Fetch the VCTK archive and unpack it (zip or tar) into ``target_dir``.

    :param target_dir: Pathlike, directory in which the archive is stored and extracted.
    :param force_download: Bool, if True, download the archive even when it is already on disk.
    :param url: str, location of the tarred/zipped VCTK corpus.
    :return: path of the directory named after the archive (without its
        ``.zip`` / ``.tar.gz`` extension) inside ``target_dir``.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    archive_name = url.rsplit("/", 1)[-1]
    stem = archive_name.replace(".zip", "").replace(".tar.gz", "")
    extracted_dir = root / stem
    marker = extracted_dir / ".completed"

    # A marker file left by a previous successful run short-circuits everything.
    if marker.is_file():
        logging.info(
            f"Skipping {archive_name} because {marker} exists.")
        return extracted_dir

    archive_path = root / archive_name
    if force_download or not archive_path.is_file():
        urlretrieve_progress(
            url, filename=archive_path, desc=f"Downloading {archive_name}"
        )

    # Start from a clean directory, then unpack with the matching reader.
    shutil.rmtree(extracted_dir, ignore_errors=True)
    if archive_name.endswith(".zip"):
        open_archive = zipfile.ZipFile
    else:
        open_archive = tarfile.open
    with open_archive(archive_path) as archive:
        archive.extractall(path=root)
    marker.touch()
    return extracted_dir
Esempio n. 3
0
def download_mobvoihotwords(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> None:
    """
    Download and untar the MobvoiHotwords dataset and its resources archive.

    Both archives are saved in ``target_dir`` and extracted into
    ``target_dir/MobvoiHotwords``; each extracted directory gets a
    ``.completed`` marker so finished archives are skipped on later calls.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)
    corpus_dir = root / "MobvoiHotwords"
    resource_url = f"{base_url}/87"

    archives = (
        "mobvoi_hotword_dataset.tgz",
        "mobvoi_hotword_dataset_resources.tgz",
    )
    for tar_name in archives:
        # The extracted directory carries the archive name minus ".tgz".
        unpacked_dir = corpus_dir / tar_name[:-4]
        marker = unpacked_dir / ".completed"
        if marker.is_file():
            logging.info(
                f"Skip {tar_name} because {marker} exists.")
            continue

        tar_path = root / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(
                f"{resource_url}/{tar_name}",
                filename=tar_path,
                desc=f"Downloading {tar_name}",
            )

        # Drop any partial extraction before unpacking from scratch.
        shutil.rmtree(unpacked_dir, ignore_errors=True)
        with tarfile.open(tar_path) as archive:
            archive.extractall(path=corpus_dir)
        marker.touch()
Esempio n. 4
0
def download_adept(
    target_dir: Pathlike = ".",
    force_download: bool = False,
) -> Path:
    """
    Download and unzip the ADEPT dataset.

    :param target_dir: Pathlike, directory that receives ``ADEPT.zip`` and the
        extracted ``ADEPT`` directory.
    :param force_download: Bool, if True, download the archive even if it already exists.
    :return: the path to downloaded and extracted directory with data.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    adept_dir = root / "ADEPT"
    marker = adept_dir / ".completed"
    # A marker from an earlier successful run means there is nothing left to do.
    if marker.is_file():
        logging.info(
            f"Skipping downloading ADEPT because {marker} exists.")
        return adept_dir

    zip_name = "ADEPT.zip"
    zip_path = root / zip_name
    archive_missing = not zip_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(
            ADEPT_URL, filename=zip_path, desc=f"Downloading {zip_name}"
        )

    # Wipe a possibly half-extracted directory, then unpack everything into it.
    shutil.rmtree(adept_dir, ignore_errors=True)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(path=adept_dir)
    marker.touch()

    return adept_dir
Esempio n. 5
0
def download_cmu_arctic(
    target_dir: Pathlike = ".",
    speakers: Sequence[str] = SPEAKERS,
    force_download: Optional[bool] = False,
    base_url: Optional[str] = BASE_URL,
) -> None:
    """
    Download and untar the CMU Arctic dataset, one archive per speaker.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param speakers: a list of speakers to download. By default, downloads all.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of CMU Arctic download site; the archive
        file name is appended to it directly.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    progress = tqdm(speakers, desc="Downloading/unpacking CMU Arctic speakers")
    for spk in progress:
        name = f"cmu_us_{spk}_arctic"
        speaker_dir = root / name
        marker = speaker_dir / ".completed"
        # Speakers that were fully unpacked earlier are left untouched.
        if marker.is_file():
            logging.info(f"Skiping {spk} because {marker} exists.")
            continue

        tar_name = f"{name}.tar.bz2"
        tar_path = root / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(
                f"{base_url}{tar_name}",
                filename=tar_path,
                desc=f"Downloading {tar_name}",
            )

        # Clear partial output, then extract next to the archive.
        shutil.rmtree(speaker_dir, ignore_errors=True)
        with tarfile.open(tar_path) as archive:
            archive.extractall(path=root)
        marker.touch()
Esempio n. 6
0
def download_hifitts(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> None:
    """
    Download and untar the HiFi TTS dataset.

    The archive ``hi_fi_tts_v0.tar.gz`` is saved in ``target_dir`` and extracted
    there; a ``.completed`` marker inside ``hi_fi_tts_v0`` records success.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tar even if it already exists.
    :param base_url: str, the url of the OpenSLR resources.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    tar_name = "hi_fi_tts_v0.tar.gz"
    extracted_dir = root / "hi_fi_tts_v0"
    marker = extracted_dir / ".completed"
    if marker.is_file():
        logging.info(
            f"Skipping HiFiTTS preparation because {marker} exists."
        )
        return

    tar_path = root / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(
            f"{base_url}/109/{tar_name}",
            filename=tar_path,
            desc=f"Downloading {tar_name}",
        )

    # Remove remnants of an interrupted extraction before unpacking.
    shutil.rmtree(extracted_dir, ignore_errors=True)
    with tarfile.open(tar_path) as archive:
        archive.extractall(path=root)
    marker.touch()
Esempio n. 7
0
def download_timit(
    target_dir: Pathlike = ".",
    force_download: bool = False,
    base_url: Optional[str] = "https://data.deepai.org/timit.zip",
) -> Path:
    """
    Download and unzip the dataset TIMIT.

    The archive is stored as ``target_dir/timit.zip`` and extracted into
    ``target_dir/timit``. A ``.completed`` marker file is written into that
    directory after a successful extraction so that later calls return early.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the zip even if it already exists.
    :param base_url: str, the URL of the TIMIT dataset to download.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    zip_name = "timit.zip"
    zip_path = target_dir / zip_name
    corpus_dir = zip_path.with_suffix("")
    completed_detector = corpus_dir / ".completed"
    if completed_detector.is_file():
        logging.info(
            f"Skipping {zip_name} because {completed_detector} exists.")
        return corpus_dir
    if force_download or not zip_path.is_file():
        urlretrieve_progress(base_url,
                             filename=zip_path,
                             desc=f"Downloading {zip_name}")

    with zipfile.ZipFile(zip_path) as zip_file:
        corpus_dir.mkdir(parents=True, exist_ok=True)
        zip_file.extractall(path=str(corpus_dir))
    # Without this marker the early-return check above could never succeed
    # and the archive would be re-extracted on every call.
    completed_detector.touch()
    return corpus_dir
Esempio n. 8
0
def download_ali_meeting(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[
        str
    ] = "https://speech-lab-share-data.oss-cn-shanghai.aliyuncs.com/",
) -> Path:
    """
    Download and untar the AliMeeting archives (train far/near, eval and test).

    Every archive is extracted on each call; no completion marker is kept.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, url prefix to which ``/AliMeeting/openlr/`` and the
        archive name are appended.
    :return: ``target_dir`` as a ``Path``; archives are extracted directly into it.
    """
    remote_dir = f"{base_url}/AliMeeting/openlr/"
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    archives = (
        "Train_Ali_far.tar.gz",
        "Train_Ali_near.tar.gz",
        "Eval_Ali.tar.gz",
        "Test_Ali.tar.gz",
    )
    for tar_name in archives:
        tar_path = root / tar_name
        must_fetch = force_download or not tar_path.is_file()
        if must_fetch:
            urlretrieve_progress(
                f"{remote_dir}/{tar_name}",
                filename=tar_path,
                desc=f"Downloading {tar_name}",
            )
        with tarfile.open(tar_path) as archive:
            archive.extractall(path=root)

    return root
Esempio n. 9
0
def download_musan(
    target_dir: Pathlike = ".",
    url: Optional[str] = MUSAN_URL,
    force_download: Optional[bool] = False,
) -> Path:
    """
    Download and untar the MUSAN corpus.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url that downloads file called "musan.tar.gz".
    :param force_download: bool, if True, download the archive even if it already exists.
    :return: ``target_dir/musan``.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    tar_name = "musan.tar.gz"
    musan_dir = root / "musan"
    # The marker lives next to the archive, not inside the corpus directory.
    marker = root / ".musan_completed"
    if marker.is_file():
        logging.info(
            f"Skipping {tar_name} because {marker} exists.")
        return musan_dir

    tar_path = root / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(
            url, filename=tar_path, desc=f"Downloading {tar_name}"
        )
    with tarfile.open(tar_path) as archive:
        archive.extractall(path=root)
        marker.touch()
    return musan_dir
Esempio n. 10
0
def download_yesno(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = _DEFAULT_URL,
):
    """Download and untar the yesno dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The archive is extracted into ``target_dir``; the ``.completed``
        marker is created inside ``target_dir/waves_yesno``.
    :param force_download: Bool, if True, download the tar file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    waves_dir = root / "waves_yesno"
    marker = waves_dir / ".completed"
    if marker.is_file():
        logging.info(f"Skipping - {marker} exists.")
        return

    tar_path = root / "waves_yesno.tar.gz"
    archive_missing = not tar_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(
            f"{url}", filename=tar_path, desc="Downloading waves_yesno.tar.gz"
        )

    # Discard any incomplete extraction before unpacking again.
    shutil.rmtree(waves_dir, ignore_errors=True)
    with tarfile.open(tar_path) as archive:
        archive.extractall(path=root)
    marker.touch()
Esempio n. 11
0
def download_musan(
    target_dir: Pathlike = '.',
    url: Optional[str] = MUSAN_URL,
    force_download: Optional[bool] = False,
) -> None:
    """
    Download and untar the MUSAN corpus.

    The archive is stored as ``target_dir/musan.tar.gz`` and extracted into
    ``target_dir``. A ``.musan_completed`` marker file in ``target_dir`` records
    a successful extraction.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url that downloads file called "musan.tar.gz".
    :param force_download: bool, if True, download the archive even if it already exists.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    tar_name = 'musan.tar.gz'
    tar_path = target_dir / tar_name
    completed_detector = target_dir / '.musan_completed'
    already_extracted = completed_detector.is_file()
    # Do not re-download the archive when the corpus is already unpacked
    # (e.g. the archive was deleted afterwards), unless explicitly forced.
    if already_extracted and not force_download:
        return
    if force_download or not tar_path.is_file():
        urlretrieve_progress(url,
                             filename=tar_path,
                             desc=f'Downloading {tar_name}')
    if not already_extracted:
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
Esempio n. 12
0
def download_aishell4(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> None:
    """
    Download and untar the AISHELL-4 archives (train L/M/S and test).

    Every archive is extracted into ``target_dir`` on each call; no completion
    marker is kept.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    resource_url = f"{base_url}/111"
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    archives = ("train_L.tar.gz", "train_M.tar.gz", "train_S.tar.gz", "test.tar.gz")
    for tar_name in archives:
        tar_path = root / tar_name
        must_fetch = force_download or not tar_path.is_file()
        if must_fetch:
            urlretrieve_progress(
                f"{resource_url}/{tar_name}",
                filename=tar_path,
                desc=f"Downloading {tar_name}",
            )
        with tarfile.open(tar_path) as archive:
            archive.extractall(path=root)
Esempio n. 13
0
def download_rir_noise(
    target_dir: Pathlike = ".",
    url: Optional[str] = RIR_NOISE_ZIP_URL,
    force_download: Optional[bool] = False,
) -> Path:
    """
    Download and unzip the RIR Noise corpus.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param url: str, the url that downloads file called "rirs_noises.zip".
    :param force_download: bool, if True, download the archive even if it already exists.
    :return: ``target_dir/RIRS_NOISES``; extraction is skipped when that path
        already exists.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    zip_name = "rirs_noises.zip"
    zip_path = root / zip_name
    if force_download or not zip_path.exists():
        urlretrieve_progress(url, zip_path, desc=f"Downloading {zip_name}")
        logging.info(f"Downloaded {zip_name}.")
    else:
        logging.info(f"Skipping {zip_name} because file exists.")

    extracted_dir = root / "RIRS_NOISES"
    if extracted_dir.exists():
        return extracted_dir

    logging.info(f"Unzipping {zip_name}.")
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(root)
    return extracted_dir
Esempio n. 14
0
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        base_url: Optional[str] = 'http://www.openslr.org/resources') -> None:
    """
    Download and untar the AISHELL data and resource archives.

    Both archives are saved in ``target_dir`` and extracted into
    ``target_dir/aishell``; each extracted directory gets a ``.completed``
    marker so that extraction is skipped on later calls.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    resource_url = f'{base_url}/33'
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)
    corpus_dir = root / 'aishell'

    for tar_name in ('data_aishell.tgz', 'resource_aishell.tgz'):
        tar_path = root / tar_name
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f'{resource_url}/{tar_name}',
                                 filename=tar_path,
                                 desc=f'Downloading {tar_name}')

        # The extracted directory carries the archive name minus ".tgz".
        unpacked_dir = corpus_dir / tar_name[:-4]
        marker = unpacked_dir / '.completed'
        if marker.is_file():
            continue
        shutil.rmtree(unpacked_dir, ignore_errors=True)
        with tarfile.open(tar_path) as archive:
            archive.extractall(path=corpus_dir)
            marker.touch()
Esempio n. 15
0
def download_earnings21(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = _DEFAULT_URL,
) -> Path:
    """Download and unzip the dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are saved to target_dir/earnings21/
        Please note that the github repository contains other additional datasets and
        using this call, you will be downloading all of them and then throwing them out.
    :param force_download: Bool, if True, download the zip file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    :return: the path to downloaded and extracted directory with data.
    """
    logging.info(
        "Downloading Earnings21 from github repository is not very efficient way"
        " how to obtain the corpus. You will be downloading other data as well."
    )
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    extracted_dir = target_dir / "earnings21"

    zip_path = target_dir / "speech-datasets-main.zip"

    completed_detector = extracted_dir / ".lhotse-download.completed"
    if completed_detector.is_file():
        logging.info(f"Skipping - {completed_detector} exists.")
        return extracted_dir

    if force_download or not zip_path.is_file():
        urlretrieve_progress(url,
                             filename=zip_path,
                             desc="Getting speech-datasets-main.zip")

    # Remove leftovers of a previous, incomplete extraction.
    shutil.rmtree(extracted_dir, ignore_errors=True)

    # Only members whose name contains "earnings21" are unpacked; the local is
    # deliberately not called ``zip`` so the builtin stays usable.
    with zipfile.ZipFile(zip_path) as archive:
        for member in archive.namelist():
            if "earnings21" in member:
                archive.extract(member, path=target_dir)

    # shutil.move() only accepts path-like objects from Python 3.9 on; with an
    # existing directory as destination older versions fail on Path inputs,
    # so plain strings are passed.
    unpacked_root = target_dir / "speech-datasets-main"
    shutil.move(str(unpacked_root / "earnings21"), str(target_dir))
    shutil.rmtree(unpacked_root)

    completed_detector.touch()

    return extracted_dir
Esempio n. 16
0
def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'http://www.openslr.org/resources/39'
) -> None:
    """
    Download and untar the Heroico archive ``LDC2006S37.tar.gz``.

    The archive is stored in ``target_dir`` and extracted there. After a
    successful extraction a ``.completed`` marker file is created in
    ``target_dir``; later calls skip the extraction while the marker exists,
    unless ``force_download`` is set.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download and extract the tar even if
        it was already processed.
    :param url: str, the url of the directory that contains the archive.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'LDC2006S37.tar.gz'
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(f'{url}/{tar_name}', filename=tar_path, desc='Downloading Heroico')

    completed_detector = target_dir / '.completed'
    # The marker used to be written but never read, so the archive was
    # unpacked again on every call. A forced download still re-extracts.
    if completed_detector.is_file() and not force_download:
        return
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    completed_detector.touch()
Esempio n. 17
0
def download_and_unzip(
        target_dir: Pathlike = '.',
        force_download: Optional[bool] = False,
        url: Optional[str] = 'https://zenodo.org/record/3871592/files/MiniLibriMix.zip'
) -> None:
    """
    Download and unzip the MiniLibriMix archive.

    The archive is saved as ``target_dir/MiniLibriMix.zip`` and extracted into
    ``target_dir``; a ``.completed`` marker inside ``target_dir/MiniLibriMix``
    makes later calls skip the extraction.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the zip even if it already exists.
    :param url: str, the url of the MiniLibriMix zip archive.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    zip_path = root / 'MiniLibriMix.zip'
    archive_missing = not zip_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(url, filename=zip_path, desc='Downloading MiniLibriMix')

    corpus_dir = root / 'MiniLibriMix'
    marker = corpus_dir / '.completed'
    if marker.is_file():
        return
    # Remove a partial extraction before unpacking from scratch.
    shutil.rmtree(corpus_dir, ignore_errors=True)
    with ZipFile(zip_path) as archive:
        archive.extractall(path=root)
        marker.touch()
Esempio n. 18
0
def download_callhome_metadata(
        target_dir: Pathlike = '.',
        force_download: bool = False,
        url: str = "http://www.openslr.org/resources/10/sre2000-key.tar.gz"
) -> Path:
    """
    Download and untar the ``sre2000-key`` archive.

    If ``target_dir/sre2000-key`` already exists as a directory it is returned
    immediately, regardless of ``force_download``.

    :param target_dir: Pathlike, the path of the dir to store the archive and its contents.
    :param force_download: bool, if True, download the tar even if it already exists.
    :param url: str, the url of the ``sre2000-key.tar.gz`` archive.
    :return: ``target_dir/sre2000-key``.
    """
    root = Path(target_dir)
    key_dir = root / 'sre2000-key'
    if key_dir.is_dir():
        return key_dir

    root.mkdir(parents=True, exist_ok=True)
    tar_name = 'sre2000-key.tar.gz'
    tar_path = root / tar_name
    archive_missing = not tar_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(url, filename=tar_path, desc=f'Downloading {tar_name}')
    with tarfile.open(tar_path) as archive:
        archive.extractall(path=root)
    return key_dir
Esempio n. 19
0
def download_and_untar(target_dir: Pathlike = '.',
                       force_download: Optional[bool] = False) -> None:
    """
    Download and untar the TEDLIUM v3 archive.

    The archive is stored as ``target_dir/TEDLIUM_release-3.tgz`` and extracted
    into ``target_dir``. A ``.completed`` marker inside
    ``target_dir/TEDLIUM_release-3`` makes later calls skip the extraction.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tar even if it already exists.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_path = target_dir / 'TEDLIUM_release-3.tgz'
    if force_download or not tar_path.is_file():
        urlretrieve_progress(
            'http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz',
            filename=tar_path,
            desc='Downloading TEDLIUM v3')
    # The corpus directory is the archive name WITHOUT the ".tgz" suffix.
    # Pointing it at the archive file itself made the marker path live
    # "inside" a regular file, so touch() always raised NotADirectoryError.
    corpus_dir = target_dir / 'TEDLIUM_release-3'
    completed_detector = corpus_dir / '.completed'
    if not completed_detector.is_file():
        shutil.rmtree(corpus_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()
Esempio n. 20
0
def download_and_untar(target_dir: Pathlike = '.',
                       force_download: bool = False,
                       url: str = SWBD_TEXT_URL) -> Path:
    """
    Download and untar the Switchboard word alignments archive.

    If ``target_dir/swb_ms98_transcriptions`` already exists as a directory it
    is returned immediately, regardless of ``force_download``.

    :param target_dir: Pathlike, the path of the dir to store the archive and its contents.
    :param force_download: bool, if True, download the tar even if it already exists.
    :param url: str, the url of the archive to download.
    :return: ``target_dir/swb_ms98_transcriptions``.
    """
    root = Path(target_dir)
    transcripts = root / 'swb_ms98_transcriptions'
    if transcripts.is_dir():
        return transcripts

    root.mkdir(parents=True, exist_ok=True)
    tar_name = 'switchboard_word_alignments.tar.gz'
    tar_path = root / tar_name
    archive_missing = not tar_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(
            url, filename=tar_path, desc=f'Downloading {tar_name}')
    with tarfile.open(tar_path) as archive:
        archive.extractall(path=root)
    return transcripts
Esempio n. 21
0
def download_audio(
    target_dir: Path,
    force_download: Optional[bool] = False,
    url: Optional[str] = "http://groups.inf.ed.ac.uk/ami",
    mic: Optional[str] = "ihm",
) -> None:
    """
    Download the AMI meeting recordings for the selected microphone setting.

    Files are stored under ``target_dir/wav_db/<meeting>/audio``. A ``mic``
    value other than "ihm", "ihm-mix", "sdm" or "mdm" downloads nothing.

    :param target_dir: Path, root directory for the downloaded audio.
    :param force_download: Bool, if True, download files even if they already exist.
    :param url: str, base url of the AMI corpus mirror.
    :param mic: str, one of "ihm", "ihm-mix", "sdm", "mdm".
    """
    meetings = itertools.chain.from_iterable(MEETINGS.values())
    for item in tqdm(meetings, desc="Downloading AMI meetings"):
        # First work out which wav files belong to this meeting and mic type.
        if mic == "ihm":
            # Three meetings were recorded with five headsets, the rest with four.
            headset_num = 5 if item in ("EN2001a", "EN2001d", "EN2001e") else 4
            wav_names = [f"{item}.Headset-{m}.wav" for m in range(headset_num)]
        elif mic == "ihm-mix":
            wav_names = [f"{item}.Mix-Headset.wav"]
        elif mic == "sdm":
            wav_names = [f"{item}.Array1-01.wav"]
        elif mic == "mdm":
            wav_names = [
                f"{item}.{array}-{channel}.wav"
                for array in MDM_ARRAYS
                for channel in MDM_CHANNELS
            ]
        else:
            wav_names = []

        # Then fetch every file that is missing (or all of them when forced).
        for wav_name in wav_names:
            wav_dir = target_dir / "wav_db" / item / "audio"
            wav_dir.mkdir(parents=True, exist_ok=True)
            wav_path = wav_dir / wav_name
            if force_download or not wav_path.is_file():
                urlretrieve_progress(
                    f"{url}/AMICorpusMirror/amicorpus/{item}/audio/{wav_name}",
                    filename=wav_path,
                    desc=f"Downloading {wav_name}",
                )
Esempio n. 22
0
def download_and_untar(target_dir: Pathlike = '.',
                       force_download: Optional[bool] = False) -> None:
    """
    Download and untar the LJSpeech 1.1 archive.

    The archive is saved as ``target_dir/LJSpeech-1.1.tar.bz2`` and extracted
    into ``target_dir``; a ``.completed`` marker inside
    ``target_dir/LJSpeech-1.1`` makes later calls skip the extraction.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tar even if it already exists.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    dataset_name = 'LJSpeech-1.1'
    tar_name = f'{dataset_name}.tar.bz2'
    tar_path = root / tar_name
    archive_missing = not tar_path.is_file()
    if force_download or archive_missing:
        urlretrieve_progress(
            f'http://data.keithito.com/data/speech/{tar_name}',
            filename=tar_path,
            desc='Downloading LJSpeech')

    corpus_dir = root / dataset_name
    marker = corpus_dir / '.completed'
    if marker.is_file():
        return
    # Remove a partial extraction before unpacking from scratch.
    shutil.rmtree(corpus_dir, ignore_errors=True)
    with tarfile.open(tar_path) as archive:
        archive.extractall(path=root)
        marker.touch()
Esempio n. 23
0
def download_libritts(
    target_dir: Pathlike = ".",
    dataset_parts: Optional[Union[str, Sequence[str]]] = "all",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the LibriTTS dataset.

    Each part is stored as ``<part>.tar.gz`` in ``target_dir`` and extracted
    there; a ``.completed`` marker inside ``target_dir/LibriTTS/<part>`` makes
    later calls skip parts that were already unpacked.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param dataset_parts: "all" to download every part listed in ``LIBRITTS``,
        a single part name, or a list of part names (e.g. "dev-clean").
        Names that are not in ``LIBRITTS`` are skipped with a warning.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    if dataset_parts == "all":
        dataset_parts = LIBRITTS
    elif isinstance(dataset_parts, str):
        # A bare string would otherwise be iterated character by character.
        dataset_parts = [dataset_parts]

    url = f"{base_url}/60"
    for part in tqdm(dataset_parts, desc="Downloading LibriTTS parts"):
        if part not in LIBRITTS:
            logging.warning(f"Skipping invalid dataset part name: {part}")
            # Actually skip: there is nothing to download for an unknown part.
            continue
        tar_name = f"{part}.tar.gz"
        tar_path = target_dir / tar_name
        part_dir = target_dir / f"LibriTTS/{part}"
        completed_detector = part_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skipping {part} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        # Remove partially unpacked files, if any, and unpack everything.
        shutil.rmtree(part_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=target_dir)
        completed_detector.touch()

    return target_dir
Esempio n. 24
0
def download_librispeech(
        target_dir: Pathlike = '.',
        dataset_parts: Optional[Union[str,
                                      Sequence[str]]] = "mini_librispeech",
        force_download: Optional[bool] = False,
        base_url: Optional[str] = 'http://www.openslr.org/resources') -> None:
    """
    Download and untar LibriSpeech and/or MiniLibrispeech parts.

    Each part is stored as ``<part>.tar.gz`` in ``target_dir`` and extracted
    there; a ``.completed`` marker inside ``target_dir/LibriSpeech/<part>``
    makes later calls skip the extraction of that part.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars even if they already exist.
    :param base_url: str, the url of the OpenSLR resources.
    """
    root = Path(target_dir)
    root.mkdir(parents=True, exist_ok=True)

    # Expand the two preset names into their lists of parts.
    if dataset_parts == "librispeech":
        dataset_parts = LIBRISPEECH
    elif dataset_parts == "mini_librispeech":
        dataset_parts = MINI_LIBRISPEECH

    for part in tqdm(dataset_parts, desc='Downloading LibriSpeech parts'):
        # The OpenSLR resource number depends on which corpus the part is from.
        if part in LIBRISPEECH:
            resource_id = '12'
        elif part in MINI_LIBRISPEECH:
            resource_id = '31'
        else:
            logging.warning(f'Invalid dataset part name: {part}')
            continue

        tar_name = f'{part}.tar.gz'
        tar_path = root / tar_name
        archive_missing = not tar_path.is_file()
        if force_download or archive_missing:
            urlretrieve_progress(f'{base_url}/{resource_id}/{tar_name}',
                                 filename=tar_path,
                                 desc=f'Downloading {tar_name}')

        extracted_dir = root / f'LibriSpeech/{part}'
        marker = extracted_dir / '.completed'
        if marker.is_file():
            continue
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as archive:
            archive.extractall(path=root)
            marker.touch()
Esempio n. 25
0
def download_and_untar_sph2pipe(
    target_dir: Pathlike,
    url: str,
    force_download: bool = False,
) -> Path:
    """
    Download and untar the sph2pipe 2.5 sources.

    When ``target_dir/sph2pipe-2.5/Makefile`` already exists and
    ``force_download`` is not set, nothing is downloaded or extracted.

    :param target_dir: Pathlike, the path of the dir to store the archive and its contents.
    :param url: str, the url of the ``sph2pipe-2.5.tar.gz`` archive.
    :param force_download: bool, if True, download and extract even if already present.
    :return: ``target_dir/sph2pipe-2.5``.
    """
    root = Path(target_dir)
    sources_dir = root / "sph2pipe-2.5"
    already_unpacked = (sources_dir / "Makefile").is_file()
    if already_unpacked and not force_download:
        return sources_dir

    root.mkdir(parents=True, exist_ok=True)
    tar_name = "sph2pipe-2.5.tar.gz"
    tar_path = root / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(
            url, filename=tar_path, desc=f"Downloading {tar_name}"
        )
    with tarfile.open(tar_path) as archive:
        archive.extractall(path=root)
    return sources_dir
Esempio n. 26
0
def download_audio(
    target_dir: Path,
    force_download: Optional[bool] = False,
    url: Optional[str] = 'http://groups.inf.ed.ac.uk/ami',
    mic: Optional[str] = 'ihm',
) -> None:
    """
    Download the AMI meeting recordings for one microphone setting.

    Files are stored as ``target_dir/wav_db/<meeting>/audio/<wav name>``.

    :param target_dir: the directory under which ``wav_db`` is created.
    :param force_download: Bool, if True, download a file even if it already exists.
    :param url: str, the base url of the AMI corpus mirror.
    :param mic: str, one of 'ihm', 'ihm-mix', 'sdm' or 'mdm'. Any other value
        logs a warning and downloads nothing.
    """
    if mic not in ('ihm', 'ihm-mix', 'sdm', 'mdm'):
        # Local import keeps this block self-contained.
        import logging
        logging.warning(f'Unsupported mic setting: {mic}; nothing was downloaded.')
        return

    # Accept plain strings as well as Path objects.
    target_dir = Path(target_dir)

    for item in tqdm(itertools.chain.from_iterable(MEETINGS.values()),
                     desc='Downloading AMI meetings'):
        wav_dir = target_dir / 'wav_db' / item / 'audio'
        wav_dir.mkdir(parents=True, exist_ok=True)
        for wav_name in _ami_wav_names(item, mic):
            wav_path = wav_dir / wav_name
            if force_download or not wav_path.is_file():
                wav_url = f'{url}/AMICorpusMirror/amicorpus/{item}/audio/{wav_name}'
                urlretrieve_progress(wav_url,
                                     filename=wav_path,
                                     desc=f'Downloading {wav_name}')


def _ami_wav_names(item: str, mic: str) -> list:
    """Return the wav file names to fetch for meeting ``item`` with setting ``mic``."""
    if mic == 'ihm':
        # These three meetings are fetched with five headset channels, all others with four.
        headset_num = 5 if item in ('EN2001a', 'EN2001d', 'EN2001e') else 4
        return [f'{item}.Headset-{m}.wav' for m in range(headset_num)]
    if mic == 'ihm-mix':
        return [f'{item}.Mix-Headset.wav']
    if mic == 'sdm':
        return [f'{item}.Array1-01.wav']
    if mic == 'mdm':
        return [f'{item}.{array}-{channel}.wav'
                for array in MDM_ARRAYS
                for channel in MDM_CHANNELS]
    return []
Esempio n. 27
0
def download_aishell(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
        An archive whose ``.completed`` marker already exists is skipped regardless.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the path to downloaded and extracted directory with data.
    """
    url = f"{base_url}/33"
    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    corpus_dir = target_dir / "aishell"
    dataset_tar_name = "data_aishell.tgz"
    resources_tar_name = "resource_aishell.tgz"
    for tar_name in [dataset_tar_name, resources_tar_name]:
        tar_path = target_dir / tar_name
        # Drop the ".tgz" suffix to get the directory name used for this archive.
        extracted_dir = corpus_dir / tar_name[:-4]
        completed_detector = extracted_dir / ".completed"
        if completed_detector.is_file():
            logging.info(
                f"Skipping {tar_name} because {completed_detector} exists.")
            continue
        if force_download or not tar_path.is_file():
            urlretrieve_progress(f"{url}/{tar_name}",
                                 filename=tar_path,
                                 desc=f"Downloading {tar_name}")
        # Remove leftovers of an unfinished extraction before unpacking again.
        shutil.rmtree(extracted_dir, ignore_errors=True)
        with tarfile.open(tar_path) as tar:
            tar.extractall(path=corpus_dir)
        if tar_name == dataset_tar_name:
            # Every entry of the "wav" directory is itself opened as a tar
            # archive and unpacked in place.
            wav_dir = extracted_dir / "wav"
            for sub_tar_name in os.listdir(wav_dir):
                with tarfile.open(wav_dir / sub_tar_name) as tar:
                    tar.extractall(path=wav_dir)
        # The marker is written only after all extraction steps have finished.
        completed_detector.touch()

    return corpus_dir
Esempio n. 28
0
def download_aidatatang_200zh(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    base_url: Optional[str] = "http://www.openslr.org/resources",
) -> Path:
    """
    Download and untar the dataset, including the archives nested inside it.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: Bool, if True, download the tar even if it already exists.
    :param base_url: str, the url of the OpenSLR resources.
    :return: the directory the archive was extracted into (``target_dir`` itself).
    """
    corpus_dir = Path(target_dir)
    corpus_dir.mkdir(parents=True, exist_ok=True)

    archive_name = "aidatatang_200zh.tgz"
    archive_path = corpus_dir / archive_name
    extracted_dir = corpus_dir / archive_name[:-len(".tgz")]
    completed_detector = extracted_dir / ".completed"

    # The marker file signals that a previous run finished every extraction step.
    if completed_detector.is_file():
        logging.info(f"Skipping because {completed_detector} exists.")
        return corpus_dir

    needs_download = force_download or not archive_path.is_file()
    if needs_download:
        urlretrieve_progress(
            f"{base_url}/62/{archive_name}",
            filename=archive_path,
            desc=f"Downloading {archive_name}",
        )

    # Start from a clean directory in case an earlier extraction was interrupted.
    shutil.rmtree(extracted_dir, ignore_errors=True)
    with tarfile.open(archive_path) as archive:
        archive.extractall(path=corpus_dir)

    # Each split directory holds further archives that are unpacked in place.
    for split in ("test", "dev", "train"):
        split_dir = extracted_dir / "corpus" / split
        logging.info(f"Processing {split_dir}")
        for entry in os.listdir(split_dir):
            with tarfile.open(split_dir / entry) as nested:
                nested.extractall(path=split_dir)

    completed_detector.touch()
    return corpus_dir
Esempio n. 29
0
def download_mtedx(
    target_dir: Pathlike = ".",
    languages: Optional[Union[str, Sequence[str]]] = "all",
) -> Path:
    """
    Download and untar the requested MTEDx language archives.

    :param target_dir: Pathlike, the directory in which the ``mtedx_corpus``
        directory is created; archives are downloaded and extracted there.
    :param languages: a str, or a list/tuple of str, naming the languages to
        download. ``"all"`` (the default) selects every key of ISOCODE2LANG,
        and so does any value that is neither a str nor a list/tuple (e.g. None).
    :return: the path of the ``mtedx_corpus`` directory.
    """
    corpus_dir = Path(target_dir) / "mtedx_corpus"
    corpus_dir.mkdir(parents=True, exist_ok=True)

    if isinstance(languages, (list, tuple)):
        selected = languages
    elif isinstance(languages, str) and languages != "all":
        selected = [languages]
    else:
        # "all", None, or anything else falls back to every known language.
        selected = list(ISOCODE2LANG.keys())

    for lang in tqdm(selected, "Downloading MTEDx languages"):
        done_marker = corpus_dir / f".{lang}.completed"
        if done_marker.is_file():
            logging.info(f"Skipping {lang} because {done_marker} exists.")
            continue

        archive_path = corpus_dir / f"{lang}-{lang}.tgz"
        urlretrieve_progress(
            f"http://www.openslr.org/resources/100/mtedx_{lang}.tgz",
            filename=archive_path,
            desc=f"Downloading MTEDx {lang}",
        )
        with tarfile.open(archive_path) as archive:
            archive.extractall(path=corpus_dir)
        # Written only after a successful extraction so reruns can skip this language.
        done_marker.touch()

    return corpus_dir
Esempio n. 30
0
def download_heroico(
    target_dir: Pathlike = ".",
    force_download: Optional[bool] = False,
    url: Optional[str] = "http://www.openslr.org/resources/39",
) -> None:
    """
    Download the Heroico archive and untar it into ``target_dir``.

    :param target_dir: Pathlike, the directory where the tar is stored and extracted.
    :param force_download: Bool, if True, download the tar even if it already exists.
        Nothing is done when ``target_dir/.completed`` is already present.
    :param url: str, the url of the directory that holds the tar.
    """
    out_dir = Path(target_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    archive_name = "LDC2006S37.tar.gz"
    done_marker = out_dir / ".completed"
    if done_marker.is_file():
        logging.info(
            f"Skipping {archive_name} because {done_marker} exists.")
        return

    archive_path = out_dir / archive_name
    needs_download = force_download or not archive_path.is_file()
    if needs_download:
        urlretrieve_progress(
            f"{url}/{archive_name}",
            filename=archive_path,
            desc="Downloading Heroico",
        )

    with tarfile.open(archive_path) as archive:
        archive.extractall(path=out_dir)
    # The marker lets later calls return early.
    done_marker.touch()