def download(d):
    """Download a single file, reporting progress on stdout.

    Uses ``voxforge_url``, ``archive_dir``, ``total`` and ``counter`` from the
    surrounding scope.

    :param d: an ``(index, file)`` pair, where ``index`` is the position of
              the file in the download list and ``file`` is its name
    """
    index, file_name = d
    url = '/'.join((voxforge_url, file_name))
    # The shared counter yields how many downloads have been started so far.
    started = counter.increment()
    message = 'Downloading file {} ({}/{})...'.format(index + 1, started, total)
    print(message)
    maybe_download(filename_of(url), archive_dir, url)
 def download(d):
     """Fetch one file and print a progress line for it.

     ``voxforge_url``, ``archive_dir``, ``total`` and ``counter`` are taken
     from the enclosing scope.

     :param d: tuple ``(index, file)``: the index of the file to download
               and the name of that file
     """
     position, name = d
     source_url = '/'.join((voxforge_url, name))
     # ``position`` is zero-based; the message shows it one-based.
     seen = counter.increment()
     line = 'Downloading file {} ({}/{})...'.format(position + 1, seen, total)
     print(line)
     maybe_download(filename_of(source_url), archive_dir, source_url)
def _download_and_preprocess_data(data_dir):
    """Fetch the LDC93S1 wav/txt pair and write ``ldc93s1.csv`` into ``data_dir``.

    The CSV holds a single row with the absolute wav path, the wav file size
    in bytes and the cleaned-up transcript.

    :param data_dir: directory passed to ``maybe_download`` and used as the
                     location of the resulting CSV file
    """
    base_name = "LDC93S1"
    base_url = "https://catalog.ldc.upenn.edu/desc/addenda/"

    # Conditionally download the audio and its transcription
    wav_name = base_name + ".wav"
    wav_path = maybe_download(wav_name, data_dir, base_url + wav_name)
    txt_name = base_name + ".txt"
    txt_path = maybe_download(txt_name, data_dir, base_url + txt_name)

    with open(txt_path, "r") as fin:
        raw_text = fin.read()

    # Drop the first two space-separated fields (presumably sample offsets --
    # TODO confirm against the file format), lowercase, and strip periods.
    words = raw_text.strip().lower().split(' ')[2:]
    transcript = ' '.join(words).replace('.', '')

    row = (os.path.abspath(wav_path), os.path.getsize(wav_path), transcript)
    frame = pandas.DataFrame(data=[row],
                             columns=["wav_filename", "wav_filesize", "transcript"])
    frame.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
def _download_and_preprocess_data(data_dir):
    """Download, extract and convert the LibriSpeech ``test-clean`` split.

    The archive is fetched into ``data_dir``, extracted, converted via
    ``_convert_audio_and_split_sentences`` and the result is written to
    ``librivox-test-clean.csv`` inside ``data_dir``.

    :param data_dir: directory that receives the archive and the CSV file
    """
    print("Downloading a part of Librivox data (test-clean) into {} if not already present...".format(data_dir))
    # NOTE(review): progressbar2 documents this keyword as ``widgets`` (a list);
    # ``widget=`` is kept unchanged here -- confirm it has the intended effect.
    with progressbar.ProgressBar(max_value=3, widget=progressbar.AdaptiveETA) as bar:
        # Step 1: conditionally download the archive into data_dir
        archive_url = "http://www.openslr.org/resources/12/test-clean.tar.gz"
        archive_name = os.path.split(archive_url)[1]
        archive_path = maybe_download(archive_name, data_dir, archive_url)
        bar.update(1)

        # Step 2: conditionally extract. The archive is extracted into
        # data_dir, but existence is tested in data_dir/LibriSpeech because
        # that is the root directory inside the archive.
        print("Extracting librivox data if not already extracted...")
        librivox_dir = "LibriSpeech"
        librivox_root = os.path.join(data_dir, librivox_dir)
        extracted_dir = os.path.join(librivox_dir, "test-clean")
        _maybe_extract(data_dir, extracted_dir, archive_path)
        bar.update(2)

        # Step 3: convert FLAC data to wav and split the transcriptions
        print("Converting FLAC to WAV and splitting transcriptions...")
        samples = _convert_audio_and_split_sentences(librivox_root, "test-clean", "test-clean-wav")
        bar.update(3)

        # Write the set to disk as a CSV file
        csv_path = os.path.join(data_dir, "librivox-test-clean.csv")
        samples.to_csv(csv_path, index=False)
Esempio n. 5
0
def _download_and_preprocess_data(target_dir):
    """Run the download -> extract -> CSV pipeline inside ``target_dir``.

    :param target_dir: working directory; converted to an absolute path
                       before it is handed to the helpers
    """
    abs_target = path.abspath(target_dir)
    # Conditionally download the archive
    archive = maybe_download(ARCHIVE_NAME, abs_target, ARCHIVE_URL)
    # Conditionally extract it
    _maybe_extract(abs_target, ARCHIVE_DIR_NAME, archive)
    # Produce the CSV files
    _maybe_convert_sets(abs_target, ARCHIVE_DIR_NAME)
Esempio n. 6
0
def _download_and_preprocess_data(target_dir):
    """Download, extract and convert the Common Voice archive.

    :param target_dir: directory to work in; resolved to an absolute path
                       first
    """
    work_dir = path.abspath(target_dir)
    # Conditionally download the archive
    downloaded = maybe_download(ARCHIVE_NAME, work_dir, ARCHIVE_URL)
    # Conditionally extract the common voice data
    _maybe_extract(work_dir, ARCHIVE_DIR_NAME, downloaded)
    # Conditionally convert common voice CSV files and mp3 data to
    # DeepSpeech CSVs and wav
    _maybe_convert_sets(work_dir, ARCHIVE_DIR_NAME)
Esempio n. 7
0
def _download_and_preprocess_data(target_dir):
    """Fetch the Common Voice archive, unpack it and convert its sets.

    :param target_dir: destination directory; an absolute version of it is
                       passed to every helper
    """
    destination = path.abspath(target_dir)
    # Download only if needed
    archive_file = maybe_download(ARCHIVE_NAME, destination, ARCHIVE_URL)
    # Extract the common voice data only if needed
    _maybe_extract(destination, ARCHIVE_DIR_NAME, archive_file)
    # Convert common voice CSV files and mp3 data to DeepSpeech CSVs and wav,
    # again only if needed
    _maybe_convert_sets(destination, ARCHIVE_DIR_NAME)
Esempio n. 8
0
def _download_and_preprocess_data(target_dir, english_compatible=False):
    """Download, extract and convert the TrainingSpeech archive.

    :param target_dir: directory to work in; made absolute before use
    :param english_compatible: forwarded unchanged to ``_maybe_convert_sets``
    """
    abs_target = path.abspath(target_dir)
    # Conditionally download the archive, stored locally as ts_<ARCHIVE_NAME>.zip
    local_name = 'ts_' + ARCHIVE_NAME + '.zip'
    archive = maybe_download(local_name, abs_target, ARCHIVE_URL)
    # Conditionally extract the archive data
    _maybe_extract(abs_target, ARCHIVE_DIR_NAME, archive)
    # Conditionally convert TrainingSpeech data to DeepSpeech CSVs and wav
    _maybe_convert_sets(abs_target, ARCHIVE_DIR_NAME,
                        english_compatible=english_compatible)
Esempio n. 9
0
def _download_and_preprocess_data(target_dir):
    """Download and extract the archive, then build CSVs and wav files.

    :param target_dir: directory to work in; resolved to an absolute path
                       before any helper is called
    """
    root_dir = path.abspath(target_dir)
    # Conditionally download the archive
    archive_file = maybe_download(ARCHIVE_NAME, root_dir, ARCHIVE_URL)
    # Conditionally extract it
    _maybe_extract(root_dir, ARCHIVE_DIR_NAME, archive_file)
    # Produce CSV files and convert the ogg data to wav
    _maybe_convert_sets(root_dir, ARCHIVE_DIR_NAME)
def _download_and_preprocess_data(target_dir, english_compatible=False):
    """Fetch the TrainingSpeech archive, unpack it and convert its sets.

    :param target_dir: destination directory; its absolute form is used
    :param english_compatible: passed through to ``_maybe_convert_sets``
    """
    destination = path.abspath(target_dir)
    # Download only if needed; the local file is named ts_<ARCHIVE_NAME>.zip
    zip_name = ''.join(('ts_', ARCHIVE_NAME, '.zip'))
    zip_path = maybe_download(zip_name, destination, ARCHIVE_URL)
    # Extract the archive data only if needed
    _maybe_extract(destination, ARCHIVE_DIR_NAME, zip_path)
    # Convert TrainingSpeech data to DeepSpeech CSVs and wav only if needed
    _maybe_convert_sets(destination, ARCHIVE_DIR_NAME,
                        english_compatible=english_compatible)
def _download_and_preprocess_data(data_dir):
    """Download and prepare TEDLIUM release 2, then write the three set CSVs.

    Writes ``ted-train.csv``, ``ted-dev.csv`` and ``ted-test.csv`` into
    ``data_dir``.

    :param data_dir: directory that receives the archive, the extracted data
                     and the CSV files
    """
    # Conditionally download the archive
    archive_name = "TEDLIUM_release2.tar.gz"
    archive_url = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
    archive_path = maybe_download(archive_name, data_dir, archive_url)

    # Conditionally extract, then conditionally convert the sph data to wav
    ted_dir = "TEDLIUM_release2"
    _maybe_extract(data_dir, ted_dir, archive_path)
    _maybe_convert_wav(data_dir, ted_dir)

    # Conditionally split the wav and text data into sentences
    train_set, dev_set, test_set = _maybe_split_sentences(data_dir, ted_dir)

    # Write the sets to disk as CSV files, in train / dev / test order
    outputs = (
        ("ted-train.csv", train_set),
        ("ted-dev.csv", dev_set),
        ("ted-test.csv", test_set),
    )
    for csv_name, frame in outputs:
        frame.to_csv(path.join(data_dir, csv_name), index=False)
Esempio n. 12
0
def download_and_prepare():
    """Download the TUDA archive, extract it, write the CSVs and clean up.

    The download location is taken from ``CLI_ARGS.base_dir``.
    """
    archive_path = maybe_download(TUDA_ARCHIVE, CLI_ARGS.base_dir, TUDA_URL)
    extracted_path = maybe_extract(archive_path)
    write_csvs(extracted_path)
    # Cleanup receives the downloaded archive, not the extracted data.
    cleanup(archive_path)
Esempio n. 13
0
def _download_and_preprocess_data(data_dir):
    """Download, extract and convert all seven LibriSpeech splits.

    The work runs in three stages, each with its own progress bar and each
    covering every split before the next stage starts: download the archives
    into ``data_dir``, extract them, then convert audio and transcriptions.
    Finally one ``librivox-<split>.csv`` per split is written to ``data_dir``.

    :param data_dir: directory that receives the archives, the extracted
                     data and the CSV files
    """
    # Processing order of the splits; every stage walks them in this order.
    splits = (
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
    )
    url_template = "http://www.openslr.org/resources/12/{}.tar.gz"

    # Conditionally download data to data_dir
    print(
        "Downloading Librivox data set (55GB) into {} if not already present..."
        .format(data_dir))
    archives = []
    # NOTE(review): progressbar2 documents this keyword as ``widgets`` (a list);
    # ``widget=`` is kept unchanged here -- confirm it has the intended effect.
    with progressbar.ProgressBar(max_value=7,
                                 widget=progressbar.AdaptiveETA) as bar:
        for step, split in enumerate(splits):
            url = url_template.format(split)
            # The local file name is the last path component of the URL.
            archive_name = os.path.split(url)[1]
            archives.append(maybe_download(archive_name, data_dir, url))
            bar.update(step)

    # Conditionally extract LibriSpeech data
    # We extract each archive into data_dir, but test for existence in
    # data_dir/LibriSpeech because the archives share that root.
    print("Extracting librivox data if not already extracted...")
    with progressbar.ProgressBar(max_value=7,
                                 widget=progressbar.AdaptiveETA) as bar:
        librivox_dir = "LibriSpeech"
        work_dir = os.path.join(data_dir, librivox_dir)

        for step, (split, archive) in enumerate(zip(splits, archives)):
            _maybe_extract(data_dir, os.path.join(librivox_dir, split),
                           archive)
            bar.update(step)

    # Convert FLAC data to wav, from:
    #  data_dir/LibriSpeech/split/1/2/1-2-3.flac
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-3.wav
    #
    # And split LibriSpeech transcriptions, from:
    #  data_dir/LibriSpeech/split/1/2/1-2.trans.txt
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-0.txt
    #  data_dir/LibriSpeech/split-wav/1-2-1.txt
    #  data_dir/LibriSpeech/split-wav/1-2-2.txt
    #  ...
    print("Converting FLAC to WAV and splitting transcriptions...")
    converted = []
    with progressbar.ProgressBar(max_value=7,
                                 widget=progressbar.AdaptiveETA) as bar:
        for step, split in enumerate(splits):
            converted.append(
                _convert_audio_and_split_sentences(work_dir, split,
                                                   split + "-wav"))
            bar.update(step)

    # Write sets to disk as CSV files
    for split, frame in zip(splits, converted):
        csv_path = os.path.join(data_dir, "librivox-" + split + ".csv")
        frame.to_csv(csv_path, index=False)
Esempio n. 14
0
def maybe_download_language(language):
    """Download the SWC archive for ``language`` and return the helper's result.

    The first character of ``language`` is upper-cased (the rest is left
    untouched) before it is substituted into ``SWC_ARCHIVE`` and ``SWC_URL``.

    :param language: non-empty language name
    :return: whatever ``maybe_download`` returns for the archive
    """
    first_char, remainder = language[0], language[1:]
    capitalized = first_char.upper() + remainder
    archive_name = SWC_ARCHIVE.format(language=capitalized)
    base_dir = CLI_ARGS.base_dir
    archive_url = SWC_URL.format(language=capitalized)
    return maybe_download(archive_name, base_dir, archive_url)
Esempio n. 15
0
def _download_and_preprocess_data(data_dir):
    """Prepare the complete LibriSpeech corpus below ``data_dir``.

    Three passes are made over the seven splits, each under its own progress
    bar: download every archive, extract every archive, then convert audio
    and transcriptions. Afterwards each converted set is saved as
    ``librivox-<split>.csv`` in ``data_dir``.

    :param data_dir: directory for the archives, extracted data and CSVs
    """
    # (split name, archive URL) in the order the splits are processed.
    sources = [
        ("train-clean-100", "http://www.openslr.org/resources/12/train-clean-100.tar.gz"),
        ("train-clean-360", "http://www.openslr.org/resources/12/train-clean-360.tar.gz"),
        ("train-other-500", "http://www.openslr.org/resources/12/train-other-500.tar.gz"),
        ("dev-clean", "http://www.openslr.org/resources/12/dev-clean.tar.gz"),
        ("dev-other", "http://www.openslr.org/resources/12/dev-other.tar.gz"),
        ("test-clean", "http://www.openslr.org/resources/12/test-clean.tar.gz"),
        ("test-other", "http://www.openslr.org/resources/12/test-other.tar.gz"),
    ]

    # Pass 1: conditionally download data to data_dir
    print("Downloading Librivox data set (55GB) into {} if not already present...".format(data_dir))
    downloaded = []
    # NOTE(review): progressbar2 documents this keyword as ``widgets`` (a list);
    # ``widget=`` is kept unchanged here -- confirm it has the intended effect.
    with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
        for index, (split_name, url) in enumerate(sources):
            # Local file name is the final path component of the URL.
            local_name = os.path.split(url)[1]
            downloaded.append((split_name, maybe_download(local_name, data_dir, url)))
            bar.update(index)

    # Pass 2: conditionally extract LibriSpeech data
    # We extract each archive into data_dir, but test for existence in
    # data_dir/LibriSpeech because the archives share that root.
    print("Extracting librivox data if not already extracted...")
    with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
        librivox_dir = "LibriSpeech"
        work_dir = os.path.join(data_dir, librivox_dir)

        for index, (split_name, archive_path) in enumerate(downloaded):
            _maybe_extract(data_dir, os.path.join(librivox_dir, split_name), archive_path)
            bar.update(index)

    # Pass 3: convert FLAC data to wav, from:
    #  data_dir/LibriSpeech/split/1/2/1-2-3.flac
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-3.wav
    #
    # And split LibriSpeech transcriptions, from:
    #  data_dir/LibriSpeech/split/1/2/1-2.trans.txt
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-0.txt
    #  data_dir/LibriSpeech/split-wav/1-2-1.txt
    #  data_dir/LibriSpeech/split-wav/1-2-2.txt
    #  ...
    print("Converting FLAC to WAV and splitting transcriptions...")
    datasets = []
    with progressbar.ProgressBar(max_value=7,  widget=progressbar.AdaptiveETA) as bar:
        for index, (split_name, _) in enumerate(sources):
            wav_dir_name = split_name + "-wav"
            datasets.append((split_name, _convert_audio_and_split_sentences(work_dir, split_name, wav_dir_name)))
            bar.update(index)

    # Write sets to disk as CSV files
    for split_name, dataset in datasets:
        dataset.to_csv(os.path.join(data_dir, "librivox-" + split_name + ".csv"), index=False)