def download(d):
    """Download a single VoxForge archive file.

    Relies on names bound into this module's scope: ``voxforge_url``,
    ``archive_dir``, ``total``, and ``counter``.

    :param d: a tuple ``(index, file)`` where ``index`` is the index of the
        file to download and ``file`` is the name of the file to download
    """
    idx, filename = d
    url = voxforge_url + "/" + filename
    # Shared counter tracks overall progress across parallel downloads.
    finished = counter.increment()
    print("Downloading file {} ({}/{})...".format(idx + 1, finished, total))
    maybe_download(filename_of(url), archive_dir, url)
def maybe_download_language(language):
    """Download the SWC archive for *language* if not already present.

    The language name is upper-cased on its first character to match the
    archive/URL naming scheme (e.g. ``german`` -> ``German``).

    :param language: language name as used in the SWC archive names
    :return: whatever ``maybe_download`` returns (the local archive path)
    """
    # language[:1] instead of language[0]: avoids IndexError on an empty
    # string while behaving identically for any non-empty input.
    lang_upper = language[:1].upper() + language[1:]
    return maybe_download(
        SWC_ARCHIVE.format(language=lang_upper),
        CLI_ARGS.base_dir,
        SWC_URL.format(language=lang_upper),
    )
def _download_and_preprocess_data(target_dir):
    """Download, extract, and convert the corpus rooted at *target_dir*."""
    # Work with an absolute path throughout.
    target_dir = os.path.abspath(target_dir)
    # Fetch the archive unless it is already on disk.
    archive = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
    # Unpack it unless the extracted directory already exists.
    _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive)
    # Emit the CSV index files.
    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)
def _download_and_preprocess_data(target_dir):
    """Fetch the Common Voice archive and turn it into DeepSpeech inputs."""
    # Normalise to an absolute path before doing any file work.
    target_dir = os.path.abspath(target_dir)
    # Step 1: download the archive if we do not have it yet.
    archive_file = maybe_download(ARCHIVE_NAME, target_dir, ARCHIVE_URL)
    # Step 2: extract the Common Voice data if not already extracted.
    _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive_file)
    # Step 3: convert the Common Voice CSVs and mp3 data into DeepSpeech
    # CSVs and wav files, skipping work that was already done.
    _maybe_convert_sets(target_dir, ARCHIVE_DIR_NAME)
def _download_and_preprocess_data(data_dir):
    """Download the single-utterance LDC93S1 sample and index it as a CSV."""
    base = "LDC93S1"
    base_url = "https://catalog.ldc.upenn.edu/desc/addenda/"
    # Fetch audio and transcript files if they are not already present.
    wav_file = maybe_download(base + ".wav", data_dir, base_url + base + ".wav")
    txt_file = maybe_download(base + ".txt", data_dir, base_url + base + ".txt")
    with open(txt_file, "r") as fin:
        # Drop the two leading tokens (metadata), lower-case the rest,
        # and strip sentence-final periods.
        words = fin.read().strip().lower().split(" ")[2:]
        transcript = " ".join(words).replace(".", "")
    # One-row CSV in the standard (wav_filename, wav_filesize, transcript) layout.
    row = (os.path.abspath(wav_file), os.path.getsize(wav_file), transcript)
    df = pandas.DataFrame(
        data=[row],
        columns=["wav_filename", "wav_filesize", "transcript"],
    )
    df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
def _download_and_preprocess_data(target_dir, english_compatible=False):
    """Download and convert the TrainingSpeech corpus under *target_dir*.

    :param target_dir: destination directory for archive and converted data
    :param english_compatible: forwarded to the set conversion step
    """
    # Use an absolute path for all subsequent file operations.
    target_dir = os.path.abspath(target_dir)
    # Download the zip archive if it is not already on disk.
    archive = maybe_download(
        "ts_" + ARCHIVE_NAME + ".zip", target_dir, ARCHIVE_URL
    )
    # Extract it if the target directory does not yet exist.
    _maybe_extract(target_dir, ARCHIVE_DIR_NAME, archive)
    # Convert TrainingSpeech data to DeepSpeech CSVs and wav files.
    _maybe_convert_sets(
        target_dir, ARCHIVE_DIR_NAME, english_compatible=english_compatible
    )
def _download_and_preprocess_data(csv_url, target_dir):
    """Download, verify, extract, and index the CCPMF corpus.

    Downloads the multi-part archive listed in the remote CSV at *csv_url*,
    verifies each part against the expected SHA1 sums in
    ``DATASET_RELEASE_SHA``, extracts the data, and produces the source
    text used by the later extraction / conversion steps.

    :param csv_url: URL of a CSV whose rows (minus header/trailer) are the
        download URLs for the archive parts
    :param target_dir: destination directory for archives and extracted data
    :return: path to the generated ``data.txt`` sources file
    """
    dataset_sources = os.path.join(
        target_dir,
        "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020",
        "data.txt",
    )
    # Fast path: everything was already downloaded and prepared.
    if os.path.exists(dataset_sources):
        return dataset_sources

    # Making path absolute
    target_dir = os.path.abspath(target_dir)

    # NOTE(review): requests.get/head without timeout= can hang forever on a
    # stalled connection — consider adding an explicit timeout.
    csv_ref = requests.get(csv_url).text.split("\r\n")[1:-1]
    for part in csv_ref:
        # The real filename of each part is advertised in the
        # Content-Disposition header, e.g. 'attachment; filename="x.zip"'.
        part_filename = (
            requests.head(part)
            .headers.get("Content-Disposition")
            .split(" ")[1]
            .split("=")[1]
            .replace('"', "")
        )
        if not os.path.exists(os.path.join(target_dir, part_filename)):
            # Return value (local path) is not needed here; the file just
            # has to exist on disk for the checksum step below.
            maybe_download(part_filename, target_dir, part)

    def _big_sha1(fname):
        # Stream the file in chunks so multi-GB parts do not load into RAM.
        s = hashlib.sha1()
        buffer_size = 65536
        with open(fname, "rb") as f:
            while True:
                data = f.read(buffer_size)
                if not data:
                    break
                s.update(data)
        return s.hexdigest()

    # Verify every downloaded part before extraction; abort on mismatch.
    for (sha1, filename) in DATASET_RELEASE_SHA:
        print("Checking {} SHA1:".format(filename))
        csum = _big_sha1(os.path.join(target_dir, filename))
        if csum == sha1:
            print("\t{}: OK {}".format(filename, sha1))
        else:
            print("\t{}: ERROR: expected {}, computed {}".format(
                filename, sha1, csum))
        assert csum == sha1

    # Conditionally extract data
    _maybe_extract(
        target_dir,
        "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020",
        "transcriptionsxml_audiomp3_mefr_ccpmf_2012-2020_2.zip",
        "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020.zip",
    )

    # Produce source text for extraction / conversion
    return _maybe_create_sources(
        os.path.join(
            target_dir, "transcriptionsXML_audioMP3_MEFR_CCPMF_2012-2020"
        )
    )
def _download_and_preprocess_data(data_dir):
    """Download the TED-LIUM v2 corpus and write train/dev/test CSVs."""
    ted_dir = "TEDLIUM_release2"
    archive_name = "TEDLIUM_release2.tar.gz"
    archive_url = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
    # Download the archive unless it is already present.
    archive = maybe_download(archive_name, data_dir, archive_url)
    # Extract the TED data unless already extracted.
    _maybe_extract(data_dir, ted_dir, archive)
    # Convert the TED sph audio to wav where needed.
    _maybe_convert_wav(data_dir, ted_dir)
    # Split the wav and text data into per-sentence sets where needed.
    train_files, dev_files, test_files = _maybe_split_sentences(
        data_dir, ted_dir)
    # Persist each set as a CSV index.
    train_files.to_csv(path.join(data_dir, "ted-train.csv"), index=False)
    dev_files.to_csv(path.join(data_dir, "ted-dev.csv"), index=False)
    test_files.to_csv(path.join(data_dir, "ted-test.csv"), index=False)
def _download_and_preprocess_data(data_dir):
    """Download, extract, convert, and index the LibriSpeech corpus.

    Runs four stages over the seven LibriSpeech splits, each guarded so
    already-completed work is skipped:

    1. download each ``.tar.gz`` archive into *data_dir*;
    2. extract each archive (existence is tested in ``data_dir/LibriSpeech``
       because all archives share that root directory);
    3. convert FLAC audio to wav and split the ``*.trans.txt``
       transcriptions into one file per utterance, from
       ``data_dir/LibriSpeech/split/1/2/...`` into
       ``data_dir/LibriSpeech/split-wav/...``;
    4. write one ``librivox-<split>.csv`` index per split into *data_dir*.

    :param data_dir: destination directory for archives and converted data
    """
    # The seven corpus splits; every stage below iterates them in this order,
    # which preserves the original download/extract/convert sequence.
    splits = [
        "train-clean-100",
        "train-clean-360",
        "train-other-500",
        "dev-clean",
        "dev-other",
        "test-clean",
        "test-other",
    ]
    url_template = "http://www.openslr.org/resources/12/{}.tar.gz"
    LIBRIVOX_DIR = "LibriSpeech"
    work_dir = os.path.join(data_dir, LIBRIVOX_DIR)

    # Stage 1: conditionally download each archive.
    print(
        "Downloading Librivox data set (55GB) into {} if not already present...".format(
            data_dir
        )
    )
    archives = []
    with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
        for i, split in enumerate(splits):
            url = url_template.format(split)
            # os.path.split(url)[1] is the archive's basename.
            archives.append(maybe_download(os.path.split(url)[1], data_dir, url))
            bar.update(i)

    # Stage 2: conditionally extract each archive. We extract into data_dir,
    # but test for existence in data_dir/LibriSpeech because the archives
    # share that root.
    print("Extracting librivox data if not already extracted...")
    with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
        for i, (split, archive) in enumerate(zip(splits, archives)):
            _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, split), archive)
            bar.update(i)

    # Stage 3: convert FLAC to wav and split transcriptions, e.g. from:
    #   data_dir/LibriSpeech/split/1/2/1-2-3.flac
    #   data_dir/LibriSpeech/split/1/2/1-2.trans.txt
    # to:
    #   data_dir/LibriSpeech/split-wav/1-2-3.wav
    #   data_dir/LibriSpeech/split-wav/1-2-0.txt, 1-2-1.txt, ...
    print("Converting FLAC to WAV and splitting transcriptions...")
    frames = []
    with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
        for i, split in enumerate(splits):
            frames.append(
                _convert_audio_and_split_sentences(work_dir, split, split + "-wav")
            )
            bar.update(i)

    # Stage 4: write each set to disk as a CSV file.
    for split, frame in zip(splits, frames):
        frame.to_csv(
            os.path.join(data_dir, "librivox-" + split + ".csv"), index=False
        )
def download_and_prepare():
    """Fetch the TUDA archive, produce its CSVs, then drop the archive."""
    archive_path = maybe_download(TUDA_ARCHIVE, CLI_ARGS.base_dir, TUDA_URL)
    extracted_dir = maybe_extract(archive_path)
    write_csvs(extracted_dir)
    # The extracted data is kept; only the downloaded archive is removed.
    cleanup(archive_path)