def prepare_dataset(url, md5sum, target_dir, annotation_path):
    """Make sure the THCHS-30 data is on disk, then build the annotation text.

    If ``<target_dir>/data_thchs30`` does not exist, the archive is fetched
    via ``download(url, md5sum, target_dir)``, unpacked into ``target_dir``
    and the downloaded archive file is deleted. Otherwise a skip notice is
    printed. In both cases ``create_annotation_text`` is then called on the
    data directory.

    Args:
        url: Location of the archive, forwarded to ``download``.
        md5sum: Checksum forwarded to ``download``.
        target_dir: Directory that holds (or will hold) ``data_thchs30``.
        annotation_path: Forwarded to ``create_annotation_text``.
    """
    data_dir = os.path.join(target_dir, 'data_thchs30')

    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. THCHS-30 data already exists in %s." % target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)
        # The archive is no longer needed once its contents are extracted.
        os.remove(archive_path)

    create_annotation_text(data_dir, annotation_path)
def prepare_dataset(url, md5sum, target_dir, annotation_path):
    """Make sure the ST-CMDS corpus is on disk, then build the annotation text.

    If ``<target_dir>/ST-CMDS-20170001_1-OS`` does not exist, the archive is
    fetched via ``download(url, md5sum, target_dir)``, unpacked into
    ``target_dir`` and the downloaded archive file is deleted. Otherwise a
    skip notice is printed. In both cases ``create_annotation_text`` is then
    called on the data directory.

    Args:
        url: Location of the archive, forwarded to ``download``.
        md5sum: Checksum forwarded to ``download``.
        target_dir: Directory that holds (or will hold) the corpus folder.
        annotation_path: Forwarded to ``create_annotation_text``.
    """
    data_dir = os.path.join(target_dir, 'ST-CMDS-20170001_1-OS')

    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Free ST-Chinese-Mandarin-Corpus data already exists in %s." % target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)
        # The archive is no longer needed once its contents are extracted.
        os.remove(archive_path)

    create_annotation_text(data_dir, annotation_path)
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Download and unpack the data if needed, then create the summary manifest.

    The download/unpack step is skipped (with a printed notice) when
    ``<target_dir>/LibriSpeech`` already exists. ``create_manifest`` is always
    called afterwards, on ``target_dir`` itself.

    Args:
        url: Location of the archive, forwarded to ``download``.
        md5sum: Checksum forwarded to ``download``.
        target_dir: Directory the archive is downloaded and unpacked into.
        manifest_path: Forwarded to ``create_manifest``.
    """
    extracted_root = os.path.join(target_dir, "LibriSpeech")

    if os.path.exists(extracted_root):
        print("Skip downloading and unpacking. Data already exists in %s." % target_dir)
    else:
        # Fetch the archive, then extract it in place; the archive is kept.
        unpack(download(url, md5sum, target_dir), target_dir)

    # Build the manifest whether or not anything was downloaded.
    create_manifest(target_dir, manifest_path)
def download_and_unpack(target_dir, url):
    """Fetch the ``.tgz`` archives from ``url`` and unpack them.

    Archives are downloaded with ``download_multi`` into ``<target_dir>/tgz``.
    When that call returns a non-zero exit code, a failure message is printed
    and nothing is unpacked. Otherwise every file found under the ``tgz``
    directory is printed, and those ending in ``.tgz`` are unpacked into
    ``<target_dir>/audio``.

    Args:
        target_dir: Parent directory for the ``tgz`` and ``audio`` folders.
        url: Download location, forwarded to ``download_multi``.
    """
    fetch_options = '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np'
    archive_dir = os.path.join(target_dir, 'tgz')

    status = download_multi(url, archive_dir, fetch_options)
    if status != 0:
        print('Download tgz audio files failed with exit code %d.' % status)
        return

    print('Download done, start unpacking ...')
    extract_dir = os.path.join(target_dir, 'audio')
    for dirpath, _, filenames in os.walk(archive_dir):
        for filename in filenames:
            print(filename)
            if not filename.endswith('.tgz'):
                continue
            unpack(os.path.join(dirpath, filename), extract_dir)
def prepare_dataset(url, md5sum, target_dir, manifest_path):
    """Make sure the AISHELL data is on disk, then create the manifest.

    If ``<target_dir>/data_aishell`` does not exist, the archive is fetched
    via ``download(url, md5sum, target_dir)`` and unpacked into
    ``target_dir``; afterwards every file found under ``data_aishell/wav`` is
    itself unpacked into the folder that contains it. Otherwise a skip notice
    is printed. ``create_manifest`` is always called on the data directory.

    Args:
        url: Location of the archive, forwarded to ``download``.
        md5sum: Checksum forwarded to ``download``.
        target_dir: Directory that holds (or will hold) ``data_aishell``.
        manifest_path: Forwarded to ``create_manifest``.
    """
    data_dir = os.path.join(target_dir, 'data_aishell')

    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Data already exists in %s." % target_dir)
    else:
        archive_path = download(url, md5sum, target_dir)
        unpack(archive_path, target_dir)

        wav_root = os.path.join(data_dir, 'wav')
        # sorted() consumes the whole walk up front, so the set of inner
        # archives is fixed before any of them is extracted.
        for folder, _, names in sorted(os.walk(wav_root)):
            for inner_archive in names:
                # NOTE(review): the third positional argument presumably asks
                # unpack to remove the archive afterwards -- confirm in unpack.
                unpack(os.path.join(folder, inner_archive), folder, True)

    create_manifest(data_dir, manifest_path)