def main():
    """Fetch, unpack and convert the Common Voice corpus, then write manifests.

    Reads ``target_dir``, ``tar_path``, ``files_to_process``, ``min_duration``
    and ``max_duration`` from the module-level ``args`` namespace.

    Steps:
      1. Reuse the archive at ``args.tar_path`` when it exists, otherwise
         download ``COMMON_VOICE_URL`` into the target directory.
      2. Extract the archive into ``<target_dir>/CV_unpacked``.
      3. Run ``convert_to_wav`` for every CSV listed in
         ``args.files_to_process`` (comma-separated).
      4. Run ``create_manifest`` for every converted split.
    """
    # NOTE(review): both base locations are hard-coded to one machine's layout;
    # consider exposing them as command-line options.
    target_dir = '/media/data/Soroush_data/CVSpeech' + args.target_dir
    output_final_dir = '/media/data/Soroush_data/CVSpeech/data/'
    print(target_dir)
    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
    os.makedirs(target_unpacked_dir, exist_ok=True)

    if args.tar_path and os.path.exists(args.tar_path):
        print('Find existing file {}'.format(args.tar_path))
        # Open the exact path that was just verified to exist. The previous
        # code rebuilt a different path from the current working directory,
        # which did not have to match the file that passed the check.
        target_file = args.tar_path
    else:
        print("Could not find downloaded Common Voice archive, Downloading corpus...")
        print(target_dir)
        filename = wget.download(COMMON_VOICE_URL, target_dir)
        target_file = os.path.join(target_dir, os.path.basename(filename))

    print("Unpacking corpus to {} ...".format(target_unpacked_dir))
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(target_file) as tar:
        tar.extractall(target_unpacked_dir)

    # Split the comma-separated list once and reuse it for both passes.
    csv_files = args.files_to_process.split(',')

    for csv_file in csv_files:
        print(csv_file)
        split_name = os.path.splitext(csv_file)[0]
        convert_to_wav(os.path.join(target_unpacked_dir, 'cv_corpus_v1/', csv_file),
                       os.path.join(target_dir, split_name))

    print('Creating manifests...')
    for csv_file in csv_files:
        split_name = os.path.splitext(csv_file)[0]
        create_manifest(os.path.join(target_dir, split_name),
                        os.path.join(output_final_dir, split_name) + '_manifest.csv',
                        args.min_duration,
                        args.max_duration)
def main():
    """Download, unpack and convert the selected LibriSpeech archives.

    Reads ``target_dir``, ``files_to_use``, ``min_duration`` and
    ``max_duration`` from the module-level ``args`` namespace.

    For every split in ``LIBRI_SPEECH_URLS`` the function creates
    ``<target_dir>/<split>/wav`` and ``<target_dir>/<split>/txt``, then
    processes each URL whose text contains one of the comma-separated
    ``args.files_to_use`` entries: the archive is downloaded when it is not
    already present, extracted, deleted, and every ``.flac`` file found under
    the extracted ``LibriSpeech`` directory is passed to ``_process_file``.
    Finally a manifest is created per split.

    Raises:
        AssertionError: if the ``LibriSpeech`` directory is missing after an
            archive has been extracted.
    """
    target_dl_dir = args.target_dir
    os.makedirs(target_dl_dir, exist_ok=True)

    files_to_dl = args.files_to_use.strip().split(',')

    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(target_dl_dir, split_type)
        split_wav_dir = os.path.join(split_dir, "wav")
        split_txt_dir = os.path.join(split_dir, "txt")
        # makedirs creates the intermediate split_dir as well.
        for directory in (split_dir, split_wav_dir, split_txt_dir):
            os.makedirs(directory, exist_ok=True)

        # Start from a clean slate: drop leftovers of an interrupted run.
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)

        for url in lst_libri_urls:
            # Only handle archives the user asked for.
            if not any(wanted in url for wanted in files_to_dl):
                print("Skipping url: {}".format(url))
                continue

            filename = url.split("/")[-1]
            target_filename = os.path.join(split_dir, filename)
            if not os.path.exists(target_filename):
                wget.download(url, split_dir)

            print("Unpacking {}...".format(filename))
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(target_filename) as tar:
                tar.extractall(split_dir)
            os.remove(target_filename)

            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(extracted_dir), \
                "Archive {} was not properly uncompressed.".format(filename)
            for root, _subdirs, files in tqdm(os.walk(extracted_dir)):
                for flac_name in files:
                    if ".flac" in flac_name:
                        _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir,
                                      base_filename=flac_name, root_dir=root)

            print("Finished {}".format(url))
            # Remove the extracted tree so the next archive starts clean.
            shutil.rmtree(extracted_dir)

        manifest_name = 'libri_' + split_type + '_manifest.csv'
        if split_type == 'train':
            # Prune to min/max duration
            create_manifest(split_dir, manifest_name, args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, manifest_name)
def main():
    """Download the AN4 corpus with ``wget``/``tar`` and build its manifests.

    Reads ``an4_path`` from the module-level ``args`` namespace. The archive
    is downloaded and extracted into the current working directory, the
    train and test portions are handed to ``_format_data``, and the
    downloaded archive plus the extracted ``an4/`` tree are removed afterwards.

    Raises:
        subprocess.CalledProcessError: if ``wget`` or ``tar`` exits with a
            non-zero status.
        OSError: if ``args.an4_path`` already exists (``os.makedirs`` is
            called without tolerating an existing directory).
    """
    root_path = 'an4/'
    name = 'an4'
    archive = 'an4_raw.bigendian.tar.gz'

    # Argument lists avoid going through a shell; check_call surfaces a failed
    # download/extraction immediately instead of failing later on missing files.
    subprocess.check_call(
        ['wget', 'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz'])
    # Discard tar's verbose file listing; the handle is closed afterwards.
    with open(os.devnull, 'wb') as devnull:
        subprocess.check_call(['tar', '-xzvf', archive], stdout=devnull)

    os.makedirs(args.an4_path)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')

    # Clean up the raw download once it has been converted.
    shutil.rmtree(root_path)
    os.remove(archive)

    train_path = args.an4_path + '/train/'
    test_path = args.an4_path + '/test/'
    print('Creating manifests...')
    create_manifest(train_path, 'train')
    create_manifest(test_path, 'val')
def main():
    """Download the AN4 corpus, convert it and build train/val manifests.

    Reads ``target_dir``, ``min_duration`` and ``max_duration`` from the
    module-level ``args`` namespace. The archive is downloaded and extracted
    into the current working directory, the train and test portions are
    handed to ``_format_data``, and the downloaded archive plus the extracted
    ``an4/`` tree are removed afterwards.

    Raises:
        OSError: if ``args.target_dir`` already exists (``os.makedirs`` is
            called without tolerating an existing directory).
    """
    root_path = 'an4/'
    name = 'an4'
    archive = 'an4_raw.bigendian.tar.gz'

    wget.download(
        'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    # Close the archive before it is deleted below; the original left the
    # handle open, which leaks it and breaks os.remove on Windows.
    with tarfile.open(archive) as tar:
        tar.extractall()

    os.makedirs(args.target_dir)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')

    # Clean up the raw download once it has been converted.
    shutil.rmtree(root_path)
    os.remove(archive)

    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    print('\n', 'Creating manifests...')
    # Only the training manifest is given the min/max duration bounds.
    create_manifest(train_path, 'an4_train_manifest.csv', args.min_duration, args.max_duration)
    create_manifest(test_path, 'an4_val_manifest.csv')
def main():
    """Fetch, unpack and prepare the TED-LIUM release 2 corpus, then write manifests.

    Reads ``target_dir``, ``tar_path``, ``min_duration`` and ``max_duration``
    from the module-level ``args`` namespace.

    The archive at ``args.tar_path`` is reused when it exists; otherwise
    ``TED_LIUM_V2_DL_URL`` is downloaded into the target directory.
    Extraction is skipped when ``<target_dir>/TEDLIUM_release2`` is already
    present. The ``train``, ``dev`` and ``test`` sub-directories are passed to
    ``prepare_dir`` and then to ``create_manifest``.
    """
    target_dl_dir = args.target_dir
    os.makedirs(target_dl_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release2")
    if args.tar_path and os.path.exists(args.tar_path):
        target_file = args.tar_path
    else:
        print(
            "Could not find downloaded TEDLIUM archive, Downloading corpus...")
        wget.download(TED_LIUM_V2_DL_URL, target_dl_dir)
        # NOTE(review): assumes the download is saved under this exact name.
        target_file = os.path.join(target_dl_dir, "TEDLIUM_release2.tar.gz")

    if not os.path.exists(target_unpacked_dir):
        print("Unpacking corpus...")
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(target_file) as tar:
            tar.extractall(target_dl_dir)
    else:
        print("Found TEDLIUM directory, skipping unpacking of tar files")

    train_ted_dir = os.path.join(target_unpacked_dir, "train")
    val_ted_dir = os.path.join(target_unpacked_dir, "dev")
    test_ted_dir = os.path.join(target_unpacked_dir, "test")

    prepare_dir(train_ted_dir)
    prepare_dir(val_ted_dir)
    prepare_dir(test_ted_dir)

    print('Creating manifests...')
    # Only the training manifest is given the min/max duration bounds.
    create_manifest(train_ted_dir, 'ted_train_manifest.csv', args.min_duration, args.max_duration)
    create_manifest(val_ted_dir, 'ted_val_manifest.csv')
    create_manifest(test_ted_dir, 'ted_test_manifest.csv')