def process(args):
    """Prepare CoVoST ASR/ST data: extract fbank features, pack them into a
    ZIP, write per-split TSV manifests, train a SentencePiece vocab, and emit
    a config YAML.

    Args:
        args: parsed CLI namespace with data_root, src_lang, tgt_lang
            (None for ASR), vocab_type and vocab_size.
    """
    root = op.join(args.data_root, args.src_lang)
    os.makedirs(root, exist_ok=True)
    # Extract features
    feature_root = op.join(root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in CoVoST.SPLITS:
        print(f'Fetching split {split}...')
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang,
                         download=True)
        print('Extracting log mel filter bank features...')
        for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
            extract_fbank_features(waveform, sample_rate,
                                   op.join(feature_root, f'{utt_id}.npy'))
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.data_root,
                                    f'{args.src_lang}/{zip_filename}')
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    # ASR task when no target language is given, ST otherwise.
    task = f'asr_{args.src_lang}'
    if args.tgt_lang is not None:
        task = f'st_{args.src_lang}_{args.tgt_lang}'
    for split in CoVoST.SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
            manifest['id'].append(utt_id)
            manifest['audio'].append(zip_manifest[utt_id])
            duration_ms = int(wav.size(1) / sr * 1000)
            # Frame count for a 25 ms window with a 10 ms shift.
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(
                src_utt if args.tgt_lang is None else tgt_utt
            )
            manifest['speaker'].append(speaker_id)
        is_train_split = split.startswith('train')
        if is_train_split:
            train_text.extend(manifest['tgt_text'])
        df = pd.DataFrame.from_dict(manifest)
        df = filter_manifest_df(df, is_train_split=is_train_split)
        save_df_to_tsv(df, op.join(root, f'{split}_{task}.tsv'))
    # Generate vocab
    vocab_size_str = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size_str}_{task}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        # FIX: flush buffered writes before gen_vocab reopens the file by
        # name, otherwise the tail of the training text is lost.
        f.flush()
        gen_vocab(f.name, op.join(root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(root, spm_filename_prefix + '.model',
                    yaml_filename=f'config_{task}.yaml',
                    specaugment_policy='lb')
    # Clean up the raw .npy features; the ZIP is the canonical copy now.
    shutil.rmtree(feature_root)
def process(args):
    """Prepare CoVoST ASR/ST data (pathlib variant): extract fbank features,
    zip them, write TSV manifests, train a SentencePiece vocab, and generate
    the config YAML.

    Args:
        args: parsed CLI namespace with data_root, src_lang, tgt_lang
            (None for ASR), vocab_type and vocab_size.

    Raises:
        NotADirectoryError: if the per-language data directory is missing.
    """
    root = Path(args.data_root).absolute() / args.src_lang
    if not root.is_dir():
        raise NotADirectoryError(f"{root} does not exist")
    # Extract features
    feature_root = root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in CoVoST.SPLITS:
        print(f"Fetching split {split}...")
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        print("Extracting log mel filter bank features...")
        for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
            extract_fbank_features(waveform, sample_rate,
                                   feature_root / f"{utt_id}.npy")
    # Pack features into ZIP
    zip_path = root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    # ASR task when no target language is given, ST otherwise.
    task = f"asr_{args.src_lang}"
    if args.tgt_lang is not None:
        task = f"st_{args.src_lang}_{args.tgt_lang}"
    for split in CoVoST.SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
            manifest["id"].append(utt_id)
            manifest["audio"].append(zip_manifest[utt_id])
            duration_ms = int(wav.size(1) / sr * 1000)
            # Frame count for a 25 ms window with a 10 ms shift.
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(
                src_utt if args.tgt_lang is None else tgt_utt)
            manifest["speaker"].append(speaker_id)
        is_train_split = split.startswith("train")
        if is_train_split:
            train_text.extend(manifest["tgt_text"])
        df = pd.DataFrame.from_dict(manifest)
        df = filter_manifest_df(df, is_train_split=is_train_split)
        save_df_to_tsv(df, root / f"{split}_{task}.tsv")
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{task}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        # FIX: flush buffered writes before gen_vocab reopens the file by
        # name, otherwise the tail of the training text is lost.
        f.flush()
        gen_vocab(Path(f.name), root / spm_filename_prefix,
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(
        root,
        spm_filename_prefix + ".model",
        yaml_filename=f"config_{task}.yaml",
        specaugment_policy="lb",
    )
    # Clean up the raw .npy features; the ZIP is the canonical copy now.
    shutil.rmtree(feature_root)
def process(args): out_root = Path(args.output_root).absolute() out_root.mkdir(exist_ok=True) # Extract features feature_root = out_root / "fbank80" feature_root.mkdir(exist_ok=True) for split in SPLITS: print(f"Fetching split {split}...") dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True) print("Extracting log mel filter bank features...") for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset): sample_id = f"{spk_id}-{chapter_no}-{utt_no}" extract_fbank_features( wav, sample_rate, feature_root / f"{sample_id}.npy" ) # Pack features into ZIP zip_path = out_root / "fbank80.zip" print("ZIPing features...") create_zip(feature_root, zip_path) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest(zip_path) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in SPLITS: manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = LIBRISPEECH(out_root.as_posix(), url=split) for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset): sample_id = f"{spk_id}-{chapter_no}-{utt_no}" manifest["id"].append(sample_id) manifest["audio"].append(audio_paths[sample_id]) manifest["n_frames"].append(audio_lengths[sample_id]) manifest["tgt_text"].append(utt.lower()) manifest["speaker"].append(spk_id) save_df_to_tsv( pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv" ) if split.startswith("train"): train_text.extend(manifest["tgt_text"]) # Generate vocab vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( Path(f.name), out_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML gen_config_yaml( out_root, spm_filename=spm_filename_prefix + ".model", specaugment_policy="ld" ) # Clean up shutil.rmtree(feature_root)
def process(args): os.makedirs(args.output_root, exist_ok=True) # Extract features feature_root = op.join(args.output_root, "fbank80") os.makedirs(feature_root, exist_ok=True) for split in SPLITS: print(f"Fetching split {split}...") dataset = LIBRISPEECH(args.output_root, url=split, download=True) print("Extracting log mel filter bank features...") for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset): sample_id = f"{spk_id}-{chapter_id}-{utt_id}" extract_fbank_features(wav, sample_rate, op.join(feature_root, f"{sample_id}.npy")) # Pack features into ZIP zip_filename = "fbank80.zip" zip_path = op.join(args.output_root, zip_filename) print("ZIPing features...") create_zip(feature_root, zip_path) print("Fetching ZIP manifest...") zip_manifest = get_zip_manifest(args.output_root, zip_filename) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in SPLITS: manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = LIBRISPEECH(args.output_root, url=split) for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset): sample_id = f"{spk_id}-{chapter_id}-{utt_id}" manifest["id"].append(sample_id) manifest["audio"].append(zip_manifest[sample_id]) duration_ms = int(wav.size(1) / sample_rate * 1000) manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10)) manifest["tgt_text"].append(utt) manifest["speaker"].append(spk_id) save_df_to_tsv(pd.DataFrame.from_dict(manifest), op.join(args.output_root, f"{split}.tsv")) if split.startswith("train"): train_text.extend(manifest["tgt_text"]) # Generate vocab vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( f.name, op.join(args.output_root, spm_filename_prefix), args.vocab_type, args.vocab_size, ) # Generate config YAML gen_config_yaml(args.output_root, spm_filename_prefix + ".model", specaugment_policy="ld") # 
Clean up shutil.rmtree(feature_root)
def manifest_preparation(manifest, track, data, tgt_text, track_path): waveform, sample_rate = torchaudio.load(track_path) utt_id = data[1].removesuffix(".flac") extract_fbank_features(waveform, sample_rate, feature_root / utt_id + ".npy") manifest["id"].append(utt_id) manifest["audio"].append(feature_root / utt_id + ".npy") duration_ms = track.duration_seconds * ms manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10)) manifest["tgt_text"].append(tgt_text) manifest["speaker"].append(data[0])
def process(args):
    """Prepare LibriSpeech ASR data (single-quote variant): download splits,
    extract fbank features, zip them, write TSV manifests, train a
    SentencePiece vocab, and generate a config YAML.

    Args:
        args: parsed CLI namespace with output_root, vocab_type, vocab_size.
    """
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f'Fetching split {split}...')
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print('Extracting log mel filter bank features...')
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            extract_fbank_features(wav, sample_rate,
                                   op.join(feature_root, f'{sample_id}.npy'))
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(args.output_root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            manifest['id'].append(sample_id)
            manifest['audio'].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            # Frame count for a 25 ms window with a 10 ms shift.
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(utt)
            manifest['speaker'].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f'{split}.tsv'))
        if split.startswith('train'):
            train_text.extend(manifest['tgt_text'])
    # Generate vocab
    vocab_size = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        # FIX: flush buffered writes before gen_vocab reopens the file by
        # name, otherwise the tail of the training text is lost.
        f.flush()
        gen_vocab(f.name, op.join(args.output_root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(args.output_root, spm_filename_prefix + '.model',
                    specaugment_policy='ld')
    # Clean up the raw .npy features; the ZIP is the canonical copy now.
    shutil.rmtree(feature_root)
def add_data_to_manifest(manifest, track_path_segment, data, counter, track_segment, tr): waveform, sample_rate = torchaudio.load(track_path_segment) feature_root = Path( "../speech_translation/data/sound").absolute() / "fbank" utt_id = data[0] + "_" + str(counter) extract_fbank_features(waveform, sample_rate, feature_root / f"{utt_id}.npy") manifest["id"].append(utt_id) manifest["audio"].append(feature_root / f"{utt_id}.npy") duration_ms = track_segment.duration_seconds * ms manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10)) manifest["tgt_text"].append(tr) manifest["speaker"].append(21918)
def process(args): for lang in MUSTC.LANGUAGES: cur_root = op.join(args.data_root, f"en-{lang}") if not op.isdir(cur_root): print(f"{cur_root} does not exist. Skipped.") continue # Extract features feature_root = op.join(cur_root, "fbank80") os.makedirs(feature_root, exist_ok=True) for split in MUSTC.SPLITS: print(f"Fetching split {split}...") dataset = MUSTC(args.data_root, lang, split) print("Extracting log mel filter bank features...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): extract_fbank_features(waveform, sample_rate, op.join(feature_root, f"{utt_id}.npy")) # Pack features into ZIP zip_filename = "fbank80.zip" zip_path = op.join(cur_root, zip_filename) print("ZIPing features...") create_zip(feature_root, zip_path) print("Fetching ZIP manifest...") zip_manifest = get_zip_manifest(args.data_root, f"en-{lang}/{zip_filename}") # Generate TSV manifest print("Generating manifest...") train_text = [] for split in MUSTC.SPLITS: is_train_split = split.startswith("train") manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = MUSTC(args.data_root, lang, split) for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset): manifest["id"].append(utt_id) manifest["audio"].append(zip_manifest[utt_id]) duration_ms = int(wav.size(1) / sr * 1000) manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10)) manifest["tgt_text"].append(src_utt if args.task == "asr" else tgt_utt) manifest["speaker"].append(speaker_id) if is_train_split: train_text.extend(manifest["tgt_text"]) df = pd.DataFrame.from_dict(manifest) df = filter_manifest_df(df, is_train_split=is_train_split) save_df_to_tsv(df, op.join(cur_root, f"{split}_{args.task}.tsv")) # Generate vocab v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( f.name, op.join(cur_root, spm_filename_prefix), args.vocab_type, 
args.vocab_size, ) # Generate config YAML gen_config_yaml( cur_root, spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy="lb", ) # Clean up shutil.rmtree(feature_root)
def process(args): root = Path(args.data_root).absolute() for lang in MUSTC.LANGUAGES: cur_root = root / f"en-{lang}" if not cur_root.is_dir(): print(f"{cur_root.as_posix()} does not exist. Skipped.") continue # Extract features audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80") audio_root.mkdir(exist_ok=True) for split in MUSTC.SPLITS: print(f"Fetching split {split}...") dataset = MUSTC(root.as_posix(), lang, split) if args.use_audio_input: print("Converting audios...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): tgt_sample_rate = 16_000 _wavform, _ = convert_waveform( waveform, sample_rate, to_mono=True, to_sample_rate=tgt_sample_rate) sf.write((audio_root / f"{utt_id}.flac").as_posix(), _wavform.T.numpy(), tgt_sample_rate) else: print("Extracting log mel filter bank features...") gcmvn_feature_list = [] if split == 'train' and args.cmvn_type == "global": print("And estimating cepstral mean and variance stats...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): features = extract_fbank_features( waveform, sample_rate, audio_root / f"{utt_id}.npy") if split == 'train' and args.cmvn_type == "global": if len(gcmvn_feature_list) < args.gcmvn_max_num: gcmvn_feature_list.append(features) if split == 'train' and args.cmvn_type == "global": # Estimate and save cmv stats = cal_gcmvn_stats(gcmvn_feature_list) with open(cur_root / "gcmvn.npz", "wb") as f: np.savez(f, mean=stats["mean"], std=stats["std"]) # Pack features into ZIP zip_path = cur_root / f"{audio_root.name}.zip" print("ZIPing audios/features...") create_zip(audio_root, zip_path) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest( zip_path, is_audio=args.use_audio_input, ) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in MUSTC.SPLITS: is_train_split = split.startswith("train") manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = MUSTC(args.data_root, lang, split) for _, _, src_utt, tgt_utt, 
speaker_id, utt_id in tqdm(dataset): manifest["id"].append(utt_id) manifest["audio"].append(audio_paths[utt_id]) manifest["n_frames"].append(audio_lengths[utt_id]) manifest["tgt_text"].append(src_utt if args.task == "asr" else tgt_utt) manifest["speaker"].append(speaker_id) if is_train_split: train_text.extend(manifest["tgt_text"]) df = pd.DataFrame.from_dict(manifest) df = filter_manifest_df(df, is_train_split=is_train_split) save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv") # Generate vocab v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( Path(f.name), cur_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML if args.use_audio_input: gen_config_yaml(cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy=None, extra={"use_audio_input": True}) else: gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy="lb", cmvn_type=args.cmvn_type, gcmvn_path=(cur_root / "gcmvn.npz" if args.cmvn_type == "global" else None), ) # Clean up shutil.rmtree(audio_root)
def process(args): root = Path(args.data_root).absolute() for lang in mTEDx.LANGPAIRS: cur_root = root / f"{lang}" if not cur_root.is_dir(): print(f"{cur_root.as_posix()} does not exist. Skipped.") continue # Extract features audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80") audio_root.mkdir(exist_ok=True) for split in mTEDx.SPLITS: print(f"Fetching split {split}...") dataset = mTEDx(root.as_posix(), lang, split) if args.use_audio_input: print("Converting audios...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): tgt_sample_rate = 16_000 _wavform, _ = convert_waveform( waveform, sample_rate, to_mono=True, to_sample_rate=tgt_sample_rate ) sf.write( (audio_root / f"{utt_id}.flac").as_posix(), _wavform.numpy(), tgt_sample_rate ) else: print("Extracting log mel filter bank features...") for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset): extract_fbank_features( waveform, sample_rate, audio_root / f"{utt_id}.npy" ) # Pack features into ZIP zip_path = cur_root / f"{audio_root.name}.zip" print("ZIPing audios/features...") create_zip(audio_root, zip_path) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest(zip_path) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in mTEDx.SPLITS: is_train_split = split.startswith("train") manifest = {c: [] for c in MANIFEST_COLUMNS} ds = mTEDx(args.data_root, lang, split) for _, _, src_utt, tgt_utt, spk_id, tgt_lang, utt_id in tqdm(ds): manifest["id"].append(utt_id) manifest["audio"].append(audio_paths[utt_id]) manifest["n_frames"].append(audio_lengths[utt_id]) manifest["tgt_text"].append( src_utt if args.task == "asr" else tgt_utt ) manifest["speaker"].append(spk_id) manifest["tgt_lang"].append(tgt_lang) if is_train_split: train_text.extend(manifest["tgt_text"]) df = pd.DataFrame.from_dict(manifest) df = filter_manifest_df(df, is_train_split=is_train_split) save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv") # Generate vocab 
v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( Path(f.name), cur_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML if args.use_audio_input: gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy=None, extra={"use_audio_input": True} ) else: gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy="lb", ) # Clean up shutil.rmtree(audio_root)
def process(args):
    """Prepare MUST-C data for BOTH the ASR and ST tasks in one pass:
    extract fbank features once, zip them, then write per-split TSV
    manifests and train a separate SentencePiece vocab + config YAML for
    each task in TASKS.

    Args:
        args: parsed CLI namespace with data_root and per-task vocab
            settings (asr_vocab_type/size, st_vocab_type/size).
    """
    for lang in MUSTC.LANGUAGES:
        cur_root = op.join(args.data_root, f'en-{lang}')
        if not op.isdir(cur_root):
            print(f'{cur_root} does not exist. Skipped.')
            continue
        # Extract features
        feature_root = op.join(cur_root, 'fbank80')
        os.makedirs(feature_root, exist_ok=True)
        for split in MUSTC.SPLITS:
            print(f'Fetching split {split}...')
            dataset = MUSTC(args.data_root, lang, split)
            print('Extracting log mel filter bank features...')
            for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                extract_fbank_features(waveform, sample_rate,
                                       op.join(feature_root, f'{utt_id}.npy'))
        # Pack features into ZIP
        zip_filename = 'fbank80.zip'
        zip_path = op.join(cur_root, zip_filename)
        print('ZIPing features...')
        create_zip(feature_root, zip_path)
        print('Fetching ZIP manifest...')
        zip_manifest = get_zip_manifest(args.data_root,
                                        f'en-{lang}/{zip_filename}')
        # Generate TSV manifest
        print('Generating manifest...')
        train_text = {task: [] for task in TASKS}
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith('train')
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            text = {task: [] for task in TASKS}
            dataset = MUSTC(args.data_root, lang, split)
            for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest['id'].append(utt_id)
                manifest['audio'].append(zip_manifest[utt_id])
                duration_ms = int(wav.size(1) / sr * 1000)
                # Frame count for a 25 ms window with a 10 ms shift.
                manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
                # Collect both target texts; the shared columns are reused
                # per task below.
                text['asr'].append(src_utt)
                text['st'].append(tgt_utt)
                manifest['speaker'].append(speaker_id)
            if is_train_split:
                for task in TASKS:
                    train_text[task].extend(text[task])
            for task in TASKS:
                manifest['tgt_text'] = text[task]
                df = pd.DataFrame.from_dict(manifest)
                df = filter_manifest_df(df, is_train_split=is_train_split)
                save_df_to_tsv(df, op.join(cur_root, f'{split}_{task}.tsv'))
        # Generate vocab
        for task in TASKS:
            vocab_type, vocab_size = args.asr_vocab_type, args.asr_vocab_size
            if task == 'st':
                vocab_type, vocab_size = args.st_vocab_type, args.st_vocab_size
            vocab_size_str = '' if vocab_type == 'char' else str(vocab_size)
            spm_filename_prefix = f'spm_{vocab_type}{vocab_size_str}_{task}'
            with NamedTemporaryFile(mode='w') as f:
                for t in train_text[task]:
                    f.write(t + '\n')
                # FIX: flush buffered writes before gen_vocab reopens the
                # file by name, otherwise the tail of the text is lost.
                f.flush()
                gen_vocab(f.name, op.join(cur_root, spm_filename_prefix),
                          vocab_type, vocab_size)
            # Generate config YAML
            gen_config_yaml(cur_root, spm_filename_prefix + '.model',
                            yaml_filename=f'config_{task}.yaml',
                            specaugment_policy='lb')
        # Clean up the raw .npy features; the ZIP is the canonical copy now.
        shutil.rmtree(feature_root)