def process(args):
    """Prepare CoVoST ASR/ST data under ``<data_root>/<src_lang>``.

    Pipeline: extract 80-dim log-Mel fbank features, pack them into a ZIP,
    write per-split TSV manifests, train a SentencePiece vocab on the train
    text, and emit a task config YAML.

    Raises:
        NotADirectoryError: if the language directory does not exist.
    """
    root = Path(args.data_root).absolute() / args.src_lang
    if not root.is_dir():
        raise NotADirectoryError(f"{root} does not exist")
    # Extract features
    feature_root = root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in CoVoST.SPLITS:
        print(f"Fetching split {split}...")
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        print("Extracting log mel filter bank features...")
        for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
            extract_fbank_features(waveform, sample_rate,
                                   feature_root / f"{utt_id}.npy")
    # Pack features into ZIP
    zip_path = root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    task = f"asr_{args.src_lang}"
    if args.tgt_lang is not None:
        task = f"st_{args.src_lang}_{args.tgt_lang}"
    for split in CoVoST.SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
            manifest["id"].append(utt_id)
            manifest["audio"].append(zip_manifest[utt_id])
            duration_ms = int(wav.size(1) / sr * 1000)
            # 25 ms window / 10 ms shift -> number of fbank frames
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(
                src_utt if args.tgt_lang is None else tgt_utt)
            manifest["speaker"].append(speaker_id)
        is_train_split = split.startswith("train")
        if is_train_split:
            train_text.extend(manifest["tgt_text"])
        df = pd.DataFrame.from_dict(manifest)
        df = filter_manifest_df(df, is_train_split=is_train_split)
        save_df_to_tsv(df, root / f"{split}_{task}.tsv")
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{task}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        # Flush buffered writes: gen_vocab re-opens the file by path, so
        # unflushed data would be invisible to it.
        f.flush()
        gen_vocab(Path(f.name), root / spm_filename_prefix,
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(
        root,
        spm_filename_prefix + ".model",
        yaml_filename=f"config_{task}.yaml",
        specaugment_policy="lb",
    )
    # Clean up
    shutil.rmtree(feature_root)
def process(args):
    """Prepare CoVoST ASR/ST data (os.path variant, with dataset download).

    Extracts fbank features, zips them, writes per-split TSV manifests,
    trains an SPM vocab on train text, and emits a config YAML.
    """
    root = op.join(args.data_root, args.src_lang)
    os.makedirs(root, exist_ok=True)
    # Extract features
    feature_root = op.join(root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in CoVoST.SPLITS:
        print(f'Fetching split {split}...')
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang,
                         download=True)
        print('Extracting log mel filter bank features...')
        for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
            extract_fbank_features(waveform, sample_rate,
                                   op.join(feature_root, f'{utt_id}.npy'))
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.data_root,
                                    f'{args.src_lang}/{zip_filename}')
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    task = f'asr_{args.src_lang}'
    if args.tgt_lang is not None:
        task = f'st_{args.src_lang}_{args.tgt_lang}'
    for split in CoVoST.SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = CoVoST(root, split, args.src_lang, args.tgt_lang)
        for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
            manifest['id'].append(utt_id)
            manifest['audio'].append(zip_manifest[utt_id])
            duration_ms = int(wav.size(1) / sr * 1000)
            # 25 ms window / 10 ms shift -> number of fbank frames
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(
                src_utt if args.tgt_lang is None else tgt_utt
            )
            manifest['speaker'].append(speaker_id)
        is_train_split = split.startswith('train')
        if is_train_split:
            train_text.extend(manifest['tgt_text'])
        df = pd.DataFrame.from_dict(manifest)
        df = filter_manifest_df(df, is_train_split=is_train_split)
        save_df_to_tsv(df, op.join(root, f'{split}_{task}.tsv'))
    # Generate vocab
    vocab_size_str = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size_str}_{task}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        # Flush buffered writes: gen_vocab re-opens the file by path.
        f.flush()
        gen_vocab(f.name, op.join(root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(root, spm_filename_prefix + '.model',
                    yaml_filename=f'config_{task}.yaml',
                    specaugment_policy='lb')
    # Clean up
    shutil.rmtree(feature_root)
def process_joint(args):
    """Build a joint vocab/config over all MUST-C language pairs.

    Requires the per-language manifests from the per-pair preparation to
    already exist. Trains one SPM model on the concatenated train text,
    writes a shared config YAML, and symlinks per-pair split TSVs into the
    data root.
    """
    assert all(
        op.isdir(op.join(args.data_root, f"en-{lang}"))
        for lang in MUSTC.LANGUAGES
    ), "do not have downloaded data available for all 8 languages"
    cur_root = args.data_root
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}"
    with NamedTemporaryFile(mode="w") as f:
        for lang in MUSTC.LANGUAGES:
            tsv_path = op.join(cur_root, f"en-{lang}",
                               f"train_{args.task}.tsv")
            df = load_df_from_tsv(tsv_path)
            for t in df["tgt_text"]:
                f.write(t + "\n")
        # Flush buffered writes: gen_vocab re-opens the file by path.
        f.flush()
        gen_vocab(
            f.name,
            op.join(cur_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        cur_root,
        spm_filename_prefix + ".model",
        yaml_filename=f"config_{args.task}.yaml",
        specaugment_policy="lb",
        prepend_tgt_lang_tag=(args.task == "st"),
    )
    # Make symbolic links to manifests
    for lang in MUSTC.LANGUAGES:
        for split in MUSTC.SPLITS:
            src_path = op.join(cur_root, f"en-{lang}",
                               f"{split}_{args.task}.tsv")
            desc_path = op.join(cur_root, f"{split}_{lang}_{args.task}.tsv")
            if not op.islink(desc_path):
                os.symlink(src_path, desc_path)
def process(args):
    """Prepare LibriSpeech ASR data (pathlib variant).

    Downloads splits via torchaudio, extracts fbank features, zips them,
    writes per-split TSV manifests with lower-cased transcripts, trains an
    SPM vocab on train text, and emits a config YAML.
    """
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
    # Extract features
    feature_root = out_root / "fbank80"
    feature_root.mkdir(exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            extract_fbank_features(
                wav, sample_rate, feature_root / f"{sample_id}.npy"
            )
    # Pack features into ZIP
    zip_path = out_root / "fbank80.zip"
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(out_root.as_posix(), url=split)
        for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(audio_paths[sample_id])
            manifest["n_frames"].append(audio_lengths[sample_id])
            manifest["tgt_text"].append(utt.lower())
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv"
        )
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        # Flush buffered writes: gen_vocab re-opens the file by path.
        f.flush()
        gen_vocab(
            Path(f.name), out_root / spm_filename_prefix,
            args.vocab_type, args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(
        out_root,
        spm_filename=spm_filename_prefix + ".model",
        specaugment_policy="ld"
    )
    # Clean up
    shutil.rmtree(feature_root)
def process(args):
    """Prepare LibriSpeech ASR data (os.path variant, duration-derived
    frame counts).

    Downloads splits, extracts fbank features, zips them, writes per-split
    TSV manifests, trains an SPM vocab, and emits a config YAML.
    """
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, "fbank80")
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f"Fetching split {split}...")
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print("Extracting log mel filter bank features...")
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            extract_fbank_features(
                wav, sample_rate, op.join(feature_root, f"{sample_id}.npy")
            )
    # Pack features into ZIP
    zip_filename = "fbank80.zip"
    zip_path = op.join(args.output_root, zip_filename)
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    print("Fetching ZIP manifest...")
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
            manifest["id"].append(sample_id)
            manifest["audio"].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            # 25 ms window / 10 ms shift -> number of fbank frames
            manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
            manifest["tgt_text"].append(utt)
            manifest["speaker"].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f"{split}.tsv"))
        if split.startswith("train"):
            train_text.extend(manifest["tgt_text"])
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f:
        for t in train_text:
            f.write(t + "\n")
        # Flush buffered writes: gen_vocab re-opens the file by path.
        f.flush()
        gen_vocab(
            f.name,
            op.join(args.output_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(args.output_root, spm_filename_prefix + ".model",
                    specaugment_policy="ld")
    # Clean up
    shutil.rmtree(feature_root)
def process(args):
    """Prepare LibriSpeech ASR data (single-quote os.path variant).

    Downloads splits, extracts fbank features, zips them, writes per-split
    TSV manifests, trains an SPM vocab, and emits a config YAML.
    """
    os.makedirs(args.output_root, exist_ok=True)
    # Extract features
    feature_root = op.join(args.output_root, 'fbank80')
    os.makedirs(feature_root, exist_ok=True)
    for split in SPLITS:
        print(f'Fetching split {split}...')
        dataset = LIBRISPEECH(args.output_root, url=split, download=True)
        print('Extracting log mel filter bank features...')
        for wav, sample_rate, _, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            extract_fbank_features(
                wav, sample_rate, op.join(feature_root, f'{sample_id}.npy')
            )
    # Pack features into ZIP
    zip_filename = 'fbank80.zip'
    zip_path = op.join(args.output_root, zip_filename)
    print('ZIPing features...')
    create_zip(feature_root, zip_path)
    print('Fetching ZIP manifest...')
    zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print('Generating manifest...')
    train_text = []
    for split in SPLITS:
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        dataset = LIBRISPEECH(args.output_root, url=split)
        for wav, sample_rate, utt, spk_id, chapter_id, utt_id in tqdm(dataset):
            sample_id = f'{spk_id}-{chapter_id}-{utt_id}'
            manifest['id'].append(sample_id)
            manifest['audio'].append(zip_manifest[sample_id])
            duration_ms = int(wav.size(1) / sample_rate * 1000)
            # 25 ms window / 10 ms shift -> number of fbank frames
            manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
            manifest['tgt_text'].append(utt)
            manifest['speaker'].append(spk_id)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, f'{split}.tsv'))
        if split.startswith('train'):
            train_text.extend(manifest['tgt_text'])
    # Generate vocab
    vocab_size = '' if args.vocab_type == 'char' else str(args.vocab_size)
    spm_filename_prefix = f'spm_{args.vocab_type}{vocab_size}'
    with NamedTemporaryFile(mode='w') as f:
        for t in train_text:
            f.write(t + '\n')
        # Flush buffered writes: gen_vocab re-opens the file by path.
        f.flush()
        gen_vocab(f.name, op.join(args.output_root, spm_filename_prefix),
                  args.vocab_type, args.vocab_size)
    # Generate config YAML
    gen_config_yaml(args.output_root, spm_filename_prefix + '.model',
                    specaugment_policy='ld')
    # Clean up
    shutil.rmtree(feature_root)
def process_joint(args):
    """Build a joint vocab/config over all mTEDx language pairs.

    Trains one SPM model on the concatenated per-pair train text (optionally
    adding <lang:xx> tags as special symbols for joint multilingual models),
    writes a shared config YAML, and symlinks per-pair split TSVs into the
    data root.
    """
    cur_root = Path(args.data_root)
    assert all((cur_root / f"{lang}").is_dir() for lang in mTEDx.LANGPAIRS), \
        "do not have downloaded data available for all languages"
    # Generate vocab
    vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}"
    with NamedTemporaryFile(mode="w") as f:
        for lang in mTEDx.LANGPAIRS:
            tsv_path = cur_root / f"{lang}" / f"train_{args.task}.tsv"
            df = load_df_from_tsv(tsv_path)
            for t in df["tgt_text"]:
                f.write(t + "\n")
        # Flush buffered writes: gen_vocab re-opens the file by path.
        f.flush()
        special_symbols = None
        if args.joint:
            # Add tgt_lang tags to dict
            special_symbols = list(
                {f'<lang:{lang.split("-")[1]}>' for lang in mTEDx.LANGPAIRS}
            )
        gen_vocab(
            Path(f.name),
            cur_root / spm_filename_prefix,
            args.vocab_type,
            args.vocab_size,
            special_symbols=special_symbols
        )
    # Generate config YAML
    gen_config_yaml(
        cur_root,
        spm_filename=spm_filename_prefix + ".model",
        yaml_filename=f"config_{args.task}.yaml",
        specaugment_policy="ld",
        prepend_tgt_lang_tag=(args.joint),
    )
    # Make symbolic links to manifests
    for lang in mTEDx.LANGPAIRS:
        for split in mTEDx.SPLITS:
            src_path = cur_root / f"{lang}" / f"{split}_{args.task}.tsv"
            desc_path = cur_root / f"{split}_{lang}_{args.task}.tsv"
            if not desc_path.is_symlink():
                os.symlink(src_path, desc_path)
def process(args):
    """Prepare MUST-C ASR/ST data, one en-<lang> pair at a time.

    For each available pair: extract fbank features, zip them, write
    per-split TSV manifests (src text for ASR, tgt text for ST), train an
    SPM vocab on train text, and emit a config YAML. Pairs whose directory
    is missing are skipped.
    """
    for lang in MUSTC.LANGUAGES:
        cur_root = op.join(args.data_root, f"en-{lang}")
        if not op.isdir(cur_root):
            print(f"{cur_root} does not exist. Skipped.")
            continue
        # Extract features
        feature_root = op.join(cur_root, "fbank80")
        os.makedirs(feature_root, exist_ok=True)
        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(args.data_root, lang, split)
            print("Extracting log mel filter bank features...")
            for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                extract_fbank_features(
                    waveform, sample_rate,
                    op.join(feature_root, f"{utt_id}.npy")
                )
        # Pack features into ZIP
        zip_filename = "fbank80.zip"
        zip_path = op.join(cur_root, zip_filename)
        print("ZIPing features...")
        create_zip(feature_root, zip_path)
        print("Fetching ZIP manifest...")
        zip_manifest = get_zip_manifest(args.data_root,
                                        f"en-{lang}/{zip_filename}")
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            dataset = MUSTC(args.data_root, lang, split)
            for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(zip_manifest[utt_id])
                duration_ms = int(wav.size(1) / sr * 1000)
                # 25 ms window / 10 ms shift -> number of fbank frames
                manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt)
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, op.join(cur_root, f"{split}_{args.task}.tsv"))
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            # Flush buffered writes: gen_vocab re-opens the file by path.
            f.flush()
            gen_vocab(
                f.name,
                op.join(cur_root, spm_filename_prefix),
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        gen_config_yaml(
            cur_root,
            spm_filename_prefix + ".model",
            yaml_filename=f"config_{args.task}.yaml",
            specaugment_policy="lb",
        )
        # Clean up
        shutil.rmtree(feature_root)
def process(args):
    """Prepare MUST-C data with either raw-audio (flac) or fbank features.

    For each en-<lang> pair: convert audio to mono 16 kHz flac, or extract
    fbank features (optionally estimating global CMVN stats on the train
    split); zip the results, write per-split TSV manifests, train an SPM
    vocab, and emit a config YAML. Missing pairs are skipped.
    """
    root = Path(args.data_root).absolute()
    for lang in MUSTC.LANGUAGES:
        cur_root = root / f"en-{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)
        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _wavform, _ = convert_waveform(
                        waveform, sample_rate, to_mono=True,
                        to_sample_rate=tgt_sample_rate)
                    # .T: soundfile expects (frames, channels)
                    sf.write((audio_root / f"{utt_id}.flac").as_posix(),
                             _wavform.T.numpy(), tgt_sample_rate)
            else:
                print("Extracting log mel filter bank features...")
                gcmvn_feature_list = []
                if split == 'train' and args.cmvn_type == "global":
                    print("And estimating cepstral mean and variance stats...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    features = extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy")
                    if split == 'train' and args.cmvn_type == "global":
                        # Cap the number of utterances used for CMVN stats
                        if len(gcmvn_feature_list) < args.gcmvn_max_num:
                            gcmvn_feature_list.append(features)
                if split == 'train' and args.cmvn_type == "global":
                    # Estimate and save cmv
                    stats = cal_gcmvn_stats(gcmvn_feature_list)
                    with open(cur_root / "gcmvn.npz", "wb") as f:
                        np.savez(f, mean=stats["mean"], std=stats["std"])
        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(
            zip_path,
            is_audio=args.use_audio_input,
        )
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            dataset = MUSTC(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt)
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            # Flush buffered writes: gen_vocab re-opens the file by path.
            f.flush()
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy=None,
                extra={"use_audio_input": True})
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
                cmvn_type=args.cmvn_type,
                gcmvn_path=(cur_root / "gcmvn.npz"
                            if args.cmvn_type == "global" else None),
            )
        # Clean up
        shutil.rmtree(audio_root)
def process(args):
    """Build manifests/vocab/config from pre-extracted fbank features.

    This version assumes the fbank features were extracted beforehand,
    e.g. for data augmentation.

    Arguments:
        args.feature_root: a list containing the paths of extracted fbanks
            for zipping (None to reuse an existing zip)
        args.info_dict: a dict with *split* as key and file path as *value*
    """
    os.makedirs(args.output_root, exist_ok=True)
    if args.feature_root is None:
        # Do not create new zip files
        assert args.path_fbankzip_root is not None, \
            'Please provide zipped filter banks'
        print('Load zipfile')
        zip_manifest = get_zip_manifest(args.path_fbankzip_root, 'fbank80.zip')
    else:
        zip_filename = "fbank80.zip"
        zip_path = op.join(args.output_root, zip_filename)
        print("ZIPing features...")
        # Allow fbanks to be saved over different dirs. but are gathered
        # for one zip file
        create_zip_list(args.feature_root, zip_path)
        print("Fetching ZIP manifest...")
        zip_manifest = get_zip_manifest(args.output_root, zip_filename)
    # Generate TSV manifest
    print("Generating manifest...")
    train_text = []
    # Take the info file for each split and generate the .tsv files
    # info file has 3 columns:
    # 1) n_frames, 2) utterance id, and 3) transcription
    for split, info_path in args.info_dict.items():
        manifest = {c: [] for c in MANIFEST_COLUMNS}
        with open(info_path, "r") as fin:
            for idx, l in enumerate(fin):
                line = l.strip().split()
                # Note: the utt_id here has no extra zeros for padding
                n_frames, uid, tp, tmin, tmax, total_time = line[0], line[1], \
                    line[2:-3], line[-3], line[-2], line[-1]
                _uid = uid.split("-")
                if args.augment:
                    spk_id, chapter_id, utt_id, aug_id = _uid[0], _uid[1], \
                        _uid[2], _uid[3]
                    sample_id = f"{spk_id}-{chapter_id}-{utt_id}-{aug_id}"
                else:
                    spk_id, chapter_id, utt_id = _uid[0], _uid[1], _uid[2]
                    sample_id = f"{spk_id}-{chapter_id}-{utt_id}"
                manifest["id"].append(sample_id)
                manifest["audio"].append(zip_manifest[sample_id])
                manifest["n_frames"].append(n_frames)
                manifest["tgt_text"].append(" ".join(tp))
                manifest["speaker"].append(spk_id)
                if split.startswith('train'):
                    # NOTE(review): alignment columns are only filled for
                    # train splits — assumes MANIFEST_COLUMNS declares them
                    # and non-train frames tolerate the empty lists; confirm.
                    manifest["align_time_min"].append(tmin)
                    manifest["align_time_max"].append(tmax)
                    manifest["total_time"].append(total_time)
        save_df_to_tsv(pd.DataFrame.from_dict(manifest),
                       op.join(args.output_root, "{}.tsv".format(split)))
        if split.startswith('train'):
            print(f'Add {split} to train_text')
            train_text.extend(manifest["tgt_text"])
    print("length of train_text: {}".format(len(train_text)))
    # Generate vocab
    vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size)
    spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}"
    with NamedTemporaryFile(mode="w") as f_tmp:
        for t in train_text:
            f_tmp.write(t + "\n")
        # Flush buffered writes: gen_vocab re-opens the file by path.
        f_tmp.flush()
        gen_vocab(
            f_tmp.name,
            op.join(args.output_root, spm_filename_prefix),
            args.vocab_type,
            args.vocab_size,
        )
    # Generate config YAML
    gen_config_yaml(args.output_root, spm_filename_prefix + ".model",
                    specaugment_policy="ld")
def process(args):
    """Prepare mTEDx data with either raw-audio (flac) or fbank features.

    For each language pair: convert audio or extract fbank features, zip
    them, write per-split TSV manifests (including a tgt_lang column),
    train an SPM vocab, and emit a config YAML. Missing pairs are skipped.
    """
    root = Path(args.data_root).absolute()
    for lang in mTEDx.LANGPAIRS:
        cur_root = root / f"{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)
        for split in mTEDx.SPLITS:
            print(f"Fetching split {split}...")
            dataset = mTEDx(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _wavform, _ = convert_waveform(
                        waveform, sample_rate, to_mono=True,
                        to_sample_rate=tgt_sample_rate
                    )
                    # NOTE(review): the MUST-C variant writes
                    # _wavform.T.numpy() (soundfile wants (frames, channels));
                    # verify this untransposed write is intentional.
                    sf.write(
                        (audio_root / f"{utt_id}.flac").as_posix(),
                        _wavform.numpy(), tgt_sample_rate
                    )
            else:
                print("Extracting log mel filter bank features...")
                for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset):
                    extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy"
                    )
        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(zip_path)
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in mTEDx.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            ds = mTEDx(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, spk_id, tgt_lang, utt_id in tqdm(ds):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt
                )
                manifest["speaker"].append(spk_id)
                manifest["tgt_lang"].append(tgt_lang)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            # Flush buffered writes: gen_vocab re-opens the file by path.
            f.flush()
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy=None,
                extra={"use_audio_input": True}
            )
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
            )
        # Clean up
        shutil.rmtree(audio_root)
def process(args):
    """Prepare TTS training data: log-Mel spectrograms (plus optional
    FastSpeech pitch/energy/duration targets), per-split manifests, a
    vocab (word-count file or char SPM), a speaker list, and a config YAML.

    Fixes: ``args.cer_threhold`` typo — it raised AttributeError whenever
    ``--cer-threshold`` was supplied.
    """
    assert "train" in args.splits
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
    print("Fetching data...")
    audio_manifest_root = Path(args.audio_manifest_root).absolute()
    samples = []
    for s in args.splits:
        for e in load_tsv_to_dicts(audio_manifest_root / f"{s}.audio.tsv"):
            e["split"] = s
            samples.append(e)
    sample_ids = [s["id"] for s in samples]
    # Get alignment info
    id_to_alignment = None
    if args.textgrid_zip is not None:
        assert args.id_to_units_tsv is None
        id_to_alignment = get_mfa_alignment(
            args.textgrid_zip, sample_ids, args.sample_rate, args.hop_length
        )
    elif args.id_to_units_tsv is not None:
        # assume identical hop length on the unit sequence
        id_to_alignment = get_unit_alignment(args.id_to_units_tsv, sample_ids)
    # Extract features and pack features into ZIP
    feature_name = "logmelspec80"
    zip_path = out_root / f"{feature_name}.zip"
    pitch_zip_path = out_root / "pitch.zip"
    energy_zip_path = out_root / "energy.zip"
    gcmvn_npz_path = out_root / "gcmvn_stats.npz"
    if zip_path.exists() and gcmvn_npz_path.exists():
        # Feature extraction is resumable: skip if outputs already exist.
        print(f"{zip_path} and {gcmvn_npz_path} exist.")
    else:
        feature_root = out_root / feature_name
        feature_root.mkdir(exist_ok=True)
        pitch_root = out_root / "pitch"
        energy_root = out_root / "energy"
        if args.add_fastspeech_targets:
            pitch_root.mkdir(exist_ok=True)
            energy_root.mkdir(exist_ok=True)
        print("Extracting Mel spectrogram features...")
        for sample in tqdm(samples):
            waveform, sample_rate = torchaudio.load(sample["audio"])
            waveform, sample_rate = convert_waveform(
                waveform, sample_rate, normalize_volume=args.normalize_volume,
                to_sample_rate=args.sample_rate
            )
            sample_id = sample["id"]
            target_length = None
            if id_to_alignment is not None:
                a = id_to_alignment[sample_id]
                target_length = sum(a.frame_durations)
                if a.start_sec is not None and a.end_sec is not None:
                    # Trim the waveform to the aligned segment
                    start_frame = int(a.start_sec * sample_rate)
                    end_frame = int(a.end_sec * sample_rate)
                    waveform = waveform[:, start_frame: end_frame]
            extract_logmel_spectrogram(
                waveform, sample_rate, feature_root / f"{sample_id}.npy",
                win_length=args.win_length, hop_length=args.hop_length,
                n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min,
                f_max=args.f_max, target_length=target_length
            )
            if args.add_fastspeech_targets:
                assert id_to_alignment is not None
                extract_pitch(
                    waveform, sample_rate, pitch_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
                extract_energy(
                    waveform, energy_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, n_fft=args.n_fft,
                    log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
        print("ZIPing features...")
        create_zip(feature_root, zip_path)
        get_global_cmvn(feature_root, gcmvn_npz_path)
        shutil.rmtree(feature_root)
        if args.add_fastspeech_targets:
            create_zip(pitch_root, pitch_zip_path)
            shutil.rmtree(pitch_root)
            create_zip(energy_root, energy_zip_path)
            shutil.rmtree(energy_root)
    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    pitch_paths, pitch_lengths, energy_paths, energy_lengths = [None] * 4
    if args.add_fastspeech_targets:
        pitch_paths, pitch_lengths = get_zip_manifest(pitch_zip_path)
        energy_paths, energy_lengths = get_zip_manifest(energy_zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    id_to_cer = None
    if args.cer_threshold is not None:
        assert Path(args.cer_tsv_path).is_file()
        id_to_cer = {
            x["id"]: x["uer"] for x in load_tsv_to_dicts(args.cer_tsv_path)
        }
    manifest_by_split = {split: defaultdict(list) for split in args.splits}
    for sample in tqdm(samples):
        sample_id, split = sample["id"], sample["split"]
        # Optional filtering by SNR and by ASR character error rate
        if args.snr_threshold is not None and "snr" in sample \
                and sample["snr"] < args.snr_threshold:
            continue
        # Fixed: was `args.cer_threhold` (AttributeError at runtime).
        # NOTE(review): id_to_cer values come from a TSV and may be strings;
        # confirm load_tsv_to_dicts yields numerics for this comparison.
        if args.cer_threshold is not None \
                and id_to_cer[sample_id] > args.cer_threshold:
            continue
        normalized_utt = sample["tgt_text"]
        if id_to_alignment is not None:
            normalized_utt = " ".join(id_to_alignment[sample_id].tokens)
        elif args.ipa_vocab:
            normalized_utt = ipa_phonemize(
                normalized_utt, lang=args.lang, use_g2p=args.use_g2p
            )
        manifest_by_split[split]["id"].append(sample_id)
        manifest_by_split[split]["audio"].append(audio_paths[sample_id])
        manifest_by_split[split]["n_frames"].append(audio_lengths[sample_id])
        manifest_by_split[split]["tgt_text"].append(normalized_utt)
        manifest_by_split[split]["speaker"].append(sample["speaker"])
        manifest_by_split[split]["src_text"].append(sample["src_text"])
        if args.add_fastspeech_targets:
            assert id_to_alignment is not None
            duration = " ".join(
                str(d) for d in id_to_alignment[sample_id].frame_durations
            )
            manifest_by_split[split]["duration"].append(duration)
            manifest_by_split[split]["pitch"].append(pitch_paths[sample_id])
            manifest_by_split[split]["energy"].append(energy_paths[sample_id])
    for split in args.splits:
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest_by_split[split]),
            out_root / f"{split}.tsv"
        )
    # Generate vocab
    vocab_name, spm_filename = None, None
    if id_to_alignment is not None or args.ipa_vocab:
        # Token-level vocab: plain word-count file over train targets
        vocab = Counter()
        for t in manifest_by_split["train"]["tgt_text"]:
            vocab.update(t.split(" "))
        vocab_name = "vocab.txt"
        with open(out_root / vocab_name, "w") as f:
            for s, c in vocab.most_common():
                f.write(f"{s} {c}\n")
    else:
        # Character-level SPM model
        spm_filename_prefix = "spm_char"
        spm_filename = f"{spm_filename_prefix}.model"
        with NamedTemporaryFile(mode="w") as f:
            for t in manifest_by_split["train"]["tgt_text"]:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(Path(f.name), out_root / spm_filename_prefix, "char")
    # Generate speaker list
    speakers = sorted({sample["speaker"] for sample in samples})
    speakers_path = out_root / "speakers.txt"
    with open(speakers_path, "w") as f:
        for speaker in speakers:
            f.write(f"{speaker}\n")
    # Generate config YAML
    win_len_t = args.win_length / args.sample_rate
    hop_len_t = args.hop_length / args.sample_rate
    extra = {
        "sample_rate": args.sample_rate,
        "features": {
            "type": "spectrogram+melscale+log",
            "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft,
            "window_fn": "hann", "win_length": args.win_length,
            "hop_length": args.hop_length, "sample_rate": args.sample_rate,
            "win_len_t": win_len_t, "hop_len_t": hop_len_t,
            "f_min": args.f_min, "f_max": args.f_max,
            "n_stft": args.n_fft // 2 + 1
        }
    }
    if len(speakers) > 1:
        extra["speaker_set_filename"] = "speakers.txt"
    if args.add_fastspeech_targets:
        pitch_min, pitch_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in pitch_paths.values()]
        )
        energy_min, energy_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in energy_paths.values()]
        )
        extra["features"]["pitch_min"] = pitch_min
        extra["features"]["pitch_max"] = pitch_max
        extra["features"]["energy_min"] = energy_min
        extra["features"]["energy_max"] = energy_max
    gen_config_yaml(
        out_root, spm_filename=spm_filename, vocab_name=vocab_name,
        audio_root=out_root.as_posix(), input_channels=None,
        input_feat_per_channel=None, specaugment_policy=None,
        cmvn_type="global", gcmvn_path=gcmvn_npz_path, extra=extra
    )
def gen_voc(train_text, spm_filename_prefix):
    """Append the first four tokens of each training line to a shared text
    file and train an SPM vocab from the accumulated file.

    Fixes: the file was opened without a context manager and never
    closed/flushed, so gen_vocab could read a truncated file and the handle
    leaked.
    """
    out_path = Path("../data/sound").absolute() / "test.txt"
    with open(out_path, "a") as f:
        for t in train_text:
            f.write(" ".join(t.split()[0:4]) + "\n")
        print(f.name)
    # File is closed (and fully flushed) before SPM reads it by path.
    gen_vocab(Path(out_path), Path("../data/sound") / spm_filename_prefix)
def process(args):
    """Prepare MUST-C data for both ASR and ST in one pass per pair.

    For each available en-<lang> pair: extract fbank features once, then
    write per-split TSV manifests, train a per-task SPM vocab, and emit a
    per-task config YAML for every task in TASKS.
    """
    for lang in MUSTC.LANGUAGES:
        cur_root = op.join(args.data_root, f'en-{lang}')
        if not op.isdir(cur_root):
            print(f'{cur_root} does not exist. Skipped.')
            continue
        # Extract features
        feature_root = op.join(cur_root, 'fbank80')
        os.makedirs(feature_root, exist_ok=True)
        for split in MUSTC.SPLITS:
            print(f'Fetching split {split}...')
            dataset = MUSTC(args.data_root, lang, split)
            print('Extracting log mel filter bank features...')
            for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                extract_fbank_features(
                    waveform, sample_rate,
                    op.join(feature_root, f'{utt_id}.npy')
                )
        # Pack features into ZIP
        zip_filename = 'fbank80.zip'
        zip_path = op.join(cur_root, zip_filename)
        print('ZIPing features...')
        create_zip(feature_root, zip_path)
        print('Fetching ZIP manifest...')
        zip_manifest = get_zip_manifest(args.data_root,
                                        f'en-{lang}/{zip_filename}')
        # Generate TSV manifest
        print('Generating manifest...')
        train_text = {task: [] for task in TASKS}
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith('train')
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            text = {task: [] for task in TASKS}
            dataset = MUSTC(args.data_root, lang, split)
            for wav, sr, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest['id'].append(utt_id)
                manifest['audio'].append(zip_manifest[utt_id])
                duration_ms = int(wav.size(1) / sr * 1000)
                # 25 ms window / 10 ms shift -> number of fbank frames
                manifest['n_frames'].append(int(1 + (duration_ms - 25) / 10))
                text['asr'].append(src_utt)
                text['st'].append(tgt_utt)
                manifest['speaker'].append(speaker_id)
            if is_train_split:
                for task in TASKS:
                    train_text[task].extend(text[task])
            for task in TASKS:
                # Same manifest rows, task-specific target text
                manifest['tgt_text'] = text[task]
                df = pd.DataFrame.from_dict(manifest)
                df = filter_manifest_df(df, is_train_split=is_train_split)
                save_df_to_tsv(df, op.join(cur_root, f'{split}_{task}.tsv'))
        # Generate vocab
        for task in TASKS:
            vocab_type, vocab_size = args.asr_vocab_type, args.asr_vocab_size
            if task == 'st':
                vocab_type, vocab_size = args.st_vocab_type, args.st_vocab_size
            vocab_size_str = '' if vocab_type == 'char' else str(vocab_size)
            spm_filename_prefix = f'spm_{vocab_type}{vocab_size_str}_{task}'
            with NamedTemporaryFile(mode='w') as f:
                for t in train_text[task]:
                    f.write(t + '\n')
                # Flush buffered writes: gen_vocab re-opens the file by path.
                f.flush()
                gen_vocab(f.name, op.join(cur_root, spm_filename_prefix),
                          vocab_type, vocab_size)
            # Generate config YAML
            gen_config_yaml(cur_root, spm_filename_prefix + '.model',
                            yaml_filename=f'config_{task}.yaml',
                            specaugment_policy='lb')
        # Clean up
        shutil.rmtree(feature_root)