import shutil
from collections import Counter, defaultdict
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Optional

import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torchaudio
from tqdm import tqdm

# The project-level helpers used below (convert_waveform, create_zip,
# get_zip_manifest, gen_vocab, gen_config_yaml, filter_manifest_df,
# save_df_to_tsv, load_tsv_to_dicts, extract_logmel_spectrogram,
# extract_pitch, extract_energy, cal_gcmvn_stats, get_global_cmvn,
# get_mfa_alignment, get_unit_alignment, ipa_phonemize,
# get_feature_value_min_max, _get_kaldi_fbank, _get_torchaudio_fbank)
# and the dataset classes/constants (MUSTC, mTEDx, MANIFEST_COLUMNS) are
# assumed to be provided by the surrounding data-preparation utilities.


def extract_fbank_features(
    waveform: torch.FloatTensor,
    sample_rate: int,
    output_path: Optional[Path] = None,
    n_mel_bins: int = 80,
    overwrite: bool = False,
):
    if output_path is not None and output_path.is_file() and not overwrite:
        return

    # convert_waveform returns (waveform, sample_rate); keep only the waveform
    _waveform, _ = convert_waveform(waveform, sample_rate, to_mono=True)
    # Kaldi compliance: 16-bit signed integers
    _waveform = _waveform * (2 ** 15)
    _waveform = _waveform.numpy()

    # Prefer pyKaldi; fall back to torchaudio if it is unavailable
    features = _get_kaldi_fbank(_waveform, sample_rate, n_mel_bins)
    if features is None:
        features = _get_torchaudio_fbank(_waveform, sample_rate, n_mel_bins)
    if features is None:
        raise ImportError(
            "Please install pyKaldi or torchaudio to enable fbank feature extraction"
        )

    if output_path is not None:
        np.save(output_path.as_posix(), features)
    return features
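
# Hypothetical usage sketch for extract_fbank_features: load one utterance
# with torchaudio and cache its 80-dim filter bank features to disk. The
# file names here are assumptions for illustration, not part of the recipe.
def _example_extract_fbank():
    waveform, sample_rate = torchaudio.load("utt1.wav")  # (channels, samples)
    features = extract_fbank_features(
        waveform, sample_rate, output_path=Path("utt1.npy"), overwrite=True
    )
    # features has shape (num_frames, n_mel_bins); a copy is saved to utt1.npy
    return features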
def prepare_target_data(args, tgt_audios):
    feature_name = "logmelspec80"
    zip_path = args.output_root / f"{feature_name}.zip"
    if zip_path.exists():
        print(f"{zip_path} exists.")
        return zip_path

    feature_root = args.output_root / feature_name
    feature_root.mkdir(exist_ok=True)

    print("Extracting Mel spectrogram features...")
    for tgt_audio in tqdm(tgt_audios):
        sample_id = tgt_audio.stem
        waveform, sample_rate = torchaudio.load(tgt_audio.as_posix())
        waveform, sample_rate = convert_waveform(
            waveform, sample_rate, normalize_volume=args.normalize_volume,
            to_sample_rate=args.sample_rate
        )
        extract_logmel_spectrogram(
            waveform, sample_rate, feature_root / f"{sample_id}.npy",
            win_length=args.win_length, hop_length=args.hop_length,
            n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min,
            f_max=args.f_max
        )

    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    shutil.rmtree(feature_root)

    return zip_path
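
# Hypothetical driver for prepare_target_data: the attributes set on `args`
# below mirror exactly what the function reads and are an assumption, not a
# documented CLI; the input directory layout is likewise illustrative.
def _example_prepare_target_data():
    from argparse import Namespace

    args = Namespace(
        output_root=Path("data/tgt"), normalize_volume=False,
        sample_rate=22050, win_length=1024, hop_length=256,
        n_fft=1024, n_mels=80, f_min=20, f_max=8000,
    )
    args.output_root.mkdir(parents=True, exist_ok=True)
    tgt_audios = sorted(Path("data/tgt_wavs").glob("*.wav"))
    return prepare_target_data(args, tgt_audios)  # path to logmelspec80.zip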
# MuST-C (en-xx) ASR/ST data preparation
def process(args):
    root = Path(args.data_root).absolute()
    for lang in MUSTC.LANGUAGES:
        cur_root = root / f"en-{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)

        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _waveform, _ = convert_waveform(
                        waveform, sample_rate, to_mono=True,
                        to_sample_rate=tgt_sample_rate
                    )
                    sf.write(
                        (audio_root / f"{utt_id}.flac").as_posix(),
                        _waveform.T.numpy(), tgt_sample_rate
                    )
            else:
                print("Extracting log mel filter bank features...")
                gcmvn_feature_list = []
                if split == "train" and args.cmvn_type == "global":
                    print("And estimating cepstral mean and variance stats...")

                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    features = extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy"
                    )
                    if split == "train" and args.cmvn_type == "global":
                        if len(gcmvn_feature_list) < args.gcmvn_max_num:
                            gcmvn_feature_list.append(features)

                if split == "train" and args.cmvn_type == "global":
                    # Estimate and save cmvn stats
                    stats = cal_gcmvn_stats(gcmvn_feature_list)
                    with open(cur_root / "gcmvn.npz", "wb") as f:
                        np.savez(f, mean=stats["mean"], std=stats["std"])

        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(
            zip_path,
            is_audio=args.use_audio_input,
        )
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            dataset = MUSTC(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt
                )
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy=None,
                extra={"use_audio_input": True},
            )
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
                cmvn_type=args.cmvn_type,
                gcmvn_path=(
                    cur_root / "gcmvn.npz" if args.cmvn_type == "global"
                    else None
                ),
            )
        # Clean up
        shutil.rmtree(audio_root)
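
# Hypothetical driver for the MuST-C process() above (a same-named mTEDx
# variant follows; the two come from separate prep scripts). The argument
# set mirrors the attributes it reads and is an assumption, not the
# script's actual argparse definition.
def _example_process_mustc():
    from argparse import Namespace

    args = Namespace(
        data_root="data/mustc", task="st", use_audio_input=False,
        cmvn_type="global", gcmvn_max_num=150_000,
        vocab_type="unigram", vocab_size=8000,
    )
    process(args)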
# Multilingual TEDx (mTEDx) ASR/ST data preparation
def process(args):
    root = Path(args.data_root).absolute()
    for lang in mTEDx.LANGPAIRS:
        cur_root = root / f"{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)

        for split in mTEDx.SPLITS:
            print(f"Fetching split {split}...")
            dataset = mTEDx(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                # the dataset yields 7-tuples, matching the loops below
                for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _waveform, _ = convert_waveform(
                        waveform, sample_rate, to_mono=True,
                        to_sample_rate=tgt_sample_rate
                    )
                    sf.write(
                        (audio_root / f"{utt_id}.flac").as_posix(),
                        _waveform.T.numpy(), tgt_sample_rate
                    )
            else:
                print("Extracting log mel filter bank features...")
                for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset):
                    extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy"
                    )

        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(zip_path)
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in mTEDx.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            ds = mTEDx(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, spk_id, tgt_lang, utt_id in tqdm(ds):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt
                )
                manifest["speaker"].append(spk_id)
                manifest["tgt_lang"].append(tgt_lang)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy=None,
                extra={"use_audio_input": True},
            )
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
            )
        # Clean up
        shutil.rmtree(audio_root)
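
# Hypothetical sanity check for the manifests written above: read one TSV
# back with pandas and confirm the columns, including the tgt_lang field
# that the mTEDx variant adds. The path is an assumption.
def _example_check_mtedx_manifest():
    df = pd.read_csv("data/mtedx/es-en/train_st.tsv", sep="\t")
    expected = {"id", "audio", "n_frames", "tgt_text", "speaker", "tgt_lang"}
    missing = expected - set(df.columns)
    assert not missing, f"manifest is missing columns: {missing}"
    print(df.head())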
# TTS feature/manifest preparation (optionally with FastSpeech targets)
def process(args):
    assert "train" in args.splits
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)

    print("Fetching data...")
    audio_manifest_root = Path(args.audio_manifest_root).absolute()
    samples = []
    for s in args.splits:
        for e in load_tsv_to_dicts(audio_manifest_root / f"{s}.audio.tsv"):
            e["split"] = s
            samples.append(e)
    sample_ids = [s["id"] for s in samples]

    # Get alignment info
    id_to_alignment = None
    if args.textgrid_zip is not None:
        assert args.id_to_units_tsv is None
        id_to_alignment = get_mfa_alignment(
            args.textgrid_zip, sample_ids, args.sample_rate, args.hop_length
        )
    elif args.id_to_units_tsv is not None:
        # assume identical hop length on the unit sequence
        id_to_alignment = get_unit_alignment(args.id_to_units_tsv, sample_ids)

    # Extract features and pack features into ZIP
    feature_name = "logmelspec80"
    zip_path = out_root / f"{feature_name}.zip"
    pitch_zip_path = out_root / "pitch.zip"
    energy_zip_path = out_root / "energy.zip"
    gcmvn_npz_path = out_root / "gcmvn_stats.npz"
    if zip_path.exists() and gcmvn_npz_path.exists():
        print(f"{zip_path} and {gcmvn_npz_path} exist.")
    else:
        feature_root = out_root / feature_name
        feature_root.mkdir(exist_ok=True)
        pitch_root = out_root / "pitch"
        energy_root = out_root / "energy"
        if args.add_fastspeech_targets:
            pitch_root.mkdir(exist_ok=True)
            energy_root.mkdir(exist_ok=True)
        print("Extracting Mel spectrogram features...")
        for sample in tqdm(samples):
            waveform, sample_rate = torchaudio.load(sample["audio"])
            waveform, sample_rate = convert_waveform(
                waveform, sample_rate,
                normalize_volume=args.normalize_volume,
                to_sample_rate=args.sample_rate
            )
            sample_id = sample["id"]
            target_length = None
            if id_to_alignment is not None:
                a = id_to_alignment[sample_id]
                target_length = sum(a.frame_durations)
                if a.start_sec is not None and a.end_sec is not None:
                    start_frame = int(a.start_sec * sample_rate)
                    end_frame = int(a.end_sec * sample_rate)
                    waveform = waveform[:, start_frame:end_frame]
            extract_logmel_spectrogram(
                waveform, sample_rate, feature_root / f"{sample_id}.npy",
                win_length=args.win_length, hop_length=args.hop_length,
                n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min,
                f_max=args.f_max, target_length=target_length
            )
            if args.add_fastspeech_targets:
                assert id_to_alignment is not None
                extract_pitch(
                    waveform, sample_rate, pitch_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
                extract_energy(
                    waveform, energy_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, n_fft=args.n_fft,
                    log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
        print("ZIPing features...")
        create_zip(feature_root, zip_path)
        get_global_cmvn(feature_root, gcmvn_npz_path)
        shutil.rmtree(feature_root)
        if args.add_fastspeech_targets:
            create_zip(pitch_root, pitch_zip_path)
            shutil.rmtree(pitch_root)
            create_zip(energy_root, energy_zip_path)
            shutil.rmtree(energy_root)

    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    pitch_paths, pitch_lengths, energy_paths, energy_lengths = [None] * 4
    if args.add_fastspeech_targets:
        pitch_paths, pitch_lengths = get_zip_manifest(pitch_zip_path)
        energy_paths, energy_lengths = get_zip_manifest(energy_zip_path)

    # Generate TSV manifest
    print("Generating manifest...")
    id_to_cer = None
    if args.cer_threshold is not None:
        assert Path(args.cer_tsv_path).is_file()
        id_to_cer = {
            # uer values come in as strings from the TSV; cast for comparison
            x["id"]: float(x["uer"])
            for x in load_tsv_to_dicts(args.cer_tsv_path)
        }
    manifest_by_split = {split: defaultdict(list) for split in args.splits}
    for sample in tqdm(samples):
        sample_id, split = sample["id"], sample["split"]

        # snr values also come in as strings from the TSV; cast for comparison
        if args.snr_threshold is not None and "snr" in sample \
                and float(sample["snr"]) < args.snr_threshold:
            continue
        if args.cer_threshold is not None \
                and id_to_cer[sample_id] > args.cer_threshold:
            continue

        normalized_utt = sample["tgt_text"]
        if id_to_alignment is not None:
            normalized_utt = " ".join(id_to_alignment[sample_id].tokens)
        elif args.ipa_vocab:
            normalized_utt = ipa_phonemize(
                normalized_utt, lang=args.lang, use_g2p=args.use_g2p
            )
        manifest_by_split[split]["id"].append(sample_id)
        manifest_by_split[split]["audio"].append(audio_paths[sample_id])
        manifest_by_split[split]["n_frames"].append(audio_lengths[sample_id])
        manifest_by_split[split]["tgt_text"].append(normalized_utt)
        manifest_by_split[split]["speaker"].append(sample["speaker"])
        manifest_by_split[split]["src_text"].append(sample["src_text"])
        if args.add_fastspeech_targets:
            assert id_to_alignment is not None
            duration = " ".join(
                str(d) for d in id_to_alignment[sample_id].frame_durations
            )
            manifest_by_split[split]["duration"].append(duration)
            manifest_by_split[split]["pitch"].append(pitch_paths[sample_id])
            manifest_by_split[split]["energy"].append(energy_paths[sample_id])
    for split in args.splits:
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest_by_split[split]),
            out_root / f"{split}.tsv"
        )
    # Generate vocab
    vocab_name, spm_filename = None, None
    if id_to_alignment is not None or args.ipa_vocab:
        vocab = Counter()
        for t in manifest_by_split["train"]["tgt_text"]:
            vocab.update(t.split(" "))
        vocab_name = "vocab.txt"
        with open(out_root / vocab_name, "w") as f:
            for s, c in vocab.most_common():
                f.write(f"{s} {c}\n")
    else:
        spm_filename_prefix = "spm_char"
        spm_filename = f"{spm_filename_prefix}.model"
        with NamedTemporaryFile(mode="w") as f:
            for t in manifest_by_split["train"]["tgt_text"]:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(Path(f.name), out_root / spm_filename_prefix, "char")
    # Generate speaker list
    speakers = sorted({sample["speaker"] for sample in samples})
    speakers_path = out_root / "speakers.txt"
    with open(speakers_path, "w") as f:
        for speaker in speakers:
            f.write(f"{speaker}\n")
    # Generate config YAML
    win_len_t = args.win_length / args.sample_rate
    hop_len_t = args.hop_length / args.sample_rate
    extra = {
        "sample_rate": args.sample_rate,
        "features": {
            "type": "spectrogram+melscale+log",
            "eps": 1e-5,
            "n_mels": args.n_mels,
            "n_fft": args.n_fft,
            "window_fn": "hann",
            "win_length": args.win_length,
            "hop_length": args.hop_length,
            "sample_rate": args.sample_rate,
            "win_len_t": win_len_t,
            "hop_len_t": hop_len_t,
            "f_min": args.f_min,
            "f_max": args.f_max,
            "n_stft": args.n_fft // 2 + 1,
        },
    }
    if len(speakers) > 1:
        extra["speaker_set_filename"] = "speakers.txt"
    if args.add_fastspeech_targets:
        pitch_min, pitch_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in pitch_paths.values()]
        )
        energy_min, energy_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in energy_paths.values()]
        )
        extra["features"]["pitch_min"] = pitch_min
        extra["features"]["pitch_max"] = pitch_max
        extra["features"]["energy_min"] = energy_min
        extra["features"]["energy_max"] = energy_max
    gen_config_yaml(
        out_root,
        spm_filename=spm_filename,
        vocab_name=vocab_name,
        audio_root=out_root.as_posix(),
        input_channels=None,
        input_feat_per_channel=None,
        specaugment_policy=None,
        cmvn_type="global",
        gcmvn_path=gcmvn_npz_path,
        extra=extra,
    )
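
# Hypothetical consistency check for the FastSpeech targets prepared above:
# with alignments, each spectrogram saved under feature_root should have
# exactly sum(frame_durations) frames (run it before the features are zipped
# and feature_root is removed). The frame_durations attribute mirrors the
# alignment accessor used above; everything else is illustrative.
def _example_check_target_length(feature_root, id_to_alignment, sample_id):
    spec = np.load((feature_root / f"{sample_id}.npy").as_posix())
    expected = sum(id_to_alignment[sample_id].frame_durations)
    assert spec.shape[0] == expected, (
        f"{sample_id}: {spec.shape[0]} frames, expected {expected}"
    )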