Code Example #1
File: data_utils.py  Project: sdadas/fairseq
def extract_fbank_features(
    waveform: torch.FloatTensor,
    sample_rate: int,
    output_path: Optional[Path] = None,
    n_mel_bins: int = 80,
    overwrite: bool = False,
):
    if output_path is not None and output_path.is_file() and not overwrite:
        return

    # convert_waveform returns (waveform, sample_rate); keep only the waveform
    _waveform, _ = convert_waveform(waveform, sample_rate, to_mono=True)
    # Kaldi compliance: 16-bit signed integers
    _waveform = _waveform * (2 ** 15)
    _waveform = _waveform.numpy()

    features = _get_kaldi_fbank(_waveform, sample_rate, n_mel_bins)
    if features is None:
        features = _get_torchaudio_fbank(_waveform, sample_rate, n_mel_bins)
    if features is None:
        raise ImportError(
            "Please install pyKaldi or torchaudio to enable fbank feature extraction"
        )

    if output_path is not None:
        np.save(output_path.as_posix(), features)
    return features
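A minimal usage sketch for the function above. The audio file, and the assumption that the helper can be imported from data_utils, are for illustration only:

from pathlib import Path

import torchaudio

from data_utils import extract_fbank_features  # assumed module layout

# Load a clip and extract 80-dim fbank features, caching them as .npy.
waveform, sample_rate = torchaudio.load("sample.wav")  # hypothetical file
features = extract_fbank_features(
    waveform, sample_rate, output_path=Path("sample_fbank.npy")
)
print(features.shape)  # (num_frames, 80)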
Code Example #2
def prepare_target_data(args, tgt_audios):
    feature_name = "logmelspec80"
    zip_path = args.output_root / f"{feature_name}.zip"
    if zip_path.exists():
        print(f"{zip_path} exists.")
        return zip_path

    feature_root = args.output_root / feature_name
    feature_root.mkdir(exist_ok=True)

    print("Extracting Mel spectrogram features...")
    for tgt_audio in tqdm(tgt_audios):
        sample_id = tgt_audio.stem
        waveform, sample_rate = torchaudio.load(tgt_audio.as_posix())
        waveform, sample_rate = convert_waveform(
            waveform,
            sample_rate,
            normalize_volume=args.normalize_volume,
            to_sample_rate=args.sample_rate)
        extract_logmel_spectrogram(waveform,
                                   sample_rate,
                                   feature_root / f"{sample_id}.npy",
                                   win_length=args.win_length,
                                   hop_length=args.hop_length,
                                   n_fft=args.n_fft,
                                   n_mels=args.n_mels,
                                   f_min=args.f_min,
                                   f_max=args.f_max)
    print("ZIPing features...")
    create_zip(feature_root, zip_path)
    shutil.rmtree(feature_root)

    return zip_path
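prepare_target_data only reads a handful of attributes from args, so it can be driven without the original command-line parser. A sketch of a matching namespace; the parameter values below are common log-Mel defaults, not values taken from the original script:

from argparse import Namespace
from pathlib import Path

# Attributes read by prepare_target_data; values are illustrative, not from the source.
args = Namespace(
    output_root=Path("out"),  # must be a Path: the function uses the / operator on it
    normalize_volume=False,
    sample_rate=22050,
    win_length=1024,
    hop_length=256,
    n_fft=1024,
    n_mels=80,
    f_min=20,
    f_max=8000,
)
zip_path = prepare_target_data(args, tgt_audios=sorted(Path("wavs").glob("*.wav")))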
Code Example #3
File: prep_mustc_data.py  Project: kahne/fairseq
def process(args):
    root = Path(args.data_root).absolute()
    for lang in MUSTC.LANGUAGES:
        cur_root = root / f"en-{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)

        for split in MUSTC.SPLITS:
            print(f"Fetching split {split}...")
            dataset = MUSTC(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _waveform, _ = convert_waveform(
                        waveform,
                        sample_rate,
                        to_mono=True,
                        to_sample_rate=tgt_sample_rate)
                    sf.write((audio_root / f"{utt_id}.flac").as_posix(),
                             _waveform.T.numpy(), tgt_sample_rate)
            else:
                print("Extracting log mel filter bank features...")
                gcmvn_feature_list = []
                if split == 'train' and args.cmvn_type == "global":
                    print("And estimating cepstral mean and variance stats...")

                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    features = extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy")
                    if split == 'train' and args.cmvn_type == "global":
                        if len(gcmvn_feature_list) < args.gcmvn_max_num:
                            gcmvn_feature_list.append(features)

                if split == 'train' and args.cmvn_type == "global":
                    # Estimate and save CMVN stats
                    stats = cal_gcmvn_stats(gcmvn_feature_list)
                    with open(cur_root / "gcmvn.npz", "wb") as f:
                        np.savez(f, mean=stats["mean"], std=stats["std"])

        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(
            zip_path,
            is_audio=args.use_audio_input,
        )
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in MUSTC.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            dataset = MUSTC(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(src_utt if args.task ==
                                            "asr" else tgt_utt)
                manifest["speaker"].append(speaker_id)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            f.flush()  # ensure gen_vocab sees the dumped text
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(cur_root,
                            spm_filename=spm_filename_prefix + ".model",
                            yaml_filename=f"config_{args.task}.yaml",
                            specaugment_policy=None,
                            extra={"use_audio_input": True})
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
                cmvn_type=args.cmvn_type,
                gcmvn_path=(cur_root / "gcmvn.npz"
                            if args.cmvn_type == "global" else None),
            )
        # Clean up
        shutil.rmtree(audio_root)
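The excerpt defers the global CMVN math to cal_gcmvn_stats, which is not shown. A sketch of what such a helper plausibly computes, and how the resulting gcmvn.npz would be applied later; this is an assumption about the helper, not the actual fairseq implementation:

import numpy as np

def cal_gcmvn_stats_sketch(feature_list):
    """Plausible stand-in for cal_gcmvn_stats: per-dimension mean/std
    over all frames of the sampled training utterances."""
    stacked = np.concatenate(feature_list, axis=0)  # (total_frames, n_mels)
    return {"mean": stacked.mean(axis=0), "std": stacked.std(axis=0)}

# At training time the saved stats would be applied as global CMVN:
stats = np.load("gcmvn.npz")   # written by process() above
utt = np.load("some_utt.npy")  # hypothetical per-utterance feature file
normalized = (utt - stats["mean"]) / (stats["std"] + 1e-8)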
Code Example #4
File: prep_mtedx_data.py  Project: sdadas/fairseq
def process(args):
    root = Path(args.data_root).absolute()
    for lang in mTEDx.LANGPAIRS:
        cur_root = root / f"{lang}"
        if not cur_root.is_dir():
            print(f"{cur_root.as_posix()} does not exist. Skipped.")
            continue
        # Extract features
        audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80")
        audio_root.mkdir(exist_ok=True)
        for split in mTEDx.SPLITS:
            print(f"Fetching split {split}...")
            dataset = mTEDx(root.as_posix(), lang, split)
            if args.use_audio_input:
                print("Converting audios...")
                for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset):
                    tgt_sample_rate = 16_000
                    _waveform, _ = convert_waveform(
                        waveform, sample_rate, to_mono=True,
                        to_sample_rate=tgt_sample_rate
                    )
                    # soundfile expects (frames, channels), so transpose
                    sf.write(
                        (audio_root / f"{utt_id}.flac").as_posix(),
                        _waveform.T.numpy(), tgt_sample_rate
                    )
            else:
                print("Extracting log mel filter bank features...")
                for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset):
                    extract_fbank_features(
                        waveform, sample_rate, audio_root / f"{utt_id}.npy"
                    )
        # Pack features into ZIP
        zip_path = cur_root / f"{audio_root.name}.zip"
        print("ZIPing audios/features...")
        create_zip(audio_root, zip_path)
        print("Fetching ZIP manifest...")
        audio_paths, audio_lengths = get_zip_manifest(zip_path)
        # Generate TSV manifest
        print("Generating manifest...")
        train_text = []
        for split in mTEDx.SPLITS:
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}
            ds = mTEDx(args.data_root, lang, split)
            for _, _, src_utt, tgt_utt, spk_id, tgt_lang, utt_id in tqdm(ds):
                manifest["id"].append(utt_id)
                manifest["audio"].append(audio_paths[utt_id])
                manifest["n_frames"].append(audio_lengths[utt_id])
                manifest["tgt_text"].append(
                    src_utt if args.task == "asr" else tgt_utt
                )
                manifest["speaker"].append(spk_id)
                manifest["tgt_lang"].append(tgt_lang)
            if is_train_split:
                train_text.extend(manifest["tgt_text"])
            df = pd.DataFrame.from_dict(manifest)
            df = filter_manifest_df(df, is_train_split=is_train_split)
            save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv")
        # Generate vocab
        v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
        spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}"
        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:
                f.write(t + "\n")
            f.flush()  # ensure gen_vocab sees the dumped text
            gen_vocab(
                Path(f.name),
                cur_root / spm_filename_prefix,
                args.vocab_type,
                args.vocab_size,
            )
        # Generate config YAML
        if args.use_audio_input:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy=None,
                extra={"use_audio_input": True}
            )
        else:
            gen_config_yaml(
                cur_root,
                spm_filename=spm_filename_prefix + ".model",
                yaml_filename=f"config_{args.task}.yaml",
                specaugment_policy="lb",
            )
        # Clean up
        shutil.rmtree(audio_root)
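Examples #3 and #4 share the same argparse-driven entry point (not shown in the excerpts). A sketch of the flags process reads, inferred from the args.* attributes above; option names mirror those attributes, while choices and defaults are illustrative assumptions (the CMVN flags are only used by the MuST-C script):

import argparse

def get_parser_sketch():
    # Inferred from the args.* accesses in process(); defaults are guesses.
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-root", required=True)
    parser.add_argument("--task", choices=["asr", "st"], default="st")
    parser.add_argument("--vocab-type", choices=["char", "bpe", "unigram"],
                        default="unigram")
    parser.add_argument("--vocab-size", type=int, default=8000)
    parser.add_argument("--use-audio-input", action="store_true")
    parser.add_argument("--cmvn-type", choices=["utterance", "global"],
                        default="utterance")  # MuST-C script only
    parser.add_argument("--gcmvn-max-num", type=int, default=150000)
    return parser

if __name__ == "__main__":
    process(get_parser_sketch().parse_args())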
Code Example #5
def process(args):
    assert "train" in args.splits
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)

    print("Fetching data...")
    audio_manifest_root = Path(args.audio_manifest_root).absolute()
    samples = []
    for s in args.splits:
        for e in load_tsv_to_dicts(audio_manifest_root / f"{s}.audio.tsv"):
            e["split"] = s
            samples.append(e)
    sample_ids = [s["id"] for s in samples]

    # Get alignment info
    id_to_alignment = None
    if args.textgrid_zip is not None:
        assert args.id_to_units_tsv is None
        id_to_alignment = get_mfa_alignment(
            args.textgrid_zip, sample_ids, args.sample_rate, args.hop_length
        )
    elif args.id_to_units_tsv is not None:
        # assume identical hop length on the unit sequence
        id_to_alignment = get_unit_alignment(args.id_to_units_tsv, sample_ids)

    # Extract features and pack features into ZIP
    feature_name = "logmelspec80"
    zip_path = out_root / f"{feature_name}.zip"
    pitch_zip_path = out_root / "pitch.zip"
    energy_zip_path = out_root / "energy.zip"
    gcmvn_npz_path = out_root / "gcmvn_stats.npz"
    if zip_path.exists() and gcmvn_npz_path.exists():
        print(f"{zip_path} and {gcmvn_npz_path} exist.")
    else:
        feature_root = out_root / feature_name
        feature_root.mkdir(exist_ok=True)
        pitch_root = out_root / "pitch"
        energy_root = out_root / "energy"
        if args.add_fastspeech_targets:
            pitch_root.mkdir(exist_ok=True)
            energy_root.mkdir(exist_ok=True)
        print("Extracting Mel spectrogram features...")
        for sample in tqdm(samples):
            waveform, sample_rate = torchaudio.load(sample["audio"])
            waveform, sample_rate = convert_waveform(
                waveform, sample_rate, normalize_volume=args.normalize_volume,
                to_sample_rate=args.sample_rate
            )
            sample_id = sample["id"]
            target_length = None
            if id_to_alignment is not None:
                a = id_to_alignment[sample_id]
                target_length = sum(a.frame_durations)
                if a.start_sec is not None and a.end_sec is not None:
                    start_frame = int(a.start_sec * sample_rate)
                    end_frame = int(a.end_sec * sample_rate)
                    waveform = waveform[:, start_frame: end_frame]
            extract_logmel_spectrogram(
                waveform, sample_rate, feature_root / f"{sample_id}.npy",
                win_length=args.win_length, hop_length=args.hop_length,
                n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min,
                f_max=args.f_max, target_length=target_length
            )
            if args.add_fastspeech_targets:
                assert id_to_alignment is not None
                extract_pitch(
                    waveform, sample_rate, pitch_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
                extract_energy(
                    waveform, energy_root / f"{sample_id}.npy",
                    hop_length=args.hop_length, n_fft=args.n_fft,
                    log_scale=True,
                    phoneme_durations=id_to_alignment[sample_id].frame_durations
                )
        print("ZIPing features...")
        create_zip(feature_root, zip_path)
        get_global_cmvn(feature_root, gcmvn_npz_path)
        shutil.rmtree(feature_root)
        if args.add_fastspeech_targets:
            create_zip(pitch_root, pitch_zip_path)
            shutil.rmtree(pitch_root)
            create_zip(energy_root, energy_zip_path)
            shutil.rmtree(energy_root)

    print("Fetching ZIP manifest...")
    audio_paths, audio_lengths = get_zip_manifest(zip_path)
    pitch_paths, pitch_lengths, energy_paths, energy_lengths = [None] * 4
    if args.add_fastspeech_targets:
        pitch_paths, pitch_lengths = get_zip_manifest(pitch_zip_path)
        energy_paths, energy_lengths = get_zip_manifest(energy_zip_path)
    # Generate TSV manifest
    print("Generating manifest...")
    id_to_cer = None
    if args.cer_threshold is not None:
        assert Path(args.cer_tsv_path).is_file()
        id_to_cer = {
            # TSV values load as strings; cast so the threshold comparison works
            x["id"]: float(x["uer"]) for x in load_tsv_to_dicts(args.cer_tsv_path)
        }
    manifest_by_split = {split: defaultdict(list) for split in args.splits}
    for sample in tqdm(samples):
        sample_id, split = sample["id"], sample["split"]

        if args.snr_threshold is not None and "snr" in sample \
                and sample["snr"] < args.snr_threshold:
            continue
        if args.cer_threshold is not None \
                and id_to_cer[sample_id] > args.cer_threshold:
            continue

        normalized_utt = sample["tgt_text"]
        if id_to_alignment is not None:
            normalized_utt = " ".join(id_to_alignment[sample_id].tokens)
        elif args.ipa_vocab:
            normalized_utt = ipa_phonemize(
                normalized_utt, lang=args.lang, use_g2p=args.use_g2p
            )
        manifest_by_split[split]["id"].append(sample_id)
        manifest_by_split[split]["audio"].append(audio_paths[sample_id])
        manifest_by_split[split]["n_frames"].append(audio_lengths[sample_id])
        manifest_by_split[split]["tgt_text"].append(normalized_utt)
        manifest_by_split[split]["speaker"].append(sample["speaker"])
        manifest_by_split[split]["src_text"].append(sample["src_text"])
        if args.add_fastspeech_targets:
            assert id_to_alignment is not None
            duration = " ".join(
                str(d) for d in id_to_alignment[sample_id].frame_durations
            )
            manifest_by_split[split]["duration"].append(duration)
            manifest_by_split[split]["pitch"].append(pitch_paths[sample_id])
            manifest_by_split[split]["energy"].append(energy_paths[sample_id])
    for split in args.splits:
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest_by_split[split]),
            out_root / f"{split}.tsv"
        )
    # Generate vocab
    vocab_name, spm_filename = None, None
    if id_to_alignment is not None or args.ipa_vocab:
        vocab = Counter()
        for t in manifest_by_split["train"]["tgt_text"]:
            vocab.update(t.split(" "))
        vocab_name = "vocab.txt"
        with open(out_root / vocab_name, "w") as f:
            for s, c in vocab.most_common():
                f.write(f"{s} {c}\n")
    else:
        spm_filename_prefix = "spm_char"
        spm_filename = f"{spm_filename_prefix}.model"
        with NamedTemporaryFile(mode="w") as f:
            for t in manifest_by_split["train"]["tgt_text"]:
                f.write(t + "\n")
            f.flush()  # needed to ensure gen_vocab sees dumped text
            gen_vocab(Path(f.name), out_root / spm_filename_prefix, "char")
    # Generate speaker list
    speakers = sorted({sample["speaker"] for sample in samples})
    speakers_path = out_root / "speakers.txt"
    with open(speakers_path, "w") as f:
        for speaker in speakers:
            f.write(f"{speaker}\n")
    # Generate config YAML
    win_len_t = args.win_length / args.sample_rate
    hop_len_t = args.hop_length / args.sample_rate
    extra = {
        "sample_rate": args.sample_rate,
        "features": {
            "type": "spectrogram+melscale+log",
            "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft,
            "window_fn": "hann", "win_length": args.win_length,
            "hop_length": args.hop_length, "sample_rate": args.sample_rate,
            "win_len_t": win_len_t, "hop_len_t": hop_len_t,
            "f_min": args.f_min, "f_max": args.f_max,
            "n_stft": args.n_fft // 2 + 1
        }
    }
    if len(speakers) > 1:
        extra["speaker_set_filename"] = "speakers.txt"
    if args.add_fastspeech_targets:
        pitch_min, pitch_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in pitch_paths.values()]
        )
        energy_min, energy_max = get_feature_value_min_max(
            [(out_root / n).as_posix() for n in energy_paths.values()]
        )
        extra["features"]["pitch_min"] = pitch_min
        extra["features"]["pitch_max"] = pitch_max
        extra["features"]["energy_min"] = energy_min
        extra["features"]["energy_max"] = energy_max
    gen_config_yaml(
        out_root, spm_filename=spm_filename, vocab_name=vocab_name,
        audio_root=out_root.as_posix(), input_channels=None,
        input_feat_per_channel=None, specaugment_policy=None,
        cmvn_type="global", gcmvn_path=gcmvn_npz_path, extra=extra
    )
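Example #5 reads many more attributes from args than the other scripts. A sketch of a complete namespace collecting every attribute the function accesses; the values are illustrative TTS-style defaults, not taken from the original script:

from argparse import Namespace

# Every attribute process() reads above; values are illustrative only.
args = Namespace(
    output_root="out", audio_manifest_root="manifests",
    splits=["train", "dev", "test"],
    # signal / feature parameters
    sample_rate=22050, win_length=1024, hop_length=256, n_fft=1024,
    n_mels=80, f_min=20, f_max=8000, normalize_volume=False,
    # alignment / FastSpeech targets
    textgrid_zip=None, id_to_units_tsv=None, add_fastspeech_targets=False,
    # vocabulary options
    ipa_vocab=False, use_g2p=False, lang="en",
    # filtering thresholds
    snr_threshold=None, cer_threshold=None, cer_tsv_path=None,
)
process(args)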