Esempio n. 1
0
def infer_hifigan(args):
    """Run the third-party HiFi-GAN inference script on a folder of wavs.

    An executable copy of ``thirdparty/hifi-gan/inference.py`` with a
    ``#!/usr/bin/env python`` line prepended is written next to it and
    launched as a subprocess; the child's stdout is echoed line by line.

    Args:
        args: Namespace providing ``model_name`` (``'hifigan'`` is mapped to
            ``'hifigan_v1'``) and ``folder_in`` (passed to the script as
            ``--input_wavs_dir``).

    Results are written by the script to ``data/out/<model_name>``; the
    child's exit status is not checked.
    """
    import subprocess

    # The return value is not used here; presumably this call makes the
    # checkpoint available on disk -- TODO confirm against load_model.
    load_model(args.model_name)
    model_name = args.model_name if args.model_name != 'hifigan' else 'hifigan_v1'

    inference_file = "thirdparty/hifi-gan/inference.py"
    exe_inference_file = "thirdparty/hifi-gan/exe_inference.py"

    # Prepend the shebang in Python (bytes in, bytes out) instead of shelling
    # out to GNU-specific ``sed '1 i ...'``, whose failures went unnoticed.
    with open(inference_file, "rb") as src:
        script = src.read()
    with open(exe_inference_file, "wb") as dst:
        dst.write(b"#!/usr/bin/env python\n" + script)

    # Permission bits must be given in octal: decimal 777 is 0o1411, which
    # leaves the owner without the execute bit.
    os.chmod(exe_inference_file, 0o755)

    args_list = [
        "--input_wavs_dir",
        args.folder_in,
        "--output_dir",
        f"data/out/{model_name}",
        "--checkpoint_file",
        f"pretrained/{model_name}/model.pth",
    ]
    cmd = [exe_inference_file] + args_list

    # The context manager closes the pipe and waits for the child to exit.
    with subprocess.Popen(cmd,
                          stdout=subprocess.PIPE,
                          universal_newlines=True) as popen:
        for stdout_line in popen.stdout:
            print(stdout_line, end='')
Esempio n. 2
0
def infer_wavenet(args):
    """Resynthesize every wav in ``args.folder_in`` with a WaveNet vocoder.

    For each file whose name ends with ``'wav'`` the audio is loaded at
    22050 Hz, converted with ``MelSpectrogram``, passed to ``wavegen`` and
    saved under ``args.folder_out`` with the same file name, using
    ``hparams.sample_rate``.

    Args:
        args: Namespace providing ``model_name``, ``folder_in`` and
            ``folder_out``.
    """
    import sys
    sys.path.append('thirdparty/wavenet_vocoder')

    # Imported from the directory appended above. ``build_model`` is not
    # referenced below; the import is kept as it was.
    from train import build_model
    from synthesis import wavegen
    from tqdm import tqdm

    input_sample_rate = 22050
    hparams, model = load_model(args.model_name)
    mel_transform = MelSpectrogram()

    wav_names = list(
        filter(lambda name: name.endswith('wav'), os.listdir(args.folder_in)))
    for wav_name in wav_names:
        source = load_wav(os.path.join(args.folder_in, wav_name),
                          input_sample_rate)
        features = mel_transform(source)[0]
        # Swap the two axes when the second one is not the mel axis.
        if features.shape[1] != hparams.num_mels:
            features = features.transpose(0, 1)
        # NOTE: range [0, 4] was used for training Tacotron2 but the WaveNet
        # vocoder assumes [0, 1]; no rescaling is applied here.

        generated = wavegen(model, c=features, fast=True, tqdm=tqdm)

        out_path = os.path.join(args.folder_out, wav_name)
        out_dir = os.path.dirname(out_path)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        torchaudio.save(out_path, generated, hparams.sample_rate)
Esempio n. 3
0
def infer_melgan(args):
    """Resynthesize every wav in ``args.folder_in`` with a MelGAN model.

    Each file whose name ends with ``'wav'`` is loaded at 22050 Hz, passed
    through the model and its ``inverse`` method with gradients disabled,
    and saved under ``args.folder_out`` with the same file name.

    Args:
        args: Namespace providing ``model_name``, ``folder_in`` and
            ``folder_out``.
    """
    sample_rate = 22050
    vocoder = load_model(args.model_name)
    wav_names = list(
        filter(lambda name: name.endswith('wav'), os.listdir(args.folder_in)))

    for wav_name in wav_names:
        source = load_wav(os.path.join(args.folder_in, wav_name), sample_rate)
        with torch.no_grad():
            generated = vocoder.inverse(vocoder(source))

        destination = os.path.join(args.folder_out, wav_name)
        parent_dir = os.path.dirname(destination)
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        torchaudio.save(destination, generated.cpu(), sample_rate)
Esempio n. 4
0
def infer_waveglow(args):
    """Resynthesize every wav in ``args.folder_in`` with a WaveGlow model.

    Each file whose name ends with ``'wav'`` is loaded at 22050 Hz, turned
    into a mel spectrogram, passed to ``model.inference`` and saved under
    ``args.folder_out`` with the same file name. CUDA is used when
    available, otherwise the CPU.

    Args:
        args: Namespace providing ``model_name``, ``folder_in`` and
            ``folder_out``.
    """
    target_sample_rate = 22050
    n_mels = 80
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = load_model(args.model_name, device=device)
    meller = MelSpectrogram().to(device)
    files = [
        item for item in os.listdir(args.folder_in) if item.endswith('wav')
    ]
    for audio in files:
        wav_path = os.path.join(args.folder_in, audio)
        wav = load_wav(wav_path, target_sample_rate).to(device)
        # Pure inference: disable autograd, as infer_melgan does, so no graph
        # is recorded and the saved tensor does not require grad.
        with torch.no_grad():
            mel = meller(wav)
            # Swap the last two axes when axis 1 is not the mel axis.
            if mel.shape[1] != n_mels:
                mel = mel.permute(0, 2, 1)
            waveform = model.inference(mel)
        path = os.path.join(args.folder_out, audio)
        # exist_ok avoids the check-then-create race of a manual exists() test.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torchaudio.save(path, waveform.cpu(), target_sample_rate)
Esempio n. 5
0
from speech_distances.models import load_model
import argparse
from scipy.stats import wilcoxon, mannwhitneyu
import numpy as np

# Test hypothesis that path1 and path2 files have the same quality
# against one-sided alternative that files in path1 are better than path2 files

if __name__ == "__main__":
    # Command line: two directories of .wav files to compare.
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--path1", "path to .wav files which are assumed to be better"),
        ("--path2", "path to .wav files which are assumed to be worse"),
    ):
        parser.add_argument(flag, type=str, help=description)
    args = parser.parse_args()

    # Score both directories with the same predictor.
    mos_pred = load_model("wave2vec_mos")
    scores_better = np.array(mos_pred.calculate(args.path1, False))
    scores_worse = np.array(mos_pred.calculate(args.path2, False))

    # Fraction of positions where the path1 score exceeds the path2 score.
    win_ratio = (scores_better > scores_worse).sum() / len(scores_better)
    print("Ratio:", win_ratio)

    # One-sided Wilcoxon signed-rank test; element 1 of the result is the
    # p-value.
    test_result = wilcoxon(scores_better, scores_worse, alternative="greater")
    print("p-value:", test_result[1])
def calculate_all_metrics(path, reference_path, n_max_files=None):
    """Compute a dictionary of quality metrics for the wavs in ``path``.

    Args:
        path: Directory with the ``*.wav`` files to evaluate.
        reference_path: Directory with the reference ``*.wav`` files.
            Files are paired with those in ``path`` by sorted order, so both
            directories are expected to use matching file names.
        n_max_files: Optional cap on the number of file pairs used for the
            speechmetrics, SNR and LSD computations. ``None`` uses every
            pair. The Frechet distance and wav2vec MOS are computed on the
            directories themselves.

    Returns:
        dict with the keys ``FDSD`` (float), ``MOSdeg_wav2vec_nonzero`` and
        ``MOSdeg_orig_nonzero`` (fractions), and ``MOS_wav2vec``,
        ``MOSdeg_wav2vec``, ``MOS_orig``, ``MOSdeg_orig``, ``sisdr``,
        ``stoi``, ``pesq``, ``sdr``, ``snr``, ``lsd`` as ``(mean, std)``
        tuples.

    Raises:
        IndexError: If no estimate/reference pair is found when the
            speechmetrics results are aggregated.
    """
    def mean_std(values):
        # Mean and standard deviation over all stacked per-file values.
        stacked = np.stack(values)
        return np.mean(stacked), np.std(stacked)

    metrics = {}
    FD = FrechetDistance(
        path=path,
        reference_path=reference_path,
        backbone="deepspeech2",
        sr=16000,
        sample_size=10000,
        num_runs=1,
        window_size=None,
        conditional=True,
        use_cached=True,
    )
    metrics["FDSD"] = FD.calculate_metric()[0].data.item()
    # Move the backbone encoder to the CPU before the next model is loaded.
    FD.backbone.encoder.cpu()
    mos_pred = load_model("wave2vec_mos")

    moses = np.array(mos_pred.calculate(path, False))
    moses_ref = np.array(mos_pred.calculate(reference_path, False))
    mos_pred.cpu()
    metrics["MOS_wav2vec"] = moses.mean(), moses.std()
    # Degradation is the drop relative to the reference, clipped at zero.
    mos_drop = np.maximum(moses_ref - moses, 0)
    metrics["MOSdeg_wav2vec"] = np.mean(mos_drop), np.std(mos_drop)
    # .size also works for a single file, where len(x.squeeze()) would fail
    # on the resulting 0-d array.
    metrics["MOSdeg_wav2vec_nonzero"] = np.sum(mos_drop > 0) / moses.size

    computer = speechmetrics.load(
        ["bsseval", "mosnet", "pesq", "stoi", "sisdr"], None)
    # glob returns files in arbitrary order; sort both listings so that each
    # estimate is zipped with the reference of the same name.
    ll = sorted(glob.glob(os.path.join(path, "*.wav")))
    ll_gt = sorted(glob.glob(os.path.join(reference_path, "*.wav")))
    # Build the pair list once so that every loop sees the same files and
    # tqdm reports the exact total.
    pairs = list(itertools.islice(zip(ll, ll_gt), n_max_files))

    scores = []
    for path_to_estimate_file, path_to_reference in tqdm(
            pairs, desc="Calculating metrics from speechmetrics"):
        scores.append(computer(path_to_estimate_file, path_to_reference))
    scores = {k: [dic[k] for dic in scores] for k in scores[0]}

    # Each reference scored against itself serves as the baseline.
    scores_ref = []
    for _, path_to_reference in tqdm(
            pairs, desc="Calculating reference values of metrics"):
        scores_ref.append(computer(path_to_reference, path_to_reference))
    scores_ref = {k: [dic[k] for dic in scores_ref] for k in scores_ref[0]}

    metrics["MOS_orig"] = mean_std(scores["mosnet"])
    mosdeg = np.maximum(
        -np.stack(scores["mosnet"]) + np.stack(scores_ref["mosnet"]), 0)
    metrics["MOSdeg_orig"] = np.mean(mosdeg), np.std(mosdeg)
    metrics["MOSdeg_orig_nonzero"] = np.sum(mosdeg > 0) / mosdeg.size
    for key in ("sisdr", "stoi", "pesq", "sdr"):
        metrics[key] = mean_std(scores[key])

    LSD = []
    SNR = []
    for path_to_estimate_file, path_to_reference in tqdm(
            pairs, desc="Calculating LSD and SNR metrics"):
        x = librosa.load(path_to_estimate_file, sr=16000)[0]
        y = librosa.load(path_to_reference, sr=16000)[0]
        # Trim both signals to the shorter one before normalizing.
        common_length = min(len(x), len(y))
        x = librosa.util.normalize(x[:common_length])
        y = librosa.util.normalize(y[:common_length])

        SNR.append(snr(x, y))
        LSD.append(lsd(x, y))

    metrics["snr"] = np.mean(SNR), np.std(SNR)
    metrics["lsd"] = np.mean(LSD), np.std(LSD)

    return metrics