Esempio n. 1
0
def speech_quality_metrics(in_dir, ref_dir, maxdepth, out_file):
    """Score every ``*.wav`` file under ``in_dir`` and write the results as CSV.

    The directories searched are ``in_dir`` itself plus its sub-directories
    down to ``maxdepth - 1`` levels. One CSV row is written per WAV file; a
    file whose scores cannot be computed still gets a row, with empty score
    cells, so the output lists every file that was found.

    Args:
        in_dir: Root directory holding the WAV files to evaluate.
        ref_dir: Directory holding the reference files under the same
            relative paths as in ``in_dir``. When falsy, only the absolute
            metrics (``snr``, ``mosnet``, ``srmr``) are computed.
        maxdepth: Number of directory levels to search (1 means ``in_dir``
            only).
        out_file: Path of the CSV file to create (overwritten if present).

    Raises:
        SystemExit: If the user interrupts scoring with Ctrl-C.
    """
    # Depth 0 matches in_dir itself, depth d matches directories d levels down.
    folders = []
    for depth in range(maxdepth):
        folders += glob(os.path.join(in_dir, ('*' + os.sep) * depth))

    window_length = None
    abs_metrics = speechmetrics.load('absolute', window_length)
    if ref_dir:
        fields = [
            'snr', 'mosnet', 'srmr', 'sdr', 'isr', 'sar', 'pesq', 'sisdr',
            'stoi', 'wav_file'
        ]
        rel_metrics = speechmetrics.load('relative', window_length)
    else:
        fields = ['snr', 'mosnet', 'srmr', 'wav_file']
    # Placeholder cells for a file that could not be scored (all columns but
    # the trailing 'wav_file').
    blank_scores = [''] * (len(fields) - 1)

    # newline='' is required by the csv module so that it controls line
    # endings itself (otherwise blank rows appear on Windows).
    with open(out_file, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        for fld in tqdm(folders):
            for fle in sorted(glob(os.path.join(fld, '*.wav'))):
                # Path relative to in_dir; also used to locate the reference.
                fle_clip = os.path.relpath(fle, in_dir)
                try:
                    print('Computing metrics for %s.' % fle)
                    abs_scores = abs_metrics(fle)
                    snr_score = snr(fle, method='rms')
                    if ref_dir:
                        ref = os.path.join(ref_dir, fle_clip)
                        rel_scores = rel_metrics(fle, ref)
                except KeyboardInterrupt:
                    sys.exit()
                except Exception:
                    # Best effort: report the failure and keep going.
                    print('Unable to compute quality scores for ' + fle + '.')
                    csvwriter.writerow(blank_scores + [fle_clip])
                else:
                    row = [
                        snr_score, abs_scores['mosnet'][0][0],
                        abs_scores['srmr']
                    ]
                    if ref_dir:
                        row += [
                            rel_scores['sdr'][0][0], rel_scores['isr'][0][0],
                            rel_scores['sar'][0][0], rel_scores['pesq'],
                            rel_scores['sisdr'], rel_scores['stoi']
                        ]
                    csvwriter.writerow(row + [fle_clip])
Esempio n. 2
0
def speechmetrics_featurize(wavfile):
    """Return the absolute speechmetrics scores of ``wavfile`` as features.

    Args:
        wavfile: Path of the audio file passed to the loaded metrics.

    Returns:
        A ``(features, labels)`` pair: the score values and their names, in
        the order of the score dictionary.
    """
    window_length = 5  # seconds
    scorer = speechmetrics.load('absolute', window_length)
    scores = scorer(wavfile)
    # Collapse the two known scores to plain Python floats.
    for key in ('mosnet', 'srmr'):
        scores[key] = float(scores[key])
    return list(scores.values()), list(scores.keys())
Esempio n. 3
0
def main():
    """Predict MOSnet scores for converted utterances and report pair means.

    Every ``*.wav`` file found recursively under ``--outwavdir`` is scored.
    File stems are expected to look like ``<number>_<orgspk>_<tarspk>``; for
    both speaker fields only the part after the last ``-`` is kept. One line
    ``"<orgspk> <tarspk> <mean score>"`` is written per speaker pair, to the
    ``--out`` file or to standard output when ``--out`` is omitted.

    Raises:
        ValueError: If a file stem does not split into exactly three
            ``_``-separated fields.
    """
    parser = argparse.ArgumentParser(
        description="Use MOSnet to predict quality scores.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--outwavdir", type=str, help="Converted waveform directory")
    parser.add_argument(
        "--out",
        type=str,
        help="If omitted, then output to sys.stdout",
    )
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stdout,
        format="%(asctime)s (%(module)s:%(lineno)d) " "%(levelname)s: %(message)s",
    )

    # load converted files.
    converted_files = sorted(Path(args.outwavdir).rglob("*.wav"))
    logging.info(f"number of utterances = {len(converted_files)}")

    # construct metric class
    metrics = speechmetrics.load("mosnet", None)

    if args.out is None:
        out = sys.stdout
    else:
        out = open(args.out, "w", encoding="utf-8")

    try:
        # actual calculation; keyed by (orgspk, tarspk, number) so a repeated
        # utterance id keeps only its last score.
        scores = {}
        for cv_path in converted_files:
            score = metrics(str(cv_path))["mosnet"][0][0]
            number, orgspk, tarspk = cv_path.stem.split("_")
            orgspk = orgspk.split("-")[-1]
            tarspk = tarspk.split("-")[-1]
            scores[(orgspk, tarspk, number)] = score

        # summarize by pair
        pairwise_scores = {}
        for (orgspk, tarspk, _), score in scores.items():
            pairwise_scores.setdefault(orgspk + " " + tarspk, []).append(score)

        for pair in sorted(pairwise_scores):
            score_list = pairwise_scores[pair]
            mean_score = float(sum(score_list) / len(score_list))
            out.write(f"{pair} {mean_score:.3f}\n")
    finally:
        # Close the file we opened, but never close the process's stdout.
        if out is not sys.stdout:
            out.close()
Esempio n. 4
0
from torchmetrics.audio import SI_SNR
from torchmetrics.functional import si_snr
from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_1_6

# NOTE(review): presumably seeds the random generators so the random inputs
# below are reproducible; confirm against seed_all's definition.
seed_all(42)

# Size of the last dimension of the test tensors.
Time = 100

# Pairs a batch of predictions with its matching targets.
Input = namedtuple('Input', ["preds", "target"])

# Random fixtures of shape (NUM_BATCHES, BATCH_SIZE, 1, Time).
inputs = Input(
    preds=torch.rand(NUM_BATCHES, BATCH_SIZE, 1, Time),
    target=torch.rand(NUM_BATCHES, BATCH_SIZE, 1, Time),
)

# Metric object loaded from speechmetrics under the name 'sisdr'; it is called
# with (preds, target, rate=...) in speechmetrics_si_sdr.
speechmetrics_sisdr = speechmetrics.load('sisdr')


def speechmetrics_si_sdr(preds: Tensor, target: Tensor, zero_mean: bool = True):
    """Evaluate ``speechmetrics_sisdr`` on every signal pair of a batch.

    When ``zero_mean`` is true, the mean along dimension 2 is subtracted from
    both tensors first. Both tensors are then detached, moved to the CPU and
    converted to NumPy arrays, and ``speechmetrics_sisdr`` is called for each
    ``[i, j]`` slice with ``rate=16000``.

    NOTE(review): the rest of this function (how the per-slice results are
    collected into ``ms``/``mss`` and what is returned) is not visible in
    this excerpt.
    """
    # shape: preds [BATCH_SIZE, 1, Time] , target [BATCH_SIZE, 1, Time]
    # or shape: preds [NUM_BATCHES*BATCH_SIZE, 1, Time] , target [NUM_BATCHES*BATCH_SIZE, 1, Time]
    if zero_mean:
        preds = preds - preds.mean(dim=2, keepdim=True)
        target = target - target.mean(dim=2, keepdim=True)
    # The metric is called on NumPy slices below, so leave torch here.
    target = target.detach().cpu().numpy()
    preds = preds.detach().cpu().numpy()
    mss = []
    for i in range(preds.shape[0]):
        ms = []
        for j in range(preds.shape[1]):
            metric = speechmetrics_sisdr(preds[i, j], target[i, j], rate=16000)
Esempio n. 5
0
                                    'realization', 'SNR', 'technique', 'pesq',
                                    'stoi', 'srmr'
                                ])
# NOTE(review): presumably one accumulator per result column, filled in
# parallel while files are evaluated; confirm against the code that follows.
filename = []
speech_name = []
noise_name = []
realization = []
SNR = []
technique = []
pesq_score = []
stoi_score = []
srmr_score = []

# mixed case, still works
metric_function = speechmetrics.load(['pesq', 'stoi', 'srmr'],
                                     window=None,
                                     verbose=False)
# Set the same fixed_rate (8000) on every loaded metric.
for metric in metric_function.metrics:
    metric.fixed_rate = 8000

for k in trange(num_techniques):

    speech_folder = data_folder / 'speech'
    noisy_folder = data_folder / 'speech+noise'

    # Compare by value: `is` against a string literal tests object identity,
    # which is not guaranteed to hold for equal strings.
    if technique_list[k] == 'noisy':
        processed_folder = data_folder / 'speech+noise'
    else:
        processed_folder = data_folder / 'processed' / technique_list[k]
Esempio n. 6
0
import speechmetrics as sm

if __name__ == '__main__':
    # Imported once here rather than on every loop iteration.
    import pprint

    window = 5

    print('Trying ABSOLUTE metrics: ')
    metrics = sm.load('absolute', window)

    # The same reference and test files are used for both metric families.
    reference = 'data/m2_script1_produced.wav'
    tests = [
        'data/m2_script1_clean.wav', 'data/m2_script1_ipad_confroom1.wav',
        'data/m2_script1_produced.wav'
    ]

    for test in tests:
        print('Computing scores for ', test)
        # Estimate first, reference second: with the arguments swapped every
        # iteration scored the reference file instead of `test`.
        scores = metrics(test, reference)
        pprint.pprint(scores)

    print('\nTrying RELATIVE metrics: ')

    metrics = sm.load('relative', window)

    for test in tests:
def calculate_all_metrics(path, reference_path, n_max_files=None):
    """Compute quality metrics for the WAV files in ``path`` against references.

    Estimate and reference files are paired by sorted file name, so the two
    directories are expected to hold identically named ``*.wav`` files.

    Args:
        path: Directory with the files to evaluate.
        reference_path: Directory with the reference files.
        n_max_files: Upper bound on the number of file pairs used for the
            per-file metrics; ``None`` means all pairs.

    Returns:
        A dict with these keys. ``"FDSD"`` and the two ``*_nonzero`` keys hold
        a single number; every other key holds a ``(mean, std)`` tuple:
        ``"FDSD"``, ``"MOS_wav2vec"``, ``"MOSdeg_wav2vec"``,
        ``"MOSdeg_wav2vec_nonzero"``, ``"MOS_orig"``, ``"MOSdeg_orig"``,
        ``"MOSdeg_orig_nonzero"``, ``"sisdr"``, ``"stoi"``, ``"pesq"``,
        ``"sdr"``, ``"snr"``, ``"lsd"``.

    Raises:
        IndexError: If no estimate/reference pair is found.
    """

    def _mean_std(values):
        # Stack the per-file scores once and summarise them.
        stacked = np.stack(values)
        return np.mean(stacked), np.std(stacked)

    metrics = {}
    FD = FrechetDistance(
        path=path,
        reference_path=reference_path,
        backbone="deepspeech2",
        sr=16000,
        sample_size=10000,
        num_runs=1,
        window_size=None,
        conditional=True,
        use_cached=True,
    )
    metrics["FDSD"] = FD.calculate_metric()[0].data.item()
    FD.backbone.encoder.cpu()
    mos_pred = load_model("wave2vec_mos")

    moses = np.array(mos_pred.calculate(path, False))
    moses_ref = np.array(mos_pred.calculate(reference_path, False))
    mos_pred.cpu()
    metrics["MOS_wav2vec"] = moses.mean(), moses.std()
    # Degradation relative to the reference, clipped at zero.
    mos_drop = np.maximum(moses_ref - moses, 0)
    metrics["MOSdeg_wav2vec"] = np.mean(mos_drop), np.std(mos_drop)
    # .size rather than len(squeeze()): the latter fails on a single score.
    # NOTE(review): assumes one score per file; confirm the shape returned
    # by mos_pred.calculate.
    metrics["MOSdeg_wav2vec_nonzero"] = np.sum(mos_drop > 0) / moses.size

    computer = speechmetrics.load(
        ["bsseval", "mosnet", "pesq", "stoi", "sisdr"], None)
    # glob returns files in arbitrary order; sort both lists so that zip
    # pairs each estimate with the reference of the same name.
    ll = sorted(glob.glob(os.path.join(path, "*.wav")))
    ll_gt = sorted(glob.glob(os.path.join(reference_path, "*.wav")))
    # Materialise the pairs once; the list length gives tqdm a correct total
    # even when n_max_files exceeds the number of available pairs.
    pairs = list(itertools.islice(zip(ll, ll_gt), n_max_files))
    if not pairs:
        # Same exception type the empty case raised before, with a message.
        raise IndexError(
            "no '*.wav' file pairs found in %r and %r" % (path, reference_path))

    scores = [
        computer(path_to_estimate_file, path_to_reference)
        for path_to_estimate_file, path_to_reference in tqdm(
            pairs, desc="Calculating metrics from speechmetrics")
    ]
    scores = {k: [dic[k] for dic in scores] for k in scores[0]}

    # Scoring each reference against itself gives the reference-level values.
    scores_ref = [
        computer(path_to_reference, path_to_reference)
        for _, path_to_reference in tqdm(
            pairs, desc="Calculating reference values of metrics")
    ]
    scores_ref = {k: [dic[k] for dic in scores_ref] for k in scores_ref[0]}

    metrics["MOS_orig"] = _mean_std(scores["mosnet"])
    mosdeg = np.maximum(
        np.stack(scores_ref["mosnet"]) - np.stack(scores["mosnet"]), 0)
    metrics["MOSdeg_orig"] = np.mean(mosdeg), np.std(mosdeg)
    metrics["MOSdeg_orig_nonzero"] = np.sum(mosdeg > 0) / mosdeg.size
    for key in ("sisdr", "stoi", "pesq", "sdr"):
        metrics[key] = _mean_std(scores[key])

    LSD = []
    SNR = []
    for path_to_estimate_file, path_to_reference in tqdm(
            pairs, desc="Calculating LSD and SNR metrics"):
        x = librosa.load(path_to_estimate_file, sr=16000)[0]
        y = librosa.load(path_to_reference, sr=16000)[0]
        # Trim both signals to the common length before normalising.
        common_len = min(len(x), len(y))
        x = librosa.util.normalize(x[:common_len])
        y = librosa.util.normalize(y[:common_len])

        SNR.append(snr(x, y))
        LSD.append(lsd(x, y))

    metrics["snr"] = np.mean(SNR), np.std(SNR)
    metrics["lsd"] = np.mean(LSD), np.std(LSD)

    return metrics
Esempio n. 8
0
import speechmetrics as sm

if __name__ == '__main__':
    import pprint

    # Files to score, and the reference passed alongside each of them.
    reference = 'data/m2_script1_produced.wav'
    tests = [
        'data/m2_script1_clean.wav',
        'data/m2_script1_ipad_confroom1.wav',
        'data/m2_script1_produced.wav',
    ]

    window = 5
    metrics = sm.load('absolute', window)

    for test in tests:
        print('Computing scores for ', test)
        scores = metrics(test, reference)
        pprint.pprint(scores)
Esempio n. 9
0
from torchmetrics.audio import ScaleInvariantSignalDistortionRatio
from torchmetrics.functional import scale_invariant_signal_distortion_ratio
from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_1_6

# NOTE(review): presumably seeds the random generators so the random inputs
# below are reproducible; confirm against seed_all's definition.
seed_all(42)

# Size of the last dimension of the test tensors.
Time = 100

# Pairs a batch of predictions with its matching targets.
Input = namedtuple("Input", ["preds", "target"])

# Random fixtures of shape (NUM_BATCHES, BATCH_SIZE, 1, Time).
inputs = Input(
    preds=torch.rand(NUM_BATCHES, BATCH_SIZE, 1, Time),
    target=torch.rand(NUM_BATCHES, BATCH_SIZE, 1, Time),
)

# Metric object loaded from speechmetrics under the name "sisdr"; it is called
# with (preds, target, rate=...) in speechmetrics_si_sdr.
speechmetrics_sisdr = speechmetrics.load("sisdr")


def speechmetrics_si_sdr(preds: Tensor, target: Tensor, zero_mean: bool):
    # shape: preds [BATCH_SIZE, 1, Time] , target [BATCH_SIZE, 1, Time]
    # or shape: preds [NUM_BATCHES*BATCH_SIZE, 1, Time] , target [NUM_BATCHES*BATCH_SIZE, 1, Time]
    if zero_mean:
        preds = preds - preds.mean(dim=2, keepdim=True)
        target = target - target.mean(dim=2, keepdim=True)
    target = target.detach().cpu().numpy()
    preds = preds.detach().cpu().numpy()
    mss = []
    for i in range(preds.shape[0]):
        ms = []
        for j in range(preds.shape[1]):
            metric = speechmetrics_sisdr(preds[i, j], target[i, j], rate=16000)