def speech_quality_metrics(in_dir, ref_dir, maxdepth, out_file):
    """Compute speech-quality metrics for every .wav file under ``in_dir``
    and write one CSV row per file to ``out_file``.

    Absolute metrics (snr, mosnet, srmr) are always computed. If ``ref_dir``
    is given, relative metrics (sdr, isr, sar, pesq, sisdr, stoi) are also
    computed against the file at the same relative path under ``ref_dir``.

    Parameters
    ----------
    in_dir : str
        Root directory containing the wav files to score.
    ref_dir : str or None
        Directory mirroring ``in_dir``'s layout with reference wav files;
        when falsy, only absolute metrics are computed.
    maxdepth : int
        Number of directory levels below ``in_dir`` to search.
    out_file : str
        Path of the CSV file to write.
    """
    # Collect in_dir itself (depth 0) plus every sub-folder down to maxdepth.
    folders = []
    for depth in range(maxdepth):
        path_str = '*' + os.sep
        path_str = os.path.join(in_dir, path_str * depth)
        folders += glob(path_str)
    # Name of the top-level input folder; used to clip absolute paths down to
    # paths relative to in_dir (so the same relative path works under ref_dir).
    in_folder = os.path.basename(os.path.dirname(os.path.join(in_dir, '')))
    fields = ['snr', 'mosnet', 'srmr', 'wav_file']
    window_length = None  # None => score whole files, not fixed-size windows
    abs_metrics = speechmetrics.load('absolute', window_length)
    if ref_dir:
        fields = [
            'snr', 'mosnet', 'srmr', 'sdr', 'isr', 'sar', 'pesq', 'sisdr',
            'stoi', 'wav_file'
        ]
        rel_metrics = speechmetrics.load('relative', window_length)
    # BUG FIX: use the out_file parameter; previously this read the
    # module-level `args.out_file`, silently ignoring the argument.
    csv_file = out_file
    with open(csv_file, 'w') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(fields)
        for fld in tqdm(folders):
            files = sorted(glob(os.path.join(fld, '*.wav')))
            for fle in files:
                # Path of the file relative to the top-level input folder.
                fle_clip = fle[fle.find(in_folder) + len(in_folder) + 1:]
                try:
                    print('Computing metrics for %s.' % fle)
                    abs_scores = abs_metrics(fle)
                    snr_score = snr(fle, method='rms')
                    if ref_dir:
                        ref = os.path.join(ref_dir, fle_clip)
                        rel_scores = rel_metrics(fle, ref)
                except KeyboardInterrupt:
                    sys.exit()
                except Exception:
                    # Best effort: record the file with empty scores and
                    # continue with the remaining files.
                    print('Unable to compute quality scores for ' + fle + '.')
                    if ref_dir:
                        csvwriter.writerow(
                            ['', '', '', '', '', '', '', '', '', fle_clip])
                    else:
                        csvwriter.writerow(['', '', '', fle_clip])
                else:
                    if ref_dir:
                        csvwriter.writerow([
                            snr_score, abs_scores['mosnet'][0][0],
                            abs_scores['srmr'], rel_scores['sdr'][0][0],
                            rel_scores['isr'][0][0], rel_scores['sar'][0][0],
                            rel_scores['pesq'], rel_scores['sisdr'],
                            rel_scores['stoi'], fle_clip
                        ])
                    else:
                        csvwriter.writerow([
                            snr_score, abs_scores['mosnet'][0][0],
                            abs_scores['srmr'], fle_clip
                        ])
def speechmetrics_featurize(wavfile):
    """Compute absolute speech-quality scores for *wavfile*.

    Returns
    -------
    tuple of (list, list)
        ``features`` holds the score values and ``labels`` the matching
        score names, in the same order.
    """
    window_length = 5  # seconds
    scorer = speechmetrics.load('absolute', window_length)
    raw = scorer(wavfile)
    # Collapse the single-valued score arrays to plain floats so the
    # feature list serializes cleanly.
    raw['mosnet'] = float(raw['mosnet'])
    raw['srmr'] = float(raw['srmr'])
    labels = list(raw)
    features = [raw[name] for name in labels]
    return features, labels
def main():
    """Predict MOSnet quality scores for converted waveforms and report the
    mean score for each (source speaker, target speaker) pair.

    Output goes to ``--out`` if given, otherwise to stdout.
    """
    parser = argparse.ArgumentParser(
        description="Use MOSnet to predict quality scores.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--outwavdir", type=str, help="Converted waveform directory")
    parser.add_argument(
        "--out",
        type=str,
        help="If omitted, then output to sys.stdout",
    )
    args = parser.parse_args()

    # logging info
    logging.basicConfig(
        level=logging.INFO,
        stream=sys.stdout,
        format="%(asctime)s (%(module)s:%(lineno)d) "
        "%(levelname)s: %(message)s",
    )

    # load converted files.
    converted_files = sorted(list(Path(args.outwavdir).rglob("*.wav")))
    logging.info(f"number of utterances = {len(converted_files)}")

    # construct metric class
    metrics = speechmetrics.load("mosnet", None)

    if args.out is None:
        out = sys.stdout
    else:
        out = open(args.out, "w", encoding="utf-8")
    try:
        # actual calculation; keys are "orgspk-tarspk-number"
        scores = {}
        for cv_path in converted_files:
            score = metrics(str(cv_path))["mosnet"][0][0]
            basename = cv_path.stem
            # filenames look like "<number>_<...-orgspk>_<...-tarspk>"
            number, orgspk, tarspk = basename.split("_")
            tarspk = tarspk.split("-")[-1]
            orgspk = orgspk.split("-")[-1]
            scores[f"{orgspk}-{tarspk}-{number}"] = score

        # summarize by pair
        pairwise_scores = {}
        for k, v in scores.items():
            orgspk, tarspk, _ = k.split("-")
            pair = orgspk + " " + tarspk
            if pair not in pairwise_scores:
                pairwise_scores[pair] = []
            pairwise_scores[pair].append(v)
        for k in sorted(pairwise_scores.keys()):
            score_list = pairwise_scores[k]
            mean_score = float(sum(score_list) / len(score_list))
            out.write(f"{k} {mean_score:.3f}\n")
    finally:
        # BUG FIX: the opened output file was never closed; close it here
        # (but never close sys.stdout).
        if out is not sys.stdout:
            out.close()
from torchmetrics.audio import SI_SNR
from torchmetrics.functional import si_snr
from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_1_6

# Fixed seed so the random test inputs below are reproducible.
seed_all(42)

# Number of time samples per synthetic signal.
Time = 100

Input = namedtuple('Input', ["preds", "target"])

# Random (preds, target) batches used as test fixtures.
inputs = Input(
    preds=torch.rand(NUM_BATCHES, BATCH_SIZE, 1, Time),
    target=torch.rand(NUM_BATCHES, BATCH_SIZE, 1, Time),
)

# Reference implementation from the speechmetrics package, used to validate
# torchmetrics' SI-SNR/SI-SDR against an independent implementation.
speechmetrics_sisdr = speechmetrics.load('sisdr')


def speechmetrics_si_sdr(preds: Tensor, target: Tensor, zero_mean: bool = True):
    # Compute SI-SDR per (batch, channel) pair via speechmetrics, as a
    # reference for comparison with torchmetrics.
    # shape: preds [BATCH_SIZE, 1, Time] , target [BATCH_SIZE, 1, Time]
    # or shape: preds [NUM_BATCHES*BATCH_SIZE, 1, Time] , target [NUM_BATCHES*BATCH_SIZE, 1, Time]
    if zero_mean:
        # Remove per-signal DC offset along the time axis.
        preds = preds - preds.mean(dim=2, keepdim=True)
        target = target - target.mean(dim=2, keepdim=True)
    target = target.detach().cpu().numpy()
    preds = preds.detach().cpu().numpy()
    mss = []
    for i in range(preds.shape[0]):
        ms = []
        for j in range(preds.shape[1]):
            # NOTE(review): the function body appears truncated here in this
            # chunk — the accumulation into ms/mss and the return statement
            # are outside the visible source.
            metric = speechmetrics_sisdr(preds[i, j], target[i, j], rate=16000)
'realization', 'SNR', 'technique', 'pesq', 'stoi', 'srmr' ]) filename = [] speech_name = [] noise_name = [] realization = [] SNR = [] technique = [] pesq_score = [] stoi_score = [] srmr_score = [] # mixed case, still works metric_function = speechmetrics.load(['pesq', 'stoi', 'srmr'], window=None, verbose=False) for i in range(len(metric_function.metrics)): metric_function.metrics[i].fixed_rate = 8000 for k in trange(num_techniques): # for k in range(num_techniques): speech_folder = data_folder / 'speech' noisy_folder = data_folder / 'speech+noise' if technique_list[k] is 'noisy': processed_folder = data_folder / 'speech+noise' else: processed_folder = data_folder / 'processed' / technique_list[k]
import speechmetrics as sm if __name__ == '__main__': window = 5 print('Trying ABSOLUTE metrics: ') metrics = sm.load('absolute', window) reference = 'data/m2_script1_produced.wav' tests = [ 'data/m2_script1_clean.wav', 'data/m2_script1_ipad_confroom1.wav', 'data/m2_script1_produced.wav' ] for test in tests: import pprint print('Computing scores for ', test) scores = metrics(reference, test) pprint.pprint(scores) print('\nTrying RELATIVE metrics: ') metrics = sm.load('relative', window) reference = 'data/m2_script1_produced.wav' tests = [ 'data/m2_script1_clean.wav', 'data/m2_script1_ipad_confroom1.wav', 'data/m2_script1_produced.wav' ] for test in tests:
def calculate_all_metrics(path, reference_path, n_max_files=None):
    """Compute a battery of speech-quality metrics comparing generated audio
    in ``path`` against reference audio in ``reference_path``.

    Parameters
    ----------
    path : str
        Directory with estimated/generated .wav files.
    reference_path : str
        Directory with the matching reference .wav files.
    n_max_files : int or None
        Optional cap on the number of file pairs to evaluate.

    Returns
    -------
    dict
        Metric name -> value; most entries are (mean, std) tuples.
    """
    metrics = {}
    # Frechet DeepSpeech distance between the two audio distributions.
    FD = FrechetDistance(
        path=path,
        reference_path=reference_path,
        backbone="deepspeech2",
        sr=16000,
        sample_size=10000,
        num_runs=1,
        window_size=None,
        conditional=True,
        use_cached=True,
    )
    metrics["FDSD"] = FD.calculate_metric()[0].data.item()
    FD.backbone.encoder.cpu()  # release GPU memory before loading next model

    # Wav2vec-based MOS predictor: absolute MOS plus degradation vs reference.
    mos_pred = load_model("wave2vec_mos")
    moses = np.array(mos_pred.calculate(path, False))
    moses_ref = np.array(mos_pred.calculate(reference_path, False))
    mos_pred.cpu()
    metrics["MOS_wav2vec"] = moses.mean(), moses.std()
    metrics["MOSdeg_wav2vec"] = np.mean(np.maximum(
        moses_ref - moses, 0)), np.std(np.maximum(moses_ref - moses, 0))
    metrics["MOSdeg_wav2vec_nonzero"] = np.sum(moses_ref - moses > 0) / len(
        moses.squeeze())

    computer = speechmetrics.load(
        ["bsseval", "mosnet", "pesq", "stoi", "sisdr"], None)
    # BUG FIX: glob.glob returns files in arbitrary, filesystem-dependent
    # order; sort both listings so each estimate pairs with its reference.
    ll = sorted(glob.glob(os.path.join(path, "*.wav")))
    ll_gt = sorted(glob.glob(os.path.join(reference_path, "*.wav")))
    scores = []
    for path_to_estimate_file, path_to_reference in tqdm(
            itertools.islice(zip(ll, ll_gt), n_max_files),
            total=n_max_files if n_max_files is not None else len(ll),
            desc="Calculating metrics from speechmetrics",
    ):
        scores.append(computer(path_to_estimate_file, path_to_reference))
    # Re-shape from list-of-dicts to dict-of-lists.
    scores = {k: [dic[k] for dic in scores] for k in scores[0]}
    # Reference-vs-itself scores give the ceiling value for each metric.
    scores_ref = []
    for path_to_estimate_file, path_to_reference in tqdm(
            itertools.islice(zip(ll, ll_gt), n_max_files),
            total=n_max_files if n_max_files is not None else len(ll),
            desc="Calculating reference values of metrics",
    ):
        scores_ref.append(computer(path_to_reference, path_to_reference))
    scores_ref = {k: [dic[k] for dic in scores_ref] for k in scores_ref[0]}

    metrics["MOS_orig"] = np.mean(np.stack(scores["mosnet"])), np.std(
        np.stack(scores["mosnet"]))
    # MOS degradation: how much worse the estimate is than its reference,
    # clipped at zero (improvements do not count as degradation).
    mosdeg = np.maximum(
        -np.stack(scores["mosnet"]) + np.stack(scores_ref["mosnet"]), 0)
    metrics["MOSdeg_orig"] = np.mean(mosdeg), np.std(mosdeg)
    metrics["MOSdeg_orig_nonzero"] = np.sum(mosdeg > 0) / len(mosdeg.squeeze())
    metrics["sisdr"] = np.mean(np.stack(scores["sisdr"])), np.std(
        np.stack(scores["sisdr"]))
    metrics["stoi"] = np.mean(np.stack(scores["stoi"])), np.std(
        np.stack(scores["stoi"]))
    metrics["pesq"] = np.mean(np.stack(scores["pesq"])), np.std(
        np.stack(scores["pesq"]))
    metrics["sdr"] = np.mean(np.stack(scores["sdr"])), np.std(
        np.stack(scores["sdr"]))

    LSD = []
    SNR = []
    for path_to_estimate_file, path_to_reference in tqdm(
            itertools.islice(zip(ll, ll_gt), n_max_files),
            total=n_max_files if n_max_files is not None else len(ll),
            desc="Calculating LSD and SNR metrics",
    ):
        x = librosa.load(path_to_estimate_file, sr=16000)[0]
        y = librosa.load(path_to_reference, sr=16000)[0]
        # Trim both signals to a common length, then peak-normalize.
        x = librosa.util.normalize(x[:min(len(x), len(y))])
        y = librosa.util.normalize(y[:min(len(x), len(y))])
        SNR.append(snr(x, y))
        LSD.append(lsd(x, y))
    metrics["snr"] = np.mean(SNR), np.std(SNR)
    metrics["lsd"] = np.mean(LSD), np.std(LSD)
    return metrics
import speechmetrics as sm

if __name__ == '__main__':
    # Score each candidate recording against the produced reference using
    # speechmetrics' absolute metrics over 5-second analysis windows.
    window = 5
    metrics = sm.load('absolute', window)
    reference = 'data/m2_script1_produced.wav'
    tests = [
        'data/m2_script1_clean.wav',
        'data/m2_script1_ipad_confroom1.wav',
        'data/m2_script1_produced.wav',
    ]
    for candidate in tests:
        import pprint
        print('Computing scores for ', candidate)
        pprint.pprint(metrics(candidate, reference))
from torchmetrics.audio import ScaleInvariantSignalDistortionRatio
from torchmetrics.functional import scale_invariant_signal_distortion_ratio
from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_1_6

# Fixed seed so the random test inputs below are reproducible.
seed_all(42)

# Number of time samples per synthetic signal.
Time = 100

Input = namedtuple("Input", ["preds", "target"])

# Random (preds, target) batches used as test fixtures.
inputs = Input(
    preds=torch.rand(NUM_BATCHES, BATCH_SIZE, 1, Time),
    target=torch.rand(NUM_BATCHES, BATCH_SIZE, 1, Time),
)

# Reference implementation from the speechmetrics package, used to validate
# torchmetrics' SI-SDR against an independent implementation.
speechmetrics_sisdr = speechmetrics.load("sisdr")


def speechmetrics_si_sdr(preds: Tensor, target: Tensor, zero_mean: bool):
    # Compute SI-SDR per (batch, channel) pair via speechmetrics, as a
    # reference for comparison with torchmetrics.
    # shape: preds [BATCH_SIZE, 1, Time] , target [BATCH_SIZE, 1, Time]
    # or shape: preds [NUM_BATCHES*BATCH_SIZE, 1, Time] , target [NUM_BATCHES*BATCH_SIZE, 1, Time]
    if zero_mean:
        # Remove per-signal DC offset along the time axis.
        preds = preds - preds.mean(dim=2, keepdim=True)
        target = target - target.mean(dim=2, keepdim=True)
    target = target.detach().cpu().numpy()
    preds = preds.detach().cpu().numpy()
    mss = []
    for i in range(preds.shape[0]):
        ms = []
        for j in range(preds.shape[1]):
            # NOTE(review): the function body appears truncated here in this
            # chunk — the accumulation into ms/mss and the return statement
            # are outside the visible source.
            metric = speechmetrics_sisdr(preds[i, j], target[i, j], rate=16000)