def test_output_metrics():
    example = scenario()

    # Take speech image + noise as prediction, i.e. perfect cross-talker
    # suppression.
    speech_prediction = example['speech_image'][..., 0, :] \
        + example['noise_image'][..., 0, :]

    speech_image_1, speech_image_2 = example['speech_image'][..., 0, :]
    speech_contribution = np.array([
        [speech_image_1, np.zeros_like(speech_image_2)],
        [np.zeros_like(speech_image_1), speech_image_2],
    ])
    noise_contribution = np.array([
        example['noise_image'][..., 0, :],
        example['noise_image'][..., 0, :],
    ])

    metrics = OutputMetrics(
        speech_prediction=speech_prediction,
        # observation=example['observation'],
        speech_source=example['speech_source'],
        # speech_image=example['speech_image'],
        # noise_image=example['noise_image'],
        speech_contribution=speech_contribution,
        noise_contribution=noise_contribution,
        sample_rate=8000,
        # channel_score_reduce='mean',
    )

    assert metrics.K_source == 2

    for k, v in metrics.as_dict().items():
        if k == 'invasive_sdr':
            np.testing.assert_allclose(v, [49.137625, 44.503376])
        elif k == 'invasive_sir':
            np.testing.assert_allclose(v, np.inf)
        elif k == 'invasive_snr':
            np.testing.assert_allclose(v, [49.137625, 44.503376])
        elif k == 'mir_eval_sdr':
            np.testing.assert_allclose(v, [17.071665, 24.711722])
        elif k == 'mir_eval_sir':
            np.testing.assert_allclose(v, [29.423133, 37.060289])
        elif k == 'mir_eval_sar':
            np.testing.assert_allclose(v, [17.336992, 24.973125])
        elif k == 'pesq':
            np.testing.assert_allclose(v, [4.37408, 4.405752])
        elif k == 'stoi':
            np.testing.assert_allclose(v, [0.968833, 0.976151], rtol=1e-6)
        elif k == 'mir_eval_selection':
            assert all(v == [0, 1])
        elif k == 'srmr':
            np.testing.assert_allclose(v, [0.5504078, 0.50442512])
        else:
            raise KeyError(k, v)
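
# Hedged sketch (not the pb_bss implementation; the helper name is made up for
# illustration): with `speech_contribution` shaped (Ksource, Ktarget, num_samples)
# and `noise_contribution` shaped (Ktarget, num_samples) as constructed in the
# test above, an invasive SNR per target channel is, in principle, the power
# ratio of the matching speaker's contribution to the noise contribution in dB.
def _sketch_invasive_snr(speech_contribution, noise_contribution):
    snr = []
    for k in range(noise_contribution.shape[0]):
        target_power = np.sum(np.abs(speech_contribution[k, k]) ** 2)
        noise_power = np.sum(np.abs(noise_contribution[k]) ** 2)
        snr.append(10 * np.log10(target_power / noise_power))
    return np.array(snr)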
def get_scores(
        ex,
        mask,
        Observation='Observation',
        beamformer='mvdr_souden',
        postfilter=None,
):
    """
    Calculate the scores, where the prediction/estimated signal is tested
    against the source/desired signal.

    This function is intended for oracle experiments to figure out which
    metric works with which source signal. SI-SDR does not work when the
    desired signal is the signal before the room impulse response, and it
    gives strange results when the channel is changed.

    Example:

    >>> ex = get_dataset('cv_dev93')[0]
    >>> mask = get_mask_from_oracle(ex, 'IBM')
    >>> metric, result = get_scores(ex, mask)
    >>> pprint(result)
    {'pesq': array([2.014, 1.78 ]),
     'stoi': array([0.68236465, 0.61319396]),
     'mir_eval_sxr_sdr': array([10.23933413, 10.01566298]),
     'invasive_sxr_sdr': array([15.76439393, 13.86230425])}
    """
    if Observation == 'Observation':
        metric = get_multi_speaker_metrics(
            mask=rearrange(mask, 'k t f -> t k f'),  # T Ktarget F
            Observation=ex['audio_data'][Observation],  # D T F (stft signal)
            speech_source=ex['audio_data']['speech_source'],  # Ksource N (time signal)
            Speech_image=ex['audio_data']['Speech_image'],  # Ksource D T F (stft signal)
            Noise_image=ex['audio_data']['Noise_image'],  # D T F (stft signal)
            istft=istft,  # callable(signal, num_samples=num_samples)
            bf_algorithm=beamformer,
            postfilter=postfilter,  # [None, 'mask_mul']
        )
    else:
        assert mask is None, mask
        assert beamformer == 'ch0', beamformer
        assert postfilter is None, postfilter
        metric = OutputMetrics(
            speech_prediction=ex['audio_data'][Observation][:, 0],
            speech_source=ex['audio_data']['speech_source'],
            # speech_contribution=speech_contribution,
            # noise_contribution=noise_contribution,
            sample_rate=8000,
            enable_si_sdr=False,
        )

    result = metric.as_dict()
    del result['mir_eval_sxr_selection']
    del result['mir_eval_sxr_sar']
    del result['mir_eval_sxr_sir']
    if 'invasive_sxr_sir' in result:
        del result['invasive_sxr_sir']
        del result['invasive_sxr_snr']

    return metric, result
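
# Illustrative usage sketch (not part of pb_bss; assumes `get_dataset` and
# `get_mask_from_oracle` are the same helpers used in the doctest above).
# Averages the oracle scores over a few examples; the aggregation itself and
# the helper name are only for illustration.
def average_oracle_scores(dataset_name='cv_dev93', num_examples=2):
    results = []
    for ex in list(get_dataset(dataset_name))[:num_examples]:
        mask = get_mask_from_oracle(ex, 'IBM')
        _, result = get_scores(ex, mask)
        results.append(result)
    # Average each metric over speakers and examples.
    return {
        key: np.mean([np.mean(r[key]) for r in results])
        for key in results[0]
    }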
def get_multi_speaker_metrics(
        mask,  # T Ktarget F
        Observation,  # D T F (stft signal)
        speech_source,  # Ksource N (time signal)
        Speech_image=None,  # Ksource D T F (stft signal)
        Noise_image=None,  # D T F (stft signal)
        istft=None,  # callable(signal, num_samples=num_samples)
        bf_algorithm='mvdr_souden',
        postfilter=None,  # [None, 'mask_mul']
) -> OutputMetrics:
    """
    >>> from IPython.lib.pretty import pprint
    >>> from pb_bss.testing import dummy_data
    >>> from paderbox.transform.module_stft import stft, istft
    >>> from pb_bss.extraction import ideal_ratio_mask, phase_sensitive_mask
    >>> from pb_bss.extraction import ideal_complex_mask

    >>> example = dummy_data.reverberation_data()
    >>> Observation = stft(example['audio_data']['observation'])
    >>> Speech_image = stft(example['audio_data']['speech_image'])
    >>> Noise_image = stft(example['audio_data']['noise_image'])
    >>> speech_source = example['audio_data']['speech_source']

    >>> mask = ideal_ratio_mask(np.abs([*Speech_image, Noise_image]).sum(1))
    >>> X_mask = mask[:-1]
    >>> N_mask = mask[-1]

    >>> kwargs = {}
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['Observation'] = Observation
    >>> kwargs['Speech_image'] = Speech_image
    >>> kwargs['Noise_image'] = Noise_image
    >>> kwargs['speech_source'] = speech_source
    >>> kwargs['istft'] = istft

    >>> pprint(get_multi_speaker_metrics(**kwargs).as_dict())
    {'pesq': array([1.996, 2.105]),
     'stoi': array([0.8425774 , 0.86015112]),
     'mir_eval_sxr_sdr': array([13.82179099, 11.37128002]),
     'mir_eval_sxr_sir': array([21.39419702, 18.52582023]),
     'mir_eval_sxr_sar': array([14.68805087, 12.3606874 ]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([17.17792759, 14.49937822]),
     'invasive_sxr_sir': array([18.9065789 , 16.07738463]),
     'invasive_sxr_snr': array([22.01439067, 19.66127281])}
    >>> pprint(get_multi_speaker_metrics(**kwargs, postfilter='mask_mul').as_dict())
    {'pesq': array([2.235, 2.271]),
     'stoi': array([0.84173865, 0.85532424]),
     'mir_eval_sxr_sdr': array([14.17958101, 11.69826193]),
     'mir_eval_sxr_sir': array([29.62978561, 26.10579693]),
     'mir_eval_sxr_sar': array([14.3099193, 11.8692283]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([24.00659296, 20.80162802]),
     'invasive_sxr_sir': array([27.13945978, 24.21115858]),
     'invasive_sxr_snr': array([26.89769041, 23.44632734])}
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([1.969, 2.018]),
     'stoi': array([0.81097215, 0.80093435]),
     'mir_eval_sxr_sdr': array([10.2343187 , 8.29797827]),
     'mir_eval_sxr_sir': array([16.84226656, 14.64059341]),
     'mir_eval_sxr_sar': array([11.3932819 , 9.59180288]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([14.70258429, 11.87061145]),
     'invasive_sxr_sir': array([14.74794743, 11.92701556]),
     'invasive_sxr_snr': array([34.53605847, 30.76351885])}

    >>> mask = ideal_ratio_mask(np.abs([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([3.471, 3.47 ]),
     'stoi': array([0.96011783, 0.96072581]),
     'mir_eval_sxr_sdr': array([13.50013349, 10.59091527]),
     'mir_eval_sxr_sir': array([17.67436882, 14.76824653]),
     'mir_eval_sxr_sar': array([15.66698718, 12.82478905]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([15.0283757 , 12.18546349]),
     'invasive_sxr_sir': array([15.07095641, 12.23764194]),
     'invasive_sxr_snr': array([35.13536337, 31.41445774])}
    >>> mask = phase_sensitive_mask(np.array([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([3.965, 3.968]),
     'stoi': array([0.98172316, 0.98371817]),
     'mir_eval_sxr_sdr': array([17.08649852, 14.51167667]),
     'mir_eval_sxr_sir': array([25.39489935, 24.17276323]),
     'mir_eval_sxr_sar': array([17.79271334, 15.0251782 ]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([14.67450877, 12.21865275]),
     'invasive_sxr_sir': array([14.77642923, 12.32843497]),
     'invasive_sxr_snr': array([31.02059848, 28.2459515 ])}
    >>> mask = ideal_complex_mask(np.array([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([4.549, 4.549]),
     'stoi': array([1., 1.]),
     'mir_eval_sxr_sdr': array([149.04269346, 147.03728106]),
     'mir_eval_sxr_sir': array([170.73079352, 168.36046824]),
     'mir_eval_sxr_sar': array([149.07223578, 147.06942287]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([12.32048218, 9.61471296]),
     'invasive_sxr_sir': array([12.41346788, 9.69274082]),
     'invasive_sxr_snr': array([29.06057363, 27.10901422])}
    """
    _, N = speech_source.shape
    K = mask.shape[-2]
    D, T, F = Observation.shape

    assert K < 10, (K, mask.shape, N, D, T, F)
    assert D < 30, (K, N, D, T, F)

    psds = get_power_spectral_density_matrix(
        rearrange(Observation, 'd t f -> f d t', d=D, t=T, f=F),
        rearrange(mask, 't k f -> f k t', k=K, t=T, f=F),
    )  # shape: f, ktarget, d, d
    assert psds.shape == (F, K, D, D), (psds.shape, (F, K, D, D))

    # Estimate one beamformer per target speaker; the PSDs of all other
    # sources act as distortion.
    beamformers = list()
    for k_target in range(K):
        target_psd = psds[:, k_target]
        distortion_psd = np.sum(np.delete(psds, k_target, axis=1), axis=1)

        beamformers.append(
            get_single_source_bf_vector(
                bf_algorithm,
                target_psd_matrix=target_psd,
                noise_psd_matrix=distortion_psd,
            )
        )
    beamformers = np.stack(beamformers, axis=1)
    assert beamformers.shape == (F, K, D), (beamformers.shape, (F, K, D))

    def postfilter_fn(Signal):
        if postfilter is None:
            return Signal
        elif postfilter == 'mask_mul':
            return Signal * rearrange(mask, 't k f -> k f t', k=K, t=T, f=F)
        else:
            raise ValueError(postfilter)

    Speech_prediction = apply_beamforming_vector(
        vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
        mix=rearrange(Observation, 'd t f -> f d t', d=D, t=T, f=F),
    )
    Speech_prediction = postfilter_fn(Speech_prediction)
    speech_prediction = istft(
        rearrange(Speech_prediction, 'k f t -> k t f', k=K, t=T, f=F),
        num_samples=N,
    )

    if Speech_image is None:
        speech_contribution = None
    else:
        Speech_contribution = apply_beamforming_vector(
            vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
            mix=rearrange(
                Speech_image, '(ksource k) d t f -> ksource k f d t',
                k=1, d=D, t=T, f=F,
            ),
        )
        Speech_contribution = postfilter_fn(Speech_contribution)
        # ksource in [K-1, K]
        speech_contribution = istft(
            rearrange(
                Speech_contribution, 'ksource k f t -> ksource k t f',
                k=K, t=T, f=F,
            ),
            num_samples=N,
        )

    if Noise_image is None:
        noise_contribution = None
    else:
        Noise_contribution = apply_beamforming_vector(
            vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
            mix=rearrange(Noise_image, '(k d) t f -> k f d t', k=1, d=D, t=T, f=F),
        )
        Noise_contribution = postfilter_fn(Noise_contribution)
        noise_contribution = istft(
            rearrange(Noise_contribution, 'k f t -> k t f', k=K, t=T, f=F),
            num_samples=N,
        )

    metric = OutputMetrics(
        speech_prediction=speech_prediction,
        speech_source=speech_source,
        speech_contribution=speech_contribution,
        noise_contribution=noise_contribution,
        sample_rate=8000,
        enable_si_sdr=False,
    )

    return metric
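
# Illustrative call without oracle images (an assumption on typical usage, not
# taken from pb_bss; helper and argument names are made up): when `Speech_image`
# and `Noise_image` are omitted, the contributions stay `None`, so only the
# blind metrics (e.g. mir_eval, pesq, stoi) are available in the returned
# OutputMetrics, while the invasive metrics are skipped.
def _blind_metrics_from_masks(mask_ktf, Observation, speech_source, istft_fn):
    return get_multi_speaker_metrics(
        mask=rearrange(mask_ktf, 'k t f -> t k f'),  # estimator output: K T F
        Observation=Observation,                     # D T F (stft signal)
        speech_source=speech_source,                 # Ksource N (time signal)
        istft=istft_fn,
    )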
def trainer_on_simulated_speech_data(
        Trainer=CACGMMTrainer,
        iterations=40,
        reverberation=False,
):
    reference_channel = 0
    sample_rate = 8000

    if reverberation:
        ex = reverberation_data()
    else:
        ex = low_reverberation_data()

    observation = ex['audio_data']['observation']
    Observation = stft(observation)
    num_samples = observation.shape[-1]

    Y_mm = rearrange(Observation, 'd t f -> f t d')

    # Fit the spatial mixture model and predict per-(t, f) class affiliations.
    t = Trainer()
    affiliation = t.fit(
        Y_mm,
        num_classes=3,
        iterations=iterations * 2,
        weight_constant_axis=-1,
    ).predict(Y_mm)

    # Resolve the frequency permutation of the class affiliations.
    pa = DHTVPermutationAlignment.from_stft_size(512)
    affiliation_pa = pa(rearrange(affiliation, 'f k t -> k f t'))
    affiliation_pa = rearrange(affiliation_pa, 'k f t -> k t f')

    # Apply the aligned masks to the reference channel to obtain the estimates.
    Speech_image_0_est, Speech_image_1_est, Noise_image_est = \
        Observation[reference_channel, :, :] * affiliation_pa
    speech_image_0_est = istft(Speech_image_0_est, num_samples=num_samples)
    speech_image_1_est = istft(Speech_image_1_est, num_samples=num_samples)
    noise_image_est = istft(Noise_image_est, num_samples=num_samples)

    ###########################################################################
    # Calculate the metrics

    speech_image = ex['audio_data']['speech_image']
    noise_image = ex['audio_data']['noise_image']
    speech_source = ex['audio_data']['speech_source']

    Speech_image = stft(speech_image)
    Noise_image = stft(noise_image)

    Speech_contribution = Speech_image[:, reference_channel, None, :, :] \
        * affiliation_pa
    Noise_contribution = Noise_image[reference_channel, :, :] * affiliation_pa

    speech_contribution = istft(Speech_contribution, num_samples=num_samples)
    noise_contribution = istft(Noise_contribution, num_samples=num_samples)

    input_metric = InputMetrics(
        observation=observation,
        speech_source=speech_source,
        speech_image=speech_image,
        noise_image=noise_image,
        sample_rate=sample_rate,
    )

    output_metric = OutputMetrics(
        speech_prediction=np.array(
            [speech_image_0_est, speech_image_1_est, noise_image_est]),
        speech_source=speech_source,
        speech_contribution=speech_contribution,
        noise_contribution=noise_contribution,
        sample_rate=sample_rate,
    )

    return {
        'invasive_sxr_sdr':
            output_metric.invasive_sxr['sdr']
            - input_metric.invasive_sxr['sdr'][:, reference_channel],
        'mir_eval_sxr_sdr':
            output_metric.mir_eval['sdr']
            - input_metric.mir_eval['sdr'][:, reference_channel],
    }
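
# Hedged usage sketch: the dictionary returned above contains SDR *gains*
# (output metric minus input metric at the reference channel), so a successful
# separation should yield positive values for both speakers. The helper name
# and threshold below are illustrative only, not part of pb_bss.
def _check_separation_gain(min_gain_db=5.0):
    gains = trainer_on_simulated_speech_data(iterations=40)
    assert np.all(gains['mir_eval_sxr_sdr'] > min_gain_db), gains
    assert np.all(gains['invasive_sxr_sdr'] > min_gain_db), gains
    return gains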
def get_scores(ex, prediction, source):
    """
    Calculate the scores, where the prediction/estimated signal is tested
    against the source/desired signal.

    This function is intended for oracle experiments to figure out which
    metric works with which source signal. For example, SI-SDR does not work
    when the desired signal is the signal before the room impulse response,
    and it gives strange results when the channel is changed.

    >>> pprint(get_scores(get_dataset('cv_dev93')[0], 'image_0', 'early_0'))
    {'pesq': array([2.861]),
     'stoi': array([0.97151566]),
     'mir_eval_sxr_sdr': array([13.39136665]),
     'si_sdr': array([10.81039897])}
    >>> pprint(get_scores(get_dataset('cv_dev93')[0], 'image_0', 'source'))
    {'pesq': array([2.234]),
     'stoi': array([0.8005423]),
     'mir_eval_sxr_sdr': array([12.11446204]),
     'si_sdr': array([-20.05244551])}
    >>> pprint(get_scores(get_dataset('cv_dev93')[0], 'image_0', 'image_1'))
    {'pesq': array([3.608]),
     'stoi': array([0.92216845]),
     'mir_eval_sxr_sdr': array([9.55425598]),
     'si_sdr': array([-0.16858895])}
    """
    def get_signal(ex, name):
        assert isinstance(ex, dict), ex
        assert 'audio_data' in ex, ex
        assert isinstance(ex['audio_data'], dict), ex

        if name == 'source':
            return ex['audio_data']['speech_source'][:]
        elif name == 'early_0':
            return ex['audio_data']['speech_reverberation_early'][:, 0]
        elif name == 'early_1':
            return ex['audio_data']['speech_reverberation_early'][:, 1]
        elif name == 'image_0':
            return ex['audio_data']['speech_image'][:, 0]
        elif name == 'image_1':
            return ex['audio_data']['speech_image'][:, 1]
        elif name == 'image_0_noise':
            return ex['audio_data']['speech_image'][:, 0] \
                + ex['audio_data']['noise_image'][0]
        elif name == 'image_1_noise':
            return ex['audio_data']['speech_image'][:, 1] \
                + ex['audio_data']['noise_image'][0]
        else:
            raise ValueError(name)

    speech_prediction = get_signal(ex, prediction)
    speech_source = get_signal(ex, source)

    metric = OutputMetrics(
        speech_prediction=speech_prediction,
        speech_source=speech_source,
        sample_rate=8000,
        enable_si_sdr=True,
    )

    result = metric.as_dict()
    del result['mir_eval_sxr_selection']
    del result['mir_eval_sxr_sar']
    del result['mir_eval_sxr_sir']

    return result
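
# Illustrative helper (hypothetical, not part of pb_bss): sweep one prediction
# against several reference signals (names as understood by `get_signal` above),
# e.g. to reproduce the observation from the docstring that SI-SDR collapses
# when the dry `source` signal is used as the reference.
def compare_references(ex, prediction='image_0',
                       references=('early_0', 'source', 'image_1')):
    return {ref: get_scores(ex, prediction, ref) for ref in references}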