def generate_and_verify_psd(self, x_shape, mask_shape, psd_shape=None):
     """Generate data, compute its PSD matrix and run the common checks.

     The data comes from ``self.generate_date(x_shape, mask_shape)``.
     When ``mask_shape`` is ``None`` the PSD is computed from the signal
     alone, otherwise the generated mask is passed as second argument.
     If ``psd_shape`` is given, the shape of the result is compared with
     it. The result is always passed to ``assert_hermitian`` and
     ``assert_positive_semidefinite``.
     """
     signal, weights = self.generate_date(x_shape, mask_shape)
     # Leave the mask out of the call entirely when none was requested.
     psd_args = (signal, ) if mask_shape is None else (signal, weights)
     psd = get_power_spectral_density_matrix(*psd_args)
     if psd_shape is not None:
         tc.assert_equal(psd.shape, psd_shape)
     assert_hermitian(psd)
     assert_positive_semidefinite(psd)
    def test_different_valued_masks_output(self):
        """Assert that a constant factor on the mask does not change the PSD.

        The PSD obtained with the generated mask is compared (almost equal)
        with the PSDs obtained from the same mask scaled by 2 and by 0.5.
        """
        signal_shape = (self.F, self.D, self.T)
        mask_shape = (self.F, self.T)
        x, mask = self.generate_date(signal_shape, mask_shape)

        reference = get_power_spectral_density_matrix(x, mask)
        # Compute every scaled variant before asserting anything.
        rescaled = [
            get_power_spectral_density_matrix(x, mask * factor)
            for factor in (2, 0.5)
        ]

        for candidate in rescaled:
            tc.assert_almost_equal(reference, candidate)
    def test_predict_output_with_mask(self):
        """Compare the masked PSD of a repeated vector with a closed form.

        A generated vector is repeated ``self.T`` times along a new second
        axis and combined with an all-ones mask. The resulting PSD is
        compared with ``x[:, None] . conj(x[None, :])`` and with the PSDs
        obtained from the mask scaled by 2 and by 0.5.
        """
        x, _ = self.generate_date((self.D, ), None)
        tiled = np.repeat(x[:, None], (self.T, ), axis=1)
        ones_mask = np.ones(shape=(self.T, ))

        psd = get_power_spectral_density_matrix(tiled, ones_mask)
        # Compute every scaled variant before asserting anything.
        rescaled = [
            get_power_spectral_density_matrix(tiled, ones_mask * factor)
            for factor in (2, 0.5)
        ]

        expected = np.dot(x[:, None], np.conj(x[None, :]))
        tc.assert_almost_equal(psd, expected)
        for candidate in rescaled:
            tc.assert_almost_equal(psd, candidate)
    def test_predict_output(self):
        """Compare the unmasked PSD of a repeated vector with a closed form.

        A generated vector is repeated ``self.T`` times along a new second
        axis; the PSD of that array is compared (almost equal) with
        ``x[:, None] . conj(x[None, :])``.
        """
        x, _ = self.generate_date((self.D, ), None)
        tiled = np.repeat(x[:, None], (self.T, ), axis=1)
        psd = get_power_spectral_density_matrix(tiled)

        expected = np.dot(x[:, None], np.conj(x[None, :]))
        tc.assert_almost_equal(psd, expected)
 def test_covariance_with_mask_independent_dim(self):
     """Check PSD shape and definiteness with one leading dim and a mask.

     A complex ``(2, 3, 4)`` input with a ``(2, 4)`` mask drawn uniformly
     from [0, 1) is expected to give a ``(2, 3, 3)`` PSD that passes
     ``assert_positive_semidefinite``.
     """
     n_independent, n_channels, n_frames = 2, 3, 4
     observation = rand(
         n_independent, n_channels, n_frames, data_type=np.complex128)
     weights = np.random.uniform(0, 1, (n_independent, n_frames))

     covariance = get_power_spectral_density_matrix(observation, weights)

     expected_shape = (n_independent, n_channels, n_channels)
     tc.assert_equal(covariance.shape, expected_shape)
     assert_positive_semidefinite(covariance)
 def test_multiple_sources_for_source_separation(self):
     """Check PSD shape and definiteness when the mask has a source axis.

     The complex ``(2, 3, 4)`` input gets a new leading axis and is
     combined with a ``(5, 2, 4)`` mask drawn uniformly from [0, 1). The
     PSD is expected to have shape ``(5, 2, 3, 3)`` and to pass
     ``assert_positive_semidefinite``.
     """
     n_sources, n_independent, n_channels, n_frames = 5, 2, 3, 4
     observation = rand(
         n_independent, n_channels, n_frames, data_type=np.complex128)
     weights = np.random.uniform(
         0, 1, (n_sources, n_independent, n_frames))

     # The added leading axis lines the input up with the mask's first axis.
     covariance = get_power_spectral_density_matrix(
         observation[np.newaxis, ...], weights)

     expected_shape = (n_sources, n_independent, n_channels, n_channels)
     tc.assert_equal(covariance.shape, expected_shape)
     assert_positive_semidefinite(covariance)
Example #7
0
def get_multi_speaker_metrics(
        mask,  # T Ktarget F
        Observation,  # D T F (stft signal)
        speech_source,  # Ksource N (time signal)
        Speech_image=None,  # Ksource D T F (stft signal)
        Noise_image=None,  # D T F (stft signal)
        istft=None,  # callable(signal, num_samples=num_samples)
        bf_algorithm='mvdr_souden',
        postfilter=None,  # [None, 'mask_mul']
        sample_rate=8000,
) -> OutputMetrics:
    """Beamform every target with mask-based PSDs and build ``OutputMetrics``.

    One PSD matrix per target is estimated from ``Observation`` and
    ``mask``. For each target, its own PSD is used as target PSD and the
    sum of all other targets' PSDs as distortion PSD to obtain a
    beamforming vector via ``get_single_source_bf_vector``. The vectors are
    applied to ``Observation`` (and, when given, to ``Speech_image`` and
    ``Noise_image``), optionally multiplied with the mask, and transformed
    back with ``istft`` before being handed to ``OutputMetrics``.

    Args:
        mask: Masks with layout ``T Ktarget F``.
        Observation: STFT of the observation with layout ``D T F``.
        speech_source: Time signals with layout ``Ksource N``; ``N`` is
            forwarded to ``istft`` as ``num_samples``.
        Speech_image: Optional STFT with layout ``Ksource D T F``. When
            ``None``, ``speech_contribution`` is ``None``.
        Noise_image: Optional STFT with layout ``D T F``. When ``None``,
            ``noise_contribution`` is ``None``.
        istft: Required ``callable(signal, num_samples=num_samples)``.
        bf_algorithm: Forwarded to ``get_single_source_bf_vector``.
        postfilter: ``None`` (no postfilter) or ``'mask_mul'`` (multiply
            each beamformed STFT with the mask).
        sample_rate: Forwarded to ``OutputMetrics``.

    Returns:
        The ``OutputMetrics`` instance (created with
        ``enable_si_sdr=False``).

    Raises:
        TypeError: If ``istft`` is ``None``.
        ValueError: If ``postfilter`` is neither ``None`` nor
            ``'mask_mul'``.
        AssertionError: If the inferred ``K`` is not below 10 or ``D`` is
            not below 30, or if an intermediate shape is unexpected.

    >>> from IPython.lib.pretty import pprint
    >>> from pb_bss.testing import dummy_data
    >>> from paderbox.transform.module_stft import stft, istft
    >>> from pb_bss.extraction import ideal_ratio_mask, phase_sensitive_mask
    >>> from pb_bss.extraction import ideal_complex_mask

    >>> example = dummy_data.reverberation_data()

    >>> Observation = stft(example['audio_data']['observation'])
    >>> Speech_image = stft(example['audio_data']['speech_image'])
    >>> Noise_image = stft(example['audio_data']['noise_image'])
    >>> speech_source = example['audio_data']['speech_source']

    >>> mask = ideal_ratio_mask(np.abs([*Speech_image, Noise_image]).sum(1))
    >>> X_mask = mask[:-1]
    >>> N_mask = mask[-1]
    >>> kwargs = {}
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['Observation'] = Observation
    >>> kwargs['Speech_image'] = Speech_image
    >>> kwargs['Noise_image'] = Noise_image
    >>> kwargs['speech_source'] = speech_source
    >>> kwargs['istft'] = istft
    >>> pprint(get_multi_speaker_metrics(**kwargs).as_dict())
    {'pesq': array([1.996, 2.105]),
     'stoi': array([0.8425774 , 0.86015112]),
     'mir_eval_sxr_sdr': array([13.82179099, 11.37128002]),
     'mir_eval_sxr_sir': array([21.39419702, 18.52582023]),
     'mir_eval_sxr_sar': array([14.68805087, 12.3606874 ]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([17.17792759, 14.49937822]),
     'invasive_sxr_sir': array([18.9065789 , 16.07738463]),
     'invasive_sxr_snr': array([22.01439067, 19.66127281])}
    >>> pprint(get_multi_speaker_metrics(**kwargs, postfilter='mask_mul').as_dict())
    {'pesq': array([2.235, 2.271]),
     'stoi': array([0.84173865, 0.85532424]),
     'mir_eval_sxr_sdr': array([14.17958101, 11.69826193]),
     'mir_eval_sxr_sir': array([29.62978561, 26.10579693]),
     'mir_eval_sxr_sar': array([14.3099193, 11.8692283]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([24.00659296, 20.80162802]),
     'invasive_sxr_sir': array([27.13945978, 24.21115858]),
     'invasive_sxr_snr': array([26.89769041, 23.44632734])}
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([1.969, 2.018]),
     'stoi': array([0.81097215, 0.80093435]),
     'mir_eval_sxr_sdr': array([10.2343187 ,  8.29797827]),
     'mir_eval_sxr_sir': array([16.84226656, 14.64059341]),
     'mir_eval_sxr_sar': array([11.3932819 ,  9.59180288]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([14.70258429, 11.87061145]),
     'invasive_sxr_sir': array([14.74794743, 11.92701556]),
     'invasive_sxr_snr': array([34.53605847, 30.76351885])}

    >>> mask = ideal_ratio_mask(np.abs([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([3.471, 3.47 ]),
     'stoi': array([0.96011783, 0.96072581]),
     'mir_eval_sxr_sdr': array([13.50013349, 10.59091527]),
     'mir_eval_sxr_sir': array([17.67436882, 14.76824653]),
     'mir_eval_sxr_sar': array([15.66698718, 12.82478905]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([15.0283757 , 12.18546349]),
     'invasive_sxr_sir': array([15.07095641, 12.23764194]),
     'invasive_sxr_snr': array([35.13536337, 31.41445774])}

    >>> mask = phase_sensitive_mask(np.array([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([3.965, 3.968]),
     'stoi': array([0.98172316, 0.98371817]),
     'mir_eval_sxr_sdr': array([17.08649852, 14.51167667]),
     'mir_eval_sxr_sir': array([25.39489935, 24.17276323]),
     'mir_eval_sxr_sar': array([17.79271334, 15.0251782 ]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([14.67450877, 12.21865275]),
     'invasive_sxr_sir': array([14.77642923, 12.32843497]),
     'invasive_sxr_snr': array([31.02059848, 28.2459515 ])}
    >>> mask = ideal_complex_mask(np.array([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([4.549, 4.549]),
     'stoi': array([1., 1.]),
     'mir_eval_sxr_sdr': array([149.04269346, 147.03728106]),
     'mir_eval_sxr_sir': array([170.73079352, 168.36046824]),
     'mir_eval_sxr_sar': array([149.07223578, 147.06942287]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([12.32048218,  9.61471296]),
     'invasive_sxr_sir': array([12.41346788,  9.69274082]),
     'invasive_sxr_snr': array([29.06057363, 27.10901422])}

    """
    _, N = speech_source.shape
    K = mask.shape[-2]
    D, T, F = Observation.shape

    # Sanity checks that catch swapped axes before anything expensive runs.
    assert K < 10, (K, mask.shape, N, D, T, F)
    assert D < 30, (K, N, D, T, F)

    # Fail fast on unusable arguments instead of after the beamforming work.
    if postfilter not in (None, 'mask_mul'):
        raise ValueError(postfilter)
    if istft is None:
        raise TypeError(
            'istft must be a callable(signal, num_samples=num_samples), '
            'got None')

    # Shared by the PSD estimation and by the beamforming of the observation.
    observation_fdt = rearrange(Observation, 'd t f -> f d t', d=D, t=T, f=F)

    psds = get_power_spectral_density_matrix(
        observation_fdt,
        rearrange(mask, 't k f -> f k t', k=K, t=T, f=F),
    )  # shape: f, ktarget, d, d

    assert psds.shape == (F, K, D, D), (psds.shape, (F, K, D, D))

    # One beamformer per target: the target's own PSD against the summed
    # PSDs of all remaining targets.
    beamformers = np.stack(
        [
            get_single_source_bf_vector(
                bf_algorithm,
                target_psd_matrix=psds[:, k_target],
                noise_psd_matrix=np.sum(
                    np.delete(psds, k_target, axis=1), axis=1),
            )
            for k_target in range(K)
        ],
        axis=1,
    )
    assert beamformers.shape == (F, K, D), (beamformers.shape, (F, K, D))

    bf_vectors = rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F)

    if postfilter is None:
        mask_kft = None
    else:
        mask_kft = rearrange(mask, 't k f -> k f t', k=K, t=T, f=F)

    def postfilter_fn(Signal):
        # Optionally weight the beamformed STFT with the mask.
        return Signal if mask_kft is None else Signal * mask_kft

    def beamform_to_time(mix, stft_layout):
        # Beamform `mix`, postfilter it and transform it to the time domain.
        Estimate = postfilter_fn(
            apply_beamforming_vector(vector=bf_vectors, mix=mix))
        return istft(
            rearrange(Estimate, stft_layout, k=K, t=T, f=F),
            num_samples=N,
        )

    speech_prediction = beamform_to_time(observation_fdt, 'k f t -> k t f')

    if Speech_image is None:
        speech_contribution = None
    else:
        # ksource in [K-1, K]
        speech_contribution = beamform_to_time(
            rearrange(Speech_image,
                      '(ksource k) d t f -> ksource k f d t',
                      k=1,
                      d=D,
                      t=T,
                      f=F),
            'ksource k f t -> ksource k t f',
        )

    if Noise_image is None:
        noise_contribution = None
    else:
        noise_contribution = beamform_to_time(
            rearrange(Noise_image,
                      '(k d) t f -> k f d t',
                      k=1,
                      d=D,
                      t=T,
                      f=F),
            'k f t -> k t f',
        )

    metric = OutputMetrics(
        speech_prediction=speech_prediction,
        speech_source=speech_source,
        speech_contribution=speech_contribution,
        noise_contribution=noise_contribution,
        sample_rate=sample_rate,
        enable_si_sdr=False,
    )

    return metric
 def test_covariance_without_mask_independent_dim(self):
     """Check PSD shape and definiteness with two leading dims and no mask.

     A complex ``(1, 2, 3, 4)`` input is expected to give a
     ``(1, 2, 3, 3)`` PSD that passes ``assert_positive_semidefinite``.
     """
     leading_dims = (1, 2)
     n_channels, n_frames = 3, 4
     observation = rand(
         *leading_dims, n_channels, n_frames, data_type=np.complex128)

     covariance = get_power_spectral_density_matrix(observation)

     expected_shape = (*leading_dims, n_channels, n_channels)
     tc.assert_equal(covariance.shape, expected_shape)
     assert_positive_semidefinite(covariance)