def test_output_metrics():
    example = scenario()

    # Take speech image + noise as prediction, i.e. perfect cross-talk suppression
    speech_prediction = example['speech_image'][
        ..., 0, :] + example['noise_image'][..., 0, :]

    speech_image_1, speech_image_2 = example['speech_image'][..., 0, :]

    speech_contribution = np.array([
        [speech_image_1, np.zeros_like(speech_image_2)],
        [np.zeros_like(speech_image_1), speech_image_2],
    ])
    noise_contribution = np.array([
        example['noise_image'][..., 0, :],
        example['noise_image'][..., 0, :],
    ])

    metrics = OutputMetrics(
        speech_prediction=speech_prediction,
        # observation=example['observation'],
        speech_source=example['speech_source'],
        # speech_image=example['speech_image'],
        # noise_image=example['noise_image'],
        speech_contribution=speech_contribution,
        noise_contribution=noise_contribution,
        sample_rate=8000,
        # channel_score_reduce='mean',
    )

    assert metrics.K_source == 2

    for k, v in metrics.as_dict().items():
        if k == 'invasive_sdr':
            np.testing.assert_allclose(v, [49.137625, 44.503376])
        elif k == 'invasive_sir':
            np.testing.assert_allclose(v, np.inf)
        elif k == 'invasive_snr':
            np.testing.assert_allclose(v, [49.137625, 44.503376])
        elif k == 'mir_eval_sdr':
            np.testing.assert_allclose(v, [17.071665, 24.711722])
        elif k == 'mir_eval_sir':
            np.testing.assert_allclose(v, [29.423133, 37.060289])
        elif k == 'mir_eval_sar':
            np.testing.assert_allclose(v, [17.336992, 24.973125])
        elif k == 'pesq':
            np.testing.assert_allclose(v, [4.37408, 4.405752])
        elif k == 'stoi':
            np.testing.assert_allclose(v, [0.968833, 0.976151], rtol=1e-6)
        elif k == 'mir_eval_selection':
            assert all(v == [0, 1])
        elif k == 'srmr':
            np.testing.assert_allclose(v, [0.5504078, 0.50442512])
        else:
            raise KeyError(k, v)
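
# The test above relies on a `scenario()` fixture that is not shown in this
# snippet. A minimal sketch of the shapes it is assumed to return (helper
# name and contents are hypothetical, for illustration only):
def scenario_sketch(k_source=2, channels=6, num_samples=8000):
    import numpy as np
    rng = np.random.RandomState(0)
    return {
        # Ksource D N: each speaker's image at every microphone
        'speech_image': rng.randn(k_source, channels, num_samples),
        # D N: the noise image at every microphone
        'noise_image': 0.1 * rng.randn(channels, num_samples),
        # Ksource N: the dry source signals
        'speech_source': rng.randn(k_source, num_samples),
    }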
def get_scores(
    ex,
    mask,
    Observation='Observation',
    beamformer='mvdr_souden',
    postfilter=None,
):
    """
    Calculate the scores where the prediction/estimated signal is tested
    against the source/desired signal.

    This function is for oracle tests, to figure out which metrics work with
    which source/desired signal.

    Note:
        SI-SDR does not work when the desired signal is the signal before the
        room impulse response, and it gives strange results when the channel
        is changed.

    Example:

        >>> ex = get_dataset('cv_dev93')[0]
        >>> mask = get_mask_from_oracle(ex, 'IBM')
        >>> metric, result = get_scores(ex, mask)
        >>> pprint(result)
        {'pesq': array([2.014, 1.78 ]),
         'stoi': array([0.68236465, 0.61319396]),
         'mir_eval_sxr_sdr': array([10.23933413, 10.01566298]),
         'invasive_sxr_sdr': array([15.76439393, 13.86230425])}
    """

    if Observation == 'Observation':
        metric = get_multi_speaker_metrics(
            mask=rearrange(mask, 'k t f -> t k f'),  # T Ktarget F
            Observation=ex['audio_data'][Observation],  # D T F (stft signal)
            speech_source=ex['audio_data']['speech_source'],  # Ksource N (time signal)
            Speech_image=ex['audio_data']['Speech_image'],  # Ksource D T F (stft signal)
            Noise_image=ex['audio_data']['Noise_image'],  # D T F (stft signal)
            istft=istft,  # callable(signal, num_samples=num_samples)
            bf_algorithm=beamformer,
            postfilter=postfilter,  # [None, 'mask_mul']
        )
    else:
        assert mask is None, mask
        assert beamformer == 'ch0', beamformer
        assert postfilter is None, postfilter
        metric = OutputMetrics(
            speech_prediction=ex['audio_data'][Observation][:, 0],
            speech_source=ex['audio_data']['speech_source'],
            # speech_contribution=speech_contribution,
            # noise_contribution=noise_contribution,
            sample_rate=8000,
            enable_si_sdr=False,
        )

    result = metric.as_dict()
    del result['mir_eval_sxr_selection']
    del result['mir_eval_sxr_sar']
    del result['mir_eval_sxr_sir']
    if 'invasive_sxr_sir' in result:
        del result['invasive_sxr_sir']
        del result['invasive_sxr_snr']

    return metric, result
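
# Hedged usage sketch. The first call follows the docstring example above;
# the second exercises the else-branch, which scores an unprocessed oracle
# signal (the key name 'speech_image' and its Ksource D N layout are
# assumptions, not confirmed by this snippet):
#
#     metric, result = get_scores(ex, get_mask_from_oracle(ex, 'IBM'))
#     metric, result = get_scores(
#         ex, mask=None, Observation='speech_image', beamformer='ch0',
#     )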
def get_multi_speaker_metrics(
        mask,  # T Ktarget F
        Observation,  # D T F (stft signal)
        speech_source,  # Ksource N (time signal)
        Speech_image=None,  # Ksource D T F (stft signal)
        Noise_image=None,  # D T F (stft signal)
        istft=None,  # callable(signal, num_samples=num_samples)
        bf_algorithm='mvdr_souden',
        postfilter=None,  # [None, 'mask_mul']
) -> OutputMetrics:
    """

    >>> from IPython.lib.pretty import pprint
    >>> from pb_bss.testing import dummy_data
    >>> from paderbox.transform.module_stft import stft, istft
    >>> from pb_bss.extraction import ideal_ratio_mask, phase_sensitive_mask
    >>> from pb_bss.extraction import ideal_complex_mask

    >>> example = dummy_data.reverberation_data()

    >>> Observation = stft(example['audio_data']['observation'])
    >>> Speech_image = stft(example['audio_data']['speech_image'])
    >>> Noise_image = stft(example['audio_data']['noise_image'])
    >>> speech_source = example['audio_data']['speech_source']

    >>> mask = ideal_ratio_mask(np.abs([*Speech_image, Noise_image]).sum(1))
    >>> X_mask = mask[:-1]
    >>> N_mask = mask[-1]
    >>> kwargs = {}
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['Observation'] = Observation
    >>> kwargs['Speech_image'] = Speech_image
    >>> kwargs['Noise_image'] = Noise_image
    >>> kwargs['speech_source'] = speech_source
    >>> kwargs['istft'] = istft
    >>> pprint(get_multi_speaker_metrics(**kwargs).as_dict())
    {'pesq': array([1.996, 2.105]),
     'stoi': array([0.8425774 , 0.86015112]),
     'mir_eval_sxr_sdr': array([13.82179099, 11.37128002]),
     'mir_eval_sxr_sir': array([21.39419702, 18.52582023]),
     'mir_eval_sxr_sar': array([14.68805087, 12.3606874 ]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([17.17792759, 14.49937822]),
     'invasive_sxr_sir': array([18.9065789 , 16.07738463]),
     'invasive_sxr_snr': array([22.01439067, 19.66127281])}
    >>> pprint(get_multi_speaker_metrics(**kwargs, postfilter='mask_mul').as_dict())
    {'pesq': array([2.235, 2.271]),
     'stoi': array([0.84173865, 0.85532424]),
     'mir_eval_sxr_sdr': array([14.17958101, 11.69826193]),
     'mir_eval_sxr_sir': array([29.62978561, 26.10579693]),
     'mir_eval_sxr_sar': array([14.3099193, 11.8692283]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([24.00659296, 20.80162802]),
     'invasive_sxr_sir': array([27.13945978, 24.21115858]),
     'invasive_sxr_snr': array([26.89769041, 23.44632734])}
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([1.969, 2.018]),
     'stoi': array([0.81097215, 0.80093435]),
     'mir_eval_sxr_sdr': array([10.2343187 ,  8.29797827]),
     'mir_eval_sxr_sir': array([16.84226656, 14.64059341]),
     'mir_eval_sxr_sar': array([11.3932819 ,  9.59180288]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([14.70258429, 11.87061145]),
     'invasive_sxr_sir': array([14.74794743, 11.92701556]),
     'invasive_sxr_snr': array([34.53605847, 30.76351885])}

    >>> mask = ideal_ratio_mask(np.abs([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([3.471, 3.47 ]),
     'stoi': array([0.96011783, 0.96072581]),
     'mir_eval_sxr_sdr': array([13.50013349, 10.59091527]),
     'mir_eval_sxr_sir': array([17.67436882, 14.76824653]),
     'mir_eval_sxr_sar': array([15.66698718, 12.82478905]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([15.0283757 , 12.18546349]),
     'invasive_sxr_sir': array([15.07095641, 12.23764194]),
     'invasive_sxr_snr': array([35.13536337, 31.41445774])}

    >>> mask = phase_sensitive_mask(np.array([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([3.965, 3.968]),
     'stoi': array([0.98172316, 0.98371817]),
     'mir_eval_sxr_sdr': array([17.08649852, 14.51167667]),
     'mir_eval_sxr_sir': array([25.39489935, 24.17276323]),
     'mir_eval_sxr_sar': array([17.79271334, 15.0251782 ]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([14.67450877, 12.21865275]),
     'invasive_sxr_sir': array([14.77642923, 12.32843497]),
     'invasive_sxr_snr': array([31.02059848, 28.2459515 ])}
    >>> mask = ideal_complex_mask(np.array([*Speech_image, Noise_image])[:, 0])
    >>> kwargs['mask'] = np.stack([*mask], 1)
    >>> kwargs['speech_source'] = example['audio_data']['speech_image'][:, 0]
    >>> pprint(get_multi_speaker_metrics(**kwargs, bf_algorithm='ch0', postfilter='mask_mul').as_dict())
    {'pesq': array([4.549, 4.549]),
     'stoi': array([1., 1.]),
     'mir_eval_sxr_sdr': array([149.04269346, 147.03728106]),
     'mir_eval_sxr_sir': array([170.73079352, 168.36046824]),
     'mir_eval_sxr_sar': array([149.07223578, 147.06942287]),
     'mir_eval_sxr_selection': array([0, 1]),
     'invasive_sxr_sdr': array([12.32048218,  9.61471296]),
     'invasive_sxr_sir': array([12.41346788,  9.69274082]),
     'invasive_sxr_snr': array([29.06057363, 27.10901422])}

    """
    _, N = speech_source.shape
    K = mask.shape[-2]
    D, T, F = Observation.shape

    assert K < 10, (K, mask.shape, N, D, T, F)
    assert D < 30, (K, N, D, T, F)

    psds = get_power_spectral_density_matrix(
        rearrange(Observation, 'd t f -> f d t', d=D, t=T, f=F),
        rearrange(mask, 't k f -> f k t', k=K, t=T, f=F),
    )  # shape: f, ktarget, d, d

    assert psds.shape == (F, K, D, D), (psds.shape, (F, K, D, D))

    beamformers = list()
    for k_target in range(K):
        target_psd = psds[:, k_target]
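        # Everything except the target class (the other speakers and, if the
        # mask includes one, the noise class) is treated as distortion.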
        distortion_psd = np.sum(np.delete(psds, k_target, axis=1), axis=1)

        beamformers.append(
            get_single_source_bf_vector(
                bf_algorithm,
                target_psd_matrix=target_psd,
                noise_psd_matrix=distortion_psd,
            ))
    beamformers = np.stack(beamformers, axis=1)
    assert beamformers.shape == (F, K, D), (beamformers.shape, (F, K, D))

    def postfilter_fn(Signal):
        if postfilter is None:
            return Signal
        elif postfilter == 'mask_mul':
            return Signal * rearrange(mask, 't k f -> k f t', k=K, t=T, f=F)
        else:
            raise ValueError(postfilter)

    Speech_prediction = apply_beamforming_vector(
        vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
        mix=rearrange(Observation, 'd t f -> f d t', d=D, t=T, f=F),
    )
    Speech_prediction = postfilter_fn(Speech_prediction)
    speech_prediction = istft(rearrange(Speech_prediction,
                                        'k f t -> k t f',
                                        k=K,
                                        t=T,
                                        f=F),
                              num_samples=N)

    if Speech_image is None:
        speech_contribution = None
    else:
        Speech_contribution = apply_beamforming_vector(
            vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
            mix=rearrange(Speech_image,
                          '(ksource k) d t f -> ksource k f d t',
                          k=1,
                          d=D,
                          t=T,
                          f=F),
        )
        Speech_contribution = postfilter_fn(Speech_contribution)
        # ksource in [K-1, K]
        speech_contribution = istft(rearrange(Speech_contribution,
                                              'ksource k f t -> ksource k t f',
                                              k=K,
                                              t=T,
                                              f=F),
                                    num_samples=N)

    if Noise_image is None:
        noise_contribution = None
    else:
        Noise_contribution = apply_beamforming_vector(
            vector=rearrange(beamformers, 'f k d -> k f d', k=K, d=D, f=F),
            mix=rearrange(Noise_image,
                          '(k d) t f -> k f d t',
                          k=1,
                          d=D,
                          t=T,
                          f=F),
        )
        Noise_contribution = postfilter_fn(Noise_contribution)
        noise_contribution = istft(rearrange(Noise_contribution,
                                             'k f t -> k t f',
                                             k=K,
                                             t=T,
                                             f=F),
                                   num_samples=N)

    metric = OutputMetrics(
        speech_prediction=speech_prediction,
        speech_source=speech_source,
        speech_contribution=speech_contribution,
        noise_contribution=noise_contribution,
        sample_rate=8000,
        enable_si_sdr=False,
    )

    return metric
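
# For orientation, a NumPy sketch of the masked PSD estimate that
# `get_power_spectral_density_matrix` is assumed to compute above
# (illustrative names and shapes, not the library implementation):
def masked_psd_sketch(Observation_fdt, mask_fkt):
    """Observation_fdt: (F, D, T) complex; mask_fkt: (F, K, T) real."""
    import numpy as np
    # psd[f, k] = sum_t mask[f, k, t] * y[f, :, t] y[f, :, t]^H, normalized
    # by the mask mass, yielding one D x D matrix per frequency and class.
    numerator = np.einsum(
        'fkt,fdt,fet->fkde', mask_fkt, Observation_fdt, Observation_fdt.conj()
    )
    denominator = np.maximum(mask_fkt.sum(-1), 1e-10)
    return numerator / denominator[..., None, None]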
def trainer_on_simulated_speech_data(
        Trainer=CACGMMTrainer,
        iterations=40,
        reverberation=False,
):
    reference_channel = 0
    sample_rate = 8000

    if reverberation:
        ex = reverberation_data()
    else:
        ex = low_reverberation_data()
    observation = ex['audio_data']['observation']
    Observation = stft(observation)
    num_samples = observation.shape[-1]

    Y_mm = rearrange(Observation, 'd t f -> f t d')

    t = Trainer()
    affiliation = t.fit(
        Y_mm,
        num_classes=3,
        iterations=iterations * 2,
        weight_constant_axis=-1,
    ).predict(Y_mm)
    
    pa = DHTVPermutationAlignment.from_stft_size(512)
    affiliation_pa = pa(rearrange(affiliation, 'f k t -> k f t'))
    affiliation_pa = rearrange(affiliation_pa, 'k f t -> k t f')

    Speech_image_0_est, Speech_image_1_est, Noise_image_est = (
        Observation[reference_channel, :, :] * affiliation_pa
    )

    speech_image_0_est = istft(Speech_image_0_est, num_samples=num_samples)
    speech_image_1_est = istft(Speech_image_1_est, num_samples=num_samples)
    noise_image_est = istft(Noise_image_est, num_samples=num_samples)

    ###########################################################################
    # Calculate the metrics

    speech_image = ex['audio_data']['speech_image']
    noise_image = ex['audio_data']['noise_image']
    speech_source = ex['audio_data']['speech_source']

    Speech_image = stft(speech_image)
    Noise_image = stft(noise_image)

    Speech_contribution = (
        Speech_image[:, reference_channel, None, :, :] * affiliation_pa
    )
    Noise_contribution = Noise_image[reference_channel, :, :] * affiliation_pa

    speech_contribution = istft(Speech_contribution, num_samples=num_samples)
    noise_contribution = istft(Noise_contribution, num_samples=num_samples)

    input_metric = InputMetrics(
        observation=observation,
        speech_source=speech_source,
        speech_image=speech_image,
        noise_image=noise_image,
        sample_rate=sample_rate,
    )

    output_metric = OutputMetrics(
        speech_prediction=np.array(
            [speech_image_0_est, speech_image_1_est, noise_image_est]),
        speech_source=speech_source,
        speech_contribution=speech_contribution,
        noise_contribution=noise_contribution,
        sample_rate=sample_rate,
    )

    return {
        'invasive_sxr_sdr':
            output_metric.invasive_sxr['sdr']
            - input_metric.invasive_sxr['sdr'][:, reference_channel],
        'mir_eval_sxr_sdr':
            output_metric.mir_eval['sdr']
            - input_metric.mir_eval['sdr'][:, reference_channel],
    }
def get_scores(ex, prediction, source):
    """
    Calculate the scores where the prediction/estimated signal is tested
    against the source/desired signal.

    This function is for oracle tests, to figure out which metrics work with
    which source/desired signal.

    Note:
        SI-SDR does not work when the desired signal is the signal before the
        room impulse response, and it gives strange results when the channel
        is changed.

    >>> pprint(get_scores(get_dataset('cv_dev93')[0], 'image_0', 'early_0'))
    {'pesq': array([2.861]),
     'stoi': array([0.97151566]),
     'mir_eval_sxr_sdr': array([13.39136665]),
     'si_sdr': array([10.81039897])}
    >>> pprint(get_scores(get_dataset('cv_dev93')[0], 'image_0', 'source'))
    {'pesq': array([2.234]),
     'stoi': array([0.8005423]),
     'mir_eval_sxr_sdr': array([12.11446204]),
     'si_sdr': array([-20.05244551])}
    >>> pprint(get_scores(get_dataset('cv_dev93')[0], 'image_0', 'image_1'))
    {'pesq': array([3.608]),
     'stoi': array([0.92216845]),
     'mir_eval_sxr_sdr': array([9.55425598]),
     'si_sdr': array([-0.16858895])}
    """
    def get_signal(ex, name):
        assert isinstance(ex, dict), ex
        assert 'audio_data' in ex, ex
        assert isinstance(ex['audio_data'], dict), ex
        if name == 'source':
            return ex['audio_data']['speech_source'][:]
        elif name == 'early_0':
            return ex['audio_data']['speech_reverberation_early'][:, 0]
        elif name == 'early_1':
            return ex['audio_data']['speech_reverberation_early'][:, 1]
        elif name == 'image_0':
            return ex['audio_data']['speech_image'][:, 0]
        elif name == 'image_1':
            return ex['audio_data']['speech_image'][:, 1]
        elif name == 'image_0_noise':
            return ex['audio_data']['speech_image'][:, 0] + \
                   ex['audio_data']['noise_image'][0]
        elif name == 'image_1_noise':
            return ex['audio_data']['speech_image'][:, 1] + \
                   ex['audio_data']['noise_image'][0]
        else:
            raise ValueError(name)

    speech_prediction = get_signal(ex, prediction)
    speech_source = get_signal(ex, source)

    metric = OutputMetrics(
        speech_prediction=speech_prediction,
        speech_source=speech_source,
        sample_rate=8000,
        enable_si_sdr=True,
    )

    result = metric.as_dict()
    del result['mir_eval_sxr_selection']
    del result['mir_eval_sxr_sar']
    del result['mir_eval_sxr_sir']

    return result
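
# For reference, a minimal sketch of the common SI-SDR definition
# (Le Roux et al., 2019); not necessarily the exact implementation used by
# OutputMetrics. The reference may only be rescaled by a single scalar, so
# SI-SDR penalizes any filtering (e.g. a room impulse response) between the
# desired signal and the estimate; this is why scoring an image against the
# dry source in the doctest above yields about -20 dB:
def si_sdr_sketch(reference, estimation):
    import numpy as np
    reference = np.asarray(reference, dtype=np.float64)
    estimation = np.asarray(estimation, dtype=np.float64)
    # Project the estimate onto the reference to find the optimal scaling.
    alpha = np.sum(reference * estimation) / np.sum(reference ** 2)
    target = alpha * reference
    distortion = estimation - target
    return 10 * np.log10(np.sum(target ** 2) / np.sum(distortion ** 2))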