def test_pitch_feats(self, kwargs):
        """Check ``F.compute_kaldi_pitch`` against the ``compute-kaldi-pitch-feats`` CLI.

        ``kwargs`` must contain ``'sample_rate'``; the whole mapping is forwarded
        both to the Python function and (via ``convert_args``) to the Kaldi
        command line.
        """
        rate = kwargs['sample_rate']

        # Python side: run on the first row of a float32 sinusoid.
        float_wave = get_sinusoid(dtype='float32', sample_rate=rate)
        actual = F.compute_kaldi_pitch(float_wave[0], **kwargs)

        # Kaldi side: write an int16 sinusoid to a temporary wav file that the
        # command-line tool reads.
        int_wave = get_sinusoid(dtype='int16', sample_rate=rate)
        wav_path = self.get_temp_path('test.wav')
        save_wav(wav_path, int_wave, rate)

        cli_options = convert_args(**kwargs)
        kaldi_cmd = ['compute-kaldi-pitch-feats'] + cli_options + ['scp:-', 'ark:-']
        reference = run_kaldi(kaldi_cmd, 'scp', wav_path)

        self.assert_equal(actual, expected=reference)
Beispiel #2
0
def extract_features(x, sr):
    """Compute several acoustic feature tracks for the waveform ``x``.

    All features use an analysis window of 0.05 s and a hop of 0.01 s,
    converted to samples with ``sr`` for the mel-based features and to
    milliseconds for the Kaldi pitch call.

    Args:
        x: Waveform tensor. The ``[0]`` indexing below suggests a leading
            channel axis -- TODO confirm against callers.
        sr: Sample rate of ``x``.

    Returns:
        dict with the keys:
            ``"Waveform"``: ``x[0]``.
            ``"MelSpectrogram"``: mel spectrogram after ``AmplitudeToDB``.
            ``"MFCC"``: MFCCs standardized (mean/std) along ``dim=1``.
            ``"Pitch"``: first entry of the ``compute_kaldi_pitch`` output.
            ``"Intensity"``: log of the mel spectrogram averaged over ``dim=0``.
    """
    hop_seconds = 0.01
    window_seconds = 0.05

    # Mel filterbank settings shared by the spectrogram and the MFCC front end.
    mel_options = {
        "n_fft": int(window_seconds * sr),
        "hop_length": int(hop_seconds * sr),
        "n_mels": 128,
        "f_max": 8000,
    }

    mel = AT.MelSpectrogram(sample_rate=sr, **mel_options)(x)[0]
    # Intensity is taken from the mel spectrogram before the dB conversion.
    log_mean_energy = mel.mean(dim=0).log()
    mel_db = AT.AmplitudeToDB()(mel)

    mfcc_transform = AT.MFCC(
        sample_rate=sr,
        n_mfcc=40,
        melkwargs=dict(mel_options),
    )
    raw_mfcc = mfcc_transform(x)[0]
    # Standardize each row along dim=1.
    mfcc_mean = raw_mfcc.mean(dim=1, keepdim=True)
    mfcc_std = raw_mfcc.std(dim=1, keepdim=True)
    standardized_mfcc = (raw_mfcc - mfcc_mean) / mfcc_std

    kaldi_pitch = AF.compute_kaldi_pitch(
        x,
        sample_rate=sr,
        frame_length=window_seconds * 1000,  # seconds -> milliseconds
        frame_shift=hop_seconds * 1000,
        snip_edges=True,
        min_f0=70,
        max_f0=350,
        penalty_factor=0.01,
    )

    return {
        "Waveform": x[0],
        "MelSpectrogram": mel_db,
        "MFCC": standardized_mfcc,
        "Pitch": kaldi_pitch[0],
        "Intensity": log_mean_energy,
    }
# Show the previously computed pitch for `waveform`, then play the audio.
# NOTE(review): `waveform`, `sample_rate`, `pitch` and both helpers are defined
# outside this excerpt -- confirm they are in scope at this point.
plot_pitch(waveform, sample_rate, pitch)
play_audio(waveform, sample_rate)

######################################################################
# Kaldi Pitch (beta)
# ------------------
#
# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic
# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``,
# and it is available only in ``functional``.
#
# 1. A pitch extraction algorithm tuned for automatic speech recognition
#
#    P. Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S.
#    Khudanpur
#
#    2014 IEEE International Conference on Acoustics, Speech and Signal
#    Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi:
#    10.1109/ICASSP.2014.6854049.
#    [`abstract <https://ieeexplore.ieee.org/document/6854049>`__],
#    [`paper <https://danielpovey.com/files/2014_icassp_pitch.pdf>`__]
#

# Load the speech sample, requesting resampling to 16 kHz.
waveform, sample_rate = get_speech_sample(resample=16000)

# The last axis of the result holds two values per frame; split them apart.
# NOTE(review): `nfcc` presumably means NCCF (normalized cross-correlation
# function) -- confirm the spelling before renaming, the name may be used
# elsewhere in the file.
pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate)
pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1]

# Visualize both tracks against the waveform, then play the audio.
plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc)
play_audio(waveform, sample_rate)
 def func(tensor):
     """Run ``F.compute_kaldi_pitch`` on ``tensor`` at a fixed 44100.0 sample rate."""
     # Kept as an explicitly float-annotated local, as in the original.
     rate: float = 44100.
     return F.compute_kaldi_pitch(tensor, rate)