Example #1
    def test_stft_istft(self):
        try:
            import librosa
            ds = F.load_digit_wav()
            name = list(ds.keys())[0]  # dict views are not indexable in Python 3
            path = ds[name]

            y, _ = speech.read(path, pcm=True)
            hop_length = int(0.01 * 8000)
            stft = signal.stft(y, n_fft=256, hop_length=hop_length, window='hann')
            stft_ = librosa.stft(y, n_fft=256, hop_length=hop_length, window='hann')
            self.assertTrue(np.allclose(stft, stft_.T))

            y1 = signal.istft(stft, hop_length=hop_length, window='hann')
            y2 = librosa.istft(stft_, hop_length=hop_length, window='hann')
            self.assertTrue(np.allclose(y1, y2))
        except ImportError:
            print("test_stft_istft require librosa.")
Example #2
 def map(self, job):
     '''
     Return
     ------
     [(name, spec, mspec, mfcc, pitch, vad), ...]
     '''
     audio_path, segments = job[0] if len(job) == 1 else job
     try:
         # load audio data
         s, sr_orig = speech.read(audio_path)
         if (sr_orig is not None and self.sr is not None and
             sr_orig != self.sr):
             raise Exception('Given sample rate (%d Hz) is different from '
                             'audio file sample rate (%d Hz).' %
                             (self.sr, sr_orig))
         if sr_orig is None:
             sr_orig = self.sr
         N = len(s)
         # processing all segments
         ret = []
         for name, start, end, channel in segments:
             start = int(float(start) * sr_orig)
             end = int(N if end <= 0 else float(end) * sr_orig)
             data = s[start:end, channel] if s.ndim > 1 else s[start:end]
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore", category=UserWarning)
                 features = speech.speech_features(
                     data.ravel(),
                     sr=sr_orig,
                     win=self.win,
                     shift=self.shift,
                     nb_melfilters=self.nb_melfilters,
                     nb_ceps=self.nb_ceps,
                     get_spec=self.get_spec,
                     get_mspec=self.get_mspec,
                     get_mfcc=self.get_mfcc,
                     get_qspec=self.get_qspec,
                     get_phase=self.get_phase,
                     get_pitch=self.get_pitch,
                     get_vad=self.get_vad,
                     get_energy=self.get_energy,
                     get_delta=self.get_delta,
                     pitch_threshold=self.pitch_threshold,
                     pitch_fmax=self.pitch_fmax,
                     vad_smooth=self.vad_smooth,
                     vad_minlen=self.vad_minlen,
                     cqt_bins=self.cqt_bins,
                     fmin=self.fmin,
                     fmax=self.fmax,
                     sr_new=self.sr_new,
                     preemphasis=self.preemphasis,
                     center=self.center)
             if features is not None:
                 ret.append(
                     (name,
                      [features[i[0]] for i in self.__features_properties]))
             else:
                 msg = 'Ignore segments: %s, error: NaN values' % name
                 warnings.warn(msg)
         # return the results as a generator
         return (i for i in ret)
     except Exception as e:
         msg = 'Ignore file: %s, error: %s' % (audio_path, str(e))
         warnings.warn(msg)
         import traceback
         traceback.print_exc()
         raise
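The unpacking at the top of map implies the expected job layout: either (audio_path, segments) or a single-element list wrapping that pair, where each segment is a (name, start, end, channel) tuple with start/end in seconds and end <= 0 meaning "until the end of the file". A hypothetical job could look like:

# hypothetical input; the path and segment names are placeholders
job = ('/path/to/utterance.wav',
       [('utt_first3s', 0.0, 3.0, 0),  # first 3 seconds, channel 0
        ('utt_rest', 3.0, -1, 0)])     # from 3 s to the end of file
# each yielded item is (name, [feature arrays in feature-property order])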
Example #3
 def transform(self, X, start=0., end=-1, channel=0):
     """
     fs : int
         original sample frequency of data (in case reading pcm file
         we don't know the original sample frequency)
     """
     get_mspec = 'mspec' in self.feature_type
     get_spec = 'spec' in self.feature_type
     get_mfcc = 'mfcc' in self.feature_type
     get_pitch = 'pitch' in self.feature_type
     # ====== Read audio ====== #
     if isinstance(X, str) and os.path.exists(X):
         X, orig_fs = speech.read(X)
     elif isinstance(X, np.ndarray):
         orig_fs = None
     elif isinstance(X, Data):
         X = X[:]
         orig_fs = None
     else:
         raise ValueError('Cannot process data type %s' % type(X))
     # fall back to DEFAULT_FS (or self.fs) if the rate is unknown
     if orig_fs is None:
         orig_fs = (SpeechTransform.DEFAULT_FS
                    if SpeechTransform.DEFAULT_FS is not None else self.fs)
     # ====== check if downsampling is necessary ====== #
     if self.fs is not None and self.fs < orig_fs:  # downsample
         from scikits.samplerate import resample
         X = resample(X, float(self.fs) / orig_fs, 'sinc_best')
     elif self.fs is not None and self.fs > orig_fs:
         raise ValueError('Cannot perform upsample from frequency: '
                          '{}Hz to {}Hz'.format(orig_fs, self.fs))
     fs = orig_fs if self.fs is None else self.fs
     # ====== preprocessing ====== #
     N = len(X)
     start = int(float(start) * fs)
     end = int(N if end < 0 else end * fs)
     X = X[start:end, channel] if X.ndim > 1 else X[start:end]
     data = speech_features_extraction(X.ravel(),
                                       fs=fs,
                                       n_filters=self.n_filters,
                                       n_ceps=self.n_ceps,
                                       win=self.win,
                                       shift=self.shift,
                                       delta_order=self.delta_order,
                                       energy=self.energy,
                                       vad=self.vad,
                                       dtype='float32',
                                       pitch_threshold=self.pitch_threshold,
                                       get_pitch=get_pitch,
                                       get_spec=get_spec,
                                       get_mspec=get_mspec,
                                       get_mfcc=get_mfcc)
     # ====== return results ====== #
     if data is None:
         return None
     results = {}
     if get_spec:
         X, sum1, sum2 = data[0]
         results['spec'] = X
     if get_mspec:
         X, sum1, sum2 = data[1]
         results['mspec'] = X
     if get_mfcc:
         X, sum1, sum2 = data[2]
         results['mfcc'] = X
     if get_pitch:
         X, sum1, sum2 = data[3]
         results['pitch'] = X
     results = [results[i] for i in self.feature_type]
     if len(results) == 1:
         results = results[0]
     if self.vad:
         # append the VAD frames as the last element
         if isinstance(results, (tuple, list)):
             return list(results) + [data[-1]]
         return [results, data[-1]]
     return results
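The tail of transform is easy to miss: features land in a dict first, then feature_type fixes the order of the returned list, and a single requested feature is unwrapped from the list. A self-contained sketch of just that selection logic:

# stand-alone sketch of the ordering/unwrapping at the end of transform()
feature_type = ('mspec', 'mfcc')
results = {'spec': 'S', 'mspec': 'M', 'mfcc': 'C', 'pitch': 'P'}
ordered = [results[ft] for ft in feature_type]
if len(ordered) == 1:
    ordered = ordered[0]
print(ordered)  # ['M', 'C'] -- order follows feature_type, not extraction order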
Example #4
files = [
    '/Users/trungnt13/tmp/20051026_180611_340_fsp-b.sph',
    '/Users/trungnt13/tmp/20051118_212058_553_fsp-b.sph',
    '/Users/trungnt13/tmp/20110213_140144_892-b.sph',
    '/Users/trungnt13/tmp/en_4077-a.sph',
    '/Users/trungnt13/tmp/en_4660-a.sph',
    '/Users/trungnt13/tmp/en_6402-b.sph',
    '/Users/trungnt13/tmp/fla_0811-b.sph',
    '/Users/trungnt13/tmp/fla_0946-b.sph',
    '/Users/trungnt13/tmp/lre17_vlujcseb.sph'
]
for f in files:
    print(f)
    s, sr = speech.read(f, remove_dc_offset=True, remove_zeros=True)
    segs, vad, voices, cut = test_func(s, sr,
                                       frame_length=0.02 * sr,
                                       maximum_duration=10,
                                       minimum_duration=None,
                                       return_vad=True,
                                       return_cut=True,
                                       return_voices=True)
    for seg in segs:
        print(seg.shape, len(seg) / sr)
    plt.figure(figsize=(10, 6))
    plt.subplot(4, 1, 1)
    # plot the full (downsampled) signal against the VAD/voice/cut curves
    plt.plot(speech.resample(s, sr, 2000, best_algorithm=False))
    plt.subplot(4, 1, 2)
    plt.plot(vad)
    plt.subplot(4, 1, 3)
    plt.plot(voices)
    plt.subplot(4, 1, 4)
    plt.plot(cut)
    plt.show()
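Given maximum_duration=10, every returned segment should span at most about ten seconds. A hedged helper for checking that contract (test_func itself is assumed to behave as its keyword arguments suggest):

def check_durations(segments, sr, max_sec=10.0):
    # True when every segment respects the maximum duration
    return all(len(seg) / float(sr) <= max_sec for seg in segments)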
Example #5
import matplotlib
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt
import seaborn

import numpy as np
import shutil
import os
from odin import fuel as F, utils
from odin.preprocessing import speech
from odin import visual

datapath = F.load_digit_wav()
print(datapath)
files = utils.get_all_files(datapath, lambda x: '.wav' in x)
y, sr = speech.read(files[0])
print('Raw signal:', y.shape, sr)

feat = speech.speech_features(y,
                              sr,
                              win=0.02,
                              shift=0.01,
                              nb_melfilters=40,
                              nb_ceps=13,
                              get_spec=True,
                              get_mspec=True,
                              get_mfcc=True,
                              get_qspec=True,
                              get_phase=True,
                              get_pitch=True,
                              get_vad=True)
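speech_features returns a dictionary keyed by feature name (the plotting example below reads keys such as 'spec', 'energy' and 'vad' from it), so a quick inspection could be:

# print each returned feature and its shape, where available
for name in sorted(feat):
    print(name, getattr(feat[name], 'shape', feat[name]))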
Example #7
def plot_audio(s,
               sr=None,
               win=0.02,
               shift=0.01,
               nb_melfilters=40,
               nb_ceps=12,
               get_qspec=False,
               get_vad=True,
               fmin=64,
               fmax=None,
               sr_new=None,
               preemphasis=0.97,
               pitch_threshold=0.8,
               pitch_fmax=1200,
               vad_smooth=3,
               vad_minlen=0.1,
               cqt_bins=96,
               center=True,
               title=""):
    import os

    from matplotlib import pyplot as plt
    from odin.preprocessing import speech

    # ====== helper ====== #
    # plot_spectrogram is assumed to be available from the enclosing
    # module (odin.visual)
    def spectrogram(spec, vad, title):
        plt.figure()
        if spec.shape[0] / spec.shape[1] >= 8.:
            nb_samples = len(spec)
            n1, n2, n3 = nb_samples // 4, nb_samples // 2, 3 * nb_samples // 4
            plt.subplot2grid((3, 4), (0, 0), rowspan=1, colspan=4)
            plot_spectrogram(spec.T, vad=vad)
            plt.subplot2grid((3, 4), (1, 0), rowspan=1, colspan=2)
            plot_spectrogram(spec[:n1].T, vad=vad[:n1])
            plt.subplot2grid((3, 4), (1, 2), rowspan=1, colspan=2)
            plot_spectrogram(spec[n1:n2].T, vad=vad[n1:n2])
            plt.subplot2grid((3, 4), (2, 0), rowspan=1, colspan=2)
            plot_spectrogram(spec[n2:n3].T, vad=vad[n2:n3])
            plt.subplot2grid((3, 4), (2, 2), rowspan=1, colspan=2)
            plot_spectrogram(spec[n3:].T, vad=vad[n3:])
        else:
            plot_spectrogram(spec.T, vad=vad)
        plt.suptitle(str(title))
        plt.tight_layout()

    # ====== load signal ====== #
    if isinstance(s, str):  # a file path was given instead of an array
        name = os.path.basename(s)
        s, sr_orig = speech.read(s)
        if sr is None:
            sr = sr_orig
    else:
        name = "-".join([str(s.shape), str(sr)])
    title = str(title) + ":" + name
    # ====== processing ====== #
    # VAD is always computed: the spectrogram overlays below require it
    get_vad = True
    y = speech.speech_features(s,
                               sr,
                               win=win,
                               shift=shift,
                               nb_melfilters=nb_melfilters,
                               nb_ceps=nb_ceps,
                               get_spec=True,
                               get_mspec=True,
                               get_mfcc=True,
                               get_qspec=get_qspec,
                               get_phase=False,
                               get_pitch=False,
                               get_vad=get_vad,
                               get_energy=True,
                               get_delta=False,
                               fmin=fmin,
                               fmax=fmax,
                               sr_new=sr_new,
                               preemphasis=preemphasis,
                               pitch_threshold=pitch_threshold,
                               pitch_fmax=pitch_fmax,
                               vad_smooth=vad_smooth,
                               vad_minlen=vad_minlen,
                               cqt_bins=cqt_bins,
                               center=center)
    # ====== plot raw signals ====== #
    if sr > 16000:
        s = speech.resample(s, sr, 16000)
        sr = 16000
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(s)
    plt.subplot(2, 1, 2)
    plt.plot(y['energy'].ravel())
    plt.tight_layout()
    plt.suptitle(title)
    # ====== plot spectrogram ====== #
    spectrogram(y['spec'], y['vad'], title='STFT power spectrum')
    if get_qspec:
        spectrogram(y['qspec'], y['vad'], title='CQT power spectrum')
    return y
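A hedged usage sketch with a synthetic tone, so no audio file is needed; plot_audio accepts a raw array together with an explicit sample rate (the key list in the comment is illustrative):

import numpy as np

sr = 8000
tone = np.sin(2 * np.pi * 220 * np.arange(2 * sr) / sr).astype('float32')
feats = plot_audio(tone, sr=sr, get_qspec=False, title='220 Hz tone')
print(sorted(feats.keys()))  # e.g. ['energy', 'mfcc', 'mspec', 'spec', 'vad']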